def generateTestFeatures(client_socket, infile, featurefile):
    """Build and persist dependency-parse feature vectors for test sentences.

    Parameters:
        client_socket -- open socket to the dependency-parser service,
                         passed through to getDepParse().
        infile        -- path to a ``*-parscit-section.xml`` document.
        featurefile   -- path the feature strings are appended to.

    Side effects: loads the pickled test labels into the module-level
    global ``test_labels``, mutates the module-level ``test_data`` dict
    (adds 'depparse' and 'features' per sentence key), and appends one
    feature line per ranked sentence to *featurefile*.
    """
    doc = Document(infile)
    # ------------------------------------------------
    # Load pickle for label.
    # NOTE(review): test_labels is assigned here but never read below,
    # while test_data is used without being loaded — confirm the globals
    # are wired up as intended elsewhere in the module.
    picklefile = DIR["DATA"] + "test-labels-pickle"
    global test_labels
    with open(picklefile, "rb") as pfile:
        test_labels = pickle.load(pfile)
    # ------------------------------------------------
    # For display and analysis: derive the file code from the basename.
    # (Underscore avoids shadowing the builtin `dir`.)
    _, filename = os.path.split(infile)
    fcode = re.match(r"(.+)-parscit-section\.xml", filename).group(1)
    # ------------------------------------------------
    test_sents, sent_indices = getRankedSent(doc, fcode)
    # -----------------------------------------
    # Sectional Ranker: one concatenated text blob per document section.
    # str.join avoids the quadratic += concatenation of the original.
    sections = []
    for block in doc.document.values():
        sections.append("".join(str(block[key]) for key in sorted(block.keys())))
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    # -----------------------------------------
    for sentence, sent_idx, sec_idx in zip(test_sents, sent_indices, sec_indices):
        key = fcode + "-" + str(sent_idx)
        # Feature string starts with the gold label, then the tree features.
        feature_string = test_data[key]["reallbl"]
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        test_data[key]["depparse"] = getTree(tree)
        test_data[key]["features"] = feature_string
        writeToFile(featurefile, feature_string + "\n", "a")
def generateTestFeatures(client_socket, infile, featurefile):
    """Generate dependency-parse feature strings for a test document.

    Loads the pickled test labels into the global ``test_labels``, ranks
    the document's sentences, fills ``test_data`` with the parse tree and
    feature string for each ranked sentence, and appends each feature
    string to *featurefile*.
    """
    doc = Document(infile)
    # Load pickle for label into the module-level global.
    global test_labels
    label_path = DIR['DATA'] + 'test-labels-pickle'
    with open(label_path, 'rb') as handle:
        test_labels = pickle.load(handle)
    # File code (used for display and as part of the test_data keys).
    directory, basename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', basename).group(1)
    test_sents, sent_indices = getRankedSent(doc, fcode)
    # Sectional Ranker: concatenate each section's sentences in key order.
    sections = []
    for sec, block in doc.document.items():
        text = ''
        for pos in sorted(block.keys()):
            text += str(block[pos])
        sections.append(text)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    # Walk sentence, sentence index and section index in lockstep.
    triples = zip(test_sents, sent_indices, sec_indices)
    for sentence, sent_idx, sec_idx in triples:
        key = fcode + '-' + str(sent_idx)
        feature_string = test_data[key]['reallbl']
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        test_data[key]['depparse'] = getTree(tree)
        test_data[key]['features'] = feature_string
        writeToFile(featurefile, feature_string + '\n', 'a')
def generateTrainFeatures(client_socket, infile, featurefile):
    """Build SVM training feature vectors (+1 / -1) for one document.

    Parameters:
        client_socket -- open socket to the dependency-parser service.
        infile        -- path to a ``*-parscit-section.xml`` document.
        featurefile   -- path the labeled feature lines are appended to.

    Positive examples are the abstract's sentences; negative examples are
    the top TextRank-scored sentences from the rest of the document.
    """
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    # ------------------------------------------------
    # Positive sentences: everything in the abstract section.
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    # -----------------------------------------
    # Sectional Ranker: one concatenated text blob per document section.
    # str.join avoids the quadratic += concatenation of the original.
    sections = []
    for block in doc.document.values():
        sections.append(''.join(str(block[key]) for key in sorted(block.keys())))
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    # -----------------------------------------
    # Count ranker
    # count_ranker = Ranker(all_sentences, tfidf=False)
    # -----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    # ------------------------------------------------
    # Negative sentences: take the 5 highest TextRank-scored sentences
    # (walking the score list from the end) that pass validSentence().
    # NOTE(review): if fewer than 5 valid sentences exist, the negative
    # index walk will raise IndexError — confirm inputs are large enough.
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        sent_indices.append(idx)
        neg_sents.append(doc[idx].sentence.encode('utf-8'))
        num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    # ------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    # ------------------------------------------------
    # Single-argument print(...) emits the same text under Python 2 while
    # staying valid Python 3 syntax.
    print("File processed to create feature vectors for training.")
def generateTrainFeatures(client_socket, infile, featurefile): # ------------------------------------------------ doc = Document(infile) all_sentences, all_offset = doc.all_sentences() # ------------------------------------------------ # Positive sentences pos_sents, offset = doc.section_sentences("abstract") sent_indices = range(offset, offset + len(pos_sents)) # ----------------------------------------- # Sectional Ranker sections = [] for sec, block in doc.document.items(): sentences = "" for key in sorted(block.keys()): sentences += str(block[key]) sections.append(sentences) sec_ranker = Ranker(sections) sec_indices = sent2Section(doc, sent_indices) # ----------------------------------------- # Count ranker # count_ranker = Ranker(all_sentences, tfidf=False) # ----------------------------------------- for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices): feature_string = "+1" tree = parseTrees(getDepParse(client_socket, sentence)) feature_string += processTree(tree, sec_ranker, sec_idx) writeToFile(featurefile, feature_string + "\n", "a") # ------------------------------------------------ # Negative sentences neg_ranker = TextRank(all_sentences) neg_ranker.rank() num = 5 x = -1 neg_sents = [] sent_indices = [] while num > 0: idx = neg_ranker.scores[x][0] + all_offset x -= 1 if not validSentence(doc[idx]): continue else: sent_indices.append(idx) neg_sents.append(doc[idx].sentence.encode("utf-8")) num -= 1 sec_indices = sent2Section(doc, sent_indices) # ------------------------------------------------ for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices): feature_string = "-1" tree = parseTrees(getDepParse(client_socket, sentence)) feature_string += processTree(tree, sec_ranker, sec_idx) writeToFile(featurefile, feature_string + "\n", "a") # ------------------------------------------------ print "File processed to create feature vectors for training."
# ------------------------------------------------
# 7-fold cross-validation over the candidate keys in `bucket`.
# The original used the Python 2 octal literal 07 (== 7), which is a
# syntax error under Python 3; the fold count and fold size are now
# named decimal constants.  `fold` also replaces the original local name
# `set`, which shadowed the builtin.
NUM_FOLDS = 7
FOLD_SIZE = 11
for i in range(NUM_FOLDS):
    # Draw FOLD_SIZE keys at random, without replacement, from bucket.
    fold = []
    for _ in range(FOLD_SIZE):
        curr = choice(bucket)
        fold.append(curr)
        bucket.remove(curr)
    all_sets.append(fold)
for i in range(NUM_FOLDS):
    # Hold out fold i for testing; train on the remaining folds.
    test_set = all_sets[i]
    train_set = []
    for other in (all_sets[z] for z in range(NUM_FOLDS) if z != i):
        train_set.extend(other)
    for key in train_set:
        writeToFile(featurefile, data[key]['features'] + '\n', 'a')
    trainSvm(featurefile, model, gamma=1)
    predictSvm(featurefile, model, outfile)
    outstring = "Training Fold : " + str(i)
    print("************* " + outstring + " *************")
    analyze(featurefile, outfile, resfile, outstring)
    # Reset the scratch files before scoring the held-out fold.
    deleteFiles([featurefile, outfile])
    for key in test_set:
        writeToFile(featurefile, data[key]['features'] + '\n', 'a')
    predictSvm(featurefile, model, outfile)
    outstring = "Testing Fold : " + str(i)
    pre, rec = analyze(featurefile, outfile, resfile, outstring)
    precision.append(pre)
    recall.append(rec)