def summarize_secitons(document, sections, coef=0.8):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)
        # Ranker: MMR restricted to the current section
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        scores = ranker.scores
        summary = []
        # num is assumed to be a module-level constant (sentences kept per section)
        for x in range(num):
            idx = scores[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, scores[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ''
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += '\n' + sent.encode('utf-8')
        logit(text)
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))
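A minimal usage sketch for the MMR-based sectional summarizer above. The document path, the section list, and the value of num (which summarize_secitons reads from module scope) are illustrative assumptions, not values taken from the original code:

# Hypothetical driver; the path, section names, and num are assumptions.
num = 4  # sentences kept per section (assumed module-level constant)
demo_doc = DIR['BASE'] + "demo/P10-1024-parscit-section.xml"
summarize_secitons(demo_doc, ['introduction', 'conclusions'], coef=0.8)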
def getUnwanted(outfile):
    dir = DIR['BASE'] + "demo/"
    os.chdir(dir)
    samples = ''
    total = 0
    num = 12
    for file in glob("*.xml"):
        try:
            doc = Document(file)
            sentences, offset = doc.all_sentences()
            # Ranker
            ranker = TextRank(sentences)
            ranker.rank()
            # lowest-scoring sentences first
            scores = sorted(ranker.scores, key=itemgetter(1))
            for x in range(num):
                idx = scores[x][0] + offset
                samples += doc[idx].sentence.encode('utf-8') + '\n'
            total += 1
            print(file + " : Done")
        except Exception as e:
            print(file + str(e))
    # for now this is the location of the file
    writeToFile(outfile, samples, 'w')
    print("Total number of files processed successfully : " + str(total))
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences: every sentence of the abstract
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences: lowest-ranked TextRank sentences of the document
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
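A sketch of how generateTrainFeatures might be driven over a directory of ParsCit section XML files, mirroring the directory loop used in getUnwanted. The parser connection comes from getConnection; the feature-file path and the glob pattern are assumptions:

# Hypothetical training driver; the feature-file path is an assumption.
client_socket = getConnection()              # dependency-parser connection
featurefile = DIR['DATA'] + 'features.txt'
deleteFiles([featurefile])                   # start with an empty feature file
os.chdir(DIR['BASE'] + "demo/")
for infile in glob("*-parscit-section.xml"):
    generateTrainFeatures(client_socket, infile, featurefile)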
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml',
                        infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
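The test_data dictionary filled in above is described as being kept "for pickling for display"; a minimal sketch of persisting it with the standard pickle module follows. The helper names and the pickle path are assumptions:

import pickle

def save_test_data(path=DIR['DATA'] + 'test_data.pickle'):
    # Hypothetical helper; the pickle path is an assumption.
    with open(path, 'wb') as pfile:
        pickle.dump(test_data, pfile)

def load_test_data(path=DIR['DATA'] + 'test_data.pickle'):
    with open(path, 'rb') as pfile:
        return pickle.load(pfile)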
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        classified.append((sent, sent_val))
    # most positively classified sentences first
    for sent, val in sorted(classified, key=itemgetter(1), reverse=True):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
def summarize(document, all=True):
    doc = Document(document)
    sentences, offset = (doc.all_sentences() if all
                         else doc.filtered_sentences())
    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores
    # Selector
    summary = []
    sum_len = 0
    # num and MAXLEN are assumed to be module-level constants
    for x in range(num):
        idx = scores[x][0] + offset
        sent = doc[idx].sentence
        if sum_len + len(sent.split(' ')) > MAXLEN:
            break
        summary.append((sent, scores[x][1], doc.get_section_name(idx)))
        sum_len += len(sent.split(' '))
    text = ''
    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    for sent, score, section in summary:
        text += '\n' + "[" + section.encode('utf-8') + "] " + \
            sent.encode('utf-8')
        #"[" + str(score) + "] " + sent.encode('utf-8')
    logit(text)
    # Printer
    # this has to be automated
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join([sent for sent, sc, sec in summary])
                    .encode('utf-8'))
    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
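For reference, a minimal call to the TextRank summarizer above. summarize reads num and MAXLEN from module scope, so both appear here with assumed values; the document path is also an assumption:

# Illustrative call; the constants and the path are assumptions.
num = 20      # candidate sentences to consider (assumed global)
MAXLEN = 130  # summary budget in words (assumed global)
summarize(DIR['BASE'] + "demo/P10-1024-parscit-section.xml", all=True)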
def generateTestFeatures(infile):
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    num = 7
    x = 0
    test_sents = []
    sent_indices = []
    while num > 0:
        idx = ranker.scores[x][0] + all_offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            test_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
        #------------------------------------------------
        # For display and analysis
        key = fcode + '-' + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    #-----------------------------------------
    for sentence, sent_idx in zip(test_sents, sent_indices):
        key = fcode + '-' + str(sent_idx)
        print key
        print test_data[key]['contextpre']
        print "----Main sentence Start----"
        print test_data[key]['sentence']
        print "----Main sentence End----"
        print test_data[key]['contextpos']
        # operator types '+' or '-'; '1' is appended to form the label
        # (presumably '+1' / '-1')
        feature_string = raw_input()
        feature_string += '1'
        test_data[key]['reallbl'] = feature_string
def get_neg_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 5
    x = -1
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_idx.append(idx)
            samples += doc[idx].sentence.encode('utf-8') + '\n'
            num -= 1
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
def summarize_secitons(document, sections):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        scores = ranker.scores
        summary = []
        # num is assumed to be a module-level constant (sentences kept per section)
        for x in range(num):
            idx = scores[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, scores[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ''
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += '\n' + sent.encode('utf-8')
        logit(text)
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))
    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision,
                                                      F_measure))
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20
    num = 10
    x = 0
    summary = []
    sent_idx = [0]
    sum_len = 0
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket,
                                      doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # The sent_idx needs to be converted to reflect the corresponding
        # section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
        if sum_len > 130:
            break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
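classifyDoc expects a trained SVM-light model at sec-tfidf-model.txt. A sketch of producing that model with SVM-light's svm_learn from the feature file written by generateTrainFeatures; the binary location, the helper name, and the feature-file path are assumptions chosen to mirror the paths used above:

def trainModel():
    # Hypothetical training step; paths mirror those used elsewhere here.
    learn = DIR['BASE'] + "lib/svm-light/svm_learn"
    featurefile = DIR['DATA'] + 'features.txt'   # output of generateTrainFeatures
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    # svm_learn <example_file> <model_file>
    subprocess.call([learn, featurefile, model])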
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices,
                                           sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '+1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices,
                                           sec_indices):
        key = fcode + '-' + str(sent_idx)
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, 1, False)
        train_data[key] = {'sentence': doc[sent_idx].sentence.encode('utf-8'),
                           'reallbl': '-1',
                           'features': feature_string}
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."