def summarise_text(self, key_phrases_output, top_sentences_output):
    """
    Summarise the text filtered by the top n sentences count.

    Put summary sentences together along with the key phrases which
    helped to determine the summary sentences.

    Parameters
    ==========
    key_phrases_output: text file (JSON) from which ranked key phrases
        are read
    top_sentences_output: text file (JSON) from which top scored
        sentences are read

    Return
    ======
    Tuple ``(summary_text, key_phrases)`` where ``key_phrases`` is a
    comma-separated string.  (The previous docstring claimed nothing was
    returned, which contradicted the code.)
    """
    # limit_keyphrases already yields phrase strings in rank order, so
    # they can be joined directly (no wrapping comprehension needed).
    key_phrases = ", ".join(
        pytextrank.limit_keyphrases(key_phrases_output, phrase_limit=12))

    # Re-order the selected sentences by their original position in the
    # document (element 1 of each (text, index) pair).
    sentence_iterator = sorted(
        pytextrank.limit_sentences(top_sentences_output, word_limit=150),
        key=lambda x: x[1])

    sentences = [
        pytextrank.make_sentence(sentence_text)
        for sentence_text, _ in sentence_iterator
    ]
    return " ".join(sentences), key_phrases
def summarize(path_stage1, path_stage2, path_stage3, wordlimit, phraselimit):
    """Build an extractive summary from stage-1/stage-2 pipeline files.

    Rewrites ``path_stage3`` with the top-ranked sentences and returns a
    ``(summary_text, key_phrases)`` tuple.
    """
    print('Generating summary...')

    # Significance weight per sentence: MinHash approximation of the
    # Jaccard distance from the key phrases determined by TextRank.
    kernel = pytextrank.rank_kernel(path_stage2)

    # Start from a clean stage-3 file; a missing file is fine.
    try:
        os.remove(path_stage3)
    except OSError:
        pass

    with open(path_stage3, 'w') as out:
        for sentence in pytextrank.top_sentences(kernel, path_stage1):
            out.write(pytextrank.pretty_print(sentence._asdict()))
            out.write("\n")

    # De-duplicated key phrases, joined for display.
    unique_phrases = {
        phrase
        for phrase in pytextrank.limit_keyphrases(path_stage2,
                                                  phrase_limit=phraselimit)
    }
    phrases = ", ".join(unique_phrases)

    # Restore document order (the index is the second tuple element).
    ordered = sorted(pytextrank.limit_sentences(path_stage3,
                                                word_limit=wordlimit),
                     key=lambda pair: pair[1])
    graf_text = " ".join(
        pytextrank.make_sentence(text) for text, _ in ordered)

    return graf_text, phrases
def summarize(self, _id, content_text, word_limit, phrase_limit=12):
    """Run the full four-stage PyTextRank pipeline over ``content_text``.

    Parameters
    ----------
    _id : str
        Identifier used to name the intermediate files under ``process/``.
    content_text : str
        Raw document text to summarize.
    word_limit : int
        Maximum number of words in the extracted summary.
    phrase_limit : int, optional
        Maximum number of key phrases to report.  Default 12, which was
        previously hard-coded; existing callers are unaffected.

    Returns
    -------
    dict
        ``{'excerpts': <summary text>, 'keywords': <comma-joined phrases>}``
    """
    self.logger.log("_id: " + _id)
    self.logger.log("word_limit: " + str(word_limit))

    # Intermediate file names, one per pipeline stage.
    path_stage0 = 'process/' + _id + '.json'
    path_stage1 = 'process/' + _id + '_o1.json'
    path_stage2 = 'process/' + _id + '_o2.json'
    path_stage3 = 'process/' + _id + '_o3.json'

    # Create the stage-0 input file.
    with open(path_stage0, 'w') as outfile:
        json.dump({"id": "123", "text": content_text}, outfile)

    # Stage 1: statistical parsing/tagging of the JSON document.
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: collect and normalize the ranked key phrases.
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: weight each sentence (MinHash approximation of the Jaccard
    # distance from the key phrases) and persist the scored sentences.
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: assemble the summary from the most significant sentences.
    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2,
                                        phrase_limit=phrase_limit)))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                  word_limit=word_limit),
                       key=lambda x: x[1])
    graf_text = " ".join(
        pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter)

    return {'excerpts': graf_text, 'keywords': phrases}
def pytrankSummarize(filename):
    """
    This is another TextRank algorithm. It works in four stages, each
    feeding its output to the next:

    1. Part-of-Speech tagging and lemmatization are performed for every
       sentence in the document.
    2. Key phrases are extracted along with their counts, and are
       normalized.
    3. A score is calculated for each sentence by approximating the
       Jaccard distance between the sentence and the key phrases.
    4. The document is summarized based on the most significant
       sentences and key phrases.

    Returns a ``(summary_text, key_phrases)`` tuple in addition to
    printing it.  (The previous version only printed the result, so it
    could not be reused programmatically.)
    """
    import pytextrank

    jsonText = createJSON(filename)
    path_stage0 = jsonText

    # Stage 1: parse/tag the document.
    path_stage1 = "o1.json"
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: ranked, normalized key phrases.
    path_stage2 = "o2.json"
    graph, ranks = pytextrank.text_rank(path_stage1)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: score sentences against the key phrases.
    path_stage3 = "o3.json"
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: assemble the summary in document order.
    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=50),
                       key=lambda x: x[1])
    s = [pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter]
    graf_text = " ".join(s)

    print("")
    print("####### From PyTextRank #######")
    print("**excerpts:** %s\n\n**keywords:** %s" % (
        graf_text,
        phrases,
    ))

    return graf_text, phrases
def stage4(path_stage2, path_stage3, phrase_l, word_l):
    """Stage 4: return the limited key phrases and the ordered sentences.

    ``phrase_l`` and ``word_l`` may arrive as strings (e.g. from CLI
    arguments) and are coerced to int before use.
    """
    phrases = list(
        pytextrank.limit_keyphrases(path_stage2, phrase_limit=int(phrase_l)))

    # Put the selected sentences back into document order (the index is
    # the second element of each pair).
    ordered = sorted(
        pytextrank.limit_sentences(path_stage3, word_limit=int(word_l)),
        key=lambda pair: pair[1])

    sentences = [" ".join(words) for words, _ in ordered]
    return phrases, sentences
def summarise_text(self, key_phrases_output, top_sentences_output):
    """Assemble the summary sentences and the key phrases behind them.

    Returns a ``(summary_text, key_phrases)`` tuple where
    ``key_phrases`` is a comma-separated string.
    """
    # De-duplicate the top 12 phrases before joining them for display.
    unique_phrases = {
        phrase
        for phrase in pytextrank.limit_keyphrases(key_phrases_output,
                                                  phrase_limit=12)
    }
    key_phrases = ", ".join(unique_phrases)

    # Restore the chosen sentences to document order.
    ordered = sorted(
        pytextrank.limit_sentences(top_sentences_output, word_limit=150),
        key=lambda pair: pair[1])

    summary = " ".join(
        pytextrank.make_sentence(text) for text, _ in ordered)
    return summary, key_phrases
def summarize_text(input_file):
    """Run the four-stage PyTextRank pipeline over ``input_file``.

    Intermediate results are written to ``stage1.txt`` .. ``stage3.txt``
    in the working directory.  Returns the summary text.
    """
    path_stage0 = input_file

    # Stage 1: parse/tag the document.
    path_stage1 = 'stage1.txt'
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: ranked, normalized key phrases.
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    path_stage2 = 'stage2.txt'
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: score sentences against the key phrases.
    path_stage3 = 'stage3.txt'
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: assemble the summary.  The key phrases are computed for
    # parity with the sibling pipelines, but only the excerpt text is
    # returned.  (Previously the joined text was also duplicated into a
    # dead `graf_text` variable.)
    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                  word_limit=120),
                       key=lambda x: x[1])
    sentences = [pytextrank.make_sentence(sent_text)
                 for sent_text, idx in sent_iter]
    return ' '.join(sentences)
def summarise_text(self, key_phrases_output, top_sentences_output):
    """
    Summarise the text filtered by the top n sentences count.
    Put summary sentences together along with the key phrases which
    helped to determine the summary sentences.

    Returns a ``(summary_text, key_phrases)`` tuple.
    """
    # BUG FIX: the original wrapped a *list* in a set literal
    # ("{ [ ... ] }"), which raises TypeError because lists are not
    # hashable.  A set comprehension de-duplicates the phrases as
    # intended (matching the sibling implementation of this method).
    key_phrases = ", ".join({
        phrase
        for phrase in pytextrank.limit_keyphrases(key_phrases_output,
                                                  phrase_limit=12)
    })

    # Restore the chosen sentences to document order.
    sentence_iterator = sorted(
        pytextrank.limit_sentences(top_sentences_output, word_limit=150),
        key=lambda x: x[1])

    # BUG FIX: limit_sentences yields (text, index) pairs; the original
    # passed the whole pair to make_sentence instead of just the text.
    sentences = [
        pytextrank.make_sentence(sentence_text)
        for sentence_text, idx in sentence_iterator
    ]
    return " ".join(sentences), key_phrases
def retrieveSentences(content, word_limit):
    """Extract the top-ranked sentences from ``content``.

    Runs the PyTextRank pipeline inside a throw-away working directory
    and returns the list of summary sentences in document order.
    """
    # Unique scratch directory so concurrent calls cannot collide.
    currpath = os.getcwd()
    folder = os.path.join(currpath, str(uuid.uuid4()))
    os.mkdir(folder)

    # FIX: cleanup now happens in a finally block, so the scratch
    # directory is removed even when the pipeline raises (the previous
    # version leaked it on failure).  The explicit f.close() calls were
    # redundant inside `with` blocks and have been removed.
    try:
        fname = str(uuid.uuid4())
        path_stage0 = "{0}/{1}.json".format(folder, fname)
        with open(path_stage0, "w") as f:
            f.write(json.dumps({"id": fname, "text": content}))

        # Stage 1: parse/tag the document.
        path_stage1 = "{0}/o1.json".format(folder)
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(
                    pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Stage 2: ranked, normalized key phrases.
        path_stage2 = "{0}/o2.json".format(folder)
        graph, ranks = pytextrank.text_rank(path_stage1)
        #pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Stage 3: score sentences against the key phrases.
        kernel = pytextrank.rank_kernel(path_stage2)
        path_stage3 = "{0}/o3.json".format(folder)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Order the selected sentences by document position.
        sent_iter = sorted(pytextrank.limit_sentences(
            path_stage3, word_limit=word_limit),
                           key=lambda x: x[1])
        return [pytextrank.make_sentence(sent_text)
                for sent_text, idx in sent_iter]
    finally:
        shutil.rmtree(folder)
def stage_4():
    """Stage 4: write an excerpt/keyword summary file per document id.

    Reads the per-id stage-2 (phrases) and stage-3 (sentences) outputs
    and writes one UTF-8-BOM ``.txt`` result per id into a result
    directory named after the limits used.  Restores the original
    working directory before returning.
    """
    cur_dir = os.path.dirname(__file__)
    data_dir = stage_1_dir
    ids = os.listdir(data_dir)

    phrase_limit = 15
    word_limit = 100

    # Result directory encodes the limits so runs do not overwrite
    # each other; any stale copy is replaced.
    result_dir = stage_4_dir + '_limits_' + str(phrase_limit) + '_' + str(
        word_limit)
    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    os.mkdir(result_dir)
    os.chdir(result_dir)

    for cur_id in ids:
        # BUG FIX: build paths with os.path.join instead of hard-coded
        # '\\' separators so this also works on non-Windows systems.
        phrases = ", ".join(
            set(pytextrank.limit_keyphrases(
                os.path.join(stage_2_dir, cur_id),
                phrase_limit=phrase_limit)))
        sent_iter = sorted(pytextrank.limit_sentences(
            os.path.join(stage_3_dir, cur_id), word_limit=word_limit),
                           key=lambda x: x[1])
        s = [pytextrank.make_sentence(sent_text)
             for sent_text, idx in sent_iter]
        graf_text = " ".join(s)

        # cur_id ends in '.json' (5 chars); swap that suffix for '.txt'.
        with codecs.open(cur_id[:-5] + '.txt', "w+", "utf_8_sig") as file:
            file.write("**excerpts:** %s\n\n**keywords:** %s" % (
                graf_text,
                phrases,
            ))

    os.chdir(cur_dir)
# Tail of the multi-line `sometext` sample string (the assignment begins
# above this chunk).
'minimal generating sets of solutions for all types of systems are given. ' + \
'These criteria and the corresponding algorithms for constructing a minimal ' + \
'supporting set of solutions can be used in solving all the considered types ' + \
'systems and systems of mixed types.'

# Second sample document, rich in named entities (companies, places,
# dates); defined but not summarized below.
someothertext = 'Amazon.com, Inc. is located in Seattle, WA and was founded July 5th, 1994 by Jeff Bezos, ' + \
    'allowing customers to buy everything from books to blenders. Seattle is north of Portland and ' + \
    'south of Vancouver, BC. Other notable Seattle - based companies are Starbucks and Boeing.'

# In-memory PyTextRank pipeline (no intermediate files): parse, rank,
# normalize, score, summarize.
docs = [{'text': sometext, 'id': 777}]
grafs = [{'graf': graf.graf} for graf in pytextrank.parse_doc(docs)]

graph, ranks = pytextrank.text_rank(grafs)

# Normalized key phrases; skip_ner=False keeps named entities in the list.
rank_list = [
    rl._asdict()
    for rl in pytextrank.normalize_key_phrases(grafs, ranks, skip_ner=False)
]

kernel = pytextrank.rank_kernel(rank_list)
sents = [s._asdict() for s in pytextrank.top_sentences(kernel, grafs)]

# NOTE(review): p[0] implies limit_keyphrases yields tuples here, with
# the phrase text first — confirm against the pytextrank version in use.
phrases = [
    p[0] for p in pytextrank.limit_keyphrases(rank_list, phrase_limit=6)
]

# Restore the selected sentences to document order, then join.
sent_iter = sorted(pytextrank.limit_sentences(sents, word_limit=150),
                   key=lambda x: x[1])
sents = [pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter]
graf_text = ' '.join(sents)

print("\n**excerpts:** %s\n\n**keywords:** %s" % (
    graf_text,
    phrases,
))
def extract_phrasesfrom_textrank(corpus):
    """Extract key phrases for every tweet in ``corpus`` via PyTextRank.

    Each tweet is run through the four-stage pipeline and its key
    phrases are recorded; tweets whose phrases mention a target
    celebrity are collected separately.  Both frames are written to CSV
    (``phrase_filepath`` / ``all_phrasefile_path`` module globals).

    Returns
    -------
    (new_df_tweet, cleaned_df_tweet) : tuple of pandas.DataFrame
        All (text, keywords) rows, and the celebrity-related
        (sentences, keywords) subset.
    """
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = list(record_data.to_dict(orient='records'))

    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"

    # FIX: DataFrame.append was deprecated and removed in pandas 2.0,
    # and calling it inside a loop is O(n^2).  Collect plain dicts and
    # build each frame once at the end instead.
    all_rows = []
    for item in tweet_items:
        items_new = [item]

        # Stage 1: parse/tag this tweet.
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(items_new):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Stage 2: ranked, normalized key phrases.
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        # Stage 3: score sentences against the key phrases.
        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        # Stage 4: top phrases for this tweet (summary text is computed
        # only as a by-product and is not stored).
        phrases = ", ".join(
            set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=5)))
        sent_iter = sorted(pytextrank.limit_sentences(path_stage3,
                                                      word_limit=150),
                           key=lambda x: x[1])
        s = [pytextrank.make_sentence(sent_text)
             for sent_text, idx in sent_iter]
        graf_text = " ".join(s)

        all_rows.append({'text': item.get('text'), 'keywords': phrases})

    new_df_tweet = pd.DataFrame(all_rows, columns=['text', 'keywords'])

    # Keep only tweets whose phrases mention one of the celebrities.
    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]
    cleaned_rows = [{
        'sentences': row['text'],
        'keywords': row['keywords']
    } for index, row in new_df_tweet.iterrows()
                    if any(celeb in row['keywords'] for celeb in celeb_list)]
    cleaned_df_tweet = pd.DataFrame(cleaned_rows,
                                    columns=['sentences', 'keywords'])

    cleaned_df_tweet.to_csv(phrase_filepath,
                            sep=',',
                            encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path,
                        sep=',',
                        encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
sentences_fake_json.append([sentence_dict]) print(pytextrank.pretty_print(sentence_dict)) stage_3_filename = "{publisher}_{version}_textRank.topSentences.".format( version=version, publisher=publisher) stage_3_out = os.path.join(directory, "Stage3Results", "agglomerated", stage_3_filename) pickle.dump(fake_json_graph_dicts, open(stage_3_out, 'wb')) #stage 4: generate a summary of the entire set of books phrases = ", ".join( set([ p for p in pytextrank.limit_keyphrases(rl_fake_json, phrase_limit=12) ])) sent_iter = sorted(pytextrank.limit_sentences(sentences_fake_json, word_limit=150), key=lambda x: x[1]) s = [] for sent_text, idx in sent_iter: s.append(pytextrank.make_sentence(sent_text)) graf_text = " ".join(s) print("**excerpts:** %s\n\n**keywords:** %s" % ( graf_text, phrases, )) stage_4_filename = "{publisher}_{version}_textRank.summaryText.".format( version=version, publisher=publisher) stage_4_out = os.path.join(directory, "Stage4Results",
# Stage 2 (continued): persist the normalized key phrases.  `ranks` and
# the initial `path_stage1`/`path_stage2` values come from earlier cells
# of this notebook-style script.
with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        #print(pytextrank.pretty_print(rl))

# Notebook-style (re)assignment of the pipeline file names.
path_stage1 = "o1.json"
path_stage2 = "o2.json"
path_stage3 = "o3.json"

# Stage 3: weight each sentence against the key phrases and persist the
# scored sentences.
kernel = pytextrank.rank_kernel(path_stage2)
with open(path_stage3, 'w') as f:
    for s in pytextrank.top_sentences(kernel, path_stage1):
        f.write(pytextrank.pretty_print(s._asdict()))
        f.write("\n")
        # to view output in this notebook
        print(pytextrank.pretty_print(s._asdict()))

# Reassigned again (same values) — typical of exported notebook cells.
path_stage2 = "o2.json"
path_stage3 = "o3.json"

# Stage 4: join the key phrases (default phrase limit) and rebuild the
# summary in document order.
phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(path_stage2)]))

sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=200),
                   key=lambda x: x[1])
s = []
for sent_text, idx in sent_iter:
    s.append(pytextrank.make_sentence(sent_text))
graf_text = " ".join(s)

print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases,))
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import limit_keyphrases, limit_sentences, make_sentence
import sys

## Stage 4:
##  * summarize a document based on most significant sentences and key phrases
##
## INPUTS: <stage2> <stage3>
## OUTPUT: Markdown format

if __name__ == "__main__":
    path_stage2 = sys.argv[1]
    path_stage3 = sys.argv[2]

    # limit_keyphrases already yields phrase strings, so they can be
    # joined directly (the wrapping comprehension was redundant).
    phrases = ", ".join(limit_keyphrases(path_stage2, phrase_limit=12))

    # Restore document order (the index is the second tuple element).
    sent_iter = sorted(limit_sentences(path_stage3, word_limit=150),
                       key=lambda x: x[1])

    # FIX: make_sentence can take sent_text directly; the
    # "[w for w in sent_text]" copy and doubled parentheses were
    # redundant.
    s = [make_sentence(sent_text) for sent_text, idx in sent_iter]
    graf_text = " ".join(s)

    print("**excerpts:** %s\n\n**keywords:** %s" % (
        graf_text,
        phrases,
    ))
# Notebook-style assignment of the pipeline file names (`path_stage1`
# comes from an earlier cell).
path_stage2 = "o2.json"
path_stage3 = "o3.json"

# Stage 3: weight each sentence against the key phrases and persist the
# scored sentences.
kernel = pytextrank.rank_kernel(path_stage2)
with open(path_stage3, 'w') as f:
    for s in pytextrank.top_sentences(kernel, path_stage1):
        f.write(pytextrank.pretty_print(s._asdict()))
        f.write("\n")
        # to view output in this notebook
        print(pytextrank.pretty_print(s._asdict()))

# Reassigned again (same values) — typical of exported notebook cells.
path_stage2 = "o2.json"
path_stage3 = "o3.json"

# Stage 4: de-duplicated key phrases plus the summary rebuilt in
# document order.
phrases = ", ".join(
    set([p for p in pytextrank.limit_keyphrases(path_stage2, phrase_limit=20)]))

sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=500),
                   key=lambda x: x[1])
s = []
for sent_text, idx in sent_iter:
    s.append(pytextrank.make_sentence(sent_text))
graf_text = " ".join(s)

print("**excerpts:** %s\n\n**keywords:** %s" % (
    graf_text,
    phrases,
))
def insert_key_phrases_into_db(list_of_doc_dicts, doctype, collection):
    '''
    Takes in list of doc dictionaries and a doctype ('comment' or 'post'),
    processes each doc with PyTextRank, obtains key phrases and inserts
    key phrases into document in Mongodb as 'key_phrases' field.

    Documents that fail any pipeline stage are reported and skipped;
    their ids are accumulated in a local ``failed_ids`` list.
    '''
    # Shared scratch files, overwritten for every document.
    path_stage0 = 'stage0.json'
    path_stage1 = 'stage1.json'
    path_stage2 = 'stage2.json'
    path_stage3 = 'stage3.json'

    total_docs = len(list_of_doc_dicts)
    failed_ids = []
    for i, doc_dict in enumerate(list_of_doc_dicts):
        if i % 50 == 0:
            print(f'processing {i} of {total_docs} documents')

        # Keep only the text above the '_____' divider.
        doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]
        try:
            with open(path_stage0, 'w') as f:
                json.dump(doc_dict, f)

            # Stage 1: parse/tag the document.
            with open(path_stage1, 'w') as f:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter(path_stage0)):
                    f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

            # Stage 2: ranked, normalized key phrases.
            graph, ranks = pytextrank.text_rank(path_stage1)
            pytextrank.render_ranks(graph, ranks)
            with open(path_stage2, 'w') as f:
                for rl in pytextrank.normalize_key_phrases(path_stage1,
                                                           ranks):
                    f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

            # Stage 3: score sentences against the key phrases.
            kernel = pytextrank.rank_kernel(path_stage2)
            with open(path_stage3, 'w') as f:
                for s in pytextrank.top_sentences(kernel, path_stage1):
                    f.write(pytextrank.pretty_print(s._asdict()))
                    f.write("\n")

            # Stage 4: collect the key phrases for this document.
            phrase_list = list(
                set(pytextrank.limit_keyphrases(path_stage2,
                                                phrase_limit=15)))
            phrases = ", ".join(phrase_list)

            # Summary text is computed for parity with the sibling
            # pipelines but only phrase_list is stored in the database.
            sent_iter = sorted(pytextrank.limit_sentences(
                path_stage3, word_limit=150),
                               key=lambda x: x[1])
            s = [pytextrank.make_sentence(sent_text)
                 for sent_text, idx in sent_iter]
            graf_text = " ".join(s)

            collection.update_one({f'{doctype}_id': {
                '$eq': doc_dict['id']
            }}, {'$set': {
                'key_phrases': phrase_list
            }})
        # FIX: was a bare "except:", which also swallows SystemExit and
        # KeyboardInterrupt; catch Exception only.
        except Exception:
            failed_ids.append(doc_dict['id'])
            print('failed on ', doc_dict['id'])
            continue
from pytextrank import limit_keyphrases, limit_sentences, make_sentence
import sys
import json

## Stage 4:
##  * summarize a document based on most significant sentences and key phrases
##
## INPUTS: <stage2> <stage3>
## OUTPUT: Markdown format

if __name__ == "__main__":
    path_stage2 = sys.argv[1]
    path_stage3 = sys.argv[2]

    # Top 30 key phrases, de-duplicated.  Computed for parity with the
    # other stage-4 scripts; only the sentences are written out below.
    phrases = ", ".join({p for p in limit_keyphrases(path_stage2,
                                                     phrase_limit=30)})

    # Selected sentences, restored to document order.
    ordered = sorted(limit_sentences(path_stage3, word_limit=500),
                     key=lambda pair: pair[1])
    sentences = [make_sentence(text) for text, _ in ordered]

    with open('output.json', 'w') as out:
        json.dump({'sentences': sentences}, out)