def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Rank the parsed paragraph with PyTextRank and store its key phrases.

    Parameters
    ==========
    paragraph_output:
        tagged and parsed JSON document (text file); it is handed to both
        ``pytextrank.text_rank`` and ``pytextrank.normalize_key_phrases``
    key_phrases_output:
        output text file (JSON) that receives one pretty-printed key phrase
        per line

    Return
    ======
    The ``(graph, token_ranks)`` pair produced by ``pytextrank.text_rank``
    """
    graph, token_ranks = pytextrank.text_rank(paragraph_output)
    pytextrank.render_ranks(graph, token_ranks)

    with open(key_phrases_output, 'w') as sink:
        ranked_phrases = pytextrank.normalize_key_phrases(
            paragraph_output, token_ranks)
        for phrase in ranked_phrases:
            serialised = pytextrank.pretty_print(phrase._asdict())
            sink.write("%s\n" % serialised)

    return graph, token_ranks
def stage_2():
    """
    Run TextRank stage 2 over every file found in ``stage_1_dir``.

    ``stage_2_dir`` is wiped and recreated, then used as the working
    directory while processing. For each entry of ``stage_1_dir`` the ranks
    are rendered via ``pytextrank.render_ranks`` (using the entry name as
    third argument) and the normalised key phrases are written, one
    pretty-printed record per line, to a UTF-8-with-BOM file of the same
    name inside ``stage_2_dir``.

    On exit the working directory is set to the directory containing this
    module, even when processing fails.
    """
    # abspath first: dirname(__file__) is '' when the script is started from
    # its own directory, and os.chdir('') raises.
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    # Resolve before chdir so a relative stage_1_dir keeps pointing at the
    # directory that was listed.
    data_dir = os.path.abspath(stage_1_dir)
    ids = os.listdir(data_dir)
    result_dir = stage_2_dir

    if os.path.exists(result_dir):
        shutil.rmtree(result_dir, ignore_errors=True)
    # rmtree ignores errors, so the directory may survive; do not fail on it.
    os.makedirs(result_dir, exist_ok=True)

    os.chdir(result_dir)
    try:
        if not os.path.exists('pictures'):
            os.mkdir('pictures')

        for cur_id in ids:
            # Skip entries that already have a result (left over from a
            # partially failed cleanup).
            if os.path.exists(cur_id):
                continue
            # os.path.join instead of a hard-coded "\\" keeps this portable.
            cur_file_name = os.path.join(data_dir, cur_id)
            print(cur_id)

            graph, ranks = pytextrank.text_rank(cur_file_name)
            pytextrank.render_ranks(graph, ranks, cur_id)

            with codecs.open(cur_id, "w+", "utf_8_sig") as out_file:
                for rl in pytextrank.normalize_key_phrases(cur_file_name, ranks):
                    out_file.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    finally:
        os.chdir(cur_dir)
def generateGraph(text, outputfile, outputdir, plotGraph=False):
    """
    Parse ``text``, rank it with TextRank and write its key phrases.

    Parameters
    ----------
    text:
        raw text; passed to ``txtToJson.textTojson`` together with a
        temporary file path inside ``outputdir``
    outputfile:
        name of the key-phrase file created inside ``outputdir``; the part
        before the first ``_`` also names the stage-1 file
        (``<prefix>_o1.json``)
    outputdir:
        directory that receives the temporary, stage-1 and key-phrase files
    plotGraph:
        when true, the graph is drawn twice (``draw_networkx`` and
        ``nx.draw``), each followed by ``plt.show()``
    """
    print('Generating Graph...')

    temp_file = os.path.join(outputdir, 'temp.json')
    path_stage1 = os.path.join(outputdir, outputfile.split("_")[0] + '_o1.json')
    path_stage2 = os.path.join(outputdir, outputfile)

    try:
        # Start by doing statistical parsing/tagging
        txtToJson.textTojson(text, temp_file)
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(temp_file)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        # Collect and normalize the key phrases from the parsed doc
        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)

        # Drop a stale result. The file actually written lives in outputdir,
        # so that is the path to remove, not `outputfile` relative to the
        # current working directory.
        try:
            os.remove(path_stage2)
        except OSError:
            pass

        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
    finally:
        # Best-effort cleanup, also when a stage above failed.
        try:
            os.remove(temp_file)
        except OSError:
            pass

    if plotGraph:
        matplotlib.rcParams['figure.figsize'] = (15.0, 15.0)
        networkx.draw_networkx(graph)
        plt.show()
        nx.draw(graph, with_labels=True)
        plt.show()
def one(text):
    """
    Return the ranked key phrases of ``text`` as ``(phrase, rank)`` tuples.

    The text is wrapped in a one-document JSON file (``tempfile.json``),
    parsed into ``o1.json``, ranked, and the normalised key phrases are
    written to ``o2.json`` (one pretty-printed record per line) while being
    collected for the return value. ``tempfile.json`` is removed afterwards,
    also when a stage fails.
    """
    import json  # local import so this block does not depend on file-level imports

    path_stage0 = "tempfile.json"
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"

    # json.dump escapes quotes, backslashes and newlines in `text`; building
    # the document by string concatenation produced invalid JSON for them.
    with open(path_stage0, "w") as f:
        json.dump({"id": "777", "text": text}, f)

    try:
        with open(path_stage1, 'w') as f:
            for graf in parse_doc(json_iter(path_stage0)):
                f.write("%s\n" % pretty_print(graf._asdict()))

        graph, ranks = text_rank(path_stage1)
        render_ranks(graph, ranks)

        outputs = []
        with open(path_stage2, 'w') as f:
            for rl in normalize_key_phrases(path_stage1, ranks):
                record = rl._asdict()
                f.write("%s\n" % pretty_print(record))
                # Read the fields directly instead of re-parsing the JSON
                # text with ast.literal_eval.
                outputs.append((record["text"], record["rank"]))
    finally:
        os.remove(path_stage0)

    return outputs


# text = "The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."
# print (one("The earliest recorded model for planetary motions proposed by Ptolemy about 2000 years ago was a ‘geocentric’ model in which all celestial objects, stars, the sun and the planets, all revolved around the earth."))
def text_rank(json_request):
    """
    Extract up to 20 key phrases from the RIS payload in ``json_request``.

    The ``TI - ...`` and ``AB - ...`` fields (terminated by ``\\r``) of
    ``json_request['ris']`` are joined with ``.`` into one document, which is
    written to ``ris_extracted.json``, parsed into the module-level
    ``path_stage1`` file and ranked into the module-level ``path_stage2``
    file. Returns the items yielded by ``pytextrank.limit_keyphrases`` as a
    list.
    """
    ris_pattern = re.compile("TI - (.*?)\\r|AB - (.*?)\\r")

    # Each match is a tuple with one entry per capture group; join the groups
    # with spaces and trim the padding left by the empty one.
    sections = [
        ' '.join(groups).strip()
        for groups in re.findall(ris_pattern, json_request['ris'])
    ]

    document = {'id': "0", 'text': '.'.join(sections)}
    with open('ris_extracted.json', 'w') as raw_file:
        json.dump(document, raw_file)

    with open(path_stage1, 'w') as parsed_file:
        parsed = pytextrank.parse_doc(pytextrank.json_iter('ris_extracted.json'))
        for graf in parsed:
            parsed_file.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)

    with open(path_stage2, 'w') as ranked_file:
        for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks):
            ranked_file.write("%s\n" % pytextrank.pretty_print(lexeme._asdict()))

    return list(pytextrank.limit_keyphrases(path_stage2, phrase_limit=20))
def stage2(path_stage1, path_stage2):
    """
    Stage 2: rank the parsed document and store its key phrases.

    ``path_stage1`` is passed to ``pytextrank.text_rank`` and
    ``pytextrank.normalize_key_phrases``; ``path_stage2`` receives one
    pretty-printed key phrase per line.
    """
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    with open(path_stage2, 'w') as ranked_file:
        ranked_file.writelines(
            "%s\n" % pytextrank.pretty_print(lexeme._asdict())
            for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks)
        )
def obj_to_keywords(obj):
    """
    Return the normalised key phrases of ``obj`` as a list of dicts.

    ``obj`` may be a single document or a list of documents; a single
    document is wrapped in a list before it is handed to
    ``pytextrank.parse_doc``.
    """
    docs = obj if isinstance(obj, list) else [obj]

    parsed = list(pytextrank.parse_doc(docs))
    parsed_dicts = [graf._asdict() for graf in parsed]

    # NOTE(review): the ranks are computed from the module-level
    # ``path_stage1`` through the bare ``text_rank`` name, not from the
    # documents parsed above - confirm this is intended.
    graph, ranks = text_rank(path_stage1)

    return [
        phrase._asdict()
        for phrase in pytextrank.normalize_key_phrases(parsed_dicts, ranks)
    ]
def summarize(self, _id, content_text, word_limit):
    """
    Summarise ``content_text`` with the four PyTextRank stages.

    All intermediate files are written below ``process/`` and named after
    ``_id``. Returns a dict with the selected sentences joined by spaces
    under ``'excerpts'`` and up to 12 distinct key phrases joined by ``", "``
    under ``'keywords'``; ``word_limit`` is forwarded to
    ``pytextrank.limit_sentences``.
    """
    self.logger.log("_id: " + _id)
    self.logger.log("word_limit: " + str(word_limit))

    # File names
    base = 'process/' + _id
    path_stage0 = base + '.json'
    path_stage1 = base + '_o1.json'
    path_stage2 = base + '_o2.json'
    path_stage3 = base + '_o3.json'
    path_stage4 = base + '_o4.json'  # assigned but not used below

    # Create input file
    with open(path_stage0, 'w') as raw_doc:
        json.dump({"id": "123", "text": content_text}, raw_doc)

    # Stage 1 - statistical parsing/tagging of the JSON document
    with open(path_stage1, 'w') as parsed_out:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            parsed_out.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2 - collect and normalize the key phrases
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as phrases_out:
        for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks):
            phrases_out.write("%s\n" % pytextrank.pretty_print(lexeme._asdict()))

    # Stage 3 - score each sentence against the key phrases
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as sentences_out:
        for sentence in pytextrank.top_sentences(kernel, path_stage1):
            sentences_out.write(pytextrank.pretty_print(sentence._asdict()))
            sentences_out.write("\n")

    # Stage 4 - assemble the summary from the top sentences and key phrases
    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    ordered = sorted(
        pytextrank.limit_sentences(path_stage3, word_limit=word_limit),
        key=lambda x: x[1])
    graf_text = " ".join(
        [pytextrank.make_sentence(sent_text) for sent_text, _ in ordered])

    return {'excerpts': graf_text, 'keywords': phrases}
def pytrankSummarize(filename):
    """
    Summarise a document with PyTextRank and print the result.

    The pipeline has four stages, each feeding the next:
    1. Parse the document (the original notes describe this as
       part-of-speech tagging and lemmatization) into ``o1.json``.
    2. Rank and normalise the key phrases into ``o2.json``.
    3. Score each sentence against the key phrases into ``o3.json``.
    4. Build the summary from the top sentences (50-word limit) and up to
       12 key phrases.

    ``filename`` is converted by ``createJSON``; its result is used as the
    stage-0 path.
    """
    import pytextrank

    path_stage0 = createJSON(filename)
    path_stage1 = "o1.json"
    path_stage2 = "o2.json"
    path_stage3 = "o3.json"

    with open(path_stage1, 'w') as parsed_out:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            parsed_out.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)
    with open(path_stage2, 'w') as phrases_out:
        for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks):
            phrases_out.write("%s\n" % pytextrank.pretty_print(lexeme._asdict()))

    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as sentences_out:
        for sentence in pytextrank.top_sentences(kernel, path_stage1):
            sentences_out.write(pytextrank.pretty_print(sentence._asdict()))
            sentences_out.write("\n")

    phrases = ", ".join(
        set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=12)))
    ordered = sorted(
        pytextrank.limit_sentences(path_stage3, word_limit=50),
        key=lambda x: x[1])
    graf_text = " ".join(
        [pytextrank.make_sentence(sent_text) for sent_text, _ in ordered])

    print("")
    print("####### From PyTextRank #######")
    print("**excerpts:** %s\n\n**keywords:** %s" % (graf_text, phrases))
def execute_stage_two(path_stage1):
    """
    Run stage 2 for ``path_stage1`` and return the stage-2 file name.

    Both file names are relative to ``PATH_PREFIX``. The stage-2 name is
    derived by splitting ``path_stage1`` on ``.``, replacing the ``stage1``
    component with ``stage2`` and joining the components with ``-``; a
    ``ValueError`` is raised when no component equals ``stage1``.
    """
    stage1_full = os.path.join(PATH_PREFIX, path_stage1)

    graph, ranks = pytextrank.text_rank(stage1_full)
    pytextrank.render_ranks(graph, ranks)

    parts = path_stage1.split('.')
    parts[parts.index('stage1')] = 'stage2'
    path_stage2 = '-'.join(parts)

    with open(os.path.join(PATH_PREFIX, path_stage2), 'w') as phrases_out:
        ranked = pytextrank.normalize_key_phrases(
            stage1_full, ranks, stopwords=stopwords)
        for lexeme in ranked:
            phrases_out.write("%s\n" % pytextrank.pretty_print(lexeme._asdict()))

    return path_stage2
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Rank ``paragraph_output`` and write its key phrases to a file.

    ``key_phrases_output`` receives one pretty-printed key phrase per line.
    Returns the ``(graph, token_ranks)`` pair from ``pytextrank.text_rank``.
    """
    graph, token_ranks = pytextrank.text_rank(paragraph_output)
    pytextrank.render_ranks(graph, token_ranks)

    with open(key_phrases_output, 'w') as phrases_file:
        phrases_file.writelines(
            "%s\n" % pytextrank.pretty_print(phrase._asdict())
            for phrase in pytextrank.normalize_key_phrases(
                paragraph_output, token_ranks)
        )

    return graph, token_ranks
def generate_phrases(self):
    """Append every normalised phrase of ``self.graphs`` (as a plain dict) to ``self.all_phrases``."""
    _, ranks = pytextrank.text_rank(self.graphs)

    # A placeholder stopword list is passed on purpose: we do not want
    # pytextrank to remove stopwords at this point.
    placeholder_stopwords = ["not_a_word______"]

    ranked = pytextrank.normalize_key_phrases(
        self.graphs, ranks, stopwords=placeholder_stopwords)
    for phrase in ranked:
        self.all_phrases.append(dict(phrase._asdict()))
def stage_2_multiprocess(args):
    """
    Normalise key phrases for one work item and pickle the result.

    ``args`` is a sequence ``(fake_json_graph_dicts, ranks, thread_num)``.
    Every ranked phrase is printed and collected as a one-element list
    holding its dict form; the collection is pickled to
    ``<directory>/Stage2Results/<publisher>_<version>_textRank_<thread_num>_rl.Stage2.``
    using the module-level ``directory``, ``publisher`` and ``version``.
    """
    fake_json_graph_dicts = args[0]
    ranks = args[1]
    thread_num = args[2]

    rl_fake_json = []
    for rl in pytextrank.normalize_key_phrases(fake_json_graph_dicts, ranks,
                                               stopwords=RAKE.SmartStopList()):
        print(pytextrank.pretty_print(rl))
        rl_fake_json.append([rl._asdict()])

    stage_2_filename = "{publisher}_{version}_textRank_{thread_num}_rl.Stage2.".format(
        version=version, thread_num=thread_num, publisher=publisher)
    stage_2_out = os.path.join(directory, "Stage2Results", stage_2_filename)

    # Use a context manager so the handle is flushed and closed
    # deterministically instead of being left to garbage collection.
    with open(stage_2_out, 'wb') as out_file:
        pickle.dump(rl_fake_json, out_file)
def collect_and_normalise_key_phrases(self, paragraph_output, key_phrases_output):
    """
    Collect and normalise the key phrases of the parsed paragraph document.

    The document is ranked with PyTextRank, the ranks are rendered, and each
    normalised key phrase is written as one line to ``key_phrases_output``.
    Returns the graph and the ranked tokens.
    """
    ranked = pytextrank.text_rank(paragraph_output)
    graph, token_ranks = ranked
    pytextrank.render_ranks(graph, token_ranks)

    with open(key_phrases_output, 'w') as out:
        phrases = pytextrank.normalize_key_phrases(paragraph_output, token_ranks)
        for item in phrases:
            line = "%s\n" % pytextrank.pretty_print(item._asdict())
            out.write(line)

    return graph, token_ranks
def summarize_text(input_file):
    """
    Return an extractive summary (at most 120 words) of ``input_file``.

    ``input_file`` is read through ``pytextrank.json_iter``. Intermediate
    results are written to ``stage1.txt``, ``stage2.txt`` and ``stage3.txt``
    in the working directory; PyTextRank exchanges data between stages
    through these files. The selected sentences are returned joined by
    single spaces, ordered by the second element of each
    ``limit_sentences`` item.
    """
    path_stage0 = input_file
    path_stage1 = 'stage1.txt'
    path_stage2 = 'stage2.txt'
    path_stage3 = 'stage3.txt'

    # Stage 1: statistical parsing/tagging
    with open(path_stage1, 'w') as f:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: ranked key phrases
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)
    with open(path_stage2, 'w') as f:
        for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    # Stage 3: sentence scores
    kernel = pytextrank.rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in pytextrank.top_sentences(kernel, path_stage1):
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    # Stage 4: only the sentences are returned, so the key-phrase string the
    # original built here (and never used) is no longer computed.
    sent_iter = sorted(pytextrank.limit_sentences(path_stage3, word_limit=120),
                       key=lambda x: x[1])
    return " ".join(
        pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter)
def text_ranking(video_seg_id, book_segment):
    """
    Rank the key phrases and key sentences of one book segment.

    Intermediate files are written to ``TextRank_data/seg<video_seg_id>/``.

    :param video_seg_id: identifier used to name the per-segment directory
    :param book_segment: book segment in json format
    :return: key sentences and key phrases
    """
    # One directory per segment keeps the output structure clean.
    # makedirs(exist_ok=True) creates both levels and avoids the
    # check-then-create race of separate exists()/mkdir() calls.
    seg_dir = 'TextRank_data/seg' + str(video_seg_id)
    os.makedirs(seg_dir, exist_ok=True)

    subdir = seg_dir + '/'
    path_stage1 = subdir + "stage1.json"
    path_stage2 = subdir + "stage2_key_ph.json"
    path_stage3 = subdir + "stage3_imp_sent.json"

    # Perform statistical parsing/tagging on a document in JSON format
    parse_book_seg = pytextrank.parse_doc([book_segment])
    with open(path_stage1, 'w') as f:
        for graf in parse_book_seg:
            f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    graph, ranks = pytextrank.text_rank(path_stage1)

    # Collect and normalize the key phrases from a parsed document
    key_phrases = list(pytextrank.normalize_key_phrases(path_stage1, ranks))
    with open(path_stage2, 'w') as f:
        for rl in key_phrases:
            f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

    kernel = pytextrank.rank_kernel(path_stage2)

    # Calculate a significance weight for each sentence, using MinHash to
    # approximate a Jaccard distance from key phrases determined by TextRank
    key_sentences = list(pytextrank.top_sentences(kernel, path_stage1))
    with open(path_stage3, 'w') as f:
        for s in key_sentences:
            f.write(pytextrank.pretty_print(s._asdict()))
            f.write("\n")

    return key_sentences, key_phrases
def _get_keywords(path_stage0, path_stage2):
    """
    Return a ``{phrase text: rank}`` dict for the document at ``path_stage0``.

    The parsed document is written to ``o1.json``. ``path_stage2`` is opened
    for writing (so it is created or truncated) but nothing is written to
    it.
    """
    path_stage1 = 'o1.json'

    # Stage 1: parse doc
    with open(path_stage1, 'w') as parsed_out:
        for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
            parsed_out.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

    # Stage 2: rank words
    graph, ranks = pytextrank.text_rank(path_stage1)
    pytextrank.render_ranks(graph, ranks)

    result_dict = {}
    with open(path_stage2, 'w'):
        for lexeme in pytextrank.normalize_key_phrases(path_stage1, ranks):
            fields = lexeme._asdict()
            rank = fields['rank']
            result_dict[fields['text']] = rank
    return result_dict
def retrieveSentences(content, word_limit):
    """
    Return the top sentences of ``content`` as a list of strings.

    A uniquely named scratch directory is created in the current working
    directory for the intermediate PyTextRank files and removed again before
    returning, also when one of the stages raises. ``word_limit`` is
    forwarded to ``pytextrank.limit_sentences``; the sentences are ordered
    by the second element of each item it yields.
    """
    folder = os.path.join(os.getcwd(), str(uuid.uuid4()))
    os.mkdir(folder)
    try:
        fname = str(uuid.uuid4())
        path_stage0 = "{0}/{1}.json".format(folder, fname)
        path_stage1 = "{0}/o1.json".format(folder)
        path_stage2 = "{0}/o2.json".format(folder)
        path_stage3 = "{0}/o3.json".format(folder)

        # The `with` blocks close their files; no explicit close() needed.
        with open(path_stage0, "w") as f:
            f.write(json.dumps({"id": fname, "text": content}))

        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        sent_iter = sorted(
            pytextrank.limit_sentences(path_stage3, word_limit=word_limit),
            key=lambda x: x[1])
        return [pytextrank.make_sentence(sent_text)
                for sent_text, idx in sent_iter]
    finally:
        # Previously the scratch directory leaked whenever a stage failed.
        shutil.rmtree(folder)
def rank_bill(bill):
    """
    Run PyTextRank stages 1-3 for one bill.

    ``bill`` must contain ``'bill_id'`` and is passed as the single document
    to ``parse_doc``. Three files are written below the module-level
    ``prefix``: ``<bill_id>_stage1`` (parsed document), ``<bill_id>_stage2``
    (key phrases) and ``<bill_id>_stage3`` (top sentences), each holding one
    pretty-printed record per line.
    """
    bill_id = bill['bill_id']
    path_stage1 = prefix + '/{}_stage1'.format(bill_id)
    path_stage2 = prefix + '/{}_stage2'.format(bill_id)
    path_stage3 = prefix + '/{}_stage3'.format(bill_id)

    with open(path_stage1, 'w') as f:
        for graf in parse_doc([bill]):
            f.write(pretty_print(graf._asdict()))
            f.write('\n')

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    # Open the file once and write every phrase on its own line. The
    # original ended up with only the last phrase in the file, which starved
    # rank_kernel of all the others.
    with open(path_stage2, 'w') as f:
        for rl in normalize_key_phrases(path_stage1, ranks):
            f.write(pretty_print(rl._asdict()))
            f.write('\n')

    kernel = rank_kernel(path_stage2)
    with open(path_stage3, 'w') as f:
        for s in top_sentences(kernel, path_stage1):
            f.write(pretty_print(s._asdict()))
            # Separate the records; without it they run together on one
            # line, unlike the stage-1 output above.
            f.write('\n')
path_stage1 = "o1.json" # Extract keyword using pytextrank with open(path_stage1, 'w') as f: for graf in pytextrank.parse_doc(pytextrank.json_iter(path_stage0)): f.write("%s\n" % pytextrank.pretty_print(graf._asdict())) #print(pytextrank.pretty_print(graf._asdict())) path_stage1 = "o1.json" path_stage2 = "o2.json" graph, ranks = pytextrank.text_rank(path_stage1) pytextrank.render_ranks(graph, ranks) with open(path_stage2, 'w') as f: for rl in pytextrank.normalize_key_phrases(path_stage1, ranks): f.write("%s\n" % pytextrank.pretty_print(rl._asdict())) #print(pytextrank.pretty_print(rl)) path_stage1 = "o1.json" path_stage2 = "o2.json" path_stage3 = "o3.json" kernel = pytextrank.rank_kernel(path_stage2) with open(path_stage3, 'w') as f: for s in pytextrank.top_sentences(kernel, path_stage1): f.write(pytextrank.pretty_print(s._asdict())) f.write("\n") # to view output in this notebook print(pytextrank.pretty_print(s._asdict()))
def insert_key_phrases_into_db(list_of_doc_dicts, doctype, collection):
    '''
    Takes in list of doc dictionaries and a doctype ('comment' or 'post'),
    processes each doc with PyTextRank, obtains key phrases and inserts
    key phrases into document in Mongodb as 'key_phrases' field.

    Each document's ``'text'`` is first cut at the ``'\\n_____\\n\\n'``
    marker (the dict is modified in place). Intermediate results go to
    ``stage0.json`` .. ``stage3.json`` in the working directory. Up to 15
    distinct key phrases are stored via ``collection.update_one`` on the
    record whose ``<doctype>_id`` equals the document's ``'id'``.

    Documents that fail are reported and skipped. Returns the list of ids
    of the failed documents.
    '''
    path_stage0 = 'stage0.json'
    path_stage1 = 'stage1.json'
    path_stage2 = 'stage2.json'
    path_stage3 = 'stage3.json'

    total_docs = len(list_of_doc_dicts)
    failed_ids = []

    for i, doc_dict in enumerate(list_of_doc_dicts):
        if i % 50 == 0:
            print(f'processing {i} of {total_docs} documents')
        doc_dict['text'] = doc_dict['text'].split('\n_____\n\n')[0]

        try:
            with open(path_stage0, 'w') as f:
                json.dump(doc_dict, f)

            # Stage 1
            with open(path_stage1, 'w') as f:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter(path_stage0)):
                    f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

            # Stage 2
            graph, ranks = pytextrank.text_rank(path_stage1)
            pytextrank.render_ranks(graph, ranks)
            with open(path_stage2, 'w') as f:
                for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                    f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

            # Stage 3
            kernel = pytextrank.rank_kernel(path_stage2)
            with open(path_stage3, 'w') as f:
                for s in pytextrank.top_sentences(kernel, path_stage1):
                    f.write(pytextrank.pretty_print(s._asdict()))
                    f.write("\n")

            # Stage 4: only the key phrases are stored; the sentence summary
            # the original assembled here was never used.
            phrase_list = list(
                set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=15)))

            collection.update_one(
                {f'{doctype}_id': {'$eq': doc_dict['id']}},
                {'$set': {'key_phrases': phrase_list}})
        except Exception:
            # Best effort per document, but no longer swallows
            # KeyboardInterrupt/SystemExit as the bare `except:` did.
            doc_id = doc_dict.get('id')
            failed_ids.append(doc_id)
            print('failed on ', doc_id)
            continue

    return failed_ids
def do_pytextrank(data):
    """
    Print PyTextRank key phrases and heuristic results for every sub-item.

    ``data`` maps keys to iterables of sub-items; each sub-item provides
    ``'id'`` and ``'description'``. For every sub-item the description is
    run through PyTextRank stages 1 and 2, the ranked phrases are printed,
    and ``do_heuristic`` is applied first to the phrases whose
    second-to-last field does not contain ``'nn'`` ("i/o input") and then to
    those that do ("i/o output"). When either group is empty the rest of
    that sub-item is skipped.
    """
    scratch_files = ('sub_item.json', 'stage1_output.json',
                     'stage2_output.json', 'graph.dot')

    for item in data:
        for subItem in data[item]:
            print('###############')
            print('description:', subItem['description'])

            # using pytextrank
            # reference https://github.com/ceteri/pytextrank/issues/18
            # raw input
            subItemJSON = json.dumps(
                {'id': subItem['id'], 'text': subItem['description']})
            with open('sub_item.json', 'w') as outFile:
                outFile.write(subItemJSON)

            # stage 1
            with open('stage1_output.json', 'w') as outFile:
                for graf in pytextrank.parse_doc(
                        pytextrank.json_iter('sub_item.json')):
                    outFile.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

            # stage 2
            graph, ranks = pytextrank.text_rank('stage1_output.json')
            pytextrank.render_ranks(graph, ranks)

            rlLists = []
            print('key phrases:')
            for rl in pytextrank.normalize_key_phrases(
                    'stage1_output.json', ranks):
                # pretty_print emits JSON, so parse it as JSON; eval() would
                # execute the text and breaks on null/true/false.
                rlList = json.loads(pytextrank.pretty_print(rl))
                rlLists.append(rlList)
                print(rlList)

            # cleanup: portable replacement for `rm -f`, missing files are
            # ignored (stage2_output.json is no longer created, since
            # nothing was ever written to it)
            for scratch in scratch_files:
                try:
                    os.remove(scratch)
                except OSError:
                    pass

            # input filter results based on pos
            # this is a heuristic (x[-2] is presumably the POS field -
            # verify against the RankedLexeme layout)
            filteredRlLists = [x for x in rlLists if 'nn' not in x[-2]]
            if not filteredRlLists:
                # invalid case
                continue
            heuristic, iOItem = do_heuristic(subItem, filteredRlLists)
            print('heuristic:', heuristic)
            print('i/o input:', iOItem)

            # output filter results based on pos
            # this is a heuristic
            filteredRlLists = [x for x in rlLists if 'nn' in x[-2]]
            if not filteredRlLists:
                # invalid case
                continue
            heuristic, iOItem = do_heuristic(subItem, filteredRlLists)
            print('heuristic:', heuristic)
            print('i/o output:', iOItem)

            print('###############')
# Script fragment: stage 1 and stage 2 of the PyTextRank pipeline, followed
# by a plot of the rank graph. `path_stage0`, `path_stage1` and `path_dir`
# are defined outside this fragment.

# Stage 1: parse the document and store one pretty-printed record per line.
with open(path_stage1, 'w') as f:
    grafs = pytextrank.parse_doc(pytextrank.json_iter(path_stage0))
    for graf in grafs:
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
        # to view output in this notebook
        # print(pytextrank.pretty_print(graf))

# path_stage1 = path_dir + "o1.json"
path_stage2 = path_dir + "o2.json"

# Stage 2: rank and normalise the key phrases.
# NOTE(review): `grafs` was already iterated by the loop above; if
# parse_doc returns a one-shot iterator there is nothing left for text_rank
# and normalize_key_phrases to read here - TODO confirm.
graph, ranks = pytextrank.text_rank(grafs)
pytextrank.render_ranks(graph, ranks)

with open(path_stage2, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(grafs, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
        # to view output in this notebook
        print(pytextrank.pretty_print(rl))

# Draw the rank graph with node labels.
import networkx as nx
import pylab as plt

nx.draw(graph, with_labels=True)
plt.show()

path_stage1 = path_dir + "o1.json"
path_stage2 = path_dir + "o2.json"
path_stage3 = path_dir + "o3.json"

# Stage 3 starts here: build the kernel from the stage-2 key phrases.
kernel = pytextrank.rank_kernel(path_stage2)
def extract_phrasesfrom_textrank(corpus):
    """
    Extract up to five key phrases per sentence of ``corpus``.

    Every entry of ``corpus`` is run on its own through the PyTextRank
    stages (intermediate files ``celebrity1_tweet.json`` ..
    ``celebrity3_tweet.json``). Two frames are built, written as CSV and
    returned as ``(new_df_tweet, cleaned_df_tweet)``:

    * ``new_df_tweet`` (columns ``text``, ``keywords``): every entry with
      its comma-joined key phrases, written to ``all_phrasefile_path``
    * ``cleaned_df_tweet`` (columns ``sentences``, ``keywords``): only the
      entries whose key-phrase string contains one of the celebrity names,
      written to ``phrase_filepath``
    """
    record_data = pd.DataFrame({'sentences': corpus})
    record_data = pd.DataFrame({
        'id': record_data.index.tolist(),
        'text': record_data['sentences'].tolist()
    })
    tweet_items = record_data.to_dict(orient='records')

    path_stage1 = "celebrity1_tweet.json"
    path_stage2 = "celebrity2_tweet.json"
    path_stage3 = "celebrity3_tweet.json"

    # Collect plain rows and build each frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for item in tweet_items:
        with open(path_stage1, 'w') as f:
            for graf in pytextrank.parse_doc([item]):
                f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))

        graph, ranks = pytextrank.text_rank(path_stage1)
        pytextrank.render_ranks(graph, ranks)
        with open(path_stage2, 'w') as f:
            for rl in pytextrank.normalize_key_phrases(path_stage1, ranks):
                f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))

        kernel = pytextrank.rank_kernel(path_stage2)
        with open(path_stage3, 'w') as f:
            for s in pytextrank.top_sentences(kernel, path_stage1):
                f.write(pytextrank.pretty_print(s._asdict()))
                f.write("\n")

        phrases = ", ".join(
            set(pytextrank.limit_keyphrases(path_stage2, phrase_limit=5)))
        rows.append({'text': item.get('text'), 'keywords': phrases})

    new_df_tweet = pd.DataFrame(rows, columns=['text', 'keywords'])

    celeb_list = [
        'Bradley Cooper', 'Chris Kyle', 'Clint Eastwood', 'bradley cooper',
        'bradley', 'cooper', 'chris kyle', 'chris', 'kyle', 'clint eastwood',
        'clint', 'eastwood'
    ]
    # Substring match against the joined key-phrase string.
    celeb_rows = [
        {'sentences': row['text'], 'keywords': row['keywords']}
        for row in rows
        if any(celeb in row['keywords'] for celeb in celeb_list)
    ]
    cleaned_df_tweet = pd.DataFrame(celeb_rows,
                                    columns=['sentences', 'keywords'])

    cleaned_df_tweet.to_csv(phrase_filepath, sep=',', encoding='utf-8',
                            index=False)
    new_df_tweet.to_csv(all_phrasefile_path, sep=',', encoding='utf-8',
                        index=False)
    return new_df_tweet, cleaned_df_tweet
def keyPhrases():
    """Rank ``temp2.json`` and write its normalised key phrases, one per line, to ``temp3.json``."""
    parsed_path = 'temp2.json'

    graph, ranks = pytextrank.text_rank(parsed_path)
    pytextrank.render_ranks(graph, ranks)

    with open('temp3.json', 'w') as phrases_out:
        phrases_out.writelines(
            "%s\n" % pytextrank.pretty_print(lexeme._asdict())
            for lexeme in pytextrank.normalize_key_phrases(parsed_path, ranks)
        )
'for components of a minimal set of solutions and algorithms of construction of ' + \ 'minimal generating sets of solutions for all types of systems are given. ' + \ 'These criteria and the corresponding algorithms for constructing a minimal ' + \ 'supporting set of solutions can be used in solving all the considered types ' + \ 'systems and systems of mixed types.' someothertext = 'Amazon.com, Inc. is located in Seattle, WA and was founded July 5th, 1994 by Jeff Bezos, ' + \ 'allowing customers to buy everything from books to blenders. Seattle is north of Portland and ' + \ 'south of Vancouver, BC. Other notable Seattle - based companies are Starbucks and Boeing.' docs = [{'text': sometext, 'id': 777}] grafs = [{'graf': graf.graf} for graf in pytextrank.parse_doc(docs)] graph, ranks = pytextrank.text_rank(grafs) rank_list = [ rl._asdict() for rl in pytextrank.normalize_key_phrases(grafs, ranks, skip_ner=False) ] kernel = pytextrank.rank_kernel(rank_list) sents = [s._asdict() for s in pytextrank.top_sentences(kernel, grafs)] phrases = [ p[0] for p in pytextrank.limit_keyphrases(rank_list, phrase_limit=6) ] sent_iter = sorted(pytextrank.limit_sentences(sents, word_limit=150), key=lambda x: x[1]) sents = [pytextrank.make_sentence(sent_text) for sent_text, idx in sent_iter] graf_text = ' '.join(sents) print("\n**excerpts:** %s\n\n**keywords:** %s" % ( graf_text, phrases,
# pytextrank.render_ranks(graph, ranks)

# stage 2: normalize key phrases
# Script fragment: `directory`, `version`, `publisher`,
# `fake_json_graph_dicts` and `ranks` are defined outside this fragment.
stage_2_directory = os.path.join(directory, "Stage2Results")
stage_2_files = find_files(stage_2_directory, "*.Stage2")
stage_2_filename = "{publisher}_{version}_textRank.normalizedKeyPhrases.".format(
    version=version, publisher=publisher)
stage_2_out = os.path.join(directory, "Stage2Results", "agglomerated",
                           stage_2_filename)

rl_fake_json = []  # stage 2 output
# Only recompute when the agglomerated result does not exist yet.
if not os.path.isfile(stage_2_out):
    counter = 0
    for rl in pytextrank.normalize_key_phrases(
            fake_json_graph_dicts, ranks, stopwords=RAKE.SmartStopList()):
        # print(pytextrank.pretty_print(rl))
        rl_fake_json.append([rl._asdict()])

        # Each ranked phrase is also pickled to its own numbered file.
        stage_2_rl_filename = "___{publisher}_{version}_textRank_{thread_num}_rl.Stage2.".format(
            version=version, thread_num=counter, publisher=publisher)
        stage_2_rl_out = os.path.join(directory, "Stage2Results",
                                      stage_2_rl_filename)
        # Close every handle explicitly; the original leaked one open file
        # per phrase until garbage collection.
        with open(stage_2_rl_out, 'wb') as rl_file:
            pickle.dump([rl._asdict()], rl_file)
        counter += 1

    # stage_2_args = [[fake_json_graph_dict, ranks, i]
    #                 for i, fake_json_graph_dict in
    #                 zip([k for k in range(0, len(fake_json_graph_dicts))],
    #                     fake_json_graph_dicts)]
# Script fragment: summarise a user's tweets with PyTextRank. The upper-case
# names and `user_tweets` are defined outside this fragment.

# Start from an empty output directory.
if os.path.exists(OUTOUT_DIRECTORY):
    shutil.rmtree(OUTOUT_DIRECTORY)
os.makedirs(OUTOUT_DIRECTORY)

# All tweets are joined into one document with the fixed id '777'.
print('Saving tweets to json...')
with open(TWEETS_JSON, 'w', encoding='utf8') as outfile:
    json.dump({'id': '777', 'text': '. '.join(user_tweets)}, outfile, ensure_ascii=False)
print('Saving tweets to json - Done')

# Stage 1: parse the document and store one pretty-printed record per line.
print('Performing statistical parsing/tagging on tweets...')
with open(STATISTICAL_PARSING_OUTPUT, 'w') as f:
    for graf in pytextrank.parse_doc(pytextrank.json_iter(TWEETS_JSON)):
        f.write("%s\n" % pytextrank.pretty_print(graf._asdict()))
print('Performing statistical parsing/tagging on tweets... - Done')

# Stage 2: rank the parsed document and store the normalised key phrases.
print('Collect and normalizing the key phrases from the parsed document...')
graph, ranks = pytextrank.text_rank(STATISTICAL_PARSING_OUTPUT)
pytextrank.render_ranks(graph, ranks)

with open(KEY_PHRASES_NORMALIZATION_OUTPUT, 'w') as f:
    for rl in pytextrank.normalize_key_phrases(STATISTICAL_PARSING_OUTPUT, ranks):
        f.write("%s\n" % pytextrank.pretty_print(rl._asdict()))
print('Collect and normalizing the key phrases from the parsed document... - Done')

# Report at most MAX_SUBJECTS_TO_SHOW distinct key phrases; the label below
# says "Top-10" regardless of that value.
print("Summarizing tweets based on key phrases...")
phrases = ", ".join(set([p for p in pytextrank.limit_keyphrases(KEY_PHRASES_NORMALIZATION_OUTPUT, phrase_limit=MAX_SUBJECTS_TO_SHOW)]))
print("**Top-10 subjects:** %s" % phrases)
#!/usr/bin/env python
# encoding: utf-8

from pytextrank import normalize_key_phrases, pretty_print, render_ranks, text_rank
import sys

## Stage 2:
##  * collect and normalize the key phrases from a parsed document
##
## INPUTS: <stage1>
## OUTPUT: JSON format `RankedLexeme(text, rank, ids, pos)`

if __name__ == "__main__":
    # Fail with a usage message instead of an IndexError traceback when the
    # stage-1 path is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s <stage1>" % sys.argv[0])

    path_stage1 = sys.argv[1]

    graph, ranks = text_rank(path_stage1)
    render_ranks(graph, ranks)

    # One pretty-printed key phrase per line on stdout.
    for rl in normalize_key_phrases(path_stage1, ranks):
        # print (rl)
        print(pretty_print(rl._asdict()))