import lda
import numpy as np
import textblob
import takahe


def lda_test(x, n_topics, n_top_sent, sent_length, num_best, vocab, sentences):
    # Fit an LDA topic model on the document-term matrix x.
    model = lda.LDA(n_topics=n_topics, n_iter=800, random_state=1)
    model.fit_transform(x)
    topic_word = model.topic_word_
    n_top_words = 5
    final1 = ''
    final_out = ''

    # Print the top words for each topic.
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # For each topic, collect its top sentences and POS-tag them with the
    # external parse_sentence helper so takahe can consume them.
    s = []
    for i, doc_dist in enumerate(model.doc_topic_.transpose()):
        s1 = []
        top_sentences = np.array(sentences)[np.argsort(doc_dist)][:-n_top_sent:-1]
        print('{}: {}'.format(i, ' '.join(top_sentences)))
        t = textblob.TextBlob(' '.join(top_sentences))
        for sent in t.sentences:
            s1.append(parse_sentence(sent))
        s.append(s1)

    # Compress each topic's sentence cluster with takahe.
    for s1 in s:
        print(s1)
        print("------")
        compresser = takahe.word_graph(s1, nb_words=sent_length,
                                       lang='en', punct_tag="PUNCT")
        # Get the num_best best paths
        candidates = compresser.get_compression(num_best)

        # 1. Rerank compressions by path length (Filippova's method):
        #    keep the candidate with the highest cumulative score.
        maxim1 = 0
        best_cand1 = ''
        for cumulative_score, path in candidates:
            if cumulative_score > maxim1:
                maxim1 = cumulative_score
                best_cand1 = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand1)
        final1 += best_cand1

        # Write the word graph in the dot format
        compresser.write_dot('test.dot')

        # 2. Rerank compressions by keyphrases (Boudin and Morin's method):
        #    keep the highest-scoring reranked candidate.
        reranker = takahe.keyphrase_reranker(s1, candidates, lang='en')
        reranked_candidates = reranker.rerank_nbest_compressions()
        maxim = 0
        best_cand = ''
        for score, path in reranked_candidates:
            if score > maxim:
                maxim = score
                best_cand = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand)
        final_out += best_cand

    print(final1)
    print(final_out)
    return final1
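# A minimal usage sketch for lda_test (an assumption: the original does not
# show how x, vocab, and sentences were built). The document-term matrix is
# derived here with scikit-learn's CountVectorizer, whose vocabulary supplies
# the vocab argument; parse_sentence is still expected to POS-tag each
# sentence into takahe's word/TAG format.
from sklearn.feature_extraction.text import CountVectorizer

docs = [
    "Hillary Clinton visited China last Monday.",
    "Hillary Clinton wanted to visit China last month.",
    "The wife of the former U.S. president visited China.",
]
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(docs).toarray()    # integer count matrix for lda.LDA
vocab = vectorizer.get_feature_names_out()      # column index -> word
summary = lda_test(x, n_topics=2, n_top_sent=3, sent_length=6,
                   num_best=50, vocab=vocab, sentences=docs)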
def keyphrases_based_msc(sentences, output_sent_num=50):
    """
    Multi-sentence compression reranked by keyphrases.
    :param sentences: sentences to compress
    :param output_sent_num: number of output sentences
    :return: list of 'score#sentence' strings
    """
    # Build the word graph and run the compression
    # (compressions shorter than 8 words are discarded).
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en',
                                   punct_tag="PUNCT")
    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)
    # Rescore the candidates with keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')
    return results
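# takahe expects each input sentence to be pre-tokenized and POS-tagged, one
# whitespace-separated word/TAG pair per token, with punctuation carrying the
# tag passed as punct_tag. A small usage sketch (tagged sentences adapted
# from takahe's documentation):
tagged = [
    "Hillary/NNP Clinton/NNP wanted/VBD to/TO visit/VB China/NNP in/IN "
    "November/NNP ./PUNCT",
    "Hillary/NNP Clinton/NNP paid/VBD a/DT visit/NN to/TO the/DT People/NNP "
    "Republic/NNP of/IN China/NNP on/IN Monday/NNP ./PUNCT",
]
for line in keyphrases_based_msc(tagged, output_sent_num=10):
    score, _, sentence = line.rstrip('\n').partition('#')
    print(score, sentence)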
def event_keyphrase_based_msc(sentences, output_sent_num=50):
    """
    Build the word graph from events and rerank the output sentences
    with keyphrases.
    :param sentences: sentences to compress
    :param output_sent_num: number of output sentences
    :return: 'score#sentence' strings
    """
    # Build the event-based word graph (panda module) and run the compression
    # (compressions shorter than 8 words are discarded).
    compresser = panda.word_graph(sentences, nb_words=8, lang='en',
                                  punct_tag="PUNCT")
    # Get the compression candidates
    candidates = compresser.get_compression(output_sent_num)
    # Rescore the candidates with keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')
    return results
def _dofuse(cluster):
    """
    Extracts the call to takahe so it can be interrupted if it takes too long.
    """
    fuser = takahe.word_graph(cluster, nb_words=6, lang="en",
                              punct_tag="PUNCT")
    # Get the 50 best fusions
    fusions = fuser.get_compression(50)
    # Rerank them and keep the top 10
    reranker = takahe.keyphrase_reranker(cluster, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:10]
    return rerankedfusions
def _dofuse(sentenceL):
    """
    Extracts the call to takahe so it can be interrupted if it takes too long.
    """
    fuser = takahe.word_graph(sentenceL, nb_words=6, lang="en",
                              punct_tag="PUNCT")
    # Get the 50 best fusions
    fusions = fuser.get_compression(50)
    # Rerank them and keep the top 5
    reranker = takahe.keyphrase_reranker(sentenceL, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:5]
    return rerankedfusions
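# Both _dofuse variants exist so the takahe call can be aborted when building
# the word graph takes too long. One possible wrapper (an assumption: the
# original timeout code is not shown) runs the fusion in a worker process
# with a hard deadline:
import multiprocessing

def fuse_with_timeout(cluster, timeout=30):
    """Run _dofuse in a subprocess; return [] if it exceeds timeout seconds."""
    with multiprocessing.Pool(processes=1) as pool:
        async_result = pool.apply_async(_dofuse, (cluster,))
        try:
            return async_result.get(timeout=timeout)
        except multiprocessing.TimeoutError:
            return []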
def get_compressed_sen(sentences, nb_words):
    # Note: punctuation is tagged "." here (Penn Treebank style)
    # rather than "PUNCT".
    compresser = takahe.word_graph(sentences, nb_words=nb_words,
                                   lang='en', punct_tag=".")
    # Get the 3 best paths
    candidates = compresser.get_compression(3)
    # Rerank the candidates by keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()
    # Return the top-ranked compression, or a blank string if reranking
    # produced no candidates.
    if len(reranked_candidates) > 0:
        score, path = reranked_candidates[0]
        result = ' '.join([u[0] for u in path])
    else:
        result = ' '
    return result
################################################################################
import takahe

# Create a word graph from the set of sentences with parameters:
#  - minimal number of words in the compression: 6
#  - language of the input sentences: en (English)
#  - POS tag for punctuation marks: PUNCT
compresser = takahe.word_graph(sentences, nb_words=6, lang='en',
                               punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:
    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)
    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
reranked_candidates = reranker.rerank_nbest_compressions()

# Loop over the best reranked candidates
for score, path in reranked_candidates:
    # Print score and compression
    print(round(score, 3), ' '.join([u[0] for u in path]))
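# The file produced by write_dot() is plain Graphviz; the word graph can be
# rendered for inspection with, for example:
#   dot -Tpng test.dot -o test.png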
compresser = takahe.word_graph(sentences,
                               nb_words=6,  # assumed; the original snippet begins mid-call
                               lang='en',
                               punct_tag="PUNCT")

# Get the single best path
candidates = compresser.get_compression(1)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:
    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)
    print(path)
    # Print normalized score and compression
    # print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
reranked_candidates = reranker.rerank_nbest_compressions()

# Loop over the best reranked candidates
for score, path in reranked_candidates:
    # Print the best reranked candidates
    print(path)
    # print(round(score, 3), ' '.join([u[0] for u in path]))
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:
    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)
    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(textList, candidates, lang='en')
reranked_candidates = reranker.rerank_nbest_compressions()

# Take the last entry of the reranked list and join its path into a string
b = reranked_candidates[len(reranked_candidates) - 1]
print(b[1])
gg = ' '.join([u[0] for u in b[1]])

# Loop over the best reranked candidates
for score, path in reranked_candidates:
    # Print the best reranked candidates
    print(round(score, 3), ' '.join([u[0] for u in path]))

# Send the text to a grammar checker.
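# A sketch of that grammar-checking step (an assumption: the original does
# not say which checker was used). The third-party language_tool_python
# package wraps LanguageTool and can flag issues in the fused sentence gg:
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
matches = tool.check(gg)
print(len(matches), 'possible grammar issues')
for match in matches[:5]:
    print(match.ruleId, match.message)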