Example #1
import lda
import numpy as np
import textblob
import takahe

def lda_test(x, n_topics, n_top_sent, sent_length, num_best, vocab, sentences):
    model = lda.LDA(n_topics=n_topics, n_iter=800, random_state=1)
    model.fit(x)  # populates topic_word_ and doc_topic_
    topic_word = model.topic_word_
    n_top_words = 5
    final1 = ''
    final_out = ''
    for i, topic_dist in enumerate(topic_word):
        # Take the n_top_words highest-probability words for this topic
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    s = []
    for i, doc_dist in enumerate(model.doc_topic_.transpose()):
        s1 = []
        # Take the n_top_sent sentences most strongly associated with topic i
        top_sentences = np.array(sentences)[np.argsort(doc_dist)][:-(n_top_sent + 1):-1]
        print('{}: {}'.format(i, ' '.join(top_sentences)))
        t = textblob.TextBlob(' '.join(top_sentences))
        for sent in t.sentences:
            # parse_sentence (defined elsewhere) is assumed to return the
            # POS-tagged 'word/TAG' string that takahe expects as input
            s1.append(parse_sentence(sent))
        s.append(s1)
    for s1 in s:
        print(s1)
        print("------")
        compresser = takahe.word_graph(s1, nb_words=sent_length, lang='en', punct_tag="PUNCT")
        # Get the num_best best paths
        candidates = compresser.get_compression(num_best)
        # 1. Rerank compressions by path length (Filippova's method):
        # normalize each path's cost by its length and keep the lowest,
        # since get_compression returns shortest-path costs (lower is better)
        best_score1 = float('inf')
        best_cand1 = ''
        for cumulative_score, path in candidates:
            normalized_score = cumulative_score / len(path)
            if normalized_score < best_score1:
                best_score1 = normalized_score
                best_cand1 = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand1)
        final1 += best_cand1
        # Write the word graph in the dot format
        compresser.write_dot('test.dot')
        # 2. Rerank compressions by keyphrases (Boudin and Morin's method),
        # where higher reranked scores are better
        reranker = takahe.keyphrase_reranker(s1, candidates, lang='en')
        reranked_candidates = reranker.rerank_nbest_compressions()
        # Keep the highest-scoring reranked candidate
        maxim = 0
        best_cand = ''
        for score, path in reranked_candidates:
            if score > maxim:
                maxim = score
                best_cand = ' '.join([u[0] for u in path])
        print('Best: ')
        print(best_cand)
        final_out += best_cand
    print(final1)
    print(final_out)
    return final1
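
A minimal sketch of how this function might be driven, assuming raw (untagged) sentences, scikit-learn's CountVectorizer for the count matrix, and that the parse_sentence helper referenced above is defined; the corpus and every parameter value below are illustrative, not taken from the original:

# Hypothetical driver for lda_test; the corpus, CountVectorizer, and the
# parameter values are assumptions for illustration only.
from sklearn.feature_extraction.text import CountVectorizer

sentences = [
    "The committee approved the new budget on Monday.",
    "Lawmakers passed the budget after a short debate.",
    "The budget vote took place early this week.",
]
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(sentences).toarray()  # term counts per sentence
vocab = vectorizer.get_feature_names_out()

summary = lda_test(x, n_topics=2, n_top_sent=3, sent_length=6,
                   num_best=50, vocab=vocab, sentences=sentences)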
Example #2
def keyphrases_based_msc(sentences, output_sent_num=50):

    """
    Multi-sentence compression with keyphrase-based reranking.
    :param sentences: POS-tagged sentences to compress
    :param output_sent_num: number of candidate compressions to return
    :return: 'score#sentence' strings
    """

    # Build the word graph and run the compression,
    # requiring at least 8 words in each compression
    compresser = takahe.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Get the n-best compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Rescore the candidates using keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
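
takahe consumes each input sentence as a single string of word/POS tokens whose final punctuation carries the tag passed as punct_tag. A hedged call might look like this; the tagged sentences are made up for illustration:

# Illustrative input only; takahe expects pre-tagged 'word/POS' strings
# ending in './PUNCT' to match punct_tag="PUNCT" above.
tagged_sentences = [
    "the/DT mayor/NN opened/VBD the/DT new/JJ bridge/NN on/IN friday/NNP ./PUNCT",
    "on/IN friday/NNP the/DT mayor/NN inaugurated/VBD the/DT bridge/NN ./PUNCT",
]
for line in keyphrases_based_msc(tagged_sentences, output_sent_num=10):
    print(line, end='')  # each entry is 'score#sentence\n'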
Example #3
def event_keyphrase_based_msc(sentences, output_sent_num=50):

    """
    Build the word graph from events; rerank the output sentences by keyphrases.
    :param sentences: sentences to compress
    :param output_sent_num: number of output sentences
    :return: 'score#sentence' strings
    """

    # Build the event-based word graph and run the compression,
    # requiring at least 8 words in each compression
    compresser = panda.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Get the n-best compression candidates
    candidates = compresser.get_compression(output_sent_num)

    # Rescore the candidates using keyphrases
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    results = []
    for score, path in reranked_candidates:
        results.append(str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n')

    return results
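
Apart from building the graph with panda.word_graph, presumably an event-based variant exposing the same interface as takahe.word_graph, this function is identical to keyphrases_based_msc above; the reranking step still uses takahe's keyphrase_reranker.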
Example #4
def _dofuse(cluster):
    """
    Extracts the call to takahe to interrupt it if it's taking too long.
    """
    fuser = takahe.word_graph(cluster,
                              nb_words=6,
                              lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 10
    reranker = takahe.keyphrase_reranker(cluster, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:10]
    return rerankedfusions
Example #5
def _dofuse(sentenceL):
    """
    Extracts the call to takahe to interrupt it if it's taking too long.
    """
    fuser = takahe.word_graph(sentenceL,
                              nb_words=6,
                              lang="en",
                              punct_tag="PUNCT")
    # get fusions
    fusions = fuser.get_compression(50)
    # rerank and keep top 10
    reranker = takahe.keyphrase_reranker(sentenceL, fusions, lang="en")
    rerankedfusions = reranker.rerank_nbest_compressions()[0:5]
    return rerankedfusions
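
Both docstrings say the takahe call is factored out so it can be interrupted. One sketch of that pattern, assuming nothing beyond the standard library and a hypothetical 30-second budget, runs the fusion in a worker process and abandons it on timeout:

# Hypothetical timeout wrapper around _dofuse; the multiprocessing approach
# and the 30-second budget are illustrative assumptions.
import multiprocessing

def fuse_with_timeout(cluster, timeout=30):
    pool = multiprocessing.Pool(processes=1)
    try:
        async_result = pool.apply_async(_dofuse, (cluster,))
        return async_result.get(timeout)  # raises TimeoutError when too slow
    except multiprocessing.TimeoutError:
        return []  # give up on clusters that fuse too slowly
    finally:
        pool.terminate()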
Example #6
def get_compressed_sen(sentences, nb_words):
    compresser = takahe.word_graph(sentences, nb_words=nb_words, lang='en', punct_tag=".")
    candidates = compresser.get_compression(3)
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    # Keep the top reranked candidate, if any
    if len(reranked_candidates) > 0:
        score, path = reranked_candidates[0]
        result = ' '.join([u[0] for u in path])
    else:
        result = ' '
    return result
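
Because this variant sets punct_tag to ".", the tagged input must carry the Penn Treebank punctuation tag rather than PUNCT. A hedged call, with made-up sentences:

# Illustrative input; punctuation carries the "./." tag to match punct_tag="."
tagged = [
    "police/NNS closed/VBD the/DT road/NN after/IN the/DT crash/NN ./.",
    "the/DT road/NN was/VBD closed/VBN by/IN police/NNS after/IN a/DT crash/NN ./.",
]
print(get_compressed_sen(tagged, nb_words=6))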
Example #7
################################################################################
import takahe

# 'sentences' is assumed to already hold the POS-tagged 'word/TAG' input
# strings ending in './PUNCT' (see the tagged examples earlier on this page)

# Create a word graph from the set of sentences with parameters:
# - minimal number of words in the compression: 6
# - language of the input sentences: en (english)
# - POS tag for punctuation marks: PUNCT
compresser = takahe.word_graph(sentences, nb_words=6, lang='en', punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:
    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)

    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')

reranked_candidates = reranker.rerank_nbest_compressions()

# Loop over the best reranked candidates
for score, path in reranked_candidates:
    # Print each reranked candidate with its score
    print(round(score, 3), ' '.join([u[0] for u in path]))
Example #8
compresser = takahe.word_graph(sentences,
                               lang='en',
                               punct_tag="PUNCT")

# Get the single best path
candidates = compresser.get_compression(1)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)
    print(path)
    # Print normalized score and compression:
    # print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(sentences,
                                     candidates,
                                     lang='en')

reranked_candidates = reranker.rerank_nbest_compressions()

# Loop over the best reranked candidates
for score, path in reranked_candidates:

    # Print each reranked candidate path
    print(path)
    # print(round(score, 3), ' '.join([u[0] for u in path]))
Example #9
compresser = takahe.word_graph(textList, lang='en', punct_tag="PUNCT")

# Get the 50 best paths
candidates = compresser.get_compression(50)

# 1. Rerank compressions by path length (Filippova's method)
for cumulative_score, path in candidates:

    # Normalize path score by path length
    normalized_score = cumulative_score / len(path)

    # Print normalized score and compression
    print(round(normalized_score, 3), ' '.join([u[0] for u in path]))

# Write the word graph in the dot format
compresser.write_dot('test.dot')

# 2. Rerank compressions by keyphrases (Boudin and Morin's method)
reranker = takahe.keyphrase_reranker(textList, candidates, lang='en')

reranked_candidates = reranker.rerank_nbest_compressions()

# Take the last reranked candidate and join its path into a sentence
b = reranked_candidates[-1]
print(b[1])
gg = ' '.join([u[0] for u in b[1]])

# Loop over the best reranked candidates
for score, path in reranked_candidates:

    # Print each reranked candidate with its score
    print(round(score, 3), ' '.join([u[0] for u in path]))


# Send the text into a grammar checker.
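
The closing comment points at a grammar-checking pass over the joined compression. A hedged sketch, assuming the third-party language_tool_python package; the choice of tool is an assumption, not part of the original pipeline:

# Hypothetical grammar-checking pass over the joined compression 'gg';
# language_tool_python is an assumed dependency, not used by the source.
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')
corrected = tool.correct(gg)  # apply the tool's suggested fixes
print(corrected)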