def event_based_msc(sentences, output_sent_num=50):
    """Event-driven multi-sentence compression.

    :param sentences: input sentences to compress
    :param output_sent_num: number of output sentences, default 50
    :return: list of "score#sentence" strings, sorted ascending by
             length-normalized score
    """
    # Build the word graph and run compression.
    # Sentences with fewer than 8 words are ignored (nb_words=8).
    compresser = panda.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Retrieve the n-best compression candidates as (score, path) pairs.
    candidates = compresser.get_compression(output_sent_num)

    # Optionally dump the graph in DOT format for inspection:
    # compresser.write_dot('graph.dot')

    # Normalize each score by its path length so longer paths are not
    # unfairly penalized, then sort ascending by the normalized score.
    # (Original code shadowed the list name inside the sort lambda;
    # fixed by using a distinct parameter name.)
    normalized = [(score / len(path), path) for score, path in candidates]
    normalized.sort(key=lambda pair: pair[0])

    # Format each result as "score#sentence\n"; path items are
    # (word, tag)-style tuples, u[0] being the surface word.
    return [
        str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n'
        for score, path in normalized
    ]
def event_keyphrase_based_msc(sentences, output_sent_num=50):
    """Event-based word-graph compression with keyphrase reranking.

    Builds the word graph from events, then reranks the output
    sentences using keyphrases.

    :param sentences: sentences to compress
    :param output_sent_num: number of output sentences
    :return: list of "score#sentence" strings in reranked order
    """
    # Build the word graph and run compression.
    # Sentences with fewer than 8 words are ignored (nb_words=8).
    compresser = panda.word_graph(sentences, nb_words=8, lang='en', punct_tag="PUNCT")

    # Retrieve the n-best compression candidates as (score, path) pairs.
    candidates = compresser.get_compression(output_sent_num)

    # Re-score the candidate compressions using keyphrases.
    # NOTE(review): the graph is built with `panda` but the reranker
    # comes from `takahe` — confirm this mix of modules is intentional.
    reranker = takahe.keyphrase_reranker(sentences, candidates, lang='en')
    reranked_candidates = reranker.rerank_nbest_compressions()

    # Format each result as "score#sentence\n"; path items are
    # (word, tag)-style tuples, u[0] being the surface word.
    return [
        str(round(score, 6)) + "#" + ' '.join([u[0] for u in path]) + '\n'
        for score, path in reranked_candidates
    ]