def Edmundson(result, answer):
    """Score a machine summary against a reference summary by sentence overlap.

    Both texts are split with ``TFIDF.cut_by_sentence`` and reduced to sets,
    so a sentence that occurs several times is counted once. The score is the
    share of distinct reference sentences that also occur (as equal strings)
    in the machine summary.

    Args:
        result: Machine-generated summary text.
        answer: Reference summary text.

    Returns:
        ``len(shared) / len(reference)``. When the reference yields no
        sentences the score is ``0.0`` rather than a ``ZeroDivisionError``.
    """
    candidate_sentences = set(TFIDF.cut_by_sentence(result))
    reference_sentences = set(TFIDF.cut_by_sentence(answer))

    # Nothing to match against: define the overlap as zero instead of
    # dividing by an empty reference set.
    if not reference_sentences:
        return 0.0

    shared = candidate_sentences & reference_sentences
    return len(shared) / len(reference_sentences)
def textrank_sentence(self, doc, threshold):
    """Pick the key sentences of ``doc`` with TextRank.

    Sentence similarity uses the formula from the paper (see the experiment
    report). Roughly 30% of the sentences are kept; the selection is intended
    as the input for the subsequent MMR step.

    Args:
        doc: Text to summarise; split with ``TFIDF.cut_by_sentence``.
        threshold: An edge between two sentences is added only when their
            similarity is strictly greater than this value.

    Returns:
        Every sentence unchanged when there are 4 or fewer; ``False`` when
        the graph exposes no scores (``WS is None``); otherwise the selected
        sentences ordered by descending score.
    """
    # Graph that will hold one node per sentence index.
    sentence_graph = Graph(self.d, self.iteration)
    sentences = TFIDF.cut_by_sentence(doc)
    total = len(sentences)

    if total <= 4:
        print("The content is too short, do not need to summary !")
        return sentences

    # Keywords requested per sentence: fewer for short documents.
    size_word = 2 if total < 9 else 5

    # Number of sentences to keep (30% of the document).
    size = round(total * 0.3)
    if size == 0:
        size = min(1, total)

    # Build one keyword list per sentence.
    keyword_lists = []
    for sentence in sentences:
        keywords = self.textrank_words(sentence, size_word)
        if keywords == False:  # noqa: E712 - TextRank found no keywords
            keywords = []
        # Per the original note: supplement with TF-IDF keywords and
        # consider merging keyword phrases.
        keywords = append_keywords(sentence, keywords, self.keywords_doc)
        if keywords == False:  # noqa: E712
            keyword_lists.append([])
        else:
            keyword_lists.append([entry[0] for entry in keywords])

    # Connect every ordered pair of distinct, non-empty sentences whose
    # similarity exceeds the threshold.
    for i, left in enumerate(keyword_lists):
        if not left:
            continue
        for j, right in enumerate(keyword_lists):
            if j == i or not right:
                continue
            similarity = textrank_similiar(np.array(left), np.array(right))
            if similarity > threshold:
                sentence_graph.add_edge(i, j, similarity)

    # Rank the graph and read back the per-node scores.
    sentence_graph.rank()
    scores = sentence_graph.WS
    if scores is None:
        print("Error, because of the high threshold !")
        return False

    ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)
    if size > len(ranked):
        print("Overload !")
        size = len(ranked)

    # Map the top-scoring node indices back to their sentences.
    return [sentences[index] for index, _ in ranked[:size]]