Пример #1
0
def Edmundson(result, answer):
    """Score a machine summary by sentence overlap with a reference summary.

    Both inputs are split with ``TFIDF.cut_by_sentence`` and de-duplicated
    into sets. The score is the fraction of distinct reference sentences
    that also occur (as exact matches) in the machine summary.

    Args:
        result: The machine-generated summary.
        answer: The reference summary.

    Returns:
        ``len(shared) / len(reference)``. When the reference yields no
        sentences the score is ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    candidate_sentences = set(TFIDF.cut_by_sentence(result))
    reference_sentences = set(TFIDF.cut_by_sentence(answer))

    # An empty reference has nothing to recall; avoid dividing by zero.
    if not reference_sentences:
        return 0.0

    shared = candidate_sentences & reference_sentences
    return len(shared) / len(reference_sentences)
Пример #2
0
    def textrank_sentence(self, doc, threshold):
        """Extract key sentences from ``doc`` with a TextRank sentence graph.

        Each sentence is reduced to a keyword list, every ordered pair of
        sentences whose ``textrank_similiar`` score exceeds ``threshold`` is
        linked in a graph, the graph is ranked, and the top-scoring sentences
        are returned. Per the original notes, the similarity formula is the
        one from the TextRank paper and the output is meant to be fed to a
        subsequent MMR step.

        Args:
            doc: The document, split with ``TFIDF.cut_by_sentence``.
            threshold: Similarity a pair must strictly exceed to get an edge.

        Returns:
            * every sentence, unchanged, when the document has 4 or fewer;
            * ``False`` when the graph's ``WS`` is ``None`` after ranking
              (the original message attributes this to a too-high threshold);
            * otherwise a list of ``round(len(sentences) * 0.3)`` sentences
              (capped at the number of ranked nodes), best score first.
        """
        # Graph holding one node per sentence index.
        graph = Graph(self.d, self.iteration)

        sentences = TFIDF.cut_by_sentence(doc)

        if len(sentences) <= 4:
            print("The content is too short, do not need to summary !")
            return sentences

        # Ask for fewer keywords per sentence on short documents.
        size_word = 2 if len(sentences) < 9 else 5

        # At least 5 sentences remain here, so this is always >= 1; the old
        # ``size == 0`` fallback could never trigger and has been dropped.
        size = round(len(sentences) * 0.3)

        # Build one keyword list per sentence.
        sentences_array = []
        for sent in sentences:
            keywords = self.textrank_words(sent, size_word)

            # ``== False`` is kept on purpose: the helpers' failure sentinel
            # is not visible from here, so the original comparison is
            # preserved rather than narrowed to ``is False``.
            if keywords == False:
                # TextRank found no keywords for this sentence.
                keywords = []

            # Per the original note: supplement the keywords using TF-IDF
            # (and consider merging keyword phrases).
            keywords = append_keywords(sent, keywords, self.keywords_doc)

            if keywords == False:
                words = []
            else:
                # Keep only the first field of each entry
                # (presumably the keyword itself -- TODO confirm).
                words = [entry[0] for entry in keywords]
            sentences_array.append(words)

        # Convert each non-empty keyword list once, instead of once per pair
        # inside the quadratic loop below. Sentences without keywords get no
        # edges, exactly as before.
        keyword_arrays = {
            index: np.array(words)
            for index, words in enumerate(sentences_array)
            if len(words) != 0
        }

        # Populate the graph: both (i, j) and (j, i) are evaluated.
        for i, words_i in keyword_arrays.items():
            for j, words_j in keyword_arrays.items():
                if i == j:
                    continue
                weight = textrank_similiar(words_i, words_j)
                if weight > threshold:
                    graph.add_edge(i, j, weight)

        # Rank the graph and read back the per-node scores.
        graph.rank()
        scores = graph.WS

        if scores is None:
            print("Error, because of the high threshold !")
            return False

        ranked = sorted(scores.items(), key=itemgetter(1), reverse=True)

        if size > len(ranked):
            print("Overload !")
            size = len(ranked)

        # Map the best-scoring node indices back to their sentences.
        return [sentences[index] for index, _score in ranked[:size]]