Exemplo n.º 1
0
    def test_summarizer(self):
        """Check SubmodularSummarizer output length on Chinese and English data.

        Document sets are built from the ``data/chinese/`` and
        ``data/english/`` directories next to this test file.  Three
        summarizers are run with the value ``100``: Chinese, English, and
        English constructed with a second argument of ``True`` (the "stem"
        variant, going by the local name).  The test asserts that the summed
        sentence length of every summary is at most 100 and prints each
        summary for manual inspection.
        """
        summarizer_cn = SubmodularSummarizer("chinese")
        summarizer_en = SubmodularSummarizer("english")
        summarizer_en_stem = SubmodularSummarizer("english", True)

        data_file_path = abspath(dirname(__file__)) + '/data'
        cn_data_file_path = data_file_path + '/chinese/'
        en_data_file_path = data_file_path + '/english/'
        parser_cn = PlaintextParser("chinese")
        parser_en = PlaintextParser("english")

        document_set_cn = parser_cn.build_documentSet_from_dir(
            cn_data_file_path)
        document_set_en = parser_en.build_documentSet_from_dir(
            en_data_file_path)

        summarizer_cn.stop_words = get_stop_words("chinese")
        summarizer_en.stop_words = get_stop_words("english")
        summarizer_en_stem.stop_words = get_stop_words("english")

        summary_cn = summarizer_cn(document_set_cn, 100)
        summary_cn_len = sum(
            get_cn_sentence_length(sentence) for sentence in summary_cn)
        # Chinese sentences are re-joined with the full-width period.
        summary_cn_text = ''.join(sentence._texts + '。'
                                  for sentence in summary_cn)

        summary_en = summarizer_en(document_set_en, 100)
        summary_en_len = sum(
            get_en_sentence_length(sentence) for sentence in summary_en)
        summary_en_text = ' '.join(sentence._texts for sentence in summary_en)

        # Run the stem summarizer itself; calling ``summarizer_en`` here would
        # merely repeat the default English run and leave the stem variant
        # untested.
        summary_en_stem = summarizer_en_stem(document_set_en, 100)
        summary_en_stem_len = sum(
            get_en_sentence_length(sentence) for sentence in summary_en_stem)
        summary_en_stem_text = ' '.join(sentence._texts
                                        for sentence in summary_en_stem)

        self.assertLessEqual(summary_cn_len, 100)
        self.assertLessEqual(summary_en_len, 100)
        self.assertLessEqual(summary_en_stem_len, 100)

        print(
            "-----------------------------chinese default---------------------------"
        )
        print(summary_cn_text)
        print("the summary length is {}".format(summary_cn_len))
        print(
            "-----------------------------english default---------------------------"
        )
        print(summary_en_text)
        print("the summary length is {}".format(summary_en_len))
        print(
            "-----------------------------english    stem---------------------------"
        )
        print(summary_en_stem_text)
        print("the summary length is {}".format(summary_en_stem_len))
Exemplo n.º 2
0
    def test_summarizer(self):
        """Check LexRank_querySummarizer output length for query-driven runs.

        Runs the Chinese summarizer with ``method="MMR"`` and
        ``method="default"`` and the English summarizer with
        ``method="MMR"``, each with the value ``100``, asserts that every
        summed sentence length is at most 100, and prints the summaries.
        """
        english = LexRank_querySummarizer("english")
        # Constructed and configured below but never invoked in this test.
        english_stem = LexRank_querySummarizer("english", True)
        chinese = LexRank_querySummarizer("chinese")

        base_dir = abspath(dirname(__file__)) + '/data'
        cn_dir = base_dir + '/chinese/'
        en_dir = base_dir + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_query = u"科比生涯的最后一个赛季"
        en_query = "a day in the life with org"

        chinese.stop_words = get_stop_words("chinese")
        english.stop_words = get_stop_words("english")
        english_stem.stop_words = get_stop_words("english")

        cn_mmr = chinese(cn_docs, cn_query, 100, method="MMR")
        cn_mmr_len = sum(map(get_cn_sentence_length, cn_mmr))
        cn_mmr_text = ''.join([s._texts + '。' for s in cn_mmr])

        cn_default = chinese(cn_docs, cn_query, 100, method="default")
        cn_default_len = sum(map(get_cn_sentence_length, cn_default))
        cn_default_text = ''.join([s._texts + '。' for s in cn_default])

        en_mmr = english(en_docs, en_query, 100, method="MMR")
        en_mmr_len = sum(map(get_en_sentence_length, en_mmr))
        en_mmr_text = ' '.join([s._texts for s in en_mmr])

        for measured in (cn_mmr_len, cn_default_len, en_mmr_len):
            self.assertLessEqual(measured, 100)

        # Dump each summary under its banner for manual inspection.
        report = (
            ("--------------------------chinese MMR-----------------------------",
             cn_mmr_text, cn_mmr_len),
            ("--------------------------chinese default-----------------------------",
             cn_default_text, cn_default_len),
            ("--------------------------english MMR-----------------------------",
             en_mmr_text, en_mmr_len),
        )
        for banner, text, measured in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(measured))
Exemplo n.º 3
0
    def test_summarizer(self):
        """Check AverageRankSummarizer output length on Chinese and English data.

        For each language an AverageRankSummarizer is called with the
        document set, the value ``100`` and three further summarizers
        (Submodular, LexRank, LSA).  The test asserts that the summed
        sentence length of each result is at most 100 and prints both
        summaries.
        """
        base_dir = abspath(dirname(__file__)) + '/data'
        cn_dir = base_dir + '/chinese/'
        en_dir = base_dir + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_average = AverageRankSummarizer("chinese")
        cn_members = (
            SubmodularSummarizer("chinese"),
            LexRankSummarizer("chinese"),
            LsaSummarizer("chinese"),
        )

        en_average = AverageRankSummarizer("english")
        en_members = (
            SubmodularSummarizer("english"),
            LexRankSummarizer("english"),
            LsaSummarizer("english"),
        )

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        # The member summarizers are passed as extra positional arguments.
        cn_summary = cn_average(cn_docs, 100, *cn_members)
        cn_length = sum(map(get_cn_sentence_length, cn_summary))
        cn_text = ''.join([s._texts + '。' for s in cn_summary])

        en_summary = en_average(en_docs, 100, *en_members)
        en_length = sum(map(get_en_sentence_length, en_summary))
        en_text = ' '.join([s._texts for s in en_summary])

        self.assertLessEqual(cn_length, 100)
        self.assertLessEqual(en_length, 100)

        report = (
            ("--------------------------chinese   average_rank-----------------------------",
             cn_text, cn_length),
            ("--------------------------english   average_rank-----------------------------",
             en_text, en_length),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))
Exemplo n.º 4
0
    def test_summarizer(self):
        """Check conceptILPSummarizer output length for every method/language pair.

        The Chinese, English and English "stem" (second constructor argument
        ``True``) summarizers are each run with ``method`` set to ``"ilp"``,
        ``"greedy"`` and ``"tabu"`` and the value ``100``.  Once all nine
        summaries exist, the test asserts that each summed sentence length
        is at most 100 and then prints every summary under its banner.
        """
        english = conceptILPSummarizer("english")
        english_stem = conceptILPSummarizer("english", True)
        chinese = conceptILPSummarizer("chinese")

        base_dir = abspath(dirname(__file__)) + '/data'
        cn_dir = base_dir + '/chinese/'
        en_dir = base_dir + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        chinese.stop_words = get_stop_words("chinese")
        english.stop_words = get_stop_words("english")
        english_stem.stop_words = get_stop_words("english")

        def cn_render(summary):
            # Chinese sentences are re-joined with the full-width period.
            return ''.join(sentence._texts + '。' for sentence in summary)

        def en_render(summary):
            return ' '.join(sentence._texts for sentence in summary)

        # (summarizer, documents, method, length helper, renderer, banner)
        # listed in the order the runs are executed and reported.
        runs = [
            (chinese, cn_docs, "ilp", get_cn_sentence_length, cn_render,
             "--------------------------chinese      ILP-----------------------------"),
            (chinese, cn_docs, "greedy", get_cn_sentence_length, cn_render,
             "--------------------------chinese   greedy-----------------------------"),
            (chinese, cn_docs, "tabu", get_cn_sentence_length, cn_render,
             "--------------------------chinese     tabu-----------------------------"),
            (english, en_docs, "ilp", get_en_sentence_length, en_render,
             "--------------------------english      ILP-----------------------------"),
            (english, en_docs, "greedy", get_en_sentence_length, en_render,
             "--------------------------english   greedy-----------------------------"),
            (english, en_docs, "tabu", get_en_sentence_length, en_render,
             "--------------------------english     tabu-----------------------------"),
            (english_stem, en_docs, "ilp", get_en_sentence_length, en_render,
             "--------------------------english Stem ILP-----------------------------"),
            (english_stem, en_docs, "greedy", get_en_sentence_length, en_render,
             "--------------------------eng   StemGreedy-----------------------------"),
            (english_stem, en_docs, "tabu", get_en_sentence_length, en_render,
             "--------------------------english StemTabu-----------------------------"),
        ]

        outcomes = []
        for summarizer, docs, method, measure, render, banner in runs:
            summary = summarizer(docs, 100, method=method)
            length = sum(measure(sentence) for sentence in summary)
            outcomes.append((banner, render(summary), length))

        # All assertions run before anything is printed.
        for _banner, _text, length in outcomes:
            self.assertLessEqual(length, 100)

        for banner, text, length in outcomes:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))
Exemplo n.º 5
0
    def test_summarizer(self):
        """Check KLSummarizer output length and that the stem variant differs.

        Summaries are produced for the Chinese set, the English set, and the
        English set with the summarizer built with a second argument of
        ``True``.  Each summed sentence length must be at most 100.  Finally
        the content words that the two English summarizers extract from the
        first sentence of the default English summary must differ.
        """
        english = KLSummarizer("english")
        english_stem = KLSummarizer("english", True)
        chinese = KLSummarizer("chinese")
        # NOTE(review): only the Chinese summarizer is given stop words here;
        # confirm that leaving the English ones untouched is intended.
        chinese.stop_words = get_stop_words("chinese")

        base_dir = abspath(dirname(__file__)) + '/data'
        cn_dir = base_dir + '/chinese/'
        en_dir = base_dir + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_summary = chinese(cn_docs, 100)
        cn_length = sum(map(get_cn_sentence_length, cn_summary))
        cn_text = ''.join([s._texts + '。' for s in cn_summary])

        en_summary = english(en_docs, 100)
        en_length = sum(map(get_en_sentence_length, en_summary))
        en_text = ' '.join([s._texts for s in en_summary])

        stem_summary = english_stem(en_docs, 100)
        stem_length = sum(map(get_en_sentence_length, stem_summary))
        stem_text = ' '.join([s._texts for s in stem_summary])

        for length in (cn_length, en_length, stem_length):
            self.assertLessEqual(length, 100)

        report = (
            ("--------------------------chinese   KL-----------------------------",
             cn_text, cn_length),
            ("--------------------------english   KL-----------------------------",
             en_text, en_length),
            ("--------------------------english stem-----------------------------",
             stem_text, stem_length),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))

        # Same sentence through both English summarizers: the extracted
        # content words are expected to differ.
        first_sentence = en_summary[0]
        stem_words = english_stem._get_content_words_in_sentence(
            first_sentence)
        plain_words = english._get_content_words_in_sentence(first_sentence)
        self.assertNotEqual(stem_words, plain_words)
Exemplo n.º 6
0
    def test_summarizer(self):
        """Check LsaSummarizer output length for default and MMR runs.

        Four summaries are produced with the value ``100``: Chinese with no
        extra options, Chinese with ``method="MMR", metric="tfidf"``, English
        with ``method="MMR", metric="tfidf"``, and English with
        ``method="MMR"`` only.  The summaries are printed first; afterwards
        each summed sentence length is asserted to be at most 100.
        """
        english = LsaSummarizer("english")
        # Constructed but never invoked in this test.
        english_stem = LsaSummarizer("english", True)
        chinese = LsaSummarizer("chinese")

        base_dir = abspath(dirname(__file__)) + '/data'
        cn_dir = base_dir + '/chinese/'
        en_dir = base_dir + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        chinese.stop_words = get_stop_words("chinese")
        english.stop_words = get_stop_words("english")

        cn_plain = chinese(cn_docs, 100)
        cn_plain_length = sum(map(get_cn_sentence_length, cn_plain))
        cn_plain_text = ''.join([s._texts + '。' for s in cn_plain])

        cn_mmr = chinese(cn_docs, 100, method="MMR", metric="tfidf")
        cn_mmr_length = sum(map(get_cn_sentence_length, cn_mmr))
        cn_mmr_text = ''.join([s._texts + '。' for s in cn_mmr])

        en_tfidf = english(en_docs, 100, method="MMR", metric="tfidf")
        en_tfidf_length = sum(map(get_en_sentence_length, en_tfidf))
        en_tfidf_text = ' '.join([s._texts for s in en_tfidf])

        en_mmr = english(en_docs, 100, method="MMR")
        en_mmr_length = sum(map(get_en_sentence_length, en_mmr))
        en_mmr_text = ' '.join([s._texts for s in en_mmr])

        # Output is emitted before the assertions, so it is shown even when
        # one of the length checks fails.
        report = (
            ("-----------------------------chinese default-------------------------------",
             cn_plain_text, cn_plain_length),
            ("-----------------------------chinese     MMR-------------------------------",
             cn_mmr_text, cn_mmr_length),
            ("-----------------------------english   tfidf-------------------------------",
             en_tfidf_text, en_tfidf_length),
            ("-----------------------------english     MMR-------------------------------",
             en_mmr_text, en_mmr_length),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))

        for length in (en_tfidf_length, en_mmr_length,
                       cn_plain_length, cn_mmr_length):
            self.assertLessEqual(length, 100)