Пример #1
0
    def test_summarizer(self):
        """Check that SubmodularSummarizer output fits the requested length.

        Document sets are built from the ``data/chinese/`` and
        ``data/english/`` directories next to this file.  Each summarizer is
        called with a limit of 100 and the summed sentence lengths (as
        measured by ``get_cn_sentence_length`` / ``get_en_sentence_length``)
        must not exceed that limit.  The summaries are also printed so they
        can be inspected by hand.
        """
        summarizer_cn = SubmodularSummarizer("chinese")
        summarizer_en = SubmodularSummarizer("english")
        # NOTE(review): the second positional argument presumably switches
        # stemming on -- confirm against SubmodularSummarizer's signature.
        summarizer_en_stem = SubmodularSummarizer("english", True)

        data_file_path = abspath(dirname(__file__)) + '/data'
        cn_data_file_path = data_file_path + '/chinese/'
        en_data_file_path = data_file_path + '/english/'
        parser_cn = PlaintextParser("chinese")
        parser_en = PlaintextParser("english")

        document_set_cn = parser_cn.build_documentSet_from_dir(
            cn_data_file_path)
        document_set_en = parser_en.build_documentSet_from_dir(
            en_data_file_path)

        summarizer_cn.stop_words = get_stop_words("chinese")
        summarizer_en.stop_words = get_stop_words("english")
        summarizer_en_stem.stop_words = get_stop_words("english")

        summary_cn = summarizer_cn(document_set_cn, 100)
        summary_cn_len = sum(
            get_cn_sentence_length(sentence) for sentence in summary_cn)
        summary_cn_text = ''.join(sentence._texts + '。'
                                  for sentence in summary_cn)

        summary_en = summarizer_en(document_set_en, 100)
        summary_en_len = sum(
            get_en_sentence_length(sentence) for sentence in summary_en)
        summary_en_text = ' '.join(sentence._texts for sentence in summary_en)

        # Run the stemming instance (not summarizer_en) so that the "stem"
        # variant is the one actually measured and printed below.
        summary_en_stem = summarizer_en_stem(document_set_en, 100)
        summary_en_stem_len = sum(
            get_en_sentence_length(sentence) for sentence in summary_en_stem)
        summary_en_stem_text = ' '.join(sentence._texts
                                        for sentence in summary_en_stem)

        self.assertLessEqual(summary_cn_len, 100)
        self.assertLessEqual(summary_en_len, 100)
        self.assertLessEqual(summary_en_stem_len, 100)

        print(
            "-----------------------------chinese default---------------------------"
        )
        print(summary_cn_text)
        print("the summary length is {}".format(summary_cn_len))
        print(
            "-----------------------------english default---------------------------"
        )
        print(summary_en_text)
        print("the summary length is {}".format(summary_en_len))
        print(
            "-----------------------------english    stem---------------------------"
        )
        print(summary_en_stem_text)
        print("the summary length is {}".format(summary_en_stem_len))
Пример #2
0
    def test_summarizer(self):
        """Run LexRank_querySummarizer on the sample data and check lengths.

        Chinese summaries are produced with ``method="MMR"`` and
        ``method="default"``, the English summary with ``method="MMR"``.
        Every summary is requested with a limit of 100 and its measured
        length must not exceed 100.  The texts are printed afterwards.
        """

        def measure_cn(sentences):
            # Length first, then the text with a '。' appended per sentence.
            length = sum(get_cn_sentence_length(item) for item in sentences)
            text = ''.join(item._texts + '。' for item in sentences)
            return length, text

        def measure_en(sentences):
            # Length first, then the sentences joined by single spaces.
            length = sum(get_en_sentence_length(item) for item in sentences)
            text = ' '.join(item._texts for item in sentences)
            return length, text

        en_summarizer = LexRank_querySummarizer("english")
        # Constructed and configured, but not invoked by this test.
        en_stem_summarizer = LexRank_querySummarizer("english", True)
        cn_summarizer = LexRank_querySummarizer("chinese")

        data_root = abspath(dirname(__file__)) + '/data'
        cn_dir = data_root + '/chinese/'
        en_dir = data_root + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_query = u"科比生涯的最后一个赛季"
        en_query = "a day in the life with org"

        cn_summarizer.stop_words = get_stop_words("chinese")
        en_summarizer.stop_words = get_stop_words("english")
        en_stem_summarizer.stop_words = get_stop_words("english")

        cn_mmr_len, cn_mmr_text = measure_cn(
            cn_summarizer(cn_docs, cn_query, 100, method="MMR"))
        cn_default_len, cn_default_text = measure_cn(
            cn_summarizer(cn_docs, cn_query, 100, method="default"))
        en_mmr_len, en_mmr_text = measure_en(
            en_summarizer(en_docs, en_query, 100, method="MMR"))

        for measured in (cn_mmr_len, cn_default_len, en_mmr_len):
            self.assertLessEqual(measured, 100)

        report = (
            ("--------------------------chinese MMR-----------------------------",
             cn_mmr_text, cn_mmr_len),
            ("--------------------------chinese default-----------------------------",
             cn_default_text, cn_default_len),
            ("--------------------------english MMR-----------------------------",
             en_mmr_text, en_mmr_len),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))
Пример #3
0
def handle_arguments(args):
    """Turn parsed command-line options into the inputs for summarization.

    ``args`` is a mapping indexed by option name (``'--format'``,
    ``'--length'``, ``'--language'``, ``'--file'``, ``'--stopwords'``,
    ``'--stem'``, ``'--query'``) and by the method names used as keys of
    ``METHODS`` / ``METHODS_Q``.

    Defaults applied here: format ``"plaintext"``, length ``250`` and
    language ``"english"``.

    Returns:
        A tuple ``(document_set, summarizer, language, words_limit, query)``.

    Raises:
        ValueError: if ``--format`` is given but is not a key of ``PARSERS``,
            if ``--file`` is missing, or if the given path is neither a
            directory nor a regular file.
    """
    document_format = args['--format']
    if document_format is not None and document_format not in PARSERS:
        # Format the message string itself; calling .format() on the
        # ValueError instance would fail with AttributeError instead.
        raise ValueError(
            "Unsupported input format. Possible values are {0}. Given: {1}."
            .format(", ".join(PARSERS.keys()), document_format))
    parser_class = PARSERS[document_format or "plaintext"]

    words_limit = int(args['--length'] or 250)
    language = args['--language'] or "english"
    parser = parser_class(language)

    file_path = args['--file']
    if file_path is None:
        # Without this guard document_set would never be bound and the
        # return statement would fail with UnboundLocalError.
        raise ValueError("No input given. Use the --file option.")
    file_path = abspath(file_path)
    if isdir(file_path):
        document_set = parser.build_documentSet_from_dir(file_path)
    elif isfile(file_path):
        document_set = parser.build_document_from_file(file_path)
    else:
        raise ValueError("Input file is invalid")

    if args['--stopwords']:
        stop_words = read_stop_words(args['--stopwords'])
    else:
        stop_words = get_stop_words(language)

    stem_or_not = bool(args['--stem'])

    # NOTE(review): the value is converted even when --query is absent, as
    # before; confirm what to_unicode() yields for a missing query.
    query = to_unicode(args['--query'])

    # Query-focused methods come from METHODS_Q, generic ones from METHODS;
    # the first method whose flag is set in args wins.
    methods = METHODS_Q if args['--query'] else METHODS
    summarizer_class = next(cls for name, cls in methods.items()
                            if args[name])

    summarizer = build_summarizer(summarizer_class, language, stop_words,
                                  stem_or_not)

    return document_set, summarizer, language, words_limit, query
Пример #4
0
    def test_summarizer(self):
        """Run conceptILPSummarizer with every method and check lengths.

        The Chinese, English and English-with-second-argument-``True``
        summarizers are each called with ``method`` set to ``"ilp"``,
        ``"greedy"`` and ``"tabu"`` and a limit of 100.  Every measured
        summary length must not exceed 100; the texts are printed last.
        """

        def describe_cn(sentences):
            # Length first, then the text with a '。' appended per sentence.
            length = sum(get_cn_sentence_length(item) for item in sentences)
            text = ''.join(item._texts + '。' for item in sentences)
            return length, text

        def describe_en(sentences):
            # Length first, then the sentences joined by single spaces.
            length = sum(get_en_sentence_length(item) for item in sentences)
            text = ' '.join(item._texts for item in sentences)
            return length, text

        en_summarizer = conceptILPSummarizer("english")
        en_stem_summarizer = conceptILPSummarizer("english", True)
        cn_summarizer = conceptILPSummarizer("chinese")

        data_root = abspath(dirname(__file__)) + '/data'
        cn_dir = data_root + '/chinese/'
        en_dir = data_root + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_summarizer.stop_words = get_stop_words("chinese")
        en_summarizer.stop_words = get_stop_words("english")
        en_stem_summarizer.stop_words = get_stop_words("english")

        # One row per run, listed in execution (and print) order.
        cases = (
            ("--------------------------chinese      ILP-----------------------------",
             cn_summarizer, cn_docs, describe_cn, "ilp"),
            ("--------------------------chinese   greedy-----------------------------",
             cn_summarizer, cn_docs, describe_cn, "greedy"),
            ("--------------------------chinese     tabu-----------------------------",
             cn_summarizer, cn_docs, describe_cn, "tabu"),
            ("--------------------------english      ILP-----------------------------",
             en_summarizer, en_docs, describe_en, "ilp"),
            ("--------------------------english   greedy-----------------------------",
             en_summarizer, en_docs, describe_en, "greedy"),
            ("--------------------------english     tabu-----------------------------",
             en_summarizer, en_docs, describe_en, "tabu"),
            ("--------------------------english Stem ILP-----------------------------",
             en_stem_summarizer, en_docs, describe_en, "ilp"),
            ("--------------------------eng   StemGreedy-----------------------------",
             en_stem_summarizer, en_docs, describe_en, "greedy"),
            ("--------------------------english StemTabu-----------------------------",
             en_stem_summarizer, en_docs, describe_en, "tabu"),
        )

        outcomes = []
        for banner, summarizer, documents, describe, method in cases:
            summary = summarizer(documents, 100, method=method)
            length, text = describe(summary)
            outcomes.append((banner, text, length))

        # All length checks happen before anything is printed.
        for _banner, _text, length in outcomes:
            self.assertLessEqual(length, 100)

        for banner, text, length in outcomes:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))
Пример #5
0
    def test_summarizer(self):
        """Check KLSummarizer summary lengths and the effect of its flag.

        Summaries of the Chinese and English sample document sets are
        requested with a limit of 100 and must not measure more than 100.
        Finally the content words of the first English summary sentence are
        extracted with both English summarizers and must differ.
        """

        def measure_cn(sentences):
            # Length first, then the text with a '。' appended per sentence.
            length = sum(get_cn_sentence_length(item) for item in sentences)
            text = ''.join(item._texts + '。' for item in sentences)
            return length, text

        def measure_en(sentences):
            # Length first, then the sentences joined by single spaces.
            length = sum(get_en_sentence_length(item) for item in sentences)
            text = ' '.join(item._texts for item in sentences)
            return length, text

        en_summarizer = KLSummarizer("english")
        en_stem_summarizer = KLSummarizer("english", True)
        cn_summarizer = KLSummarizer("chinese")
        # Only the Chinese summarizer gets an explicit stop-word list here.
        cn_summarizer.stop_words = get_stop_words("chinese")

        data_root = abspath(dirname(__file__)) + '/data'
        cn_dir = data_root + '/chinese/'
        en_dir = data_root + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_summary = cn_summarizer(cn_docs, 100)
        cn_length, cn_text = measure_cn(cn_summary)

        en_summary = en_summarizer(en_docs, 100)
        en_length, en_text = measure_en(en_summary)

        en_stem_summary = en_stem_summarizer(en_docs, 100)
        en_stem_length, en_stem_text = measure_en(en_stem_summary)

        for measured in (cn_length, en_length, en_stem_length):
            self.assertLessEqual(measured, 100)

        report = (
            ("--------------------------chinese   KL-----------------------------",
             cn_text, cn_length),
            ("--------------------------english   KL-----------------------------",
             en_text, en_length),
            ("--------------------------english stem-----------------------------",
             en_stem_text, en_stem_length),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))

        # Same sentence, two summarizers: the word lists are expected to differ.
        words_from_stem_summarizer = (
            en_stem_summarizer._get_content_words_in_sentence(en_summary[0]))
        words_from_plain_summarizer = (
            en_summarizer._get_content_words_in_sentence(en_summary[0]))
        self.assertNotEqual(words_from_stem_summarizer,
                            words_from_plain_summarizer)
Пример #6
0
    def test_stopwords(self):
        """The Chinese and English stop-word lists contain a known entry."""
        # Load both lists up front, then check one sample word in each.
        cn_words, en_words = [
            get_stop_words(language) for language in ("chinese", "english")
        ]

        self.assertIn("啊", cn_words)
        self.assertIn("and", en_words)
Пример #7
0
    def test_summarizer(self):
        """Run LsaSummarizer in several configurations and check lengths.

        Covers the Chinese default call, Chinese and English calls with
        ``method="MMR", metric="tfidf"``, and an English call with
        ``method="MMR"`` only.  Each is requested with a limit of 100; the
        summaries are printed first and then every measured length is
        asserted not to exceed 100.
        """

        def measure_cn(sentences):
            # Length first, then the text with a '。' appended per sentence.
            length = sum(get_cn_sentence_length(item) for item in sentences)
            text = ''.join(item._texts + '。' for item in sentences)
            return length, text

        def measure_en(sentences):
            # Length first, then the sentences joined by single spaces.
            length = sum(get_en_sentence_length(item) for item in sentences)
            text = ' '.join(item._texts for item in sentences)
            return length, text

        en_summarizer = LsaSummarizer("english")
        # Constructed but not used further in this test.
        en_stem_summarizer = LsaSummarizer("english", True)
        cn_summarizer = LsaSummarizer("chinese")

        data_root = abspath(dirname(__file__)) + '/data'
        cn_dir = data_root + '/chinese/'
        en_dir = data_root + '/english/'
        cn_parser = PlaintextParser("chinese")
        en_parser = PlaintextParser("english")

        cn_docs = cn_parser.build_documentSet_from_dir(cn_dir)
        en_docs = en_parser.build_documentSet_from_dir(en_dir)

        cn_summarizer.stop_words = get_stop_words("chinese")
        en_summarizer.stop_words = get_stop_words("english")

        cn_length, cn_text = measure_cn(cn_summarizer(cn_docs, 100))
        cn_mmr_length, cn_mmr_text = measure_cn(
            cn_summarizer(cn_docs, 100, method="MMR", metric="tfidf"))
        en_tfidf_length, en_tfidf_text = measure_en(
            en_summarizer(en_docs, 100, method="MMR", metric="tfidf"))
        en_mmr_length, en_mmr_text = measure_en(
            en_summarizer(en_docs, 100, method="MMR"))

        report = (
            ("-----------------------------chinese default-------------------------------",
             cn_text, cn_length),
            ("-----------------------------chinese     MMR-------------------------------",
             cn_mmr_text, cn_mmr_length),
            ("-----------------------------english   tfidf-------------------------------",
             en_tfidf_text, en_tfidf_length),
            ("-----------------------------english     MMR-------------------------------",
             en_mmr_text, en_mmr_length),
        )
        for banner, text, length in report:
            print(banner)
            print(text)
            print("the summary length is {}".format(length))

        # Checked after printing, English results first.
        for measured in (en_tfidf_length, en_mmr_length,
                         cn_length, cn_mmr_length):
            self.assertLessEqual(measured, 100)