def test_summarizer(self):
    """Smoke-test SubmodularSummarizer on Chinese, English, and stemmed-English data.

    Builds one document set per language from the bundled test corpus,
    produces a 100-unit summary with each summarizer, and asserts every
    summary stays within the length budget.
    """
    summarizer_cn = SubmodularSummarizer("chinese")
    summarizer_en = SubmodularSummarizer("english")
    summarizer_en_stem = SubmodularSummarizer("english", True)

    # Test corpora live in ./data next to this test module.
    data_file_path = abspath(dirname(__file__)) + '/data'
    cn_data_file_path = data_file_path + '/chinese/'
    en_data_file_path = data_file_path + '/english/'

    parser_cn = PlaintextParser("chinese")
    parser_en = PlaintextParser("english")
    document_set_cn = parser_cn.build_documentSet_from_dir(cn_data_file_path)
    document_set_en = parser_en.build_documentSet_from_dir(en_data_file_path)

    summarizer_cn.stop_words = get_stop_words("chinese")
    summarizer_en.stop_words = get_stop_words("english")
    summarizer_en_stem.stop_words = get_stop_words("english")

    summary_cn = summarizer_cn(document_set_cn, 100)
    summary_cn_len = sum(
        get_cn_sentence_length(sentence) for sentence in summary_cn)
    summary_cn_text = ''.join(
        sentence._texts + '。' for sentence in summary_cn)

    summary_en = summarizer_en(document_set_en, 100)
    summary_en_len = sum(
        get_en_sentence_length(sentence) for sentence in summary_en)
    summary_en_text = ' '.join(sentence._texts for sentence in summary_en)

    # BUG FIX: this previously invoked summarizer_en again, so the stemmed
    # summarizer was constructed and configured but never exercised.
    summary_en_stem = summarizer_en_stem(document_set_en, 100)
    summary_en_stem_len = sum(
        get_en_sentence_length(sentence) for sentence in summary_en_stem)
    summary_en_stem_text = ' '.join(
        sentence._texts for sentence in summary_en_stem)

    self.assertLessEqual(summary_cn_len, 100)
    self.assertLessEqual(summary_en_len, 100)
    self.assertLessEqual(summary_en_stem_len, 100)

    print("-----------------------------chinese default---------------------------")
    print(summary_cn_text)
    print("the summary length is {}".format(summary_cn_len))
    print("-----------------------------english default---------------------------")
    print(summary_en_text)
    print("the summary length is {}".format(summary_en_len))
    print("-----------------------------english stem---------------------------")
    print(summary_en_stem_text)
    print("the summary length is {}".format(summary_en_stem_len))
def test_summarizer(self):
    """Exercise LexRank_querySummarizer with MMR and default ranking in both languages."""
    summarizer_en = LexRank_querySummarizer("english")
    summarizer_en_stem = LexRank_querySummarizer("english", True)
    summarizer_cn = LexRank_querySummarizer("chinese")

    # Corpus directories shipped alongside the tests.
    base_dir = abspath(dirname(__file__)) + '/data'
    cn_dir = base_dir + '/chinese/'
    en_dir = base_dir + '/english/'

    parser_cn = PlaintextParser("chinese")
    parser_en = PlaintextParser("english")
    document_set_cn = parser_cn.build_documentSet_from_dir(cn_dir)
    document_set_en = parser_en.build_documentSet_from_dir(en_dir)

    query_cn = u"科比生涯的最后一个赛季"
    query_en = "a day in the life with org"

    summarizer_cn.stop_words = get_stop_words("chinese")
    summarizer_en.stop_words = get_stop_words("english")
    # NOTE(review): summarizer_en_stem is configured here but never invoked
    # below — confirm whether a stemmed run was intended.
    summarizer_en_stem.stop_words = get_stop_words("english")

    summary_cn_mmr = summarizer_cn(document_set_cn, query_cn, 100, method="MMR")
    summary_cn_mmr_len = sum(get_cn_sentence_length(s) for s in summary_cn_mmr)
    summary_cn_mmr_text = ''.join(s._texts + '。' for s in summary_cn_mmr)

    summary_cn_default = summarizer_cn(
        document_set_cn, query_cn, 100, method="default")
    summary_cn_default_len = sum(
        get_cn_sentence_length(s) for s in summary_cn_default)
    summary_cn_default_text = ''.join(
        s._texts + '。' for s in summary_cn_default)

    summary_en_mmr = summarizer_en(document_set_en, query_en, 100, method="MMR")
    summary_en_mmr_len = sum(get_en_sentence_length(s) for s in summary_en_mmr)
    summary_en_mmr_text = ' '.join(s._texts for s in summary_en_mmr)

    self.assertLessEqual(summary_cn_mmr_len, 100)
    self.assertLessEqual(summary_cn_default_len, 100)
    self.assertLessEqual(summary_en_mmr_len, 100)

    print("--------------------------chinese MMR-----------------------------")
    print(summary_cn_mmr_text)
    print("the summary length is {}".format(summary_cn_mmr_len))
    print("--------------------------chinese default-----------------------------")
    print(summary_cn_default_text)
    print("the summary length is {}".format(summary_cn_default_len))
    print("--------------------------english MMR-----------------------------")
    print(summary_en_mmr_text)
    print("the summary length is {}".format(summary_en_mmr_len))
def handle_arguments(args):
    """Translate parsed CLI options into summarization inputs.

    Args:
        args: mapping of docopt-style option names (e.g. '--format',
            '--file', '--length') to their parsed values.

    Returns:
        Tuple of (document_set, summarizer, language, words_limit, query).

    Raises:
        ValueError: for an unsupported --format, a missing --file option,
            or a --file path that is neither a directory nor a regular file.
    """
    document_format = args['--format']
    if document_format is not None and document_format not in PARSERS:
        # BUG FIX: .format() was previously called on the ValueError
        # instance (an AttributeError at raise time); format the message
        # string before constructing the exception.
        raise ValueError(
            "Unsupported input format. Possible values are {0}. Given: {1}.".format(
                ", ".join(PARSERS.keys()), document_format))
    parser = PARSERS[document_format or "plaintext"]

    words_limit = int(args['--length'] or 250)
    language = args['--language'] or "english"
    parser = parser(language)

    if args['--file'] is not None:
        file_path = abspath(args['--file'])
        if isdir(file_path):
            document_set = parser.build_documentSet_from_dir(file_path)
        elif isfile(file_path):
            document_set = parser.build_document_from_file(file_path)
        else:
            raise ValueError("Input file is invalid")
    else:
        # BUG FIX: document_set was previously left unbound when --file was
        # omitted, surfacing as a confusing NameError at the return below;
        # fail early with an explicit error instead.
        raise ValueError("Input file is invalid")

    if args['--stopwords']:
        stop_words = read_stop_words(args['--stopwords'])
    else:
        stop_words = get_stop_words(language)

    stem_or_not = bool(args['--stem'])

    # Original code was `False or to_unicode(...)`, which is identical to
    # calling to_unicode directly (`or` always yields its second operand
    # when the first is falsy).
    query = to_unicode(args['--query'])

    if args['--query']:
        summarizer_class = next(
            cls for name, cls in METHODS_Q.items() if args[name])
    else:
        summarizer_class = next(
            cls for name, cls in METHODS.items() if args[name])
    summarizer = build_summarizer(
        summarizer_class, language, stop_words, stem_or_not)

    return document_set, summarizer, language, words_limit, query
def test_summarizer(self):
    """Run conceptILPSummarizer with every decoding method ("ilp", "greedy",
    "tabu") over Chinese, English, and stemmed-English corpora, asserting
    each 100-unit summary respects the length budget."""

    def _cn_stats(summary):
        # Summary length in Chinese units plus the re-joined text.
        length = sum(get_cn_sentence_length(s) for s in summary)
        return length, ''.join(s._texts + '。' for s in summary)

    def _en_stats(summary):
        # Summary length in English units plus the re-joined text.
        length = sum(get_en_sentence_length(s) for s in summary)
        return length, ' '.join(s._texts for s in summary)

    summarizer_en = conceptILPSummarizer("english")
    summarizer_en_stem = conceptILPSummarizer("english", True)
    summarizer_cn = conceptILPSummarizer("chinese")

    data_dir = abspath(dirname(__file__)) + '/data'
    parser_cn = PlaintextParser("chinese")
    parser_en = PlaintextParser("english")
    document_set_cn = parser_cn.build_documentSet_from_dir(data_dir + '/chinese/')
    document_set_en = parser_en.build_documentSet_from_dir(data_dir + '/english/')

    summarizer_cn.stop_words = get_stop_words("chinese")
    summarizer_en.stop_words = get_stop_words("english")
    summarizer_en_stem.stop_words = get_stop_words("english")

    summary_cn_ilp = summarizer_cn(document_set_cn, 100, method="ilp")
    summary_cn_ilp_len, summary_cn_ilp_text = _cn_stats(summary_cn_ilp)
    summary_cn_greedy = summarizer_cn(document_set_cn, 100, method="greedy")
    summary_cn_greedy_len, summary_cn_greedy_text = _cn_stats(summary_cn_greedy)
    summary_cn_tabu = summarizer_cn(document_set_cn, 100, method="tabu")
    summary_cn_tabu_len, summary_cn_tabu_text = _cn_stats(summary_cn_tabu)

    summary_en_ilp = summarizer_en(document_set_en, 100, method="ilp")
    summary_en_ilp_len, summary_en_ilp_text = _en_stats(summary_en_ilp)
    summary_en_greedy = summarizer_en(document_set_en, 100, method="greedy")
    summary_en_greedy_len, summary_en_greedy_text = _en_stats(summary_en_greedy)
    summary_en_tabu = summarizer_en(document_set_en, 100, method="tabu")
    summary_en_tabu_len, summary_en_tabu_text = _en_stats(summary_en_tabu)

    summary_en_stem_ilp = summarizer_en_stem(document_set_en, 100, method="ilp")
    summary_en_stem_ilp_len, summary_en_stem_ilp_text = _en_stats(summary_en_stem_ilp)
    summary_en_stem_greedy = summarizer_en_stem(document_set_en, 100, method="greedy")
    summary_en_stem_greedy_len, summary_en_stem_greedy_text = _en_stats(summary_en_stem_greedy)
    summary_en_stem_tabu = summarizer_en_stem(document_set_en, 100, method="tabu")
    summary_en_stem_tabu_len, summary_en_stem_tabu_text = _en_stats(summary_en_stem_tabu)

    for summary_length in (
            summary_cn_ilp_len, summary_cn_greedy_len, summary_cn_tabu_len,
            summary_en_ilp_len, summary_en_greedy_len, summary_en_tabu_len,
            summary_en_stem_ilp_len, summary_en_stem_greedy_len,
            summary_en_stem_tabu_len):
        self.assertLessEqual(summary_length, 100)

    reports = (
        ("--------------------------chinese ILP-----------------------------",
         summary_cn_ilp_text, summary_cn_ilp_len),
        ("--------------------------chinese greedy-----------------------------",
         summary_cn_greedy_text, summary_cn_greedy_len),
        ("--------------------------chinese tabu-----------------------------",
         summary_cn_tabu_text, summary_cn_tabu_len),
        ("--------------------------english ILP-----------------------------",
         summary_en_ilp_text, summary_en_ilp_len),
        ("--------------------------english greedy-----------------------------",
         summary_en_greedy_text, summary_en_greedy_len),
        ("--------------------------english tabu-----------------------------",
         summary_en_tabu_text, summary_en_tabu_len),
        ("--------------------------english Stem ILP-----------------------------",
         summary_en_stem_ilp_text, summary_en_stem_ilp_len),
        ("--------------------------eng StemGreedy-----------------------------",
         summary_en_stem_greedy_text, summary_en_stem_greedy_len),
        ("--------------------------english StemTabu-----------------------------",
         summary_en_stem_tabu_text, summary_en_stem_tabu_len),
    )
    for header, text, length in reports:
        print(header)
        print(text)
        print("the summary length is {}".format(length))
def test_summarizer(self):
    """Smoke-test KLSummarizer for Chinese/English and verify that stemming
    changes the extracted content words of a sentence."""
    summarizer_en = KLSummarizer("english")
    summarizer_en_stem = KLSummarizer("english", True)
    summarizer_cn = KLSummarizer("chinese")
    # NOTE(review): only the Chinese summarizer receives stop words in this
    # test — confirm whether the English ones should as well.
    summarizer_cn.stop_words = get_stop_words("chinese")

    # Test corpora live in ./data next to this test module.
    data_file_path = abspath(dirname(__file__)) + '/data'
    cn_data_file_path = data_file_path + '/chinese/'
    en_data_file_path = data_file_path + '/english/'

    parser_cn = PlaintextParser("chinese")
    parser_en = PlaintextParser("english")
    document_set_cn = parser_cn.build_documentSet_from_dir(cn_data_file_path)
    document_set_en = parser_en.build_documentSet_from_dir(en_data_file_path)

    summary_cn = summarizer_cn(document_set_cn, 100)
    summary_cn_length = sum(
        get_cn_sentence_length(sentence) for sentence in summary_cn)
    summary_cn_text = ''.join(
        sentence._texts + '。' for sentence in summary_cn)

    summary_en = summarizer_en(document_set_en, 100)
    summary_en_length = sum(
        get_en_sentence_length(sentence) for sentence in summary_en)
    summary_en_text = ' '.join(sentence._texts for sentence in summary_en)

    summary_en_stem = summarizer_en_stem(document_set_en, 100)
    summary_en_stem_length = sum(
        get_en_sentence_length(sentence) for sentence in summary_en_stem)
    summary_en_stem_text = ' '.join(
        sentence._texts for sentence in summary_en_stem)

    self.assertLessEqual(summary_cn_length, 100)
    self.assertLessEqual(summary_en_length, 100)
    self.assertLessEqual(summary_en_stem_length, 100)

    print("--------------------------chinese KL-----------------------------")
    print(summary_cn_text)
    print("the summary length is {}".format(summary_cn_length))
    print("--------------------------english KL-----------------------------")
    print(summary_en_text)
    print("the summary length is {}".format(summary_en_length))
    print("--------------------------english stem-----------------------------")
    print(summary_en_stem_text)
    print("the summary length is {}".format(summary_en_stem_length))

    # BUG FIX: these two assignments were swapped — the plain summarizer's
    # words were stored in word_stem_list and the stemmed ones in word_list.
    word_list = summarizer_en._get_content_words_in_sentence(summary_en[0])
    word_stem_list = summarizer_en_stem._get_content_words_in_sentence(
        summary_en[0])
    # Stemming should alter at least one content word of the first sentence.
    self.assertNotEqual(word_list, word_stem_list)
def test_stopwords(self):
    """Stop-word lists for both supported languages contain expected entries."""
    for language, expected_word in (("chinese", "啊"), ("english", "and")):
        self.assertIn(expected_word, get_stop_words(language))
def test_summarizer(self):
    """Exercise LsaSummarizer: default ranking plus MMR with the tfidf and
    default metrics, for Chinese and English corpora."""
    summarizer_en = LsaSummarizer("english")
    # NOTE(review): constructed but never invoked below.
    summarizer_en_stem = LsaSummarizer("english", True)
    summarizer_cn = LsaSummarizer("chinese")

    corpus_root = abspath(dirname(__file__)) + '/data'
    parser_cn = PlaintextParser("chinese")
    parser_en = PlaintextParser("english")
    document_set_cn = parser_cn.build_documentSet_from_dir(
        corpus_root + '/chinese/')
    document_set_en = parser_en.build_documentSet_from_dir(
        corpus_root + '/english/')

    summarizer_cn.stop_words = get_stop_words("chinese")
    summarizer_en.stop_words = get_stop_words("english")

    summary_cn = summarizer_cn(document_set_cn, 100)
    summary_cn_length = sum(get_cn_sentence_length(s) for s in summary_cn)
    summary_cn_text = ''.join(s._texts + '。' for s in summary_cn)

    summary_cn_mmr = summarizer_cn(
        document_set_cn, 100, method="MMR", metric="tfidf")
    summary_cn_mmr_length = sum(
        get_cn_sentence_length(s) for s in summary_cn_mmr)
    summary_cn_text_mmr = ''.join(s._texts + '。' for s in summary_cn_mmr)

    summary_en_tfidf = summarizer_en(
        document_set_en, 100, method="MMR", metric="tfidf")
    summary_en_tfidf_length = sum(
        get_en_sentence_length(s) for s in summary_en_tfidf)
    summary_en_text_tfidf = ' '.join(s._texts for s in summary_en_tfidf)

    summary_en_mmr = summarizer_en(document_set_en, 100, method="MMR")
    summary_en_mmr_length = sum(
        get_en_sentence_length(s) for s in summary_en_mmr)
    summary_en_text_mmr = ' '.join(s._texts for s in summary_en_mmr)

    print("-----------------------------chinese default-------------------------------")
    print(summary_cn_text)
    print("the summary length is {}".format(summary_cn_length))
    print("-----------------------------chinese MMR-------------------------------")
    print(summary_cn_text_mmr)
    print("the summary length is {}".format(summary_cn_mmr_length))
    print("-----------------------------english tfidf-------------------------------")
    print(summary_en_text_tfidf)
    print("the summary length is {}".format(summary_en_tfidf_length))
    print("-----------------------------english MMR-------------------------------")
    print(summary_en_text_mmr)
    print("the summary length is {}".format(summary_en_mmr_length))

    self.assertLessEqual(summary_en_tfidf_length, 100)
    self.assertLessEqual(summary_en_mmr_length, 100)
    self.assertLessEqual(summary_cn_length, 100)
    self.assertLessEqual(summary_cn_mmr_length, 100)