def test_compute_ratings():
    """Ratings reflect greedy selection with per-pick frequency discounting."""
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)

    sentences = [
        Sentence("Dog cat fish.", Tokenizer("english")),
        Sentence("Dog cat camel.", Tokenizer("english")),
        Sentence("Fish frog horse.", Tokenizer("english")),
    ]
    document = build_document(sentences)
    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[sentences[0]] == 0
    assert ratings[sentences[1]] == -2
    assert ratings[sentences[2]] == -1

    # Due to the frequency discounting, after finding sentence s0,
    # s2 should come before s1 since only two of its words get discounted
    # rather than all 3 of s1's
    sentences = [
        Sentence("one two three", Tokenizer("english")),
        Sentence("one two four", Tokenizer("english")),
        Sentence("three five six", Tokenizer("english")),
    ]
    document = build_document(sentences)
    ratings = summarizer._compute_ratings(document.sentences)
    assert ratings[sentences[0]] == 0
    assert ratings[sentences[1]] == -2
    assert ratings[sentences[2]] == -1
def test_the_sentences_should_be_in_different_order(summarizer):
    """https://github.com/miso-belica/sumy/issues/146"""
    paragraphs = [
        ["This is 1st sentence.", "This is 2nd sentence."],
        ["This is 3rd sentence.", "This is 4th sentence."],
        ["This is 5th sentence."],
    ]
    forward_document = build_document(*paragraphs)
    backward_document = build_document(*(reversed(p) for p in reversed(paragraphs)))

    # Summarizing the fully reversed document should mirror the forward result.
    forward = summarizer(forward_document, "100%")
    backward = summarizer(backward_document, "100%")

    assert tuple(reversed(forward)) == backward
def test_cue_3():
    document = build_document(
        (
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ),
        (),
        (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        )
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    selected = summarizer.cue_method(document, 5)

    # Sentences dominated by bonus words outrank those with stigma words.
    expected = [
        ("ba " * 10).strip(),
        ("bb " * 10).strip(),
        "bb bc ba",
        "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc",
        ("ba n " * 10).strip(),
    ]
    assert [to_unicode(s) for s in selected] == expected
def test_location_method_with_empty_document():
    # An empty document must produce an empty summary rather than raise.
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("na", "nb", "nc",)

    selected = summarizer.location_method(build_document(), 10)

    assert [to_unicode(s) for s in selected] == []
def test_key_empty():
    # The key method on an empty document yields no sentences.
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)

    selected = summarizer.key_method(build_document(), 10)

    assert [to_unicode(s) for s in selected] == []
def test_three_sentences():
    document = build_document((
        "wa s s s wa s s s wa",
        "wb s wb s wb s s s s s s s s s wb",
        "wc s s wc s s wc",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("s",)

    # Asking for more sentences extends the summary in rating order.
    expected_by_count = {
        1: ["wb s wb s wb s s s s s s s s s wb"],
        2: ["wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc"],
        3: ["wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc"],
    }
    for count, expected in expected_by_count.items():
        returned = summarizer(document, count)
        assert [to_unicode(s) for s in returned] == expected
def test_single_sentence(summarizer):
    # A one-sentence document summarizes to exactly one sentence.
    only_sentence = Sentence("I am one slightly longer sentence.", Tokenizer("english"))
    document = build_document([only_sentence])

    returned = summarizer(document, 10)

    assert len(returned) == 1
def test_various_words_with_significant_percentage():
    document = build_document((
        "1 a",
        "2 b b",
        "3 c c c",
        "4 d d d",
        "5 z z z z",
        "6 e e e e e",
    ))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

    # Sentences with more repetitions of a significant word rank higher.
    expected_by_count = {
        1: ["6 e e e e e"],
        2: ["5 z z z z", "6 e e e e e"],
        3: ["3 c c c", "5 z z z z", "6 e e e e e"],
    }
    for count, expected in expected_by_count.items():
        returned = summarizer(document, count)
        assert [to_unicode(s) for s in returned] == expected
def test_single_sentence():
    # A single-sentence document is returned verbatim.
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    # NOTE(review): this assigns ``stopwords`` while sumy summarizers expose a
    # ``stop_words`` property — confirm this attribute actually takes effect.
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)

    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
def test_numpy_not_installed():
    """The LSA summarizer must raise ValueError when numpy is unavailable.

    Fix: the original restored ``lsa_module.numpy`` only on the success path;
    if the assertion failed, the module stayed patched with ``None`` and broke
    every later test in the session. Restore it in a ``finally`` block.
    """
    summarizer = LsaSummarizer()

    saved_numpy = lsa_module.numpy
    lsa_module.numpy = None  # simulate numpy being missing
    try:
        with pytest.raises(ValueError):
            summarizer(build_document(), 10)
    finally:
        # Always undo the module-level patch, even if the test fails.
        lsa_module.numpy = saved_numpy
def test_numpy_not_installed():
    """The LexRank summarizer must raise ValueError when numpy is unavailable.

    Fix: the original restored ``lex_rank_module.numpy`` only on the success
    path; if the assertion failed, the module stayed patched with ``None`` and
    broke every later test in the session. Restore it in a ``finally`` block.
    """
    summarizer = LexRankSummarizer()

    saved_numpy = lex_rank_module.numpy
    lex_rank_module.numpy = None  # simulate numpy being missing
    try:
        with pytest.raises(ValueError):
            summarizer(build_document(), 10)
    finally:
        # Always undo the module-level patch, even if the test fails.
        lex_rank_module.numpy = saved_numpy
def test_single_sentence():
    # Czech one-sentence document: the summary is that single sentence.
    document = build_document(("Já jsem jedna věta",))
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem",)

    returned = summarizer(document, 10)

    assert len(returned) == 1
def test_single_sentence():
    # One input sentence yields a one-sentence summary.
    document = build_document(("I am one sentence",))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am",)

    returned = summarizer(document, 10)

    assert len(returned) == 1
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)

    # The middle sentence has the strongest overlap with the rest.
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
def test_cue_1():
    # A single sentence mixing bonus, stigma and unknown words is still returned.
    document = build_document(
        ("ba bb bc bb unknown ľščťžýáíé sb sc sb",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc",)
    summarizer.stigma_words = ("sa", "sb", "sc",)

    selected = summarizer.cue_method(document, 10)

    assert len(selected) == 1
def test_compute_tf():
    """Term frequency is each word's count divided by the total word count."""
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    soccer = Sentence("kicking soccer balls.", Tokenizer("english"))
    chicken = Sentence("eating chicken dumplings.", Tokenizer("english"))

    # Two distinct sentences: six words total, each appearing once.
    freq = summarizer._compute_tf(build_document([soccer, chicken]).sentences)
    for word in ("kicking", "soccer", "balls", "eating", "chicken", "dumplings"):
        assert freq[word] == 1 / 6

    # Duplicating the first sentence doubles its words' share (nine words total).
    freq = summarizer._compute_tf(build_document([soccer, soccer, chicken]).sentences)
    for word in ("kicking", "soccer", "balls"):
        assert freq[word] == 2 / 9
    for word in ("eating", "chicken", "dumplings"):
        assert freq[word] == 1 / 9
def test_key_1():
    document = build_document(
        ("wa wb wc wd", "I like music",),
        ("This is test sentence with some extra words and bonus",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

    selected = summarizer.key_method(document, 1)

    # Only the sentence containing a bonus word is picked.
    assert [to_unicode(s) for s in selected] == [
        "This is test sentence with some extra words and bonus",
    ]
def test_get_all_content_words_in_doc():
    # Content words are collected per sentence, lowercased, duplicates kept.
    summarizer = _build_summarizer(EMPTY_STOP_WORDS)
    first = Sentence("One two three.", Tokenizer("english"))
    second = Sentence("One two three.", Tokenizer("english"))
    document = build_document([first, second])

    content_words = summarizer._get_all_content_words_in_doc(document.sentences)

    tally = {}
    for word in content_words:
        tally[word] = tally.get(word, 0) + 1
    assert tally == {"one": 2, "two": 2, "three": 2}
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    ratings = summarizer.rate_sentences(document)

    first, second, third = document.sentences
    assert len(ratings) == 3
    # The middle sentence overlaps both neighbours, so it rates highest.
    assert ratings[second] > ratings[first] > ratings[third]
def test_dictionary_without_stop_words():
    # Stop words are filtered out case-insensitively when building the dictionary.
    summarizer = LsaSummarizer()
    summarizer.stop_words = ["stop", "Halt", "SHUT", "HmMm"]
    document = build_document(
        ("stop halt shut hmmm", "Stop Halt Shut Hmmm",),
        ("StOp HaLt ShUt HmMm", "STOP HALT SHUT HMMM",),
        ("Some relevant sentence", "Some moRe releVant sentEnce",),
    )

    dictionary = summarizer._create_dictionary(document)

    assert frozenset(dictionary.keys()) == frozenset(
        ["some", "more", "relevant", "sentence"]
    )
def test_key_2():
    document = build_document(
        ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
        ("This is bonus test sentence with some extra words and bonus",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("nom", "bonus",)

    selected = summarizer.key_method(document, 2)

    # The two sentences with the most bonus-word occurrences win.
    assert [to_unicode(s) for s in selected] == [
        "Om nom nom nom nom",
        "This is bonus test sentence with some extra words and bonus",
    ]
def test_document():
    document = build_document(
        ("I am the sentence you like", "Do you like me too",),
        ("This sentence is better than that above", "Are you kidding me",)
    )
    summarizer = LsaSummarizer()
    # NOTE(review): this assigns ``stopwords`` while sumy summarizers expose a
    # ``stop_words`` property — confirm this attribute actually takes effect.
    summarizer.stopwords = (
        "I", "am", "the", "you", "are", "me", "is", "than", "that", "this",
    )

    sentences = summarizer(document, 2)

    assert len(sentences) == 2
    assert to_unicode(sentences[0]) == "I am the sentence you like"
    assert to_unicode(sentences[1]) == "This sentence is better than that above"
def test_two_sentences_but_one_winner():
    # Of two similar Czech sentences, only the higher-rated one is returned.
    document = build_document(
        ("Já jsem 1. vítězná ta věta", "A já ta 2. vítězná věta")
    )
    summarizer = LuhnSummarizer()
    summarizer.stop_words = ("já", "jsem", "a", "ta",)

    returned = summarizer(document, 1)

    assert [to_unicode(s) for s in returned] == ["A já ta 2. vítězná věta"]
def test_two_sentences():
    # Requesting more sentences than exist returns everything in order.
    document = build_document(
        ("I am that 1. sentence", "And I am 2. winning prize")
    )
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)

    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
def test_title_method_without_title():
    document = build_document(
        ("This is sentence", "This is another one",),
        ("And some next sentence but no heading",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.null_words = ("this", "is", "some", "and",)

    selected = summarizer.title_method(document, 10)

    # With no heading to score against, all sentences come back in order.
    assert [to_unicode(s) for s in selected] == [
        "This is sentence",
        "This is another one",
        "And some next sentence but no heading",
    ]
def test_rating_with_zero_or_single_words_in_sentences(sentences, expected_ratings):
    """
    This is an edge-case test when the sentence(s) have only one word or even
    zero words. This test makes me sure the logic will not break when such a
    case is encountered.
    """
    document = build_document(sentences)
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)

    expected = {
        document.sentences[0]: pytest.approx(expected_ratings[0]),
        document.sentences[1]: pytest.approx(expected_ratings[1]),
    }
    assert ratings == expected
def test_sentences_rating():
    document = build_document([
        "a c e g",
        "a b c d e f g",
        "b d f",
    ])
    summarizer = TextRankSummarizer()

    ratings = summarizer.rate_sentences(document)

    expected_scores = (0.29714368215098025, 0.42683373199392705, 0.2760223553913001)
    assert ratings == {
        sentence: pytest.approx(score)
        for sentence, score in zip(document.sentences, expected_scores)
    }
    # PageRank-style ratings form a probability distribution.
    assert pytest.approx(sum(ratings.values())) == 1
def test_cue_letters_case():
    document = build_document(
        ("X X X", "x x x x",),
        ("w w w", "W W W W",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("X", "w",)
    summarizer.stigma_words = ("stigma",)

    selected = summarizer.cue_method(document, 2)

    # Bonus-word matching ignores letter case; the longer variants win.
    assert [to_unicode(s) for s in selected] == ["x x x x", "W W W W"]
def test_unique_words():
    # ``document.words`` deduplicated and sorted gives a case-sensitive word list.
    document = build_document(
        ("Nějaký muž šel kolem naší zahrady", "Nějaký muž šel kolem vaší zahrady",),
        ("Už už abych taky šel",),
    )

    expected = [
        "Nějaký", "Už", "abych", "kolem", "muž",
        "naší", "taky", "už", "vaší", "zahrady", "šel",
    ]
    assert sorted(frozenset(document.words)) == expected
def test_key_3():
    document = build_document(
        ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
        ("x X x X",)
    )
    summarizer = EdmundsonSummarizer()
    summarizer.bonus_words = ("wa", "X",)

    # Default weighting favours the sentences repeating "wa" the most.
    selected = summarizer.key_method(document, 3)
    assert [to_unicode(s) for s in selected] == [
        "wa wa wa",
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
    ]

    # With weight=0 the ranking shifts and the "x X" sentence qualifies.
    selected = summarizer.key_method(document, 3, weight=0)
    assert [to_unicode(s) for s in selected] == [
        "wa wa wa wa",
        "wa Wa Wa Wa wa",
        "x X x X",
    ]