Example #1
    def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                         "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                         "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)

        self.assertEqual(
            to_unicode(document.paragraphs[1].sentences[0]),
            "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
        self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                         "Aj súbory majú predsa city.")
Example #2
    def test_cue_2(self):
        document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb", ),
                                  ("Pepek likes spinach", ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #3
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "na",
            "nb",
            "nc",
            "nd",
            "ne",
        )

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #4
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1,
                                         key_weight=1,
                                         title_weight=0,
                                         location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words",
                                  "like", "because")
        summarizer.stigma_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(
            to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #5
    def test_title_method_3(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer.title_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
                         "This is next paragraph because of blank line above")
        self.assertEqual(
            to_unicode(sentences[2]),
            "Here is the winner because contains words like cool and heading")
Example #6
    def test_cue_3(self):
        document = build_document(
            (
                "ba "*10,
                "bb "*10,
                " sa"*8 + " bb"*10,
                "bb bc ba",
            ),
            (),
            (
                "babbbc "*10,
                "na nb nc nd sa" + " bc"*10,
                " ba n"*10,
            )
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba "*10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb "*10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n "*10).strip())
Example #7
    def test_cue_3(self):
        document = build_document((
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ), (), (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba " * 10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb " * 10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
                         "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n " * 10).strip())
Example #8
    def test_document(self):
        document = build_document((
            "I am the sentence you like",
            "Do you like me too",
        ), (
            "This sentence is better than that above",
            "Are you kidding me",
        ))
        summarizer = LsaSummarizer()
        summarizer.stopwords = (
            "I",
            "am",
            "the",
            "you",
            "are",
            "me",
            "is",
            "than",
            "that",
            "this",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]),
                         "This sentence is better than that above")
Example #9
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("na", "nb", "nc", "nd", "ne",)

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #10
    def test_two_sentences(self):
        document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem", "a", "ta",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #11
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = GraphSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #12
def test_two_sentences():
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
Example #13
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #14
    def test_document(self):
        document = build_document(
            ("I am the sentence you like", "Do you like me too",),
            ("This sentence is better than that above", "Are you kidding me",)
        )
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]), "This sentence is better than that above")
Example #15
    def test_key_2(self):
        document = build_document(
            ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
            ("This is bonus test sentence with some extra words and bonus",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("nom", "bonus",)

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #16
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #17
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #18
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #19
    def test_title_method_without_title(self):
        document = build_document(
            ("This is sentence", "This is another one",),
            ("And some next sentence but no heading",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "some", "and",)

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]), "And some next sentence but no heading")
Example #20
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #21
    def test_key_no_bonus_words_in_document(self):
        document = build_document(
            ("wa wb wc wd", "I like music",),
            ("This is test sentence with some extra words",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
            "This is test sentence with some extra words")
Example #22
    def test_cue_letters_case(self):
        document = build_document(
            ("X X X", "x x x x",),
            ("w w w", "W W W W",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("X", "w",)
        summarizer.stigma_words = ("stigma",)

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #23
    def test_two_sentences(self):
        document = build_document(
            ("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = (
            "já",
            "jsem",
            "a",
            "ta",
        )

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #24
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
            "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
            "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
            "Už už abych taky šel")
Example #25
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
                         "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
                         "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
                         "Už už abych taky šel")
Example #26
def create_list():

    global page_title, l, title_tokens, net_graph, graph_list, q, badtags

    addonv2.path(pdfname)
    page_title, l, title_tokens = bookmark_page_v4.bkpage(pdfname)
    badtags = [
        'cover', 'notes and further reading',
        'title page', 'copyright page', 'contents', 'new to the third edition',
        'review questions', 'laboratory exercises',
        'epilogue: algorithms that run forever', 'brief contents',
        'about the authors', 'exercises', 'solved exercises',
        'about the author', 'preface', 'selected bibliography',
        'acknowledgments', 'references', 'index', 'brief contents', 'foreword',
        'bibliography', 'table of contents', 'foreword', 'appendix',
        'epilogue', 'about the cd'
    ]

    net_graph = nx.DiGraph()
    graph_list = list()

    for i in range(0, len(badtags)):
        badtags[i] = badtags[i].encode('utf-8')
    q = list()
    k = -1

    for i in range(0, len(addonv2.l)):
        flag = 0
        for j in range(0, len(l)):

            if addonv2.l[i][1] == l[j] and addonv2.l[i][0] != '3':
                k += 1
                t = list()
                t.extend([
                    addonv2.l[i][0], addonv2.l[i][1],
                    page_title[addonv2.l[i][1]]
                ])
                print(t, ' ', k)
                q.append(t)
                flag = 1
        if flag == 0:
            if int(addonv2.l[i][0]) == 1:
                t = list()
                t.extend([addonv2.l[i][0], addonv2.l[i][1], ""])
                q.append(t)

    for i in range(0, len(q) - 1):
        if q[i][2] == "":
            q[i][2] = q[i + 1][2]

    for i in range(0, len(q)):
        q[i][1] = to_unicode(q[i][1]).strip()
        q[i][1] = re.sub(u"(\u2018|\u2019|\u201c|\u201d)", "", q[i][1])
        q[i][1] = re.sub(u"\xa0", " ", q[i][1])
        print(q[i], i)

    print(len(q))

    gFunc2(q)
    get_data(badtags)
Example #27
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"), Tokenizer("czech"))
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo.")
Example #28
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #29
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #30
    def to_sentences(self, paragraph):
        if hasattr(self._sentence_tokenizer, '_params'):
            extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(
                self._language, [])
            self._sentence_tokenizer._params.abbrev_types.update(
                extra_abbreviations)
        sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph))
        return tuple(map(unicode.strip, sentences))
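
The method above injects language-specific abbreviations into a Punkt-style sentence tokenizer before splitting a paragraph. A minimal standalone sketch of the same pattern, assuming NLTK's PunktSentenceTokenizer; the abbreviation table and helper name here are illustrative, not the class's actual LANGUAGE_EXTRA_ABREVS data.

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Punkt stores abbreviations lowercase and without the trailing period.
EXTRA_ABBREVIATIONS = {"english": ["e.g", "i.e", "etc"]}

def split_sentences(text, language="english"):
    tokenizer = PunktSentenceTokenizer()
    # Register the extra abbreviations so a period after them is not treated as a sentence end.
    tokenizer._params.abbrev_types.update(EXTRA_ABBREVIATIONS.get(language, []))
    return tuple(sentence.strip() for sentence in tokenizer.tokenize(text))

print(split_sentences("Use a stemmer, e.g. Snowball. It usually helps."))
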
Example #31
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #32
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #33
    def test_key_2(self):
        document = build_document((
            "Om nom nom nom nom",
            "Sure I summarize it, with bonus",
        ), ("This is bonus test sentence with some extra words and bonus", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "nom",
            "bonus",
        )

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(
            to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #34
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #35
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "This is only one sentence.")
Example #36
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #37
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "This is only one sentence.")
Example #38
    def test_key_no_bonus_words_in_document(self):
        document = build_document((
            "wa wb wc wd",
            "I like music",
        ), ("This is test sentence with some extra words", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
            "bonus",
        )

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
                         "This is test sentence with some extra words")
Example #39
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.")
        self.assertEqual(to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo.")
Example #40
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1

    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2

    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
Example #41
    def test_title_method_without_title(self):
        document = build_document((
            "This is sentence",
            "This is another one",
        ), ("And some next sentence but no heading", ))

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "some",
            "and",
        )

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]),
                         "And some next sentence but no heading")
Example #42
    def test_title_method_2(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer.title_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "This is next paragraph because of blank line above")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #43
    def test_cue_2(self):
        document = build_document(
            ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
            ("Pepek likes spinach",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #44
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #45
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #46
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo."
        )
Example #47
    def test_cue_letters_case(self):
        document = build_document((
            "X X X",
            "x x x x",
        ), (
            "w w w",
            "W W W W",
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "X",
            "w",
        )
        summarizer.stigma_words = ("stigma", )

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #48
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = GraphSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #49
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
Example #50
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #51
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #52
    def test_key_3(self):
        document = build_document((
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        ), ("x X x X", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "wa",
            "X",
        )

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #53
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s", )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
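
The ranking checked above follows Luhn's cluster scoring: non-stop words count as significant, a cluster is a run in which significant words are separated by at most a few insignificant ones, and a sentence scores (significant words in its best cluster) squared divided by the cluster length. A rough standalone sketch under those assumptions (the max_gap value and helper are illustrative, not sumy's exact code):

def luhn_score(sentence, stop_words, max_gap=4):
    words = sentence.split()
    significant = [i for i, w in enumerate(words) if w.lower() not in stop_words]
    if not significant:
        return 0.0
    best, start = 0.0, 0
    for k in range(1, len(significant) + 1):
        # Close the current cluster at the end, or when the gap to the next significant word is too large.
        if k == len(significant) or significant[k] - significant[k - 1] - 1 > max_gap:
            count = k - start
            length = significant[k - 1] - significant[start] + 1
            best = max(best, count * count / length)
            start = k
    return best

# "wb s wb s wb ..." scores 9/5, "wc s s wc s s wc" scores 9/7, "wa s s s wa s s s wa" scores 9/9,
# which matches the order in which the sentences are preferred above.
for s in ("wa s s s wa s s s wa",
          "wb s wb s wb s s s s s s s s s wb",
          "wc s s wc s s wc"):
    print(s, luhn_score(s, {"s"}))
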
Example #54
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #55
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1,
            title_weight=0, location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words", "like", "because")
        summarizer.stigma_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #56
    def test_real_example(self):
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech"))
        summarizer = LuhnSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        returned = summarizer(parser.document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(
            to_unicode(returned[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(returned[1]),
            "Připadal si, že je mezi malými dětmi a realizoval se tím, "
            "že si ve třídě o rok mladších dětí budoval vedoucí pozici.")
Example #57
    def test_key_3(self):
        document = build_document(
            ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
            ("x X x X",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("wa", "X",)

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #58
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s",)

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #59
    def test_to_unicode(self):
        returned = compat.to_unicode(self.o)
        self.assertStringsEqual(UNICODE_STRING, returned)