Example #1
    def test_annotated_text(self):
        path = expand_resource_path("snippets/paragraphs.html")
        url = "http://www.snippet.org/paragraphs.html"
        parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

        document = parser.document

        self.assertEqual(len(document.paragraphs), 2)

        self.assertEqual(len(document.paragraphs[0].headings), 1)
        self.assertEqual(len(document.paragraphs[0].sentences), 1)

        self.assertEqual(to_unicode(document.paragraphs[0].headings[0]),
                         "Toto je nadpis prvej úrovne")
        self.assertEqual(to_unicode(document.paragraphs[0].sentences[0]),
                         "Toto je prvý odstavec a to je fajn.")

        self.assertEqual(len(document.paragraphs[1].headings), 0)
        self.assertEqual(len(document.paragraphs[1].sentences), 2)

        self.assertEqual(
            to_unicode(document.paragraphs[1].sentences[0]),
            "Tento text je tu aby vyplnil prázdne miesto v srdci súboru.")
        self.assertEqual(to_unicode(document.paragraphs[1].sentences[1]),
                         "Aj súbory majú predsa city.")
Example #2
    def test_cue_2(self):
        document = build_document(("ba bb bc bb unknown ľščťžýáíé sb sc sb", ),
                                  ("Pepek likes spinach", ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #3
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "na",
            "nb",
            "nc",
            "nd",
            "ne",
        )

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #4
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1,
                                         key_weight=1,
                                         title_weight=0,
                                         location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words",
                                  "like", "because")
        summarizer.stigma_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(
            to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #5
    def test_title_method_3(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "I",
            "am",
            "and",
        )

        sentences = summarizer.title_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]),
                         "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
                         "This is next paragraph because of blank line above")
        self.assertEqual(
            to_unicode(sentences[2]),
            "Here is the winner because contains words like cool and heading")
Example #6
    def test_cue_3(self):
        document = build_document(
            (
                "ba "*10,
                "bb "*10,
                " sa"*8 + " bb"*10,
                "bb bc ba",
            ),
            (),
            (
                "babbbc "*10,
                "na nb nc nd sa" + " bc"*10,
                " ba n"*10,
            )
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba "*10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb "*10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
            "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n "*10).strip())
Example #7
    def test_cue_3(self):
        document = build_document((
            "ba " * 10,
            "bb " * 10,
            " sa" * 8 + " bb" * 10,
            "bb bc ba",
        ), (), (
            "babbbc " * 10,
            "na nb nc nd sa" + " bc" * 10,
            " ba n" * 10,
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
        )
        summarizer.stigma_words = (
            "sa",
            "sb",
            "sc",
        )

        sentences = summarizer.cue_method(document, 5)
        self.assertEqual(len(sentences), 5)
        self.assertEqual(to_unicode(sentences[0]), ("ba " * 10).strip())
        self.assertEqual(to_unicode(sentences[1]), ("bb " * 10).strip())
        self.assertEqual(to_unicode(sentences[2]), "bb bc ba")
        self.assertEqual(to_unicode(sentences[3]),
                         "na nb nc nd sa bc bc bc bc bc bc bc bc bc bc")
        self.assertEqual(to_unicode(sentences[4]), ("ba n " * 10).strip())
Example #8
    def test_document(self):
        document = build_document((
            "I am the sentence you like",
            "Do you like me too",
        ), (
            "This sentence is better than that above",
            "Are you kidding me",
        ))
        summarizer = LsaSummarizer()
        summarizer.stopwords = (
            "I",
            "am",
            "the",
            "you",
            "are",
            "me",
            "is",
            "than",
            "that",
            "this",
        )

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
                         "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]),
                         "This sentence is better than that above")
Example #9
    def test_location_method_2(self):
        document = build_document_from_string("""
            # na nb nc ha hb
            ha = 1 + 1 + 0 = 2
            middle = 0
            ha hb = 2 + 1 + 0 = 3

            first = 1
            ha hb ha = 3
            last = 1

            # hc hd
            hb hc hd = 3 + 1 + 0 = 4
            ha hb = 2 + 1 + 0 = 3
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("na", "nb", "nc", "nd", "ne",)

        sentences = summarizer.location_method(document, 4, w_p1=0, w_p2=0)
        self.assertEqual(len(sentences), 4)
        self.assertEqual(to_unicode(sentences[0]), "ha hb = 2 + 1 + 0 = 3")
        self.assertEqual(to_unicode(sentences[1]), "ha hb ha = 3")
        self.assertEqual(to_unicode(sentences[2]), "hb hc hd = 3 + 1 + 0 = 4")
        self.assertEqual(to_unicode(sentences[3]), "ha hb = 2 + 1 + 0 = 3")
Example #10
    def test_two_sentences(self):
        document = build_document(("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("já", "jsem", "a", "ta",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #11
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = GraphSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #12
def test_two_sentences():
    document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
Example #13
    def test_two_sentences(self):
        document = build_document(("I am that 1. sentence", "And I am 2. winning prize"))
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ("I", "am", "and", "that",)

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "I am that 1. sentence")
        self.assertEqual(to_unicode(returned[1]), "And I am 2. winning prize")
Example #14
    def test_document(self):
        document = build_document(
            ("I am the sentence you like", "Do you like me too",),
            ("This sentence is better than that above", "Are you kidding me",)
        )
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the", "you", "are", "me", "is", "than", "that", "this",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
        self.assertEqual(to_unicode(sentences[1]), "This sentence is better than that above")
Example #15
    def test_key_2(self):
        document = build_document(
            ("Om nom nom nom nom", "Sure I summarize it, with bonus",),
            ("This is bonus test sentence with some extra words and bonus",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("nom", "bonus",)

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #16
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #17
    def test_sentences_in_right_order(self):
        document = build_document_from_string("""
            # Heading one
            First sentence.
            Second sentence.
            Third sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 4)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "First sentence.")
        self.assertEqual(to_unicode(sentences[1]), "Second sentence.")
        self.assertEqual(to_unicode(sentences[2]), "Third sentence.")
Example #18
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #19
    def test_title_method_without_title(self):
        document = build_document(
            ("This is sentence", "This is another one",),
            ("And some next sentence but no heading",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "some", "and",)

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]), "And some next sentence but no heading")
Example #20
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
Example #21
    def test_key_no_bonus_words_in_document(self):
        document = build_document(
            ("wa wb wc wd", "I like music",),
            ("This is test sentence with some extra words",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc", "bonus",)

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
            "This is test sentence with some extra words")
Example #22
    def test_cue_letters_case(self):
        document = build_document(
            ("X X X", "x x x x",),
            ("w w w", "W W W W",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("X", "w",)
        summarizer.stigma_words = ("stigma",)

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #23
    def test_two_sentences(self):
        document = build_document(
            ("Já jsem 1. věta", "A já ta 2. vítězná výhra"))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = (
            "já",
            "jsem",
            "a",
            "ta",
        )

        returned = summarizer(document, 10)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "Já jsem 1. věta")
        self.assertEqual(to_unicode(returned[1]), "A já ta 2. vítězná výhra")
Example #24
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
            "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
            "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
            "Už už abych taky šel")
Example #25
    def test_sentences(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.sentences), 3)
        self.assertEqual(to_unicode(document.sentences[0]),
                         "Nějaký muž šel kolem naší zahrady")
        self.assertEqual(to_unicode(document.sentences[1]),
                         "Nějaký jiný muž šel kolem vaší zahrady")
        self.assertEqual(to_unicode(document.sentences[2]),
                         "Už už abych taky šel")
Example #26
def create_list():

    global page_title, l, title_tokens, net_graph, graph_list, q, badtags

    addonv2.path(pdfname)
    page_title, l, title_tokens = bookmark_page_v4.bkpage(pdfname)
    badtags = [
        'cover', 'notes and further reading',
        'title page', 'copyright page', 'contents', 'new to the third edition',
        'review questions', 'laboratory exercises',
        'epilogue: algorithms that run forever', 'brief contents',
        'about the authors', 'exercises', 'solved exercises',
        'about the author', 'preface', 'selected bibliography',
        'acknowledgments', 'references', 'index', 'brief contents', 'foreword',
        'bibliography', 'table of contents', 'foreword', 'appendix',
        'epilogue', 'about the cd'
    ]

    net_graph = nx.DiGraph()
    graph_list = list()

    for i in range(0, len(badtags)):
        badtags[i] = badtags[i].encode('utf-8')
    q = list()
    k = -1

    for i in range(0, len(addonv2.l)):
        flag = 0
        for j in range(0, len(l)):

            if addonv2.l[i][1] == l[j] and addonv2.l[i][0] != '3':
                k += 1
                t = list()
                t.extend([
                    addonv2.l[i][0], addonv2.l[i][1],
                    page_title[addonv2.l[i][1]]
                ])
                print(t, ' ', k)
                q.append(t)
                flag = 1
        if flag == 0:
            if int(addonv2.l[i][0]) == 1:
                t = list()
                t.extend([addonv2.l[i][0], addonv2.l[i][1], ""])
                q.append(t)

    for i in range(0, len(q) - 1):
        if q[i][2] == "":
            q[i][2] = q[i + 1][2]

    for i in range(0, len(q)):
        q[i][1] = to_unicode(q[i][1]).strip()
        q[i][1] = re.sub(u"(\u2018|\u2019|\u201c|\u201d)", "", q[i][1])
        q[i][1] = re.sub(u"\xa0", " ", q[i][1])
        print(q[i], i)

    print(len(q))

    gFunc2(q)
    get_data(badtags)
Example #27
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"), Tokenizer("czech"))
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo.")
Example #28
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #29
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #30
    def to_sentences(self, paragraph):
        if hasattr(self._sentence_tokenizer, '_params'):
            extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(
                self._language, [])
            self._sentence_tokenizer._params.abbrev_types.update(
                extra_abbreviations)
        sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph))
        return tuple(map(unicode.strip, sentences))
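
The method above injects language-specific abbreviations into a Punkt-style sentence tokenizer before splitting a paragraph. A minimal standalone sketch of the same pattern, assuming NLTK's PunktSentenceTokenizer; the abbreviation table and helper name here are illustrative, not the class's actual LANGUAGE_EXTRA_ABREVS data.

from nltk.tokenize.punkt import PunktSentenceTokenizer

# Punkt stores abbreviations lowercase and without the trailing period.
EXTRA_ABBREVIATIONS = {"english": ["e.g", "i.e", "etc"]}

def split_sentences(text, language="english"):
    tokenizer = PunktSentenceTokenizer()
    # Register the extra abbreviations so a period after them is not treated as a sentence end.
    tokenizer._params.abbrev_types.update(EXTRA_ABBREVIATIONS.get(language, []))
    return tuple(sentence.strip() for sentence in tokenizer.tokenize(text))

print(split_sentences("Use a stemmer, e.g. Snowball. It usually helps."))
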
Example #31
    def test_single_sentence(self):
        document = build_document(("I am the sentence you like",))
        summarizer = LsaSummarizer()
        summarizer.stopwords = ("I", "am", "the",)

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "I am the sentence you like")
Example #32
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stopwords = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example #33
    def test_key_2(self):
        document = build_document((
            "Om nom nom nom nom",
            "Sure I summarize it, with bonus",
        ), ("This is bonus test sentence with some extra words and bonus", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "nom",
            "bonus",
        )

        sentences = summarizer.key_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "Om nom nom nom nom")
        self.assertEqual(
            to_unicode(sentences[1]),
            "This is bonus test sentence with some extra words and bonus")
Example #34
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #35
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]), "This is only one sentence.")
Example #36
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example #37
    def test_less_sentences_than_requested(self):
        document = build_document_from_string("""
            This is only one sentence.
        """)
        summarizer = RandomSummarizer()

        sentences = summarizer(document, 10)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
                         "This is only one sentence.")
Example #38
    def test_key_no_bonus_words_in_document(self):
        document = build_document((
            "wa wb wc wd",
            "I like music",
        ), ("This is test sentence with some extra words", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "ba",
            "bb",
            "bc",
            "bonus",
        )

        sentences = summarizer.key_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wb wc wd")
        self.assertEqual(to_unicode(sentences[1]), "I like music")
        self.assertEqual(to_unicode(sentences[2]),
                         "This is test sentence with some extra words")
Example #39
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením.")
        self.assertEqual(to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo.")
Example #40
def test_annotated_text():
    path = expand_resource_path("snippets/paragraphs.html")
    url = "http://www.snippet.org/paragraphs.html"
    parser = HtmlParser.from_file(path, url, Tokenizer("czech"))

    document = parser.document

    assert len(document.paragraphs) == 2

    assert len(document.paragraphs[0].headings) == 1
    assert len(document.paragraphs[0].sentences) == 1

    assert to_unicode(document.paragraphs[0].headings[0]) == "Toto je nadpis prvej úrovne"
    assert to_unicode(document.paragraphs[0].sentences[0]) == "Toto je prvý odstavec a to je fajn."

    assert len(document.paragraphs[1].headings) == 0
    assert len(document.paragraphs[1].sentences) == 2

    assert to_unicode(document.paragraphs[1].sentences[0]) == "Tento text je tu aby vyplnil prázdne miesto v srdci súboru."
    assert to_unicode(document.paragraphs[1].sentences[1]) == "Aj súbory majú predsa city."
Example #41
    def test_title_method_without_title(self):
        document = build_document((
            "This is sentence",
            "This is another one",
        ), ("And some next sentence but no heading", ))

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = (
            "this",
            "is",
            "some",
            "and",
        )

        sentences = summarizer.title_method(document, 10)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "This is sentence")
        self.assertEqual(to_unicode(sentences[1]), "This is another one")
        self.assertEqual(to_unicode(sentences[2]),
                         "And some next sentence but no heading")
Example #42
    def test_title_method_2(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer()
        summarizer.null_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer.title_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "This is next paragraph because of blank line above")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #43
    def test_cue_2(self):
        document = build_document(
            ("ba bb bc bb unknown ľščťžýáíé sb sc sb",),
            ("Pepek likes spinach",)
        )

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("ba", "bb", "bc",)
        summarizer.stigma_words = ("sa", "sb", "sc",)

        sentences = summarizer.cue_method(document, 10)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
        self.assertEqual(to_unicode(sentences[1]), "Pepek likes spinach")

        sentences = summarizer.cue_method(document, 1)
        self.assertEqual(len(sentences), 1)
        self.assertEqual(to_unicode(sentences[0]),
            "ba bb bc bb unknown ľščťžýáíé sb sc sb")
Example #44
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #45
    def test_headings(self):
        document = build_document_from_string("""
            Nějaký muž šel kolem naší zahrady
            Nějaký jiný muž šel kolem vaší zahrady

            # Nová myšlenka
            Už už abych taky šel
        """)

        self.assertEqual(len(document.headings), 1)
        self.assertEqual(to_unicode(document.headings[0]), "Nová myšlenka")
Example #46
    def test_real_example(self):
        """Source: http://www.prevko.cz/dite/skutecne-pribehy-deti"""
        parser = PlaintextParser.from_string(
            load_resource("snippets/prevko.txt"),
            Tokenizer("czech")
        )
        summarizer = LsaSummarizer(Stemmer("czech"))
        summarizer.stop_words = get_stop_words("czech")

        sentences = summarizer(parser.document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(
            to_unicode(sentences[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(sentences[1]),
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, "
            "což se chlapci ani trochu nelíbilo."
        )
Example #47
    def test_cue_letters_case(self):
        document = build_document((
            "X X X",
            "x x x x",
        ), (
            "w w w",
            "W W W W",
        ))

        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "X",
            "w",
        )
        summarizer.stigma_words = ("stigma", )

        sentences = summarizer.cue_method(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]), "x x x x")
        self.assertEqual(to_unicode(sentences[1]), "W W W W")
Example #48
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = GraphSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #49
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
Example #50
    def test_three_sentences_but_second_winner(self):
        document = build_document([
            "I am that 1. sentence",
            "And I am 2. sentence - winning sentence",
            "And I am 3. sentence - winner is my 2nd name",
        ])
        summarizer = TextRankSummarizer()
        summarizer.stop_words = ["I", "am", "and", "that"]

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "And I am 2. sentence - winning sentence")
Example #51
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #52
    def test_key_3(self):
        document = build_document((
            "wa",
            "wa wa",
            "wa wa wa",
            "wa wa wa wa",
            "wa Wa Wa Wa wa",
        ), ("x X x X", ))
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = (
            "wa",
            "X",
        )

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #53
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s", )

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]),
                         "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
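
The ranking checked above follows Luhn's cluster scoring: non-stop words count as significant, a cluster is a run in which significant words are separated by at most a few insignificant ones, and a sentence scores (significant words in its best cluster) squared divided by the cluster length. A rough standalone sketch under those assumptions (the max_gap value and helper are illustrative, not sumy's exact code):

def luhn_score(sentence, stop_words, max_gap=4):
    words = sentence.split()
    significant = [i for i, w in enumerate(words) if w.lower() not in stop_words]
    if not significant:
        return 0.0
    best, start = 0.0, 0
    for k in range(1, len(significant) + 1):
        # Close the current cluster at the end, or when the gap to the next significant word is too large.
        if k == len(significant) or significant[k] - significant[k - 1] - 1 > max_gap:
            count = k - start
            length = significant[k - 1] - significant[start] + 1
            best = max(best, count * count / length)
            start = k
    return best

# "wb s wb s wb ..." scores 9/5, "wc s s wc s s wc" scores 9/7, "wa s s s wa s s s wa" scores 9/9,
# which matches the order in which the sentences are preferred above.
for s in ("wa s s s wa s s s wa",
          "wb s wb s wb s s s s s s s s s wb",
          "wc s s wc s s wc"):
    print(s, luhn_score(s, {"s"}))
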
Example #54
    def test_various_words_with_significant_percentage(self):
        document = build_document((
            "1 a",
            "2 b b",
            "3 c c c",
            "4 d d d",
            "5 z z z z",
            "6 e e e e e",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("1", "2", "3", "4", "5", "6")

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "6 e e e e e")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "5 z z z z")
        self.assertEqual(to_unicode(returned[1]), "6 e e e e e")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "3 c c c")
        self.assertEqual(to_unicode(returned[1]), "5 z z z z")
        self.assertEqual(to_unicode(returned[2]), "6 e e e e e")
Example #55
    def test_mixed_cue_key(self):
        document = build_document_from_string("""
            # This is cool heading
            Because I am sentence I like words
            And because I am string I like characters

            # blank and heading
            This is next paragraph because of blank line above
            Here is the winner because contains words like cool and heading
        """)

        summarizer = EdmundsonSummarizer(cue_weight=1, key_weight=1,
            title_weight=0, location_weight=0)
        summarizer.bonus_words = ("cool", "heading", "sentence", "words", "like", "because")
        summarizer.stigma_words = ("this", "is", "I", "am", "and",)

        sentences = summarizer(document, 2)
        self.assertEqual(len(sentences), 2)
        self.assertEqual(to_unicode(sentences[0]),
            "Because I am sentence I like words")
        self.assertEqual(to_unicode(sentences[1]),
            "Here is the winner because contains words like cool and heading")
Example #56
    def test_real_example(self):
        parser = PlaintextParser.from_string(
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením. "
            "Přerostly až v reparát z jazyka na konci školního roku. "
            "Nedopadl bohužel dobře a tak musel opakovat 6. třídu, což se chlapci ani trochu nelíbilo. "
            "Připadal si, že je mezi malými dětmi a realizoval se tím, že si ve třídě "
            "o rok mladších dětí budoval vedoucí pozici. "
            "Dost razantně. Fyzickou převahu měl, takže to nedalo až tak moc práce.",
            Tokenizer("czech"))
        summarizer = LuhnSummarizer(stem_word)
        summarizer.stop_words = get_stop_words("czech")

        returned = summarizer(parser.document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(
            to_unicode(returned[0]),
            "Jednalo se o případ chlapce v 6. třídě, který měl problémy s učením."
        )
        self.assertEqual(
            to_unicode(returned[1]),
            "Připadal si, že je mezi malými dětmi a realizoval se tím, "
            "že si ve třídě o rok mladších dětí budoval vedoucí pozici.")
Example #57
    def test_key_3(self):
        document = build_document(
            ("wa", "wa wa", "wa wa wa", "wa wa wa wa", "wa Wa Wa Wa wa",),
            ("x X x X",)
        )
        summarizer = EdmundsonSummarizer()
        summarizer.bonus_words = ("wa", "X",)

        sentences = summarizer.key_method(document, 3)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[2]), "wa Wa Wa Wa wa")

        sentences = summarizer.key_method(document, 3, weight=0)
        self.assertEqual(len(sentences), 3)
        self.assertEqual(to_unicode(sentences[0]), "wa wa wa wa")
        self.assertEqual(to_unicode(sentences[1]), "wa Wa Wa Wa wa")
        self.assertEqual(to_unicode(sentences[2]), "x X x X")
Example #58
    def test_three_sentences(self):
        document = build_document((
            "wa s s s wa s s s wa",
            "wb s wb s wb s s s s s s s s s wb",
            "wc s s wc s s wc",
        ))
        summarizer = LuhnSummarizer()
        summarizer.stop_words = ("s",)

        returned = summarizer(document, 1)
        self.assertEqual(len(returned), 1)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")

        returned = summarizer(document, 2)
        self.assertEqual(len(returned), 2)
        self.assertEqual(to_unicode(returned[0]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[1]), "wc s s wc s s wc")

        returned = summarizer(document, 3)
        self.assertEqual(len(returned), 3)
        self.assertEqual(to_unicode(returned[0]), "wa s s s wa s s s wa")
        self.assertEqual(to_unicode(returned[1]), "wb s wb s wb s s s s s s s s s wb")
        self.assertEqual(to_unicode(returned[2]), "wc s s wc s s wc")
Example #59
    def test_to_unicode(self):
        returned = compat.to_unicode(self.o)
        self.assertStringsEqual(UNICODE_STRING, returned)