Example No. 1
def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)
    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
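Note: build_document_from_string is a helper from sumy's test suite. Outside the tests, an equivalent document is usually built through the public parser API; a minimal sketch, assuming sumy and the NLTK punkt data for English are installed:

# Building an equivalent document via sumy's public API instead of the test helper.
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.random import RandomSummarizer

text = "First sentence. Second sentence. Third sentence."
parser = PlaintextParser.from_string(text, Tokenizer("english"))
summarizer = RandomSummarizer()

# Pick two sentences at random from the parsed document.
for sentence in summarizer(parser.document, 2):
    print(sentence)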
Example No. 2
def test_document():
    document = build_document(
        ("I am the sentence you like", "Do you like me too",),
        ("This sentence is better than that above", "Are you kidding me",)
    )
    summarizer = LsaSummarizer()
    summarizer.stop_words = (
        "I", "am", "the", "you", "are", "me", "is", "than", "that", "this",
    )

    sentences = summarizer(document, 2)
    assert len(sentences) == 2
    assert to_unicode(sentences[0]) == "I am the sentence you like"
    assert to_unicode(sentences[1]) == "This sentence is better than that above"
Example No. 3
def test_two_sentences():
    document = build_document(
        ("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = (
        "I",
        "am",
        "and",
        "that",
    )

    returned = summarizer(document, 10)
    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
Example No. 4
def to_sentences(self, paragraph):
    if hasattr(self._sentence_tokenizer, '_params'):
        extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(
            self._language, [])
        self._sentence_tokenizer._params.abbrev_types.update(
            extra_abbreviations)
    sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph))
    return tuple(map(unicode.strip, sentences))
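The to_sentences method above belongs to sumy's Tokenizer. A hedged usage sketch, assuming the NLTK punkt models for the chosen language are available:

from sumy.nlp.tokenizers import Tokenizer

tokenizer = Tokenizer("english")
# to_sentences() splits a paragraph into sentence strings;
# to_words() splits a sentence into word tokens.
print(tokenizer.to_sentences("First sentence. Second one follows."))
print(tokenizer.to_words("First sentence."))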
Example No. 5
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the",)

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
Example No. 6
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)
    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
Example No. 7
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(args)

    for sentence in summarizer(parser.document, items_count):
        if PY3:
            print(to_unicode(sentence))
        else:
            print(to_bytes(sentence))

    return 0
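The main entry point above is driven by docopt, so it can also be exercised programmatically by passing the argument list the shell would provide. A minimal sketch; the module path and option names follow sumy's documented CLI and should be treated as assumptions if your installed version differs:

# Hypothetical programmatic call of the CLI entry point: "lsa" is the
# summarization method, --length the number of sentences to keep, and
# --file a placeholder path to the input text.
from sumy.__main__ import main

exit_code = main(["lsa", "--length=2", "--file=article.txt"])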
Example No. 8
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)
    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
Example No. 9
    def __init__(self, words, tokenizer=None):
        if isinstance(words, string_types) and tokenizer is None:
            raise ValueError(
                "Tokenizer has to be given if ``words`` is not a sequence.")
        elif isinstance(words, string_types):
            words = tokenizer.to_words(to_unicode(words))
        elif not isinstance(words, Sequence):
            raise ValueError(
                "Parameter ``words`` has to be sequence or string with tokenizer given."
            )

        self._terms = Counter(map(unicode.lower, words))
        self._max_frequency = max(self._terms.values()) if self._terms else 1
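This constructor accepts either an already tokenized sequence of words or a raw string together with a tokenizer. In sumy this is the TfDocumentModel class; take that name and import path as assumptions. A sketch of the two branches shown above:

from sumy.models import TfDocumentModel   # assumed import path for the class above
from sumy.nlp.tokenizers import Tokenizer

# Branch 1: a pre-tokenized sequence of words needs no tokenizer.
model = TfDocumentModel(["Cat", "cat", "dog"])

# Branch 2: a raw string must be accompanied by a tokenizer that splits it into words.
model = TfDocumentModel("The cat sat on the mat", Tokenizer("english"))

# A raw string without a tokenizer raises ValueError (first branch in the constructor).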
Example No. 10
def null_stemmer(object):
    "Converts given object to unicode with lower letters."
    return to_unicode(object).lower()
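null_stemmer is the no-op stemmer: it only lowercases, and it is the default stemmer in sumy's base summarizer, so a summarizer built without an explicit stemmer behaves as below. A hedged sketch; the lambda stands in for any custom stemmer callable:

from sumy.summarizers.lsa import LsaSummarizer

# Summarizers take a stemmer callable in the constructor; with no argument the
# default (null_stemmer) is used, which just lowercases unicode input.
summarizer_default = LsaSummarizer()
summarizer_custom = LsaSummarizer(lambda word: word.lower())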
Example No. 11
def test_repr_object_to_unicode():
    value = UNICODE_STRING if py3k.PY3 else BYTES_STRING
    instance = _build_test_instance("__repr__", value)

    returned = py3k.to_unicode(instance)
    _assert_strings_equal(UNICODE_STRING, returned)
Example No. 12
def test_unicode_object_to_unicode():
    method = "__str__" if py3k.PY3 else "__unicode__"
    instance = _build_test_instance(method, UNICODE_STRING)

    returned = py3k.to_unicode(instance)
    _assert_strings_equal(UNICODE_STRING, returned)
Example No. 13
def normalize_word(word):
    return to_unicode(word).lower()
Example No. 14
def __init__(self, text, tokenizer, is_heading=False):
    self._text = to_unicode(text).strip()
    self._tokenizer = tokenizer
    self._is_heading = bool(is_heading)
Example No. 15
def __init__(self, text, tokenizer):
    super(PlaintextParser, self).__init__(tokenizer)
    self._text = to_unicode(text).strip()
Example No. 16
def load_resource(path):
    path = expand_resource_path(path)
    with open(path, "rb") as file:
        return to_unicode(file.read())
Example No. 17
def to_words(self, sentence):
    words = self._word_tokenizer.tokenize(to_unicode(sentence))
    # return tuple(filter(self._is_word, words))
    return tuple(words)
Example No. 18
def test_unicode_to_unicode():
    returned = py3k.to_unicode(UNICODE_STRING)
    _assert_strings_equal(UNICODE_STRING, returned)
Example No. 19
def parse_stop_words(data):
    return frozenset(w.rstrip() for w in to_unicode(data).splitlines() if w)
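parse_stop_words turns raw file content into a frozenset of stop words, one per line, skipping empty lines; together with load_resource from Example No. 16 this is how bundled stop-word files are read. A small usage sketch based directly on the function above:

# One word per line; blank lines are ignored and trailing whitespace is stripped.
data = "the\nis\nam\n\nyou\n"
assert parse_stop_words(data) == frozenset(["the", "is", "am", "you"])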
Example No. 20
def test_to_unicode():
    returned = compat.to_unicode(O())
    _assert_strings_equal(UNICODE_STRING, returned)