def test_sentences_in_right_order():
    document = build_document_from_string("""
        # Heading one
        First sentence.
        Second sentence.
        Third sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 4)

    assert len(sentences) == 3
    assert to_unicode(sentences[0]) == "First sentence."
    assert to_unicode(sentences[1]) == "Second sentence."
    assert to_unicode(sentences[2]) == "Third sentence."
def test_document():
    document = build_document(
        ("I am the sentence you like", "Do you like me too",),
        ("This sentence is better than that above", "Are you kidding me",),
    )
    summarizer = LsaSummarizer()
    summarizer.stop_words = (
        "I", "am", "the", "you", "are", "me", "is", "than", "that", "this",
    )

    sentences = summarizer(document, 2)

    assert len(sentences) == 2
    assert to_unicode(sentences[0]) == "I am the sentence you like"
    assert to_unicode(sentences[1]) == "This sentence is better than that above"
def test_two_sentences():
    document = build_document(
        ("I am that 1. sentence", "And I am 2. winning prize"))
    summarizer = TextRankSummarizer()
    summarizer.stop_words = ("I", "am", "and", "that",)

    returned = summarizer(document, 10)

    assert len(returned) == 2
    assert to_unicode(returned[0]) == "I am that 1. sentence"
    assert to_unicode(returned[1]) == "And I am 2. winning prize"
def to_sentences(self, paragraph):
    # If the underlying tokenizer is NLTK's punkt (it exposes ``_params``),
    # register language-specific abbreviations so sentences are not split after them.
    if hasattr(self._sentence_tokenizer, '_params'):
        extra_abbreviations = self.LANGUAGE_EXTRA_ABREVS.get(self._language, [])
        self._sentence_tokenizer._params.abbrev_types.update(extra_abbreviations)

    sentences = self._sentence_tokenizer.tokenize(to_unicode(paragraph))
    return tuple(map(unicode.strip, sentences))
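# Hedged usage sketch (not part of the original source): assumes the method
# above belongs to sumy's Tokenizer class, constructed with a language name,
# and that NLTK's punkt data is installed. Names below are illustrative only.
def _example_to_sentences_usage():
    from sumy.nlp.tokenizers import Tokenizer

    tokenizer = Tokenizer("english")
    sentences = tokenizer.to_sentences("First sentence. Second sentence.")
    assert len(sentences) == 2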
def test_single_sentence():
    document = build_document(("I am the sentence you like",))
    summarizer = LsaSummarizer()
    summarizer.stop_words = ("I", "am", "the",)

    sentences = summarizer(document, 10)

    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "I am the sentence you like"
def test_less_sentences_than_requested():
    document = build_document_from_string("""
        This is only one sentence.
    """)
    summarizer = RandomSummarizer()

    sentences = summarizer(document, 10)

    assert len(sentences) == 1
    assert to_unicode(sentences[0]) == "This is only one sentence."
def main(args=None):
    args = docopt(to_string(__doc__), args, version=__version__)
    summarizer, parser, items_count = handle_arguments(args)

    for sentence in summarizer(parser.document, items_count):
        if PY3:
            print(to_unicode(sentence))
        else:
            print(to_bytes(sentence))

    return 0
def test_three_sentences_but_second_winner():
    document = build_document([
        "I am that 1. sentence",
        "And I am 2. sentence - winning sentence",
        "And I am 3. sentence - winner is my 2nd name",
    ])
    summarizer = ReductionSummarizer()
    summarizer.stop_words = ["I", "am", "and", "that"]

    returned = summarizer(document, 1)

    assert len(returned) == 1
    assert to_unicode(returned[0]) == "And I am 2. sentence - winning sentence"
def __init__(self, words, tokenizer=None):
    if isinstance(words, string_types) and tokenizer is None:
        raise ValueError(
            "Tokenizer has to be given if ``words`` is not a sequence.")
    elif isinstance(words, string_types):
        words = tokenizer.to_words(to_unicode(words))
    elif not isinstance(words, Sequence):
        raise ValueError(
            "Parameter ``words`` has to be sequence or string with tokenizer given.")

    # Term frequencies are counted case-insensitively.
    self._terms = Counter(map(unicode.lower, words))
    # Guard against an empty document when looking up the most frequent term.
    self._max_frequency = max(self._terms.values()) if self._terms else 1
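# Hedged usage sketch (not part of the original source): the constructor above
# looks like sumy's TfDocumentModel (sumy.models.tf), but that class name and
# the import paths are assumptions here. It shows the two accepted inputs:
# a raw string together with a tokenizer, or an already tokenized word sequence.
def _example_document_model_usage():
    from sumy.models import TfDocumentModel
    from sumy.nlp.tokenizers import Tokenizer

    # from a string plus a tokenizer
    model = TfDocumentModel("Cats like cats", Tokenizer("english"))
    # from a plain sequence of words (no tokenizer needed)
    model_from_words = TfDocumentModel(["Cats", "like", "cats"])
    assert model_from_words.term_frequency("cats") == 2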
def null_stemmer(object):
    "Converts given object to unicode with lowercase letters."
    return to_unicode(object).lower()
def test_repr_object_to_unicode():
    value = UNICODE_STRING if py3k.PY3 else BYTES_STRING
    instance = _build_test_instance("__repr__", value)

    returned = py3k.to_unicode(instance)

    _assert_strings_equal(UNICODE_STRING, returned)
def test_unicode_object_to_unicode():
    method = "__str__" if py3k.PY3 else "__unicode__"
    instance = _build_test_instance(method, UNICODE_STRING)

    returned = py3k.to_unicode(instance)

    _assert_strings_equal(UNICODE_STRING, returned)
def normalize_word(word):
    return to_unicode(word).lower()
def __init__(self, text, tokenizer, is_heading=False):
    self._text = to_unicode(text).strip()
    self._tokenizer = tokenizer
    self._is_heading = bool(is_heading)
def __init__(self, text, tokenizer):
    super(PlaintextParser, self).__init__(tokenizer)
    self._text = to_unicode(text).strip()
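# Hedged usage sketch (not part of the original source): constructs the
# PlaintextParser above directly from a string and reads the parsed document.
# The import paths and the two-sentence expectation are assumptions; NLTK's
# punkt data must be available for the tokenizer.
def _example_plaintext_parser_usage():
    from sumy.nlp.tokenizers import Tokenizer
    from sumy.parsers.plaintext import PlaintextParser

    parser = PlaintextParser("One sentence. Another sentence.", Tokenizer("english"))
    assert len(parser.document.sentences) == 2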
def load_resource(path):
    path = expand_resource_path(path)
    with open(path, "rb") as file:
        return to_unicode(file.read())
def to_words(self, sentence):
    words = self._word_tokenizer.tokenize(to_unicode(sentence))
    # return tuple(filter(self._is_word, words))
    return tuple(words)
def test_unicode_to_unicode():
    returned = py3k.to_unicode(UNICODE_STRING)

    _assert_strings_equal(UNICODE_STRING, returned)
def parse_stop_words(data):
    return frozenset(w.rstrip() for w in to_unicode(data).splitlines() if w)
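# Hedged usage sketch (not part of the original source): feeds parse_stop_words
# a newline-separated word list, the format typically read from a stop-words
# file. The sample words are illustrative only.
def _example_parse_stop_words_usage():
    stop_words = parse_stop_words("a\nthe\n\nan\n")
    assert stop_words == frozenset(["a", "the", "an"])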
def test_to_unicode():
    returned = compat.to_unicode(O())

    _assert_strings_equal(UNICODE_STRING, returned)