def test_slicing(self):
    wl = tb.WordList(self.words)
    first = wl[0]
    assert_true(isinstance(first, tb.Word))
    assert_equal(first, 'Schön')
    dogs = wl[0:2]
    assert_true(isinstance(dogs, tb.WordList))
    assert_equal(dogs, tb.WordList(['Schön', 'ist']))

def test_empty_sentence(self):
    assert_equal(self.empty_sentence.tags, [])
    assert_equal(self.empty_sentence.tokens, tb.WordList([]))
    assert_equal(self.empty_sentence.words, tb.WordList([]))
    assert_equal(self.empty_sentence.noun_phrases, tb.WordList([]))
    assert_equal(self.empty_sentence.np_counts, {})
    assert_equal(self.empty_sentence.word_counts, {})
    assert_equal(self.empty_sentence.ngrams(), [])
    assert_equal(self.empty_sentence.parse(), "")

def test_singularize(self):
    wl = tb.WordList([
        'Hunde', 'Katzen', 'Büffel',
        # 'Menschen', 'Mäuse' not processed correctly
    ])
    assert_equal(
        wl.singularize(),
        tb.WordList([
            'Hund', 'Katze', 'Büffel',
            # 'Mensch', 'Maus' processed as 'Menschen', 'Mäus'
        ]))

def test_slice_repr(self):
    wl = tb.WordList(['Schön', 'ist', 'besser'])
    if PY2:
        assert_equal(unicode(repr(wl[:2])),
                     u"WordList([u'Sch\\xf6n', u'ist'])")
    else:
        assert_equal(repr(wl[:2]), "WordList(['Schön', 'ist'])")

def test_overrides(self):
    b = tb.BlobberDE(tokenizer=SentenceTokenizer())
    blob = b("Was nun? Dumme Kuh?")
    assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
    assert_equal(blob.tokens, tb.WordList(["Was nun?", "Dumme Kuh?"]))
    blob2 = b("Ein anderer Blob")
    # Both blobs share the same tokenizer...
    assert_true(blob.tokenizer is blob2.tokenizer)
    # ...but they are not the same object.
    assert_not_equal(blob, blob2)

def test_repr(self):
    wl = tb.WordList(['Schön', 'ist', 'besser'])
    # This compat clause is necessary because
    # ``from __future__ import unicode_literals`` turns the whole second
    # argument into one single unicode string. Without it you get an
    # AssertionError on PY2:
    # "WordList([u'Sch\\xf6n', u'ist', u'besser'])" != \
    #     u"WordList(['Sch\xf6n', 'ist', 'besser'])"
    if PY2:
        assert_equal(unicode(repr(wl)),
                     u"WordList([u'Sch\\xf6n', u'ist', u'besser'])")
    else:
        assert_equal(repr(wl), "WordList(['Schön', 'ist', 'besser'])")

def test_len(self):
    wl = tb.WordList(['Schön', 'ist', 'besser'])
    assert_equal(len(wl), 3)

def test_extend(self):
    wl = tb.WordList(["Hunde", "Katzen"])
    wl.extend(["Büffel", 4])
    assert_true(isinstance(wl[2], tb.Word))
    assert_true(isinstance(wl[3], int))

def test_append(self):
    wl = tb.WordList(['Hund'])
    wl.append("Katze")
    assert_true(isinstance(wl[1], tb.Word))
    wl.append(('ein', 'Tupel'))
    assert_true(isinstance(wl[2], tuple))

def test_convert_to_list(self):
    wl = tb.WordList(self.words)
    assert_equal(list(wl), self.words)

def test_count(self):
    wl = tb.WordList(['monty', 'python', 'Python', 'Monty'])
    assert_equal(wl.count('monty'), 2)
    assert_equal(wl.count('monty', case_sensitive=True), 1)
    assert_equal(wl.count('mon'), 0)

def test_lower(self):
    wl = tb.WordList(['Philosophie', 'voN', 'PYTHON'])
    assert_equal(wl.lower(), tb.WordList(['philosophie', 'von', 'python']))

def test_upper(self):
    wl = tb.WordList(self.words)
    assert_equal(wl.upper(), tb.WordList([w.upper() for w in self.words]))

def test_lemmatize(self):
    wl = tb.WordList(["Katze", "Hunde", "Ochsen"])
    assert_equal(wl.lemmatize(), tb.WordList(['Katze', 'Hund', 'Ochse']))

def test_pluralize(self):
    wl = tb.WordList(['Hund', 'Katze', 'Büffel'])
    assert_equal(wl.pluralize(), tb.WordList(['Hunde', 'Katzen', 'Büffel']))

def test_str(self):
    wl = tb.WordList(self.words)
    assert_equal(str(wl), str(self.words))