Exemplo n.º 1
0
 def test_ngrams(self):
     """Tokenizing with ngrams=2 yields each adjacent word pair in order."""
     expected = [("foo", "bar"), ("bar", "bomb"), ("bomb", "blar")]
     observed = list(pytextparser.word_tokenize(text="foo bar bomb blar", ngrams=2))
     assert observed == expected
Exemplo n.º 2
0
 def test_ignores_numeric(self):
     """Purely numeric tokens are dropped from the tokenizer output."""
     tokens = pytextparser.word_tokenize(text="one two 3 four")
     assert list(tokens) == [("one",), ("two",), ("four",)]
Exemplo n.º 3
0
 def test_min_length(self):
     """Words shorter than min_length are filtered out of the result."""
     observed = pytextparser.word_tokenize(text="one for the money two for the go", min_length=4)
     assert list(observed) == [("money",)]
Exemplo n.º 4
0
 def test_ignores_stopwords(self):
     """Tokens present in the stopwords set are excluded from the output."""
     stops = {"the", "of", "is"}
     tokens = pytextparser.word_tokenize(
         text="The first rule of python is", stopwords=stops, min_length=1
     )
     assert list(tokens) == [("first",), ("rule",), ("python",)]
Exemplo n.º 5
0
 def test_splits_punctuation(self):
     """Punctuation acts as a token boundary and is not emitted itself."""
     observed = list(pytextparser.word_tokenize(text="first. second"))
     assert observed == [("first",), ("second",)]
Exemplo n.º 6
0
 def test_sentence(self):
     """A plain sentence produces one single-word tuple per word, in order."""
     words = pytextparser.word_tokenize(text="hello cruel world")
     assert list(words) == [("hello",), ("cruel",), ("world",)]