Пример #1
0
 def setUp(self):
     """Load the 'deerwester' corpus and build the preprocessor list.

     Stores the corpus on ``self.corpus`` and one default-constructed
     instance of each listed class on ``self.pp_list``.
     """
     self.corpus = Corpus.from_file("deerwester")
     # Instances are created in the listed order, each with its default
     # constructor arguments.
     factories = (
         preprocess.LowercaseTransformer,
         preprocess.WordPunctTokenizer,
         preprocess.SnowballStemmer,
         preprocess.NGrams,
         tag.AveragedPerceptronTagger,
     )
     self.pp_list = [factory() for factory in factories]
Пример #2
0
 def test_ngrams(self):
     # Tokenize on r'\w+', expand to n-grams with ngrams_range=(1, 3), run the
     # result through BowVectorizer, and check that a single token, the first
     # two tokens and the first three tokens (space-joined) of the first
     # entry in corpus.tokens all appear among the resulting attribute names.
     vect = BowVectorizer()
     corpus = Corpus.from_file('deerwester')
     # Raw string: '\w' in a plain literal is an invalid escape sequence and
     # triggers a warning on modern Python versions.
     corpus = preprocess.RegexpTokenizer(r'\w+')(corpus)
     corpus = preprocess.NGrams(ngrams_range=(1, 3))(corpus)
     result = vect.transform(corpus)
     attrs = [attr.name for attr in result.domain.attributes]

     first_tokens = corpus.tokens[0]
     self.assertIn(first_tokens[1], attrs)
     self.assertIn(' '.join(first_tokens[:2]), attrs)
     self.assertIn(' '.join(first_tokens[:3]), attrs)
Пример #3
0
 def setUp(self):
     """Create the NGrams preprocessor and load the 'deerwester' corpus.

     ``self.pp`` is built with the positional argument ``(2, 3)``;
     ``self.corpus`` comes from ``Corpus.from_file``.
     """
     ngram_bounds = (2, 3)
     self.pp = preprocess.NGrams(ngram_bounds)
     corpus_name = 'deerwester'
     self.corpus = Corpus.from_file(corpus_name)