def test_call(self):
    """A freshly built PorterStemmer must agree with the fixture stemmer
    on both a single word and a list of tokens."""
    fresh = preprocess.PorterStemmer()
    sample = "Testing"
    sample_tokens = ["Testing", "tokenized", "Sentence"]
    # Expected values come from the stemmer configured in setUp.
    expected = [self.stemmer(token) for token in sample_tokens]
    self.assertEqual(self.stemmer(sample), fresh(sample))
    self.assertEqual(fresh(sample_tokens), expected)
def pre_process(path):
    """Load a corpus from *path* and run the full preprocessing pipeline.

    The pipeline lowercases, strips URLs and HTML, tokenizes on word
    characters, stems with Porter, and filters English stopwords plus
    punctuation/special characters.

    :param path: file path accepted by ``Corpus.from_file``
    :return: the preprocessed corpus
    """
    corpus = orangecontrib.text.Corpus.from_file(path)
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ],
        # Raw strings: '\w' etc. in a non-raw literal are invalid escape
        # sequences (DeprecationWarning; SyntaxWarning in Python 3.12+).
        # The regex semantics are unchanged.
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            # NOTE(review): the pattern lists the apostrophe twice
            # ("\'" appears two times) — harmless but redundant.
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            ),
        ])
    return p(corpus)
def test_call_porter(self):
    """Applying the Porter stemmer must produce tokens and register
    itself in the corpus' preprocessor chain."""
    porter = preprocess.PorterStemmer()
    # The fixture corpus starts out untokenized.
    self.assertFalse(self.corpus.has_tokens())
    processed = porter(self.corpus)
    self.assertTrue(processed.has_tokens())
    self.assertEqual(2, len(processed.used_preprocessor.preprocessors))
def test_str(self):
    """The stemmer's string form is its human-readable name."""
    self.assertEqual('Porter Stemmer', str(preprocess.PorterStemmer()))
def test_porter_with_bad_input(self):
    """Non-text input (an int) must be rejected with TypeError."""
    porter = preprocess.PorterStemmer()
    with self.assertRaises(TypeError):
        porter(10)
def test_str(self):
    """str() identifies the stemmer kind, and for Snowball also the
    configured language."""
    porter = preprocess.PorterStemmer()
    self.assertIn('porter', str(porter).lower())
    snowball = preprocess.SnowballStemmer('french')
    self.assertIn('french', str(snowball).lower())