Пример #1
0
 def test_call(self):
     """Compare ``PorterStemmer`` output with ``self.stemmer``.

     The comparison is made once for a single word and once for a list of
     tokens, where the expected list is built by applying ``self.stemmer``
     to each token in turn.
     """
     single_word = "Testing"
     token_list = ["Testing", "tokenized", "Sentence"]
     porter = preprocess.PorterStemmer()

     # Single string input.
     self.assertEqual(porter(single_word), self.stemmer(single_word))

     # List input: the result is compared with the per-token reference.
     actual_stems = porter(token_list)
     expected_stems = [self.stemmer(item) for item in token_list]
     self.assertEqual(actual_stems, expected_stems)
Пример #2
0
def pre_process(path):
    """Load the corpus stored at ``path`` and run it through a fixed pipeline.

    The preprocessor is configured with, in order of declaration:
    lowercase / URL-removal / HTML transformers, a regexp tokenizer for
    runs of word characters, the Porter stemmer as normalizer, and two
    filters (English stopwords and a regexp built from a punctuation
    pattern).

    Args:
        path: Location handed to ``orangecontrib.text.Corpus.from_file``.

    Returns:
        The result of calling the configured preprocessor on the corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)

    # Regex patterns are written as raw strings so that backslashes reach
    # the regex engine untouched and no invalid-escape warning is raised
    # when the module is compiled.
    word_pattern = r'\w+'

    # The fragments concatenate into a single alternation of punctuation
    # marks; the ASCII quote characters sit in their own fragments so that
    # neither of them needs escaping inside a raw string.
    punctuation_pattern = (
        r'\.|,|:|;|!|\?|\(|\)|\||\+'
        r"|'"
        r'|"|‘|’|“|”'
        r"|'"
        r'|\’|…|\-|–|—|\$|&|\*|>|<'
    )

    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer(),
        ],
        tokenizer=preprocess.RegexpTokenizer(word_pattern),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(punctuation_pattern),
        ])
    return p(corpus)
Пример #3
0
 def test_call_porter(self):
     """Apply ``PorterStemmer`` to ``self.corpus`` and inspect the result.

     The fixture corpus must start without tokens; the returned corpus
     must have tokens and report two entries in
     ``used_preprocessor.preprocessors``.
     """
     porter = preprocess.PorterStemmer()

     # Precondition: the fixture has not been tokenized yet.
     self.assertFalse(self.corpus.has_tokens())

     stemmed = porter(self.corpus)
     self.assertTrue(stemmed.has_tokens())

     # NOTE(review): presumably the stemmer plus one implicitly added
     # step account for the two entries - confirm against the library.
     applied = stemmed.used_preprocessor.preprocessors
     self.assertEqual(len(applied), 2)
Пример #4
0
 def test_str(self):
     """``str()`` of a ``PorterStemmer`` must equal ``'Porter Stemmer'``."""
     expected_label = 'Porter Stemmer'
     actual_label = str(preprocess.PorterStemmer())
     self.assertEqual(expected_label, actual_label)
Пример #5
0
 def test_porter_with_bad_input(self):
     """Calling a ``PorterStemmer`` with the integer 10 must raise ``TypeError``."""
     porter = preprocess.PorterStemmer()
     with self.assertRaises(TypeError):
         porter(10)
Пример #6
0
    def test_str(self):
        """Check the lowercased ``str()`` of two stemmers for a keyword.

        ``PorterStemmer`` must mention ``'porter'`` and a
        ``SnowballStemmer`` built for ``'french'`` must mention
        ``'french'``.
        """
        porter_label = str(preprocess.PorterStemmer()).lower()
        self.assertIn('porter', porter_label)

        snowball_label = str(preprocess.SnowballStemmer('french')).lower()
        self.assertIn('french', snowball_label)