def test_pickle_corpus(self):
    """Pickling a preprocessed corpus must succeed (gh-590).

    The save data widget needs a picklable Corpus. The check is only that
    ``pickle.dumps`` does not raise; the pickled bytes are discarded.
    """
    corpus = Corpus.from_file('book-excerpts')

    # A plain corpus is not enough: run it through a preprocessing chain
    # first so the pickled object carries the preprocessing results as well.
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.FrequencyFilter(),
        preprocess.StopwordsFilter(),
    ]
    for preprocessor in self.pp_list:
        corpus = preprocessor(corpus)

    pickle.dumps(corpus)
def test_preprocess_words(self):
    """Check ``_preprocess_words`` against corpora preprocessed in two stages.

    Stage one applies lowercasing, accent stripping, URL removal and HTML
    stripping to the corpus; stage two additionally applies the Snowball
    stemmer. After each stage the words returned by ``_preprocess_words``
    are compared with the expected list.
    """
    corpus = Corpus.from_file("book-excerpts")

    # Stage 1: text transformers only.
    for transform in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    ):
        corpus = transform(corpus)

    raw_words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    # Seven inputs, six expected outputs: the bare URL entry has no
    # counterpart in the expected list.
    expected = ["house", "doctor", "boy", "way", "rum", "abracadabra"]
    self.assertListEqual(
        expected, _preprocess_words(corpus, raw_words, dummy_callback)
    )

    # Stage 2: stemming on top of the already transformed corpus.
    corpus = preprocess.SnowballStemmer()(corpus)

    raw_words = ["House", "dóctor", "boys", "way", "Rum"]
    expected = ["hous", "doctor", "boy", "way", "rum"]
    self.assertListEqual(
        expected, _preprocess_words(corpus, raw_words, dummy_callback)
    )
def test_snowball(self):
    """The French Snowball preprocessor must stem a token exactly like NLTK.

    Compares ``SnowballStemmer('french')._preprocess`` with the result of
    ``nltk.SnowballStemmer(language='french').stem`` for a single word.
    """
    word = 'voudrais'
    wrapper = preprocess.SnowballStemmer('french')

    actual = wrapper._preprocess(word)
    reference = nltk.SnowballStemmer(language='french').stem(word)

    self.assertEqual(actual, reference)
def test_call_snowball(self):
    """Calling the Snowball stemmer on a corpus without tokens adds tokens.

    The fixture corpus is required to have no tokens beforehand. The
    returned corpus must have tokens and report exactly two entries in
    ``used_preprocessor.preprocessors``.
    """
    stemmer = preprocess.SnowballStemmer()
    self.assertFalse(self.corpus.has_tokens())

    stemmed = stemmer(self.corpus)
    self.assertTrue(stemmed.has_tokens())

    # NOTE(review): two preprocessors are expected although only one was
    # applied here; presumably a default tokenizer is added - confirm.
    applied_count = len(stemmed.used_preprocessor.preprocessors)
    self.assertEqual(applied_count, 2)
self.view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    # Manual preview of OWScoreDocuments: feed the widget a preprocessed
    # corpus and a small table of words.
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")

    # Apply each preprocessor in turn; every call returns the corpus that is
    # passed on to the next one.
    pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ]
    for p in pp_list:
        corpus = p(corpus)

    # String meta variable that holds the words.
    # NOTE(review): the "type" = "words" attribute is presumably how the
    # widget recognises the words column - confirm.
    w = StringVariable("Words")
    w.attributes["type"] = "words"
    words = ["house", "doctor", "boy", "way", "Rum"]
    # Table with no regular attributes (X has zero columns) and a single
    # meta column containing one word per row.
    words = Table(
        Domain([], metas=[w]),
        np.empty((len(words), 0)),
        metas=np.array(words).reshape((-1, 1)),
    )
    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)
def test_str(self):
    """The string form of a stemmer must identify it.

    ``str()`` of a Porter stemmer has to contain "porter", and ``str()`` of
    a French Snowball stemmer has to contain "french" (case-insensitively).
    """
    porter_text = str(preprocess.PorterStemmer()).lower()
    self.assertIn('porter', porter_text)

    snowball_text = str(preprocess.SnowballStemmer('french')).lower()
    self.assertIn('french', snowball_text)