def setUp(self) -> None:
    """Build the fixtures shared by the tests.

    Creates the ``OWScoreDocuments`` widget, loads the "book-excerpts"
    corpus and runs it through the lowercase, strip-accents and Snowball
    stemmer preprocessors (in that order), then builds the words table.
    """
    self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)

    # Corpus fixture: each preprocessor consumes the output of the previous one.
    self.corpus = Corpus.from_file("book-excerpts")
    pipeline = (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    )
    for step in pipeline:
        self.corpus = step(self.corpus)

    # Words fixture; note that the last entry is capitalised.
    self.words = self.create_words_table(
        ["house", "doctor", "boy", "way", "Rum"]
    )
def test_preprocess_words(self):
    """Check the words returned by ``_preprocess_words`` for two corpus setups.

    First the corpus is preprocessed with lowercase, strip-accents, URL
    removal and HTML transformers; the expected words are lowercased and
    accent-free, the URL-only entry is absent and the HTML markup is gone.
    Then a Snowball stemmer is applied on top and the expected words are
    stems.
    """
    corpus = Corpus.from_file("book-excerpts")
    cleaning_steps = (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    )
    for step in cleaning_steps:
        corpus = step(corpus)

    raw_words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    cleaned = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(
        ["house", "doctor", "boy", "way", "rum", "abracadabra"],
        cleaned,
    )

    # Add stemming on top of the already preprocessed corpus.
    corpus = preprocess.SnowballStemmer()(corpus)
    raw_words = ["House", "dóctor", "boys", "way", "Rum"]
    stemmed = _preprocess_words(corpus, raw_words, dummy_callback)
    self.assertListEqual(
        ["hous", "doctor", "boy", "way", "rum"],
        stemmed,
    )
def test_strip_accents(self):
    """``StripAccentsTransformer._preprocess`` keeps plain text and maps À to A."""
    strip = preprocess.StripAccentsTransformer()._preprocess
    cases = (
        ("Abra", "Abra"),        # no accents: returned unchanged
        ("\u00C0bra", "Abra"),   # U+00C0 (A with grave) loses its accent
    )
    for raw, expected in cases:
        self.assertEqual(strip(raw), expected)
self.view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    # Manual preview: open OWScoreDocuments with a preprocessed corpus and a
    # small table of words.
    from orangewidget.utils.widgetpreview import WidgetPreview

    from orangecontrib.text import preprocess

    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        corpus = step(corpus)

    # Words table: a single string meta column tagged with type "words"
    # and an attribute matrix with zero columns.
    words_var = StringVariable("Words")
    words_var.attributes["type"] = "words"
    word_list = ["house", "doctor", "boy", "way", "Rum"]
    words_table = Table(
        Domain([], metas=[words_var]),
        np.empty((len(word_list), 0)),
        metas=np.array(word_list).reshape((-1, 1)),
    )

    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words_table)