def setUp(self):
    """Load the sample corpus and assemble the full preprocessor chain."""
    self.corpus = Corpus.from_file("deerwester")
    # Transform -> tokenize -> normalize -> n-grams -> POS-tag.
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.NGrams(),
        tag.AveragedPerceptronTagger(),
    ]
def test_string_processor(self):
    """Lowercasing the corpus must lowercase every token of every document."""
    transformer = preprocess.LowercaseTransformer()
    original_tokens = self.corpus.tokens.copy()
    processed_tokens = transformer(self.corpus).tokens
    expected = np.array(
        [[token.lower() for token in document] for document in original_tokens],
        dtype="object",
    )
    np.testing.assert_equal(processed_tokens, expected)
def create_corpus(texts: List[str]) -> Corpus:
    """Create a sample corpus from *texts*, lower-cased, with a single "Text" meta."""
    text_var = StringVariable("Text")
    metas = np.array(texts).reshape(-1, 1)
    corpus = Corpus(
        Domain([], metas=[text_var]),
        metas=metas,
        text_features=[text_var],
    )
    return preprocess.LowercaseTransformer()(corpus)
def test_string_processor(self):
    """A lowercase transformer wrapped in Preprocessor lowercases all tokens."""
    lowercased = Preprocessor(
        transformers=preprocess.LowercaseTransformer()
    )(self.corpus).tokens
    # An empty transformer list leaves the tokens untouched.
    untouched = Preprocessor(transformers=[])(self.corpus).tokens
    expected = [[token.lower() for token in doc] for doc in untouched]
    np.testing.assert_equal(lowercased, expected)
    # A non-transformer argument must be rejected with TypeError.
    self.assertRaises(TypeError, Preprocessor, string_transformers=1)
def set_corpus(self, data=None):
    """Store the input corpus and cache a preprocessed copy.

    The preprocessed corpus is built once when the data arrives (instead of
    at each method run) to avoid repeating the preprocessing work.

    Parameters
    ----------
    data : Corpus, optional
        The input corpus; ``None`` means the input was removed.
    """
    self.corpus = data
    self.pp_corpus = None
    # Guard: preprocessing None would crash when the input is disconnected.
    if self.corpus is not None:
        pp_list = [
            preprocess.LowercaseTransformer(),
            preprocess.WordPunctTokenizer()
        ]
        self.pp_corpus = PreprocessorList(pp_list)(self.corpus)
    self.commit()
def set_corpus(self, data=None):
    """Remember the input corpus and cache a tokenized version of it.

    ``None`` input clears the cache; a corpus that is already tokenized is
    reused as-is, otherwise it is preprocessed once here so that later
    method runs do not repeat the work.
    """
    self.corpus = data
    self.pp_corpus = None
    if self.corpus is not None:
        if self.corpus.has_tokens():
            # Tokenized upstream -- no extra preprocessing needed.
            self.pp_corpus = self.corpus
        else:
            # Preprocess upon setting data to avoid doing it per method run.
            default_pp = [
                preprocess.LowercaseTransformer(),
                preprocess.WordPunctTokenizer(),
            ]
            self.pp_corpus = PreprocessorList(default_pp)(self.corpus)
    self.commit.now()
def test_filter_pos_tags(self):
    """POS tags must stay aligned with tokens after stop-word filtering."""
    steps = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        tag.AveragedPerceptronTagger(),
        preprocess.StopwordsFilter(),
    ]
    corpus = self.corpus
    with corpus.unlocked():
        corpus.metas[0, 0] = "This is the most beautiful day in the world"
    for step in steps:
        corpus = step(corpus)
    # Filtering must drop tags together with their tokens.
    self.assertEqual(len(corpus.tokens), len(corpus.pos_tags))
    self.assertEqual(len(corpus.tokens[0]), len(corpus.pos_tags[0]))
    self.assertEqual(corpus.tokens[0], ["beautiful", "day", "world"])
    self.assertEqual(corpus.pos_tags[0], ["JJ", "NN", "NN"])
def setUp(self) -> None:
    """Create the widget, a preprocessed corpus, and the words table."""
    self.widget: OWScoreDocuments = self.create_widget(OWScoreDocuments)
    # Preprocess the sample corpus once for all tests.
    corpus = Corpus.from_file("book-excerpts")
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ):
        corpus = step(corpus)
    self.corpus = corpus
    # Reference words used as the scoring input.
    self.words = self.create_words_table(["house", "doctor", "boy", "way", "Rum"])
def test_preprocessed(self):
    """pp_corpus tracks the input: set while connected, cleared on None."""
    widget = self.create_widget(OWSentimentAnalysis)
    corpus = self.corpus.copy()
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
    ):
        corpus = step(corpus)
    self.send_signal(widget.Inputs.corpus, corpus)
    self.assertTrue(widget.pp_corpus)
    # Switching to the Liu Hu method must keep the preprocessed corpus.
    widget.liu_hu.click()
    simulate.combobox_activate_item(widget.liu_lang, "English")
    self.assertTrue(widget.pp_corpus)
    # Removing the input clears the cached preprocessed corpus.
    self.send_signal(widget.Inputs.corpus, None)
    self.assertIsNone(widget.pp_corpus)
def test_pickle_corpus(self):
    """Corpus must be picklable (for save data widget) gh-590"""
    corpus = Corpus.from_file('book-excerpts')
    # It must also work after a full preprocessing chain.
    self.pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.WordPunctTokenizer(),
        preprocess.SnowballStemmer(),
        preprocess.FrequencyFilter(),
        preprocess.StopwordsFilter(),
    ]
    for step in self.pp_list:
        corpus = step(corpus)
    pickle.dumps(corpus)
def pre_process(path):
    """Load a corpus from *path* and run the standard cleaning pipeline.

    Lower-cases the text, strips URLs and HTML, tokenizes on word
    characters, stems with the Porter stemmer, and removes English stop
    words plus punctuation-only tokens.

    Parameters
    ----------
    path : str
        Path of the corpus file to load.

    Returns
    -------
    Corpus
        The preprocessed corpus.
    """
    corpus = orangecontrib.text.Corpus.from_file(path)
    # Raw strings keep the regex backslashes literal; plain '\.', '\?', '\’'
    # are invalid escape sequences in ordinary string literals (deprecated,
    # future SyntaxWarning). Match behavior is unchanged.
    p = preprocess.Preprocessor(
        transformers=[
            preprocess.LowercaseTransformer(),
            preprocess.UrlRemover(),
            preprocess.HtmlTransformer()
        ],
        tokenizer=preprocess.RegexpTokenizer(r'\w+'),
        normalizer=preprocess.PorterStemmer(),
        filters=[
            preprocess.StopwordsFilter('english'),
            preprocess.RegexpFilter(
                r'\.|,|:|;|!|\?|\(|\)|\||\+|\'|\"|‘|’|“|”|\'|\’|…|\-|–|—|\$|&|\*|>|<'
            )
        ])
    return p(corpus)
def test_string_processor(self):
    """Custom and chained string transformers apply to whole documents."""

    class StripStringTransformer(preprocess.BaseTransformer):
        # Drops the final character of the document string.
        @classmethod
        def transform(cls, string):
            return string[:-1]

    stripper = Preprocessor(transformers=StripStringTransformer())
    expected = np.array([[doc[:-1]] for doc in self.corpus.documents])
    np.testing.assert_equal(stripper(self.corpus).tokens, expected)

    # Transformers chain: strip first, then lowercase.
    chained = Preprocessor(transformers=[
        StripStringTransformer(),
        preprocess.LowercaseTransformer(),
    ])
    expected = np.array([[doc[:-1].lower()] for doc in self.corpus.documents])
    np.testing.assert_equal(chained(self.corpus).tokens, expected)

    # A non-transformer argument must be rejected with TypeError.
    self.assertRaises(TypeError, Preprocessor, string_transformers=1)
def test_preprocess_words(self):
    """_preprocess_words must mirror the corpus's own preprocessing chain."""
    corpus = Corpus.from_file("book-excerpts")
    words = [
        "House",
        "dóctor",
        "boy",
        "way",
        "Rum https://google.com",
        "https://google.com",
        "<p>abra<b>cadabra</b><p>",
    ]
    # Lowercase, strip accents, drop URLs and HTML markup.
    for step in (
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.UrlRemover(),
        preprocess.HtmlTransformer(),
    ):
        corpus = step(corpus)
    self.assertListEqual(
        ["house", "doctor", "boy", "way", "rum", "abracadabra"],
        _preprocess_words(corpus, words, dummy_callback),
    )
    # Once the corpus is stemmed, the words must be stemmed as well.
    words = ["House", "dóctor", "boys", "way", "Rum"]
    for step in (preprocess.SnowballStemmer(),):
        corpus = step(corpus)
    self.assertListEqual(
        ["hous", "doctor", "boy", "way", "rum"],
        _preprocess_words(corpus, words, dummy_callback),
    )
def test_lowercase(self):
    """_preprocess lower-cases ASCII and accented characters alike."""
    lowercase = preprocess.LowercaseTransformer()
    self.assertEqual(lowercase._preprocess('Abra'), 'abra')
    # 'À' (U+00C0) maps to 'à' (U+00E0).
    self.assertEqual(lowercase._preprocess('\u00C0bra'), '\u00E0bra')
raise NotImplementedError
# NOTE(review): the line below is unreachable after the bare raise, and the
# enclosing method's header is outside this chunk -- intent can't be
# confirmed from here; verify against the full file.
self.view.selectionModel().select(selection, QItemSelectionModel.ClearAndSelect)


if __name__ == "__main__":
    from orangewidget.utils.widgetpreview import WidgetPreview
    from orangecontrib.text import preprocess

    # Build a preprocessed sample corpus for the widget preview.
    corpus = Corpus.from_file("book-excerpts")
    # corpus.set_title_variable("Text")
    pp_list = [
        preprocess.LowercaseTransformer(),
        preprocess.StripAccentsTransformer(),
        preprocess.SnowballStemmer(),
    ]
    for p in pp_list:
        corpus = p(corpus)

    # Words input: a table with a single "Words" string meta, marked with
    # the attribute type "words" so the widget recognizes it.
    w = StringVariable("Words")
    w.attributes["type"] = "words"
    words = ["house", "doctor", "boy", "way", "Rum"]
    words = Table(
        Domain([], metas=[w]),
        np.empty((len(words), 0)),
        metas=np.array(words).reshape((-1, 1)),
    )
    WidgetPreview(OWScoreDocuments).run(set_data=corpus, set_words=words)