def test_tokenizer_choice(self):
    test_sentence = 'The quick #brown fox jumps over the lazy dog :-)'
    default_tokens = ['the', 'quick', '#', 'brown', 'fox', 'jumps',
                      'over', 'the', 'lazy', 'dog', ':', '-', ')']
    no_punct_tokens = ['the', 'quick', 'brown', 'fox', 'jumps', 'over',
                       'the', 'lazy', 'dog']
    tweet_tokens = ['the', 'quick', '#brown', 'fox', 'jumps', 'over',
                    'the', 'lazy', 'dog', ':-)']

    p = Preprocessor(tokenizer='default')
    self.assertEqual(type(p.tokenizer), type(word_tokenize))
    tokens = p(test_sentence)
    self.assertEqual(tokens[0], default_tokens)

    p = Preprocessor(tokenizer='no_punct')
    self.assertEqual(type(p.tokenizer),
                     type(RegexpTokenizer(r'\w+').tokenize))
    tokens = p(test_sentence)
    self.assertEqual(tokens[0], no_punct_tokens)

    p = Preprocessor(tokenizer='twitter')
    self.assertEqual(type(p.tokenizer), type(TweetTokenizer().tokenize))
    tokens = p(test_sentence)
    self.assertEqual(tokens[0], tweet_tokens)

    with self.assertRaises(ValueError):
        Preprocessor(tokenizer='unsupported_value')
def test_string_processor(self):
    p = Preprocessor(transformers=preprocess.LowercaseTransformer())
    tokens = p(self.corpus).tokens
    p2 = Preprocessor(transformers=[])
    tokens2 = p2(self.corpus).tokens
    np.testing.assert_equal(
        tokens, [[t.lower() for t in doc] for doc in tokens2])
    self.assertRaises(TypeError, Preprocessor, string_transformers=1)
def test_token_normalizer(self):
    class CapTokenNormalizer(preprocess.BaseNormalizer):
        @classmethod
        def normalize(cls, token):
            return token.capitalize()

    p = Preprocessor(normalizer=CapTokenNormalizer())
    tokens = p(self.corpus).tokens
    p2 = Preprocessor(normalizer=None)
    tokens2 = p2(self.corpus).tokens
    np.testing.assert_equal(
        tokens, [[t.capitalize() for t in doc] for doc in tokens2])
def test_preprocess_sentence_stopwords(self):
    # No stop words.
    p = Preprocessor()
    result = p(self.TEST_STRING)
    correct = [['human', 'machine', 'interface', 'for', 'lab', 'abc',
                'computer', 'applications']]
    self.assertEqual(result, correct)

    # English stop words.
    p = Preprocessor(stop_words='english')
    result = p(self.TEST_STRING)
    correct = [['human', 'machine', 'interface', 'lab', 'abc',
                'computer', 'applications']]
    self.assertEqual(result, correct)

    # Custom stop words.
    custom_stop_words = ['abc', 'applications', 'computer', 'for']
    p = Preprocessor(stop_words=custom_stop_words)
    result = p(self.TEST_STRING)
    correct = [['human', 'machine', 'interface', 'lab']]
    self.assertEqual(result, correct)
def test_inplace(self):
    # Raw strings avoid invalid-escape warnings for the regex patterns.
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w'))
    corpus = p(self.corpus, inplace=True)
    self.assertIs(corpus, self.corpus)

    corpus = p(self.corpus, inplace=False)
    self.assertIsNot(corpus, self.corpus)
    self.assertEqual(corpus, self.corpus)

    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'))
    corpus = p(self.corpus, inplace=False)
    self.assertIsNot(corpus, self.corpus)
    self.assertNotEqual(corpus, self.corpus)
def test_tokenize(self):
    correct = [
        ['human', 'machine', 'interface', 'for', 'lab', 'abc',
         'computer', 'applications'],
        ['a', 'survey', 'of', 'user', 'opinion', 'of', 'computer',
         'system', 'response', 'time'],
        ['the', 'eps', 'user', 'interface', 'management', 'system'],
        ['system', 'and', 'human', 'system', 'engineering', 'testing',
         'of', 'eps'],
        ['relation', 'of', 'user', 'perceived', 'response', 'time',
         'to', 'error', 'measurement'],
        ['the', 'generation', 'of', 'random', 'binary', 'unordered',
         'trees'],
        ['the', 'intersection', 'graph', 'of', 'paths', 'in', 'trees'],
        ['graph', 'minors', 'iv', 'widths', 'of', 'trees', 'and',
         'well', 'quasi', 'ordering'],
        ['graph', 'minors', 'a', 'survey'],
    ]

    # String.
    p = Preprocessor(lowercase=False)
    self.assertEqual(
        p(self.TEST_STRING),
        [['Human', 'machine', 'interface', 'for', 'lab', 'abc',
          'computer', 'applications']])

    # List.
    p = Preprocessor()
    self.assertEqual(p(self.TEST_LIST), correct)

    # Corpus.
    p = Preprocessor()
    self.assertEqual(p(self.TEST_CORPUS).tokens, correct)
def apply(self):
    # TODO: change this to custom stop words.
    # `sw` must be defined on both branches, otherwise disabling stop
    # word removal raised a NameError below.
    sw = 'english' if self.remove_stpwrds else None
    pp = Preprocessor(incl_punct=self.include_punctuation,
                      trans=self.transformation,
                      lowercase=self.lowercase,
                      stop_words=sw)
    self.send(Output.PREPROCESSOR, pp)
def _compute_indices(self):
    # type: () -> None
    if self.corpus is None:
        self.indices = None
        return
    if self.corpus and not self.corpus.has_tokens():
        preprocessor = Preprocessor(tokenizer=WordPunctTokenizer())
        preprocessor(self.corpus)
    self.indices = [ConcordanceIndex(doc, key=lambda x: x.lower())
                    for doc in self.corpus.tokens]
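# A minimal standalone sketch (not part of the widget above) of how
# nltk's ConcordanceIndex behaves with a case-folding key, which is why
# _compute_indices lowercases the tokens at index time.
from nltk.text import ConcordanceIndex

tokens = ['The', 'quick', 'fox', 'saw', 'the', 'lazy', 'fox']
index = ConcordanceIndex(tokens, key=lambda x: x.lower())
# Offsets are recorded under the lowercased form, so a lowercase query
# matches every casing of the word: here [2, 6] for 'fox'.
print(index.offsets('fox'))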
def test_tokenizer(self):
    class SpaceTokenizer(preprocess.BaseTokenizer):
        @classmethod
        def tokenize(cls, string):
            return string.split()

    p = Preprocessor(tokenizer=SpaceTokenizer())
    np.testing.assert_equal(
        p(self.corpus).tokens,
        np.array([sent.split() for sent in self.corpus.documents]))
def tokens(self):
    """
    Return a list of lists containing tokens. If tokens are not yet
    present, run the default preprocessor and cache the result.
    """
    if self._tokens is None:
        from orangecontrib.text.preprocess import Preprocessor
        p = Preprocessor()
        self._tokens = p(self.documents)
    return self._tokens
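# A hedged sketch of the lazy-caching pattern the property above uses:
# the first access computes the value, later accesses reuse it. The
# `DummyCorpus` class and its whitespace tokenization are hypothetical,
# standing in for Corpus and the default Preprocessor.
class DummyCorpus:
    def __init__(self, documents):
        self.documents = documents
        self._tokens = None

    @property
    def tokens(self):
        # Compute on first access only, then serve the cached list.
        if self._tokens is None:
            self._tokens = [doc.lower().split() for doc in self.documents]
        return self._tokens

docs = DummyCorpus(['Human machine interface'])
assert docs.tokens is docs.tokens  # computed once, then cached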
def test_token_normalizer(self):
    class CapTokenNormalizer(preprocess.BaseNormalizer):
        @classmethod
        def normalize(cls, token):
            return token.capitalize()

    p = Preprocessor(normalizer=CapTokenNormalizer())
    np.testing.assert_equal(
        p(self.corpus).tokens,
        np.array([[sent.capitalize()] for sent in self.corpus.documents]))
def test_max_df(self):
    ff = preprocess.FrequencyFilter(max_df=.3)
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                     filters=[ff])
    size = len(self.corpus.documents)

    corpus = p(self.corpus)
    self.assertFrequencyRange(corpus, 1, size * .3)

    ff.max_df = 2
    corpus = p(self.corpus)
    self.assertFrequencyRange(corpus, 1, 2)
def __init__(self):
    super().__init__()
    self.n_words = 0
    self.mean_weight = 0
    self.std_weight = 0
    self.selected_words = SelectedWords(self)
    self.webview = None
    self.bow = None  # bag-of-words
    self.topics = None
    self.corpus = None
    self.PREPROCESS = Preprocessor()
    self._create_layout()
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    # Compiled regexes are NOT deepcopy-able, and Corpus must stay
    # deepcopy-able (it stores used_preprocessor for BoW compute values),
    # so we cannot store the compiled pattern. To bypass the problem,
    # the regex is compiled before every __call__ and discarded right
    # after.
    self.tokenizer = self.tokenizer_cls(self.__pattern)
    corpus = Preprocessor.__call__(self, corpus)
    if callback is None:
        callback = dummy_callback
    callback(0, "Tokenizing...")
    corpus = self._store_tokens_from_documents(corpus, callback)
    self.tokenizer = None
    return corpus
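# A small self-contained illustration of the constraint the comment
# above works around. The behavior is version-dependent: before Python
# 3.7 deepcopying a compiled pattern raised TypeError, while on 3.7+
# patterns are treated as atomic and deepcopy returns the same object.
# Keeping only the raw pattern string and recompiling per call, as the
# method above does, is the portable choice.
import copy
import re

pattern_string = r'\w+'
compiled = re.compile(pattern_string)
try:
    clone = copy.deepcopy(compiled)  # TypeError on Python < 3.7
except TypeError:
    clone = re.compile(pattern_string)  # recompile from the string instead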
def test_min_df(self):
    ff = preprocess.FrequencyFilter(min_df=.5)
    p = Preprocessor(tokenizer=preprocess.RegexpTokenizer(r'\w+'),
                     filters=[ff])

    processed = p(self.corpus)
    size = len(processed.documents)
    self.assertFrequencyRange(processed, size * .5, size)

    ff.min_df = 2
    processed = p(self.corpus)
    size = len(processed.documents)
    self.assertFrequencyRange(processed, 2, size)
def test_string_processor(self):
    class StripStringTransformer(preprocess.BaseTransformer):
        @classmethod
        def transform(cls, string):
            return string[:-1]

    p = Preprocessor(transformers=StripStringTransformer())
    np.testing.assert_equal(
        p(self.corpus).tokens,
        np.array([[doc[:-1]] for doc in self.corpus.documents]))

    p = Preprocessor(transformers=[StripStringTransformer(),
                                   preprocess.LowercaseTransformer()])
    np.testing.assert_equal(
        p(self.corpus).tokens,
        np.array([[doc[:-1].lower()] for doc in self.corpus.documents]))

    self.assertRaises(TypeError, Preprocessor, string_transformers=1)
def test_porter_sentence(self):
    corpus = [
        'Caresses flies dies mules denied died agreed owned humbled sized.'
    ]
    # One stem per input word; 'sized' stems to 'size'.
    stemmed = ['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre',
               'own', 'humbl', 'size']
    p = Preprocessor(lowercase=True, stop_words=None, trans=Stemmer)
    corpus = p(corpus)
    self.assertEqual(sorted(corpus), sorted(stemmed))
def __call__(self, corpus: Corpus, callback: Callable = None) -> Corpus:
    try:
        self.__model = udpipe.Model.load(self.models[self.__language])
    except StopIteration:
        raise UDPipeStopIteration

    if self.__use_tokenizer:
        corpus = Preprocessor.__call__(self, corpus)
        if callback is None:
            callback = dummy_callback
        callback(0, "Normalizing...")
        return self._store_tokens_from_documents(corpus, callback)
    else:
        return super().__call__(corpus, callback)
def test_preprocess_corpus_float_df(self):
    p = Preprocessor(min_df=0.2, max_df=0.5, lowercase=True)
    result = p(self.TEST_CORPUS)
    correct = [
        ['human', 'interface', 'computer'],
        ['a', 'survey', 'user', 'computer', 'system', 'response', 'time'],
        ['the', 'eps', 'user', 'interface', 'system'],
        ['system', 'and', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['the', 'trees'],
        ['the', 'graph', 'trees'],
        ['graph', 'minors', 'trees', 'and'],
        ['graph', 'minors', 'a', 'survey'],
    ]
    self.assertEqual(result.tokens, correct)
def test_token_filter(self):
    class SpaceTokenizer(preprocess.BaseTokenizer):
        @classmethod
        def tokenize(cls, string):
            return string.split()

    class LengthFilter(preprocess.BaseTokenFilter):
        @classmethod
        def check(cls, token):
            return len(token) < 4

    p = Preprocessor(tokenizer=SpaceTokenizer(), filters=LengthFilter())
    np.testing.assert_equal(
        p(self.corpus).tokens,
        np.array([[token for token in doc.split() if len(token) < 4]
                  for doc in self.corpus.documents]))
def assemble_preprocessor(self):
    self.error(0, '')
    pp_settings = {
        # If the lowercase preprocessor is disabled, this key would
        # otherwise default to True, which is not what we want.
        'lowercase': False,
    }
    for pp in self.preprocessors:
        if pp.enabled:
            pp_settings.update(pp.get_pp_setting())
    pp_settings['callback'] = self.document_finished
    try:
        preprocessor = Preprocessor(**pp_settings)
    except Exception as e:
        self.error(0, str(e))
        return None
    return preprocessor
def test_preprocess_corpus_min_df(self):
    p = Preprocessor(
        lowercase=True,
        stop_words=['for', 'a', 'of', 'the', 'and', 'to', 'in'],
        min_df=2)
    corpus = p(self.corpus)
    correct = [
        ['human', 'interface', 'computer'],
        ['survey', 'user', 'computer', 'system', 'response', 'time'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
        ['graph', 'minors', 'trees'],
        ['graph', 'minors', 'survey'],
    ]
    self.assertEqual(len(corpus), len(correct))
    for i, j in zip(corpus, correct):
        self.assertEqual(sorted(i), sorted(j))
def __init__(self):
    super().__init__()
    self.lda = None
    self.corpus = None
    self.preprocessor = Preprocessor()

    # Info.
    info_box = gui.widgetBox(self.controlArea, "Info")
    self.info_label = gui.label(info_box, self, '')

    # Settings.
    topic_box = gui.widgetBox(self.controlArea, "Settings")
    hbox = gui.widgetBox(topic_box, orientation=0)
    self.topics_label = gui.label(hbox, self, 'Number of topics: ')
    self.topics_label.setMaximumSize(self.topics_label.sizeHint())
    self.topics_input = gui.spin(hbox, self, "num_topics",
                                 minv=1, maxv=2 ** 31 - 1,
                                 callback=self.num_topics_changed)

    # Commit button.
    self.commit = gui.button(self.controlArea, self, "&Apply",
                             callback=self.apply, default=True)
    self.commit.setEnabled(False)
    gui.rubber(self.controlArea)

    # Topics description.
    self.cols = ['Topic', 'Topic keywords']
    self.topic_desc = QtGui.QTreeWidget()
    self.topic_desc.setColumnCount(len(self.cols))
    self.topic_desc.setHeaderLabels(self.cols)
    # self.topic_desc.setSelectionMode(QtGui.QTreeView.ExtendedSelection)
    self.topic_desc.itemSelectionChanged.connect(
        self.selected_topic_changed)
    for i in range(len(self.cols)):
        self.topic_desc.resizeColumnToContents(i)
    self.mainArea.layout().addWidget(self.topic_desc)

    self.refresh_gui()
def test_wordnet_lemmatizer_sentence(self):
    corpus = [
        'Pursued brightness insightful blessed lies held timelessly minds.'
    ]
    lemmas = ['pursued', 'brightness', 'insightful', 'blessed', 'lie',
              'held', 'timelessly', 'mind', '.']
    p = Preprocessor(transformation=Lemmatizer)
    result = p(corpus)[0]
    self.assertEqual(result, lemmas)
def preprocess_only_words(corpus: Corpus) -> Corpus:
    """
    Apply a preprocessor that splits words, transforms them to lower
    case, and removes punctuation.

    Parameters
    ----------
    corpus
        Corpus on which the preprocessor will be applied.

    Returns
    -------
    Preprocessed corpus. The result of pre-processing is saved in
    tokens/ngrams.
    """
    p = Preprocessor(
        transformers=[LowercaseTransformer()],
        # by default the regexp keeps only words (no punctuation, no spaces)
        tokenizer=RegexpTokenizer(),
    )
    return p(corpus, inplace=False)
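# A hedged usage sketch of preprocess_only_words, assuming the bundled
# 'deerwester' sample corpus that Orange3-text ships with its tests.
from orangecontrib.text import Corpus

corpus = Corpus.from_file('deerwester')
processed = preprocess_only_words(corpus)
print(processed.tokens[0])  # lowercased word tokens, punctuation dropped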
def test_snowball_sentence(self):
    corpus = [
        'Caresses flies dies mules denied died agreed owned humbled sized.'
    ]
    stemmed = ['caress', 'fli', 'die', 'mule', 'deni', 'die', 'agre',
               'own', 'humbl', 'size', '.']
    p = Preprocessor(transformation=SnowballStemmer)
    result = p(corpus)[0]
    self.assertEqual(result, stemmed)
def test_preprocess_one_sentence_lowercase_stopwords(self):
    p = Preprocessor(lowercase=True, stop_words=None)
    corpus = p(self.sentence)
    self.assertEqual(
        sorted(corpus),
        sorted(['abc', 'applications', 'computer', 'for', 'human',
                'interface', 'lab', 'machine']))

    p = Preprocessor(lowercase=True, stop_words='english')
    corpus = p(self.sentence)
    self.assertEqual(
        sorted(corpus),
        sorted(['abc', 'applications', 'computer', 'human',
                'interface', 'lab', 'machine']))

    p = Preprocessor(lowercase=True,
                     stop_words=['abc', 'applications', 'computer', 'for'])
    corpus = p(self.sentence)
    self.assertEqual(sorted(corpus),
                     sorted(['human', 'interface', 'lab', 'machine']))

    p = Preprocessor(lowercase=False, stop_words=None)
    corpus = p(self.sentence)
    self.assertEqual(
        sorted(corpus),
        sorted(['Human', 'abc', 'applications', 'computer', 'for',
                'interface', 'lab', 'machine']))

    p = Preprocessor(lowercase=False, stop_words='english')
    corpus = p(self.sentence)
    self.assertEqual(
        sorted(corpus),
        sorted(['abc', 'applications', 'computer', 'Human',
                'interface', 'lab', 'machine']))

    p = Preprocessor(lowercase=False,
                     stop_words=['abc', 'applications', 'computer', 'for'])
    corpus = p(self.sentence)
    self.assertEqual(sorted(corpus),
                     sorted(['Human', 'interface', 'lab', 'machine']))
def test_faulty_init_parameters(self):
    # Stop word source.
    with self.assertRaises(ValueError):
        Preprocessor(stop_words='faulty_value')
    # Transformation.
    with self.assertRaises(ValueError):
        Preprocessor(transformation='faulty_value')
    # Min/Max df.
    with self.assertRaises(ValueError):
        Preprocessor(min_df='faulty_value')
    with self.assertRaises(ValueError):
        Preprocessor(max_df='faulty_value')
    with self.assertRaises(ValueError):
        Preprocessor(min_df=1.5)
    with self.assertRaises(ValueError):
        Preprocessor(max_df=1.5)
def __call__(self, corpus, use_tfidf=False):
    if corpus is None:
        raise ValueError(
            'Cannot compute Bag of Words without an input corpus.')

    has_tokens = hasattr(corpus, 'tokens') and corpus.tokens is not None
    if not has_tokens:
        # Perform default pre-processing.
        preprocessor = Preprocessor()
        corpus = preprocessor(corpus)
    self.check_progress()  # Step 1

    dictionary = corpora.Dictionary(corpus.tokens, prune_at=np.inf)
    self.vocabulary = dictionary

    # Term frequencies.
    bag_of_words = [dictionary.doc2bow(i) for i in corpus.tokens]
    self.check_progress()  # Step 2

    if use_tfidf:
        tfidf_model = TfidfModel(bag_of_words)
        bag_of_words = tfidf_model[bag_of_words]
    self.check_progress()  # Step 3

    X = matutils.corpus2dense(bag_of_words,
                              num_terms=len(dictionary.keys()),
                              dtype=np.float64).T
    bow_corpus = corpus.copy()
    feats = [v for k, v in sorted(dictionary.items())]
    bow_corpus.extend_attributes(X, feats, var_attrs={'bow_feature': True})
    self.check_progress()  # Step 4

    return bow_corpus
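# A minimal sketch of the gensim calls used in steps 1-2 above:
# Dictionary maps tokens to integer ids, and doc2bow turns a token list
# into sparse (token_id, count) pairs, which corpus2dense later expands
# into the dense matrix X.
from gensim import corpora

docs = [['graph', 'minors', 'survey'], ['graph', 'trees']]
dictionary = corpora.Dictionary(docs)
print(dictionary.doc2bow(docs[0]))  # e.g. [(0, 1), (1, 1), (2, 1)]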
def set_preprocessor(self, data):
    if data is None:
        self.preprocessor = Preprocessor()
    else:
        self.preprocessor = data
    self.apply()