def test_remove_non_alphas_04(self):
    # keep_alpha_nums retains word and number tokens; stop words are removed.
    warnings.simplefilter('ignore')
    expected_output = [['look', 'sentence'],
                       ['sentence', '59', '1', 'numbers'],
                       ['sentence', 'punctuation'],
                       ['sentence', 'john', 'paul', 'emails']]
    docs = [
        self.test_docs['stopwords_01'],
        self.test_docs['numbers_01'],
        self.test_docs['punct_01'],
        self.test_docs['emails_01'],
    ]
    params = {
        'keep_alpha': False,
        'keep_alpha_nums': True,
        'remove_stops': True,
        'remove_oov': False,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': False,
        'remove_punct': False
    }
    t = Tokenizer('nltk')
    f = TokenFilter('spacy', **params)
    p = Pipeline(t, f)
    test_output = p.apply(docs)
    self.assertEqual(test_output, expected_output)
def test_nltk_lemmatizer(self):
    warnings.simplefilter('ignore')
    expected_output = [[
        'running', "n't", "n't", 'cry', 'going', 'gone', 'tested'
    ]]
    docs = []
    docs.append(self.test_docs['stems_01'])
    params = {
        'keep_alpha': False,
        'keep_alpha_nums': False,
        'remove_stops': True,
        'remove_oov': False,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': False,
        'remove_punct': False
    }
    t = Tokenizer('spacy')
    f = TokenFilter('spacy', **params)
    s = Stemmer('nltk', lemmatizer='wordnet')
    p = Pipeline(t, f, s)
    test_output = p.apply(docs)
    self.assertEqual(test_output, expected_output,
                     'NLTK lemmatizer did not lemmatize')
def test_spacy_stops(self):
    warnings.simplefilter('ignore')
    expected_output = [[':', 'look', 'like', '-', 'sentence']]
    docs = []
    docs.append(self.test_docs['stopwords_01'])
    params = {
        'keep_alpha': False,
        'keep_alpha_nums': False,
        'remove_stops': True,
        'remove_oov': False,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': True,
        'remove_punct': False
    }
    t = Tokenizer('spacy')
    f = TokenFilter('spacy', **params)
    p = Pipeline(t, f)
    test_output = p.apply(docs)
    self.assertEqual(test_output, expected_output,
                     'Spacy did not remove stop words')
def test_nltk_oov(self):
    warnings.simplefilter('ignore')
    expected_output = [['this', 'sentence', 'only']]
    docs = []
    docs.append(self.test_docs['oov_01'])
    params = {
        'keep_alpha': False,
        'keep_alpha_nums': False,
        'remove_stops': False,
        'remove_oov': True,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': True,
        'remove_punct': False
    }
    t = Tokenizer('nltk')
    f = TokenFilter('nltk', **params)
    p = Pipeline(t, f)
    test_output = p.apply(docs)
    self.assertEqual(test_output, expected_output,
                     'NLTK did not remove non English words')
def test_spacy_lemma(self):
    warnings.simplefilter('ignore')
    expected_output = [['run', 'not', 'not', 'cry', 'go', 'go', 'test']]
    docs = []
    docs.append(self.test_docs['stems_01'])
    params = {
        'keep_alpha': False,
        'keep_alpha_nums': False,
        'remove_stops': True,
        'remove_oov': False,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': False,
        'remove_punct': False
    }
    t = Tokenizer('spacy')
    f = TokenFilter('spacy', **params)
    s = Stemmer('spacy')
    p = Pipeline(t, f, s)
    test_output = p.apply(docs)
    self.assertEqual(test_output, expected_output,
                     'Spacy did not lemmatize')