Example #1
    def test_remove_non_alphas_04(self):
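        """Alphanumeric filtering: with keep_alpha_nums, tokens like '59' and '1'
        survive, punctuation-only tokens are dropped, and stop words are removed."""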
        warnings.simplefilter('ignore')
        expected_output = [['look', 'sentence'],
                           ['sentence', '59', '1', 'numbers'],
                           ['sentence', 'punctuation'],
                           ['sentence', 'john', 'paul', 'emails']]
        docs = [
            self.test_docs['stopwords_01'],
            self.test_docs['numbers_01'],
            self.test_docs['punct_01'],
            self.test_docs['emails_01'],
        ]
        params = {
            'keep_alpha': False,
            'keep_alpha_nums': True,
            'remove_stops': True,
            'remove_oov': False,
            'remove_nums': False,
            'remove_url': False,
            'remove_email': False,
            'remove_punct': False
        }

        t = Tokenizer('nltk')
        f = TokenFilter('spacy', **params)
        p = Pipeline(t, f)
        test_output = p.apply(docs)
        self.assertEqual(test_output, expected_output)
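
For readers who want to run one of these pipelines outside the test class, here is a minimal standalone sketch. The module name and the input document are assumptions (the test suite's import path is not shown above); the Tokenizer, TokenFilter, and Pipeline names, the backend strings, and the apply() call are taken from the example.

    # Minimal standalone sketch -- the module name and input text below are
    # assumptions, not part of the test suite shown above.
    from textpipeline import Tokenizer, TokenFilter, Pipeline  # hypothetical module

    docs = ['This is a sentence to look at.']  # hypothetical raw document

    params = {
        'keep_alpha': False,
        'keep_alpha_nums': True,   # keep alphanumeric tokens such as '59'
        'remove_stops': True,      # drop stop words
        'remove_oov': False,
        'remove_nums': False,
        'remove_url': False,
        'remove_email': False,
        'remove_punct': False,
    }

    t = Tokenizer('nltk')               # NLTK-backed tokenization
    f = TokenFilter('spacy', **params)  # spaCy-backed token filtering
    p = Pipeline(t, f)

    # apply() takes a list of raw documents and returns one token list per
    # document (a list of lists of strings, as in expected_output above).
    print(p.apply(docs))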
Example #2
    def test_nltk_lemmatizer(self):
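        """NLTK WordNet lemmatization: with the default noun POS, most verb
        forms ('running', 'going') pass through unchanged."""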
        warnings.simplefilter('ignore')
        expected_output = [[
            'running', "n't", "n't", 'cry', 'going', 'gone', 'tested'
        ]]
        docs = []
        docs.append(self.test_docs['stems_01'])
        params = {
            'keep_alpha': False,
            'keep_alpha_nums': False,
            'remove_stops': True,
            'remove_oov': False,
            'remove_nums': False,
            'remove_url': False,
            'remove_email': False,
            'remove_punct': False
        }

        t = Tokenizer('spacy')
        f = TokenFilter('spacy', **params)
        s = Stemmer('nltk', lemmatizer='wordnet')
        p = Pipeline(t, f, s)
        test_output = p.apply(docs)
        self.assertEqual(test_output, expected_output,
                         'NLTK lemmatizer did not lemmatize')
Example #3
    def test_spacy_stops(self):
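        """spaCy stop-word removal: stop words are dropped while punctuation
        tokens (':', '-') are kept."""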
        warnings.simplefilter('ignore')
        expected_output = [[':', 'look', 'like', '-', 'sentence']]
        docs = []
        docs.append(self.test_docs['stopwords_01'])
        params = {
            'keep_alpha': False,
            'keep_alpha_nums': False,
            'remove_stops': True,
            'remove_oov': False,
            'remove_nums': False,
            'remove_url': False,
            'remove_email': True,
            'remove_punct': False
        }

        t = Tokenizer('spacy')
        f = TokenFilter('spacy', **params)
        p = Pipeline(t, f)
        test_output = p.apply(docs)
        self.assertEqual(test_output, expected_output,
                         'Spacy did not remove stop words')
Example #4
    def test_nltk_oov(self):
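        """Out-of-vocabulary filtering: remove_oov drops tokens that NLTK does
        not recognize as English words."""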
        warnings.simplefilter('ignore')
        expected_output = [['this', 'sentence', 'only']]
        docs = []
        docs.append(self.test_docs['oov_01'])
        params = {
            'keep_alpha': False,
            'keep_alpha_nums': False,
            'remove_stops': False,
            'remove_oov': True,
            'remove_nums': False,
            'remove_url': False,
            'remove_email': True,
            'remove_punct': False
        }

        t = Tokenizer('nltk')
        f = TokenFilter('nltk', **params)
        p = Pipeline(t, f)
        test_output = p.apply(docs)
        self.assertEqual(test_output, expected_output,
                         'NLTK did not remove non-English words')
Example #5
    def test_spacy_lemma(self):
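        """spaCy lemmatization: inflected forms map to lemmas, e.g. 'running'
        -> 'run' and 'gone' -> 'go'."""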
        warnings.simplefilter('ignore')
        expected_output = [['run', 'not', 'not', 'cry', 'go', 'go', 'test']]
        docs = []
        docs.append(self.test_docs['stems_01'])
        params = {
            'keep_alpha': False,
            'keep_alpha_nums': False,
            'remove_stops': True,
            'remove_oov': False,
            'remove_nums': False,
            'remove_url': False,
            'remove_email': False,
            'remove_punct': False
        }

        t = Tokenizer('spacy')
        f = TokenFilter('spacy', **params)
        s = Stemmer('spacy')
        p = Pipeline(t, f, s)
        test_output = p.apply(docs)
        self.assertEqual(test_output, expected_output,
                         'Spacy did not lemmatize')
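
A matching standalone sketch for the three-stage pipeline adds a Stemmer after filtering. Again, the module name and input text are assumptions; the constructor arguments mirror Examples #2 and #5.

    # Standalone three-stage sketch -- module name and input text are assumptions.
    from textpipeline import Tokenizer, TokenFilter, Stemmer, Pipeline  # hypothetical

    docs = ["He was running, going, and hasn't cried yet."]  # hypothetical input

    params = {
        'keep_alpha': False, 'keep_alpha_nums': False,
        'remove_stops': True, 'remove_oov': False,
        'remove_nums': False, 'remove_url': False,
        'remove_email': False, 'remove_punct': False,
    }

    t = Tokenizer('spacy')
    f = TokenFilter('spacy', **params)
    s = Stemmer('spacy')  # spaCy lemmas: 'running' -> 'run' (Example #5)
    # s = Stemmer('nltk', lemmatizer='wordnet')  # WordNet alternative (Example #2)
    p = Pipeline(t, f, s)
    print(p.apply(docs))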