def test_fit_transform(self):
    """Test NLTKPreprocessor `fit_transform` method.

    Covers plain string input, `feed_attributes` selection on
    namedtuple records, and `output_attributes` pass-through.
    """
    # custom parameters
    prep = NLTKPreprocessor(stopwords=True, lower=True)

    # test without feed argument and simple data
    transform = prep.fit_transform(X=TEST_DATA)

    # FIX: `assertTrue(a, b)` only checks truthiness of `a` and uses `b`
    # as the failure message -- the length comparison was never performed.
    # `assertEqual` actually asserts one output row per input row.
    self.assertEqual(len(transform), len(TEST_DATA))
    # check that the array is not empty
    self.assertTrue(any(transform))

    # create series to simulate output of LabelPreprocessor and make
    # use of `feed_attributes` argument
    Series = namedtuple('Series', 'description label')
    data = [Series(d, 'label') for d in TEST_DATA]

    transform = prep.fit_transform(X=data, feed_attributes=['description'])
    self.assertEqual(len(transform), len(data))
    # check that the array is not empty
    self.assertTrue(any(transform))

    # perform transformation and output labels as well
    transform = prep.fit_transform(X=data,
                                   feed_attributes=['description'],
                                   output_attributes=['label'])
    self.assertEqual(len(transform), len(data))
    # check that the array is not empty
    self.assertTrue(any(transform))
def test_tokenize(self):
    """Test NLTKPreprocessor `tokenize` method.

    NOTE(review): this method is redefined later in the class; the later
    definition shadows this one, so this version never runs under
    unittest. Consider deleting one of the duplicates.
    """
    prep = NLTKPreprocessor(stopwords=True, lower=True)
    result = prep.tokenize(TEST_SENT)
    self.assertIsInstance(result, typing.Iterable)
    # FIX: removed leftover debugging `print(result)` call.
    # check that punctuation has been gotten rid of
    self.assertFalse(any(re.match(u"[,.]", t[0][0]) for t in result))
    # check that the list contains elements of same type
    self.assertTrue(all(isinstance(t[0], type(t[1])) for t in result))
def test_inverse_transform(self):
    """Test NLTKPreprocessor `inverse_transform` method."""
    prep = NLTKPreprocessor()

    # test without feed argument and simple data
    transform = prep.fit_transform(X=TEST_DATA)
    inversed = NLTKPreprocessor.inverse_transform(transform)

    # FIX: `assertTrue(a, b)` only checks truthiness of `a` and uses `b`
    # as the failure message -- use `assertEqual` to actually compare
    # the lengths of the inverse and forward transforms.
    self.assertEqual(len(inversed), len(transform))
    # check that the array is not empty
    self.assertTrue(any(inversed))
def test_init(self):
    """Test NLTKPreprocessor initialization.

    NOTE(review): this method is redefined later in the class; the later
    definition shadows this one, so this version never runs under
    unittest. Consider deleting one of the duplicates.
    """
    # default construction succeeds
    default_prep = NLTKPreprocessor()
    self.assertIsInstance(default_prep, NLTKPreprocessor)

    # custom parameters: the stop-word set must be populated
    custom_prep = NLTKPreprocessor(stopwords=True, lower=True)
    self.assertIsNotNone(custom_prep._stopwords)  # pylint: disable=protected-access
    self.assertIsInstance(custom_prep, NLTKPreprocessor)
def test_init(self):
    """Test NLTKPreprocessor initialization."""
    # default construction succeeds
    self.assertIsInstance(NLTKPreprocessor(), NLTKPreprocessor)

    # custom parameters: the stop-word set must be populated
    prep = NLTKPreprocessor(stopwords=True, lower=True)
    self.assertIsInstance(prep, NLTKPreprocessor)
    self.assertIsNotNone(prep._stopwords)  # pylint: disable=protected-access

    # passing a bare string where a list of attribute names is expected
    # must be rejected with a TypeError
    for bad_kwargs in ({'feed_attributes': 'attribute'},
                       {'output_attributes': 'attribute'}):
        with self.assertRaises(TypeError):
            # noinspection PyTypeChecker
            _ = NLTKPreprocessor(**bad_kwargs)
def test_tokenize(self):
    """Test NLTKPreprocessor `tokenize` method."""

    def assert_valid_tokens(tokens):
        # shared checks: iterable of pairs, punctuation stripped,
        # pair elements of matching type
        self.assertIsInstance(tokens, typing.Iterable)
        # check that punctuation has been gotten rid of
        self.assertFalse(any(re.match(u"[,.]", t[0][0]) for t in tokens))
        # check that the list contains elements of same type
        self.assertTrue(all(isinstance(t[0], type(t[1])) for t in tokens))

    prep = NLTKPreprocessor(stopwords=True, lower=True)
    result = prep.tokenize(TEST_SENT)
    assert_valid_tokens(result)

    # tag_correction: tags listed in `tag_dict` must be rewritten
    corrected = NLTKPreprocessor(tag_dict={'NUM': 'BUM'}).tokenize(TEST_SENT)
    self.assertTrue(all([
        c[1] == 'BUM'
        for c, r in zip(corrected, result)
        if r[1] == 'NUM'
    ]))

    # stemmer and lemmatizer
    prep = NLTKPreprocessor(lemmatizer=WordNetLemmatizer(),
                            stemmer=SnowballStemmer(language='english'))
    assert_valid_tokens(prep.tokenize(TEST_SENT))

    # raises on non-string input
    with self.assertRaises(TypeError):
        # noinspection PyTypeChecker
        _ = prep.tokenize(['stream'])