예제 #1
0
    def test_fit_transform(self):
        """Test NLTKPreprocessor `fit_transform` method.

        Three call shapes are exercised with the same preprocessor:
        plain data, namedtuple records read through `feed_attributes`,
        and the same records with `output_attributes` requested as well.
        For each one the result must have one entry per input item and
        must not be empty.
        """
        # custom parameters
        prep = NLTKPreprocessor(stopwords=True, lower=True)
        # test without feed argument and simple data
        transform = prep.fit_transform(X=TEST_DATA)

        # NOTE: `assertTrue(a, b)` treats `b` as the failure message, so it
        # never compares lengths; `assertEqual` performs the intended check.
        self.assertEqual(len(transform), len(TEST_DATA))
        # check that the array is not empty
        self.assertTrue(any(transform))

        # create series to simulate output of LabelPreprocessor and make
        # use of `feed_attributes` argument
        Series = namedtuple('Series', 'description label')
        data = [Series(d, 'label') for d in TEST_DATA]

        transform = prep.fit_transform(X=data, feed_attributes=['description'])

        self.assertEqual(len(transform), len(data))
        # check that the array is not empty
        self.assertTrue(any(transform))

        # perform transformation and output labels as well
        transform = prep.fit_transform(X=data,
                                       feed_attributes=['description'],
                                       output_attributes=['label'])

        self.assertEqual(len(transform), len(data))
        # check that the array is not empty
        self.assertTrue(any(transform))
예제 #2
0
    def test_tokenize(self):
        """Test NLTKPreprocessor `tokenize` method.

        Tokenizes `TEST_SENT` and checks that the result is iterable, that
        no entry's first element starts with a comma or a period, and that
        both elements of every entry share a type.
        """
        prep = NLTKPreprocessor(stopwords=True, lower=True)
        result = prep.tokenize(TEST_SENT)
        self.assertIsInstance(result, typing.Iterable)

        # check that punctuation has been gotten rid of
        # (`re.match` anchors at the start, so only the leading character
        # of each entry's first element is inspected)
        self.assertFalse(any(re.match(u"[,.]", t[0][0]) for t in result))
        # check that the list contains elements of same type
        self.assertTrue(all(isinstance(t[0], type(t[1])) for t in result))
    def test_inverse_transform(self):
        """Test NLTKPreprocessor `inverse_transform` method.

        Transforms `TEST_DATA` with a default preprocessor, feeds the result
        to `inverse_transform` (called on the class, not the instance) and
        checks that the output has one entry per transformed item and is
        not empty.
        """
        prep = NLTKPreprocessor()

        # test without feed argument and simple data
        transform = prep.fit_transform(X=TEST_DATA)

        inversed = NLTKPreprocessor.inverse_transform(transform)

        # NOTE: `assertTrue(a, b)` would only use `b` as the failure message;
        # `assertEqual` actually compares the two lengths.
        self.assertEqual(len(inversed), len(transform))
        # check that the array is not empty
        self.assertTrue(any(inversed))
예제 #4
0
    def test_init(self):
        """Check that NLTKPreprocessor can be constructed.

        Covers construction with defaults and with `stopwords`/`lower`
        switched on; in the latter case `_stopwords` must be set.
        """
        # construction with no arguments
        default_prep = NLTKPreprocessor()
        self.assertIsInstance(default_prep, NLTKPreprocessor)

        # construction with explicit keyword arguments
        options = dict(stopwords=True, lower=True)
        custom_prep = NLTKPreprocessor(**options)

        stopword_store = custom_prep._stopwords  # pylint: disable=protected-access
        self.assertIsNotNone(stopword_store)
        self.assertIsInstance(custom_prep, NLTKPreprocessor)
    def test_init(self):
        """Check NLTKPreprocessor construction and argument validation.

        Covers construction with defaults, construction with
        `stopwords`/`lower` switched on (`_stopwords` must then be set), and
        the `TypeError` expected when `feed_attributes` or
        `output_attributes` is given a plain string.
        """
        # construction with no arguments
        default_prep = NLTKPreprocessor()
        self.assertIsInstance(default_prep, NLTKPreprocessor)

        # construction with explicit keyword arguments
        custom_prep = NLTKPreprocessor(stopwords=True, lower=True)

        stopword_store = custom_prep._stopwords  # pylint: disable=protected-access
        self.assertIsNotNone(stopword_store)
        self.assertIsInstance(custom_prep, NLTKPreprocessor)

        # a bare string is rejected for either attribute argument
        for argument in ('feed_attributes', 'output_attributes'):
            with self.assertRaises(TypeError):
                # noinspection PyTypeChecker
                NLTKPreprocessor(**{argument: 'attribute'})
    def test_tokenize(self):
        """Check NLTKPreprocessor `tokenize` under several configurations.

        Covers the stopwords/lowercase setup, tag correction through
        `tag_dict`, a lemmatizer combined with a stemmer, and the
        `TypeError` expected when a list is passed instead of a sentence.
        """
        def check_tokens(tokens):
            """Run the checks shared by every tokenized result."""
            self.assertIsInstance(tokens, typing.Iterable)
            # no entry's first element may start with a comma or a period
            self.assertFalse(any(re.match(u"[,.]", t[0][0]) for t in tokens))
            # both elements of every entry must share a type
            self.assertTrue(all(isinstance(t[0], type(t[1])) for t in tokens))

        baseline_prep = NLTKPreprocessor(stopwords=True, lower=True)
        baseline = baseline_prep.tokenize(TEST_SENT)
        check_tokens(baseline)

        # tag correction: wherever the baseline tagged 'NUM', the corrected
        # run must carry 'BUM' at the same position
        retagging_prep = NLTKPreprocessor(tag_dict={'NUM': 'BUM'})
        retagged = retagging_prep.tokenize(TEST_SENT)

        tag_was_replaced = [
            fixed[1] == 'BUM'
            for fixed, plain in zip(retagged, baseline)
            if plain[1] == 'NUM'
        ]
        self.assertTrue(all(tag_was_replaced))

        # stemmer and lemmatizer supplied together
        normalizing_prep = NLTKPreprocessor(
            lemmatizer=WordNetLemmatizer(),
            stemmer=SnowballStemmer(language='english'),
        )
        check_tokens(normalizing_prep.tokenize(TEST_SENT))

        # a list instead of a sentence must be rejected
        with self.assertRaises(TypeError):
            # noinspection PyTypeChecker
            normalizing_prep.tokenize(['stream'])