Python NLTKPreprocessor 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: toolkit.preprocessing

클래스/타입: NLTKPreprocessor

hotexamples.com에서의 예제들: 6

Python NLTKPreprocessor - 6개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 toolkit.preprocessing.NLTKPreprocessor에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

NLTKPreprocessor(6)

fit_transform(2)

tokenize(2)

inverse_transform(1)

예제 #1

파일 보기

    def test_fit_transform(self):
        """Test NLTKPreprocessor `fit_transform` method.

        Exercises three call shapes -- plain data, named tuples routed through
        `feed_attributes`, and the same with `output_attributes` -- and checks
        each time that the result has as many entries as the input and is
        not empty.
        """
        # custom parameters
        prep = NLTKPreprocessor(stopwords=True, lower=True)
        # test without feed argument and simple data
        transform = prep.fit_transform(X=TEST_DATA)

        # assertEqual rather than assertTrue: assertTrue(a, b) treats `b` as
        # the failure message, so the lengths would never be compared
        self.assertEqual(len(transform), len(TEST_DATA))
        # check that the array is not empty
        self.assertTrue(any(transform))

        # create series to simulate output of LabelPreprocessor and make
        # use of `feed_attributes` argument
        Series = namedtuple('Series', 'description label')
        data = [Series(d, 'label') for d in TEST_DATA]

        transform = prep.fit_transform(X=data, feed_attributes=['description'])

        self.assertEqual(len(transform), len(data))
        # check that the array is not empty
        self.assertTrue(any(transform))

        # perform transformation and output labels as well
        transform = prep.fit_transform(X=data,
                                       feed_attributes=['description'],
                                       output_attributes=['label'])

        self.assertEqual(len(transform), len(data))
        # check that the array is not empty
        self.assertTrue(any(transform))

예제 #2

파일 보기

    def test_tokenize(self):
        """Test NLTKPreprocessor `tokenize` method.

        Tokenizes TEST_SENT with stopword handling and lowercasing enabled,
        then checks that the result is iterable, that no item's leading
        character is a comma or period, and that both members of each item
        share a type.
        """
        prep = NLTKPreprocessor(stopwords=True, lower=True)
        result = prep.tokenize(TEST_SENT)
        self.assertIsInstance(result, typing.Iterable)

        # check that punctuation has been gotten rid of
        self.assertFalse(any(re.match(u"[,.]", t[0][0]) for t in result))
        # check that the list contains elements of same type
        self.assertTrue(all(isinstance(t[0], type(t[1])) for t in result))

예제 #3

파일 보기

파일: test_preprocessors.py 프로젝트: msrb/fabric8-analytics-nvd-toolkit

    def test_inverse_transform(self):
        """Test NLTKPreprocessor `inverse_transform` method.

        Transforms TEST_DATA with a default preprocessor, feeds the result to
        `inverse_transform`, and checks that the inverse has the same number
        of entries as the transformed data and is not empty.
        """
        prep = NLTKPreprocessor()

        # test without feed argument and simple data
        transform = prep.fit_transform(X=TEST_DATA)

        inversed = NLTKPreprocessor.inverse_transform(transform)

        # assertEqual rather than assertTrue: assertTrue(a, b) treats `b` as
        # the failure message, so the lengths would never be compared
        self.assertEqual(len(inversed), len(transform))
        # check that the array is not empty
        self.assertTrue(any(inversed))

예제 #4

파일 보기

    def test_init(self):
        """Check that NLTKPreprocessor can be constructed.

        Covers construction with no arguments and with `stopwords` and
        `lower` enabled; the latter must leave `_stopwords` set.
        """
        # construction with defaults
        default_prep = NLTKPreprocessor()
        self.assertIsInstance(default_prep, NLTKPreprocessor)

        # construction with explicit options
        custom_prep = NLTKPreprocessor(stopwords=True, lower=True)
        loaded_stopwords = custom_prep._stopwords  # pylint: disable=protected-access
        self.assertIsNotNone(loaded_stopwords)
        self.assertIsInstance(custom_prep, NLTKPreprocessor)

예제 #5

파일 보기

파일: test_preprocessors.py 프로젝트: msrb/fabric8-analytics-nvd-toolkit

    def test_init(self):
        """Check NLTKPreprocessor construction, valid and invalid.

        Covers construction with no arguments and with `stopwords` and
        `lower` enabled (which must leave `_stopwords` set), then verifies
        that passing a plain string as `feed_attributes` or
        `output_attributes` raises TypeError.
        """
        # construction with defaults
        default_prep = NLTKPreprocessor()
        self.assertIsInstance(default_prep, NLTKPreprocessor)

        # construction with explicit options
        custom_prep = NLTKPreprocessor(stopwords=True, lower=True)
        loaded_stopwords = custom_prep._stopwords  # pylint: disable=protected-access
        self.assertIsNotNone(loaded_stopwords)
        self.assertIsInstance(custom_prep, NLTKPreprocessor)

        # a bare string is rejected for either attribute-list argument;
        # same order as before: feed_attributes first, then output_attributes
        for bad_kwarg in ('feed_attributes', 'output_attributes'):
            with self.assertRaises(TypeError):
                NLTKPreprocessor(**{bad_kwarg: 'attribute'})

예제 #6

파일 보기

파일: test_preprocessors.py 프로젝트: msrb/fabric8-analytics-nvd-toolkit

    def test_tokenize(self):
        """Check NLTKPreprocessor `tokenize` under several configurations.

        Covers stopwords + lowercasing, tag correction through `tag_dict`,
        a lemmatizer/stemmer combination, and the TypeError raised when a
        list is passed instead of TEST_SENT-style input.
        """
        def check_tokens(tokens):
            # shared checks applied to a `tokenize` result
            self.assertIsInstance(tokens, typing.Iterable)
            # no item may start with a comma or a period
            has_punct = any(re.match(u"[,.]", item[0][0]) for item in tokens)
            self.assertFalse(has_punct)
            # both members of every item must be of the same type
            same_types = all(
                isinstance(item[0], type(item[1])) for item in tokens
            )
            self.assertTrue(same_types)

        base_prep = NLTKPreprocessor(stopwords=True, lower=True)
        result = base_prep.tokenize(TEST_SENT)
        check_tokens(result)

        # tag correction: wherever the baseline tag is 'NUM', the corrected
        # result must carry 'BUM' at the same position
        correcting_prep = NLTKPreprocessor(tag_dict={'NUM': 'BUM'})
        result_corrected = correcting_prep.tokenize(TEST_SENT)
        corrected_tags = [
            fixed[1]
            for fixed, plain in zip(result_corrected, result)
            if plain[1] == 'NUM'
        ]
        self.assertTrue(all(tag == 'BUM' for tag in corrected_tags))

        # stemmer and lemmatizer
        stemming_prep = NLTKPreprocessor(
            lemmatizer=WordNetLemmatizer(),
            stemmer=SnowballStemmer(language='english'),
        )
        check_tokens(stemming_prep.tokenize(TEST_SENT))

        # a list is not accepted as input
        with self.assertRaises(TypeError):
            # noinspection PyTypeChecker
            stemming_prep.tokenize(['stream'])