def test_naivebayesclassifier(self):
    """NaiveBayesClassifier on n-gram-featurized wiki-detox text must beat 0.5 accuracy."""
    np.random.seed(0)
    train_file = get_dataset("wiki_detox_train").as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t')
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    texttransform = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = texttransform.fit_transform(X_train)
    X_test = texttransform.transform(X_test)

    mymodel = NaiveBayesClassifier()
    mymodel.fit(X_train, y_train)
    scores = mymodel.predict(X_test)
    # BUGFIX: the previous `np.mean(y_test == [i for i in scores])[0]`
    # indexed into the result of np.mean and relied on pandas label
    # alignment between y_test and a rebuilt list. Compare flat numpy
    # arrays instead — same pattern as test_lightgbmclassifier.
    accuracy = np.mean(y_test.values.ravel() == scores.values)
    assert_greater(
        accuracy, 0.5, "accuracy should be greater than %s" % 0.5)
def test_lightgbmclassifier(self):
    """LightGbmClassifier on n-gram-featurized wiki-detox text must beat 0.58 accuracy."""
    np.random.seed(0)
    # Load the labelled wiki-detox sentiment data and split it.
    path = get_dataset('wiki_detox_train').as_filepath()
    features, labels = get_X_y(path, label_column='Sentiment',
                               sep='\t', encoding="utf-8")
    text_train, text_test, labels_train, labels_test = train_test_split(
        features['SentimentText'], labels)

    # Project the raw text into n-gram vector space (slot count capped).
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    vecs_train = featurizer.fit_transform(text_train, max_slots=5000)
    vecs_test = featurizer.transform(text_test, max_slots=5000)

    classifier = LightGbmClassifier().fit(vecs_train, labels_train,
                                          verbose=0)
    predictions = classifier.predict(vecs_test)
    accuracy = np.mean(labels_test.values.ravel() == predictions.values)
    assert_greater(
        accuracy, 0.58, "accuracy should be greater than %s" % 0.58)
def test_ngramfeaturizer_single(self):
    """One output column fed by two text inputs yields the expected shape."""
    path = get_dataset('infert').as_filepath()
    schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
              'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
              'col=spontaneous:R4:6 quote+ header=+')
    stream = FileDataStream(path, schema=schema)
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        columns={'features': ['id', 'education']})
    transformed = featurizer.fit_transform(stream)
    assert transformed.shape == (248, 652)
def test_syntax9_multiple_inputs(self):
    """The << dict syntax accepts several input columns for one output."""
    frame = pandas.DataFrame(dict(
        education1=['A', 'B', 'A', 'B', 'A'],
        education2=['c', 'd', 'c', 'd', 'c'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    inputs = frame.drop('y', axis=1)
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        'out1': ['education1', 'education2']}
    transformed = featurizer.fit_transform(inputs)
    assert transformed.shape == (5, 13)
def test_column_list_or_string(self):
    """Bug 142794: a column spec may be a string, a list, or a dict value,
    and the list form must match the plain-string form exactly."""
    frame = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one"]})
    frame['SentimentText'] = frame['SentimentText'].astype(str)
    # Both dict forms must be accepted at construction time (the
    # instances are intentionally not fitted — construction is the test).
    NGramFeaturizer(word_feature_extractor=n_gram()) << {
        "score": 'SentimentText'}
    NGramFeaturizer(word_feature_extractor=n_gram()) << {
        "score": ['SentimentText']}
    # list-of-columns form ...
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns=['SentimentText'])
    res1 = featurizer.fit_transform(frame)
    # ... must produce the same output as the plain-string form.
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram()) << 'SentimentText'
    res2 = featurizer.fit_transform(frame)
    assert_frame_equal(res1, res2)
def test_ngramfeaturizer_multi(self):
    """Declaring two output columns must be rejected — only one is allowed."""
    path = get_dataset('infert').as_filepath()
    schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
              'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
              'col=spontaneous:R4:6 quote+ header=+')
    stream = FileDataStream(path, schema=schema)
    try:
        featurizer = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            columns={'features': ['id'], 'features2': ['education']})
    except TypeError as e:
        assert 'Only one output column is allowed' in str(e)
        return
    # Construction slipped through; fitting must still fail at runtime.
    # System.InvalidCastException: 'Cannot cast
    # Newtonsoft.Json.Linq.JArray to Newtonsoft.Json.Linq.JToken.
    try:
        featurizer.fit_transform(stream)
        assert False
    except RuntimeError:
        pass
def test_ngramfeaturizer(self):
    """Featurizing the first 100 wiki-detox rows yields a known feature sum."""
    np.random.seed(0)
    train_file = get_dataset('wiki_detox_train').as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t', encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # map text reviews to vector space
    texttransform = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = texttransform.fit_transform(X_train[:100])
    # BUGFIX: the local was named `sum`, shadowing the builtin; renamed.
    total = X_train.iloc[:].sum().sum()
    assert_equal(total, 30513, "sum of all features is incorrect!")
def test_ngramfeaturizer_syntax_dict(self):
    """Dict syntax ('outg' from 'review') keeps the source column in the
    output, which must then be dropped before training/prediction."""
    review_texts = [
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot",
        "I kind of hate it", "I do like it", "I really hate it",
        "It is very good", "I hate it a bunch", "I love it a bunch",
        "I hate it", "I like it very much", "I hate it very much.",
        "I really do love it", "I really do hate it", "Love it!",
        "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
        "I love it"]
    like_flags = [
        True, False, True, False, True, False, True, False, True,
        False, True, False, True, False, True, False, True, False,
        True, False, True, False, True, False, True]
    train_reviews = pandas.DataFrame(data=dict(review=review_texts,
                                               like=like_flags))
    test_reviews = pandas.DataFrame(data=dict(review=[
        "This is great", "I hate it", "Love it", "Really like it",
        "I hate it", "I like it a lot", "I love it", "I do like it",
        "I really hate it", "I love it"]))

    y = train_reviews['like']
    X = train_reviews.loc[:, train_reviews.columns != 'like']
    featurizer = NGramFeaturizer(word_feature_extractor=n_gram()) << {
        'outg': ['review']}
    X = featurizer.fit_transform(X)
    assert X.shape == (25, 117)
    # columns ordering changed between 0.22 and 0.23
    assert 'review' in (X.columns[0], X.columns[-1])
    X = X.drop('review', axis=1)

    model = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
    X_test = featurizer.transform(test_reviews)
    X_test = X_test.drop('review', axis=1)
    scores = model.predict(X_test)
    # View the scores
    assert scores.shape == (10,)
def test_ngramfeaturizer(self):
    """Train a binary classifier on n-gram features of short reviews and
    check the featurized/predicted shapes."""
    train_reviews = pandas.DataFrame(data=dict(
        review=[
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it",
            "I hate it", "I love it"
        ],
        like=[
            True, False, True, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True
        ]))
    test_reviews = pandas.DataFrame(data=dict(review=[
        "This is great", "I hate it", "Love it", "Really like it",
        "I hate it", "I like it a lot", "I love it", "I do like it",
        "I really hate it", "I love it"]))

    y = train_reviews['like']
    X = train_reviews.loc[:, train_reviews.columns != 'like']
    textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
    X = textt.fit_transform(X)
    assert X.shape == (25, 116)

    mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
    X_test = textt.transform(test_reviews)
    # FIX: reuse the already-computed X_test instead of running
    # textt.transform(test_reviews) a second time.
    scores = mymodel.predict(X_test)
    # View the scores
    assert scores.shape == (10,)
    assert X_test.shape[0] == 10
# Example: featurize wiki-detox text into n-gram features while removing
# a custom list of stop "words" (punctuation tokens) first.
# NOTE(review): `path` is defined outside this snippet — presumably the
# wiki-detox dataset path; confirm against the surrounding example.
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Drop the listed tokens before n-gram extraction; everything in
# 'SentimentText' feeds the single 'features' output column.
xf = NGramFeaturizer(word_feature_extractor=Ngram(),
                     stop_words_remover=CustomStopWordsRemover(['!',
                                                                '$',
                                                                '%',
                                                                '&',
                                                                '\'',
                                                                '\'d']),
                     columns={'features': ['SentimentText']})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    Sentiment  ...  features.douchiest  features.award.
# 0          1  ...                 0.0              0.0
# 1          1  ...                 0.0              0.0
# 2          1  ...                 0.0              0.0
# 3          1  ...                 0.0              0.0
# 4          1  ...                 0.0              0.0
data=dict( review=[ "This is great", "I hate it", "Love it", "Really like it", "I hate it", "I like it a lot", "I love it", "I do like it", "I really hate it", "I love it"])) y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review' X = ngram.fit_transform(X) # view the transformed numerical values and column names # print(X.head()) mymodel = LogisticRegressionBinaryClassifier().fit(X, y) X_test = ngram.transform(test_reviews) scores = mymodel.predict(ngram.transform(test_reviews)) # view the scores # print(scores.head())