Python Pipeline.transform示例，nimbusml.Pipeline.transform Python示例

示例#1

0

显示文件

    def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        Check that a transform Pipeline and a predictor Pipeline can be
        merged into one model even when the transform increases the
        number of columns.
        """
        # Stage 1: fit a OneHotVectorizer on 'c0' and emit the transformed
        # training data as a binary data stream.
        featurizer = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        featurizer.fit(train_df)
        train_stream = featurizer.transform(train_df,
                                            as_binary_data_stream=True)

        # Stage 2: fit the regressor on the stream produced by stage 1.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        scorer = Pipeline([regressor], random_state=seed)
        scorer.fit(train_stream)

        # Reference scores: run the two pipelines back to back on the
        # test data.
        test_stream = featurizer.transform(test_df, as_binary_data_stream=True)
        chained_scores = scorer.predict(test_stream)

        # Candidate scores: merge both pipelines and score the raw test
        # data with the single combined model.
        merged = Pipeline.combine_models(featurizer, scorer)
        merged_scores = merged.predict(test_df)

        # The first two predictions must agree exactly.
        for row in (0, 1):
            self.assertEqual(chained_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])

示例#2

0

显示文件

    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        """
        Check that predict_proba on a combined (concatenator + predictor)
        model returns three columns for the 'induced' label of the
        'infert' dataset.
        """
        infert_stream = FileDataStream.read_csv(
            get_dataset('infert').as_filepath())

        # Featurizer: one-hot encode 'education' in place.
        encoder = OneHotVectorizer(columns={'education': 'education'})
        featurizer = Pipeline([encoder])
        featurizer.fit(infert_stream)
        featurized = featurizer.transform(infert_stream)

        # Trainer: reuse the fitted featurizer model, then train a
        # one-vs-rest classifier and keep its predictor model separately.
        classifier = OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                         feature=['education', 'age'],
                                         label='induced')
        trainer = Pipeline([DatasetTransformer(featurizer.model), classifier])
        trainer.fit(infert_stream, output_predictor_model=True)

        # Concatenator fit on the featurized data; it maps the
        # 'education.' prefix back to a column named 'education'.
        concatenator = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concatenator.fit(featurized)

        # Load the predictor-only model into a fresh Pipeline.
        predictor_only = Pipeline()
        predictor_only.load_model(trainer.predictor_model)

        merged = Pipeline.combine_models(concatenator, predictor_only)

        probabilities = merged.predict_proba(featurized)
        self.assertEqual(probabilities.shape[1], 3)

示例#3

0

显示文件

文件： test_pipeline_split_models.py 项目： yazici/NimbusML

    def test_notvectorized_output_predictor_model(self):
        """
        Check that the predictor model output by a combined
        (featurizers + learner) pipeline runs successfully on
        featurized data that contains no vector columns.
        """
        features = train_df.drop(['c0'], axis=1)

        # Stand-alone RangeFilter on 'c2'; its output is the featurized
        # frame the extracted predictor will score later.
        filter_only = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                               random_state=seed)
        filter_only.fit(features)
        filtered = filter_only.transform(features)

        # Combined model: same filter followed by the regressor. Ask fit()
        # to also output the predictor model on its own.
        end_to_end = Pipeline(
            [
                RangeFilter(min=0.0, max=4.5) << 'c2',
                OnlineGradientDescentRegressor(label='c2'),
            ],
            random_state=seed)
        end_to_end.fit(features, output_predictor_model=True)
        end_to_end_scores = end_to_end.predict(features)

        # Load only the predictor part and score the pre-filtered data.
        predictor_only = Pipeline()
        predictor_only.load_model(end_to_end.predictor_model)
        predictor_scores = predictor_only.predict(filtered)

        for row in (0, 1):
            self.assertEqual(end_to_end_scores.loc[row, 'Score'],
                             predictor_scores.loc[row, 'Score'])

示例#4

0

显示文件

    def test_fit_predictor_with_idv(self):
        """
        Check that a predictor Pipeline fit on a transformed binary data
        stream yields the same scores as fitting the transform and the
        predictor directly on DataFrames.
        """
        float_columns = {'c1': np.float64, 'c2': np.float64}

        training_frame = pd.DataFrame({
            'c0': ['a', 'b', 'a', 'b'],
            'c1': [1, 2, 3, 4],
            'c2': [2, 3, 4, 5]
        }).astype(float_columns)

        scoring_frame = pd.DataFrame({
            'c0': ['a', 'b', 'b'],
            'c1': [1.5, 2.3, 3.7],
            'c2': [2.2, 4.9, 2.7]
        }).astype(float_columns)

        # Transform pipeline, fit on the training frame; its output is
        # requested as a binary data stream.
        featurizer = Pipeline([OneHotVectorizer() << 'c0'])
        featurizer.fit(training_frame)
        training_stream = featurizer.transform(training_frame,
                                               as_binary_data_stream=True)

        # Predictor pipeline, fit directly on the transformed stream.
        stream_learner = OnlineGradientDescentRegressor(label='c2',
                                                        feature=['c0', 'c1'])
        scorer = Pipeline([stream_learner])
        scorer.fit(training_stream)

        # Score the test data by chaining transform and predictor.
        scoring_stream = featurizer.transform(scoring_frame,
                                              as_binary_data_stream=True)
        actual = scorer.predict(scoring_stream)

        # Expected result: the same steps without pipelines or streams.
        # The DataFrame path exposes the one-hot slots as separate
        # 'c0.a' / 'c0.b' columns, hence the different feature list.
        encoder = OneHotVectorizer() << 'c0'
        encoded_training = encoder.fit_transform(training_frame)
        frame_learner = OnlineGradientDescentRegressor(
            label='c2', feature=['c0.a', 'c0.b', 'c1'])
        frame_learner.fit(encoded_training)
        encoded_scoring = encoder.transform(scoring_frame)
        expected = frame_learner.predict(encoded_scoring)

        self.assertTrue(actual.loc[:, 'Score'].equals(expected))

示例#5

0

显示文件

    def test_combine_two_pipelines_created_from_model_files(self):
        """
        Check that two models can still be combined after they have
        been loaded from their model files into new Pipelines.
        """
        # Fit the OneHotVectorizer transform and emit the transformed
        # training data as a binary data stream.
        fitted_featurizer = Pipeline([OneHotVectorizer() << 'c0'],
                                     random_state=seed)
        fitted_featurizer.fit(train_df)
        train_stream = fitted_featurizer.transform(
            train_df, as_binary_data_stream=True)

        # Fit the regressor on the transformed training stream.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        fitted_scorer = Pipeline([regressor], random_state=seed)
        fitted_scorer.fit(train_stream)

        # Reference scores: chain the two fitted pipelines by hand.
        test_stream = fitted_featurizer.transform(
            test_df, as_binary_data_stream=True)
        chained_scores = fitted_scorer.predict(test_stream)

        # Build fresh Pipelines from the model files held by the
        # fitted ones.
        loaded_featurizer = Pipeline()
        loaded_featurizer.load_model(fitted_featurizer.model)
        loaded_scorer = Pipeline()
        loaded_scorer.load_model(fitted_scorer.model)

        # Merge the reloaded Pipelines and score the raw test data.
        merged = Pipeline.combine_models(loaded_featurizer, loaded_scorer)
        merged_scores = merged.predict(test_df)

        # Both routes must give the same first two predictions.
        for row in (0, 1):
            self.assertEqual(chained_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])

示例#6

0

显示文件

    def test_example_fails(self):
        """
        Exercise MutualInformationSelector constructor argument checks.

        The first configuration (explicit ``feature``/``label``) must fit
        and transform. The remaining three are expected to be rejected at
        construction time with a specific exception type and message.
        """
        like = [
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True
        ]
        x1 = [(5. if _ else 4.) for _ in like]
        x2 = [(-5. if _ else -4.) for _ in like]
        x1[0] = 50
        x2[1] = 50
        x2[2] = 50
        # Fix: column 'x1' previously received the x2 list (x1=x2), which
        # left the x1 list unused and made both feature columns identical.
        train_data = pandas.DataFrame(data=dict(like=like, x1=x1, x2=x2),
                                      dtype=numpy.float32)

        # Accepted configuration: features and label passed by keyword.
        transform_2 = MutualInformationSelector(slots_in_output=1,
                                                feature=['x1', 'x2'],
                                                label='like')
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        # assert transform_2.input == ['x1', 'x2']  # None
        # assert transform_2.output == ['Feature'] # None
        pipe = Pipeline([transform_2])
        pipe.fit(train_data)
        res = pipe.transform(train_data)
        assert res is not None

        # Rejected: 'feature2' is not a known parameter. The AssertionError
        # raised on success is not caught by the handler below, so a
        # silently accepted argument still fails the test.
        try:
            MutualInformationSelector(slots_in_output=1,
                                      feature2=['x1', 'x2'],
                                      label='like')
            raise AssertionError("feature2 not allowed")
        except NameError as e:
            assert "Parameter 'feature2' is not allowed" in str(e)

        # Rejected: several columns given as a plain list.
        try:
            MutualInformationSelector(slots_in_output=2,
                                      columns=['x1', 'x2'],
                                      label='like')
            raise AssertionError("only one output is allowed")
        except RuntimeError as e:
            assert "use a dictionary" in str(e)

        # Rejected: a dictionary that declares more than one output.
        try:
            MutualInformationSelector(slots_in_output=2,
                                      columns={
                                          'x1': 'x1',
                                          'x2': 'x2'
                                      },
                                      label='like')
            raise AssertionError("only one output is allowed")
        except RuntimeError as e:
            assert "Output should contain only one output not" in str(e)

示例#7

0

显示文件

    def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(
            self):
        """
        Check that two models fit on DataFrames can be combined when the
        output schema of the first equals the input schema of the second.
        """
        train_features = train_df.drop(['c0'], axis=1)

        # Fit a RangeFilter on 'c2' and apply it to the training data.
        range_filter = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                random_state=seed)
        range_filter.fit(train_features)
        filtered_train = range_filter.transform(train_features)

        # Fit the regressor on the filtered training data.
        scorer = Pipeline([OnlineGradientDescentRegressor(label='c2')],
                          random_state=seed)
        scorer.fit(filtered_train)

        # Reference scores: filter the test data, then predict.
        filtered_test = range_filter.transform(test_df)
        chained_scores = scorer.predict(filtered_test)

        test_features = test_df.drop(['c0'], axis=1)

        # Merge both pipelines and score the test data in one step.
        merged = Pipeline.combine_models(range_filter, scorer)
        merged_scores = merged.predict(test_features)

        # Both routes must give the same first two predictions.
        for row in (0, 1):
            self.assertEqual(chained_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])

示例#8

0

显示文件

文件： test_pipeline_get_schema.py 项目： yazici/NimbusML

    def test_get_schema_returns_correct_value_for_single_valued_columns(self):
        """
        Check that get_output_columns() reports exactly 'c1' and 'c2'
        after a RangeFilter pipeline is fit and applied to a frame from
        which 'c0' has been dropped.
        """
        df = train_df.drop(['c0'], axis=1)

        pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
        pipeline.fit(df)
        # The transformed frame itself is not inspected; the call is kept
        # so the schema is read after a transform has run, as before.
        pipeline.transform(df)

        schema = pipeline.get_output_columns()

        # assertIn reports the missing member and the container on
        # failure, unlike assertTrue(x in y).
        self.assertIn('c1', schema)
        self.assertIn('c2', schema)

        self.assertEqual(len(schema), 2)

示例#9

0

显示文件

文件： test_syntax_onehotvectorizer.py 项目： zyw400/NimbusML-1

 def test_syntax1_passing(self):
     """
     Fit two dictionary-mapped OneHotVectorizers followed by a
     LightGbmClassifier, then check the shape of the transformed data.
     """
     _, X, y = self.get_simple_df()

     # '<<' with a dict maps an output column name to its source column.
     education_encoder = OneHotVectorizer() << {'f1': 'education2'}
     workclass_encoder = OneHotVectorizer(max_num_terms=2) << {
         'f3': 'workclass'
     }
     learner = LightGbmClassifier(min_data_per_leaf=1) << ['f1', 'f3']

     pipeline = Pipeline([education_encoder, workclass_encoder, learner])
     pipeline.fit(X, y)
     transformed = pipeline.transform(X)
     assert transformed.shape == (5, 16)

示例#10

0

显示文件

 def test_syntax2_passing(self):
     """
     Same as the dictionary syntax test, but the first source column is
     given as a one-element list and the learner is a
     FastLinearBinaryClassifier.
     """
     _, X, y = self.get_simple_df()

     # The source column for 'f1' is wrapped in a list here.
     education_encoder = OneHotVectorizer() << {'f1': ['education']}
     workclass_encoder = OneHotVectorizer(max_num_terms=2) << {
         'f3': 'workclass'
     }
     learner = FastLinearBinaryClassifier() << ['f1', 'f3']

     pipeline = Pipeline([education_encoder, workclass_encoder, learner])
     pipeline.fit(X, y)
     transformed = pipeline.transform(X)
     assert transformed.shape == (5, 16)

示例#11

0

显示文件

    def test_two_pipelines_created_using_dataframes_can_not_be_combined_when_the_schemas_are_different(
            self):
        """
        This test verifies that two models created using DataFrames
        can not be combined if the output schema of the first is
        different than the input schema of the second.
        NOTE: This issue only happens with Pipelines created and fit
        using dataframes. Pipelines created and fit using IDV binary
        streams do not have this issue.
        """
        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create and fit an OnlineGradientDescentRegressor using
        # the transformed training data from the previous step.
        predictor_pipeline = Pipeline(
            [OnlineGradientDescentRegressor(label='c2')], random_state=seed)
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using the transform
        # and predictor defined previously. The scores are not compared
        # to anything; the call only has to complete without raising.
        df = transform_pipeline.transform(test_df)
        predictor_pipeline.predict(df)

        # Combining is expected to raise because the output schema of the
        # transform differs from the input schema of the predictor.
        # assertRaises replaces the try/except/else/self.fail() idiom and
        # reports "Exception not raised" when nothing is thrown.
        with self.assertRaises(Exception):
            Pipeline.combine_models(transform_pipeline, predictor_pipeline)

示例#12

0

显示文件

    def test_fit_transform_with_idv(self):
        """
        Check that a Pipeline can be fit on, and can transform, a binary
        data stream produced by another Pipeline: dropping 'case' and
        'row_num' must remove exactly those two columns from the schema.
        """
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        # Featurize 'education' and keep the result as a binary stream.
        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        # Both columns must be present before they are dropped.
        # assertIn/assertNotIn report the member and the container on
        # failure, unlike assertTrue(x in y).
        schema = featurized_data.schema
        num_columns = len(schema)
        self.assertIn('case', schema)
        self.assertIn('row_num', schema)

        # Fit and apply the ColumnDropper directly on the stream.
        pipeline = Pipeline([ColumnDropper() << ['case', 'row_num']])
        pipeline.fit(featurized_data)
        result = pipeline.transform(featurized_data,
                                    as_binary_data_stream=True)

        schema = result.schema
        self.assertEqual(len(schema), num_columns - 2)
        self.assertNotIn('case', schema)
        self.assertNotIn('row_num', schema)

示例#13

0

显示文件

    def test_schema_with_vectorized_column(self):
        """
        Compare the two schema views of a featurized binary data stream.

        ``schema`` keeps the one-hot encoded 'education' as a single
        vector column, while ``get_dataframe_schema()`` expands it into
        one column per slot.
        """
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5
        # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9
        # col=pooled.stratum:I8:10 quote+
        schema = featurized_data.schema

        self.assertEqual(len(schema), 9)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        # 'education' is one vector column spanning three positions.
        self.assertEqual(schema['education'].Type, 'R4')
        self.assertEqual(schema['education'].Name, 'education')
        self.assertEqual(len(schema['education'].Pos), 3)
        self.assertEqual(schema['education'].IsVector, True)

        # The per-slot names must not appear in the stream schema.
        # assertIn/assertNotIn report the member and the container on
        # failure, unlike assertTrue(x in y).
        self.assertNotIn('education.0-5yrs', schema)
        self.assertNotIn('education.6-11yrs', schema)
        self.assertNotIn('education.12+yrs', schema)

        # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2
        # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6
        # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10
        # quote+ header=+
        schema = featurized_data.get_dataframe_schema()

        self.assertEqual(len(schema), 11)
        self.assertEqual(schema['age'].Type, 'I8')
        self.assertEqual(schema['age'].Name, 'age')
        self.assertEqual(schema['age'].IsVector, False)

        # In the DataFrame view the vector is replaced by its slots.
        self.assertNotIn('education', schema)
        self.assertIn('education.0-5yrs', schema)
        self.assertIn('education.6-11yrs', schema)
        self.assertIn('education.12+yrs', schema)

        self.assertEqual(schema['education.0-5yrs'].Type, 'R4')
        self.assertEqual(schema['education.0-5yrs'].Name, 'education.0-5yrs')
        self.assertEqual(schema['education.0-5yrs'].IsVector, False)

示例#14

0

显示文件

文件： test_pipeline_split_models.py 项目： yazici/NimbusML

    def test_vectorized_with_prefixconcat_output_predictor_model(self):
        """
        Show how to prepend a ColumnConcatenator transform to the
        predictor model output by a combined (featurizers + learner)
        pipeline so that it runs on featurized data with vectors.
        """
        # Fit a OneHotVectorizer on 'c0' and featurize the training data.
        featurizer = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        featurizer.fit(train_df)
        featurized = featurizer.transform(train_df)

        # Fit and score the end-to-end model; also have fit() output the
        # predictor model separately.
        end_to_end = Pipeline(
            [
                OneHotVectorizer() << 'c0',
                OnlineGradientDescentRegressor(label='c2'),
            ],
            random_state=seed)
        end_to_end.fit(train_df, output_predictor_model=True)
        end_to_end_scores = end_to_end.predict(train_df)

        # Train the concatenator on the featurized data; it maps the
        # 'c0.' prefix to a column named 'c0'.
        concatenator = Pipeline(
            [PrefixColumnConcatenator(columns={'c0': 'c0.'})])
        concatenator.fit(featurized)

        # Load only the predictor part into a fresh Pipeline.
        predictor_only = Pipeline()
        predictor_only.load_model(end_to_end.predictor_model)

        # Concatenator + predictor, scored on the featurized data.
        merged = Pipeline.combine_models(concatenator, predictor_only)
        merged_scores = merged.predict(featurized)

        for row in (0, 1):
            self.assertEqual(end_to_end_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])

示例#15

0

显示文件

文件： test_pipeline_split_models.py 项目： yazici/NimbusML

    def test_vectorized_output_predictor_model(self):
        """
        This test shows that the predictor model output by a
        combined (with featurizers) pipeline fails to run
        on featurized data with vectors.
        """

        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create and fit a combined model and output the predictor model.
        combined_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2')
        ],
                                     random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        # The scores are not compared to anything; the call only has to
        # complete without raising on the raw training data.
        combined_pipeline.predict(train_df)

        # Load predictor pipeline and score featurized data
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        # This does not work because the input schema doesn't
        # match. Input schema looks for vector 'c0' with slots 'a,b'
        # but featurized data has only columns 'c0.a' and 'c0.b'.
        # assertRaises replaces the try/except/else/self.fail() idiom and
        # reports "Exception not raised" when nothing is thrown.
        with self.assertRaises(Exception):
            predictor_pipeline.predict(df)

示例#16

0

显示文件

    'c2': np.float32
})

# Two-row test frame with columns 'c1' and 'c2', both cast to float32.
test_data = {'c1': [2.5, 30.5], 'c2': [1, 1]}
test_df = pd.DataFrame(test_data).astype({'c1': np.float32, 'c2': np.float32})

# Fit a MinMaxScaler Pipeline
# (train_df is defined earlier in the script, outside this excerpt.)
r1 = Pipeline([MinMaxScaler()])
r1.fit(train_df)

# Export the pipeline to ONNX
# The model is written to a temporary file with the '.onnx' suffix.
onnx_path = get_tmp_file('.onnx')
r1.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable')

# Perform the transform using the standard ML.Net backend
result_standard = r1.transform(test_df)
print(result_standard)
#          c1        c2
# 0  0.025025  0.000998
# 1  0.305305  0.000998

# Perform the transform using the ONNX backend.
# Note, the extra columns and column name differences
# is a known issue with the ML.Net backend.
# The scaled values appear in the 'c12.0' / 'c22.0' columns of the
# sample output below and match the ML.Net output shown earlier.
onnxrunner = OnnxRunner(model_file=onnx_path)
result_onnx = onnxrunner.fit_transform(test_df)
print(result_onnx)
#      c1   c2     c12.0     c22.0
# 0   2.5  1.0  0.025025  0.000998
# 1  30.5  1.0  0.305305  0.000998

示例#17

0

显示文件

    def test_example_success(self):
        """
        Exercise the accepted ways of configuring MutualInformationSelector.

        Covers default construction, ``slots_in_output``, role mapping
        through ``<<`` with ``Role.Feature`` or a custom output name, and
        keyword ``columns``/``label``. Each configuration must fit and
        transform without returning None.
        """
        like = [
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True
        ]
        x1 = [(5. if _ else 4.) for _ in like]
        x2 = [(-5. if _ else -4.) for _ in like]
        x1[0] = 50
        x2[1] = 50
        x2[2] = 50
        # Fix: column 'x1' previously received the x2 list (x1=x2), which
        # left the x1 list unused and made both feature columns identical.
        train_data = pandas.DataFrame(data=dict(like=like, x1=x1, x2=x2),
                                      dtype=numpy.float32)

        # Default construction, features and label passed as X / y.
        X = train_data.drop('like', axis=1)
        y = train_data[['like']]
        transform_2 = MutualInformationSelector()
        exp = Pipeline([transform_2])
        res = exp.fit_transform(X, y)
        assert res is not None

        # Explicit number of output slots.
        transform_2 = MutualInformationSelector(slots_in_output=2)
        pipe = Pipeline([transform_2])
        res = pipe.fit_transform(X, y)
        assert res is not None

        # Roles assigned with '<<': Role.Feature names the inputs and the
        # output is reported as 'Feature'.
        transform_2 = MutualInformationSelector() << {
            Role.Feature: ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['Feature']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # A non-role key ("zoo") becomes the output column name.
        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # Same mapping with a single input column.
        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # Keyword form: a single-column list is used as both input and
        # output name.
        transform_2 = MutualInformationSelector(slots_in_output=1,
                                                columns=['x1'],
                                                label='like')
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['x1']
        pipe = Pipeline([transform_2])
        pipe.fit(train_data)
        res = pipe.transform(train_data)
        assert res is not None

示例#18

0

显示文件

文件： test_pipeline_transform_method.py 项目： yazici/NimbusML

 def test_transform_only_pipeline_transform_method(self):
     """
     Check that transform() on a pipeline holding only an
     NGramFeaturizer yields the 'SentimentText.==rude==' column.
     """
     featurizer = NGramFeaturizer(char_feature_extractor=None)
     pipeline = Pipeline([featurizer << 'SentimentText'])
     pipeline.fit(X)
     transformed = pipeline.transform(X)
     assert 'SentimentText.==rude==' in transformed.columns

示例#19

0

显示文件

文件： NGramExtractor_df.py 项目： yazici/NimbusML

    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review'])
])
# Fit the featurization pipeline on X and replace X with its output.
# (pipeline, X, y and test_reviews are defined earlier in the script,
# outside this excerpt.)
X = pipeline.fit_transform(X)

# Sample output: one 'ngrams.<a>|<b>|<c>' column per extracted n-gram.
print(X.head())
#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0

# Train the classifier on the featurized data.
model = LogisticRegressionBinaryClassifier().fit(X, y)

# Reuse the already-fitted pipeline (transform, not fit_transform) on
# the test reviews before predicting.
X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)

print(result)
# 0     True
# 1    False
# 2     True
# 3     True
# 4    False
# 5     True
# 6     True
# 7     True
# 8    False
# 9     True