def test_combining_two_dataset_transformers(self):
        """
        Chain two separately fitted transform models through
        DatasetTransformer steps and check that the predictions match
        those of a single pipeline containing the same transforms.
        """
        upper_bound = 4.5

        # Baseline: range filter, one-hot encoding and regressor
        # fitted together in one pipeline.
        baseline = Pipeline([
            RangeFilter(min=0.0, max=upper_bound) << 'c2',
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        baseline.fit(train_df)
        expected = baseline.predict(test_df)

        # Fit each transform in its own pipeline to obtain its model file.
        filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
        filter_only.fit(train_df)

        one_hot_only = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        one_hot_only.fit(train_df)

        # Reference both fitted models, in order, ahead of the regressor.
        chained_steps = [
            DatasetTransformer(transform_model=filter_only.model),
            DatasetTransformer(transform_model=one_hot_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ]
        chained = Pipeline(chained_steps, random_state=seed)
        chained.fit(train_df)

        # The transform model files are deleted before predicting;
        # presumably to show the fitted pipeline no longer needs them.
        os.remove(filter_only.model)
        os.remove(one_hot_only.model)

        actual = chained.predict(test_df)

        self.assertTrue(expected.equals(actual))
    def test_same_schema_with_dataframe_input(self):
        """
        Reuse a fitted RangeFilter model through DatasetTransformer on
        DataFrame input (column 'c0' dropped) and check the predictions
        match those of an equivalent single pipeline.
        """
        train_no_c0 = train_df.drop(['c0'], axis=1)
        test_no_c0 = test_df.drop(['c0'], axis=1)

        upper_bound = 4.5

        # Baseline: filter and regressor fitted together.
        baseline = Pipeline([
            RangeFilter(min=0.0, max=upper_bound) << 'c2',
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        baseline.fit(train_no_c0)
        expected = baseline.predict(test_no_c0)

        # Fit the filter on its own to obtain its model file.
        filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
        filter_only.fit(train_no_c0)

        # Plug the fitted filter model in front of a new regressor.
        chained_steps = [
            DatasetTransformer(transform_model=filter_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ]
        chained = Pipeline(chained_steps, random_state=seed)
        chained.fit(train_no_c0)

        # The transform model file is deleted before predicting.
        os.remove(filter_only.model)

        actual = chained.predict(test_no_c0)

        self.assertTrue(expected.equals(actual))
    def test_different_schema_with_dataframe_input(self):
        """
        Reuse a fitted OneHotVectorizer model through DatasetTransformer
        on DataFrame input and check the predictions match those of an
        equivalent single pipeline.
        """
        # Baseline: one-hot encoding and regressor fitted together.
        baseline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        baseline.fit(train_df)
        expected = baseline.predict(test_df)

        # Fit the one-hot transform on its own to obtain its model file.
        one_hot_only = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        one_hot_only.fit(train_df)

        # Plug the fitted transform model in front of a new regressor.
        chained_steps = [
            DatasetTransformer(transform_model=one_hot_only.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ]
        chained = Pipeline(chained_steps, random_state=seed)
        chained.fit(train_df)

        # The transform model file is deleted before predicting.
        os.remove(one_hot_only.model)

        actual = chained.predict(test_df)

        self.assertTrue(expected.equals(actual))
示例#4
0
    def test_passing_in_a_single_predictor_returns_new_pipeline(self):
        """
        Pipeline.combine_models given a single fitted predictor must
        return a Pipeline whose first two scores equal the predictor's
        own predictions.
        """
        train_dropped_df = train_df.drop(['c0'], axis=1)
        test_dropped_df = test_df.drop(['c0'], axis=1)

        # Reference predictions straight from the fitted predictor.
        predictor = OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        predictor.fit(train_dropped_df)
        result_1 = predictor.predict(test_dropped_df)

        combined_pipeline = Pipeline.combine_models(predictor)
        result_2 = combined_pipeline.predict(test_dropped_df)

        # The predictor result is indexed directly while the pipeline
        # result exposes the values in its 'Score' column.
        self.assertEqual(result_1[0], result_2.loc[0, 'Score'])
        self.assertEqual(result_1[1], result_2.loc[1, 'Score'])
        # assertIsInstance reports the offending type when it fails.
        self.assertIsInstance(combined_pipeline, Pipeline)
示例#5
0
    def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
            self):
        """
        Check that a transform pipeline and a predictor pipeline can be
        merged into one model even when the transform increases the
        number of columns.
        """
        # Fit the one-hot transform, then featurize the training
        # data as a binary data stream.
        one_hot_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        one_hot_pipeline.fit(train_df)
        stream = one_hot_pipeline.transform(train_df,
                                            as_binary_data_stream=True)

        # Train the regressor on the featurized training stream.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        regressor_pipeline = Pipeline([regressor], random_state=seed)
        regressor_pipeline.fit(stream)

        # Reference scores: run the two pipelines back to back
        # on the test data.
        stream = one_hot_pipeline.transform(test_df,
                                            as_binary_data_stream=True)
        stepwise_scores = regressor_pipeline.predict(stream)

        # Scores from the single merged pipeline fed with raw test data.
        merged = Pipeline.combine_models(one_hot_pipeline, regressor_pipeline)
        merged_scores = merged.predict(test_df)

        # The first two scores must be identical either way.
        for row in (0, 1):
            self.assertEqual(stepwise_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])
示例#6
0
    def test_ensemble_supports_get_fit_info(self):
        """
        Check the fit info reported for a VotingRegressor placed as the
        last step of a pipeline that also contains transforms.
        """
        df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        # Every estimator receives the same column roles; the label
        # column 'new_y' is produced by the scaler step below.
        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
        r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
        r3 = LightGbmRegressor(normalize="Yes") << col_info

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])

        info = pipeline.get_fit_info(df)

        # The last node of the first element describes the ensemble.
        last_info_node = info[0][-1]
        self.assertEqual(last_info_node['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(last_info_node['name'], 'VotingRegressor')
        # assertIsInstance reports the offending type when it fails.
        self.assertIsInstance(last_info_node['operator'], VotingRegressor)
        self.assertEqual(last_info_node['outputs'], ['Score'])
        self.assertEqual(last_info_node['schema_after'], ['Score'])
        self.assertEqual(last_info_node['type'], 'regressor')
示例#7
0
    def test_combine_transform_and_predictor(self):
        """
        Combine a standalone fitted transform with a standalone fitted
        predictor and compare the result against applying them one
        after the other.
        """
        one_hot = OneHotVectorizer() << 'c0'
        stream = one_hot.fit_transform(train_df, as_binary_data_stream=True)

        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        regressor.fit(stream)

        # Reference scores: transform the test data, then predict.
        stream = one_hot.transform(test_df, as_binary_data_stream=True)
        stepwise_scores = regressor.predict(stream)

        merged = Pipeline.combine_models(one_hot, regressor)
        merged_result = merged.predict(test_df)

        # The first two scores must be identical either way.
        for row in (0, 1):
            self.assertEqual(stepwise_scores[row],
                             merged_result.loc[row, 'Score'])
    def test_notvectorized_output_predictor_model(self):
        """
        Check that the predictor model output by a pipeline that also
        contains a featurizer can score already featurized data when
        that data contains no vector columns.
        """
        raw_df = train_df.drop(['c0'], axis=1)

        # Fit a standalone RangeFilter pipeline and use it to
        # produce the featurized version of the data.
        filter_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                   random_state=seed)
        filter_pipeline.fit(raw_df)
        featurized_df = filter_pipeline.transform(raw_df)

        # Fit filter + regressor together, asking for the predictor
        # model to be output as well.
        full_steps = [
            RangeFilter(min=0.0, max=4.5) << 'c2',
            OnlineGradientDescentRegressor(label='c2')
        ]
        full_pipeline = Pipeline(full_steps, random_state=seed)
        full_pipeline.fit(raw_df, output_predictor_model=True)
        full_scores = full_pipeline.predict(raw_df)

        # Load only the predictor model and score the featurized data.
        predictor_only = Pipeline()
        predictor_only.load_model(full_pipeline.predictor_model)
        predictor_scores = predictor_only.predict(featurized_df)

        for row in (0, 1):
            self.assertEqual(full_scores.loc[row, 'Score'],
                             predictor_scores.loc[row, 'Score'])
示例#9
0
    def test_ensemble_supports_user_defined_transforms(self):
        """
        Check that a VotingRegressor preceded by a RangeFilter scores
        the filtered rows with the average of its estimators' scores.
        """
        # Extend a copy of the test data with two extra rows.
        # pd.concat replaces DataFrame.append, which was removed in
        # pandas 2.0; as before, the original index labels are kept.
        test2_df = pd.concat([
            test_df.copy(deep=True),
            pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        ])

        # Reference predictions from each estimator fitted on its own.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r1.fit(train_df)
        result1 = r1.predict(test2_df)

        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r2.fit(train_df)
        result2 = r2.predict(test2_df)

        r3 = LightGbmRegressor(**lgbmArgs)
        r3.fit(train_df)
        result3 = r3.predict(test2_df)

        # Fresh, unfitted estimators for the ensemble.
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        r3 = LightGbmRegressor(**lgbmArgs)

        pipeline = Pipeline([
            RangeFilter(min=0, max=10, columns='c1'),
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])
        pipeline.fit(train_df)
        result4 = pipeline.predict(test2_df)

        # Three scored rows are expected; presumably the appended row
        # with c1 == 11 is the one removed by the filter.
        self.assertEqual(len(result4), 3)

        average1 = (result1[0] + result2[0] + result3[0]) / 3
        average2 = (result1[1] + result2[1] + result3[1]) / 3
        average3 = (result1[2] + result2[2] + result3[2]) / 3
        self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
        self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)
示例#10
0
 def test_ensemble_rejects_estimators_with_incorrect_type(self):
     """
     VotingRegressor should only work with regressors: constructing
     one with a classifier among its estimators must raise.
     """
     r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
     r2 = OnlineGradientDescentRegressor(**ogdArgs)
     r3 = LogisticRegressionClassifier()
     # The exception type is not pinned down by this test, so any
     # Exception is accepted; the test fails if none is raised.
     with self.assertRaises(Exception):
         VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
示例#11
0
    def test_get_schema_does_not_work_when_predictor_is_part_of_model(self):
        """
        get_output_columns must raise when called on a fitted
        pipeline that contains a predictor.
        """
        df = train_df.drop(['c0'], axis=1)

        pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')])
        pipeline.fit(df)

        # The exception type is not pinned down by this test, so any
        # Exception is accepted; the test fails if none is raised.
        with self.assertRaises(Exception):
            pipeline.get_output_columns()
示例#12
0
    def test_ensemble_supports_output_predictor_model(self):
        """
        Check that a pipeline ending in a VotingRegressor can output a
        separate predictor model and that this model scores data
        consistently with the full pipeline.
        """
        # Extend a copy of the test data with two extra rows.
        # pd.concat replaces DataFrame.append, which was removed in
        # pandas 2.0; ignore_index=True renumbers the rows as before.
        test2_df = pd.concat([
            test_df.copy(deep=True),
            pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
        ], ignore_index=True)
        test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

        # Create a ground truth pipeline
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df)
        result_1 = combined_pipeline.predict(test2_df)

        # Create a duplicate pipeline but also request a predictor model
        r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
        r2 = OnlineGradientDescentRegressor(**ogdArgs)
        combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c1',
                                      VotingRegressor(estimators=[r1, r2], combiner='Average')])
        combined_pipeline.fit(train_df, output_predictor_model=True)
        result_2 = combined_pipeline.predict(test2_df)

        # Create a predictor model only pipeline
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)
        result_3 = predictor_pipeline.predict(test2_df)

        # Verify the first rows are equal
        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

        # Verify the second rows are equal
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
        self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

        # Verify the number of rows. The predictor-only model is
        # expected to score more rows than the full pipelines,
        # presumably because it does not apply the RangeFilter.
        self.assertEqual(len(result_1), 2)
        self.assertEqual(len(result_2), 2)
        self.assertEqual(len(result_3), 4)
示例#13
0
    def test_get_fit_info(self):
        """
        Check that get_fit_info reports a DatasetTransformer node for
        a pipeline that starts with a DatasetTransformer step.
        """
        # Fit the filter on its own to obtain its model file.
        transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
        transform_pipeline.fit(train_df)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_df)

        info = combined_pipeline.get_fit_info(train_df)

        # assertEqual shows both values when the names differ.
        self.assertEqual(info[0][1]['name'], 'DatasetTransformer')
示例#14
0
    def test_passing_in_a_single_pipeline_returns_new_pipeline(self):
        """
        Pipeline.combine_models given a single fitted Pipeline must
        return a Pipeline whose first two scores equal those of the
        pipeline that was passed in.
        """
        pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ])
        pipeline.fit(train_df)
        result_1 = pipeline.predict(test_df)

        combined_pipeline = Pipeline.combine_models(pipeline)
        result_2 = combined_pipeline.predict(test_df)

        self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
        self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
        # assertIsInstance reports the offending type when it fails.
        self.assertIsInstance(combined_pipeline, Pipeline)
示例#15
0
    def test_ensemble_supports_cv_with_user_defined_transforms(self):
        """
        Run cross validation on the airquality dataset for a single
        LightGbmRegressor and for a three-estimator VotingRegressor,
        both preceded by the same transforms, and check that the
        ensemble's average L2 is lower for each split_start setting.
        """
        path = get_dataset("airquality").as_filepath()
        schema = DataSchema.read_schema(path)
        data = FileDataStream(path, schema)

        ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
        handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
        lgbm_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ols_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'normalize': 'Yes'
        }
        ogd_args = {
            'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            'label': 'Wind',
            'shuffle': False,
            'normalize': 'Yes'
        }

        for split_start in ['before_transforms', 'after_transforms']:
            # Baseline: a single LightGbmRegressor after the transforms.
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                LightGbmRegressor(**lgbm_args)
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            r1 = OrdinaryLeastSquaresRegressor(**ols_args)
            r2 = OnlineGradientDescentRegressor(**ogd_args)
            r3 = LightGbmRegressor(**lgbm_args)

            # A new stream is opened for the ensemble run; it is also
            # the stream used by the baseline of the next iteration.
            data = FileDataStream(path, schema)
            pipeline_steps = [
                Indicator() << ind_args,
                Handler(replace_with='Mean') << handler_args,
                VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
            ]

            cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
            l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

            # assertLess shows both metric values when it fails.
            self.assertLess(l2_avg_ensemble, l2_avg_lgbm)
示例#16
0
    def test_different_schema_with_filedatastream_input(self):
        """
        Reuse a fitted OneHotVectorizer model through DatasetTransformer
        on FileDataStream input and check the predictions match those
        of an equivalent single pipeline.
        """
        # Write the data to CSV files in the working directory. Their
        # removal is registered right away so the files are deleted
        # even when a later step or assertion fails.
        train_filename = "train-data.csv"
        train_df.to_csv(train_filename, index=False, header=True)
        self.addCleanup(os.remove, train_filename)
        train_data_stream = FileDataStream.read_csv(train_filename, sep=',', header=True)

        test_filename = "test-data.csv"
        test_df.to_csv(test_filename, index=False, header=True)
        self.addCleanup(os.remove, test_filename)
        test_data_stream = FileDataStream.read_csv(test_filename, sep=',', header=True)

        # Create reference pipeline
        std_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)

        std_pipeline.fit(train_data_stream)
        result_1 = std_pipeline.predict(test_data_stream)

        # Create combined pipeline
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
        transform_pipeline.fit(train_data_stream)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_data_stream)

        # The transform model file is deleted before predicting;
        # presumably to show the fitted pipeline no longer needs it.
        os.remove(transform_pipeline.model)

        result_2 = combined_pipeline.predict(test_data_stream)

        self.assertTrue(result_1.equals(result_2))
示例#17
0
    def test_ensemble_with_average_and_median_combiner(self):
        """
        Check that a VotingRegressor with the 'Average' and 'Median'
        combiners reproduces the average and the median of the scores
        of its estimators fitted individually.
        """
        estimator_configs = (
            (OrdinaryLeastSquaresRegressor, olsrArgs),
            (OnlineGradientDescentRegressor, ogdArgs),
            (LightGbmRegressor, lgbmArgs),
        )

        # Reference predictions from each estimator fitted on its own.
        individual_scores = []
        for estimator_cls, estimator_args in estimator_configs:
            estimator = estimator_cls(**estimator_args)
            estimator.fit(train_df)
            individual_scores.append(estimator.predict(test_df))
        ols_scores, ogd_scores, lgbm_scores = individual_scores

        # Average combiner, built from fresh unfitted estimators.
        members = [cls(**args) for cls, args in estimator_configs]
        ensemble = Pipeline(
            [VotingRegressor(estimators=members, combiner='Average')])
        ensemble.fit(train_df)
        ensemble_scores = ensemble.predict(test_df)

        expected_first = (ols_scores[0] + ogd_scores[0] + lgbm_scores[0]) / 3
        expected_second = (ols_scores[1] + ogd_scores[1] + lgbm_scores[1]) / 3
        self.assertAlmostEqual(expected_first,
                               ensemble_scores.loc[0, 'Score'], places=5)
        self.assertAlmostEqual(expected_second,
                               ensemble_scores.loc[1, 'Score'], places=5)

        # Median combiner, again built from fresh unfitted estimators.
        members = [cls(**args) for cls, args in estimator_configs]
        ensemble = Pipeline(
            [VotingRegressor(estimators=members, combiner='Median')])
        ensemble.fit(train_df)
        ensemble_scores = ensemble.predict(test_df)

        # With three values the median is the middle one after sorting.
        expected_first = sorted(
            [ols_scores.loc[0], ogd_scores.loc[0], lgbm_scores.loc[0]])[1]
        expected_second = sorted(
            [ols_scores.loc[1], ogd_scores.loc[1], lgbm_scores.loc[1]])[1]

        self.assertEqual(expected_first, ensemble_scores.loc[0, 'Score'])
        self.assertEqual(expected_second, ensemble_scores.loc[1, 'Score'])
示例#18
0
    def test_combine_transform_and_pipeline(self):
        """
        Combine a standalone fitted transform with a fitted pipeline
        and compare the result against applying them one after the
        other.
        """
        range_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
        stream = range_filter.fit_transform(train_df,
                                            as_binary_data_stream=True)

        downstream_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
        ]
        downstream = Pipeline(downstream_steps)
        downstream.fit(stream)

        # Reference result: filter the test data, then predict.
        stream = range_filter.transform(test_df, as_binary_data_stream=True)
        stepwise_result = downstream.predict(stream)

        merged = Pipeline.combine_models(range_filter, downstream)
        merged_result = merged.predict(test_df)

        self.assertTrue(stepwise_result.equals(merged_result))
示例#19
0
    def test_fit_predictor_with_idv(self):
        """
        Fit a predictor pipeline on a binary data stream produced by a
        transform pipeline and compare its scores with a predictor
        fitted on the equivalent transformed DataFrame.
        """
        float_columns = {'c1': np.float64, 'c2': np.float64}

        # Small local data sets used only by this test.
        local_train = pd.DataFrame({
            'c0': ['a', 'b', 'a', 'b'],
            'c1': [1, 2, 3, 4],
            'c2': [2, 3, 4, 5]
        }).astype(float_columns)

        local_test = pd.DataFrame({
            'c0': ['a', 'b', 'b'],
            'c1': [1.5, 2.3, 3.7],
            'c2': [2.2, 4.9, 2.7]
        }).astype(float_columns)

        # Fit the transform pipeline and featurize the training
        # data as a binary data stream.
        one_hot_pipeline = Pipeline([OneHotVectorizer() << 'c0'])
        one_hot_pipeline.fit(local_train)
        stream = one_hot_pipeline.transform(local_train,
                                            as_binary_data_stream=True)

        # Fit a predictor pipeline on the featurized stream.
        stream_regressor = OnlineGradientDescentRegressor(
            label='c2', feature=['c0', 'c1'])
        regressor_pipeline = Pipeline([stream_regressor])
        regressor_pipeline.fit(stream)

        # Score the featurized test stream.
        stream = one_hot_pipeline.transform(local_test,
                                            as_binary_data_stream=True)
        stream_scores = regressor_pipeline.predict(stream)

        # Expected scores: the same steps on DataFrames, where the
        # one-hot columns are addressed as 'c0.a' and 'c0.b'.
        one_hot = OneHotVectorizer() << 'c0'
        featurized = one_hot.fit_transform(local_train)
        frame_regressor = OnlineGradientDescentRegressor(
            label='c2', feature=['c0.a', 'c0.b', 'c1'])
        frame_regressor.fit(featurized)
        featurized = one_hot.transform(local_test)
        expected_scores = frame_regressor.predict(featurized)

        self.assertTrue(stream_scores.loc[:, 'Score'].equals(expected_scores))
示例#20
0
    def test_combine_two_pipelines_created_from_model_files(self):
        """
        Check that two models can still be combined after each has
        been loaded from its model file into a new Pipeline.
        """
        # Fit the one-hot transform, then featurize the training
        # data as a binary data stream.
        fitted_transform = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        fitted_transform.fit(train_df)
        stream = fitted_transform.transform(train_df,
                                            as_binary_data_stream=True)

        # Train the regressor on the featurized training stream.
        regressor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        fitted_predictor = Pipeline([regressor], random_state=seed)
        fitted_predictor.fit(stream)

        # Reference scores: run the two fitted pipelines back to
        # back on the test data.
        stream = fitted_transform.transform(test_df,
                                            as_binary_data_stream=True)
        stepwise_scores = fitted_predictor.predict(stream)

        # Build new Pipelines from the model files of the fitted ones.
        loaded_transform = Pipeline()
        loaded_transform.load_model(fitted_transform.model)
        loaded_predictor = Pipeline()
        loaded_predictor.load_model(fitted_predictor.model)

        # Merge the loaded Pipelines and score the raw test data.
        merged = Pipeline.combine_models(loaded_transform, loaded_predictor)
        merged_scores = merged.predict(test_df)

        # The first two scores must be identical either way.
        for row in (0, 1):
            self.assertEqual(stepwise_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])
示例#21
0
    def test_data_role_info_has_been_removed_from_estimators(self):
        """
        After fitting, neither the ensemble nor any of its estimators
        has a 'feature_column_name' attribute, while every estimator
        has a 'feature_column_name_' attribute.
        """
        members = [
            OrdinaryLeastSquaresRegressor(**olsrArgs),
            OnlineGradientDescentRegressor(**ogdArgs),
            LightGbmRegressor(**lgbmArgs),
        ]
        ensemble = VotingRegressor(estimators=members, combiner='Average')

        fitted = Pipeline([ensemble])
        fitted.fit(train_df)

        self.assertTrue(not hasattr(ensemble, 'feature_column_name'))

        # Same pair of checks for each of the three estimators, in order.
        for position in range(3):
            member = ensemble.estimators[position]
            self.assertTrue(not hasattr(member, 'feature_column_name'))
            self.assertTrue(hasattr(member, 'feature_column_name_'))
示例#22
0
    def test_syntax10_weights_fail(self):
        """
        Fitting a pipeline ending in OnlineGradientDescentRegressor
        with a weight column must raise a RuntimeError stating that
        the 'Weight' role is not supported.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 weights=[1., 1., 1., 2., 1.],
                 y=[1.1, 2.2, 1.24, 3.4, 3.4]))
        X = df.drop(['y', 'weights'], axis=1)
        y = df['y']
        weights = df['weights']

        exp = Pipeline([
            OneHotVectorizer() << ['workclass', 'education'],
            OnlineGradientDescentRegressor()
        ])
        # unittest assertions are used instead of bare asserts, which
        # are stripped under "python -O" and would let the test pass
        # without checking anything.
        with self.assertRaises(RuntimeError) as context:
            exp.fit(X, y, weight=weights, verbose=0)
        self.assertIn("does not support role 'Weight'",
                      str(context.exception))
    def test_vectorized_with_prefixconcat_output_predictor_model(self):
        """
        Show how to put a PrefixColumnConcatenator in front of the
        predictor model output by a pipeline with featurizers so that
        it can score featurized data that contains vectors.
        """
        # Fit a standalone one-hot pipeline and use it to produce
        # the featurized version of the training data.
        one_hot_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
        one_hot_pipeline.fit(train_df)
        featurized_df = one_hot_pipeline.transform(train_df)

        # Fit one-hot + regressor together, asking for the predictor
        # model to be output separately, and score the raw data.
        full_steps = [
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2')
        ]
        full_pipeline = Pipeline(full_steps, random_state=seed)
        full_pipeline.fit(train_df, output_predictor_model=True)
        full_scores = full_pipeline.predict(train_df)

        # Fit the prefix concatenator on the featurized data.
        concatenator = PrefixColumnConcatenator(columns={'c0': 'c0.'})
        concat_pipeline = Pipeline([concatenator])
        concat_pipeline.fit(featurized_df)

        # Load only the predictor model.
        predictor_only = Pipeline()
        predictor_only.load_model(full_pipeline.predictor_model)

        # Merge concatenator and predictor, then score featurized data.
        merged = Pipeline.combine_models(concat_pipeline, predictor_only)
        merged_scores = merged.predict(featurized_df)

        for row in (0, 1):
            self.assertEqual(full_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])
示例#24
0
    def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(
            self):
        """
        Check that two models created using DataFrames can be combined
        when the output schema of the first one is the same as the
        input schema of the second one.
        """
        frame = train_df.drop(['c0'], axis=1)

        # Fit the range filter and apply it to the training data.
        filter_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                   random_state=seed)
        filter_pipeline.fit(frame)
        frame = filter_pipeline.transform(frame)

        # Train the regressor on the filtered training data.
        regressor = OnlineGradientDescentRegressor(label='c2')
        regressor_pipeline = Pipeline([regressor], random_state=seed)
        regressor_pipeline.fit(frame)

        # Reference scores: run the two pipelines back to back
        # on the test data.
        frame = filter_pipeline.transform(test_df)
        stepwise_scores = regressor_pipeline.predict(frame)

        frame = test_df.drop(['c0'], axis=1)

        # Scores from the single merged pipeline.
        merged = Pipeline.combine_models(filter_pipeline, regressor_pipeline)
        merged_scores = merged.predict(frame)

        # The first two scores must be identical either way.
        for row in (0, 1):
            self.assertEqual(stepwise_scores.loc[row, 'Score'],
                             merged_scores.loc[row, 'Score'])
    def test_vectorized_output_predictor_model(self):
        """
        This test shows that outputted predictor model from
        combined (with featurizers) pipeline fails to run
        on featurized data with vectors.
        """

        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create and fit a combined model and spit out predictor model
        combined_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2')
        ],
                                     random_state=seed)
        combined_pipeline.fit(train_df, output_predictor_model=True)
        # Kept from the original test even though the scores are not
        # compared against anything below.
        combined_pipeline.predict(train_df)

        # Load predictor pipeline and score featurized data
        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(combined_pipeline.predictor_model)

        # This does not work because the input schema doesn't
        # match. Input schema looks for vector 'c0' with slots 'a,b'
        # but featurized data has only columns 'c0.a' and 'c0.b'.
        # The exception type is not pinned down by this test, so any
        # Exception is accepted; the test fails if none is raised.
        with self.assertRaises(Exception):
            predictor_pipeline.predict(df)
示例#26
0
    def test_pickled_pipeline_with_predictor_model(self):
        """
        Check that the predictor model of a pipeline restored from a
        pickle can be loaded and reproduces the original predictions
        after the original model files have been deleted.
        """
        float_columns = {'c1': np.float64, 'c2': np.float64}

        # Small local data sets used only by this test.
        local_train = pd.DataFrame(
            {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]}).astype(float_columns)
        local_test = pd.DataFrame(
            {'c1': [1.5, 2.3, 3.7], 'c2': [2.2, 4.9, 2.7]}).astype(float_columns)

        # Fit with a separate predictor model and take reference scores.
        original = Pipeline([OnlineGradientDescentRegressor(label='c2')],
                            random_state=0)
        original.fit(local_train, output_predictor_model=True)
        expected = original.predict(local_test)

        # Both models must be set and must differ from each other.
        self.assertTrue(original.model)
        self.assertTrue(original.predictor_model)
        self.assertNotEqual(original.model, original.predictor_model)

        # Pickle the fitted pipeline to a temporary file.
        pickle_path = get_temp_file(suffix='.p')
        with open(pickle_path, 'wb') as out_file:
            pickle.dump(original, out_file)

        # Delete the model files before restoring from the pickle.
        os.remove(original.model)
        os.remove(original.predictor_model)

        with open(pickle_path, "rb") as in_file:
            restored = pickle.load(in_file)

        os.remove(pickle_path)

        # Load the restored predictor model and score the test data.
        predictor_only = Pipeline()
        predictor_only.load_model(restored.predictor_model)
        actual = predictor_only.predict(local_test)

        self.assertTrue(expected.equals(actual))
示例#27
0
    def test_two_pipelines_created_using_dataframes_can_not_be_combined_when_the_schemas_are_different(
            self):
        """
        This test verifies that two models created using DataFrames
        can not be combined if the output schema of the first is
        different than the input schema of the second.
        NOTE: This issue only happens with Pipelines created and fit
        using dataframes. Pipelines created and fit using IDV binary
        streams do not have this issue (see the tests below).
        """
        # Create and fit a OneHotVectorizer transform using the
        # training data and use it to transform the training data.
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df)

        # Create and fit an OnlineGradientDescentRegressor using
        # the transformed training data from the previous step.
        predictor_pipeline = Pipeline(
            [OnlineGradientDescentRegressor(label='c2')], random_state=seed)
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using
        # the transform and predictor defined previously. The result
        # is not inspected; the call only has to complete when the
        # two pipelines are applied one after the other.
        df = transform_pipeline.transform(test_df)
        predictor_pipeline.predict(df)

        # Combining the two models does not work because the output
        # schema of the first is different than the input schema of
        # the second, so combine_models is expected to raise.
        with self.assertRaises(Exception):
            Pipeline.combine_models(transform_pipeline, predictor_pipeline)
示例#28
0
    reg2 = LinearRegression()
    vr = VotingRegressor_sklearn(estimators=[('gb', reg1), ('rf', reg2)])
    vr.fit(X_train, y_train)
    result = vr.predict(X_test)
    results.append(('All scikit-learn', result))

    # Perform regression using the scikit-learn
    # VotingRegressor and NimbusML predictors.
    olsrArgs = { 'normalize': "Yes" }
    ogdArgs = {
        'shuffle': False,
        'number_of_iterations': 800,
        'learning_rate': 0.1,
        'normalize': "Yes"
    }
    r1 = OnlineGradientDescentRegressor(**ogdArgs)
    r2 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    vr = VotingRegressor_sklearn(estimators=[('ogd', r1), ('ols', r2)])
    vr.fit(X_train, y_train)
    result = vr.predict(X_test)
    results.append(('scikit-learn VotingRegressor with NimbusML predictors', result))

# Perform regression using only NimbusML classes.
# Keyword arguments for the two NimbusML regressors are collected in
# dicts so they can be unpacked into the constructors.
olsrArgs = dict(normalize="Yes")
ogdArgs = dict(
    shuffle=False,
    number_of_iterations=800,
    learning_rate=0.1,
    normalize="Yes",
)
r1 = OnlineGradientDescentRegressor(**ogdArgs)
示例#29
0
# Resolve the 'infert' sample dataset to a file path and open it as a
# FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

# Preview the first rows of the input.
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Define the training pipeline step by step: a OneHotVectorizer
# configured with columns={'edu': 'education'}, followed by a regressor
# that uses 'parity' and 'edu' as features and 'age' as the label.
encoder = OneHotVectorizer(columns={'edu': 'education'})
regressor = OnlineGradientDescentRegressor(feature=['parity', 'edu'],
                                           label='age')
pipeline = Pipeline([encoder, regressor])

# Train, then predict and evaluate on the same data; output_scores=True
# makes test() return the predictions alongside the metrics.
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data, output_scores=True)

# Print predictions.
print(predictions.head())
#       Score
# 0  28.103731
# 1  21.805904
# 2  28.103731
# 3  25.584600
# 4  33.743286

# Print evaluation metrics.
print(metrics)
示例#30
0
# Column name 'label'; presumably the label column used by the code
# that consumes the learner lists below (usage is not visible here).
label_column = 'label'
# One default-constructed instance of each learner in this group.
# NOTE(review): judging by the name of the companion list
# `learners_not_supported`, these are presumably the learners for which
# the feature under test is supported -- confirm against the consumer.
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier is deliberately left out of the list; the
    # reason is not recorded here.
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView