def test_combining_two_dataset_transformers(self):
    """
    This test verifies that placing two DatasetTransformer steps (one
    per pre-fitted transform model) in front of a regressor produces
    the same predictions as one pipeline holding the transforms
    directly.
    """
    upper_bound = 4.5

    # Reference: both transforms and the regressor in a single pipeline.
    reference_steps = [
        RangeFilter(min=0.0, max=upper_bound) << 'c2',
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    reference = Pipeline(reference_steps, random_state=seed)
    reference.fit(train_df)
    expected = reference.predict(test_df)

    # Fit each transform on its own so that its model can be handed
    # to a DatasetTransformer.
    range_stage = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
    range_stage.fit(train_df)

    onehot_stage = Pipeline([OneHotVectorizer() << 'c0'],
                            random_state=seed)
    onehot_stage.fit(train_df)

    stacked_steps = [
        DatasetTransformer(transform_model=range_stage.model),
        DatasetTransformer(transform_model=onehot_stage.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    stacked = Pipeline(stacked_steps, random_state=seed)
    stacked.fit(train_df)

    # The standalone transform models are deleted before predicting.
    for stage in (range_stage, onehot_stage):
        os.remove(stage.model)

    actual = stacked.predict(test_df)

    self.assertTrue(expected.equals(actual))
def test_same_schema_with_dataframe_input(self):
    """
    This test verifies that a RangeFilter wrapped in a
    DatasetTransformer gives the same predictions as the RangeFilter
    used directly, when the data is passed in as DataFrames without
    the 'c0' column.
    """
    upper_bound = 4.5
    train_no_c0 = train_df.drop(['c0'], axis=1)
    test_no_c0 = test_df.drop(['c0'], axis=1)

    # Reference: filter and regressor in a single pipeline.
    reference_steps = [
        RangeFilter(min=0.0, max=upper_bound) << 'c2',
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ]
    reference = Pipeline(reference_steps, random_state=seed)
    reference.fit(train_no_c0)
    expected = reference.predict(test_no_c0)

    # Combined: pre-fitted filter model fed through DatasetTransformer.
    filter_stage = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
    filter_stage.fit(train_no_c0)

    combined_steps = [
        DatasetTransformer(transform_model=filter_stage.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ]
    combined = Pipeline(combined_steps, random_state=seed)
    combined.fit(train_no_c0)

    # The standalone transform model is deleted before predicting.
    os.remove(filter_stage.model)

    actual = combined.predict(test_no_c0)

    self.assertTrue(expected.equals(actual))
def test_different_schema_with_dataframe_input(self):
    """
    This test verifies that a OneHotVectorizer wrapped in a
    DatasetTransformer gives the same predictions as the
    OneHotVectorizer used directly, when the data is passed in as
    DataFrames.
    """
    # Reference: vectorizer and regressor in a single pipeline.
    reference_steps = [
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    reference = Pipeline(reference_steps, random_state=seed)
    reference.fit(train_df)
    expected = reference.predict(test_df)

    # Combined: pre-fitted vectorizer model fed through
    # DatasetTransformer.
    onehot_stage = Pipeline([OneHotVectorizer() << 'c0'],
                            random_state=seed)
    onehot_stage.fit(train_df)

    combined_steps = [
        DatasetTransformer(transform_model=onehot_stage.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    combined = Pipeline(combined_steps, random_state=seed)
    combined.fit(train_df)

    # The standalone transform model is deleted before predicting.
    os.remove(onehot_stage.model)

    actual = combined.predict(test_df)

    self.assertTrue(expected.equals(actual))
def test_passing_in_a_single_predictor_returns_new_pipeline(self):
    """
    This test verifies that Pipeline.combine_models, given only one
    fitted predictor, returns a Pipeline whose scores match the
    predictor's own predictions.
    """
    train_no_c0 = train_df.drop(['c0'], axis=1)
    test_no_c0 = test_df.drop(['c0'], axis=1)

    regressor = OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    regressor.fit(train_no_c0)
    direct_scores = regressor.predict(test_no_c0)

    wrapped = Pipeline.combine_models(regressor)
    wrapped_scores = wrapped.predict(test_no_c0)

    # Compare the first two rows of both outputs.
    for row in (0, 1):
        self.assertEqual(direct_scores[row],
                         wrapped_scores.loc[row, 'Score'])

    self.assertIsInstance(wrapped, Pipeline)
def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
        self):
    """
    This test verifies that a transform Pipeline and a predictor
    Pipeline can be merged into one model even though the transform
    increases the number of columns.
    """
    # Fit the OneHotVectorizer on the training data and emit the
    # transformed training data as a binary data stream.
    onehot_stage = Pipeline([OneHotVectorizer() << 'c0'],
                            random_state=seed)
    onehot_stage.fit(train_df)
    train_stream = onehot_stage.transform(train_df,
                                          as_binary_data_stream=True)

    # Fit the regressor on the transformed training stream.
    regressor_steps = [
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    regressor_stage = Pipeline(regressor_steps, random_state=seed)
    regressor_stage.fit(train_stream)

    # Score the test data by running the two stages one after the other.
    test_stream = onehot_stage.transform(test_df,
                                         as_binary_data_stream=True)
    two_step_scores = regressor_stage.predict(test_stream)

    # Merge both stages into one Pipeline and score the raw test data.
    merged = Pipeline.combine_models(onehot_stage, regressor_stage)
    merged_scores = merged.predict(test_df)

    # The merged Pipeline must reproduce the two-step scores for the
    # first two rows.
    for row in (0, 1):
        self.assertEqual(two_step_scores.loc[row, 'Score'],
                         merged_scores.loc[row, 'Score'])
def test_ensemble_supports_get_fit_info(self):
    """
    This test verifies the last node reported by
    Pipeline.get_fit_info for a pipeline ending in a VotingRegressor.
    """
    frame = pd.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
    })

    # The same role assignment is applied to every estimator.
    roles = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}
    ols = OrdinaryLeastSquaresRegressor(normalize="Yes") << roles
    ogd = OnlineGradientDescentRegressor(normalize="Yes") << roles
    lgbm = LightGbmRegressor(normalize="Yes") << roles

    steps = [
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        ColumnDropper() << 'yy',
        VotingRegressor(estimators=[ols, ogd, lgbm], combiner='Average')
    ]
    ensemble = Pipeline(steps)

    fit_info = ensemble.get_fit_info(frame)
    final_node = fit_info[0][-1]

    self.assertEqual(final_node['inputs'],
                     ['Feature:education,workclass', 'Label:new_y'])
    self.assertEqual(final_node['name'], 'VotingRegressor')
    self.assertIsInstance(final_node['operator'], VotingRegressor)
    self.assertEqual(final_node['outputs'], ['Score'])
    self.assertEqual(final_node['schema_after'], ['Score'])
    self.assertEqual(final_node['type'], 'regressor')
def test_combine_transform_and_predictor(self):
    """
    This test verifies that a bare transform and a bare predictor
    can be merged with Pipeline.combine_models and that the merged
    Pipeline reproduces the scores of the two-step workflow.
    """
    vectorizer = OneHotVectorizer() << 'c0'
    train_stream = vectorizer.fit_transform(train_df,
                                            as_binary_data_stream=True)

    regressor = OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
    regressor.fit(train_stream)

    # Two-step scoring: transform the test data, then predict.
    test_stream = vectorizer.transform(test_df,
                                       as_binary_data_stream=True)
    two_step_scores = regressor.predict(test_stream)

    # One-step scoring through the merged Pipeline.
    merged = Pipeline.combine_models(vectorizer, regressor)
    merged_scores = merged.predict(test_df)

    for row in (0, 1):
        self.assertEqual(two_step_scores[row],
                         merged_scores.loc[row, 'Score'])
def test_notvectorized_output_predictor_model(self):
    """
    This test verifies that the predictor model emitted by a combined
    (featurizer + predictor) pipeline can score featurized data when
    that data contains no vector columns.
    """
    raw = train_df.drop(['c0'], axis=1)

    # Fit a standalone RangeFilter and use it to featurize the data.
    filter_stage = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                            random_state=seed)
    filter_stage.fit(raw)
    featurized = filter_stage.transform(raw)

    # Fit the combined pipeline and request a separate predictor model.
    combined_steps = [
        RangeFilter(min=0.0, max=4.5) << 'c2',
        OnlineGradientDescentRegressor(label='c2')
    ]
    combined = Pipeline(combined_steps, random_state=seed)
    combined.fit(raw, output_predictor_model=True)
    combined_scores = combined.predict(raw)

    # Load only the predictor model and score the featurized data.
    predictor_only = Pipeline()
    predictor_only.load_model(combined.predictor_model)
    predictor_scores = predictor_only.predict(featurized)

    for row in (0, 1):
        self.assertEqual(combined_scores.loc[row, 'Score'],
                         predictor_scores.loc[row, 'Score'])
def test_ensemble_supports_user_defined_transforms(self):
    """
    This test verifies that a VotingRegressor placed after a
    RangeFilter averages the scores of its estimators and that the
    filter is applied to the scored data (3 rows are returned).
    """
    # DataFrame.append was removed in pandas 2.0. pd.concat builds the
    # same frame: a new object (test_df is left untouched) whose rows
    # keep their original index labels, as append did without
    # ignore_index.
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    test2_df = pd.concat([test_df, extra_rows])

    # Score the data with each estimator on its own.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # Fresh, unfitted estimators for the ensemble.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    self.assertEqual(len(result4), 3)

    # Each ensemble score must equal the mean of the three individual
    # scores for the same row.
    for row in (0, 1, 2):
        average = (result1[row] + result2[row] + result3[row]) / 3
        self.assertAlmostEqual(average, result4.loc[row, 'Score'],
                               places=5)
def test_ensemble_rejects_estimators_with_incorrect_type(self):
    """
    This test verifies that constructing a VotingRegressor with a
    classifier among its estimators raises an exception.
    """
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LogisticRegressionClassifier()

    # assertRaises fails the test (with the given message) when no
    # exception is raised by the constructor.
    with self.assertRaises(
            Exception,
            msg='VotingRegressor should only work with regressors.') as ctx:
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')

    # Keep the diagnostic output the test has always produced.
    print(ctx.exception)
def test_get_schema_does_not_work_when_predictor_is_part_of_model(self):
    """
    This test verifies that get_output_columns raises an exception
    for a fitted pipeline that contains a predictor.
    """
    df = train_df.drop(['c0'], axis=1)

    pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')])
    pipeline.fit(df)

    # assertRaises fails the test when the call unexpectedly succeeds.
    with self.assertRaises(
            Exception,
            msg='get_output_columns should fail when the model '
                'contains a predictor.'):
        pipeline.get_output_columns()
def test_ensemble_supports_output_predictor_model(self):
    """
    This test verifies that a pipeline ending in a VotingRegressor can
    emit a separate predictor model, and that the predictor model
    reproduces the scores of the full pipeline for the first two rows.
    """
    # DataFrame.append was removed in pandas 2.0. pd.concat with
    # ignore_index=True builds the same frame: a new object (test_df
    # is left untouched) with a fresh 0..n-1 index.
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    test2_df = pd.concat([test_df, extra_rows], ignore_index=True)
    test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

    # Create a ground truth pipeline
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df)
    result_1 = combined_pipeline.predict(test2_df)

    # Create a duplicate pipeline but also request a predictor model
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df, output_predictor_model=True)
    result_2 = combined_pipeline.predict(test2_df)

    # Create a predictor model only pipeline
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_3 = predictor_pipeline.predict(test2_df)

    # Verify the first two rows are equal across all three results
    for row in (0, 1):
        self.assertEqual(result_1.loc[row, 'Score'],
                         result_2.loc[row, 'Score'])
        self.assertEqual(result_2.loc[row, 'Score'],
                         result_3.loc[row, 'Score'])

    # Verify the number of rows: the predictor-only pipeline returns
    # more rows (4) than the two full pipelines (2 each).
    self.assertEqual(len(result_1), 2)
    self.assertEqual(len(result_2), 2)
    self.assertEqual(len(result_3), 4)
def test_get_fit_info(self):
    """
    This test verifies that get_fit_info reports a node named
    'DatasetTransformer' for a pipeline that starts with a
    DatasetTransformer step.
    """
    filter_stage = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
    filter_stage.fit(train_df)

    combined_steps = [
        DatasetTransformer(transform_model=filter_stage.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ]
    combined = Pipeline(combined_steps, random_state=seed)
    combined.fit(train_df)

    fit_info = combined.get_fit_info(train_df)
    node = fit_info[0][1]

    self.assertEqual(node['name'], 'DatasetTransformer')
def test_passing_in_a_single_pipeline_returns_new_pipeline(self):
    """
    This test verifies that Pipeline.combine_models, given only one
    fitted Pipeline, returns a Pipeline with matching scores.
    """
    steps = [
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    source = Pipeline(steps)
    source.fit(train_df)
    source_scores = source.predict(test_df)

    wrapped = Pipeline.combine_models(source)
    wrapped_scores = wrapped.predict(test_df)

    # Compare the first two rows of both outputs.
    for row in (0, 1):
        self.assertEqual(source_scores.loc[row, 'Score'],
                         wrapped_scores.loc[row, 'Score'])

    self.assertIsInstance(wrapped, Pipeline)
def test_ensemble_supports_cv_with_user_defined_transforms(self):
    """
    This test runs cross validation on the airquality dataset, once
    with a single LightGbmRegressor and once with a VotingRegressor
    ensemble, for both split_start settings, and checks that the
    ensemble's average L2 is lower.
    """
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}

    # Every learner gets its own argument dict (and feature list).
    lgbm_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ols_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ogd_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes'
    }

    for split_start in ['before_transforms', 'after_transforms']:
        # Baseline: a single LightGbmRegressor.
        baseline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]
        baseline_cv = CV(baseline_steps).fit(data,
                                             split_start=split_start)
        summary = baseline_cv['metrics_summary']
        l2_avg_lgbm = summary.loc['Average', 'L2(avg)']

        # Ensemble of three regressors behind the same transforms.
        ols = OrdinaryLeastSquaresRegressor(**ols_args)
        ogd = OnlineGradientDescentRegressor(**ogd_args)
        lgbm = LightGbmRegressor(**lgbm_args)

        # A new data stream is created before the second CV run.
        data = FileDataStream(path, schema)
        ensemble_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[ols, ogd, lgbm],
                            combiner='Average')
        ]
        ensemble_cv = CV(ensemble_steps).fit(data,
                                             split_start=split_start)
        summary = ensemble_cv['metrics_summary']
        l2_avg_ensemble = summary.loc['Average', 'L2(avg)']

        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
def test_different_schema_with_filedatastream_input(self):
    """
    This test verifies that a OneHotVectorizer wrapped in a
    DatasetTransformer gives the same predictions as the
    OneHotVectorizer used directly, when the data is passed in as
    FileDataStreams read from CSV files.
    """
    train_filename = "train-data.csv"
    test_filename = "test-data.csv"

    try:
        train_df.to_csv(train_filename, index=False, header=True)
        train_data_stream = FileDataStream.read_csv(
            train_filename, sep=',', header=True)

        test_df.to_csv(test_filename, index=False, header=True)
        test_data_stream = FileDataStream.read_csv(
            test_filename, sep=',', header=True)

        # Create reference pipeline
        std_pipeline = Pipeline([
            OneHotVectorizer() << 'c0',
            OnlineGradientDescentRegressor(label='c2',
                                           feature=['c0', 'c1'])
        ], random_state=seed)
        std_pipeline.fit(train_data_stream)
        result_1 = std_pipeline.predict(test_data_stream)

        # Create combined pipeline
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                      random_state=seed)
        transform_pipeline.fit(train_data_stream)

        combined_pipeline = Pipeline([
            DatasetTransformer(transform_model=transform_pipeline.model),
            OnlineGradientDescentRegressor(label='c2',
                                           feature=['c0', 'c1'])
        ], random_state=seed)
        combined_pipeline.fit(train_data_stream)

        # The standalone transform model is deleted before predicting.
        os.remove(transform_pipeline.model)

        result_2 = combined_pipeline.predict(test_data_stream)

        self.assertTrue(result_1.equals(result_2))
    finally:
        # Remove the CSV files even when a step above fails, so a
        # failing run does not leave files in the working directory.
        for filename in (train_filename, test_filename):
            if os.path.exists(filename):
                os.remove(filename)
def test_ensemble_with_average_and_median_combiner(self):
    """
    This test verifies that a VotingRegressor with the 'Average' and
    'Median' combiners returns the mean / median of the scores of its
    three estimators for the first two rows of the test data.
    """
    # Score the test data with each estimator on its own.
    ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
    ols.fit(train_df)
    ols_scores = ols.predict(test_df)

    ogd = OnlineGradientDescentRegressor(**ogdArgs)
    ogd.fit(train_df)
    ogd_scores = ogd.predict(test_df)

    lgbm = LightGbmRegressor(**lgbmArgs)
    lgbm.fit(train_df)
    lgbm_scores = lgbm.predict(test_df)

    # 'Average' combiner, built from fresh estimators.
    ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
    ogd = OnlineGradientDescentRegressor(**ogdArgs)
    lgbm = LightGbmRegressor(**lgbmArgs)
    voter = VotingRegressor(estimators=[ols, ogd, lgbm],
                            combiner='Average')
    ensemble = Pipeline([voter])
    ensemble.fit(train_df)
    ensemble_scores = ensemble.predict(test_df)

    for row in (0, 1):
        mean_score = (ols_scores[row] + ogd_scores[row]
                      + lgbm_scores[row]) / 3
        self.assertAlmostEqual(mean_score,
                               ensemble_scores.loc[row, 'Score'],
                               places=5)

    # 'Median' combiner, again built from fresh estimators.
    ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
    ogd = OnlineGradientDescentRegressor(**ogdArgs)
    lgbm = LightGbmRegressor(**lgbmArgs)
    voter = VotingRegressor(estimators=[ols, ogd, lgbm],
                            combiner='Median')
    ensemble = Pipeline([voter])
    ensemble.fit(train_df)
    ensemble_scores = ensemble.predict(test_df)

    for row in (0, 1):
        # The middle element of three sorted values is the median.
        ordered = sorted([ols_scores.loc[row],
                          ogd_scores.loc[row],
                          lgbm_scores.loc[row]])
        self.assertEqual(ordered[1], ensemble_scores.loc[row, 'Score'])
def test_combine_transform_and_pipeline(self):
    """
    This test verifies that a bare transform and a fitted Pipeline
    can be merged with Pipeline.combine_models and that the merged
    Pipeline reproduces the output of the two-step workflow.
    """
    range_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
    train_stream = range_filter.fit_transform(train_df,
                                              as_binary_data_stream=True)

    steps = [
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    downstream = Pipeline(steps)
    downstream.fit(train_stream)

    # Two-step scoring: filter the test data, then predict.
    test_stream = range_filter.transform(test_df,
                                         as_binary_data_stream=True)
    two_step_result = downstream.predict(test_stream)

    # One-step scoring through the merged Pipeline.
    merged = Pipeline.combine_models(range_filter, downstream)
    merged_result = merged.predict(test_df)

    self.assertTrue(two_step_result.equals(merged_result))
def test_fit_predictor_with_idv(self):
    """
    This test verifies that a predictor Pipeline fitted on a
    transformed binary data stream scores the same as a predictor
    fitted on the equivalent transformed DataFrame.
    """
    float_columns = {'c1': np.float64, 'c2': np.float64}

    # Local train/test frames (these shadow any module-level ones).
    train_frame = pd.DataFrame({
        'c0': ['a', 'b', 'a', 'b'],
        'c1': [1, 2, 3, 4],
        'c2': [2, 3, 4, 5]
    }).astype(float_columns)

    test_frame = pd.DataFrame({
        'c0': ['a', 'b', 'b'],
        'c1': [1.5, 2.3, 3.7],
        'c2': [2.2, 4.9, 2.7]
    }).astype(float_columns)

    # Fit the transform pipeline and emit the training data as a
    # binary data stream.
    onehot_stage = Pipeline([OneHotVectorizer() << 'c0'])
    onehot_stage.fit(train_frame)
    train_stream = onehot_stage.transform(train_frame,
                                          as_binary_data_stream=True)

    # Fit a predictor pipeline on the transformed stream.
    stream_regressor = OnlineGradientDescentRegressor(
        label='c2', feature=['c0', 'c1'])
    regressor_stage = Pipeline([stream_regressor])
    regressor_stage.fit(train_stream)

    # Score the test data through transform + predictor.
    test_stream = onehot_stage.transform(test_frame,
                                         as_binary_data_stream=True)
    stream_result = regressor_stage.predict(test_stream)

    # Expected result: same workflow using DataFrames, with the
    # expanded column names listed explicitly as features.
    vectorizer = OneHotVectorizer() << 'c0'
    train_features = vectorizer.fit_transform(train_frame)
    frame_regressor = OnlineGradientDescentRegressor(
        label='c2', feature=['c0.a', 'c0.b', 'c1'])
    frame_regressor.fit(train_features)
    test_features = vectorizer.transform(test_frame)
    expected_result = frame_regressor.predict(test_features)

    self.assertTrue(stream_result.loc[:, 'Score'].equals(expected_result))
def test_combine_two_pipelines_created_from_model_files(self):
    """
    This test verifies that two models can still be combined after
    they have been loaded from their model files into new Pipelines.
    """
    # Fit the OneHotVectorizer on the training data and emit the
    # transformed training data as a binary data stream.
    onehot_fitted = Pipeline([OneHotVectorizer() << 'c0'],
                             random_state=seed)
    onehot_fitted.fit(train_df)
    train_stream = onehot_fitted.transform(train_df,
                                           as_binary_data_stream=True)

    # Fit the regressor on the transformed training stream.
    regressor_steps = [
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ]
    regressor_fitted = Pipeline(regressor_steps, random_state=seed)
    regressor_fitted.fit(train_stream)

    # Score the test data by running the two fitted stages in turn.
    test_stream = onehot_fitted.transform(test_df,
                                          as_binary_data_stream=True)
    two_step_scores = regressor_fitted.predict(test_stream)

    # Build brand new Pipelines from the models held by the fitted
    # Pipelines.
    onehot_loaded = Pipeline()
    onehot_loaded.load_model(onehot_fitted.model)

    regressor_loaded = Pipeline()
    regressor_loaded.load_model(regressor_fitted.model)

    # Merge the loaded Pipelines and score the raw test data.
    merged = Pipeline.combine_models(onehot_loaded, regressor_loaded)
    merged_scores = merged.predict(test_df)

    # The merged Pipeline must reproduce the two-step scores for the
    # first two rows.
    for row in (0, 1):
        self.assertEqual(two_step_scores.loc[row, 'Score'],
                         merged_scores.loc[row, 'Score'])
def test_data_role_info_has_been_removed_from_estimators(self):
    """
    This test verifies that, after fitting, neither the
    VotingRegressor nor its estimators expose 'feature_column_name',
    while each estimator exposes 'feature_column_name_'.
    """
    ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
    ogd = OnlineGradientDescentRegressor(**ogdArgs)
    lgbm = LightGbmRegressor(**lgbmArgs)
    vr = VotingRegressor(estimators=[ols, ogd, lgbm], combiner='Average')

    ensemble = Pipeline([vr])
    ensemble.fit(train_df)

    self.assertFalse(hasattr(vr, 'feature_column_name'))

    # Check the three estimators in order.
    for position in range(3):
        estimator = vr.estimators[position]
        self.assertFalse(hasattr(estimator, 'feature_column_name'))
        self.assertTrue(hasattr(estimator, 'feature_column_name_'))
def test_syntax10_weights_fail(self):
    """
    This test verifies that fitting a pipeline ending in an
    OnlineGradientDescentRegressor with a weight column raises a
    RuntimeError mentioning the unsupported 'Weight' role.
    """
    frame = pandas.DataFrame({
        'education': ['A', 'B', 'A', 'B', 'A'],
        'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
        'weights': [1., 1., 1., 2., 1.],
        'y': [1.1, 2.2, 1.24, 3.4, 3.4],
    })

    features = frame.drop(['y', 'weights'], axis=1)
    target = frame['y']
    row_weights = frame['weights']

    steps = [
        OneHotVectorizer() << ['workclass', 'education'],
        OnlineGradientDescentRegressor()
    ]
    exp = Pipeline(steps)

    try:
        exp.fit(features, target, weight=row_weights, verbose=0)
    except RuntimeError as err:
        assert "does not support role 'Weight'" in str(err)
    else:
        # fit was expected to raise.
        assert False
def test_vectorized_with_prefixconcat_output_predictor_model(self):
    """
    This test shows how to put a PrefixColumnConcatenator transform
    in front of the predictor model emitted by a combined (featurizer
    + predictor) pipeline, so that it runs on featurized data.
    """
    # Fit a standalone OneHotVectorizer and featurize the training
    # data with it.
    onehot_stage = Pipeline([OneHotVectorizer() << 'c0'],
                            random_state=seed)
    onehot_stage.fit(train_df)
    featurized = onehot_stage.transform(train_df)

    # Fit and score with the combined model, requesting a separate
    # predictor model.
    combined_steps = [
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2')
    ]
    combined = Pipeline(combined_steps, random_state=seed)
    combined.fit(train_df, output_predictor_model=True)
    combined_scores = combined.predict(train_df)

    # Fit the column concatenator on the featurized data.
    concatenator = PrefixColumnConcatenator(columns={'c0': 'c0.'})
    concat_stage = Pipeline([concatenator])
    concat_stage.fit(featurized)

    # Load the predictor-only model.
    predictor_only = Pipeline()
    predictor_only.load_model(combined.predictor_model)

    # Chain concatenator + predictor and score the featurized data.
    chained = Pipeline.combine_models(concat_stage, predictor_only)
    chained_scores = chained.predict(featurized)

    for row in (0, 1):
        self.assertEqual(combined_scores.loc[row, 'Score'],
                         chained_scores.loc[row, 'Score'])
def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(
        self):
    """
    This test verifies that two models fitted on DataFrames can be
    combined when the output schema of the first matches the input
    schema of the second.
    """
    train_no_c0 = train_df.drop(['c0'], axis=1)

    # Fit the RangeFilter on the training data and use it to
    # transform that same data.
    filter_stage = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                            random_state=seed)
    filter_stage.fit(train_no_c0)
    train_filtered = filter_stage.transform(train_no_c0)

    # Fit the regressor on the filtered training data.
    regressor_stage = Pipeline(
        [OnlineGradientDescentRegressor(label='c2')],
        random_state=seed)
    regressor_stage.fit(train_filtered)

    # Score the test data by running the two stages in turn.
    test_filtered = filter_stage.transform(test_df)
    two_step_scores = regressor_stage.predict(test_filtered)

    test_no_c0 = test_df.drop(['c0'], axis=1)

    # Merge both stages into one Pipeline and score the test data.
    merged = Pipeline.combine_models(filter_stage, regressor_stage)
    merged_scores = merged.predict(test_no_c0)

    # The merged Pipeline must reproduce the two-step scores for the
    # first two rows.
    for row in (0, 1):
        self.assertEqual(two_step_scores.loc[row, 'Score'],
                         merged_scores.loc[row, 'Score'])
def test_vectorized_output_predictor_model(self):
    """
    This test shows that the predictor model emitted by a combined
    (featurizer + predictor) pipeline fails to run on featurized data
    with vectors.
    """
    # Create and fit a OneHotVectorizer transform using the
    # training data and use it to transform the training data.
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                  random_state=seed)
    transform_pipeline.fit(train_df)
    df = transform_pipeline.transform(train_df)

    # Create and fit a combined model and spit out predictor model
    combined_pipeline = Pipeline([
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2')
    ], random_state=seed)
    combined_pipeline.fit(train_df, output_predictor_model=True)
    # Scoring with the full pipeline is expected to succeed.
    combined_pipeline.predict(train_df)

    # Load predictor pipeline and score featurized data
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)

    # This does not work because the input schema doesn't
    # match. Input schema looks for vector 'c0' with slots 'a,b'
    # but featurized data has only columns 'c0.a' and 'c0.b'.
    # assertRaises fails the test when the call unexpectedly succeeds.
    with self.assertRaises(
            Exception,
            msg='Predictor model should not accept featurized data '
                'without the vector column.'):
        predictor_pipeline.predict(df)
def test_pickled_pipeline_with_predictor_model(self):
    """
    This test verifies that the predictor model of a pipeline survives
    pickling: after the original model files are deleted, the model
    taken from the unpickled pipeline still reproduces the scores.
    """
    float_columns = {'c1': np.float64, 'c2': np.float64}

    # Local train/test frames (these shadow any module-level ones).
    train_frame = pd.DataFrame(
        {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]}
    ).astype(float_columns)
    test_frame = pd.DataFrame(
        {'c1': [1.5, 2.3, 3.7], 'c2': [2.2, 4.9, 2.7]}
    ).astype(float_columns)

    # Fit with a separate predictor model and score the test data.
    original = Pipeline([OnlineGradientDescentRegressor(label='c2')],
                        random_state=0)
    original.fit(train_frame, output_predictor_model=True)
    original_scores = original.predict(test_frame)

    self.assertTrue(original.model)
    self.assertTrue(original.predictor_model)
    self.assertNotEqual(original.model, original.predictor_model)

    # Pickle the pipeline to a temporary file.
    pickle_filename = get_temp_file(suffix='.p')
    with open(pickle_filename, 'wb') as sink:
        pickle.dump(original, sink)

    # Delete the original model files so that only the pickle is left.
    for model_path in (original.model, original.predictor_model):
        os.remove(model_path)

    with open(pickle_filename, "rb") as source:
        restored = pickle.load(source)

    os.remove(pickle_filename)

    # Load the restored predictor model and score the test data.
    predictor_only = Pipeline()
    predictor_only.load_model(restored.predictor_model)
    restored_scores = predictor_only.predict(test_frame)

    self.assertTrue(original_scores.equals(restored_scores))
def test_two_pipelines_created_using_dataframes_can_not_be_combined_when_the_schemas_are_different(
        self):
    """
    This test verifies that two models created using DataFrames
    can not be combined if the output schema of the first is
    different than the input schema of the second.
    NOTE: This issue only happens with Pipelines created and fit
    using dataframes. Pipelines created and fit using IDV binary
    streams do not have this issue (see the tests below).
    """
    # Create and fit a OneHotVectorizer transform using the
    # training data and use it to transform the training data.
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                  random_state=seed)
    transform_pipeline.fit(train_df)
    df = transform_pipeline.transform(train_df)

    # Create and fit an OnlineGradientDescentRegressor using
    # the transformed training data from the previous step.
    predictor_pipeline = Pipeline(
        [OnlineGradientDescentRegressor(label='c2')],
        random_state=seed)
    predictor_pipeline.fit(df)

    # Perform a prediction given the test data using the transform
    # and predictor defined previously. Running the two pipelines one
    # after the other is expected to succeed.
    df = transform_pipeline.transform(test_df)
    predictor_pipeline.predict(df)

    # This does not work because the output schema of the transform
    # pipeline is different from the input schema of the predictor
    # pipeline. assertRaises fails the test when combining
    # unexpectedly succeeds.
    with self.assertRaises(
            Exception,
            msg='Pipelines with mismatched schemas should not be '
                'combinable.'):
        Pipeline.combine_models(transform_pipeline, predictor_pipeline)
reg2 = LinearRegression() vr = VotingRegressor_sklearn(estimators=[('gb', reg1), ('rf', reg2)]) vr.fit(X_train, y_train) result = vr.predict(X_test) results.append(('All scikit-learn', result)) # Perform regression using the scikit-learn # VotingRegressor and NimbusML predictors. olsrArgs = { 'normalize': "Yes" } ogdArgs = { 'shuffle': False, 'number_of_iterations': 800, 'learning_rate': 0.1, 'normalize': "Yes" } r1 = OnlineGradientDescentRegressor(**ogdArgs) r2 = OrdinaryLeastSquaresRegressor(**olsrArgs) vr = VotingRegressor_sklearn(estimators=[('ogd', r1), ('ols', r2)]) vr.fit(X_train, y_train) result = vr.predict(X_test) results.append(('scikit-learn VotingRegressor with NimbusML predictors', result)) # Perform regression using only NimbusML classes olsrArgs = { 'normalize': "Yes" } ogdArgs = { 'shuffle': False, 'number_of_iterations': 800, 'learning_rate': 0.1, 'normalize': "Yes" } r1 = OnlineGradientDescentRegressor(**ogdArgs)
# Load the 'infert' dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...       1            2  ...
# 1   42     1    0-5yrs        1       1  ...       2            0  ...
# 2   39     1    0-5yrs        2       6  ...       3            0  ...
# 3   34     1    0-5yrs        2       4  ...       4            0  ...
# 4   35     1   6-11yrs        1       3  ...       5            1  ...

# Training pipeline: one-hot encode 'education' into 'edu', then
# regress 'age' on 'parity' and 'edu'.
steps = [
    OneHotVectorizer(columns={'edu': 'education'}),
    OnlineGradientDescentRegressor(feature=['parity', 'edu'], label='age')
]
pipeline = Pipeline(steps)

# Train, then predict and evaluate on the same data.
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# Show the first predictions.
print(predictions.head())
#        Score
# 0  28.103731
# 1  21.805904
# 2  28.103731
# 3  25.584600
# 4  33.743286

# Show the evaluation metrics.
print(metrics)
label_column = 'label' learners = [ FastForestBinaryClassifier(), FastForestRegressor(), FastTreesBinaryClassifier(), FastTreesRegressor(), FastTreesTweedieRegressor(), LightGbmRegressor(), LightGbmBinaryClassifier(), AveragedPerceptronBinaryClassifier(), FastLinearBinaryClassifier(), FastLinearClassifier(), FastLinearRegressor(), LogisticRegressionBinaryClassifier(), LogisticRegressionClassifier(), OnlineGradientDescentRegressor(), SgdBinaryClassifier(), # SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [ NaiveBayesClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView KMeansPlusPlus(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView FactorizationMachineBinaryClassifier(), PcaAnomalyDetector(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView