def test_same_schema_with_dataframe_input(self):
    train_df_updated = train_df.drop(['c0'], axis=1)
    test_df_updated = test_df.drop(['c0'], axis=1)

    rf_max = 4.5

    # Create reference pipeline
    std_pipeline = Pipeline([
        RangeFilter(min=0.0, max=rf_max) << 'c2',
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ], random_state=seed)

    std_pipeline.fit(train_df_updated)
    result_1 = std_pipeline.predict(test_df_updated)

    # Create combined pipeline
    transform_pipeline = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2'])
    transform_pipeline.fit(train_df_updated)

    combined_pipeline = Pipeline([
        DatasetTransformer(transform_model=transform_pipeline.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ], random_state=seed)
    combined_pipeline.fit(train_df_updated)

    os.remove(transform_pipeline.model)

    result_2 = combined_pipeline.predict(test_df_updated)

    self.assertTrue(result_1.equals(result_2))
def test_combining_two_dataset_transformers(self):
    rf_max = 4.5

    # Create reference pipeline
    std_pipeline = Pipeline([
        RangeFilter(min=0.0, max=rf_max) << 'c2',
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)

    std_pipeline.fit(train_df)
    result_1 = std_pipeline.predict(test_df)

    # Create combined pipeline
    transform_pipeline1 = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2'])
    transform_pipeline1.fit(train_df)

    transform_pipeline2 = Pipeline([OneHotVectorizer() << 'c0'],
                                   random_state=seed)
    transform_pipeline2.fit(train_df)

    combined_pipeline = Pipeline([
        DatasetTransformer(transform_model=transform_pipeline1.model),
        DatasetTransformer(transform_model=transform_pipeline2.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    combined_pipeline.fit(train_df)

    os.remove(transform_pipeline1.model)
    os.remove(transform_pipeline2.model)

    result_2 = combined_pipeline.predict(test_df)

    self.assertTrue(result_1.equals(result_2))
def test_notvectorized_output_predictor_model(self):
    """
    This test verifies that the predictor model output by a combined
    (with featurizers) pipeline runs successfully on featurized data
    that contains no vector columns.
    """
    df = train_df.drop(['c0'], axis=1)

    # Create and fit a RangeFilter transform using the training
    # data and use it to transform the training data.
    transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                  random_state=seed)
    transform_pipeline.fit(df)
    df1 = transform_pipeline.transform(df)

    # Create and fit a combined model and output the predictor model.
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c2',
        OnlineGradientDescentRegressor(label='c2')
    ], random_state=seed)
    combined_pipeline.fit(df, output_predictor_model=True)
    result_1 = combined_pipeline.predict(df)

    # Load the predictor pipeline and score the featurized data.
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_2 = predictor_pipeline.predict(df1)

    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
def test_combine_transform_and_transform(self):
    # Fit each transform separately and score the test data in two steps.
    transform_1 = RangeFilter(min=0.0, max=4.5) << 'c2'
    df = transform_1.fit_transform(train_df)

    transform_2 = OneHotVectorizer() << 'c0'
    transform_2.fit(df)

    df = transform_1.transform(test_df)
    result_1 = transform_2.transform(df)

    # Combining two transforms with no predictor requires
    # contains_predictor=False.
    combined_pipeline = Pipeline.combine_models(transform_1,
                                                transform_2,
                                                contains_predictor=False)
    result_2 = combined_pipeline.transform(test_df)

    self.assertTrue(result_1.equals(result_2))
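# A hedged aside, not part of the original test file: a combined model
# produced by Pipeline.combine_models can also be persisted and reloaded
# with the same save_model()/load_model() API the predictor-model tests
# below rely on. The file name here is illustrative only.
combined_pipeline.save_model('combined_transforms.zip')
reloaded_pipeline = Pipeline()
reloaded_pipeline.load_model('combined_transforms.zip')
result_3 = reloaded_pipeline.transform(test_df)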
def test_ensemble_supports_user_defined_transforms(self):
    test2_df = test_df.copy(deep=True)
    test2_df = test2_df.append(pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]}))

    # Fit each regressor on its own and score the extended test data.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # Fit the same regressors as a VotingRegressor ensemble behind
    # a user-defined RangeFilter transform.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)
    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    # The RangeFilter drops the appended row with c1 = 11.
    self.assertEqual(len(result4), 3)

    # The ensemble score should equal the average of the individual scores.
    average1 = (result1[0] + result2[0] + result3[0]) / 3
    average2 = (result1[1] + result2[1] + result3[1]) / 3
    average3 = (result1[2] + result2[2] + result3[2]) / 3

    self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
    self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
    self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)
def test_combine_transform_and_pipeline(self):
    transform = RangeFilter(min=0.0, max=4.5) << 'c2'
    df = transform.fit_transform(train_df, as_binary_data_stream=True)

    pipeline = Pipeline([
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ])
    pipeline.fit(df)

    df = transform.transform(test_df, as_binary_data_stream=True)
    result_1 = pipeline.predict(df)

    combined_pipeline = Pipeline.combine_models(transform, pipeline)
    result_2 = combined_pipeline.predict(test_df)

    self.assertTrue(result_1.equals(result_2))
def test_get_fit_info(self):
    transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
    transform_pipeline.fit(train_df)

    combined_pipeline = Pipeline([
        DatasetTransformer(transform_model=transform_pipeline.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ], random_state=seed)
    combined_pipeline.fit(train_df)

    info = combined_pipeline.get_fit_info(train_df)

    self.assertTrue(info[0][1]['name'] == 'DatasetTransformer')
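# A hedged sketch of inspecting the rest of get_fit_info()'s output.
# Per the assertion above, info[0] is the list of node descriptions and
# each entry carries a 'name' key; no structure beyond that key is
# assumed here.
for node in combined_pipeline.get_fit_info(train_df)[0]:
    print(node.get('name'))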
def test_ensemble_supports_output_predictor_model(self):
    test2_df = test_df.copy(deep=True)
    test2_df = test2_df.append(pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]}),
                               ignore_index=True)
    test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

    # Create a ground truth pipeline
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df)
    result_1 = combined_pipeline.predict(test2_df)

    # Create a duplicate pipeline but also request a predictor model
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df, output_predictor_model=True)
    result_2 = combined_pipeline.predict(test2_df)

    # Create a predictor-model-only pipeline
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_3 = predictor_pipeline.predict(test2_df)

    # Verify the first rows are equal
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

    # Verify the second rows are equal
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
    self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

    # Verify the number of rows. The predictor-only pipeline contains
    # no RangeFilter, so it scores all four rows of test2_df.
    self.assertEqual(len(result_1), 2)
    self.assertEqual(len(result_2), 2)
    self.assertEqual(len(result_3), 4)
def test_get_schema_returns_correct_value_for_single_valued_columns(self):
    df = train_df.drop(['c0'], axis=1)

    pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'])
    pipeline.fit(df)
    df = pipeline.transform(df)

    schema = pipeline.get_output_columns()

    self.assertTrue('c1' in schema)
    self.assertTrue('c2' in schema)
    self.assertEqual(len(schema), 2)
def test_three_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
        self):
    """
    This test verifies that three models can be combined
    even if the transform increases the number of columns.
    """
    # Create and fit a RangeFilter transform using the training
    # data and use it to transform the training data.
    transform_pipeline_1 = Pipeline(
        [RangeFilter(min=0.0, max=4.5) << 'c2'])
    df = transform_pipeline_1.fit_transform(train_df,
                                            as_binary_data_stream=True)

    # Create and fit a OneHotVectorizer transform using
    # the transformed data from the previous step and use it
    # to transform the data from the previous step.
    transform_pipeline_2 = Pipeline([OneHotVectorizer() << 'c0'],
                                    random_state=seed)
    transform_pipeline_2.fit(df)
    df = transform_pipeline_2.transform(df, as_binary_data_stream=True)

    # Create and fit an OnlineGradientDescentRegressor using
    # the transformed training data from the previous step.
    predictor_pipeline = Pipeline(
        [OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
        random_state=seed)
    predictor_pipeline.fit(df)

    # Perform a prediction given the test data using
    # the transforms and predictor defined previously.
    df = transform_pipeline_1.transform(test_df,
                                        as_binary_data_stream=True)
    df = transform_pipeline_2.transform(df, as_binary_data_stream=True)
    result_1 = predictor_pipeline.predict(df)

    # Combine the above Pipelines into one Pipeline and use
    # the new Pipeline to get predictions given the test data.
    combined_pipeline = Pipeline.combine_models(transform_pipeline_1,
                                                transform_pipeline_2,
                                                predictor_pipeline)
    result_2 = combined_pipeline.predict(test_df)

    # Verify that the prediction from the combined Pipeline
    # matches the predictions from the original three Pipelines.
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
def test_range_filter_short_example(self):
    d = pd.DataFrame([[1., 1.9, 3.], [2., 3., 4.], [2., 3., 4.]])
    d.columns = ['aa', 'bb', 'cc']

    # Filter on column 'aa': only the first row survives, which shows
    # that the upper bound is exclusive here (the rows with aa == 2.0
    # are dropped).
    hdl = RangeFilter(min=0, max=2) << 'aa'
    res1 = hdl.fit_transform(d)
    assert res1 is not None
    assert res1.shape == (1, 3)

    # Filter on column 'bb': again only the first row is in range.
    hdl = RangeFilter(min=0, max=2) << 'bb'
    res2 = hdl.fit_transform(d)
    assert res2 is not None
    assert res2.shape == (1, 3)

    assert res1.values.ravel().tolist() == res2.values.ravel().tolist()
def test_two_pipelines_created_using_dataframes_can_be_combined_when_the_schemas_are_the_same(
        self):
    """
    This test verifies that two models created using DataFrames
    can be combined if the output schema of the first is the
    same as the input schema of the second.
    """
    df = train_df.drop(['c0'], axis=1)

    # Create and fit a RangeFilter transform using the training
    # data and use it to transform the training data.
    transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                                  random_state=seed)
    transform_pipeline.fit(df)
    df = transform_pipeline.transform(df)

    # Create and fit an OnlineGradientDescentRegressor using
    # the transformed training data from the previous step.
    predictor_pipeline = Pipeline(
        [OnlineGradientDescentRegressor(label='c2')],
        random_state=seed)
    predictor_pipeline.fit(df)

    # Perform a prediction given the test data using
    # the transform and predictor defined previously.
    df = transform_pipeline.transform(test_df)
    result_1 = predictor_pipeline.predict(df)

    df = test_df.drop(['c0'], axis=1)

    # Combine the above Pipelines into one Pipeline and use
    # the new Pipeline to get predictions given the test data.
    combined_pipeline = Pipeline.combine_models(transform_pipeline,
                                                predictor_pipeline)
    result_2 = combined_pipeline.predict(df)

    # Verify that the prediction from the combined Pipeline
    # matches the prediction from the original two Pipelines.
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
###############################################################################
# RangeFilter
import numpy as np
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.filter import RangeFilter

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path, numeric_dtype=np.float32)
print(data.head())
#     age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# transform usage
xf = RangeFilter(min=20, max=30, columns='age')

# fit and transform; rows with age outside the range will be deleted
features = xf.fit_transform(data)
print(features.head())
#     age  case education  id  induced  parity  pooled.stratum  ...
# 0  26.0     1    0-5yrs   1        1       6               3  ...
# 1  23.0     1   6-11yrs   7        0       1               6  ...
# 2  21.0     1   6-11yrs   9        0       1               5  ...
# 3  28.0     1   6-11yrs  10        0       2              19  ...
# 4  29.0     1   6-11yrs  11        1       2              20  ...
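# A short follow-up sketch, not in the original example: with
# complement=True the same filter keeps the rows *outside* the
# [min, max] range instead, as the iris example below also shows.
xf_outside = RangeFilter(min=20, max=30, complement=True, columns='age')
print(xf_outside.fit_transform(data).head())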
    'Resizer': Pipeline([
        Loader(columns={'ImgPath': 'Path'}),
        Resizer(image_width=227, image_height=227,
                columns={'ImgResize': 'ImgPath'})
    ]),
    'SkipFilter': SkipFilter(count=5),
    'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
                                         seasonal_window_size=2),
    'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
                                                     seasonal_window_size=2),
    'SsaForecaster': SsaForecaster(columns=['Sepal_Length'],
                                   window_size=2,
                                   series_length=5,
                                   train_size=5,
                                   horizon=1),
    'RangeFilter': RangeFilter(min=5.0, max=5.1, columns=['Sepal_Length']),
    'TakeFilter': TakeFilter(count=100),
    'TensorFlowScorer': TensorFlowScorer(
        model_location=os.path.join(
            script_dir,
            '..',
            'nimbusml',
            'examples',
            'frozen_saved_model.pb'),
        columns={'c': ['a', 'b']}),
    'ToKey': ToKey(columns={'edu_1': 'education_str'}),
    'TypeConverter': TypeConverter(columns=['group'], result_type='R4'),
    'WordTokenizer': WordTokenizer(
        char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
}

DATASETS = {
###############################################################################
# RangeFilter
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.filter import RangeFilter
from sklearn.model_selection import train_test_split

# use the 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

# select rows where 5.0 <= Sepal_Length <= 5.1
filter = RangeFilter(min=5.0, max=5.1) << 'Sepal_Length'
print(filter.fit_transform(X_train))

# select rows where Sepal_Length <= 4.5 or Sepal_Length >= 7.5
filter = RangeFilter(min=4.5, max=7.5, complement=True) << 'Sepal_Length'
print(filter.fit_transform(X_train))
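# A hedged sketch, not part of the original example, combining RangeFilter
# with a learner in a single Pipeline, mirroring the test code above.
# Pipeline and OnlineGradientDescentRegressor are imported as in those
# tests; the choice of label and feature columns here is illustrative.
from nimbusml import Pipeline
from nimbusml.linear_model import OnlineGradientDescentRegressor

pipe = Pipeline([
    # drop out-of-range rows before training
    RangeFilter(min=4.5, max=7.5) << 'Sepal_Length',
    OnlineGradientDescentRegressor(
        label='Petal_Length',
        feature=['Sepal_Length', 'Sepal_Width'])
])
pipe.fit(X_train)
print(pipe.predict(X_test).head())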