def test_ensemble_supports_user_defined_transforms(self):
    """Check an averaging ensemble that sits behind a user-supplied transform.

    The test frame is extended with two extra rows (c1 = 9 and c1 = 11).
    Each regressor is fitted and scored on its own, then fresh instances
    are combined in a pipeline of ``RangeFilter(min=0, max=10,
    columns='c1')`` followed by an 'Average' ``VotingRegressor``. The
    pipeline must return exactly three rows, and each of those scores
    must match the mean of the individual scores to five decimal places.
    """
    # DataFrame.append was removed in pandas 2.0. pd.concat with default
    # arguments is the documented replacement and, like append, keeps the
    # original row labels of both frames.
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

    # Reference scores from each learner trained in isolation.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # The ensemble gets new, unfitted estimators.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    self.assertEqual(len(result4), 3)

    for row in range(3):
        expected = (result1[row] + result2[row] + result3[row]) / 3
        self.assertAlmostEqual(expected, result4.loc[row, 'Score'], places=5)
def test_passing_in_a_single_predictor_returns_new_pipeline(self):
    """Check ``Pipeline.combine_models`` when given one fitted predictor.

    The object returned for a lone ``OnlineGradientDescentRegressor``
    must be a ``Pipeline``, and its first two 'Score' values must equal
    the predictor's own first two predictions on the same data.
    """
    # Column 'c0' is removed from both frames; the predictor only uses
    # 'c1' as a feature and 'c2' as the label.
    train_dropped_df = train_df.drop(['c0'], axis=1)
    test_dropped_df = test_df.drop(['c0'], axis=1)

    predictor = OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    predictor.fit(train_dropped_df)
    result_1 = predictor.predict(test_dropped_df)

    combined_pipeline = Pipeline.combine_models(predictor)
    result_2 = combined_pipeline.predict(test_dropped_df)

    self.assertEqual(result_1[0], result_2.loc[0, 'Score'])
    self.assertEqual(result_1[1], result_2.loc[1, 'Score'])
    # assertIsInstance names the offending object and expected type on
    # failure, unlike assertTrue(isinstance(...)).
    self.assertIsInstance(combined_pipeline, Pipeline)
def test_combine_transform_and_predictor(self):
    """Compare a combined transform+predictor model with the manual chain.

    A one-hot transform on 'c0' and a regressor are fitted separately,
    with the transform handing its output over as a binary data stream.
    ``Pipeline.combine_models(transform, predictor)`` is then scored on
    the raw test frame, and its first two 'Score' values must equal the
    scores obtained by running the two steps by hand.
    """
    one_hot = OneHotVectorizer() << 'c0'
    encoded_train = one_hot.fit_transform(train_df,
                                          as_binary_data_stream=True)

    regressor = OnlineGradientDescentRegressor(label='c2',
                                               feature=['c0', 'c1'])
    regressor.fit(encoded_train)

    # Manual chain: transform the test data, then predict on the stream.
    encoded_test = one_hot.transform(test_df, as_binary_data_stream=True)
    direct_scores = regressor.predict(encoded_test)

    # Combined model: scored directly on the untransformed frame.
    merged_model = Pipeline.combine_models(one_hot, regressor)
    merged_scores = merged_model.predict(test_df)

    for row in (0, 1):
        self.assertEqual(direct_scores[row], merged_scores.loc[row, 'Score'])
def test_fit_predictor_with_idv(self):
    """Fit a predictor pipeline on a pre-transformed binary data stream.

    A transform-only pipeline turns the data into a binary data stream,
    and a separate predictor pipeline is fitted on and scored with those
    streams. The resulting 'Score' column must equal the predictions of
    the same transform and predictor applied to plain data frames.
    """
    float_columns = {'c1': np.float64, 'c2': np.float64}

    fit_frame = pd.DataFrame({
        'c0': ['a', 'b', 'a', 'b'],
        'c1': [1, 2, 3, 4],
        'c2': [2, 3, 4, 5],
    }).astype(float_columns)

    score_frame = pd.DataFrame({
        'c0': ['a', 'b', 'b'],
        'c1': [1.5, 2.3, 3.7],
        'c2': [2.2, 4.9, 2.7],
    }).astype(float_columns)

    # Stage 1: a transform-only pipeline fitted on the training data.
    featurizer = Pipeline([OneHotVectorizer() << 'c0'])
    featurizer.fit(fit_frame)
    train_stream = featurizer.transform(fit_frame,
                                        as_binary_data_stream=True)

    # Stage 2: a predictor-only pipeline fitted on the transformed stream.
    staged_model = Pipeline([
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ])
    staged_model.fit(train_stream)

    # Score the test data by chaining the two fitted stages.
    test_stream = featurizer.transform(score_frame,
                                       as_binary_data_stream=True)
    staged_scores = staged_model.predict(test_stream)

    # Reference result: same transform and learner, plain data frames.
    # The one-hot output columns are addressed by their expanded names.
    reference_xf = OneHotVectorizer() << 'c0'
    reference_train = reference_xf.fit_transform(fit_frame)
    reference_model = OnlineGradientDescentRegressor(
        label='c2', feature=['c0.a', 'c0.b', 'c1'])
    reference_model.fit(reference_train)
    reference_test = reference_xf.transform(score_frame)
    expected_scores = reference_model.predict(reference_test)

    self.assertTrue(staged_scores.loc[:, 'Score'].equals(expected_scores))
def test_ensemble_with_average_and_median_combiner(self):
    """Check VotingRegressor output for the 'Average' and 'Median' combiners.

    Each of the three regressors is first fitted and scored on its own.
    An ensemble of fresh instances is then fitted once per combiner, and
    its first two 'Score' values are compared with the mean (to five
    decimal places) and the median (exactly) of the individual scores.
    """
    learner_specs = (
        (OrdinaryLeastSquaresRegressor, olsrArgs),
        (OnlineGradientDescentRegressor, ogdArgs),
        (LightGbmRegressor, lgbmArgs),
    )

    def fresh_learners():
        # Every ensemble receives new, unfitted estimator instances.
        return [learner_cls(**learner_args)
                for learner_cls, learner_args in learner_specs]

    # Individual scores: build, fit and score one learner at a time.
    solo_scores = []
    for learner_cls, learner_args in learner_specs:
        learner = learner_cls(**learner_args)
        learner.fit(train_df)
        solo_scores.append(learner.predict(test_df))
    ols_scores, ogd_scores, lgbm_scores = solo_scores

    # 'Average' combiner.
    mean_ensemble = Pipeline([
        VotingRegressor(estimators=fresh_learners(), combiner='Average')
    ])
    mean_ensemble.fit(train_df)
    mean_scores = mean_ensemble.predict(test_df)

    expected_means = [
        (ols_scores[row] + ogd_scores[row] + lgbm_scores[row]) / 3
        for row in (0, 1)
    ]
    for row, expected in enumerate(expected_means):
        self.assertAlmostEqual(expected, mean_scores.loc[row, 'Score'],
                               places=5)

    # 'Median' combiner.
    median_ensemble = Pipeline([
        VotingRegressor(estimators=fresh_learners(), combiner='Median')
    ])
    median_ensemble.fit(train_df)
    median_scores = median_ensemble.predict(test_df)

    expected_medians = []
    for row in (0, 1):
        candidates = [ols_scores.loc[row],
                      ogd_scores.loc[row],
                      lgbm_scores.loc[row]]
        candidates.sort()
        # With three values the middle one is the median.
        expected_medians.append(candidates[1])

    for row, expected in enumerate(expected_medians):
        self.assertEqual(expected, median_scores.loc[row, 'Score'])
'feature': ['c1'], 'label': 'c2', 'random_state': 1, 'number_of_leaves': 200, 'minimum_example_count_per_leaf': 1, 'normalize': 'Yes' } if show_individual_predictions: r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r1.fit(train_df) result = r1.predict(test_df) print(result) r2 = OnlineGradientDescentRegressor(**ogdArgs) r2.fit(train_df) result = r2.predict(test_df) print(result) r3 = LightGbmRegressor(**lgbmArgs) r3.fit(train_df) result = r3.predict(test_df) print(result) # Perform a prediction using an ensemble # of all three of the above predictors. r1 = OrdinaryLeastSquaresRegressor(**olsrArgs) r2 = OnlineGradientDescentRegressor(**ogdArgs) r3 = LightGbmRegressor(**lgbmArgs) pipeline = Pipeline(