def test_multioutput_regressorchain_with_cv_randomizedsearchcv(self):
    """Randomized search over a chained mlxtend stacking-CV regressor
    on the multi-target linnerud data."""
    data = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    layers = dict(regressors=['kneighborsregressor', 'bayesianridge'],
                  final_regressor='lasso')
    # NOTE(review): other stacking tests in this file pass
    # stacking_options=dict(layers=...); confirm stacking_layer= is
    # still the intended keyword for this constructor.
    reg = Regressor(regressor_choice='mlxtendstackingcvregressor',
                    stacking_layer=layers,
                    pipeline_transform='standardscaler',
                    chain_order=[2, 0, 1])
    search_params = {
        'kneighborsregressor__n_neighbors': randint(low=2, high=5),
        'bayesianridge__alpha_1': [1e-7, 1e-6],
        'meta_regressor__alpha': [1.0],
    }
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertLessEqual(
        best['reg__base_estimator__kneighborsregressor__n_neighbors'], 5)
    self.assertGreaterEqual(
        best['reg__base_estimator__kneighborsregressor__n_neighbors'], 2)
    self.assertIn(best['reg__base_estimator__bayesianridge__alpha_1'],
                  [1e-7, 1e-6])
    self.assertIn(best['reg__base_estimator__meta_regressor__alpha'], [1.0])
def test_ridge_multioutput_regressorchain_randomizedsearchcv(self):
    """Randomized search over a ridge RegressorChain on linnerud.

    Renamed from test_multioutput_regressorchain_randomizedsearchcv:
    a later method with the same name redefined (shadowed) this one,
    so it never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='ridge',
                    pipeline_transform='standardscaler',
                    randomizedcv_n_iter=6,
                    chain_order=[2, 0, 1])
    search_params = dict(reg__alpha=uniform(loc=0.01, scale=1.5),
                         reg__fit_intercept=[True, False],
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 12.0)
    # uniform(loc=0.01, scale=1.5) samples from [0.01, 1.51].
    self.assertLessEqual(reg.best_params_['reg__base_estimator__alpha'], 1.51)
    self.assertGreaterEqual(
        reg.best_params_['reg__base_estimator__alpha'], 0.01)
    self.assertIn(reg.best_params_['reg__base_estimator__fit_intercept'],
                  [True, False])
def test_stacking_regressor_with_cv_gridsearchcv(self):
    """Grid search over an mlxtend stacking-CV regressor on boston."""
    X, y = load_boston(return_X_y=True)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    layers = {'regressors': ['kneighborsregressor', 'bayesianridge'],
              'final_regressor': 'lasso'}
    reg = Regressor(regressor_choice='mlxtendstackingcvregressor',
                    pipeline_transform='standardscaler',
                    stacking_options=dict(layers=layers))
    search_params = {
        'reg__kneighborsregressor__n_neighbors': [2, 4, 5],
        'reg__bayesianridge__alpha_1': [1e-7, 1e-6],
        'reg__meta_regressor__alpha': [1.0],
        'tr__with_std': [True, False],
    }
    reg.search(X_train, y_train, search_params=search_params)
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 2.8)
    self.assertIn(best['reg__kneighborsregressor__n_neighbors'], [2, 4, 5])
    self.assertIn(best['reg__bayesianridge__alpha_1'], [1e-7, 1e-6])
    self.assertIn(best['reg__meta_regressor__alpha'], [1.0])
def test_stacking_regressor_randomizedsearchcv(self):
    """Randomized search over an sklearn StackingRegressor on boston."""
    X, y = load_boston(return_X_y=True)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    layers = {'regressors': ['kneighborsregressor', 'bayesianridge'],
              'final_regressor': 'lasso'}
    # NOTE(review): other stacking tests in this file pass
    # stacking_options=dict(layers=...); confirm stacking_layer= is
    # still the intended keyword for this constructor.
    reg = Regressor(regressor_choice='stackingregressor',
                    stacking_layer=layers,
                    pipeline_transform='standardscaler')
    # Base estimators are addressed positionally ('0', '1') in the
    # sklearn stacking parameter grid.
    search_params = {'0__n_neighbors': randint(low=2, high=5),
                     '1__alpha_1': [1e-7, 1e-6],
                     'final_estimator__alpha': [1.0]}
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 2.8)
    self.assertLessEqual(best['reg__0__n_neighbors'], 5)
    self.assertGreaterEqual(best['reg__0__n_neighbors'], 2)
    self.assertIn(best['reg__1__alpha_1'], [1e-7, 1e-6])
    self.assertIn(best['reg__final_estimator__alpha'], [1.0])
def test_multioutput_regressor_without_cv_gridsearchcv(self):
    """Grid search over a multi-output mlxtend stacking regressor
    (no internal CV) on linnerud."""
    data = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    layers = dict(regressors=['kneighborsregressor', 'bayesianridge'],
                  final_regressor='lasso')
    reg = Regressor(regressor_choice='mlxtendstackingregressor',
                    pipeline_transform='standardscaler',
                    stacking_options=dict(layers=layers))
    search_params = {
        'reg__kneighborsregressor__n_neighbors': [2, 4, 5],
        'reg__bayesianridge__alpha_1': [1e-7, 1e-6],
        'reg__meta_regressor__alpha': [1.0],
        'tr__with_std': [True, False],
    }
    reg.search(X_train, y_train, search_params=search_params)
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertIn(best['reg__estimator__kneighborsregressor__n_neighbors'],
                  [2, 4, 5])
    self.assertIn(best['reg__estimator__bayesianridge__alpha_1'],
                  [1e-7, 1e-6])
    self.assertIn(best['reg__estimator__meta_regressor__alpha'], [1.0])
def test_xgb_multioutput_regressorchain_randomizedsearchcv(self):
    """Randomized search over an XGBoost RegressorChain on linnerud.

    Renamed from test_multioutput_regressorchain_randomizedsearchcv: it
    shared that name with another method in this class, so one of the
    two was silently shadowed and never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(n_estimators=10,
                  objective='reg:squarederror',
                  booster='gbtree')
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler',
                    params=params,
                    randomizedcv_n_iter=6,
                    chain_order=[2, 0, 1])
    search_params = dict(reg__n_estimators=randint(low=3, high=10),
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 42.0)
    self.assertLessEqual(
        reg.best_params_['reg__base_estimator__n_estimators'], 10)
    self.assertGreaterEqual(
        reg.best_params_['reg__base_estimator__n_estimators'], 3)
def test_stacking_regressor_without_cv_randomizedsearchcv(self):
    """Randomized search over an mlxtend stacking regressor
    (no internal CV) on boston."""
    X, y = load_boston(return_X_y=True)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    layers = {'regressors': ['kneighborsregressor', 'bayesianridge'],
              'final_regressor': 'lasso'}
    reg = Regressor(regressor_choice='mlxtendstackingregressor',
                    pipeline_transform='standardscaler',
                    stacking_options=dict(layers=layers),
                    randomizedcv_n_iter=6)
    search_params = {
        'reg__kneighborsregressor__n_neighbors': randint(low=2, high=5),
        'reg__bayesianridge__alpha_1': [1e-7, 1e-6],
        'reg__meta_regressor__alpha': [1.0],
        'tr__with_std': [True, False],
    }
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 3.0)
    self.assertLessEqual(best['reg__kneighborsregressor__n_neighbors'], 5)
    self.assertGreaterEqual(best['reg__kneighborsregressor__n_neighbors'], 2)
    self.assertIn(best['reg__bayesianridge__alpha_1'], [1e-7, 1e-6])
    self.assertIn(best['reg__meta_regressor__alpha'], [1.0])
def test_xgb_multioutput_regressor_gridsearchcv(self):
    """Grid search over a multi-output XGBoost regressor on linnerud.

    Renamed from test_multioutput_regressor_gridsearchcv: several
    methods in this class shared that name, so earlier definitions were
    silently shadowed and never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    search_params = dict(n_estimators=[3, 5, 10])
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler')
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 10.5)
    self.assertIn(reg.best_params_['reg__estimator__n_estimators'],
                  [3, 5, 10])
def test_xgb_regressor_gridsearchcv(self):
    """Grid search over a single-output XGBoost regressor on boston.

    Renamed from test_regressor_gridsearchcv: several methods in this
    class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    search_params = dict(n_estimators=[3, 5, 10])
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler')
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 3.8)
    self.assertIn(reg.best_params_['reg__n_estimators'], [3, 5, 10])
def test_xgb_regressor_randomizedsearchcv(self):
    """Randomized search over an XGBoost regressor on boston.

    Renamed from test_regressor_randomizedsearchcv: several methods in
    this class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    search_params = dict(n_estimators=randint(low=3, high=10))
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler')
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 4.0)
    self.assertLessEqual(reg.best_params_['reg__n_estimators'], 10)
    self.assertGreaterEqual(reg.best_params_['reg__n_estimators'], 3)
def test_xgb_multioutput_regressor_randomizedsearchcv(self):
    """Randomized search over a multi-output XGBoost regressor on linnerud.

    Renamed from test_multioutput_regressor_randomizedsearchcv: a later
    method of the same name redefined (shadowed) this test, so it never
    ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    search_params = dict(n_estimators=randint(low=3, high=10))
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler')
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 12.0)
    self.assertLessEqual(
        reg.best_params_['reg__estimator__n_estimators'], 10)
    self.assertGreaterEqual(
        reg.best_params_['reg__estimator__n_estimators'], 3)
def test_ridge_regressor_gridsearchcv(self):
    """Grid search over a ridge regressor on boston.

    Renamed from test_regressor_gridsearchcv: several methods in this
    class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='ridge',
                    pipeline_transform='standardscaler')
    search_params = dict(alpha=[0.1, 0.2, 0.5],
                         fit_intercept=[True, False])
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 3.6)
    self.assertIn(reg.best_params_['reg__alpha'], [0.1, 0.2, 0.5])
    self.assertIn(reg.best_params_['reg__fit_intercept'], [True, False])
def test_catboost_regressor_gridsearchcv(self):
    """Grid search over a CatBoost regressor on boston.

    Renamed from test_regressor_gridsearchcv: several methods in this
    class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(iterations=10, loss_function='RMSE')
    reg = Regressor(regressor_choice='catboostregressor',
                    pipeline_transform='standardscaler',
                    params=params)
    search_params = dict(reg__iterations=[3, 5, 10],
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 3.6)
    self.assertIn(reg.best_params_['reg__iterations'], [3, 5, 10])
def test_ridge_multioutput_regressor_gridsearchcv(self):
    """Grid search over a multi-output ridge regressor on linnerud.

    Renamed from test_multioutput_regressor_gridsearchcv: several
    methods in this class shared that name, so earlier definitions were
    silently shadowed and never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='ridge',
                    pipeline_transform='standardscaler')
    search_params = dict(alpha=[0.1, 0.2, 0.5],
                         fit_intercept=[True, False])
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertIn(reg.best_params_['reg__estimator__alpha'],
                  [0.1, 0.2, 0.5])
    self.assertIn(reg.best_params_['reg__estimator__fit_intercept'],
                  [True, False])
def test_catboost_multioutput_regressor_gridsearchcv(self):
    """Grid search over a multi-output CatBoost regressor on linnerud.

    Renamed from test_multioutput_regressor_gridsearchcv: several
    methods in this class shared that name, so earlier definitions were
    silently shadowed and never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(iterations=10, loss_function='RMSE')
    reg = Regressor(regressor_choice='catboostregressor',
                    pipeline_transform='standardscaler',
                    params=params)
    search_params = dict(reg__iterations=[3, 5, 10],
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertIn(reg.best_params_['reg__estimator__iterations'],
                  [3, 5, 10])
def test_multioutput_regressorchain_gridsearchcv(self):
    """Grid search over a LightGBM RegressorChain on linnerud."""
    data = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    base_params = {'n_estimators': 3, 'objective': 'mean_squared_error'}
    reg = Regressor(regressor_choice='lgbmregressor',
                    pipeline_transform='standardscaler',
                    params=base_params,
                    chain_order=[2, 0, 1])
    search_params = {'reg__n_estimators': [3, 5, 10],
                     'tr__with_std': [True, False]}
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertIn(reg.best_params_['reg__base_estimator__n_estimators'],
                  [3, 5, 10])
def test_xgb_regressor_with_params_gridsearchcv(self):
    """Grid search over an XGBoost regressor with explicit base params.

    Renamed from test_regressor_gridsearchcv: several methods in this
    class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(n_estimators=10,
                  objective='reg:squarederror',
                  booster='gbtree')
    reg = Regressor(regressor_choice='xgbregressor',
                    pipeline_transform='standardscaler',
                    params=params)
    search_params = dict(reg__n_estimators=[3, 5, 10],
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params)
    self.assertLess(reg.best_score_.values, 9.0)
    self.assertIn(reg.best_params_['reg__n_estimators'], [3, 5, 10])
def test_regressor_bayesoptcv(self):
    """Bayesian-optimization search over an SVR on boston."""
    X, y = load_boston(return_X_y=True)
    X = pd.DataFrame(X)
    y = pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='svr',
                    pipeline_transform='standardscaler')
    # Bayesian optimization takes (lower, upper) bounds per parameter.
    search_pbounds = {'gamma': (0.1, 2.0), 'epsilon': (0.1, 0.4)}
    reg.search(X_train, y_train, search_params=search_pbounds,
               search_method='bayesoptcv')
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 3.7)
    self.assertLessEqual(best['reg__gamma'], 2.0)
    self.assertGreaterEqual(best['reg__gamma'], 0.1)
    self.assertLessEqual(best['reg__epsilon'], 0.4)
    self.assertGreaterEqual(best['reg__epsilon'], 0.1)
def test_ridge_regressor_randomizedsearchcv(self):
    """Randomized search over a ridge regressor on boston.

    Renamed from test_regressor_randomizedsearchcv: several methods in
    this class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='ridge',
                    pipeline_transform='standardscaler')
    search_params = dict(alpha=uniform(loc=0.01, scale=1.5),
                         fit_intercept=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 3.6)
    # uniform(loc=0.01, scale=1.5) samples from [0.01, 1.51].
    self.assertLessEqual(reg.best_params_['reg__alpha'], 1.51)
    self.assertGreaterEqual(reg.best_params_['reg__alpha'], 0.01)
    self.assertIn(reg.best_params_['reg__fit_intercept'], [True, False])
def test_multioutput_regressor_bayesoptcv(self):
    """Bayesian-optimization search over a multi-output SVR on linnerud."""
    data = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = data['data'], data['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    reg = Regressor(regressor_choice='svr',
                    pipeline_transform='standardscaler')
    # Bayesian optimization takes (lower, upper) bounds per parameter.
    search_pbounds = {'gamma': (0.1, 2.0), 'epsilon': (0.1, 0.4)}
    reg.search(X_train, y_train, search_params=search_pbounds,
               search_method='bayesoptcv')
    best = reg.best_params_
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertLessEqual(best['reg__estimator__gamma'], 2.0)
    self.assertGreaterEqual(best['reg__estimator__gamma'], 0.1)
    self.assertLessEqual(best['reg__estimator__epsilon'], 0.4)
    self.assertGreaterEqual(best['reg__estimator__epsilon'], 0.1)
def test_catboost_regressor_randomizedsearchcv(self):
    """Randomized search over a CatBoost regressor on boston.

    Renamed from test_regressor_randomizedsearchcv: several methods in
    this class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(iterations=10, loss_function='RMSE')
    reg = Regressor(regressor_choice='catboostregressor',
                    pipeline_transform='standardscaler',
                    params=params,
                    randomizedcv_n_iter=6)
    search_params = dict(reg__iterations=randint(low=3, high=10),
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 3.6)
    self.assertLessEqual(reg.best_params_['reg__iterations'], 10)
    self.assertGreaterEqual(reg.best_params_['reg__iterations'], 3)
def test_lgbm_regressor_randomizedsearchcv(self):
    """Randomized search over a LightGBM regressor on boston.

    Renamed from test_regressor_randomizedsearchcv: several methods in
    this class shared that name, so earlier definitions were silently
    shadowed and never ran.
    """
    X, y = load_boston(return_X_y=True)
    X, y = pd.DataFrame(X), pd.Series(y)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(n_estimators=3, objective='mean_squared_error')
    reg = Regressor(regressor_choice='lgbmregressor',
                    pipeline_transform='standardscaler',
                    params=params,
                    randomizedcv_n_iter=6)
    search_params = dict(reg__n_estimators=randint(low=3, high=10),
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 4.5)
    self.assertLessEqual(reg.best_params_['reg__n_estimators'], 10)
    self.assertGreaterEqual(reg.best_params_['reg__n_estimators'], 3)
def test_catboost_multioutput_regressor_randomizedsearchcv(self):
    """Randomized search over a multi-output CatBoost regressor on linnerud.

    Renamed from test_multioutput_regressor_randomizedsearchcv: several
    methods in this class shared that name, so earlier definitions were
    silently shadowed and never ran.
    """
    bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
    X, y = bunch['data'], bunch['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42)
    params = dict(iterations=10, loss_function='RMSE')
    reg = Regressor(regressor_choice='catboostregressor',
                    pipeline_transform='standardscaler',
                    params=params,
                    randomizedcv_n_iter=6)
    search_params = dict(reg__iterations=randint(low=3, high=10),
                         tr__with_std=[True, False])
    reg.search(X_train, y_train, search_params=search_params,
               search_method='randomizedsearchcv')
    self.assertLess(reg.best_score_.values, 10.0)
    self.assertLessEqual(
        reg.best_params_['reg__estimator__iterations'], 10)
    self.assertGreaterEqual(
        reg.best_params_['reg__estimator__iterations'], 3)
# The default regressor choice is ridge regression, so we do not # need to specify regressor_choice='ridge'. The default pipeline # transform choice is the Sklearn QuantileTransformer with the # Gaussian output distribution. The default number of folds in # K-fold cross-validation is 5, number of jobs to run in parallel # is -1, and scoring choice is 'neg_mean_absolute_error', but # we explicitly right these choices for clarity. reg = Regressor(cv=5, n_jobs=-1, scoring='neg_mean_absolute_error') # Now we perform the exhausitive search over the search_params. # The default search method is 'gridsearchcv', which uses the # Sklearn GridSearchCV object. Other choices include 'randomizedsearchcv', # which uses the Sklearn RandomizedSearchCV object, and 'bayesoptcv', # which uses the https://github.com/fmfn/BayesianOptimization # BayesianOptimization object. reg.search(X_train, y_train, search_params=search_params, search_method='gridsearchcv') # The aforementioned search fit 5 folds for each of the # 6 candidates (choice of alpha, choice of fit_intercept). # Hence, there were 30 total fits. To retrieve the exhausitive # search results, we access the atrribute search_summary_, # which is a DataFrame containing the best score, best choice # of alpha and fit_intercept, and the refit time. As # 'neg_mean_absolute_error' results in nonpositive version of # the mean absolute error due to Sklearn conventions, we automatically # restore nonnegativity in the search method, i.e., best score # will be greater than or equal to 0. print(reg.search_summary_)