def _ada_boost_regression_train(table, feature_cols, label_col, max_depth=3,
                                n_estimators=50, learning_rate=1.0,
                                loss='linear', random_state=None):
    """Fit an AdaBoost regressor on ``table`` and package it as a model dict.

    The weak learner is a ``DecisionTreeRegressor`` limited to ``max_depth``.

    Args:
        table: Input data; indexed with ``label_col`` and handed to
            ``check_col_type`` together with ``feature_cols``.
        feature_cols: Names of the feature columns.
        label_col: Name of the target column.
        max_depth: Maximum depth of each decision-tree weak learner.
        n_estimators: Forwarded to ``AdaBoostRegressor``.
        learning_rate: Forwarded to ``AdaBoostRegressor``.
        loss: Forwarded to ``AdaBoostRegressor``.
        random_state: Forwarded to ``AdaBoostRegressor``.

    Returns:
        ``{'model': model}`` where ``model`` holds the fitted regressor
        (``'regressor'``), its ``get_params()`` output (``'parameters'``),
        the training arguments plus feature importances (``'params'``),
        a rendered report (``'_repr_brtc_'``) and a
        ``'feature_importance_table'`` DataFrame.
    """
    feature_names, x_train = check_col_type(table, feature_cols)
    y_train = table[label_col]

    base_estimator = DecisionTreeRegressor(max_depth=max_depth)
    # Hyperparameters are passed by keyword: every AdaBoostRegressor argument
    # after the first is keyword-only in current scikit-learn releases, so
    # the positional form raises TypeError there.
    regressor = AdaBoostRegressor(base_estimator,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate,
                                  loss=loss,
                                  random_state=random_state)
    regressor.fit(x_train, y_train)

    feature_importance = regressor.feature_importances_
    train_params = {
        'feature_cols': feature_cols,
        'label_col': label_col,
        'feature_importance': feature_importance,
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss,
        'random_state': random_state
    }

    model = _model_dict('ada_boost_regression_model')
    estimator_params = regressor.get_params()
    model['parameters'] = estimator_params
    model['regressor'] = regressor
    model['params'] = train_params

    fig_feature_importance = _plot_feature_importance(feature_names, regressor)
    list_parameters = dict2MD(estimator_params)

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## AdaBoost Regression Train Result
    |
    | ### Feature Importance
    | {fig_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_feature_importance=fig_feature_importance,
               list_parameters=list_parameters)))
    model['_repr_brtc_'] = rb.get()

    # One row per feature: its name next to its importance score.
    feature_importance_table = pd.DataFrame(
        [[name, importance]
         for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table

    return {'model': model}
def test_parameters(self):
    """
    Testing parameters of Model class.

    For every supported algorithm, build a ``Model`` from a parameter
    dict and a reference scikit-learn estimator from the *same* dict,
    then check that every parameter name exposed by the wrapped
    estimator is also a parameter name of the reference estimator.

    Only parameter names are compared, not their values.
    """
    # (algorithm name understood by Model, reference estimator class,
    #  constructor parameters shared by both instances)
    cases = [
        ("PlsRegression", PLSRegression,
         {"n_components": 20, "scale": False, "max_iter": 200}),
        ("RandomForest", RandomForestRegressor,
         {"n_estimators": 200, "max_depth": 50, "min_samples_split": 10}),
        ("KNN", KNeighborsRegressor,
         {"n_neighbors": 10, "weights": "distance", "algorithm": "ball_tree"}),
        ("SVR", SVR,
         {"kernel": "poly", "degree": 5, "coef0": 1}),
        ("AdaBoost", AdaBoostRegressor,
         {"n_estimators": 150, "learning_rate": 1.2, "loss": "square"}),
        ("Bagging", BaggingRegressor,
         {"n_estimators": 50, "max_samples": 1.5, "max_features": 2}),
        ("lasso", Lasso,
         {"alpha": 1.5, "max_iter": 500, "tol": 0.004}),
    ]

    for algorithm, reference_cls, parameters in cases:
        # subTest reports each algorithm separately instead of stopping at
        # the first failing one.
        with self.subTest(algorithm=algorithm):
            model = Model(algorithm=algorithm, parameters=parameters)
            # Building the reference from the same dict keeps the two
            # instances from drifting apart.
            reference_params = reference_cls(**parameters).get_params()
            for name in model.model.get_params():
                self.assertIn(name, reference_params)
class _AdaBoostRegressorImpl:
    """Wrapper around ``SKLModel`` that accepts an operator as base estimator.

    ``self._hyperparams`` always stores the ``base_estimator`` exactly as the
    caller supplied it. The ``_FitSpecProxy`` handed to ``SKLModel`` is built
    on demand and never written back, so ``get_params`` keeps returning the
    caller's object and repeated ``fit`` calls do not stack wrappers.
    """

    def __init__(
        self,
        base_estimator=None,
        *,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        """Create the wrapped model.

        Args:
            base_estimator: Weak learner, or None to leave the choice to
                ``SKLModel``.
            n_estimators: Forwarded to ``SKLModel``.
            learning_rate: Forwarded to ``SKLModel``.
            loss: Forwarded to ``SKLModel``.
            random_state: Forwarded to ``SKLModel``.
        """
        self._hyperparams = {
            "base_estimator": base_estimator,
            "n_estimators": n_estimators,
            "learning_rate": learning_rate,
            "loss": loss,
            "random_state": random_state,
        }
        estimator_impl = (
            None if base_estimator is None else _FitSpecProxy(base_estimator)
        )
        self._wrapped_model = self._build_model(estimator_impl)

    def _build_model(self, estimator_impl):
        """Return a new ``SKLModel`` that uses ``estimator_impl`` as base estimator."""
        model_hyperparams = dict(self._hyperparams)
        model_hyperparams["base_estimator"] = estimator_impl
        return SKLModel(**model_hyperparams)

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with the original base estimator."""
        out = self._wrapped_model.get_params(deep=deep)
        # we want to return the lale operator, not the underlying impl
        out["base_estimator"] = self._hyperparams["base_estimator"]
        return out

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        When ``X`` is a DataFrame and a base estimator was supplied, the base
        estimator is prefixed with a transformer that re-attaches ``X``'s
        column names to the data it receives.
        """
        base_estimator = self._hyperparams["base_estimator"]
        # Without a base estimator there is nothing to compose the
        # transformer with (``transformer >> None`` is not a pipeline), so
        # the model built in __init__ is used unchanged.
        if isinstance(X, pd.DataFrame) and base_estimator is not None:
            # Capture only the column labels so the closure does not keep
            # the whole training frame alive.
            columns = X.columns
            feature_transformer = FunctionTransformer(
                func=lambda X_prime: pd.DataFrame(X_prime, columns=columns),
                inverse_func=None,
                check_inverse=False,
            )
            self._wrapped_model = self._build_model(
                _FitSpecProxy(feature_transformer >> base_estimator)
            )
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Return the wrapped model's score for ``X`` against ``y``."""
        return self._wrapped_model.score(X, y, sample_weight)
class _AdaBoostRegressorImpl:
    """Adapter that feeds ``SKLModel`` an unwrapped base estimator.

    A Lale individual operator given as ``base_estimator`` is replaced by
    its implementation instance (or by that instance's ``_wrapped_model``
    when it has one) before ``SKLModel`` is built. The operator itself is
    what ``get_params`` reports back.
    """

    def __init__(
        self,
        base_estimator=None,
        n_estimators=50,
        learning_rate=1.0,
        loss="linear",
        random_state=None,
    ):
        """Build the wrapped model.

        Raises:
            ValueError: ``base_estimator`` is a Lale operator but not an
                individual operator.
        """
        is_lale_operator = isinstance(base_estimator, lale.operators.Operator)
        if is_lale_operator and not isinstance(
            base_estimator, lale.operators.IndividualOp
        ):
            raise ValueError(
                "If base_estimator is a Lale operator, it needs to be an individual operator. "
            )

        underlying = base_estimator
        if is_lale_operator:
            underlying = base_estimator._impl_instance()
            inner_model = getattr(underlying, "_wrapped_model", None)
            if inner_model is not None:
                underlying = inner_model

        self._hyperparams = dict(
            base_estimator=underlying,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            loss=loss,
            random_state=random_state,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)
        # From here on the stored hyperparameter is the caller's object.
        self._hyperparams["base_estimator"] = base_estimator

    def get_params(self, deep=True):
        """Return the wrapped model's parameters with the caller's base estimator."""
        params = self._wrapped_model.get_params(deep=deep)
        # Report the lale operator rather than the underlying impl.
        params["base_estimator"] = self._hyperparams["base_estimator"]
        return params

    def fit(self, X, y=None):
        """Fit the wrapped model (``y`` is forwarded only when given); return ``self``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped model."""
        return self._wrapped_model.predict(X)

    def score(self, X, y, sample_weight=None):
        """Delegate scoring to the wrapped model."""
        return self._wrapped_model.score(X, y, sample_weight)
def _dense_one_hot_encoder():
    """Return a dense-output ``OneHotEncoder(drop='first')`` on any scikit-learn."""
    try:
        # ``sparse_output`` replaced ``sparse`` in scikit-learn 1.2.
        return OneHotEncoder(drop='first', sparse_output=False)
    except TypeError:
        # Older releases only know the original keyword.
        return OneHotEncoder(drop='first', sparse=False)


def ada_boost(df, significant_cols, target, cat_cols, num_cols):
    """Tune and evaluate an AdaBoost regressor on ``df``.

    A 90/10 train/test split is made, categorical columns are one-hot
    encoded and numeric columns standardised (both fitted on the training
    part only). A grid search with 3-fold CV then selects ``n_estimators``,
    ``learning_rate`` and ``loss`` by R^2.

    Args:
        df: Source DataFrame.
        significant_cols: Columns of ``df`` used as model input.
        target: Name of the target column.
        cat_cols: Categorical columns to one-hot encode.
        num_cols: Numeric columns to standardise.

    Returns:
        Tuple ``(r2, rmse, r2_variance, rmse_variance, r2_validation,
        rmse_validation, params)``: the mean and sample variance of the
        3-fold CV R^2 / RMSE of the best estimator on the training data,
        its R^2 / RMSE on the held-out 10 %, and its ``get_params()`` dict.
    """
    ss = StandardScaler()
    ohe = _dense_one_hot_encoder()

    X = df[significant_cols]
    y = df[target]

    base = DecisionTreeRegressor(max_depth=3, random_state=0)
    estimator = AdaBoostRegressor(base_estimator=base, random_state=0)

    # With fewer than 60 rows ``int(n * 0.1)`` is <= 5 and the range would be
    # empty, which GridSearchCV rejects. Clamp so 5 is always a candidate.
    n_estimators_stop = max(int(X.shape[0] * 0.1), 6)
    param_grid = {
        'n_estimators': np.arange(5, n_estimators_stop),
        'learning_rate': np.arange(0.1, 1.1, 0.1),
        'loss': ['linear', 'square', 'exponential'],
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=0)

    # Fit the preprocessors on the training part only, then reuse them.
    X_train_cat = ohe.fit_transform(X_train[cat_cols])
    X_train_num = ss.fit_transform(X_train[num_cols])
    X_test_cat = ohe.transform(X_test[cat_cols])
    X_test_num = ss.transform(X_test[num_cols])

    train_data = np.c_[X_train_cat, X_train_num]
    test_data = np.c_[X_test_cat, X_test_num]

    gs = GridSearchCV(estimator, param_grid, scoring='r2', cv=3)
    gs.fit(train_data, y_train)
    # GridSearchCV refits the winner on all of train_data (refit=True by
    # default), so best_estimator_ is ready for prediction as-is.
    estimator = gs.best_estimator_

    r2_cv_scores = cross_val_score(
        estimator, train_data, y_train, scoring='r2', cv=3, n_jobs=-1)
    rmse_cv_scores = cross_val_score(
        estimator, train_data, y_train,
        scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1)

    best_params = estimator.get_params()

    r2 = np.mean(r2_cv_scores)
    # The scorer returns negated RMSE; report it as a positive number.
    rmse = np.abs(np.mean(rmse_cv_scores))
    r2_variance = np.var(r2_cv_scores, ddof=1)
    rmse_variance = np.var(rmse_cv_scores, ddof=1)

    y_pred = estimator.predict(test_data)
    r2_validation = r2_score(y_test, y_pred)
    rmse_validation = np.sqrt(mean_squared_error(y_test, y_pred))

    return (r2, rmse, r2_variance, rmse_variance,
            r2_validation, rmse_validation, best_params)
def ada(X, Y, kfold=3, feature_set=None):
    """Tune an AdaBoost regressor and compare it against an untuned baseline.

    Steps:
      1. Split ``X``/``Y`` into train and test with a ``PredefinedSplit``
         built from ``index_splitter``.
      2. Run a randomized search over ``n_estimators`` and ``learning_rate``
         on the training part.
      3. Run a grid search over ``n_estimators`` around the randomized-search
         optimum, with the learning rate fixed to that optimum.
      4. Fit the untuned baseline, compute the test MSE of all three models
         and save three figures to the working directory:
         ``ada_compare_error.png``, ``ada_feature_importance.png`` and
         ``ada_prediction.png``.

    Args:
        X: Feature DataFrame (``.values`` and ``.keys()`` are used).
        Y: Target with a ``.values`` attribute, aligned with ``X``.
        kfold: Passed as ``fold`` to ``index_splitter`` for both splits and
            shown in the plot titles.
        feature_set: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(predict, best_estimator)``: the bound ``predict`` method of
        the fitted grid search and its ``best_estimator_``.
    """
    arr = index_splitter(N=len(X), fold=kfold)
    ps = PredefinedSplit(arr)

    # Keep the last train/test partition the predefined split yields.
    # NOTE(review): presumably index_splitter produces one partition; confirm.
    for train, test in ps.split():
        train_index = train
        test_index = test

    train_X, train_y = X.values[train_index, :], Y.values[train_index]
    test_X, test_y = X.values[test_index, :], Y.values[test_index]

    # Inner split, used as the CV scheme of both searches.
    arr = index_splitter(N=len(train_X), fold=kfold)
    ps2 = PredefinedSplit(arr)

    learning_rate = list(np.linspace(0.1, 1, num=10))
    n_estimators = [int(x) for x in np.linspace(start=20, stop=1000, num=100)]
    loss = ['square']
    random_grid = {
        'n_estimators': n_estimators,
        'learning_rate': learning_rate,
        'loss': loss
    }

    # Base model shared by both searches; also fitted untuned as baseline.
    ada = AdaBoostRegressor(random_state=42, loss='square')
    print('Parameters for baseline:\n')
    pprint(ada.get_params())

    # Randomized search: 200 sampled combinations, all available cores.
    ada_random = RandomizedSearchCV(estimator=ada,
                                    n_iter=200,
                                    param_distributions=random_grid,
                                    scoring='neg_mean_squared_error',
                                    cv=ps2.split(),
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
    ada_random.fit(train_X, train_y)
    pprint(ada_random.best_params_)
    best_random = ada_random.best_params_

    # Grid search around the randomized-search optimum. The previous
    # ``range(best - 10, best + 10, 20)`` produced only ``best - 10``, so the
    # optimum itself was never evaluated. Now best - 10, best and best + 10
    # are tried.
    best_n_estimators = best_random["n_estimators"]
    n_estimators = [
        n for n in range(best_n_estimators - 10, best_n_estimators + 11, 10)
        if n > 0
    ]
    grid_grid = {
        'n_estimators': n_estimators,
        'learning_rate': [best_random['learning_rate']],
        'loss': loss
    }
    ada_grid = GridSearchCV(estimator=ada,
                            param_grid=grid_grid,
                            scoring='neg_mean_squared_error',
                            cv=ps2.split(),
                            verbose=2,
                            n_jobs=-1)
    ada_grid.fit(train_X, train_y)
    pprint(ada_grid.best_params_)

    # Untuned baseline on the same training data.
    ada.fit(train_X, train_y)

    predict_y = ada_random.predict(test_X)
    predict_y_grid = ada_grid.predict(test_X)
    predict_y_base = ada.predict(test_X)

    # Test-set mean squared error of the three models.
    errors_Grid_CV = mean_squared_error(test_y, predict_y_grid)
    errors_Random_CV = mean_squared_error(test_y, predict_y)
    errors_baseline = mean_squared_error(test_y, predict_y_base)
    results = [errors_Grid_CV, errors_Random_CV, errors_baseline]
    print('Adaboot Results:', results)

    # Figure 1: error comparison. Each figure is closed once saved so
    # repeated calls do not accumulate open figures.
    x_axis = range(3)
    fig = plt.figure(figsize=(15, 8))
    plt.bar(x_axis, results)
    plt.xticks(x_axis, ('GridSearchCV', 'RandomizedSearchCV', 'Baseline'))
    plt.savefig('ada_compare_error.png')
    plt.close(fig)

    # Figure 2: feature importances of the grid-search winner, with bars
    # spaced four units apart.
    importances = ada_grid.best_estimator_.feature_importances_
    bar_positions = range(0, len(importances) * 4, 4)
    fig = plt.figure(figsize=(24, 6))
    plt.bar(bar_positions, importances)
    plt.xticks(bar_positions, X.keys())
    plt.title("Feature Importances" + ",kfold=" + str(kfold))
    plt.savefig('ada_feature_importance.png')
    plt.close(fig)

    # Figure 3: grid-search predictions against ground truth.
    fig = plt.figure(figsize=(20, 8))
    ax = fig.gca()
    x_label = range(0, len(predict_y_grid))
    plt.title("kfold=" + str(kfold))
    ax.plot(x_label, predict_y_grid, 'r--', label="predict")
    ax.plot(x_label, test_y, label="ground_truth")
    ax.set_ylim(0, 200)
    ax.legend()
    plt.savefig('ada_prediction.png')
    plt.close(fig)

    return ada_grid.predict, ada_grid.best_estimator_
# Script section: fit an AdaBoost regressor whose weak learner is a random
# forest on the yacht data, print its scores, then grid-search the ensemble.
# `yacht` and the feature matrix `X` are expected to be defined earlier in
# the script.
y = yacht["resid_resist"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# MinMaxScaler gives better results than StandardScaler
scaler = MinMaxScaler()
# The scaler is fitted on the training split only and reused for the test split.
train_scaled = scaler.fit_transform(X_train)
test_scaled = scaler.transform(X_test)
model = AdaBoostRegressor(base_estimator=RandomForestRegressor())
model.fit(train_scaled, y_train)
# NOTE(review): the printed label says "Accuracy", but for a regressor
# score() is presumably R^2, so confirm and consider relabelling.
print("Accuracy on train data: ", round(model.score(train_scaled, y_train)*100, 2), "%")
print("Accuracy on test data: ", round(model.score(test_scaled, y_test)*100, 2), "%")
print("Parameters: ", model.get_params())
print("MAE: ", mean_absolute_error(y_test, model.predict(test_scaled)))
# TODO: the grid can be improved
# The `base_estimator__` prefix routes the parameter to the nested random forest.
gridParams = { "n_estimators": [200], 'base_estimator__n_estimators': np.arange(1, 20)}
grid = GridSearchCV(model, gridParams, verbose=1, cv=5)
grid.fit(train_scaled, y_train)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)
params = grid.best_params_
# Script section: compare a single decision tree with an AdaBoost ensemble of
# decision trees on a noisy two-frequency sine target.
# `X`, `X1` and `rng` are expected to be defined earlier in the script.

# Target: sin(x) + sin(6x) plus Gaussian noise with standard deviation 0.1.
y = np.sin(X1).ravel() + np.sin(6 * X1).ravel() + rng.normal(
    0, 0.1, X1.shape[0])
print(X.shape, y.shape)
# Fit regression model
regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300, random_state=rng)
regr_1.fit(X, y)
regr_2.fit(X, y)
# Predict (on the training inputs themselves)
y_1 = regr_1.predict(X)
y_2 = regr_2.predict(X)
print(regr_2.get_params())
# Plot the results (disabled)
# plt.figure()
# plt.scatter(X1, y, c="k", label="training samples")
# plt.plot(X1, y_1, c="g", label="n_estimators=1", linewidth=2)
# plt.plot(X1, y_2, c="r", label="n_estimators=300", linewidth=2)
# plt.xlabel("data")
# plt.ylabel("target")
# plt.title("Boosted Decision Tree Regression")
# plt.legend()
# plt.show()