def train(self):
    """Fit an NGBoost regressor on the train split, save predicted-vs-true
    scatter plots for the train and validation splits, and log metrics.

    Returns:
        dict: metrics computed by ``utils.evaluate_metrics`` on the val split.

    Fixes: removed the unused ``var_train``/``var_val`` placeholders (NGBoost's
    ``predict`` returns point estimates only) and the commented-out save call;
    figures are now closed explicitly to avoid leaking matplotlib state.
    """
    X_train, y_train, _ = self.load_results_from_result_paths(self.train_paths)
    X_val, y_val, _ = self.load_results_from_result_paths(self.val_paths)
    base_learner_config = self.parse_config("base:")
    param_config = self.parse_config("param:")

    # Train: NGBoost with a Friedman-MSE decision tree as base learner.
    base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                         random_state=None,
                                         splitter='best',
                                         **base_learner_config)
    self.model = NGBRegressor(Dist=Normal,
                              Base=base_learner,
                              Score=LogScore,
                              verbose=True,
                              **param_config)
    self.model = self.model.fit(
        X_train, y_train,
        X_val=X_val, Y_val=y_val,
        early_stopping_rounds=self.model_config["early_stopping_rounds"])

    train_pred = self.model.predict(X_train)
    val_pred = self.model.predict(X_val)

    fig_train = utils.scatter_plot(np.array(train_pred), np.array(y_train),
                                   xlabel='Predicted', ylabel='True', title='')
    fig_train.savefig(os.path.join(self.log_dir, 'pred_vs_true_train.jpg'))
    plt.close(fig_train)  # close this specific figure, not just the current one

    fig_val = utils.scatter_plot(np.array(val_pred), np.array(y_val),
                                 xlabel='Predicted', ylabel='True', title='')
    fig_val.savefig(os.path.join(self.log_dir, 'pred_vs_true_val.jpg'))
    plt.close(fig_val)

    train_metrics = utils.evaluate_metrics(y_train, train_pred,
                                           prediction_is_first_arg=False)
    valid_metrics = utils.evaluate_metrics(y_val, val_pred,
                                           prediction_is_first_arg=False)
    logging.info('train metrics: %s', train_metrics)
    logging.info('valid metrics: %s', valid_metrics)
    return valid_metrics
def fit(self, xtrain, ytrain, train_info, learn_hyper=True):
    """Standardize labels, build features, run HPO and fit an NGBoost model."""
    # Below the minimum train size we fall back to zero-cost / LCE info only.
    if len(xtrain) < self.min_train_size:
        self.trained = False
        return None

    self.trained = True
    self.train_size = len(xtrain)

    # Standardize the labels; mean/std are kept for the inverse transform.
    self.mean = np.mean(ytrain)
    self.std = np.std(ytrain)
    labels = (np.array(ytrain) - self.mean) / self.std
    features = self.prepare_features(xtrain, train_info, train=True)

    best = self.run_hpo(features, labels)

    # todo: this code is repeated in cross_validate
    tree = DecisionTreeRegressor(criterion='friedman_mse',
                                 random_state=None,
                                 splitter='best',
                                 **parse_params(best, 'base:'))
    self.model = NGBRegressor(Dist=Normal,
                              Base=tree,
                              Score=LogScore,
                              verbose=True,
                              **parse_params(best, 'param:'))
    self.model.fit(features, labels)
def fixture_learners_data(breast_cancer_data, boston_data, boston_survival_data):
    """
    Returns:
        A list of iterables, each iterable containing a fitted model and X data
        and the predictions for the X_data
    """
    models_data = []

    # Classification example.
    X_class_train, _, Y_class_train, _ = breast_cancer_data
    clf = NGBClassifier(verbose=False, n_estimators=10)
    clf.fit(X_class_train, Y_class_train)
    models_data.append((clf, X_class_train, clf.predict(X_class_train)))

    # Regression example.
    X_reg_train, _, Y_reg_train, _ = boston_data
    reg = NGBRegressor(verbose=False, n_estimators=10)
    reg.fit(X_reg_train, Y_reg_train)
    models_data.append((reg, X_reg_train, reg.predict(X_reg_train)))

    # Survival example (event time + indicator).
    X_surv_train, _, T_surv_train, E_surv_train, _ = boston_survival_data
    surv = NGBSurvival(verbose=False, n_estimators=10)
    surv.fit(X_surv_train, T_surv_train, E_surv_train)
    models_data.append((surv, X_surv_train, surv.predict(X_surv_train)))

    # Multivariate-normal regression on the stacked (time, event) target.
    mvn = NGBRegressor(Dist=MultivariateNormal(2), n_estimators=10)
    mvn.fit(X_surv_train, np.vstack([T_surv_train, E_surv_train]).T)
    models_data.append((mvn, X_surv_train, mvn.predict(X_surv_train)))

    return models_data
class ModelNgbRegressor(Model):
    """Model wrapper around NGBRegressor with file-based persistence.

    Fix: ``train`` no longer dereferences ``va_x.values`` unconditionally, so
    calling it without validation data (the documented defaults) works.
    """

    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
        """Fit the model; ``te_x`` is accepted for interface parity but unused."""
        # Hyperparameter setup; early stopping rounds are handled separately.
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds')
        self.model = NGBRegressor(**params)
        if va_x is not None and va_y is not None:
            # Positional 3rd/4th args are X_val/Y_val in NGBRegressor.fit.
            self.model.fit(tr_x.values, tr_y.astype(int).values,
                           va_x.values, va_y.astype(int).values,
                           early_stopping_rounds=early_stopping_rounds)
        else:
            self.model.fit(tr_x.values, tr_y.astype(int).values)

    def predict(self, te_x):
        """Return point predictions for ``te_x``."""
        return self.model.predict(te_x.values)

    def save_model(self):
        """Serialize the fitted model under ../output/model."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        """Restore a previously saved model."""
        model_path = os.path.join('../output/model', f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
def ngb_Normal():
    """Fit NGBoost with a Normal predictive distribution; print test metrics."""
    model = NGBRegressor(Dist=Normal).fit(X_train, Y_train)
    # Expose the fitted model at module level, as the original did.
    globals()['ngb_Normal'] = model

    Y_preds = model.predict(X_test)
    Y_dists = model.pred_dist(X_test)

    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('Test MSE_Normal', test_MSE)

    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL_Normal', test_NLL)
def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):
    """Fit an NGBRegressor; validation data enables early stopping.

    Fix: the original unconditionally dereferenced ``va_x.values`` even though
    the parameter defaults to None; validation data is now truly optional.
    ``te_x`` is accepted for interface parity but unused.
    """
    # Hyperparameter setup; early stopping rounds are handled separately.
    params = dict(self.params)
    early_stopping_rounds = params.pop('early_stopping_rounds')
    self.model = NGBRegressor(**params)
    if va_x is not None and va_y is not None:
        # Positional 3rd/4th args are X_val/Y_val in NGBRegressor.fit.
        self.model.fit(tr_x.values, tr_y.astype(int).values,
                       va_x.values, va_y.astype(int).values,
                       early_stopping_rounds=early_stopping_rounds)
    else:
        self.model.fit(tr_x.values, tr_y.astype(int).values)
def feature_importance():
    """Plot per-parameter (loc/scale) feature importances of a fitted NGBoost."""
    global ngb_Normal
    ngb_Normal = NGBRegressor(verbose=True).fit(X_train, Y_train)

    # Feature importance for loc trees
    feature_importance_loc = ngb_Normal.feature_importances_[0]
    # Feature importance for scale trees
    feature_importance_scale = ngb_Normal.feature_importances_[1]

    # Build one sorted dataframe per distribution parameter.
    df_loc = pd.DataFrame({
        'feature': load_boston()['feature_names'],
        'importance': feature_importance_loc
    }).sort_values('importance', ascending=False)
    df_scale = pd.DataFrame({
        'feature': load_boston()['feature_names'],
        'importance': feature_importance_scale
    }).sort_values('importance', ascending=False)

    # Draw side-by-side bar plots with seaborn.
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))
    fig.suptitle("Feature importance plot for distribution parameters",
                 fontsize=17)
    sns.barplot(x='importance', y='feature', ax=ax1, data=df_loc,
                color="skyblue").set_title('loc param')
    sns.barplot(x='importance', y='feature', ax=ax2, data=df_scale,
                color="skyblue").set_title('scale param')
    plt.show()
def frc_plain_ngboost(num_iterations, learning_rate, validation_test_size,
                      X_train, y_train, X_test):
    """Fit a plain NGBoost model with an internal validation split and forecast."""
    # ngboost
    model = NGBRegressor(learning_rate=learning_rate,
                         n_estimators=num_iterations)

    # Carve a validation set out of the training data.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=validation_test_size)

    # Fit NGBoost with the held-out validation set.
    model.fit(X_fit, y_fit, X_val=X_val, Y_val=y_val)

    # differences regarding the reference promotions
    return model.predict(X_test)
class myNGBoostBinary:
    """Binary classifier built on NGBRegressor: regress, then threshold.

    Fixes: the mutable default argument ``fit_params={}`` is replaced with a
    None sentinel, and ``type(x) == ...`` / ``type(None)`` comparisons are
    replaced with ``isinstance`` / ``is None`` checks (same behavior).
    """

    def make(self, params):
        """Instantiate the underlying NGBRegressor and return self (fluent)."""
        self.model = NGBRegressor(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        """Fit the regressor; a validation pair is forwarded as X_val/Y_val."""
        fit_params = {} if fit_params is None else fit_params
        if isinstance(xtrain, pd.DataFrame):
            xtrain = xtrain.values
            ytrain = ytrain.values
            # assumes the validation pair is also pandas when training data is
            # — TODO confirm against callers
            if xtest is not None and ytest is not None:
                xtest = xtest.values
                ytest = ytest.values
        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(xtrain, ytrain, X_val=xtest, Y_val=ytest,
                           **fit_params)

    def predict(self, xs, threshold=0.5):
        """Return hard 0/1 labels by thresholding the regression output."""
        return np.where(self.model.predict(xs) > threshold, 1, 0)

    def predict_proba(self, xs):
        """Return raw regression outputs as pseudo-probabilities."""
        if len(xs.shape) == 1:
            return self.model.predict(xs.reshape(1, -1))
        # NOTE(review): NGBRegressor.predict returns a 1-D array, so the
        # [:, 1] indexing below looks like it would raise IndexError —
        # confirm intended behavior (perhaps pred_dist was meant).
        return self.model.predict(xs)[:, 1]
def train(self, train_data):
    """Fit an NGBoost model, sizing tree split parameters to the data."""
    X_train, y_train = train_data

    # Clamp leaf/split sizes so tiny training sets remain fittable.
    min_samples_leaf = min(max(len(X_train) // 2, 1), 15)
    min_samples_split = min(max(len(X_train) // 2, 2), 20)

    tree = DecisionTreeRegressor(criterion='friedman_mse',
                                 min_samples_leaf=min_samples_leaf,
                                 min_samples_split=min_samples_split,
                                 random_state=None,
                                 splitter='best',
                                 **self.parameters(identifier='base:'))
    regressor = NGBRegressor(Dist=Normal,
                             Base=tree,
                             Score=LogScore,
                             verbose=True,
                             **self.parameters(identifier='param:'))
    return regressor.fit(X_train, y_train)
def objective(trial):
    """Optuna objective: minimize 1 - R2 of cross-validated NGBoost."""
    search_space = {
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
        'n_estimators': trial.suggest_int('n_estimators', 100, 800),
        'minibatch_frac': trial.suggest_discrete_uniform('minibatch_frac',
                                                         0.1, 0.9, 0.1),
    }
    model = NGBRegressor(**search_space, Base=best_base, Dist=Normal,
                         Score=MLE(), natural_gradient=True, verbose=False)
    cv_predictions = model_selection.cross_val_predict(model, train_x, train_y,
                                                       cv=fold_number)
    return 1.0 - metrics.r2_score(train_y, cv_predictions)
def objective(params):
    """Hyperopt objective: validation log-score at the best iteration."""
    params.update(default_params)
    # Even with many n_estimators configured, early stopping may end training
    # well before all iterations run.
    ngb = NGBRegressor(**params, verbose=False).fit(
        X_train, y_train,
        X_val=X_validation, Y_val=y_validation,
        early_stopping_rounds=2)
    loss = ngb.evals_result['val']['LOGSCORE'][ngb.best_val_loss_itr]
    return {'loss': loss, 'status': STATUS_OK}
def ngb_cv():
    """Grid-search NGBoost hyperparameters, refit, and print test metrics."""
    print("====================================")
    shallow_tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=2)
    deep_tree = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
    param_grid = {'minibatch_frac': [1.0, 0.5],
                  'Base': [shallow_tree, deep_tree]}

    searcher = GridSearchCV(NGBRegressor(Dist=Normal, verbose=True),
                            param_grid=param_grid, cv=3)
    searcher.fit(X_train, Y_train)
    best_params = searcher.best_params_
    print(best_params)

    model = NGBRegressor(Dist=Normal, verbose=True, **best_params).fit(X_train,
                                                                       Y_train)
    # Expose the refit model at module level, as the original did.
    globals()['ngb_cv'] = model

    Y_preds = model.predict(X_test)
    Y_dists = model.pred_dist(X_test)

    # test Mean Squared Error
    test_MSE_CV = mean_squared_error(Y_preds, Y_test)
    print('Test MSE_CV', test_MSE_CV)

    # test Negative Log Likelihood
    test_NLL_CV = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL_CV', test_NLL_CV)
def cross_validate(self, xtrain, ytrain, params):
    """Score one NGBoost configuration with 3-fold cross-validation."""
    tree = DecisionTreeRegressor(criterion='friedman_mse',
                                 random_state=None,
                                 splitter='best',
                                 **parse_params(params, 'base:'))
    candidate = NGBRegressor(Dist=Normal,
                             Base=tree,
                             Score=LogScore,
                             verbose=False,
                             **parse_params(params, 'param:'))
    return np.mean(cross_val_score(candidate, xtrain, ytrain, cv=3))
def model_test_for_esn_base(Base, esn_param, X_train, X_test, Y_train, Y_test,
                            n_estimators=500, learning_rate=0.01, Score=MLE,
                            Dist=Normal, verbose=True, verbose_eval=100,
                            plot_predict=True, return_y_pred=False,
                            return_y_dists=False, return_mse=False):
    """Transform features through an echo state network, fit an NGBoost model,
    print MSE/NLL, and optionally plot and/or return predictions.

    Fix: the test set is now transformed with the ESN fitted on the training
    data (``transform``) instead of being re-fitted on the test data
    (``fit_transform``), which leaked test data and re-randomized the
    reservoir (``random_state=None``) between train and test.
    """
    ESN = SimpleESN(n_readout=esn_param['n_readout'],
                    n_components=esn_param['n_components'],
                    damping=esn_param['damping'],
                    weight_scaling=esn_param['weight_scaling'],
                    discard_steps=0,
                    random_state=None)
    X_train = ESN.fit_transform(X_train)
    X_test = ESN.transform(X_test)

    ngb = NGBRegressor(Base=Base, n_estimators=n_estimators, verbose=verbose,
                       verbose_eval=verbose_eval, learning_rate=learning_rate,
                       Dist=Dist, Score=Score)
    print(ESN, '\n')
    print(ngb, '\n')
    ngb.fit(X_train, Y_train)

    Y_preds = ngb.predict(X_test)
    Y_dists = ngb.pred_dist(X_test)  # return norm method: mean std

    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('\nTest MSE', test_MSE)

    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL', test_NLL)

    if plot_predict:
        df = pd.concat([Y_test, pd.Series(Y_preds, index=Y_test.index)],
                       axis=1)
        df.columns = ['test', 'pred']
        df.plot(figsize=(10, 4),
                title='MSE:{} NLL:{}'.format(round(test_MSE, 4),
                                             round(test_NLL, 4)))

    if return_y_pred and not return_y_dists:
        return pd.Series(Y_preds, index=Y_test.index)
    if not return_y_pred and return_y_dists:
        return Y_dists
    if return_y_pred and return_y_dists:
        return pd.Series(Y_preds, index=Y_test.index), Y_dists
    if return_mse:
        return test_MSE
def test_dists_runs_on_examples_crpscore(dist: Distn, learner, boston_data: Tuple4Array):
    """Smoke test: fit/predict/pred_dist run for each distribution under CRPScore."""
    X_train, X_test, y_train, y_test = boston_data
    # TODO: test early stopping features
    model = NGBRegressor(Dist=dist, Score=CRPScore, Base=learner, verbose=False)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    predicted_dist = model.pred_dist(X_test)
def train(self, train_data):
    """Fit NGBoost with fixed tree/minibatch settings that keep CV stable."""
    X_train, y_train = train_data

    # note: cross-validation will error unless these values are set:
    min_samples_leaf = 1
    min_samples_split = 2
    minibatch_frac = 0.5

    tree = DecisionTreeRegressor(
        criterion='friedman_mse',
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=None,
        splitter='best',
        **parse_params(self.hyperparams, identifier='base:'))
    regressor = NGBRegressor(
        Dist=Normal,
        Base=tree,
        Score=LogScore,
        minibatch_frac=minibatch_frac,
        verbose=True,
        **parse_params(self.hyperparams, identifier='param:'))
    return regressor.fit(X_train, y_train)
def test_dists(self, learners, reg_dists, reg_data):
    """Smoke test every (distribution, score, base learner) combination."""
    X_reg_train, X_reg_test, Y_reg_train, Y_reg_test = reg_data
    for Dist, Scores in reg_dists.items():
        for Score in Scores:
            for Learner in learners:
                # test early stopping features
                model = NGBRegressor(Dist=Dist, Score=Score, Base=Learner,
                                     verbose=False)
                model.fit(X_reg_train, Y_reg_train)
                predictions = model.predict(X_reg_test)
                predicted_dist = model.pred_dist(X_reg_test)
def objective(params):
    """Hyperopt objective: fit with early stopping; return the val log-score."""
    params.update(default_params)
    print("current params:", params)
    # Even with many n_estimators configured, early stopping may end training
    # well before all iterations run.
    ngb = NGBRegressor(**params).fit(
        X_train, y_train,
        X_val=X_validation, Y_val=y_validation,
        early_stopping_rounds=2)
    loss = ngb.evals_result['val']['LOGSCORE'][ngb.best_val_loss_itr]
    logger.info("current params:{}".format(params))
    return {'loss': loss, 'status': STATUS_OK}
def test_regression(boston_data):
    """NGBoost regression sanity checks on the Boston housing fixture.

    Fix: the original recomputed and re-asserted the identical MSE from the
    same ``preds`` a second time at the end; the redundant duplicate
    assertion is removed.
    """
    from sklearn.metrics import mean_squared_error

    x_train, x_test, y_train, y_test = boston_data
    ngb = NGBRegressor(verbose=False)
    ngb.fit(x_train, y_train)

    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 15

    # score() should satisfy the same bound as the explicit metric.
    score = ngb.score(x_test, y_test)
    assert score <= 15

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Normal)
def test_regression():
    """End-to-end regression sanity check on the Boston housing dataset."""
    from sklearn.datasets import load_boston
    from sklearn.metrics import mean_squared_error

    data, target = load_boston(True)
    x_train, x_test, y_train, y_test = train_test_split(data, target,
                                                        test_size=0.2,
                                                        random_state=42)
    ngb = NGBRegressor(verbose=False)
    ngb.fit(x_train, y_train)

    preds = ngb.predict(x_test)
    assert mean_squared_error(y_test, preds) <= 8.0
    assert ngb.score(x_test, y_test) <= 8.0

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Normal)

    # Point predictions derived from the distribution satisfy the same bound.
    preds = ngb.dist_to_prediction(dist)
    assert mean_squared_error(y_test, preds) <= 8.0
def choose_ML_alg(self):
    """Return the list of candidate regression models to evaluate."""
    return [
        RANSACRegressor(),
        HuberRegressor(),
        LinearRegression(),
        ElasticNet(),
        ElasticNetCV(),
        Lars(),
        Lasso(),
        LassoLars(),
        LassoLarsIC(),
        OrthogonalMatchingPursuit(),
        OrthogonalMatchingPursuitCV(),
        Ridge(),
        SGDRegressor(),
        RandomForestRegressor(),
        GradientBoostingRegressor(),
        AdaBoostRegressor(),
        NGBRegressor(Dist=Normal),
        DecisionTreeRegressor(),
    ]
def test_multivariatenormal(k: int, learner):
    """Fit NGBoost with a k-dimensional MultivariateNormal on synthetic data.

    Fix: the parameter was annotated ``k: 2`` — an integer literal, not a
    type — which is almost certainly a typo for ``k: int``.
    """
    dist = MultivariateNormal(k)

    # Generate some sample data
    N = 500
    X_train = np.random.randn(N, k)
    y_fns = [np.sin, np.cos, np.exp]
    y_cols = [
        fn(X_train[:, num_col]).reshape(-1, 1) + np.random.randn(N, 1)
        for num_col, fn in enumerate(y_fns[:k])
    ]
    y_train = np.hstack(y_cols)
    X_test = np.random.randn(N, k)

    ngb = NGBRegressor(Dist=dist, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_train, y_train)
    y_pred = ngb.predict(X_test)
    y_dist = ngb.pred_dist(X_test)

    # Exercise the distribution API surface.
    mean = y_dist.mean
    sample = y_dist.rv()
    scipy_list = y_dist.scipy_distribution()
from ngboost import NGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the Boston housing dataset.
X, Y = load_boston(return_X_y=True)

# Split into train and test sets (20% held out for testing).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Fit an NGBRegressor with default settings.
ngb = NGBRegressor().fit(X_train, Y_train)
Y_preds = ngb.predict(X_test)

# Point-estimate quality: mean squared error.
test_MSE = mean_squared_error(Y_preds, Y_test)
print('MSE', test_MSE)

# Probabilistic quality: negative log likelihood of the test targets under
# the predicted distributions.
Y_dists = ngb.pred_dist(X_test)
test_NLL = -Y_dists.logpdf(Y_test.flatten()).mean()
print('NLL', test_NLL)
# Fit and predict rf = RandomForestRegressor(n_estimators=400, random_state=SEED).fit(X_train, y_train) y_pred = rf.predict(X_test) print('Random Forest: R2 score on testing data: {:.2f}%'.format( 100 * r2_score(y_test, y_pred))) # Fit and predict lgb = LGBMRegressor(n_estimators=400, random_state=SEED).fit(X_train, y_train) y_pred = lgb.predict(X_test) print('LightGBM: R2 score on testing data: {:.2f}%'.format( 100 * r2_score(y_test, y_pred))) # Fit and predict np.random.seed(SEED) ngb = NGBRegressor(n_estimators=400, Base=default_tree_learner, Dist=Normal, Score=MLE).fit(X_train, y_train) y_pred = ngb.predict(X_test) print('NGBoost: R2 score on testing data: {:.2f}%'.format( 100 * r2_score(y_test, y_pred))) # Probability distribution obs_idx = [0, 1] dist = ngb.pred_dist(X_test[obs_idx, :]) print('P(y_0|x_0) is normally distributed with loc={:.2f} and scale={:.2f}'. format(dist.loc[0], dist.scale[0])) print('P(y_1|x_1) is normally distributed with loc={:.2f} and scale={:.2f}'. format(dist.loc[1], dist.scale[1]))
# NOTE(review): fragment — the opening of the train_test_split(...) call lies
# outside the visible chunk; the code below starts at its trailing arguments.
    Y, test_size=0.2, random_state=SEED)

# baseline not using predictor data: score the mean of the training target
# under the same distribution family.
avg_tds = np.mean(Y_train)
y_dist = dist(avg_tds)
naive_NLL = -y_dist.logpmf(Y_test).mean()
print("Mean squared error using only the mean: {:.4f}".format(
    mean_squared_error(np.repeat(avg_tds, len(Y_test)), Y_test)))
print(
    "Poisson negative log liklihood without using predictor variables: {:.4f}"
    .format(naive_NLL))

# NGBoost with a Poisson predictive distribution (count-valued target).
ngb = NGBRegressor(Dist=Poisson)
ngb.fit(X_train, Y_train)
Y_preds = ngb.predict(X_test)
Y_dists = ngb.pred_dist(X_test)

# test Mean Squared Error
test_MSE = mean_squared_error(Y_preds, Y_test)
print("NGBoost MSE: {:.4f}".format(test_MSE))

# test Negative Log Likelihood (logpmf, since Poisson is discrete)
test_NLL = -Y_dists.logpmf(Y_test.flatten()).mean()
print("NGBoost NLL: {:.4f}".format(test_NLL))

# Let's see if we can improve by dropping confounding variables
def model_test(Base, X_train, X_test, Y_train, Y_test, n_estimators=500,
               learning_rate=0.01, Score=MLE, Dist=Normal, verbose=True,
               verbose_eval=100, plot_predict=True, return_y_pred=False,
               return_y_dists=False, return_mse=False, Y_scaler=None):
    """Fit an NGBoost model, print MSE/NLL, optionally plot predictions
    (inverse-scaled when a ``Y_scaler`` is given) and return results per the
    ``return_*`` flags."""
    ngb = NGBRegressor(Base=Base, n_estimators=n_estimators, verbose=verbose,
                       verbose_eval=verbose_eval, learning_rate=learning_rate,
                       Dist=Dist, Score=Score)
    print(ngb, '\n')
    ngb.fit(X_train, Y_train)

    Y_preds = ngb.predict(X_test)
    Y_dists = ngb.pred_dist(X_test)  # return norm method: mean std

    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('\nTest MSE', test_MSE)

    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL', test_NLL)

    if plot_predict:
        if Y_scaler is not None:
            # Undo target scaling before plotting.
            true_series = pd.Series(
                Y_scaler.inverse_transform(
                    Y_test.copy().values.reshape(-1, 1)).reshape(-1, ),
                index=Y_test.index)
            pred_series = pd.Series(
                Y_scaler.inverse_transform(
                    np.array(Y_preds).reshape(-1, 1)).reshape(-1, ),
                index=Y_test.index)
            df = pd.concat([true_series, pred_series], axis=1)
        else:
            df = pd.concat([Y_test, pd.Series(Y_preds, index=Y_test.index)],
                           axis=1)
        df.columns = ['test', 'pred']
        df.plot(figsize=(10, 4),
                title='MSE:{} NLL:{}'.format(round(test_MSE, 4),
                                             round(test_NLL, 4)))

    if return_y_pred and not return_y_dists:
        return pd.Series(Y_preds, index=Y_test.index)
    if not return_y_pred and return_y_dists:
        return Y_dists
    if return_y_pred and return_y_dists:
        return pd.Series(Y_preds, index=Y_test.index), Y_dists
    if return_mse:
        return test_MSE
# NOTE(review): fragment — the opening of a train_test_split(...) call lies
# outside the visible chunk; the code below starts at its final argument.
    random_state=1)

# delete intermediate variables
del X_intermediate, y_intermediate

# print proportions of the train/validation/test split
print('train: {}% | validation: {}% | test {}%'.format(
    round(len(y_train) / len(target), 2),
    round(len(y_validation) / len(target), 2),
    round(len(y_test) / len(target), 2)))

# Fit with a held-out validation set; even with many n_estimators configured,
# early stopping may end training well before all iterations run.
ngb = NGBRegressor().fit(
    X_train,
    y_train,
    X_val=X_validation,
    Y_val=y_validation,
    early_stopping_rounds=2)

y_pred = ngb.predict(X_test)
print("y_pred=", y_pred)
print("y_test=", y_test)
test_MSE = mean_squared_error(y_pred, y_test)
print('Test MSE_ngb', test_MSE)
logger.info("...done")

plt.figure(figsize=(8, 6))
plt.scatter(x=y_pred, y=y_test, s=20)
# Scatter predicted vs. true values; the closer the points lie to the
# diagonal, the better the fit.
class OmniPredictor(Predictor):
    """Performance predictor that fits an NGBoost model on a mix of
    zero-cost-proxy scores, learning-curve (LCE) statistics and, optionally,
    an architecture encoding.

    Fix: ``__init__`` assigned ``self.config`` and ``self.lce`` twice; the
    duplicate assignments are removed.
    """

    def __init__(self, zero_cost, lce, encoding_type, ss_type=None,
                 config=None, n_hypers=35, run_pre_compute=True,
                 min_train_size=0, max_zerocost=np.inf):
        self.zero_cost = zero_cost
        self.lce = lce
        self.encoding_type = encoding_type
        self.config = config
        self.n_hypers = n_hypers
        self.ss_type = ss_type
        self.run_pre_compute = run_pre_compute
        self.min_train_size = min_train_size
        self.max_zerocost = max_zerocost

    def pre_compute(self, xtrain, xtest):
        """
        All of this computation could go into fit() and query(), but we do it
        here to save time, so that we don't have to re-compute Jacobian
        covariances for all train_sizes when running experiment_types that
        vary train size or fidelity.
        """
        self.xtrain_zc_info = {}
        self.xtest_zc_info = {}
        if len(self.zero_cost) > 0:
            self.train_loader, _, _, _, _ = utils.get_train_val_loaders(
                self.config, mode='train')
            for method_name in self.zero_cost:
                zc_method = ZeroCostEstimators(self.config, batch_size=64,
                                               method_type=method_name)
                zc_method.train_loader = copy.deepcopy(self.train_loader)
                xtrain_zc_scores = zc_method.query(xtrain)
                xtest_zc_scores = zc_method.query(xtest)

                # Normalize test scores with *train* statistics so no test
                # information leaks into the normalization.
                train_mean = np.mean(np.array(xtrain_zc_scores))
                train_std = np.std(np.array(xtrain_zc_scores))
                normalized_train = (np.array(xtrain_zc_scores) - train_mean) / train_std
                normalized_test = (np.array(xtest_zc_scores) - train_mean) / train_std
                self.xtrain_zc_info[f'{method_name}_scores'] = normalized_train
                self.xtest_zc_info[f'{method_name}_scores'] = normalized_test

    def get_random_params(self):
        """Sample a random NGBoost + base-tree hyperparameter configuration."""
        params = {
            'param:n_estimators': int(loguniform(128, 512)),
            'param:learning_rate': loguniform(.001, .1),
            'param:minibatch_frac': np.random.uniform(.1, 1),
            'base:max_depth': np.random.choice(24) + 1,
            'base:max_features': np.random.uniform(.1, 1),
            'base:min_samples_leaf': np.random.choice(18) + 2,
            'base:min_samples_split': np.random.choice(18) + 2,
        }
        return params

    def run_hpo(self, xtrain, ytrain):
        """Random-search hyperparameters; return the best CV configuration."""
        min_score = 100000
        best_params = None
        for i in range(self.n_hypers):
            params = self.get_random_params()
            # Keep leaf/split sizes feasible for 3-fold CV on small data.
            for key in ['base:min_samples_leaf', 'base:min_samples_split']:
                params[key] = max(2, min(params[key], int(len(xtrain) / 3) - 1))
            # NOTE(review): "best" here is the *minimum* mean CV score — confirm
            # that cross_validate's scorer is a loss, not a higher-is-better
            # metric such as R^2.
            score = self.cross_validate(xtrain, ytrain, params)
            if score < min_score:
                min_score = score
                best_params = params
                logger.info('{} new best {}, {}'.format(i, score, params))
        return best_params

    def cross_validate(self, xtrain, ytrain, params):
        """Mean 3-fold CV score of an NGBoost model built from ``params``."""
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        model = NGBRegressor(Dist=Normal,
                             Base=base_learner,
                             Score=LogScore,
                             verbose=False,
                             **parse_params(params, 'param:'))
        scores = cross_val_score(model, xtrain, ytrain, cv=3)
        return np.mean(scores)

    def prepare_features(self, xdata, info, train=True):
        """Assemble per-architecture feature vectors from zero-cost scores,
        learning-curve statistics and (optionally) the architecture encoding."""
        full_xdata = [[] for _ in range(len(xdata))]
        if len(self.zero_cost) > 0 and self.train_size <= self.max_zerocost:
            if self.run_pre_compute:
                for key in self.xtrain_zc_info:
                    if train:
                        full_xdata = [[*x, self.xtrain_zc_info[key][i]]
                                      for i, x in enumerate(full_xdata)]
                    else:
                        full_xdata = [[*x, self.xtest_zc_info[key][i]]
                                      for i, x in enumerate(full_xdata)]
            else:
                # if the zero_cost scores were not precomputed, they are in info
                full_xdata = [[*x, info[i]] for i, x in enumerate(full_xdata)]

        if 'sotle' in self.lce and len(info[0]['TRAIN_LOSS_lc']) >= 3:
            train_losses = np.array([lcs['TRAIN_LOSS_lc'][-1] for lcs in info])
            mean = np.mean(train_losses)
            std = np.std(train_losses)
            normalized = (train_losses - mean) / std
            full_xdata = [[*x, normalized[i]]
                          for i, x in enumerate(full_xdata)]
        elif 'sotle' in self.lce and len(info[0]['TRAIN_LOSS_lc']) < 3:
            logger.info('Not enough fidelities to use train loss')

        if 'valacc' in self.lce and len(info[0]['VAL_ACCURACY_lc']) >= 3:
            val_accs = [lcs['VAL_ACCURACY_lc'][-1] for lcs in info]
            mean = np.mean(val_accs)
            std = np.std(val_accs)
            normalized = (val_accs - mean) / std
            full_xdata = [[*x, normalized[i]]
                          for i, x in enumerate(full_xdata)]

        if self.encoding_type is not None:
            xdata_encoded = np.array([
                encode(arch, encoding_type=self.encoding_type,
                       ss_type=self.ss_type) for arch in xdata
            ])
            full_xdata = [[*x, *xdata_encoded[i]]
                          for i, x in enumerate(full_xdata)]
        return np.array(full_xdata)

    def fit(self, xtrain, ytrain, train_info, learn_hyper=True):
        """Standardize labels, run HPO and fit the final NGBoost model."""
        # if we are below the min train size, use the zero_cost and lce info
        if len(xtrain) < self.min_train_size:
            self.trained = False
            return None
        self.trained = True
        self.train_size = len(xtrain)

        # prepare training data labels (standardized; stats kept for query())
        self.mean = np.mean(ytrain)
        self.std = np.std(ytrain)
        ytrain = (np.array(ytrain) - self.mean) / self.std
        xtrain = self.prepare_features(xtrain, train_info, train=True)

        params = self.run_hpo(xtrain, ytrain)

        # todo: this code is repeated in cross_validate
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        self.model = NGBRegressor(Dist=Normal,
                                  Base=base_learner,
                                  Score=LogScore,
                                  verbose=True,
                                  **parse_params(params, 'param:'))
        self.model.fit(xtrain, ytrain)

    def query(self, xtest, info):
        """Predict de-standardized labels, or echo ``info`` when not trained."""
        if self.trained:
            test_data = self.prepare_features(xtest, info, train=False)
            return np.squeeze(
                self.model.predict(test_data)) * self.std + self.mean
        else:
            logger.info('below the train size, so returning info')
            return info

    def get_data_reqs(self):
        """
        Returns a dictionary with info about whether the predictor needs
        extra info to train/query.
        """
        if len(self.lce) > 0:
            # add the metrics needed for the lce predictors
            required_metric_dict = {
                'sotle': Metric.TRAIN_LOSS,
                'valacc': Metric.VAL_ACCURACY
            }
            self.metric = [required_metric_dict[key] for key in self.lce]
            reqs = {
                'requires_partial_lc': True,
                'metric': self.metric,
                'requires_hyperparameters': False,
                'hyperparams': {}
            }
        else:
            reqs = super().get_data_reqs()
        return reqs
from ngboost import NGBRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

if __name__ == "__main__":
    # Keep 80% of the Boston data for the hyperparameter search.
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    search_space = {
        'n_estimators': [200, 500],
        'minibatch_frac': [1.0, 0.5],
    }
    estimator = NGBRegressor(
        natural_gradient=True,
        verbose=False,
    )
    search = GridSearchCV(estimator, param_grid=search_space, cv=5)
    search.fit(X_train, y_train)
    print(search.best_params_)