Example No. 1
    def train(self):
        X_train, y_train, _ = self.load_results_from_result_paths(self.train_paths)
        X_val, y_val, _ = self.load_results_from_result_paths(self.val_paths)

        base_learner_config = self.parse_config("base:")
        param_config = self.parse_config("param:")

        # train
        base_learner = DecisionTreeRegressor(criterion='friedman_mse', random_state=None, splitter='best',
                                             **base_learner_config)
        self.model = NGBRegressor(Dist=Normal, Base=base_learner, Score=LogScore, verbose=True, **param_config)
        self.model = self.model.fit(X_train, y_train, X_val=X_val, Y_val=y_val,
                                    early_stopping_rounds=self.model_config["early_stopping_rounds"])

        train_pred, var_train = self.model.predict(X_train), None
        val_pred, var_val = self.model.predict(X_val), None

        # self.save()

        fig_train = utils.scatter_plot(np.array(train_pred), np.array(y_train), xlabel='Predicted', ylabel='True',
                                       title='')
        fig_train.savefig(os.path.join(self.log_dir, 'pred_vs_true_train.jpg'))
        plt.close()

        fig_val = utils.scatter_plot(np.array(val_pred), np.array(y_val), xlabel='Predicted', ylabel='True', title='')
        fig_val.savefig(os.path.join(self.log_dir, 'pred_vs_true_val.jpg'))
        plt.close()

        train_metrics = utils.evaluate_metrics(y_train, train_pred, prediction_is_first_arg=False)
        valid_metrics = utils.evaluate_metrics(y_val, val_pred, prediction_is_first_arg=False)

        logging.info('train metrics: %s', train_metrics)
        logging.info('valid metrics: %s', valid_metrics)

        return valid_metrics
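Example No. 1 leaves var_train and var_val as None even though NGBoost exposes a full predictive distribution. A minimal sketch of filling them in (reusing the fitted self.model from the example; the added variable names are assumptions):

        # pred_dist returns Normal distribution objects; their scale parameter
        # is the predictive standard deviation, so squaring it gives a variance
        train_dist = self.model.pred_dist(X_train)
        val_dist = self.model.pred_dist(X_val)
        var_train = train_dist.scale ** 2
        var_val = val_dist.scale ** 2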
Example No. 2
    def fit(self, xtrain, ytrain, train_info, learn_hyper=True):

        # if we are below the min train size, use the zero_cost and lce info
        if len(xtrain) < self.min_train_size:
            self.trained = False
            return None
        self.trained = True
        self.train_size = len(xtrain)

        # prepare training data labels
        self.mean = np.mean(ytrain)
        self.std = np.std(ytrain)
        ytrain = (np.array(ytrain) - self.mean) / self.std
        xtrain = self.prepare_features(xtrain, train_info, train=True)
        params = self.run_hpo(xtrain, ytrain)

        # todo: this code is repeated in cross_validate
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        self.model = NGBRegressor(Dist=Normal,
                                  Base=base_learner,
                                  Score=LogScore,
                                  verbose=True,
                                  **parse_params(params, 'param:'))
        self.model.fit(xtrain, ytrain)
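Several examples here call a parse_params(params, identifier) helper that is never shown. Presumably it selects the entries of a flat hyperparameter dict whose keys start with a prefix such as 'base:' or 'param:' and strips that prefix; a minimal sketch under that assumption:

def parse_params(params, identifier):
    # keep only keys with the given prefix and strip it,
    # so the remainder can be unpacked as keyword arguments
    return {
        key.replace(identifier, ''): value
        for key, value in params.items()
        if key.startswith(identifier)
    }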
Example No. 3
def fixture_learners_data(breast_cancer_data, boston_data,
                          boston_survival_data):
    """
    Returns:
        A list of iterables,
        each iterable containing a fitted model and
        X data and the predictions for the X_data
    """

    models_data = []
    X_class_train, _, Y_class_train, _ = breast_cancer_data
    ngb = NGBClassifier(verbose=False, n_estimators=10)
    ngb.fit(X_class_train, Y_class_train)
    models_data.append((ngb, X_class_train, ngb.predict(X_class_train)))

    X_reg_train, _, Y_reg_train, _ = boston_data
    ngb = NGBRegressor(verbose=False, n_estimators=10)
    ngb.fit(X_reg_train, Y_reg_train)
    models_data.append((ngb, X_reg_train, ngb.predict(X_reg_train)))

    X_surv_train, _, T_surv_train, E_surv_train, _ = boston_survival_data
    ngb = NGBSurvival(verbose=False, n_estimators=10)
    ngb.fit(X_surv_train, T_surv_train, E_surv_train)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))

    ngb = NGBRegressor(Dist=MultivariateNormal(2), n_estimators=10)
    ngb.fit(X_surv_train, np.vstack([T_surv_train, E_surv_train]).T)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))
    return models_data
Example No. 4
class ModelNgbRegressor(Model):
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):

        # set the hyperparameters
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds')

        self.model = NGBRegressor(**params)
        self.model.fit(tr_x.values,
                       tr_y.astype(int).values,
                       va_x.values,
                       va_y.astype(int).values,
                       early_stopping_rounds=early_stopping_rounds)

    def predict(self, te_x):
        return self.model.predict(te_x.values)

    def save_model(self):
        model_path = os.path.join('../output/model',
                                  f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        model_path = os.path.join('../output/model',
                                  f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
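Data.dump and Data.load are project-specific helpers that are not shown here. A rough stand-in (an assumption, using joblib) could look like:

import joblib

class Data:
    @staticmethod
    def dump(obj, path):
        # serialize the fitted model to disk
        joblib.dump(obj, path)

    @staticmethod
    def load(path):
        # restore a previously saved model
        return joblib.load(path)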
Example No. 5
def ngb_Normal():
    ngb_Normal = NGBRegressor(Dist=Normal).fit(X_train, Y_train)
    globals()['ngb_Normal'] = ngb_Normal
    Y_preds = ngb_Normal.predict(X_test)
    Y_dists = ngb_Normal.pred_dist(X_test)
    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('Test MSE_Normal', test_MSE)
    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL_Normal', test_NLL)
Example No. 6
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):

        # set the hyperparameters
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds')

        self.model = NGBRegressor(**params)
        self.model.fit(tr_x.values,
                       tr_y.astype(int).values,
                       va_x.values,
                       va_y.astype(int).values,
                       early_stopping_rounds=early_stopping_rounds)
Example No. 7
def feature_importance():
    global ngb_Normal
    ngb_Normal = NGBRegressor(verbose=True).fit(X_train, Y_train)
    # Feature importance for loc trees
    feature_importance_loc = ngb_Normal.feature_importances_[0]
    # Feature importance for scale trees
    feature_importance_scale = ngb_Normal.feature_importances_[1]

    # build the dataframe for loc importances
    df_loc = pd.DataFrame({
        'feature': load_boston()['feature_names'],
        'importance': feature_importance_loc
    }).sort_values('importance', ascending=False)

    # build the dataframe for scale importances
    df_scale = pd.DataFrame({
        'feature': load_boston()['feature_names'],
        'importance': feature_importance_scale
    }).sort_values('importance', ascending=False)

    # plot with seaborn
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6))
    fig.suptitle("Feature importance plot for distribution parameters",
                 fontsize=17)
    sns.barplot(x='importance',
                y='feature',
                ax=ax1,
                data=df_loc,
                color="skyblue").set_title('loc param')
    sns.barplot(x='importance',
                y='feature',
                ax=ax2,
                data=df_scale,
                color="skyblue").set_title('scale param')
    plt.show()
Example No. 8
def frc_plain_ngboost(num_iterations, learning_rate, validation_test_size,
                      X_train, y_train, X_test):

    # ngboost
    ngb_model = NGBRegressor(learning_rate=learning_rate,
                             n_estimators=num_iterations)

    # Split into training and evaluation
    X_train_xgb, X_val_xgb, y_train_xgb, y_val_xgb = \
        train_test_split(X_train, y_train, test_size=validation_test_size)

    # Fit NGBoost
    ngb_model.fit(X_train_xgb, y_train_xgb, X_val=X_val_xgb, Y_val=y_val_xgb)

    # differences regarding the reference promotions
    ngb_frc = ngb_model.predict(X_test)
    return ngb_frc
Example No. 9
class myNGBoostBinary:
    def make(self, params):
        self.model = NGBRegressor(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params={}):
        # convert pandas inputs to plain numpy arrays
        if isinstance(xtrain, pd.DataFrame):
            xtrain = xtrain.values
            ytrain = ytrain.values
            if xtest is not None and ytest is not None:
                xtest = xtest.values
                ytest = ytest.values

        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(xtrain, ytrain, X_val=xtest, Y_val=ytest, **fit_params)

    def predict(self, xs, threshold=0.5):
        return np.where(self.model.predict(xs) > threshold, 1, 0)

    def predict_proba(self, xs):
        # the underlying regressor returns a single score per sample (a 1-D array),
        # so return it directly; indexing [:, 1] as on a classifier's output would fail
        if len(xs.shape) == 1:
            return self.model.predict(xs.reshape(1, -1))
        return self.model.predict(xs)
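A short usage sketch for the wrapper above (the synthetic data and hyperparameters are assumptions, not part of the original example):

import numpy as np

# toy binary problem: the regression output is thresholded at 0.5
X = np.random.randn(200, 5)
y = (X[:, 0] + 0.1 * np.random.randn(200) > 0).astype(int)

clf = myNGBoostBinary().make({'n_estimators': 50, 'verbose': False})
clf.fit(X[:150], y[:150], X[150:], y[150:])
labels = clf.predict(X[150:], threshold=0.5)
scores = clf.predict_proba(X[150:])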
Example No. 10
    def train(self, train_data):
        X_train, y_train = train_data
        min_samples_leaf = min(max(len(X_train) // 2, 1), 15)
        min_samples_split = min(max(len(X_train) // 2, 2), 20)

        base_learner = DecisionTreeRegressor(
            criterion='friedman_mse',
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            random_state=None,
            splitter='best',
            **self.parameters(identifier='base:'))
        model = NGBRegressor(Dist=Normal,
                             Base=base_learner,
                             Score=LogScore,
                             verbose=True,
                             **self.parameters(identifier='param:'))

        return model.fit(X_train, y_train)
Example No. 11
    def objective(trial):
        param = {
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 1e0),
            'n_estimators': trial.suggest_int('n_estimators', 100, 800),
            'minibatch_frac':trial.suggest_discrete_uniform('minibatch_frac', 0.1, 0.9, 0.1),
        }

        regression_model = NGBRegressor(**param, Base= best_base, Dist=Normal, Score=MLE(), natural_gradient=True, verbose=False)
        estimated_y_in_cv = model_selection.cross_val_predict(regression_model, train_x, train_y, cv=fold_number)
        r2 = metrics.r2_score(train_y, estimated_y_in_cv)
        return 1.0 - r2
Example No. 12
def objective(params):
    params.update(default_params)
    ngb = NGBRegressor(**params, verbose=False).fit(
        X_train,
        y_train,
        X_val=X_validation,
        Y_val=y_validation,
        # with early stopping, training may stop well before all n_estimators (e.g. 100) iterations are used
        early_stopping_rounds=2)
    loss = ngb.evals_result['val']['LOGSCORE'][ngb.best_val_loss_itr]
    results = {'loss': loss, 'status': STATUS_OK}
    return results
Example No. 13
def ngb_cv():
    print("====================================")
    b1 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=2)
    b2 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
    param_grid = {'minibatch_frac': [1.0, 0.5], 'Base': [b1, b2]}
    ngb = NGBRegressor(Dist=Normal, verbose=True)
    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=3)
    grid_search.fit(X_train, Y_train)
    best_params = grid_search.best_params_
    print(best_params)
    ngb_cv = NGBRegressor(Dist=Normal, verbose=True,
                          **best_params).fit(X_train, Y_train)
    globals()['ngb_cv'] = ngb_cv
    Y_preds = ngb_cv.predict(X_test)
    Y_dists = ngb_cv.pred_dist(X_test)
    # test Mean Squared Error
    test_MSE_CV = mean_squared_error(Y_preds, Y_test)
    print('Test MSE_CV', test_MSE_CV)
    # test Negative Log Likelihood
    test_NLL_CV = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL_CV', test_NLL_CV)
Example No. 14
    def cross_validate(self, xtrain, ytrain, params):
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        model = NGBRegressor(Dist=Normal,
                             Base=base_learner,
                             Score=LogScore,
                             verbose=False,
                             **parse_params(params, 'param:'))
        scores = cross_val_score(model, xtrain, ytrain, cv=3)
        return np.mean(scores)
Example No. 15
def model_test_for_esn_base(Base,
                            esn_param,
                            X_train,
                            X_test,
                            Y_train,
                            Y_test,
                            n_estimators=500,
                            learning_rate=0.01,
                            Score=MLE,
                            Dist=Normal,
                            verbose=True,
                            verbose_eval=100,
                            plot_predict=True,
                            return_y_pred=False,
                            return_y_dists=False,
                            return_mse=False):

    ESN = SimpleESN(n_readout=esn_param['n_readout'],
                    n_components=esn_param['n_components'],
                    damping=esn_param['damping'],
                    weight_scaling=esn_param['weight_scaling'],
                    discard_steps=0,
                    random_state=None)
    X_train = ESN.fit_transform(X_train)
    X_test = ESN.fit_transform(X_test)

    ngb = NGBRegressor(Base=Base,
                       n_estimators=n_estimators,
                       verbose=verbose,
                       verbose_eval=verbose_eval,
                       learning_rate=learning_rate,
                       Dist=Dist,
                       Score=Score)
    print(ESN, '\n')
    print(ngb, '\n')
    ngb.fit(X_train, Y_train)
    Y_preds = ngb.predict(X_test)
    Y_dists = ngb.pred_dist(X_test)  # return norm method: mean std
    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('\nTest MSE', test_MSE)
    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL', test_NLL)

    if plot_predict:
        df = pd.concat([Y_test, pd.Series(Y_preds, index=Y_test.index)],
                       axis=1)
        df.columns = ['test', 'pred']
        df.plot(figsize=(10, 4),
                title='MSE:{}  NLL:{}'.format(round(test_MSE, 4),
                                              round(test_NLL, 4)))
    if (return_y_pred) & (not (return_y_dists)):
        return pd.Series(Y_preds, index=Y_test.index)
    if (not (return_y_pred)) & (return_y_dists):
        return Y_dists
    if (return_y_pred) & (return_y_dists):
        return pd.Series(Y_preds, index=Y_test.index), Y_dists
    if return_mse:
        return test_MSE
Example No. 16
def test_dists_runs_on_examples_crpscore(dist: Distn, learner,
                                         boston_data: Tuple4Array):
    X_train, X_test, y_train, y_test = boston_data
    # TODO: test early stopping features
    ngb = NGBRegressor(Dist=dist, Score=CRPScore, Base=learner, verbose=False)
    ngb.fit(X_train, y_train)
    y_pred = ngb.predict(X_test)
    y_dist = ngb.pred_dist(X_test)
Example No. 17
    def train(self, train_data):
        X_train, y_train = train_data
        # note: cross-validation will error unless these values are set:
        min_samples_leaf = 1
        min_samples_split = 2
        minibatch_frac = 0.5

        base_learner = DecisionTreeRegressor(
            criterion='friedman_mse',
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            random_state=None,
            splitter='best',
            **parse_params(self.hyperparams, identifier='base:'))
        model = NGBRegressor(Dist=Normal,
                             Base=base_learner,
                             Score=LogScore,
                             minibatch_frac=minibatch_frac,
                             verbose=True,
                             **parse_params(self.hyperparams,
                                            identifier='param:'))

        return model.fit(X_train, y_train)
Example No. 18
	def test_dists(self, learners, reg_dists, reg_data):
		X_reg_train, X_reg_test, Y_reg_train, Y_reg_test = reg_data
		for Dist, Scores in reg_dists.items():
			for Score in Scores:
				for Learner in learners:
					# test early stopping features
					ngb = NGBRegressor(Dist=Dist, Score=Score, Base=Learner, verbose=False)
					ngb.fit(X_reg_train, Y_reg_train)
					y_pred = ngb.predict(X_reg_test)
					y_dist = ngb.pred_dist(X_reg_test)
Example No. 19
def objective(params):

    params.update(default_params)

    print("current params:", params)
    ngb = NGBRegressor(**params).fit(
        X_train,
        y_train,
        X_val=X_validation,
        Y_val=y_validation,
        #  with early stopping, training may stop well before all n_estimators (e.g. 100) iterations are used
        early_stopping_rounds=2)
    loss = ngb.evals_result['val']['LOGSCORE'][ngb.best_val_loss_itr]
    logger.info("current params:{}".format(params))
    results = {'loss': loss, 'status': STATUS_OK}

    return results
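The objective functions above follow the hyperopt convention of returning {'loss': ..., 'status': STATUS_OK}. A sketch of how such an objective is presumably driven (the search space below is an assumption):

import numpy as np
from hyperopt import fmin, tpe, hp, Trials

space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-3), np.log(1e-1)),
    'minibatch_frac': hp.uniform('minibatch_frac', 0.1, 1.0),
}

trials = Trials()
best = fmin(fn=objective, space=space, algo=tpe.suggest,
            max_evals=20, trials=trials)
print('best hyperparameters:', best)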
Example No. 20
def test_regression(boston_data):
    from sklearn.metrics import mean_squared_error

    x_train, x_test, y_train, y_test = boston_data
    ngb = NGBRegressor(verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 15

    score = ngb.score(x_test, y_test)
    assert score <= 15

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Normal)

    score = mean_squared_error(y_test, preds)
    assert score <= 15
Example No. 21
def test_regression():
    from sklearn.datasets import load_boston
    from sklearn.metrics import mean_squared_error
    data, target = load_boston(True)
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=42)
    ngb = NGBRegressor(verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = mean_squared_error(y_test, preds)
    assert score <= 8.0

    score = ngb.score(x_test, y_test)
    assert score <= 8.0

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Normal)

    preds = ngb.dist_to_prediction(dist)
    score = mean_squared_error(y_test, preds)
    assert score <= 8.0
Example No. 22
        def choose_ML_alg(self):

            models = [
                RANSACRegressor(),
                HuberRegressor(),
                LinearRegression(),
                ElasticNet(),
                ElasticNetCV(),
                Lars(),
                Lasso(),
                LassoLars(),
                LassoLarsIC(),
                OrthogonalMatchingPursuit(),
                OrthogonalMatchingPursuitCV(),
                Ridge(),
                SGDRegressor(),
                RandomForestRegressor(),
                GradientBoostingRegressor(),
                AdaBoostRegressor(),
                NGBRegressor(Dist=Normal),
                DecisionTreeRegressor()
            ]

            return models
Example No. 23
def test_multivariatenormal(k: 2, learner):
    dist = MultivariateNormal(k)

    # Generate some sample data
    N = 500
    X_train = np.random.randn(N, k)
    y_fns = [np.sin, np.cos, np.exp]
    y_cols = [
        fn(X_train[:, num_col]).reshape(-1, 1) + np.random.randn(N, 1)
        for num_col, fn in enumerate(y_fns[:k])
    ]
    y_train = np.hstack(y_cols)
    X_test = np.random.randn(N, k)

    ngb = NGBRegressor(Dist=dist, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_train, y_train)
    y_pred = ngb.predict(X_test)
    y_dist = ngb.pred_dist(X_test)

    mean = y_dist.mean
    sample = y_dist.rv()
    scipy_list = y_dist.scipy_distribution()
Example No. 24
from ngboost import NGBRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# load the dataset
X, Y = load_boston(return_X_y=True)
# split into train and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
# fit an NGBRegressor
ngb = NGBRegressor().fit(X_train, Y_train)
Y_preds = ngb.predict(X_test)
# compute MSE
test_MSE = mean_squared_error(Y_preds, Y_test)
print('MSE', test_MSE)
# compute NLL (negative log likelihood)
Y_dists = ngb.pred_dist(X_test)
test_NLL = -Y_dists.logpdf(Y_test.flatten()).mean()
print('NLL', test_NLL)
Example No. 25
# Fit and predict
rf = RandomForestRegressor(n_estimators=400,
                           random_state=SEED).fit(X_train, y_train)
y_pred = rf.predict(X_test)
print('Random Forest: R2 score on testing data: {:.2f}%'.format(
    100 * r2_score(y_test, y_pred)))

# Fit and predict
lgb = LGBMRegressor(n_estimators=400, random_state=SEED).fit(X_train, y_train)
y_pred = lgb.predict(X_test)
print('LightGBM: R2 score on testing data: {:.2f}%'.format(
    100 * r2_score(y_test, y_pred)))

# Fit and predict
np.random.seed(SEED)
ngb = NGBRegressor(n_estimators=400,
                   Base=default_tree_learner,
                   Dist=Normal,
                   Score=MLE).fit(X_train, y_train)
y_pred = ngb.predict(X_test)
print('NGBoost: R2 score on testing data: {:.2f}%'.format(
    100 * r2_score(y_test, y_pred)))

# Probability distribution
obs_idx = [0, 1]
dist = ngb.pred_dist(X_test[obs_idx, :])
print('P(y_0|x_0) is normally distributed with loc={:.2f} and scale={:.2f}'.
      format(dist.loc[0], dist.scale[0]))
print('P(y_1|x_1) is normally distributed with loc={:.2f} and scale={:.2f}'.
      format(dist.loc[1], dist.scale[1]))
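The snippet above depends on NGBoost helpers that would be imported elsewhere; presumably something like:

from ngboost import NGBRegressor
from ngboost.learners import default_tree_learner  # the default base learner, a shallow DecisionTreeRegressor
from ngboost.distns import Normal
from ngboost.scores import MLE  # kept as an alias of LogScore in recent ngboost versions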
Example No. 26
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=SEED)

    # baseline not using predictor data
    avg_tds = np.mean(Y_train)
    y_dist = dist(avg_tds)
    naive_NLL = -y_dist.logpmf(Y_test).mean()

    print("Mean squared error using only the mean: {:.4f}".format(
        mean_squared_error(np.repeat(avg_tds, len(Y_test)), Y_test)))
    print(
        "Poisson negative log liklihood without using predictor variables: {:.4f}"
        .format(naive_NLL))

    ngb = NGBRegressor(Dist=Poisson)

    ngb.fit(X_train, Y_train)

    Y_preds = ngb.predict(X_test)
    Y_dists = ngb.pred_dist(X_test)

    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print("NGBoost MSE: {:.4f}".format(test_MSE))

    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpmf(Y_test.flatten()).mean()
    print("NGBoost NLL: {:.4f}".format(test_NLL))

    # Let's see if we can improve by dropping confounding variables
Example No. 27
def model_test(Base,
               X_train,
               X_test,
               Y_train,
               Y_test,
               n_estimators=500,
               learning_rate=0.01,
               Score=MLE,
               Dist=Normal,
               verbose=True,
               verbose_eval=100,
               plot_predict=True,
               return_y_pred=False,
               return_y_dists=False,
               return_mse=False,
               Y_scaler=None):
    ngb = NGBRegressor(Base=Base,
                       n_estimators=n_estimators,
                       verbose=verbose,
                       verbose_eval=verbose_eval,
                       learning_rate=learning_rate,
                       Dist=Dist,
                       Score=Score)
    print(ngb, '\n')
    ngb.fit(X_train, Y_train)
    Y_preds = ngb.predict(X_test)
    Y_dists = ngb.pred_dist(X_test)  # return norm method: mean std
    # test Mean Squared Error
    test_MSE = mean_squared_error(Y_preds, Y_test)
    print('\nTest MSE', test_MSE)
    # test Negative Log Likelihood
    test_NLL = -Y_dists.logpdf(Y_test).mean()
    print('Test NLL', test_NLL)

    if plot_predict:
        if Y_scaler is not None:
            df = pd.concat([
                pd.Series(Y_scaler.inverse_transform(
                    Y_test.copy().values.reshape(-1, 1)).reshape(-1, ),
                          index=Y_test.index),
                pd.Series(Y_scaler.inverse_transform(
                    np.array(Y_preds).reshape(-1, 1)).reshape(-1, ),
                          index=Y_test.index)
            ],
                           axis=1)
            df.columns = ['test', 'pred']
            df.plot(figsize=(10, 4),
                    title='MSE:{}  NLL:{}'.format(round(test_MSE, 4),
                                                  round(test_NLL, 4)))
        else:
            df = pd.concat(
                [Y_test, pd.Series(Y_preds, index=Y_test.index)], axis=1)
            df.columns = ['test', 'pred']
            df.plot(figsize=(10, 4),
                    title='MSE:{}  NLL:{}'.format(round(test_MSE, 4),
                                                  round(test_NLL, 4)))
    if (return_y_pred) & (not (return_y_dists)):
        return pd.Series(Y_preds, index=Y_test.index)
    if (not (return_y_pred)) & (return_y_dists):
        return Y_dists
    if (return_y_pred) & (return_y_dists):
        return pd.Series(Y_preds, index=Y_test.index), Y_dists
    if return_mse:
        return test_MSE
Example No. 28
                                                                random_state=1)

# delete intermediate variables
del X_intermediate, y_intermediate

# print the dataset split proportions
print('train: {}% | validation: {}% | test {}%'.format(
    round(len(y_train) / len(target), 2),
    round(len(y_validation) / len(target), 2),
    round(len(y_test) / len(target), 2)))

ngb = NGBRegressor().fit(
    X_train,
    y_train,
    X_val=X_validation,
    Y_val=y_validation,
    #  with early stopping, training may stop well before all n_estimators (e.g. 100) iterations are used
    early_stopping_rounds=2)

y_pred = ngb.predict(X_test)
print("y_pred=", y_pred)
print("y_test=", y_test)
test_MSE = mean_squared_error(y_pred, y_test)
print('Test MSE_ngb', test_MSE)

logger.info("...done")

plt.figure(figsize=(8, 6))
plt.scatter(x=y_pred, y=y_test, s=20)
# draw a diagonal reference line: predictions equal to the true values fall on it, so the closer the points are to the line, the better the fit
Example No. 29
class OmniPredictor(Predictor):
    def __init__(self,
                 zero_cost,
                 lce,
                 encoding_type,
                 ss_type=None,
                 config=None,
                 n_hypers=35,
                 run_pre_compute=True,
                 min_train_size=0,
                 max_zerocost=np.inf):

        self.zero_cost = zero_cost
        self.lce = lce
        self.encoding_type = encoding_type
        self.ss_type = ss_type
        self.config = config
        self.n_hypers = n_hypers
        self.run_pre_compute = run_pre_compute
        self.min_train_size = min_train_size
        self.max_zerocost = max_zerocost

    def pre_compute(self, xtrain, xtest):
        """
        All of this computation could go into fit() and query(), but we do it
        here to save time, so that we don't have to re-compute Jacobian covariances
        for all train_sizes when running experiment_types that vary train size or fidelity.        
        """
        self.xtrain_zc_info = {}
        self.xtest_zc_info = {}

        if len(self.zero_cost) > 0:
            self.train_loader, _, _, _, _ = utils.get_train_val_loaders(
                self.config, mode='train')

            for method_name in self.zero_cost:
                zc_method = ZeroCostEstimators(self.config,
                                               batch_size=64,
                                               method_type=method_name)
                zc_method.train_loader = copy.deepcopy(self.train_loader)
                xtrain_zc_scores = zc_method.query(xtrain)
                xtest_zc_scores = zc_method.query(xtest)

                train_mean = np.mean(np.array(xtrain_zc_scores))
                train_std = np.std((np.array(xtrain_zc_scores)))

                normalized_train = (np.array(xtrain_zc_scores) -
                                    train_mean) / train_std
                normalized_test = (np.array(xtest_zc_scores) -
                                   train_mean) / train_std

                self.xtrain_zc_info[f'{method_name}_scores'] = normalized_train
                self.xtest_zc_info[f'{method_name}_scores'] = normalized_test

    def get_random_params(self):
        params = {
            'param:n_estimators': int(loguniform(128, 512)),
            'param:learning_rate': loguniform(.001, .1),
            'param:minibatch_frac': np.random.uniform(.1, 1),
            'base:max_depth': np.random.choice(24) + 1,
            'base:max_features': np.random.uniform(.1, 1),
            'base:min_samples_leaf': np.random.choice(18) + 2,
            'base:min_samples_split': np.random.choice(18) + 2,
        }
        return params

    def run_hpo(self, xtrain, ytrain):
        min_score = 100000
        best_params = None
        for i in range(self.n_hypers):
            params = self.get_random_params()
            for key in ['base:min_samples_leaf', 'base:min_samples_split']:
                params[key] = max(2, min(params[key],
                                         int(len(xtrain) / 3) - 1))

            score = self.cross_validate(xtrain, ytrain, params)
            if score < min_score:
                min_score = score
                best_params = params
                logger.info('{} new best {}, {}'.format(i, score, params))
        return best_params

    def cross_validate(self, xtrain, ytrain, params):
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        model = NGBRegressor(Dist=Normal,
                             Base=base_learner,
                             Score=LogScore,
                             verbose=False,
                             **parse_params(params, 'param:'))
        scores = cross_val_score(model, xtrain, ytrain, cv=3)
        return np.mean(scores)

    def prepare_features(self, xdata, info, train=True):
        # prepare training data features
        full_xdata = [[] for _ in range(len(xdata))]
        if len(self.zero_cost) > 0 and self.train_size <= self.max_zerocost:
            if self.run_pre_compute:
                for key in self.xtrain_zc_info:
                    if train:
                        full_xdata = [[*x, self.xtrain_zc_info[key][i]]
                                      for i, x in enumerate(full_xdata)]
                    else:
                        full_xdata = [[*x, self.xtest_zc_info[key][i]]
                                      for i, x in enumerate(full_xdata)]
            else:
                # if the zero_cost scores were not precomputed, they are in info
                full_xdata = [[*x, info[i]] for i, x in enumerate(full_xdata)]

        if 'sotle' in self.lce and len(info[0]['TRAIN_LOSS_lc']) >= 3:
            train_losses = np.array([lcs['TRAIN_LOSS_lc'][-1] for lcs in info])
            mean = np.mean(train_losses)
            std = np.std(train_losses)
            normalized = (train_losses - mean) / std
            full_xdata = [[*x, normalized[i]]
                          for i, x in enumerate(full_xdata)]

        elif 'sotle' in self.lce and len(info[0]['TRAIN_LOSS_lc']) < 3:
            logger.info('Not enough fidelities to use train loss')

        if 'valacc' in self.lce and len(info[0]['VAL_ACCURACY_lc']) >= 3:
            val_accs = [lcs['VAL_ACCURACY_lc'][-1] for lcs in info]
            mean = np.mean(val_accs)
            std = np.std(val_accs)
            normalized = (val_accs - mean) / std
            full_xdata = [[*x, normalized[i]]
                          for i, x in enumerate(full_xdata)]

        if self.encoding_type is not None:
            xdata_encoded = np.array([
                encode(arch,
                       encoding_type=self.encoding_type,
                       ss_type=self.ss_type) for arch in xdata
            ])
            full_xdata = [[*x, *xdata_encoded[i]]
                          for i, x in enumerate(full_xdata)]

        return np.array(full_xdata)

    def fit(self, xtrain, ytrain, train_info, learn_hyper=True):

        # if we are below the min train size, use the zero_cost and lce info
        if len(xtrain) < self.min_train_size:
            self.trained = False
            return None
        self.trained = True
        self.train_size = len(xtrain)

        # prepare training data labels
        self.mean = np.mean(ytrain)
        self.std = np.std(ytrain)
        ytrain = (np.array(ytrain) - self.mean) / self.std
        xtrain = self.prepare_features(xtrain, train_info, train=True)
        params = self.run_hpo(xtrain, ytrain)

        # todo: this code is repeated in cross_validate
        base_learner = DecisionTreeRegressor(criterion='friedman_mse',
                                             random_state=None,
                                             splitter='best',
                                             **parse_params(params, 'base:'))
        self.model = NGBRegressor(Dist=Normal,
                                  Base=base_learner,
                                  Score=LogScore,
                                  verbose=True,
                                  **parse_params(params, 'param:'))
        self.model.fit(xtrain, ytrain)

    def query(self, xtest, info):
        if self.trained:
            test_data = self.prepare_features(xtest, info, train=False)
            return np.squeeze(
                self.model.predict(test_data)) * self.std + self.mean
        else:
            logger.info('below the train size, so returning info')
            return info

    def get_data_reqs(self):
        """
        Returns a dictionary with info about whether the predictor needs
        extra info to train/query.
        """
        if len(self.lce) > 0:
            # add the metrics needed for the lce predictors
            required_metric_dict = {
                'sotle': Metric.TRAIN_LOSS,
                'valacc': Metric.VAL_ACCURACY
            }
            self.metric = [required_metric_dict[key] for key in self.lce]

            reqs = {
                'requires_partial_lc': True,
                'metric': self.metric,
                'requires_hyperparameters': False,
                'hyperparams': {}
            }
        else:
            reqs = super().get_data_reqs()

        return reqs
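get_random_params above also relies on a loguniform(low, high) helper that is not defined in the snippet. Presumably it draws a single sample that is uniform in log space, roughly:

def loguniform(low, high):
    # sample uniformly between log(low) and log(high), then exponentiate
    return np.exp(np.random.uniform(np.log(low), np.log(high)))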
Example No. 30
from ngboost import NGBRegressor
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split

if __name__ == "__main__":
    X, y = load_boston(True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    param_grid = {
        'n_estimators': [200, 500],
        'minibatch_frac': [1.0, 0.5],
    }

    ngb = NGBRegressor(
        natural_gradient=True,
        verbose=False,
    )
    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)