def grid_search(parameters,
                X_train_res,
                y_train_res,
                X_test,
                y_test,
                useTrainCV=False):
    """Grid-search an ``XGBRegressor``, refit it and print train/test AUC.

    The search uses a 10-fold ``StratifiedKFold`` and ``roc_auc`` scoring, so
    the targets are presumably binary labels (TODO confirm with callers).

    Args:
        parameters: Parameter grid handed to ``GridSearchCV``.
        X_train_res: Training features.
        y_train_res: Training targets.
        X_test: Held-out features, used as ``eval_set`` and for the test AUC.
        y_test: Held-out targets.
        useTrainCV: When true, run xgboost's ``cv`` with early stopping
            (20 rounds, ``auc`` metric) and overwrite ``n_estimators`` with
            the number of rounds it kept.

    Returns:
        The ``XGBRegressor`` refit on the training data with the best
        parameters (previously the fitted model was discarded).
    """
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=10)
    grid_search_xg = GridSearchCV(XGBRegressor(),
                                  parameters,
                                  scoring='roc_auc',
                                  n_jobs=1,
                                  cv=kfold,
                                  verbose=1)
    result_gcv_xgb = grid_search_xg.fit(X_train_res, y_train_res)
    best_params = result_gcv_xgb.best_params_
    print("Best params: %s" % (best_params,))

    # Rebuild from *every* tuned parameter. Picking keys by name raised
    # KeyError for grids that did not tune all of them and silently dropped
    # any tuned parameter that was not in the hard-coded list.
    xg_reg = XGBRegressor(**best_params)

    if useTrainCV:
        xgb_param = xg_reg.get_xgb_params()
        xgtrain = DMatrix(X_train_res, label=y_train_res)
        cvresult = cv(xgb_param,
                      xgtrain,
                      num_boost_round=xg_reg.get_params()['n_estimators'],
                      folds=kfold,
                      metrics='auc',
                      early_stopping_rounds=20)
        # One row per boosting round kept by early stopping.
        xg_reg.set_params(n_estimators=cvresult.shape[0])
        print("Best number of estimators: %i" % (cvresult.shape[0]))

    eval_set = [(X_test, y_test)]
    xg_reg.fit(X_train_res,
               y_train_res,
               eval_metric="error",
               eval_set=eval_set,
               verbose=False)

    # Predictions are continuous scores, so only a ranking metric (AUC) is
    # reported; accuracy/recall/precision would need thresholded labels.
    y_pred_train = xg_reg.predict(X_train_res)
    print("AUC train: %f" % (roc_auc_score(y_train_res, y_pred_train)))
    y_pred = xg_reg.predict(X_test)
    print("AUC test: %f" % (roc_auc_score(y_test, y_pred)))
    return xg_reg
示例#2
0
class XGBoostRegressor(Model):
    """``Model`` subclass that delegates every operation to an ``XGBRegressor``.

    ``create_model`` must run before the other methods, because it is the
    only place where ``self.xgb_regressor`` is assigned.
    """

    def create_model(self):
        """Create the wrapped regressor with its default parameters."""
        self.xgb_regressor = XGBRegressor()

    def set_config(self, config):
        """Apply the ``config`` mapping as keyword parameters of the regressor."""
        regressor = self.xgb_regressor
        regressor.set_params(**config)

    def fit(self, train_x, train_y):
        """Train the wrapped regressor on ``train_x`` / ``train_y``."""
        regressor = self.xgb_regressor
        regressor.fit(train_x, train_y)

    def predict(self, test_x):
        """Return the wrapped regressor's predictions for ``test_x``."""
        predictions = self.xgb_regressor.predict(test_x)
        return predictions
def fit_gbm(data, fixed_gbm_params, variable_gbm_params):
    """Fit a gamma-objective ``XGBRegressor`` with early stopping.

    Args:
        data: Mapping providing ``X_train_encoded``/``y_train``,
            ``X_test_encoded``/``y_test`` and
            ``X_holdout_encoded``/``y_holdout``.
        fixed_gbm_params: Parameters that win over ``variable_gbm_params``
            whenever both define the same key.
        variable_gbm_params: Tunable parameters; deep-copied, so the
            caller's object is never modified.

    Returns:
        The fitted ``XGBRegressor``.
    """
    merged_params = deepcopy(variable_gbm_params)
    merged_params.update(fixed_gbm_params)

    # "reg:gamma" is only the starting objective; set_params may replace it
    # if the merged parameters carry their own "objective".
    model = XGBRegressor(objective="reg:gamma")
    model.set_params(**merged_params)

    train_features = data["X_train_encoded"]
    train_targets = data["y_train"]
    # NOTE(review): xgboost's sklearn wrapper is understood to early-stop on
    # the last eval_set entry (the holdout pair) - confirm for the installed
    # version.
    evaluation_sets = [
        (data["X_test_encoded"], data["y_test"]),
        (data["X_holdout_encoded"], data["y_holdout"]),
    ]
    model.fit(
        train_features,
        train_targets,
        early_stopping_rounds=30,
        eval_metric="mae",
        eval_set=evaluation_sets,
    )
    return model
示例#4
0
def create_model(model_type='xgb', model_params=None):
    """Build an unfitted gradient-boosting regressor of the requested type.

    Args:
        model_type: ``'xgb'`` for ``XGBRegressor`` or ``'lgb'`` for
            ``lgb.LGBMRegressor``.
        model_params: Keyword parameters applied through ``set_params``.
            When ``None``, ``get_default_params(model_type=model_type)``
            supplies them.

    Returns:
        The configured, unfitted regressor.

    Raises:
        ValueError: If ``model_type`` is neither ``'lgb'`` nor ``'xgb'``.
            Previously this case fell through to ``return model`` and failed
            with an ``UnboundLocalError``.
    """
    if model_params is None:
        model_params = get_default_params(model_type=model_type)

    if model_type == 'lgb':
        model = lgb.LGBMRegressor()
    elif model_type == 'xgb':
        model = XGBRegressor()
    else:
        raise ValueError(
            "Unsupported model_type %r; expected 'lgb' or 'xgb'" %
            (model_type,))

    model.set_params(**model_params)
    return model
class XGBaseline(BaseEstimator, RegressorMixin):
    """XGBoost mean model paired with one constant spread estimate.

    ``predict`` returns a ``(mean, std)`` tuple where every entry of ``std``
    equals the standard deviation of the training residuals.
    """

    def __init__(self, **kwargs):
        # All keyword arguments configure the wrapped XGBRegressor.
        self.xgb_mean = XGBRegressor(**kwargs)

    def fit(self, X, y):
        """Fit the mean model and store the std of its training residuals."""
        mean_model = self.xgb_mean
        mean_model.fit(X, y)
        residuals = y - mean_model.predict(X)
        self.std = np.std(residuals)
        return self

    def predict(self, X, y=None):
        """Return ``(pred_mean, pred_std)``; ``y`` is accepted but unused."""
        pred_mean = self.xgb_mean.predict(X)
        pred_std = np.ones(len(pred_mean)) * self.std
        return pred_mean, pred_std

    def get_params(self, deep=True):
        """Return the wrapped regressor's parameters (``deep`` is ignored)."""
        mean_model = self.xgb_mean
        return mean_model.get_params()

    def set_params(self, **params):
        """Forward ``params`` to the wrapped regressor and return ``self``."""
        mean_model = self.xgb_mean
        mean_model.set_params(**params)
        return self
class RegressionLearner:
    """``XGBRegressor`` wrapper that holds out data and records R2 scores.

    After ``fit``, ``fit_info`` holds a one-line train/test R2 summary.
    """

    def __init__(self, **kwargs):
        self.fit_info = None
        self.estimator = XGBRegressor(**kwargs)

    # noinspection PyPep8Naming
    # pylint: disable-msg=too-many-arguments
    # pylint: disable-msg=too-many-locals
    # pylint: disable-msg=invalid-name
    def fit(self, X, y):
        """Fit on 90% of the data, early-stopping on the remaining 10%.

        Inputs with fewer than 10000 rows first get their parameters from
        ``search_parameters``. Returns ``self``.
        """
        # A fixed-seed 10% hold-out is always carved out; it serves as the
        # early-stopping evaluation set and as the reported "test" split.
        fit_x, holdout_x, fit_y, holdout_y = train_test_split(
            X, y, test_size=0.1, random_state=42)

        is_small_dataset = X.shape[0] < 10000
        if is_small_dataset:
            tuned = search_parameters(self.estimator, fit_x, fit_y)
            self.estimator.set_params(**tuned)

        self.estimator.fit(fit_x,
                           fit_y,
                           eval_set=[(holdout_x, holdout_y)],
                           early_stopping_rounds=10,
                           verbose=False)

        r2_on_fit = sklearn.metrics.r2_score(fit_y, self.predict(fit_x))
        r2_on_holdout = sklearn.metrics.r2_score(holdout_y,
                                                 self.predict(holdout_x))

        self.fit_info = 'Train/Test R2: {:.2f}/{:.2f}'.format(
            r2_on_fit, r2_on_holdout)

        return self

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        predictions = self.estimator.predict(x)
        return predictions
class XGBLogLikelihood(BaseEstimator, RegressorMixin):
    """Pair of XGBoost models predicting a mean and a per-sample spread.

    ``xgb_mean`` is fit on the targets. ``xgb_log_var`` is fit on the
    residuals of ``xgb_mean`` using the custom ``ll_objective``; ``predict``
    turns its output into a standard deviation via ``exp(output / 2)``
    (so the output is presumably a log-variance - confirm against
    ``ll_objective``).
    """

    def __init__(self, **kwargs):
        self.xgb_mean = XGBRegressor(**kwargs)
        # Same configuration as the mean model, except that the objective is
        # always the custom likelihood objective. Built from a fresh dict so
        # the received kwargs are not modified.
        self.xgb_log_var = XGBRegressor(**{**kwargs,
                                           "objective": ll_objective})

    def fit(self, X, y):
        """Fit the mean model, then the spread model on its residuals."""
        self.xgb_mean.fit(X, y)
        errors = y - self.xgb_mean.predict(X)
        self.xgb_log_var.fit(X, errors)
        return self

    def predict(self, X, y=None):
        """Return ``(pred_mean, pred_std)``; ``y`` is accepted but unused."""
        pred_mean = self.xgb_mean.predict(X)
        pred_std = np.exp(self.xgb_log_var.predict(X) / 2)
        return pred_mean, pred_std

    def get_params(self, deep=True):
        """Return the mean model's parameters (``deep`` is ignored)."""
        return self.xgb_mean.get_params()

    def set_params(self, **params):
        """Apply ``params`` to both models and return ``self``.

        The spread model keeps ``ll_objective`` regardless of any
        ``objective`` in ``params``. ``get_params`` reports the mean model's
        objective, so a plain ``set_params(**get_params())`` round trip used
        to replace the custom objective that ``__init__`` enforces.
        """
        self.xgb_mean.set_params(**params)
        self.xgb_log_var.set_params(**{**params, "objective": ll_objective})
        return self
示例#8
0
def fun_xgb_fs(x, *args):
    """Cross-validate an ``XGBRegressor`` configured from the vector ``x``.

    Args:
        x: Numeric sequence. Positions 0-5 are decoded as ``learning_rate``,
            ``n_estimators``, ``max_depth``, ``colsample_bytree``,
            ``min_child_weight`` and ``subsample``
            (for example ``[0.1, 200, 5, 0.3, 2, 0.8]``). With more than six
            entries a 0/1 feature mask is derived as well (see the note in
            the body).
        *args: Exactly ``(X, y, flag, n_splits, random_seed)``.

    Returns:
        If ``flag == 'eval'``: the cross-validated RMSE, or the penalty value
        ``1e12`` when cross-validation or scoring raised.
        Otherwise: a dict describing the run (parameters, CV predictions,
        the estimator refit on the selected columns, and the error metrics).
        After a failed cross-validation the dict carries ``[None]`` as
        ``Y_PRED``, ``1e12`` as ``RMSE`` and ``None`` for the other metrics.
    """
    X, y, flag, n_splits, random_seed = args
    clf = XGBRegressor(random_state=int(random_seed))
    _, n_var = X.shape

    cr = {
        0: 'reg:squarederror',
        1: 'reg:logistic',
        2: 'binary:logistic',
    }

    p = {
        'learning_rate': x[0],
        'n_estimators': int(round(x[1])),
        'max_depth': int(round(x[2])),
        'colsample_bytree': x[3],
        'min_child_weight': int(round(x[4])),
        # Truncated (not rounded) to three decimals.
        'subsample': int(x[5] * 1000) / 1000,
        'objective': cr[0],
    }
    clf.set_params(**p)

    if len(x) <= 6:
        # No mask supplied: every column is active.
        ft = np.ones(n_var, dtype=int)
    else:
        # NOTE(review): the mask is read from x[2:], which overlaps the
        # hyper-parameter slots x[2]..x[5]; x[6:] looks like the intended
        # slice. Left unchanged - confirm with the callers.
        ft = np.array([1 if k > 0.5 else 0 for k in x[2::]])

    # np.where yields a 1-tuple of index arrays; X[:, ft] therefore gains an
    # extra axis, which the squeeze() before the final fit removes.
    ft = np.where(ft > 0.5)

    try:
        cv = KFold(n_splits=n_splits,
                   shuffle=True,
                   random_state=int(random_seed))
        # NOTE(review): cross-validation runs on all columns of X, while the
        # final fit below uses only the selected ones.
        y_p = cross_val_predict(clf, X, np.ravel(y), cv=cv, n_jobs=1)

        r = RMSE(y_p, y)
        r2 = MAPE(y_p, y)
        r3 = RRMSE(y_p, y)
        # NOTE(review): sklearn's r2_score signature is (y_true, y_pred);
        # the arguments here are in the opposite order. Left unchanged.
        r4 = -r2_score(y_p, y)
    except Exception:
        # Best effort: a failing candidate gets a huge penalty instead of
        # aborting the search. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit through, and every metric is bound so
        # the non-'eval' branch no longer dies on undefined names.
        y_p = [None]
        r = 1e12
        r2 = r3 = r4 = None

    if flag == 'eval':
        return r

    clf.fit(X[:, ft].squeeze(), y)
    return {
        'Y_TRUE': y,
        'Y_PRED': y_p,
        'EST_PARAMS': p,
        'PARAMS': x,
        'EST_NAME': 'XGB',
        'ESTIMATOR': clf,
        'ACTIVE_VAR': ft,
        'DATA': X,
        'SEED': random_seed,
        'ERROR_TRAIN': {
            'RMSE': r,
            'MAPE': r2,
            'RRMSE': r3,
            'R2_SCORE': r4
        }
    }
示例#9
0
    thresholds = np.sort(model_.best_estimator_.feature_importances_)
    print(thresholds)

    print('=======================')
    break

# For each importance threshold, keep only the features selected at that
# threshold, refit a fresh XGBRegressor on them and record its test R2.
# `thresholds`, `best_model`, `model_` and the train/test splits are defined
# before this loop.
score_list = []
for i, thresh in enumerate(thresholds):
    # prefit=True: best_model is used as-is for the selection, not refit.
    selection = SelectFromModel(best_model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)
    print(select_x_train.shape)

    # New model per threshold, configured with the searched best parameters.
    selection_model = XGBRegressor(n_jobs=8)
    selection_model.set_params(**model_.best_params_)
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)

    y_predict = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_predict)
    score_list.append(score)

    # R2 is printed as a percentage (score * 100).
    print('Thresh=%.3f, n=%d, R2:%.2f%%' %
          (thresh, select_x_train.shape[1], score * 100))

    # Remember the reduced matrices produced by the first threshold only.
    if i == 0:
        reduce_x_train = select_x_train
        reduce_x_test = select_x_test
示例#10
0
        "id": "colsample_bytree",
        "conditions": np.linspace(0.3, 1, 5)
    },
}

# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases - confirm the installed version still provides it.
boston = load_boston()
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

# Tuning loop: ask ThompsonParameters for a candidate, fit, score on the test
# split and report the score back until it has no more candidates.
scores = []
reg = XGBRegressor(silent=1, nthread=-1)
thompson_parameters = ThompsonParameters(xbg_tuning_parameters, 80, 20)
while thompson_parameters.hasNext():
    params_obj = thompson_parameters.getParameters()
    cur_parameters = dict(params_obj["parameters"])
    # Candidates arrive as indexable values; take the first element and
    # round max_depth to an integer.
    cur_parameters["max_depth"] = int(round(cur_parameters["max_depth"][0]))
    cur_parameters["subsample"] = cur_parameters["subsample"][0]
    print(cur_parameters["max_depth"], cur_parameters["subsample"])
    reg.set_params(**cur_parameters)
    reg.fit(X_train, y_train)

    score = reg.score(X_test, y_test)
    thompson_parameters.setScore(params_obj, score)

    scores.append(score)

# zip() is lazy on Python 3, so printing it directly shows only
# "<zip object ...>"; materialise the pairs to print the actual values.
print(
    list(
        zip(thompson_parameters.bayes_opt["max_depth"].X,
            thompson_parameters.bayes_opt["max_depth"].Y)))
plt.plot(scores)
plt.show()
示例#11
0
def main():
    """Train one ``XGBRegressor`` per batch of five files and pickle them all.

    The gzip CSV files under ``train_join_all_5_cl_qua`` are read in sorted
    order. Files are stacked until five have accumulated; a model is then
    trained on the stack and the ensemble built so far is scored both on the
    stack and on the file that triggered training. That file then seeds the
    next stack. The list of models is written to
    ``list_xgb_model_5_cl_qua.pkl``.
    """
    chunk_paths = sorted(
        glob.glob(os.path.join(DATA_DIR, 'train_join_all_5_cl_qua/*gz')))

    first_chunk = pandas.read_csv(chunk_paths[0], compression='gzip')
    first_chunk = first_chunk.fillna(0)
    data = first_chunk[LIST_FEATURE_COLUMN_NAME].values
    target = first_chunk[TARGET_COLUMN_NAME].values

    # Only referenced by the disabled grid search kept below for reference;
    # the training loop creates its own models.
    model = XGBRegressor(seed=0)
    # params = {'max_depth': [3, 5, 10],
    #           'learning_rate': [0.01, 0.1, 1],
    #           'min_child_weight': [0.01, 0.1, 1],
    #           'subsample': [0.1, 0.5, 1],
    #           'colsample_bytree': [0.3, 0.5, 1],
    #           }
    # cv = GridSearchCV(model, params, scoring=bimbo_scoring, n_jobs=3,
    #                   refit=False, verbose=10)
    # cv.fit(data, target)

    params = {
        'subsample': 1,
        'learning_rate': 0.1,
        'colsample_bytree': 0.5,
        'max_depth': 13,
        'min_child_weight': 0.01
    }
    logger.info('best_params: %s' % params)

    list_estimator = []

    def _ensemble_score(features, expected):
        # Average the predictions of all models trained so far, replace
        # negative values by zero, then score against the expected targets.
        averaged = numpy.mean(
            [est.predict(features) for est in list_estimator], axis=0)
        clipped = numpy.where(averaged < 0, 0, averaged)
        return bimbo_score_func(clipped, expected)

    stacked_extra = 0
    for i, path in enumerate(chunk_paths[1:], start=1):
        logger.info('%s: %s' % (i, path))
        test_df = pandas.read_csv(path, compression='gzip')
        test_df = test_df.fillna(0)
        test_data = test_df[LIST_FEATURE_COLUMN_NAME].values
        test_target = test_df[TARGET_COLUMN_NAME].values

        # Keep stacking until four extra files sit on top of the seed file.
        if stacked_extra < 4:
            data = numpy.r_[data, test_data]
            target = numpy.r_[target, test_target]
            stacked_extra += 1
            continue
        stacked_extra = 0

        model = XGBRegressor(seed=0)
        model.set_params(**params)
        model.fit(data, target)
        list_estimator.append(model)

        score = _ensemble_score(data, target)
        logger.info('INSAMPLE score: %s' % score)

        score = _ensemble_score(test_data, test_target)
        logger.info('score: %s' % score)

        # The file just used for evaluation seeds the next training stack.
        data = test_data
        target = test_target

    with open('list_xgb_model_5_cl_qua.pkl', 'wb') as f:
        pickle.dump(list_estimator, f, -1)
#%%
# After the hyper-parameter search, train on all the data with
# cross-validation to compare classification, regression and stacking and
# see which one performs best.
# NOTE(review): random_state is passed without shuffle=True; newer
# scikit-learn versions are understood to reject that combination - confirm
# against the installed version.
cv = StratifiedKFold(5, random_state=model_random_state)
# NOTE(review): no copy is taken here, so the three overrides below
# presumably also modify gridsearch.best_params_ itself - verify.
updated_dict = gridsearch.best_params_
updated_dict['learning_rate'] = .1
updated_dict['n_estimators'] = 800
updated_dict['min_child_weight'] = 50

#%%
print('===============XGboost regression with rounding===============')
# Regressor scored with quadratic_weighted_kappa_round; the searched
# parameters (with the overrides above) are applied on top of the
# constructor arguments.
xgbr = XGBRegressor(random_state=model_random_state,
                    n_jobs=-1,
                    early_stopping_rounds=80)
xgbr.set_params(**updated_dict)
xgbr_scores = evaluation.cv_scores(xgbr,
                                   X_train,
                                   y_train,
                                   cv=cv,
                                   scoring=quadratic_weighted_kappa_round,
                                   return_estimator=True)

# Scores recorded from a previous run:
# train mean of score: 0.6473931873941305
# train std of score: 0.001041262887225388
# test mean of score: 0.6063831053298916
# test std of score: 0.003201456042199307

#%%
print('===============XGboost regression with decision tree===============')
#It is very easy for Stacking to get overfitted, so we reduce the model complexity here
                  subsample=0.8,
                  colsample_bytree=0.8,
                  objective='reg:gamma',
                  nthread=4,
                  scale_pos_weight=1,
                  seed=27)

# Choose the number of boosting rounds for `gb` with xgboost's built-in
# cross-validation (10 folds, MAE metric, early stopping after 50 rounds).
xgb_param = gb.get_xgb_params()
# The CV matrix is built from the full `df`, not from x_train/y_train.
xgtrain = xgb.DMatrix(df[features].values, label=df['SPEED_AVG'].values)
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=gb.get_params()['n_estimators'],
                  nfold=10,
                  metrics='mae',
                  early_stopping_rounds=50)
# The row count of the CV result becomes the new n_estimators
# (presumably one row per boosting round kept - confirm for this version).
gb.set_params(n_estimators=cvresult.shape[0])

# Final fit on the training split with the tuned number of rounds.
gb.fit(x_train, y_train, eval_metric='mae')


def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error of ``y_pred``, in percent.

    Each error is divided by its true value, so a zero in ``y_true`` gives
    an infinite or NaN result.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100


# In-sample predictions of the fitted model.
predictions = gb.predict(x_train)

print("MAE Score (Train): %f" % mean_absolute_error(y_train, predictions))

# The "test" figure is the cross-validated test MAE of the last boosting
# round. .iloc[-1] yields a scalar; the previous .tail(1) handed a
# one-element Series to "%f", relying on the implicit Series-to-float
# conversion that pandas has deprecated.
print("MAE Score (Test): %f" % cvresult['test-mae-mean'].iloc[-1])