def fit_xgboost_regression(self):
    """Tune an XGBoost regressor with a randomized search and keep the winner.

    When ``self.X_val`` is not None the validation split is stacked under the
    training split so the search is run on all labelled rows; otherwise the
    training split is used as-is.

    Side effects:
        ``self.gs_xgboost`` is set to the fitted ``RandomizedSearchCV`` and
        ``self.xgboost_reg_model`` to its ``best_estimator_``.

    Returns:
        The best estimator found by the search.
    """
    if self.X_val is not None:
        X_train_aux = pd.concat([
            pd.DataFrame(self.X_train.copy()),
            pd.DataFrame(self.X_val.copy()),
        ])
        stacked_targets = pd.concat([
            pd.DataFrame(self.y_train.copy()),
            pd.DataFrame(self.y_val.copy()),
        ])
        # Flatten the stacked targets into a 1-D Series with a fresh index.
        n_rows = self.y_train.shape[0] + self.y_val.shape[0]
        y_train_aux = pd.Series(stacked_targets.values.reshape(n_rows))
    else:
        X_train_aux = self.X_train
        y_train_aux = self.y_train

    # The scikit-learn wrapper's thread-count keyword is ``n_jobs``; the
    # previous ``nthreads`` spelling is not a recognised parameter.
    xgbreg = XGBRegressor(n_jobs=-1)
    params = {
        "max_depth": list(range(5, 55, 5)),
        "learning_rate": [0.001, 0.01, 0.1],
        "gamma": list(range(1, 20)),
        "n_estimators": [i * 10 for i in range(5, 55, 5)],
    }
    self.gs_xgboost = RandomizedSearchCV(xgbreg, params, n_jobs=-1, verbose=2)
    self.gs_xgboost.fit(X_train_aux, y_train_aux)
    self.xgboost_reg_model = self.gs_xgboost.best_estimator_
    return self.xgboost_reg_model
def generate_XGB_model(train_df):
    """Prepare ``train_df`` and fit an XGBoost model on it.

    Steps, in order:
      1. Drop the ``conversionTime`` column (in place on the caller's frame).
      2. Run ``train_model_for_appcounts`` on the frame and persist the model
         it returns to ``XGB_missing.model``.
      3. Drop four demographic/placement columns from the returned frame.
      4. Print diagnostics, then fit an ``XGBRegressor`` using column 0 as the
         target and the remaining columns as features.
      5. Print a table of feature importances.

    :param train_df: training DataFrame; its first column (after the drops)
        is used as the target.
    :return: the fitted ``XGBRegressor``.
    """
    train_df.drop(['conversionTime'], axis=1, inplace=True)

    print('Train And Fix Missing App Count Value...')
    train_df, xgb_appcount = train_model_for_appcounts(train_df)
    joblib.dump(xgb_appcount, 'XGB_missing.model')
    # The analogous age step (train_model_for_age -> 'XGB_age.model') is
    # currently disabled.

    train_df.drop(['marriageStatus', 'haveBaby', 'sitesetID', 'positionType'],
                  axis=1, inplace=True)
    print('Done')
    print(train_df.info())
    print(train_df.describe())
    print(train_df.isnull().sum())

    # ``DataFrame.as_matrix()`` no longer exists in current pandas;
    # ``.values`` yields the same ndarray on every version.
    train_np = train_df.values
    y = train_np[:, 0]
    X = train_np[:, 1:]

    print('Train Xgboost Model...')
    start_time = datetime.datetime.now()
    xbg_clf = XGBRegressor(n_estimators=100,
                           max_depth=6,
                           objective="binary:logistic",
                           silent=False)
    xbg_clf.fit(X, y)
    end_time = datetime.datetime.now()
    # total_seconds() does not wrap around after one day like ``.seconds``.
    print('Training Done..., Time Cost: %d'
          % (end_time - start_time).total_seconds())

    model_df = pd.DataFrame({
        'columns': list(train_df.columns)[1:],
        'values': xbg_clf.feature_importances_,
    })
    print(model_df)
    return xbg_clf
def FI_xgb_sklearn():
    """Rank features by XGBoost importance against a random-noise column.

    Loads the training data via ``load_traindata(encodetype='le')``, appends
    one column of standard-normal noise named ``'random'``, fits an
    ``XGBRegressor`` and sorts the features by absolute importance. The rows
    ranked at or above the noise column are printed.

    Returns:
        DataFrame with columns ``Feature`` and ``Importance``, sorted by
        descending absolute importance.
    """
    X, y = load_traindata(encodetype='le')
    feature_names = list(X.columns) + ['random']
    noise = np.random.randn(X.shape[0])
    X = np.column_stack((X, noise))

    model_kwargs = dict(
        learning_rate=0.01,
        n_estimators=3320,
        max_depth=3,
        min_child_weight=4,
        colsample_bytree=0.8,
        subsample=0.8,
        importance_type='total_gain',
        objective='reg:linear',
        n_jobs=-1,
        random_state=0,
        seed=27,
        silent=True,
    )
    xgb1 = XGBRegressor(**model_kwargs)
    xgb1.fit(X, y)

    ranked = sorted(zip(feature_names, xgb1.feature_importances_),
                    key=lambda pair: abs(pair[1]),
                    reverse=True)
    imp = pd.DataFrame(ranked, columns=['Feature', 'Importance'])

    # Position of the noise feature in the ranking.
    rnd_idx = np.argwhere(imp['Feature'] == 'random')[0][0]
    print(imp.iloc[:rnd_idx + 1])
    return imp
def skl_cv(self):
    """Grid-search ``self.rf`` and replace it with a freshly configured model.

    ``self.model`` selects the task: ``'C'`` searches with accuracy scoring
    and rebuilds an ``XGBClassifier``; ``'R'`` searches with negative mean
    absolute error and rebuilds an ``XGBRegressor``. Any other value only
    emits the opening log line and leaves ``self.rf`` untouched.

    The replacement model is *unfitted* and carries only the five tuned
    parameters, so ``self.cv_param`` must contain all of ``n_estimators``,
    ``max_depth``, ``min_child_weight``, ``gamma`` and ``learning_rate``
    (a missing one raises ``KeyError``).
    """
    logging.info("{0}:正在进行网格搜索".format(self.now_time()))

    if self.model == 'C':
        estimator_cls, scoring, score_name = XGBClassifier, 'accuracy', 'acc'
    elif self.model == 'R':
        # The score is negative MAE; it was previously logged as R-squared.
        estimator_cls, scoring, score_name = (
            XGBRegressor, 'neg_mean_absolute_error', 'neg_MAE')
    else:
        return

    grid_search = GridSearchCV(estimator=self.rf,
                               param_grid=self.cv_param,
                               scoring=scoring)
    grid_search.fit(self.X_train, self.Y_train)
    best = grid_search.best_params_

    logging.info("{0}:最优参数:{1}".format(self.now_time(), best))
    logging.info("{0}:最优参数{2}结果:{1}".format(
        self.now_time(), grid_search.best_score_, score_name))

    tuned_keys = ('n_estimators', 'max_depth', 'min_child_weight',
                  'gamma', 'learning_rate')
    self.rf = estimator_cls(**{key: best[key] for key in tuned_keys})
def fit_model(self, data, target, test):
    """Fit an XGBoost regressor on min-max scaled data and add its outputs.

    ``data`` is cast to a float array and min-max scaled; ``test`` is scaled
    with the statistics learned from ``data``; ``target`` is min-max scaled
    on its own. The output of ``clf.apply`` for each set is merged with the
    scaled features through ``self.mergeToOne``.

    Returns:
        ``(X_train_new, target, X_test_new)`` where both feature sets are
        DataFrames and ``target`` is the scaled target.
    """
    booster = XGBRegressor(
        learning_rate=self.learning_rate,
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        min_child_weight=self.min_child_weight,
        gamma=self.gamma,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        objective=self.objective,
        nthread=self.nthread,
        scale_pos_weight=self.scale_pos_weight,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        seed=self.seed,
    )

    feature_scaler = MinMaxScaler()
    data = feature_scaler.fit_transform(np.array(data).astype(float))
    test = feature_scaler.transform(test)
    # The target gets its own min/max range, independent of the features.
    target = MinMaxScaler().fit_transform(target)

    booster.fit(data, target)
    train_leaves = booster.apply(data)
    test_leaves = booster.apply(test)

    X_train_new = pd.DataFrame(
        self.mergeToOne(pd.DataFrame(data), train_leaves))
    X_test_new = pd.DataFrame(
        self.mergeToOne(pd.DataFrame(test), test_leaves))
    return X_train_new, target, X_test_new
def __init__(self,
             mean_model_params=None,
             upper_quantile_params=None,
             lower_quantile_params=None):
    """Build a mean model plus upper- and lower-quantile models.

    :param mean_model_params: keyword arguments for the mean
        ``XGBRegressor``; defaults to no arguments. Everything except
        ``'alpha'`` is also passed to both quantile models.
    :param upper_quantile_params: settings for the upper
        ``XGBQuantileRegressor``; defaults to alpha=0.95 with delta, thresh
        and variance all 1.0. Must contain ``'alpha'``.
    :param lower_quantile_params: same for the lower model; defaults to
        alpha=0.05. Must contain ``'alpha'``.

    The dictionaries passed in are stored on the instance but never mutated
    (earlier code popped ``'alpha'`` out of the caller's
    ``mean_model_params``), and the defaults are rebuilt per call so
    instances do not share one dict.
    """
    if mean_model_params is None:
        mean_model_params = {}
    if upper_quantile_params is None:
        upper_quantile_params = {
            'alpha': 0.95, 'delta': 1.0, 'thresh': 1.0, 'variance': 1.0
        }
    if lower_quantile_params is None:
        lower_quantile_params = {
            'alpha': 0.05, 'delta': 1.0, 'thresh': 1.0, 'variance': 1.0
        }

    self.mean_model_params = mean_model_params
    self.upper_quantile_params = upper_quantile_params
    self.lower_quantile_params = lower_quantile_params

    self.gb = XGBRegressor(**mean_model_params)

    # The quantile models take their own 'alpha', so the mean model's one is
    # left out of the shared settings. Filtering builds a new dict.
    shared_params = {
        key: value
        for key, value in mean_model_params.items()
        if key != 'alpha'
    }
    upper_combined = {**shared_params, **upper_quantile_params}
    lower_combined = {**shared_params, **lower_quantile_params}

    self.gb_quantile_upper = XGBQuantileRegressor(**upper_combined)
    self.gb_quantile_lower = XGBQuantileRegressor(**lower_combined)
    self.upper_alpha = upper_quantile_params['alpha']
    self.lower_alpha = lower_quantile_params['alpha']
def set_grid_search(regrs, X_train, y_train, reg):
    """Search hyper-parameters for ``reg`` and return a new configured model.

    :param regrs: model family: ``'tree'``, ``'forest'``, ``'xgbr'`` or
        ``'nn'``.
    :param X_train: training features passed to ``grid_search``.
    :param y_train: training targets passed to ``grid_search``.
    :param reg: estimator handed to ``grid_search`` for tuning.
    :return: a new, unfitted estimator of the chosen family built from the
        parameters ``grid_search`` returned.
    :raises ValueError: if ``regrs`` is not one of the supported names
        (this previously surfaced as an ``UnboundLocalError``).
    :raises KeyError: if the search result lacks an expected parameter.
    """
    def _pick(found, names):
        # Keep only the parameters the target constructor is given.
        return {name: found[name] for name in names}

    if regrs == 'tree':
        prms = grid_search(reg, X_train, y_train, build_grid_tree())
        return DecisionTreeRegressor(**_pick(prms, (
            'max_features', 'max_depth', 'min_samples_split',
            'max_leaf_nodes', 'min_samples_leaf')))

    if regrs == 'forest':
        prms = grid_search(reg, X_train, y_train, build_grid_rf())
        return RandomForestRegressor(n_jobs=-1, **_pick(prms, (
            'n_estimators', 'max_features', 'max_depth',
            'min_samples_split', 'min_samples_leaf')))

    if regrs == 'xgbr':
        prms = grid_search(reg, X_train, y_train, build_grid_xgbr())
        return XGBRegressor(n_jobs=-1, **_pick(prms, (
            'learning_rate', 'max_depth', 'min_child_weight',
            'n_estimators', 'subsample')))

    if regrs == 'nn':
        prms = grid_search(reg, X_train, y_train, build_grid_nn())
        return MLPRegressor(**_pick(prms, (
            'hidden_layer_sizes', 'activation', 'solver', 'alpha',
            'learning_rate_init', 'learning_rate', 'max_iter', 'tol',
            'momentum', 'beta_1', 'beta_2', 'n_iter_no_change')))

    raise ValueError(
        "unknown regressor type %r; expected 'tree', 'forest', 'xgbr' or 'nn'"
        % (regrs,))
def train(self, X_train, X_test, y_train, y_test):
    '''
    Fit an XGBoost regressor on the training split, watching both splits.

    The fitted model is stored in ``self.model`` and pickled to
    ``generated/gxboost_model.pickle``.

    :param X_train: training features
    :param X_test: held-out features, also used as the second eval set
    :param y_train: training targets
    :param y_test: held-out targets
    :return: whatever ``self.evaluate(y_test, X_test)`` returns
    '''
    print('Training is starting...')
    eval_set = [(X_train, y_train), (X_test, y_test)]

    # NOTE(review): n_estimators is left at the library default while
    # early_stopping_rounds=500 -- confirm early stopping can ever trigger.
    self.model = XGBRegressor(max_depth=7,
                              objective='reg:squarederror',
                              gamma=0,
                              learning_rate=0.03,
                              subsample=1,
                              colsample_bytree=0.9,
                              min_child_weight=10)
    self.model.fit(X_train,
                   y_train,
                   eval_set=eval_set,
                   eval_metric="rmse",
                   early_stopping_rounds=500)

    # Result unused here; the call is kept in case predict() records state.
    self.predict(X_test)

    with open('generated/gxboost_model.pickle', 'wb') as file:
        pickle.dump(self.model, file)

    # Hand the scores back to the caller instead of dropping them.
    return self.evaluate(y_test, X_test)
def grid_search(self, X_train, X_test, y_train, y_test):
    """Run an exhaustive 5-fold grid search and print the best parameters.

    The search covers max_depth, gamma, learning_rate, subsample,
    colsample_bytree and min_child_weight, scored by negative mean squared
    error. ``X_test`` and ``y_test`` are accepted but not used. Nothing is
    returned; the best parameter set is only printed.
    """
    search_space = {
        'max_depth': list(range(2, 10)),
        'gamma': np.arange(0, 0.5, 0.1),
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'subsample': np.arange(0.5, 0.9, 0.1),
        'colsample_bytree': np.arange(0.5, 0.9, 0.1),
        'min_child_weight': [1, 3, 5, 7],
    }

    # Starting values; every one that is also in the grid gets overridden.
    base_model = XGBRegressor(max_depth=7,
                              objective='reg:squarederror',
                              gamma=0,
                              learning_rate=0.03,
                              subsample=1,
                              colsample_bytree=0.9,
                              min_child_weight=10)

    searcher = GridSearchCV(estimator=base_model,
                            param_grid=search_space,
                            scoring='neg_mean_squared_error',
                            cv=5,
                            n_jobs=-1)
    searcher.fit(X_train, y_train)
    print(searcher.best_params_)
def xgbregressor(xtrain, y_train, x_test):
    """Fit an XGBoost regressor via GridSearchCV and predict both sets.

    The grid holds a single value per parameter, so the search amounts to a
    2-fold cross-validated fit followed by a refit on all of ``xtrain``.
    Test-set predictions and their summary statistics are printed.

    :param xtrain: training features
    :param y_train: training targets
    :param x_test: features to predict and summarise
    :return: predictions of the fitted model on ``xtrain``
    """
    xgb_reg = XGBRegressor()
    parameters = {
        'nthread': [4],
        'objective': ['reg:linear'],
        'learning_rate': [.07],
        'max_depth': [7],
        'min_child_weight': [4],
        'silent': [1],
        'subsample': [0.7],
        'colsample_bytree': [0.7],
        'n_estimators': [12],
    }
    clf_xgbreg = GridSearchCV(xgb_reg, parameters, n_jobs=5, cv=2,
                              verbose=True)

    # The parameter is ``xtrain``; the body used an undefined ``x_train``.
    clf_xgbreg.fit(xtrain, y_train)

    # Predict with the model fitted above; the body referenced an undefined
    # ``clf_RF`` here.
    preds = clf_xgbreg.predict(xtrain)

    y_test_pred = clf_xgbreg.predict(x_test)
    print(y_test_pred)
    print(pd.DataFrame(y_test_pred).describe())
    return preds
def over_sample(train, test, feat):
    """Predict ``test`` group by group, over-sampling each group in training.

    For every distinct value of ``train[feat]`` the rows of that group are
    appended once more to the full training set, a model is fitted on the
    result, and only the test rows of the same group are predicted.

    :param train: training frame with an ``'ID'`` column, a ``'y'`` target
        and the column named by ``feat``.
    :param test: frame to predict, with ``'ID'`` and the predictor columns.
    :param feat: name of the grouping column.
    :return: DataFrame with columns ``ID`` and ``y``, sorted by ``ID``. Test
        rows whose group never occurs in ``train`` are not included. The
        frame is empty when no group has test rows.
    """
    predictors = [col for col in train.columns if col not in ('ID', 'y')]
    frames = []

    for name in train[feat].unique():
        test_temp = test[test[feat] == name]
        if test_temp.empty:
            # Nothing to predict for this group, so skip the fit entirely.
            continue

        train_temp = pd.concat([train, train[train[feat] == name]])
        model = XGBRegressor(max_depth=4,
                             learning_rate=0.0045,
                             n_estimators=1250,
                             silent=True,
                             objective='reg:linear',
                             nthread=-1,
                             min_child_weight=1,
                             max_delta_step=0,
                             subsample=0.93,
                             seed=27)
        model.fit(train_temp[predictors], train_temp['y'])
        pred = model.predict(test_temp[predictors])
        frames.append(
            pd.DataFrame({'ID': test_temp['ID'].values, 'y': pred}))

    if not frames:
        return pd.DataFrame({'ID': [], 'y': []})

    # One concat at the end instead of growing the result inside the loop.
    result = pd.concat(frames)
    result.sort_values('ID', inplace=True)
    return result
def get_estimator():
    """Assemble the preprocessing + XGBoost regression pipeline.

    The listed base columns go through ``_preprocessor`` (wrapped in a
    ``FunctionTransformer``) and a most-frequent imputer; the identifier
    columns are dropped; every other column is passed through unchanged.

    Returns:
        An unfitted ``Pipeline`` with steps ``'preprocessing'`` and
        ``'Regressor'``.
    """
    columns_to_drop = [
        'CODGEO', 'LIBGEO', 'REG', 'DEP', 'Code Nuance', 'Code du département'
    ]
    columns_to_clean = [
        'Orientation Economique', 'SEG Croissance POP', 'Urbanité Ruralité',
        'Dynamique Démographique BV', 'Environnement Démographique',
        'Fidélité', 'SYN MEDICAL', 'Seg Dyn Entre',
        'SEG Environnement Démographique Obsolète', 'Seg Cap Fiscale',
        'DYN SetC', 'CP', 'MED14', 'Nb Femme', 'Nb Homme'
    ]

    clean_and_impute = make_pipeline(
        FunctionTransformer(_preprocessor, validate=False),
        SimpleImputer(strategy='most_frequent'),
    )

    # remainder may be 'drop' or 'passthrough'; untouched columns are kept.
    column_stage = ColumnTransformer(
        transformers=[
            ('base', clean_and_impute, columns_to_clean),
            ('drop cols', 'drop', columns_to_drop),
        ],
        remainder='passthrough',
    )

    return Pipeline(steps=[
        ('preprocessing', column_stage),
        ('Regressor', XGBRegressor()),
    ])
def model_xgb_search(self, X, Y):
    """Grid-search an XGBRegressor and dump the fitted search to disk.

    Searches max_depth (2..13), five learning rates between 0.03 and 0.1 and
    three n_estimators values with shuffled 5-fold stratified splits, prints
    the best result and the elapsed time, then writes the fitted
    ``GridSearchCV`` object to ``self.model_path`` with ``dump``.
    """
    print('model_xgb_search start')
    estimator = XGBRegressor()

    search_space = dict(
        max_depth=list(range(2, 14)),
        learning_rate=np.linspace(0.03, 0.1, 5),
        n_estimators=[100, 200, 300],
    )

    started_at = time.time()
    # NOTE(review): stratified folds need discrete targets -- confirm Y holds
    # class-like labels even though a regressor is fitted.
    folds = StratifiedKFold(n_splits=5, shuffle=True)
    grid_result = GridSearchCV(estimator, search_space, cv=folds).fit(X, Y)

    summary = (grid_result.best_score_,
               grid_result.best_params_,
               grid_result.best_estimator_)
    print("Best: %f using params: %s estimator: %s" % summary)
    print('GridSearchCV process use %.2f seconds'
          % (time.time() - started_at))

    print("Save model to " + self.model_path)
    dump(grid_result, self.model_path)
    print('end=======')
def xgboostmodel(self):
    """Train an XGBoost regressor on ``datafile`` and record the results.

    The CSV named by the module-level ``datafile`` is read with its first
    column as the index; the last remaining column is the target and the
    others are features. After a 70/30 split the model is fitted, saved to
    ``self.model_file`` and used to predict the held-out part. True and
    predicted values are stored on ``self.true`` / ``self.pred``, a figure is
    produced and the two means are written via ``self.save_result``.

    Model settings start from max_depth=128, n_estimators=768 and
    learning_rate=0.01 (the values used so far); entries in ``self.params``
    now override them. Previously ``self.params`` was read into a local that
    was never passed to the model.
    """
    df = pd.read_csv(datafile, encoding='utf-8', index_col=0)
    print(df.shape)

    traindata = df.values
    x = traindata[:, :-1]
    y = traindata[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7)

    model_kwargs = {
        'max_depth': 128,
        'n_estimators': 768,
        'learning_rate': 0.01,
    }
    if self.params is not None:
        model_kwargs.update(self.params)

    # The former ``silence=False`` argument is not an XGBRegressor parameter
    # and has been dropped.
    raw_model = XGBRegressor(**model_kwargs)
    raw_model.fit(x_train, y_train)
    raw_model.save_model(self.model_file)

    self.true = y_test
    self.pred = raw_model.predict(x_test)

    self.show_save_figure(fig_path=self.fig_path,
                          modelname=self.job_name,
                          detal_idx=500)
    t_mean = self.cal_mean(self.true)
    p_mean = self.cal_mean(self.pred)
    self.save_result(self.result_path, true_mean=t_mean, pred_mean=p_mean)
def fit_model_split(self, X_train, y_train, X_test, y_test):
    """Fit on one part of the training data and add features to the rest.

    The training data is split 40/60 with a fixed seed. The regressor is
    fitted on the 40% part; the 60% part and ``X_test`` are each merged with
    the output of ``clf.apply`` through ``self.mergeToOne``. Only the 60%
    part is returned as training data, so the returned training set is
    smaller than the input.

    Returns:
        ``(X_train_new2, y_train_2, X_test_new, y_test)``.
    """
    # X_train_1 fits the model; X_train_2 is combined with the new features
    # to form the new training set.
    X_train_1, X_train_2, y_train_1, y_train_2 = train_test_split(
        X_train, y_train, test_size=0.6, random_state=0)

    clf = XGBRegressor(learning_rate=self.learning_rate,
                       n_estimators=self.n_estimators,
                       max_depth=self.max_depth,
                       min_child_weight=self.min_child_weight,
                       gamma=self.gamma,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       objective=self.objective,
                       nthread=self.nthread,
                       scale_pos_weight=self.scale_pos_weight,
                       reg_alpha=self.reg_alpha,
                       reg_lambda=self.reg_lambda,
                       seed=self.seed)
    clf.fit(X_train_1, y_train_1)

    new_feature = clf.apply(X_train_2)
    X_train_new2 = self.mergeToOne(X_train_2, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)

    # print() call form: the bare print statement is Python 2 only.
    print("Training set of sample size 0.4 fewer than before")
    return X_train_new2, y_train_2, X_test_new, y_test
def model_intrv3(Y_train, X_train, Y_test, X_test, Targ):
    """Fit an XGBoost regressor and report MSE and mean-normalised RMSE.

    :param Y_train: training targets
    :param X_train: training features
    :param Y_test: held-out targets
    :param X_test: held-out features
    :param Targ: series whose mean normalises the RMSE
    :return: dict with ``'Yts_pd'`` (DataFrame of true ``Yts`` vs predicted
        ``Ypd``), ``'mse'`` and ``'nRMSE'``

    Side effects: prints ``mse`` and ``nRMSE`` and rebinds the module-level
    globals ``metrs`` and ``reslts``.
    """
    global reslts
    global metrs
    import numpy as np
    import pandas as pd
    from sklearn.metrics import mean_squared_error
    from xgboost.sklearn import XGBRegressor

    # ``loss='ls'`` was passed here before; it is not an XGBRegressor
    # parameter, so it has been removed.
    model = XGBRegressor(n_estimators=200,
                         learning_rate=0.05,
                         max_depth=4,
                         random_state=0,
                         subsample=0.9,
                         colsample_bytree=1.0).fit(X_train, Y_train)

    pred_Yxgb = model.predict(X_test)
    mse = mean_squared_error(Y_test, pred_Yxgb)
    # Normalised by the target mean (the alternative would be its maximum).
    nRMSE = np.sqrt(mse) / Targ.mean()

    Yts_pd = pd.DataFrame({'Yts': Y_test, 'Ypd': pred_Yxgb})
    print(mse, nRMSE)

    metrs = {'mse': mse, 'nRMSE': nRMSE}
    reslts = {'Ypred': pred_Yxgb, 'Yts_pd': Yts_pd}
    return {'Yts_pd': Yts_pd, 'mse': mse, 'nRMSE': nRMSE}
def fit_model(self, X_train, y_train, X_test, y_test):
    """Fit on the full training set and add the model's outputs as features.

    The regressor is fitted on ``X_train``/``y_train``; both feature sets are
    then merged with the output of ``clf.apply`` through ``self.mergeToOne``.
    The number of training rows is unchanged.

    Returns:
        ``(X_train_new, y_train, X_test_new, y_test)``.
    """
    clf = XGBRegressor(learning_rate=self.learning_rate,
                       n_estimators=self.n_estimators,
                       max_depth=self.max_depth,
                       min_child_weight=self.min_child_weight,
                       gamma=self.gamma,
                       subsample=self.subsample,
                       colsample_bytree=self.colsample_bytree,
                       objective=self.objective,
                       nthread=self.nthread,
                       scale_pos_weight=self.scale_pos_weight,
                       reg_alpha=self.reg_alpha,
                       reg_lambda=self.reg_lambda,
                       seed=self.seed)
    clf.fit(X_train, y_train)

    new_feature = clf.apply(X_train)
    X_train_new = self.mergeToOne(X_train, new_feature)
    new_feature_test = clf.apply(X_test)
    X_test_new = self.mergeToOne(X_test, new_feature_test)

    # print() call form: the bare print statement is Python 2 only.
    print("Training set sample number remains the same")
    return X_train_new, y_train, X_test_new, y_test
def train_xgb(df_preprocessed, df_target):
    """
    Cross-validate and fit an XGBoost regressor on the data.

    A one-point grid (``n_estimators=1000``) is run through a 3-fold
    ``GridSearchCV`` scored by negative MSE; the square root of the negated
    best score is printed and returned as the RMSE.

    :param df_preprocessed: features
    :param df_target: target
    :return: a tuple of best estimator and its cross-validated RMSE
    """
    base_model = XGBRegressor(
        nthread=4,
        objective='reg:linear',
        learning_rate=0.02,  # the so-called `eta` value
        max_depth=10,
        min_child_weight=1,
        gamma=3,
        subsample=1.0,
        colsample_bytree=0.35,
    )
    search = GridSearchCV(base_model,
                          {'n_estimators': [1000]},
                          cv=3,
                          verbose=1,
                          n_jobs=-1,
                          scoring='neg_mean_squared_error')
    search.fit(df_preprocessed, df_target)

    # best_score_ is a negative MSE; flip the sign before the square root.
    rmse = np.sqrt(-search.best_score_)
    print(rmse)
    return search.best_estimator_, rmse
def def_model(self, parameters: dict = None):
    """Create a new ``XGBRegressor`` and store it on ``self._model``.

    :param parameters: optional settings applied with ``set_params``; when
        None the regressor keeps its constructor defaults.
    """
    regressor = XGBRegressor()
    if parameters is None:
        self._model = regressor
        return
    regressor.set_params(**parameters)
    self._model = regressor
def XGB_reg_evaluation(individual, evaluation_method='roll_win'):
    '''
    Score one hyper-parameter candidate by mean squared error.

    ``individual`` is indexed as: 0 learning rate (eta), 1 min_child_weight,
    2 max_depth, 3 subsample, 4 colsample_bylevel, 5 number of boosting
    rounds, 6 training-window length (read for ``roll_win`` only).

    evaluation_method : can be roll_win, mse
        roll_win -- rolling windows over the module-level ``trainX`` /
                    ``trainY``, averaged over ``N_validation`` windows
        mse      -- K-fold cross validation with ``N_splits`` folds

    Returns a one-element tuple holding the averaged MSE. Raises
    ``Exception`` for any other ``evaluation_method``.
    '''
    if evaluation_method == 'roll_win':
        trainNumber = individual[6]  # the train num
        param = {
            'eta': individual[0],
            'silent': True,
            'objective': "reg:linear",
            'nthread': -1,
            'min_child_weight': individual[1],
            'max_depth': individual[2],
            'subsample': individual[3],
            'colsample_bylevel': individual[4],
            'seed': 0
        }
        roll_win_mseValue = 0
        # range() replaces the Python 2 only xrange().
        for i in range(N_validation):
            # Test window i counts back from the end of the data; the
            # training window is the trainNumber rows just before it.
            test_start = trainNum - (i + 1) * window
            test_stop = trainNum - i * window
            train_start = test_start - trainNumber

            trainingX = trainX[train_start:test_start, :]
            trainingY = trainY[train_start:test_start]
            testingX = trainX[test_start:test_stop, :]
            testingY = trainY[test_start:test_stop]

            dtrain = xgb.DMatrix(data=trainingX, label=trainingY)
            bst = xgb.train(params=param,
                            dtrain=dtrain,
                            num_boost_round=individual[5])
            testingX = xgb.DMatrix(testingX)
            roll_win_mseValue += sum(
                (testingY - bst.predict(testingX))**2) / window
        roll_win_mseValue /= N_validation
        return (roll_win_mseValue, )

    if evaluation_method == 'mse':
        # The cross validation evaluation
        N_SPLITS = N_splits
        kf = KFold(n_splits=N_SPLITS)
        cv_mseValue = 0
        fc = XGBRegressor(learning_rate=individual[0],
                          n_estimators=individual[5],
                          silent=True,
                          objective="reg:linear",
                          nthread=-1,
                          gamma=0,
                          min_child_weight=individual[1],
                          max_depth=individual[2],
                          subsample=individual[3],
                          colsample_bylevel=individual[4],
                          seed=0)
        for train, test in kf.split(trainX):
            fc.fit(trainX[train, :], trainY[train])
            cv_mseValue += sum(
                (trainY[test] - fc.predict(trainX[test, :]))**2) / len(test)
        cv_mseValue = cv_mseValue / N_SPLITS
        return (cv_mseValue, )

    # print() call form: the bare print statement is Python 2 only.
    print("There is no evaluation method for %s" % evaluation_method)
    raise Exception("evaluation_method is not valid")
def xgboost_single_pred(self):
    """Walk-forward one-step prediction over the test set.

    For each test row in order the model is refitted on everything seen so
    far, the row is predicted, and the row with its true target is then
    appended to the training data. Predictions are collected in
    ``self.y_pred_all_xgb``.

    Returns:
        ``(rmse, predictions, ratio)`` where ``predictions`` is a one-column
        DataFrame and ``ratio`` is true value divided by prediction.
    """
    x_train = self.x_train
    y_train = list(self.y_train)
    x_test = self.x_test
    # Materialise the targets so row i is looked up by position, matching
    # the positional ``iloc`` slicing of x_test. ``y_test[i]`` on a Series
    # with a non-default index is a label lookup instead.
    y_test_values = list(self.y_test)
    self.y_pred_all_xgb = []

    xgboost_clf = XGBRegressor(learning_rate=0.1, n_estimators=75)
    for i in range(len(x_test)):
        xgboost_clf.fit(x_train, y_train)
        x_test_one = x_test.iloc[i:i + 1]
        y_test_one = xgboost_clf.predict(x_test_one)
        self.y_pred_all_xgb.append(list(y_test_one)[0])

        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        x_train = pd.concat([x_train, x_test_one])
        y_train.append(y_test_values[i])

    xgboost_mse = mean_squared_error(self.y_test, self.y_pred_all_xgb)
    xgboost_rmse = np.sqrt(xgboost_mse)
    y_pred_all_xgb = pd.DataFrame(list(self.y_pred_all_xgb))
    ratio_single_xgb = pd.DataFrame(list(self.y_test)) / y_pred_all_xgb
    return xgboost_rmse, y_pred_all_xgb, ratio_single_xgb
def __train_model(self, features):
    """Train a gradient-boosted price model and log its hold-out scores.

    Every price / availability / minimum-nights average is removed from
    ``features``; for each combo the two listed predictor averages are added
    back and the third column becomes the target. A 75/25 split with a fixed
    seed is used, and R^2 and MAE on the held-out part are logged.

    Only the yearly combo is active at present (seasonal combos are
    disabled).

    :param features: DataFrame containing all the averaged columns.
    :return: the model fitted for the last combo.
    """
    # Each entry is [predictor, predictor, target].
    combo_list = [
        ['available_year_avg', 'min_nights_year_avg', 'price_year_avg'],
    ]
    averaged_columns = [
        'price_year_avg', 'price_winter_avg', 'price_spring_avg',
        'price_summer_avg', 'price_fall_avg', 'available_year_avg',
        'available_winter_avg', 'available_spring_avg',
        'available_summer_avg', 'available_fall_avg', 'min_nights_year_avg',
        'min_nights_winter_avg', 'min_nights_spring_avg',
        'min_nights_summer_avg', 'min_nights_fall_avg'
    ]

    for combo in combo_list:
        X_base = features.drop(averaged_columns, axis=1)
        X_base[combo[0]] = features[combo[0]]
        X_base[combo[1]] = features[combo[1]]
        y = features[combo[2]]

        X_train, X_test, y_train, y_test = train_test_split(
            X_base, y, test_size=.25, random_state=42, shuffle=True)

        # ``cv=5`` was passed here before; it is not an XGBRegressor
        # parameter, so it has been removed.
        model = XGBRegressor(
            objective='reg:squarederror',
            learning_rate=0.1,
            max_depth=8,
            n_estimators=200,
            n_jobs=-1
        )
        model.fit(X_train, y_train)

        self.logger.info('Gradient boost model:')
        self.logger.info(f'Target label: {combo[2]}')
        self.logger.info(f'R^2: {model.score(X_test, y_test)}')
        self.logger.info(
            f'MAE: {mean_absolute_error(y_test, model.predict(X_test))}')

    return model
def _build(self, **config):
    """
    Apply ``config`` and instantiate the underlying XGBoost model.

    The regressor and classifier receive the same hyper parameters read from
    the instance; the classifier additionally gets the
    ``binary:logistic`` objective. ``self.model_init`` is set to True once
    the model exists.

    :param config: hyper parameters for building the model
    :return: None
    :raises ValueError: if ``self.model_type`` is neither "regressor" nor
        "classifier"
    """
    self.set_params(**config)

    if self.model_type not in ("regressor", "classifier"):
        raise ValueError("model_type can only be \"regressor\" or \"classifier\"")

    shared_kwargs = dict(
        n_estimators=self.n_estimators,
        max_depth=self.max_depth,
        n_jobs=self.n_jobs,
        tree_method=self.tree_method,
        random_state=self.random_state,
        learning_rate=self.learning_rate,
        min_child_weight=self.min_child_weight,
        seed=self.seed,
        subsample=self.subsample,
        colsample_bytree=self.colsample_bytree,
        gamma=self.gamma,
        reg_alpha=self.reg_alpha,
        reg_lambda=self.reg_lambda,
        verbosity=self.verbosity,
    )

    if self.model_type == "regressor":
        self.model = XGBRegressor(**shared_kwargs)
    else:
        self.model = XGBClassifier(objective='binary:logistic',
                                   **shared_kwargs)
    self.model_init = True
def cbd_model(cbd_df, cbd_finalinput):
    '''
    Fit a crime-count model on the cbd dataframe and predict new inputs.

    The feature columns are standardised with statistics learned from
    ``cbd_df`` only; ``cbd_finalinput`` is transformed with the same scaler
    and fed to the fitted regressor.

    :param cbd_df: history frame holding the feature columns and
        ``number_of_crimes``
    :param cbd_finalinput: rows to predict, in the same column layout as the
        features
    :return: predicted number of crimes for each row of ``cbd_finalinput``
    '''
    feature_columns = [
        'year', 'month', 'day', 'tmax', 'tmin', 'consumer_price_index',
        'gdp_millions_2007', 'seasonally_adjusted_unemployment',
        'unadjusted_unemployment', 'Possession, cocaine ',
        'Heroin, possession ', 'Heroin Price Canada',
        'day_segment_1200pm-1159pm', 'day_of_week_Monday',
        'day_of_week_Saturday', 'day_of_week_Sunday', 'day_of_week_Thursday',
        'day_of_week_Tuesday', 'day_of_week_Wednesday'
    ]
    targets = cbd_df['number_of_crimes']

    # Scaling statistics come from the training frame alone.
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(cbd_df[feature_columns])
    cbd_input_scaled = scaler.transform(cbd_finalinput)

    model_settings = dict(
        base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1,
    )
    regressor = XGBRegressor(**model_settings)
    regressor.fit(train_scaled, targets)
    return regressor.predict(cbd_input_scaled)
def get_ntree():
    """Cross-validated train/validation RMSE for a range of tree counts.

    For every ``n_estimators`` value in 10, 20, ..., 490 a regressor is
    refitted on each fold produced by ``get_cv`` over the module-level
    ``X_train`` / ``y_train``, and the RMSE on the fold's training and
    validation parts is averaged across folds.

    Returns:
        ``(rmse_t_total, rmse_v_total)``: two lists, one entry per tree
        count, holding mean training and mean validation RMSE.
    """
    rmse_t_total = []
    rmse_v_total = []

    for ntree in range(10, 500, 10):
        model = XGBRegressor(objective='reg:linear',
                             n_estimators=ntree,
                             random_state=1234,
                             silent=0,
                             booster='gbtree',
                             eval_metric='rmse')
        print('此时 ntree = %s' % ntree)

        fold_train_rmse = []
        fold_valid_rmse = []
        for fit_idx, hold_idx in get_cv(y=y_train, n_splits=5,
                                        random_state=42):
            X_fit, y_fit = X_train[fit_idx], y_train[fit_idx]
            X_hold, y_hold = X_train[hold_idx], y_train[hold_idx]

            model.fit(X_fit, y_fit)
            fit_pred = model.predict(X_fit)
            hold_pred = model.predict(X_hold)

            fold_train_rmse.append(
                np.sqrt(mean_squared_error(y_fit, fit_pred)))
            fold_valid_rmse.append(
                np.sqrt(mean_squared_error(y_hold, hold_pred)))

        rmse_t_total.append(np.mean(fold_train_rmse))
        rmse_v_total.append(np.mean(fold_valid_rmse))

    return rmse_t_total, rmse_v_total
def train_first_test(experiment_name, x_train, y_train, features):
    """Randomized-search an XGBoost regressor and persist the best model.

    Sets the module-level ``file_loc`` to ``data/<experiment_name>/`` and
    dumps the best estimator there as ``clf_bestmodel.pkl``. ``features`` is
    accepted but not used.

    :return: the best estimator found by the search
    """
    global file_loc
    file_loc = 'data/' + experiment_name + '/'

    from xgboost.sklearn import XGBRegressor
    import scipy.stats as st
    from sklearn.model_selection import RandomizedSearchCV

    # The same two frozen distributions are reused for several parameters.
    near_one = st.beta(10, 1)
    non_negative = st.expon(0, 50)
    search_space = {
        "n_estimators": st.randint(3, 15),
        "max_depth": st.randint(3, 40),
        "learning_rate": st.uniform(0.05, 0.4),
        "colsample_bytree": near_one,
        "subsample": near_one,
        "gamma": st.uniform(0, 10),
        'reg_alpha': non_negative,
        "min_child_weight": non_negative,
    }

    search = RandomizedSearchCV(XGBRegressor(), search_space, n_jobs=1)
    search.fit(x_train, y_train)

    best_model = search.best_estimator_
    joblib.dump(best_model, file_loc + 'clf_bestmodel.pkl')
    return best_model
def __init__(self, nb_classes, bags=1, param=None):
    """Create ``bags`` x ``nb_classes`` unfitted XGBoost regressors.

    :param nb_classes: number of models per bag.
    :param bags: number of bags; bag ``b`` uses seed ``seed + b``.
    :param param: optional dict of overrides for ``objective``, ``nthread``,
        ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``colsample_bytree``, ``subsample``, ``missing`` and ``seed``.
        Missing keys fall back to the defaults below. ``None`` (the default)
        means no overrides.

    ``self.bags_models`` ends up as a tuple of ``bags`` tuples, each holding
    ``nb_classes`` models.
    """
    from xgboost.sklearn import XGBRegressor

    if param is None:
        param = {}

    self.nb_classes = nb_classes
    self.objective = param.get('objective', 'reg:linear')
    self.nthread = param.get('nthread', -1)
    self.n_estimators = param.get('n_estimators', 10)
    self.max_depth = param.get('max_depth', 6)
    self.learning_rate = param.get('learning_rate', 0.3)
    self.colsample_bytree = param.get('colsample_bytree', 1.0)
    self.subsample = param.get('subsample', 1.0)
    self.missing = param.get('missing', None)
    self.seed = param.get('seed', 0)
    self.bags = bags
    self.train_y = None

    # Built in one pass rather than by repeated tuple concatenation.
    self.bags_models = tuple(
        tuple(
            XGBRegressor(objective=self.objective,
                         nthread=self.nthread,
                         seed=self.seed + bag,
                         n_estimators=self.n_estimators,
                         missing=self.missing,
                         max_depth=self.max_depth,
                         learning_rate=self.learning_rate,
                         colsample_bytree=self.colsample_bytree,
                         subsample=self.subsample)
            for _ in range(self.nb_classes))
        for bag in range(self.bags))
def learn_model(X_train, y_train, X_valid, y_valid):
    """Fit an XGBoost regressor with early stopping and report the time.

    Both the training and the validation split are passed as evaluation
    sets with RMSE as metric; training is asked to stop after 10 rounds
    without improvement. The elapsed wall-clock time is printed.

    :return: the fitted model
    """
    started = time()

    watchlist = [(X_train, y_train), (X_valid, y_valid)]
    regressor = XGBRegressor(max_depth=7, n_estimators=500)
    regressor.fit(X_train,
                  y_train,
                  eval_set=watchlist,
                  eval_metric="rmse",
                  early_stopping_rounds=10,
                  verbose=True)

    elapsed = time() - started
    print('Total of training time: ', elapsed)
    return regressor
def tun_reg_alpha(reg_alpha_range, param_data_path, train_x, train_y):
    '''
    Tune the reg_alpha param in xgboost.

    Loads the current parameters, grid-searches ``reg_alpha`` with 5-fold
    cross validation scored by negative MSE, prints every candidate's
    result, then writes the best value back to the parameter file for
    further tuning.

    :param reg_alpha_range: the range of reg_alpha you want to test
    :param param_data_path: default './../data/param_data.pkl'
    :param train_x: training features
    :param train_y: training targets
    :return: void
    '''
    # get the newest param first
    param_dict = get_param_data(param_data_path=param_data_path)
    print("正则化参数reg_alpha调优")

    param_test1 = {'reg_alpha': reg_alpha_range}
    # ``iid`` is no longer a GridSearchCV argument in current scikit-learn,
    # so it is not passed.
    gsearch1 = GridSearchCV(estimator=XGBRegressor(**param_dict),
                            param_grid=param_test1,
                            scoring='neg_mean_squared_error',
                            cv=5)
    gsearch1.fit(X=train_x, y=train_y)

    # show the results (cv_results_ replaces the removed grid_scores_)
    results = gsearch1.cv_results_
    for params, mean_score, std_score in zip(results['params'],
                                             results['mean_test_score'],
                                             results['std_test_score']):
        print("mean: %.5f, std: %.5f, params: %r"
              % (mean_score, std_score, params))

    print("best_params_ and best_score_:")
    print(gsearch1.best_params_, gsearch1.best_score_)

    # change some param and return
    param_dict['reg_alpha'] = gsearch1.best_params_['reg_alpha']
    save_param_data(param_dict=param_dict, param_data_path=param_data_path)
def search_best_parameters(X, y):
    """Exhaustively grid-search XGBRegressor settings and print the best.

    The grid spans tree count, depth, learning rate, booster type, gamma,
    row/column subsampling and both regularisation terms, scored by R^2.
    Nothing is returned; the best parameter set is only printed.
    """
    column_fractions = [1, 0.8, 0.5]
    search_space = {
        'n_estimators': [80, 100, 120],
        'max_depth': [3, 4, 5],
        'learning_rate': [0.1, 0.2, 0.5],
        'booster': ['gbtree', 'gblinear', 'dart'],
        'gamma': [0, 0.2, 0.5],
        'subsample': [0.5, 0.8],
        'reg_alpha': [0.2, 0.3, 0.5],
        'reg_lambda': [0.5, 0.8, 1],
        'colsample_bytree': list(column_fractions),
        'colsample_bylevel': list(column_fractions),
        'colsample_bynode': list(column_fractions),
        'random_state': [77],
    }

    search = GridSearchCV(XGBRegressor(),
                          search_space,
                          n_jobs=-1,
                          verbose=True,
                          scoring='r2')
    search.fit(X, y)
    print(f"best parameters: {search.best_params_}")