def train(self):
    """Preprocess self.df, tune the boosting-round count with xgb.cv, then
    fit self.clf on the full training frame and print a training report.

    Side effects: mutates self.df, self.predictors and self.clf; prints
    progress plus accuracy/AUC (binary target) or (R)MSE (linear target).
    """
    print('#### preprocessing ####')
    self.df = self.preprocess(self.df)
    print('#### training ####')
    # Every column except the target and the id column is a predictor.
    self.predictors = [x for x in self.df.columns
                       if x not in [self.target_column, self.id_column]]
    xgb_param = self.clf.get_xgb_params()
    xgtrain = xgb.DMatrix(self.df[self.predictors],
                          label=self.df[self.target_column],
                          missing=np.nan)
    # The keyword controlling per-round logging changed across xgboost
    # releases (show_progress -> verbose_eval -> dropped).  An unknown
    # keyword raises TypeError, so catch exactly that instead of a bare
    # `except:`, which would also hide genuine training failures
    # (and even KeyboardInterrupt).
    try:
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=self.clf.get_params()['n_estimators'],
                          nfold=5, metrics=[self.scoring],
                          early_stopping_rounds=self.early_stopping_rounds,
                          show_progress=self.verbose)
    except TypeError:
        try:
            cvresult = xgb.cv(xgb_param, xgtrain,
                              num_boost_round=self.clf.get_params()['n_estimators'],
                              nfold=5, metrics=[self.scoring],
                              early_stopping_rounds=self.early_stopping_rounds,
                              verbose_eval=self.verbose)
        except TypeError:
            cvresult = xgb.cv(xgb_param, xgtrain,
                              num_boost_round=self.clf.get_params()['n_estimators'],
                              nfold=5, metrics=[self.scoring],
                              early_stopping_rounds=self.early_stopping_rounds)
    # The number of rows in the CV result is the early-stopped round count;
    # use it as the final n_estimators.
    self.clf.set_params(n_estimators=cvresult.shape[0])
    self.clf.fit(self.df[self.predictors], self.df[self.target_column],
                 eval_metric=self.scoring)
    #Predict training set:
    train_df_predictions = self.clf.predict(self.df[self.predictors])
    if self.target_type == 'binary':
        train_df_predprob = self.clf.predict_proba(self.df[self.predictors])[:, 1]
        print("Accuracy : %.4g" % metrics.accuracy_score(
            self.df[self.target_column].values, train_df_predictions))
        print("AUC Score (Train): %f" % metrics.roc_auc_score(
            self.df[self.target_column], train_df_predprob))
    elif self.target_type == 'linear':
        print("Mean squared error: %f" % metrics.mean_squared_error(
            self.df[self.target_column].values, train_df_predictions))
        print("Root mean squared error: %f" % np.sqrt(metrics.mean_squared_error(
            self.df[self.target_column].values, train_df_predictions)))
def test_cv_explicit_fold_indices_labels(self):
    """User-supplied (train, test) index pairs fix each fold's test labels."""
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'reg:linear'}
    n_rows, n_feats = 100, 3
    dm = xgb.DMatrix(data=np.random.randn(n_rows, n_feats),
                     label=np.arange(n_rows))
    folds = [
        # Train     Test
        ([1, 3],    [5, 8]),
        ([7, 9],    [23, 43, 11]),
    ]

    def log_test_labels(env):
        # Echo the test-fold labels so they can be captured below.
        print([fold.dtest.get_label() for fold in env.cvfolds])

    # Run cross validation and capture standard out to test callback result
    with captured_output() as (out, err):
        xgb.cv(params, dm, num_boost_round=1, folds=folds,
               callbacks=[log_test_labels], as_pandas=False)
    expected = ('[array([5., 8.], dtype=float32), '
                'array([23., 43., 11.], dtype=float32)]')
    assert out.getvalue().strip() == expected
def test_sklearn_nfolds_cv():
    """nfold, an explicit StratifiedKFold and stratified=True should agree."""
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.model_selection import StratifiedKFold
    digits = load_digits(3)
    dm = xgb.DMatrix(digits['data'], label=digits['target'])
    params = {'max_depth': 2, 'eta': 1, 'silent': 1,
              'objective': 'multi:softprob', 'num_class': 3}
    seed, nfolds = 2016, 5
    splitter = StratifiedKFold(n_splits=nfolds, shuffle=True,
                               random_state=seed)
    by_nfold = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    by_folds = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds,
                      folds=splitter, seed=seed)
    by_strat = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds,
                      stratified=True, seed=seed)
    # Same number of boosting rounds, and the same final test metric for
    # the two stratified variants.
    assert by_nfold.shape[0] == by_folds.shape[0] and by_folds.shape[0] == by_strat.shape[0]
    assert by_folds.iloc[-1, 0] == by_strat.iloc[-1, 0]
def test_cv(self):
    """xgb.cv returns a DataFrame with mean/std columns for train/test
    error, or a plain ndarray when as_pandas=False."""
    dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
    params = {'max_depth': 2, 'eta': 1, 'silent': 1,
              'objective': 'binary:logistic'}
    import pandas as pd
    expected_cols = pd.Index([u'test-error-mean', u'test-error-std',
                              u'train-error-mean', u'train-error-std'])

    def check_frame(result):
        assert isinstance(result, pd.DataFrame)
        assert result.columns.equals(expected_cols)

    check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10))
    # show progress log (result is the same as above)
    check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10,
                       show_progress=True))
    check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10,
                       show_progress=True, show_stdv=False))
    # return np.ndarray
    result = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
    assert isinstance(result, np.ndarray)
    assert result.shape == (10, 4)
def test_custom_objective(self):
    """A user-supplied objective/metric works in train() and in cv()."""
    params = {'max_depth': 2, 'eta': 1, 'silent': 1}
    watchlist = [(dtest, 'eval'), (dtrain, 'train')]
    rounds = 2

    def sigmoid_logloss_grad(preds, dtrain):
        # Gradient and hessian of the logistic loss on raw scores.
        y = dtrain.get_label()
        p = 1.0 / (1.0 + np.exp(-preds))
        return p - y, p * (1.0 - p)

    def hard_error(preds, dtrain):
        y = dtrain.get_label()
        return 'error', float(sum(y != (preds > 0.0))) / len(y)

    # test custom_objective in training
    booster = xgb.train(params, dtrain, rounds, watchlist,
                        sigmoid_logloss_grad, hard_error)
    assert isinstance(booster, xgb.core.Booster)
    preds = booster.predict(dtest)
    y = dtest.get_label()
    misclassified = sum(1 for p, t in zip(preds, y) if int(p > 0.5) != t)
    assert misclassified / float(len(preds)) < 0.1
    # test custom_objective in cross-validation
    xgb.cv(params, dtrain, rounds, nfold=5, seed=0,
           obj=sigmoid_logloss_grad, feval=hard_error)
def test_sklearn_nfolds_cv():
    """Check that xgb.cv agrees across three fold strategies: plain nfold,
    a sklearn StratifiedKFold passed via `folds`, and stratified=True.
    """
    digits = load_digits(3)
    X = digits['data']
    y = digits['target']
    dm = xgb.DMatrix(X, label=y)
    params = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'multi:softprob',
        'num_class': 3
    }
    seed = 2016
    nfolds = 5
    # NOTE(review): this is the legacy sklearn.cross_validation signature
    # (labels first, n_folds keyword); sklearn.model_selection takes
    # n_splits and no labels at construction — confirm the pinned version.
    skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)
    import pandas as pd
    cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
    cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds,
                 stratified=True, seed=seed)
    # All three strategies should run the same number of rounds, and the
    # two stratified variants should produce the same final test metric.
    assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
    assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
def xgb_model(all_file, num=200, debug=True):
    """Cross-validate an XGBoost classifier on the labelled slice of a CSV.

    all_file: path to a gb18030-encoded CSV containing 'Idx', 'ListingInfo',
        'target' and 'tag' columns plus feature columns.
    num: intended size of a feature-importance subset — currently unused
        because the importance-file filtering below is commented out.
    debug: when True, read only the first 500 rows for a quick run.
    """
    if debug:
        all_data = pd.read_csv(all_file, nrows=500, encoding='gb18030')
    else:
        all_data = pd.read_csv(all_file, encoding='gb18030')
    # tag == 1 marks the labelled (training) portion of the combined file.
    train_data = all_data[all_data['tag'] == 1]
    feature_data = train_data.drop(['Idx', 'ListingInfo', 'target', 'tag'], axis=1)
    # -1 doubles as the DMatrix `missing` marker below.
    feature_data.fillna(-1, inplace=True)
    labels = train_data['target']
    # feature_importance = pd.read_csv(features_importance_file)
    # feature_importance_columns = feature_importance['feature'].tolist()
    # feature_importance_columns = feature_importance_columns[:num]
    # final_train_data = feature_data[feature_importance_columns]
    final_train_data = feature_data
    print final_train_data.shape
    labels = train_data['target']
    dtrain = xgb.DMatrix(final_train_data, label=labels, missing=-1)
    # Earlier parameter sets kept for reference:
    # xgb_params = {'subsample':0.9, 'min_child_weight': 1, 'eval_metric': 'rmse', 'fit_const': 0.5,
    #               'nthread': 3, 'num_round': 700, 'gamma': 5, 'max_depth': 6, 'eta': 0.01,
    #               'colsample_bytree': 0.6, 'silent': 1, 'objective': 'binary:logistic'}
    # xgb_params = {'num_round': 2200, 'colsample_bytree': 0.4, 'silent': 1, 'eval_metric': 'auc', 'nthread': 3,
    #               'min_child_weight': 1, 'subsample': 0.66, 'eta': 0.006, 'fit_const': 0.6, 'objective': 'binary:logistic',
    #               'max_depth': 6, 'gamma': 0}
    xgb_params = {'num_round': 2400, 'colsample_bytree': 0.5, 'silent': 1, 'eval_metric': 'auc', 'nthread': 3,
                  'min_child_weight': 6, 'subsample': 0.8, 'eta': 0.016, 'fit_const': 0.4, 'objective': 'binary:logistic',
                  'max_depth': 10, 'gamma': 1}
    # 5-fold CV for AUC; result is only printed via show_progress, not returned.
    xgb.cv(xgb_params, dtrain, num_boost_round=2400, nfold=5, metrics={'auc'},
           show_progress=True)
    print 'finished'
def test_fpreproc(self):
    """The fpreproc hook can rewrite params (scale_pos_weight) per fold."""
    params = dict(max_depth=2, eta=1, silent=1, objective='binary:logistic')
    rounds = 2

    def rebalance(dtrain, dtest, cfg):
        # Weight positives by this fold's negative/positive ratio.
        labels = dtrain.get_label()
        negatives = np.sum(labels == 0)
        positives = np.sum(labels == 1)
        cfg['scale_pos_weight'] = float(negatives) / positives
        return dtrain, dtest, cfg

    xgb.cv(params, dtrain, rounds, nfold=5, metrics={'auc'}, seed=0,
           fpreproc=rebalance)
def cross_validation(): for k in sorted(train_y.keys()): if k.startswith('TripType_'): dtrain = xgboost.DMatrix(train_X, label=train_y) params = { 'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } print xgboost.cv(params, dtrain, num_round=2, nfold=5, metrics={'error'}, seed=0) break
def cross_validation():
    """3-fold CV on the pre-built offline DMatrix with a custom eval."""
    train_matrix = xgb.DMatrix('dataset_dmatrix/offline_0516_sim.train.buffer')
    booster_params = {
        'max_depth': 5,
        'eta': 0.08,
        'silent': 1,
        'objective': 'binary:logistic',
        'nthread': 8,
        'subsample': 0.5,
    }
    rounds = 1500
    print('running cross validation')
    xgb.cv(booster_params, train_matrix, rounds, nfold=3, show_progress=True,
           feval=evalerror, seed=0, show_stdv=False, maximize=True)
def cross_validate(args):
    """
    Cross-validate a multiclass booster on the file named by args.input.

    Usage: cv iq_training_data_svm.txt dummy --num_round=1000
    https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py
    https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py
    :param args: parsed CLI namespace; all of its attributes are forwarded
        as booster parameters via vars()
    :return: None
    """
    matrix = xgb.DMatrix(args.input)
    booster_params = vars(args)
    xgb.cv(booster_params, matrix, args.num_round, nfold=int(args.nfold),
           metrics={'mlogloss', 'merror'}, seed=0)
def model_param_select(self):
    """
    k-folds cross validation to select the best param

    Grid-searches the cartesian product of the configured parameter lists,
    scoring each combination by the last row of the xgb.cv history (first
    metric column), and returns (best_auc, best_param, best_iter_round).
    """
    params = {'max_depth': self.max_depth, 'eta': self.eta,
              'subsample': self.subsample, 'objective': self.objective,
              'silent': self.silent}
    best_auc, best_param, best_iter_round = 0, {}, 0
    param_grid = ParameterGrid(params)
    for i, param in enumerate(param_grid):
        cv_result = xgb.cv(param, self.train_matrix,
                           num_boost_round=self.num_boost_round,  # max iter round
                           nfold=self.nfold,
                           stratified=self.stratified,
                           metrics=self.metrics,  # metrics focus on
                           early_stopping_rounds=self.early_stopping_rounds)  # stop when metrics not get better
        # Fixed: `.ix` was removed from pandas; `.iloc` gives the same
        # positional (last row, first column) lookup on the default index.
        cur_auc = cv_result.iloc[len(cv_result) - 1, 0]
        cur_iter_round = len(cv_result)
        if cur_auc > best_auc:
            best_auc, best_param, best_iter_round = cur_auc, param, cur_iter_round
        print('Param select {}, auc: {}, iter_round: {}, params: {}, now best auc: {}'
              .format(i, cur_auc, cur_iter_round, param, best_auc))
    return best_auc, best_param, best_iter_round
def train_cv(self, dtrain, **args):
    """Run xgb.cv with arguments parsed for the 'cv' method, persist the
    history, and return the persisted results."""
    parsed = self.parse_args(args, method='cv')
    print(parsed)
    cv_history = xgb.cv(dtrain=dtrain, **parsed)
    return self.persit_result_cv(cv_history, args)
def modelfit(alg, data, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    '''
    a variation of:
    http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

    Tunes n_estimators of the sklearn-API xgboost estimator `alg` via
    xgb.cv (when useTrainCV), fits it on data['x_train'][predictors] vs
    data['y_train'], prints training accuracy/AUC, plots the top-20
    feature importances, and returns the fitted estimator.
    '''
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(data['x_train'][predictors], label=data['y_train'])
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        # Number of rows in the CV history == early-stopped round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the algorithm on the data
    alg.fit(data['x_train'][predictors], data['y_train'], eval_metric='auc')
    #Predict training set:
    dtrain_predictions = alg.predict(data['x_train'][predictors])
    dtrain_predprob = alg.predict_proba(data['x_train'][predictors])[:,1]
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(data['y_train'].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(data['y_train'], dtrain_predprob))
    # NOTE(review): alg.booster() is the pre-0.90 accessor; newer xgboost
    # uses alg.get_booster() — confirm against the pinned version.
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp[0:20].plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return alg
def xgbTuning(self, pX, change = 3):
    """Random-search xgboost hyperparameters by 3-fold CV AUC, then train a
    final early-stopped booster on a holdout split.

    pX: feature frame aligned with self.y.
    change: number of random parameter draws to try.
    Returns (best holdout AUC as float, booster retrained for the
    early-stopped round count).
    """
    w = self.getWeight(self.y)
    dm = xgb.DMatrix(pX, self.y, weight=w)
    best_auc = 0
    n = pX.shape[0]  # NOTE(review): unused — row count kept for reference?
    best_params = None
    for i in range(change):
        # Three uniform draws drive eta, max_depth (3..9) and alpha.
        randp = np.random.random_sample(3)
        param = {'bst:eta': randp[0], 'max_depth': int(3+6*randp[1]),
                 'nthread': 4, 'silent': 1, 'alpha': randp[2],
                 'eval_metric': 'auc', 'objective': 'binary:logistic'}
        m = xgb.cv(param, dm, metrics='auc', nfold=3,
                   num_boost_round = 50, early_stopping_rounds=5)
        auc = m['test-auc-mean'].max()
        if auc > best_auc :
            print 'xgb:' + str(auc)
            best_auc = auc
            best_params = param
    # Retrain on a fresh split to pick the round count by early stopping.
    Xtrain, Xtest, ytrain, ytest = train_test_split(pX, self.y, test_size=.33)
    trainw = self.getWeight(ytrain)
    testw = self.getWeight(ytest)
    dtrain = xgb.DMatrix(Xtrain, label = ytrain, feature_names=Xtrain.columns, weight = trainw)
    dtest = xgb.DMatrix(Xtest, label = ytest, feature_names=Xtest.columns, weight = testw)
    evallist = [(dtrain, 'train'), (dtest, 'eval')]
    booster = xgb.train(best_params, dtrain, evals=evallist,
                        num_boost_round=100, early_stopping_rounds=10)
    # Booster attributes are stored as strings; convert before returning.
    rounds = booster.attr("best_iteration")
    best_auc = booster.attr("best_score")
    return float(best_auc), xgb.train(best_params, dtrain, num_boost_round=int(rounds))
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune n_estimators via xgb.cv (optionally), fit `alg` on
    dtrain[predictors] against dtrain['target'], print a training report,
    and write normalized feature importances to ips.csv.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        # CV history length == early-stopped round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the algorithm on the DataFrame
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='auc')
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['target'], dtrain_predprob)
    importances = alg.booster().get_fscore()
    importances = sorted(importances.items(), key=operator.itemgetter(1), reverse=True)
    df = pd.DataFrame(importances, columns=['feature', 'fscore'])
    # Normalize scores to fractions of the total before persisting.
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv("ips.csv", index=False)
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    print feat_imp
def model_2(train, labels, test):
    """CV an rmse regressor to pick the round count, retrain on all data,
    and return exp() of the test predictions (labels are in log space)."""
    train_matrix = xgb.DMatrix(train, label=labels)
    test_matrix = xgb.DMatrix(test)
    xgb_params = {
        "objective": "reg:linear",
        "eta": 0.1,
        "subsample": 0.7,
        "silent": 1,
        "max_depth": 6,
        "eval_metric": "rmse",
        "min_child_weight": 5,
        "seed": 22424,
    }
    history = xgb.cv(xgb_params, train_matrix, num_boost_round=500, nfold=5,
                     seed=2017, stratified=False, early_stopping_rounds=25,
                     verbose_eval=10, show_stdv=True)
    best_nrounds = history.shape[0] - 1
    cv_mean = history.iloc[-1, 0]
    cv_std = history.iloc[-1, 1]
    print('')
    print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
    booster = xgb.train(xgb_params, train_matrix, best_nrounds)
    return np.exp(booster.predict(test_matrix))
def cv(self, X, y):
    """3-fold cross-validate the configured booster on (X, y) and return
    the xgb.cv history."""
    matrix = self.build_matrix(X, y)
    # Most parameters map 1:1 from same-named attributes; copy them in bulk.
    passthrough = ('num_round', 'ntree_limit', 'nthread', 'booster', 'eta',
                   'gamma', 'max_depth', 'min_child_weight', 'subsample',
                   'colsample_bytree', 'max_delta_step', 'l', 'alpha',
                   'lambda_bias', 'objective', 'eval_metric', 'seed',
                   'num_class')
    param = {name: getattr(self, name) for name in passthrough}
    # These two need coercion to the ints xgboost expects.
    param['silent'] = 1 if self.silent else 0
    param['use_buffer'] = int(self.use_buffer)
    return xgb.cv(param, matrix, self.num_round, 3)
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    """Train an xgboost regressor and predict on the test set.

    When use_cv, the round count comes from a 5-fold xgb.cv run; otherwise
    a fixed 10 rounds is used.  With use_sklean the sklearn wrapper is
    fitted instead of the native API.  Returns (model, predictions); the
    native path wraps the booster in XGBoostModel and also emits a
    feature-importance plot.
    """
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))
    evallist = [(test_data, 'eval'), (train_data, 'train')]
    #if xgb_params == None:
    #    xgb_params = get_default_xgboost_params()
    if not use_cv:
        num_rounds = 10
    else:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100,
                          nfold=5, metrics={'rmse'}, show_progress=True)
        print cvresult
        # CV history length == the (possibly early-stopped) round count.
        num_rounds = len(cvresult)
    gbdt = None
    if(use_sklean):
        #gbdt = xgboost.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective='reg:linear', nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None)
        xgb_params['n_estimators'] = num_rounds
        # NOTE(review): this passes the params dict as the FIRST positional
        # argument (max_depth) rather than unpacking it with **xgb_params —
        # looks like a bug; confirm intended behavior before relying on it.
        gbdt = xgboost.XGBRegressor(xgb_params)
        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)
        return gbdt, y_pred
    else:
        #gbdt = xgb.train( xgb_params, train_data, num_rounds, evallist, verbose_eval = True, early_stopping_rounds=5)
        gbdt = xgb.train( xgb_params, train_data, num_rounds, evallist, verbose_eval = True)
        ceate_feature_map_for_feature_importance(features)
        show_feature_importance(gbdt, feature_names=features)
        y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
        return XGBoostModel(gbdt), y_pred
def modelfit(alg, train_data, train_label, cv_folds=5, early_stopping_rounds=1):
    """Tune n_estimators via xgb.cv, fit `alg`, pickle it to disk, then
    print accuracy/AUC/recall and a confusion matrix for each probability
    cutoff from 0.00 to 0.40.
    """
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(train_data, label=train_label)
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics=['auc'],
                      early_stopping_rounds=early_stopping_rounds,
                      show_progress=True)
    alg.set_params(n_estimators=cvresult.shape[0])
    # Goal of CV is to tune the number of rounds, which is set here
    # Note: can change to a different day to see what happens
    start = time.time()
    alg.fit(train_data, train_label, eval_metric='auc')
    print "Time to fit: %s" % (time.time()-start)
    pickle.dump(alg, open("/home/jche/Desktop/xgboost.p", "w+"))  # Save model
    start = time.time()
    dtrain_predprob = alg.predict_proba(train_data)[:,1]
    print "Time to predict: %s" % (time.time() - start)
    for cutoff in range(0, 41):
        cut = cutoff/float(100)  # Cutoff in decimal form
        dtrain_predictions = dtrain_predprob > cut  # If y values are greater than the cutoff
        # Print model report:
        print "\nModel Report for cutoff %s" % cut
        print "Accuracy : %.4g" % metrics.accuracy_score(train_label, dtrain_predictions)
        print "AUC Score (Train): %f" % metrics.roc_auc_score(train_label, dtrain_predprob)
        print "Recall is: %s" % metrics.recall_score(train_label, dtrain_predictions)
        print metrics.confusion_matrix(train_label, dtrain_predictions)
def fit(self, X, y):
    """Cross-validate the configured parameter map on (X, y), switching to
    multi:softprob automatically for more than two classes.

    Stores the label encoder on self._le and the xgb.cv history on
    self.results.
    """
    self.classes_ = np.unique(y)
    self.n_classes_ = len(self.classes_)
    self._le = XGBLabelEncoder().fit(y)
    training_labels = self._le.transform(y)
    xgdmat = xgb.DMatrix(X, label=training_labels)
    if self.n_classes_ > 2:
        # Multiclass needs num_class set and a softprob objective.
        self.param_map.update({'num_class': self.n_classes_})
        self.param_map.update({'objective': 'multi:softprob'})
    # NOTE(review): all xgb.cv arguments are passed positionally, so this
    # depends on the exact cv() signature of the pinned xgboost version
    # (params, dtrain, num_boost_round, nfold, stratified, folds, metrics,
    # obj, feval, maximize, early_stopping_rounds, fpreproc, as_pandas,
    # verbose_eval, show_stdv, seed, callbacks) — verify before upgrading.
    self.results = xgb.cv(self.param_map, xgdmat, self.num_boost_round,
                          self.nfold, self.stratified, self.folds,
                          self.metrics, self.obj, self.feval, self.maximize,
                          self.early_stopping_rounds, self.fpreproc,
                          self.as_pandas, self.verbose_eval, self.show_stdv,
                          self.seed, self.callbacks)
def modelfit(alg, dtrain, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune n_estimators via xgb.cv (optionally), fit `alg` on
    dtrain[predictors] vs dtrain[target], print training accuracy/AUC,
    and plot feature importances.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        # CV history length == early-stopped round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    #Print model report:
    print ("\nModel Report")
    print ("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print ("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    # NOTE(review): alg.booster() is the pre-0.90 accessor; newer xgboost
    # uses alg.get_booster() — confirm against the pinned version.
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
def do_compute(x):
    """Evaluate row x of the global `grid` of hyperparameters with an
    early-stopped xgb.cv run on the global `dtrain`.

    Writes rmse_cv_mean/rmse_cv_std/nround back into `grid`, checkpoints
    the whole grid to a per-process CSV, and returns the mean CV RMSE.
    """
    row = grid.iloc[x,:]
    eta = row['eta']
    min_child_weight = row['min_child_weight']
    colsample_bytree = row['colsample_bytree']
    max_depth = row['max_depth']
    subsample = row['subsample']
    _lambda = row['lambda']
    nround = row['nround']
    ####
    xgb_pars = {'min_child_weight': min_child_weight, 'eta': eta,
                'colsample_bytree': colsample_bytree, 'max_depth': int(max_depth),
                'subsample': subsample, 'lambda': _lambda, 'nthread': -1,
                'booster': 'gbtree', 'silent': 1, 'eval_metric': 'rmse',
                'objective': 'reg:linear'}
    #print(xgb_pars)
    # Huge round cap; early stopping (50 rounds) decides the real length.
    model = xgb.cv(xgb_pars, dtrain, 100000, nfold = 4,
                   early_stopping_rounds=50, maximize=False, verbose_eval=10)
    nround = model.shape[0]
    # Last row of the CV history = scores at the final (best) round.
    rmse_cv_mean = model['test-rmse-mean'][model.shape[0]-1]
    rmse_cv_std = model['test-rmse-std'][model.shape[0]-1]
    # calculate the square of the value of x
    grid.loc[x,'rmse_cv_mean'] = rmse_cv_mean
    grid.loc[x,'rmse_cv_std'] = rmse_cv_std
    grid.loc[x,'nround'] = nround
    # Checkpoint so parallel workers don't lose finished rows on a crash.
    grid.to_csv('base_grid_xgb_40perc__'+str(os.getpid())+'.csv', index=False)
    return rmse_cv_mean
def xgbCV(dmatrix, nfolds, eta_list, gamma_list, num_rounds = 500):
    """Grid-search eta x gamma with early-stopped xgb.cv and return a
    DataFrame of the best iteration and its CV error for each pair."""
    params = {'eta': '', 'gamma': '', 'objective': 'binary:logistic',
              'verbose': 3, 'max_depth': 20, 'subsample': .75,
              'colsample_bytree': .75}
    records = {'eta': [], 'gamma': [], 'num_iter': [],
               'mean_cv_error': [], 'std_cv_error': []}
    for eta in eta_list:
        for gamma in gamma_list:
            params['eta'] = eta
            params['gamma'] = gamma
            records['eta'].append(eta)
            records['gamma'].append(gamma)
            print('Training the booster with a learning rate of', eta, "and gamma of ", gamma)
            history = xgb.cv(params, dmatrix, num_rounds, nfolds,
                             early_stopping_rounds = 2)
            print('Stopped after', len(history.index), "rounds.")
            # Row with the smallest test error == best iteration.
            best = history.nsmallest(1, 'test-error-mean')
            records['num_iter'].append(best.index[0])
            records['mean_cv_error'].append(best['test-error-mean'])
            records['std_cv_error'].append(best['test-error-std'])
    return pd.DataFrame.from_dict(records)
def modelfit(alg, train, target, test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune n_estimators via xgb.cv (optionally), fit `alg` on the full
    training set, print a report, plot feature importances, and return the
    positive-class probabilities predicted for `test`.
    """
    if useTrainCV:
        xgboost_params = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train.values, label=target.values)
        xgtest = xgb.DMatrix(test.values)
        watchlist = [(xgtrain, 'train')]  # Specify validations set to watch performance
        cvresult = xgb.cv(xgboost_params, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)  #metrics='auc',show_progress=False
        # CV history length == early-stopped round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(train, target, eval_metric='auc')
    # Predict training set:
    train_preds = alg.predict(train)
    train_predprob = alg.predict_proba(train)[:,1]
    # Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(target.values, train_preds)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(target, train_predprob)
    # Make a prediction:
    print('Predicting......')
    test_predprob = alg.predict_proba(test)[:,1]
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    #plt.show()
    return test_predprob
def testInstance(population, i, dtrain):
    """Cross-validate individual i of `population` (a table of per-member
    hyperparameters) with 5-fold xgb.cv using the custom RMSPE objective
    and metric, and return the mean test RMSPE of the final round.
    """
    params = {"objective": "reg:linear",
              "eta": population.eta[i],
              "max_depth": population.depth[i],
              "subsample": population.subsample[i],
              "colsample_bytree": population.colsample_bytree[i],
              "num_boost_round": int(population.nRound[i]),
              "lambda": population.lamda[i],
              "alpha": population.alpha[i],
              "gamma": population.gamma[i],
              "min_child_weight": population.min_child_weight[i],
              "silent": 1,
              #"seed": 1301
              }
    history = xgb.cv(
        params, dtrain,
        #early_stopping_rounds=30,  #no early stopping in Python yet!!!
        num_boost_round=int(population.nRound[i]),
        nfold=5,  # number of CV folds
        #nthread=12,  # number of CPU threads
        show_progress=False,
        feval=rmspe_xg,  # custom evaluation metric
        obj=RMSPE_objective
        #maximize=0  # the lower the evaluation score the better
    )
    # Fixed: Series.iget() was removed from pandas; .iloc[-1] is the
    # supported positional equivalent (last boosting round's mean score).
    return history["test-rmspe-mean"].iloc[-1]
def modelfit(alg, dtrain, predictors, dtest=None, dscore=None, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune n_estimators via xgb.cv (optionally), fit `alg`, print logloss
    reports for train (and test, if given), and dump raw probabilities for
    an optional scoring frame to XGBoost_pred_raw.csv.

    NOTE(review): `target` is read from enclosing/global scope here (it is
    not a parameter) while the fit/report lines hard-code 'target' — confirm
    they refer to the same column.
    """
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics=['logloss'],
                          early_stopping_rounds=early_stopping_rounds,
                          show_progress=False)
        # CV history length == early-stopped round count.
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='logloss')
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    if isinstance(dtest, pd.DataFrame):
        dtest_predprob = alg.predict_proba(dtest[predictors])[:,1]
    if isinstance(dscore, pd.DataFrame):
        dscore_predprob = alg.predict_proba(dscore[predictors])[:,1]
        np.savetxt('XGBoost_pred_raw.csv', dscore_predprob, delimiter=",")
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions)
    print "Metric Score (Train): %f" % metrics.log_loss(dtrain['target'], dtrain_predprob)
    if isinstance(dtest, pd.DataFrame):
        print "Metric Score (Test): %f" % metrics.log_loss(dtest['target'], dtest_predprob)
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
def train_crossV(self, train_x, train_y, nfold=3, early_stopping_rounds=300, metrics=['auc']):
    """Run early-stopped k-fold cross-validation on (train_x, train_y)
    with the instance's booster configuration and return the xgb.cv
    history DataFrame.

    -9999 in train_x is treated as the missing-value marker.
    """
    xgmat_train = xgb.DMatrix(train_x, label=train_y, missing=-9999)
    params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'silent': self.silent,
        'eta': self.eta,
        'gamma': self.gamma,
        'max_depth': self.max_depth,
        # Fixed typo: key was 'min_chile_weitght', which xgboost silently
        # ignored, so the configured value never took effect.
        'min_child_weight': self.min_chile_weight,
        'subsample': self.subsample,
        'lambda': self.lambda_,
        'scale_pos_weight': self.scale_pos_weight,
        'colsample_bytree': self.colsample_bytree,
        # Fixed typo: key was 'eval_metirc' (also silently ignored); the
        # `metrics` argument to xgb.cv below still takes precedence.
        'eval_metric': 'auc',
        'seed': 2014,
        'nthread': self.threads,
    }
    num_round = self.num_boost_round
    cv_result = xgb.cv(params, xgmat_train, num_boost_round=num_round,
                       early_stopping_rounds=early_stopping_rounds,
                       nfold=nfold, seed=1024, show_progress=True,
                       metrics=metrics)
    return cv_result
def test_cv(self):
    """With as_pandas=False, xgb.cv yields a (rounds x 4) ndarray."""
    data = xgb.DMatrix(dpath + 'agaricus.txt.train')
    params = {'max_depth': 2, 'eta': 1, 'silent': 1,
              'objective': 'binary:logistic'}
    # return np.ndarray
    history = xgb.cv(params, data, num_boost_round=10, nfold=10,
                     as_pandas=False)
    assert isinstance(history, np.ndarray)
    assert history.shape == (10, 4)
def test_cv_no_shuffle(self):
    """shuffle=False with as_pandas=False yields a 4-entry dict result."""
    data = xgb.DMatrix(dpath + 'agaricus.txt.train')
    params = {'max_depth': 2, 'eta': 1, 'silent': 1,
              'objective': 'binary:logistic'}
    history = xgb.cv(params, data, num_boost_round=10, shuffle=False,
                     nfold=10, as_pandas=False)
    assert isinstance(history, dict)
    assert len(history) == 4
import numpy as np import xgboost as xgb ### load data in do training dtrain = xgb.DMatrix('data/aga.train') param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} num_round = 2 print('running cross validation') # do cross validation, this will print result out as # [iteration] metric_name:mean_value+std_value # std_value is standard deviation of the metric xgb.cv(param, dtrain, num_round, nfold=5, metrics={'error'}, seed=0, callbacks=[xgb.callback.print_evaluation(show_stdv=True)]) print('running cross validation, disable standard deviation display') # do cross validation, this will print result out as # [iteration] metric_name:mean_value res = xgb.cv(param, dtrain, num_boost_round=10, nfold=5, metrics={'error'}, seed=0, callbacks=[ xgb.callback.print_evaluation(show_stdv=False),
def test_cv_as_pandas(self):
    """Exercise xgb.cv's pandas output: default columns, verbose logging,
    eval_metric in params vs the `metrics` argument (which overrides it),
    early stopping, and params given as a list of pairs.
    """
    dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic'}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10)
    assert isinstance(cv, pd.DataFrame)
    exp = pd.Index([u'test-error-mean', u'test-error-std',
                    u'train-error-mean', u'train-error-std'])
    assert len(cv.columns.intersection(exp)) == 4
    # show progress log (result is the same as above)
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, verbose_eval=True)
    assert isinstance(cv, pd.DataFrame)
    exp = pd.Index([u'test-error-mean', u'test-error-std',
                    u'train-error-mean', u'train-error-std'])
    assert len(cv.columns.intersection(exp)) == 4
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, verbose_eval=True,
                show_stdv=False)
    assert isinstance(cv, pd.DataFrame)
    exp = pd.Index([u'test-error-mean', u'test-error-std',
                    u'train-error-mean', u'train-error-std'])
    assert len(cv.columns.intersection(exp)) == 4
    # eval_metric as a plain string in params drives the column names.
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic', 'eval_metric': 'auc'}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
    assert 'eval_metric' in params
    assert 'auc' in cv.columns[0]
    # eval_metric as a list works the same way.
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic', 'eval_metric': ['auc']}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True)
    assert 'eval_metric' in params
    assert 'auc' in cv.columns[0]
    # Early stopping should terminate before the full 10 rounds.
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic', 'eval_metric': ['auc']}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                early_stopping_rounds=1)
    assert 'eval_metric' in params
    assert 'auc' in cv.columns[0]
    assert cv.shape[0] < 10
    # The `metrics` argument selects the metric when params has none...
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic'}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                metrics='auc')
    assert 'auc' in cv.columns[0]
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic'}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                metrics=['auc'])
    assert 'auc' in cv.columns[0]
    # ...and overrides eval_metric when both are supplied.
    params = {'max_depth': 2, 'eta': 1, 'verbosity': 0,
              'objective': 'binary:logistic', 'eval_metric': ['auc']}
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                metrics='error')
    assert 'eval_metric' in params
    assert 'auc' not in cv.columns[0]
    assert 'error' in cv.columns[0]
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                metrics=['error'])
    assert 'eval_metric' in params
    assert 'auc' not in cv.columns[0]
    assert 'error' in cv.columns[0]
    # Params may also be passed as a list of (key, value) pairs.
    params = list(params.items())
    cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=True,
                metrics=['error'])
    assert isinstance(params, list)
    assert 'auc' not in cv.columns[0]
    assert 'error' in cv.columns[0]
'subsample': 0.8, 'learning_rate': 0.06, 'colsample_bytree': 0.8, 'eta': 0.02, 'objective': 'binary:logistic', 'sample_type': 'uniform', 'normalize': 'tree', 'rate_drop': 0.15, # 0.1 'skip_drop': 0.85, # 0.9 'nthread': -1, } dtrain = xgb.DMatrix(X_train, label=y_train) xgb.cv(xgb_params, dtrain, num_boost_round=10000, nfold=5, verbose_eval=5, early_stopping_rounds=100) model = xgb.train(xgb_params, dtrain=dtrain, num_boost_round=300, evals=[(dtrain, 'train')], verbose_eval=5) dtest = xgb.DMatrix(X_test) preds = model.predict(dtest) print dtest StackingSubmission = pd.DataFrame({'score': preds})
def _train_model(self):
    """Tune (or load cached) hyperparameters, then train one xgboost model
    per label column in the training data.

    Returns:
        dict with keys 'models' (label -> trained Booster) and
        'model_props' (output of get_model_props).
    """
    self.log.info("Training models...")
    # Optional recency weighting: scale each observation's weight linearly
    # by its dispense date between the oldest (0) and newest (1) date.
    if self.date_obs_weights:
        date_min = min(self.data['train']['data']['disp_date'])
        date_max = max(self.data['train']['data']['disp_date'])
        span = date_max - date_min
        obs_weights = [(i - date_min) / span
                       for i in self.data['train']['data']['disp_date']]
    else:
        obs_weights = [1] * len(self.data['train']['data'].index)

    # Tune Hyperparameters ----------------------------------------------------------------
    if not os.path.exists(f"{self.code_dir}/models/tune_logs.json"):
        self.log.info(
            "No hyperparameter tuning logs found! Finding optimal hyperparameters... (this will take a while)"
        )
        # Fix: this accumulator was commented out, causing a NameError on
        # the first `log_params[name] = ...` assignment below.
        log_params = {}
        # For each label in the training dataset...
        for name, values in self.data['train']['labels'].iteritems():
            # Generate dmatrix for xgb model
            train_dmatrix = xgboost.DMatrix(self.data['train']['data'],
                                            label=values,
                                            weight=obs_weights)
            # Optimize parameters and save to log
            log_params[name] = bayes_opt_xgb(
                dmatrix=train_dmatrix,
                log=self.log,
                opt_fun=xgb_cv_fun,
                opt_rounds=self.opt_rounds,
                init_rounds=self.init_rounds,
                params_ranges=self.params_ranges,
                max_estimators=self.max_estimators)
        # Save log to file (context manager so the handle is closed).
        with open(f"{self.code_dir}/models/tune_logs.json", "w") as f:
            json.dump(log_params, f, indent=4)
    else:
        # If a file already exists, load it
        self.log.info("Hyperparameter tuning logs found, loading...")
        with open(f"{self.code_dir}/models/tune_logs.json", "r") as f:
            log_params = json.load(f)

    # Train models -------------------------------------------------------------------------
    fits = {}
    for name, values in self.data['train']['labels'].iteritems():
        self.log.info(f"Training {name}")
        train_dmatrix = xgboost.DMatrix(
            self.data['train']['data'],
            label=values,
            weight=obs_weights,
            feature_names=self.data['train']['data'].columns)
        # Get best hyperparameters for label from log (max key = latest/best entry).
        log_best = log_params[name][max(log_params[name].keys())]
        best_params = log_best['params']
        fits[name] = xgboost.train(
            best_params,
            train_dmatrix,
            num_boost_round=log_best['fit_props']['n_estimators'])
        if len(self.data['test']['data'].index) > 0:
            # Print some quick "sanity check" results in test data
            test_dmatrix = xgboost.DMatrix(
                self.data['test']['data'],
                label=self.data['test']['labels'][name])
            self.log.info("Mean pred: " +
                          str(np.mean(fits[name].predict(test_dmatrix))) +
                          " (" +
                          str(np.mean(self.data['test']['labels'][name])) +
                          ")"
                          " Test AUC: " + str(
                              metrics.roc_auc_score(
                                  self.data['test']['labels'][name],
                                  fits[name].predict(test_dmatrix))))
        else:
            # If no test data, check results using CV
            cv_results = xgboost.cv(
                best_params,
                train_dmatrix,
                metrics='auc',
                num_boost_round=log_best['fit_props']['n_estimators'])
            self.log.info(cv_results.iloc[-1])

    model_props = get_model_props(fits,
                                  data=self.data,
                                  out_weights=self.out_weights,
                                  instrument_trans=self.instrument_trans,
                                  log=self.log)

    # Optionally refit on train+test combined (same params, same rounds).
    if self.refit_full_model:
        for name, values in self.data['train']['labels'].iteritems():
            self.log.info(f"Training {name} on training and test data")
            train_dmatrix = xgboost.DMatrix(
                self.data['train']['data'],
                label=values,
                weight=obs_weights,
                feature_names=self.data['train']['data'].columns)
            # Get best hyperparameters for label from log
            log_best = log_params[name][max(log_params[name].keys())]
            best_params = log_best['params']
            fits[name] = xgboost.train(
                best_params,
                train_dmatrix,
                num_boost_round=log_best['fit_props']['n_estimators'])

    model_dict = {'models': fits, 'model_props': model_props}
    return model_dict
def train(self):
    """Grid-search max_depth/subsample/colsample_bytree via xgb.cv on
    mlogloss, then train and save a final multi-class model.
    """
    params = {
        'eta': self.eta,
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'nthread': self.threads,
        'num_class': len(self.incl)
    }
    # ANSI green banner for readability in terminal logs.
    print('\033[92m' + 'Using the following parameters:' + '\033[0m')
    print('eta: {}'.format(self.eta))
    print('num_rounds: {}'.format(self.num_rounds))
    print('early_stopping_rounds: {}'.format(self.early_stop))
    # Cartesian product of the three tuned hyperparameters.
    grid_params = [(max_depth, subsample, colsample_bytree)
                   for max_depth in self.max_depth
                   for subsample in self.subsample
                   for colsample_bytree in self.colsample_bytree]
    print('\033[92m' +
          'Cross-validating with {} folds:'.format(self.nfold) + '\033[0m')
    min_mlogloss = float("Inf")
    best_params = None
    for max_depth, subsample, colsample_bytree in grid_params:
        print("CV with max_depth={}, subsample={}, colsample_bytree={}".
              format(max_depth, subsample, colsample_bytree))
        params['max_depth'] = max_depth
        params['subsample'] = subsample
        params['colsample_bytree'] = colsample_bytree
        cv_results = xgb.cv(params,
                            self.dtrain,
                            num_boost_round=self.num_rounds,
                            seed=self.rnd_seed,
                            nfold=self.nfold,
                            metrics={'mlogloss'},
                            early_stopping_rounds=self.early_stop)
        # Update best mlogloss
        mean_mlogloss = cv_results['test-mlogloss-mean'].min()
        # NOTE(review): Series.argmin is deprecated/positional in newer
        # pandas — confirm the installed version's behavior.
        boost_rounds = cv_results['test-mlogloss-mean'].argmin()
        print("\tmlogloss {} for {} rounds".format(mean_mlogloss,
                                                   boost_rounds))
        if mean_mlogloss < min_mlogloss:
            min_mlogloss = mean_mlogloss
            best_params = (max_depth, subsample, colsample_bytree,
                           boost_rounds)
    # NOTE(review): if grid_params is empty, best_params stays None and
    # the next line raises TypeError.
    print("Best params: {}, {}, {}, mlogloss: {}".format(
        best_params[0], best_params[1], best_params[2], min_mlogloss))
    print('\033[92m' + 'Training final model' + '\033[0m')
    params['max_depth'] = best_params[0]
    params['subsample'] = best_params[1]
    params['colsample_bytree'] = best_params[2]
    # Final fit uses early stopping against the held-out dtest watchlist.
    self.model = xgb.train(params,
                           self.dtrain,
                           num_boost_round=self.num_rounds,
                           evals=[(self.dtest, "Test")],
                           early_stopping_rounds=self.early_stop)
    self.boost_rounds = self.model.best_iteration
    self.model.save_model(self.out + 'xgb_repeats.model')
best_params = None
# Grid-search max_depth and min_child_weight by 5-fold CV on MAE.
# NOTE(review): min_mae and gridsearch_params are defined earlier,
# outside this view.
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
        max_depth, min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain=dt,
        num_boost_round=300,
        seed=2,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Track the best test-MAE and the round where it occurred.
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1],
                                            min_mae))
# Best (11,4)
reg_alpha=0.005,
    learning_rate=0.21,
    max_depth=1,
    alpha=10,
    n_estimators=1000,
    min_child_weight=10,
    gamma=0.2,
    nthread=4,
    scale_pos_weight=1)
# CV with early stopping to pick the effective number of estimators.
datamatrix = xgb.DMatrix(data=X, label=Y)
xgpara = xg_reg.get_xgb_params()
cv_results = xgb.cv(
    dtrain=datamatrix,
    params=xgpara,
    nfold=6,
    num_boost_round=xg_reg.get_params()['n_estimators'],
    early_stopping_rounds=30,
    metrics="rmse",
    as_pandas=True,
    seed=123)
# Rows surviving early stopping == chosen estimator count.
xg_reg.set_params(n_estimators=cv_results.shape[0])
# Scale features+target together into [0, 1200].
# NOTE(review): ta1 below is transformed nowhere in this view — confirm it
# was scaled earlier, otherwise the two fits use different scales.
sc = MinMaxScaler(feature_range=(0, 1200))
ta = sc.fit_transform(ta[:, :])
xg_reg.fit(ta[:, :-1], ta[:, -1])
#Getting the Predictions For Non-Linear Residue
xgpred = xg_reg.predict(ta[:, :-1])
# Re-fit on the second series; this overwrites the first fit.
xg_reg.fit(ta1[:, :-1], ta1[:, -1])
xgpred1 = xg_reg.predict(ta1[:, :-1])
# Column 1 of the ARIMA output holds the linear component predictions.
arimaPreds1 = arima_predictions[start:end, 1]
arimaPreds2 = arimaPreds1.reshape((arimaPreds1.shape[0], 1))
# Sweep the learning rate and keep the eta with the lowest CV test-MAE.
min_mae = float("Inf")
best_params = None
params['silent'] = 1
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    cv_results = xgb.cv(params,
                        dtrain,
                        num_boost_round=num_boost_round,
                        seed=42,
                        nfold=5,
                        metrics=['mae'],
                        early_stopping_rounds=10)
    # Best score for this eta and the round at which it was reached.
    test_mae = cv_results['test-mae-mean']
    mean_mae, boost_rounds = test_mae.min(), test_mae.argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae, best_params = mean_mae, eta
print("Best params: {}, MAE: {}".format(best_params, min_mae))
print('================\n')
'eta': .01,
    'colsample_bytree': .8,
    'subsample': .8,
    'seed': 0,
    'nthread': 16,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': 3,
    'silent': 0
}
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)
# CV to find the round count with the lowest test mlogloss.
bst = xgb.cv(params, dtrain, 10000, NFOLDS, early_stopping_rounds=50,
             verbose_eval=25)
# NOTE(review): argmin returns a 0-based index, so training with
# `best_rounds` runs one round fewer than the CV optimum — confirm intent.
best_rounds = np.argmin(bst['test-mlogloss-mean'])
bst = xgb.train(params, dtrain, best_rounds)
bst.save_model('xgboostreduced2.model')
preds = bst.predict(dtest)
preds = pd.DataFrame(preds)
# NOTE(review): assumes class indices 0/1/2 map to high/medium/low —
# verify against the label encoding used at training time.
cols = ['high', 'medium', 'low']
preds.columns = cols
preds['listing_id'] = test.listing_id.values
preds.to_csv('xgboostreduced2.csv', index=None)
best_params = None
t = time.time()
#search for max depth and min_child_weight
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
        max_depth, min_child_weight))
    # Fresh default params each iteration so settings don't leak between
    # grid points.
    params = initializeDefaultParamsGPU()
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    cv_result = xgb.cv(params,
                       train_dmat,
                       num_boost_round=num_boost_round_default,
                       seed=seed,
                       nfold=nfold,
                       metrics=metrics,
                       early_stopping_rounds=early_stopping_round_default)
    mean_mlogloss = cv_result['test-mlogloss-mean'].min()
    boost_rounds = cv_result['test-mlogloss-mean'].argmin()
    # Fix: the value reported here is mlogloss, not MAE — the old message
    # mislabeled the metric.
    print("\tmlogloss {} for {} rounds".format(mean_mlogloss, boost_rounds))
    if mean_mlogloss < min_mlogloss:
        min_mlogloss = mean_mlogloss
        best_params = (max_depth, min_child_weight)
print("Best max_depth and min_child_weight: {}, mlogloss: {}".format(
    best_params, min_mlogloss))
print("Time: {}".format(time.time() - t))
max_depth = best_params[0]
'normalize': 'tree',
    'rate_drop': 0.1,
    'skip_drop': 0.9,
    'seed': 87,
    'nthread': 12,
    'slice': 0
}
watchlist = [(dtrain, 'train')]
# Python 2 print statement.
print 'cv'
# Find the best nround via cv.
cv_log = xgb.cv(params,
                dtrain,
                num_boost_round=1000,
                nfold=5,
                metrics='rmse',
                early_stopping_rounds=50,
                seed=1024)
bst_rmse = cv_log['test-rmse-mean'].min()
# Re-index the CV log by its test-rmse so the best round number can be
# looked up by the minimum rmse value.
cv_log['nb'] = cv_log.index
cv_log.index = cv_log['test-rmse-mean']
bst_nb = cv_log.nb.to_dict()[bst_rmse]
# watchlist = [(dtrain,'train')]
# Train slightly past the CV optimum (+50 rounds of margin).
model = xgb.train(params,
                  dtrain,
                  num_boost_round=bst_nb + 50,
                  evals=watchlist)
#predict test set
test_y = model.predict(dtest)
print test_y
#
# Feature subsets and DMatrices for the two xgboost models.
X_train_b = X_train[cols_b]
X_test_a = X_test[cols_a]
X_test_b = X_test[cols_b]
dtrain = xgb.DMatrix(X_train,y)
dtest= xgb.DMatrix(X_test)
dtrain_b = xgb.DMatrix(X_train_b,y)
dtest_b = xgb.DMatrix(X_test_b)
param_a = {'subsample':0.55, 'eta':0.05, 'seed': 10, 'max_depth': 4, 'gamma': 0.75,'objective':'reg:linear','colsample_bytree':0.7,'eval_metric': 'rmse','nthread': 8, 'min_child_weight': 4.0 ,'early_stopping_rounds':10,'verbose_eval':10,'booster':'gbtree'}
num_round_a = 3000
# Hold out the first 30% of the training frame for local validation ids.
n = int(0.3*74067)
df_train_train = df_train.iloc[:n]
df_train_test = df_train.iloc[n:]
id_test = df_train_test['id'].astype(int)
# CV both parameter sets; report the round index with minimal test rmse.
clf_a = xgb.cv(param_a, dtrain, num_round_a,nfold = 5,metrics={'rmse'}, seed = 0)
clf_a[clf_a['test-rmse-mean']==min(clf_a['test-rmse-mean'])].index.tolist()
param_b = {'subsample':0.55, 'eta':0.05, 'seed': 10, 'max_depth': 4, 'gamma': 0.75,'objective':'reg:linear','colsample_bytree':0.7,'eval_metric': 'rmse','nthread': 8, 'min_child_weight': 4.0 ,'early_stopping_rounds':10,'verbose_eval':10,'booster':'gblinear'}
num_round_b = 8000
clf_b = xgb.cv(param_b, dtrain_b, num_round_b,nfold = 5,metrics={'rmse'}, seed = 0)
clf_b[clf_b['test-rmse-mean']==min(clf_b['test-rmse-mean'])].index.tolist()
########################## xgboost model 1 ##################################
num_round = 8000
# Fix: the original referenced undefined names dtrain_a/dtest_a; model "a"
# is trained and evaluated on the full-feature matrices dtrain/dtest built
# above (consistent with the CV call for param_a).
bst = xgb.train(param_a,dtrain,num_round_a)
print("predict...")
y_pred = bst.predict(dtest)
# Clamp predictions to the task's minimum valid relevance of 1.0.
for i in range(len(y_pred)):
    if y_pred[i]<1.0:
        y_pred[i] = 1.0
# Demonstration of the xgb.train signature with every keyword spelled out.
clf=xgb.train(params, dtrain, num_boost_round=10, evals=[], obj=None,
              feval=None, maximize=False, early_stopping_rounds=None,
              evals_result=None, verbose_eval=True, xgb_model=None,
              callbacks=None, learning_rates=None)
'''
num_boost_round:boost迭代次数
evals:一对对 (DMatrix, string)组成的列表,培训期间将评估哪些指标的验证集列表。验证指标将帮助我们跟踪模型的性能。用evallist = [(dtest, 'eval'), (dtrain, 'train')]指定。
obj
feval:自定义评价函数
maximize
early_stopping_rounds:验证指标需要至少在每轮early_stopping_rounds中改进一次才能继续训练,例如early_stopping_rounds=200表示每200次迭代将会检查验证指标是否有改进,如果没有就会停止训练,如果有多个指标,则只判断最后一个指标
evals_result
verbose_eval:取值可以是bool型也可以是整数,当取值为True时,表示每次迭代都显示评价指标,当取值为整数时,表示每该取值次数轮迭代后显示评价指标
xgb_model
callbacks
learning_rates
'''
# Demonstration of the xgb.cv signature with every keyword spelled out.
xgb.cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False,
       folds=None, metrics=(), obj=None, feval=None, maximize=False,
       early_stopping_rounds=None, fpreproc=None, as_pandas=True,
       verbose_eval=None, show_stdv=True, seed=0, callbacks=None,
       shuffle=True)
'''
model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)
model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot()
'''
bst.save_model('0001.model')
# NOTE(review): clf here is a Booster from xgb.train; the keyword set below
# matches Booster.predict — confirm against the installed xgboost version.
ypred = clf.predict(data, output_margin=False, ntree_limit=None,
                    validate_features=True)
'''
ntree_limit:限制预测中的树数;如果定义了最佳树数限制,则默认为最佳树数限制,否则为0(使用所有树)
'''
# Visualize a single tree (index 2) from the trained booster.
xgb.plot_tree(bst, num_trees=2)
xgb.to_graphviz(bst, num_trees=2)
mdl = xgb.train(params, d_train, 1600, watchlist, early_stopping_rounds=150, maximize=True, verbose_eval=100) # Add model to the list of models (one for each fold) models_by_fold.append(mdl) # In[ ]: n_folds = 5 early_stopping = 10 params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic', 'seed': 99, 'silent': 1, 'eval_metric':'auc', 'nthread':4} xg_train = xgb.DMatrix(Xtrain_hashed, label=y_train); cv = xgb.cv(params, xg_train, 5000, nfold=n_folds, early_stopping_rounds=early_stopping, verbose_eval=1) # ## Work in progess-K folds # In[ ]: K = 5 kf = KFold(n_splits = K, random_state = 3228, shuffle = True) # In[ ]: for train_index, test_index in kf.split(train):
help= "path to CSV file with labels reflecting relevances of pairs (theorem, premise)"
)
parser.add_argument(
    "output_directory",
    help=
    "path to directory where performance of tested model should be saved")
args = parser.parse_args()
# Load labels (y) and the pickled feature matrix (X).
y = read_csv(os.path.abspath(args.y), type_of_records="int")
X = load_obj(os.path.abspath(args.X))
output_directory = os.path.abspath(args.output_directory)
dtrain = xgb.DMatrix(X, label=y)
# Hyperparameters come from the externally-defined dict p.
params = {
    "max_depth": p["max_depth"],
    "eta": p["eta"],
    "gamma": p["gamma"],
    "objective": "binary:logistic"
}
# 4-fold CV reporting error, AUC and logloss simultaneously.
x = xgb.cv(params=params,
           dtrain=dtrain,
           num_boost_round=p["num_boost_round"],
           early_stopping_rounds=p["early_stopping_rounds"],
           nfold=4,
           metrics={"error", "auc", "logloss"})
# File name encodes the parameter values that produced these stats.
output_name = os.path.join(output_directory,
                           "_".join(map(str, list(p.values()))) + ".pkl")
save_obj({"params": p, "stats": x}, output_name)
# Compute the accuracy of the predictions: accuracy accuracy = float(np.sum(y_pred_4 == y_test)) / y_test.shape[0] print("accuracy:", accuracy) # Create the DMatrix: churn_dmatrix churn_dmatrix = xgb.DMatrix(data=X, label=y) # Create the parameter dictionary: params params = {"objective": "reg:logistic", "max_depth": 3} # Perform cross-validation: cv_results cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5, metrics="error", as_pandas=True, seed=123) # Print cv_results print(cv_results) # Print the accuracy print(((1 - cv_results["test-error-mean"]).iloc[-1])) # Perform cross_validation: cv_results cv_results = xgb.cv(dtrain=churn_dmatrix, params=params, nfold=3, num_boost_round=5,
# get the parameters for xgboost file_name = sys.argv[1] folder_name = file_name.split('.')[0] with open('../data/params_0119/' + file_name) as fread: jj = 0 for line in fread.readlines(): params = line.split(',') params[-1] = params[-1].strip() jj += 1 # get the parameters for xgboost plst = get_params(eta=params[0], min_child_weight=params[1], subsample=params[2], colsample_bytree=params[3], max_depth=params[4]) # print 'file: '+file_name+",params: "+str(jj) # train model model = xgb.cv(plst, xgtrain0, num_boost_round=xgb_num_rounds, metrics=['auc'], show_progress=True, show_stdv=True) # get preds # train_validation_preds = model.predict(xgtrain_validation, ntree_limit=model.best_iteration) # print 'file: '+file_name+",params: "+str(jj)+' Train score is:', eval_wrapper(train_validation_preds, train_validation['Response'])
def kfold(self, x_train, y_train, nfold=5):
    """Cross-validate this model's params with a custom MAE feval and
    return the final row of the CV history (last boosting round).
    """
    train_matrix = xgb.DMatrix(x_train, y_train)
    history = xgb.cv(params=self.params,
                     dtrain=train_matrix,
                     nfold=nfold,
                     num_boost_round=self.num_boost_round,
                     early_stopping_rounds=10,
                     feval=xg_eval_mae,
                     maximize=False)
    last_round = history.iloc[-1, :]
    return last_round
def xgb_evaluate(max_depth, min_child_weight, colsample_bytree, subsample,
                 gamma, colsample_bylevel, max_delta_step, eta, reg_alpha,
                 reg_lambda):
    """Bayesian-optimization objective: run xgb.cv with the proposed
    hyperparameters and return the validation Gini (2*AUC - 1).

    Side effects: updates globals AUCbest/ITERbest and appends to the two
    log files.
    """
    global AUCbest
    global ITERbest

    # Clamp the continuous proposals into their valid ranges and cast the
    # integer-valued parameters.
    params = {}
    params['booster'] = 'gbtree'
    params['min_child_weight'] = int(min_child_weight)
    # Fix: this key was misspelled 'cosample_bytree', so the tuned
    # colsample_bytree value was silently ignored by xgboost.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = gamma
    params['colsample_bylevel'] = max(min(colsample_bylevel, 1), 0)
    params['max_delta_step'] = max(int(max_delta_step), 0)
    params['eta'] = max(min(eta, 1), 0)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda'] = max(min(reg_lambda, 1), 0)
    params['eval_metric'] = 'auc'
    params['silent'] = True
    params['objective'] = 'binary:logistic'
    params['seed'] = 42

    print("\n Search parameters (%d-fold validation):\n %s" %
          (folds, params), file=log_file)
    log_file.flush()

    xgbc = xgb.cv(params,
                  dtrain,
                  num_boost_round=nrounds,
                  stratified=True,
                  nfold=folds,
                  early_stopping_rounds=100,
                  metrics='auc',
                  show_stdv=True)

    # Last row = metrics at the early-stopped round.
    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' % (
        len(xgbc), train_score, val_score, (train_score - val_score),
        (train_score*2-1), (val_score*2-1)), file=log_file)

    if (val_score > AUCbest):
        AUCbest = val_score
        ITERbest = len(xgbc)
        print('\n\nBest Valid AUC changed to %f' % AUCbest, file=log_file)
        log_file.flush()
        # print("\n Best parameters (%d-fold validation):\n %s" % (folds, params), file=log_file_bestparam )
        print('\n Best Valid AUC changed to %f' % AUCbest,
              file=log_file_bestparam)
        print('\n Train AUC is %f' % train_score, file=log_file_bestparam)
        log_file_bestparam.flush()

    # del xgbc
    gc.collect()
    # Gini coefficient, the quantity the optimizer maximizes.
    return (val_score*2) - 1
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()
##################################################################
## Random Forest; Spend 7 mins
# Let's try a random forest with the features we just created.
rf_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1,
                                random_state=0)  # Initialize a Random Forest classifier with 100 trees
# Use cross validation to evaluate the performance of Random Forest
rf_clf_error = 1 - cross_val_score(rf_clf, train_data_features,
                                   train['sentiment'], cv=5,
                                   scoring='accuracy', n_jobs=-1).mean()
print(rf_clf_error)  # Random Forest training error: 0.1573
##################################################################
## XGBoost; much slower than the random forest above
# Let's try a XGBoost with the features we created.
dtrain = xgb.DMatrix(train_data_features, label=train['sentiment'])  # Create xgb trianing set and parameters
params = {'silent': 1, 'nthread': -1, 'eval_metric': 'error'}
print('The cross validation may take a while...')
# Use cross validation to evaluate the performance of XGBoost
# (very slow — the author reports having killed the run)
xgb_cv_results = xgb.cv(params, dtrain, num_boost_round=100, nfold=5,
                        show_stdv=False, seed=0)
# NOTE(review): this averages test error over ALL boosting rounds rather
# than taking the final round — confirm that is the intended comparison.
xgb_error = xgb_cv_results['test-error-mean'].mean()
print(xgb_error)  # XGBoost trianing error: 0.1829
# It seems that Random Forest out-performed XGBoost. Thus we'll create a submission file with Random Forest.
################################################################## ## Creating a Submission of Random Forest; 经过上面在 test 上面的对比, 最终采用 Random Forest # Fit the forest to the training set, using the bag of words as features and the sentiment labels as labels rf_clf.fit(train_data_features, train['sentiment']) # This may take a few minutes to run, 比上面的 XGBoost 快多了 # Use the random forest to make sentiment label predictions result = rf_clf.predict(test_data_features); print(result) # Copy the results to a pandas dataframe with an "id" column an a "sentiment" column output = pd.DataFrame(data={"id":test["id"], "sentiment":result}) ################################################################## ## 保存结果, 提交到比赛页面 # Use pandas to write the comma-separated output file output.to_csv("tmp-Bag_of_Words_rf_clf_results.csv", index=False, quoting=3)
# Fitting the ANN to the Training set model_history=classifier.fit(X_train.values, y_train.values,validation_split=0.20, batch_size = 10, epochs = 760) import xgboost as xgb data_dmatrix = xgb.DMatrix(data=X_train,label=y_train) xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 500) xg_reg.fit(X_train,y_train) y_pred = xg_reg.predict(X_test) params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1, 'max_depth': 5, 'alpha': 10} cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3, num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123) cv_results.head() y_pred=classifier.predict(test) pred=pd.DataFrame(y_pred) sub_df=pd.read_csv('sample_submission.csv') datasets=pd.concat([sub_df['Id'],pred],axis=1) datasets.columns=['Id','SalePrice'] datasets.to_csv('sample_submission.csv',index=False) datasets.head()
# Author: 杨秀隆 sndnyang <*****@*****.**> # sndnyang.github.io # Description: # ##################################################### import xgboost as xgb if __name__ == '__main__': dtrain = xgb.DMatrix("train.buffer") param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' } param['nthread'] = 4 num_round = 10 print xgb.cv( param, dtrain, num_round, nfold=5, metrics={'error'}, seed = 0 ) """test_features = pd.read_csv('test/orig_feature.csv') test_features = pd.read_csv('test/test_features.csv') test_features = test_features.sort('enrollment_id') columns = test_features.columns[2:] test_features[columns].values # print columns fp = file('lsvm_submit.csv', 'w') prd = clf.predict(test_features[columns].values) for eid, cls in zip(test_features['enrollment_id'].values, prd):
######################################### Modeling ############################################### ### PROC REG -- stepwise selection -- predicting TARGET ### PROC GENMOD -- using stepwise selected variables -- log link dist=nb ### PROC Logisitc -- using stepwise selected variables -- ######################################## XGBoost ############################################### X = data.iloc[:, 3:] y = data.iloc[:, 0] dTrain = xgb.DMatrix(data=X, label=y) params = {'objective': 'count:poisson', 'max_depth': 4} cv_results = xgb.cv(dtrain=dTrain, params=params, nfold=4, num_boost_round=10, metrics='error', as_pandas=True) print("Accuracy: %f" % ((1 - cv_results["test-error-mean"]).iloc[-1])) bst = xgb.train(params, dTrain) preds = bst.predict(dTrain) print("RMSE: %f" % np.sqrt(mean_squared_error(y, preds))) X = data.iloc[:, 3:] y = data.iloc[:, 0] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123) xg_reg = xgb.XGBRegressor(objective='reg:linear', n_estimators=10, seed=123) xg_reg.fit(X_train, y_train)
subsample=0.65,
    colsample_bytree=0.7,
    learning_rate=0.01,
    objective='multi:softmax',  # loss to minimize; predicts the class label directly
    num_class=5,  # number of target classes
    gamma=0,  # minimum split-loss penalty
    reg_alpha=0.05,
    reg_lambda=0.05,
    nthread=4,
    seed=27)
xgtrain = xgb.DMatrix(X, label=y)
xgb_param = clf.get_xgb_params()
# Stratified CV with early stopping to find the best tree count.
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=5000,
                  nfold=5,
                  metrics=['mlogloss'],
                  early_stopping_rounds=50,
                  stratified=True,
                  seed=1301)
print('Best number of trees = {}'.format(cvresult.shape[0]))
# Set clf's n_estimators to the CV-selected tree count.
clf.set_params(n_estimators=cvresult.shape[0], use_label_encoder=False)
clf.fit(X, y, eval_metric='merror')
# NOTE(review): dtest_x is built but predict is called on the raw X1 —
# the DMatrix appears unused; confirm.
dtest_x = xgb.DMatrix(X1)
pre = clf.predict(X1)
w_score(y1, pre)
#--------------------------------------------------------------
#number Two
# sklearn-style grid-search CV tuning.
param_test1 = {
subsample=0.77, colsample_bytree=0.7, objective='reg:linear', nthread=4, alpha=.1, lamda=1) # ### Build xgboost model # In[21]: xgb_param = model.get_xgb_params() xgtrain = xgb.DMatrix(features_filtered, label=y) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=model.get_params()['n_estimators'], nfold=5, early_stopping_rounds=50) model.set_params(n_estimators=cvresult.shape[0]) model.fit(features_filtered, y) model.predict(features_filtered) # In[30]: path = 'C:\\Users\\DALab\\Desktop\\data contest\\test data' test = pd.DataFrame(np.nan, index=range(75000), columns=['id', 'time', '1st', '2nd', '3rd', '4th']) index = 0 for file in listdir(path): temp = pd.read_excel(path + '\\' + file, header=None)
plt.show() #The residual plot looks pretty good.To wrap it up let's predict on the test set # and submit on the leaderboard: #Adding an xgboost model: #Let's add an xgboost model to our linear model to see if we can improve our score: import xgboost as xgb dtrain = xgb.DMatrix(X_train, label=y) dtest = xgb.DMatrix(X_test) params = {"max_depth": 2, "eta": 0.1} model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100) model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot( ) # will plot train and test errors plt.show() model_xgb = xgb.XGBRegressor( n_estimators=360, max_depth=2, learning_rate=0.1) #the params were tuned using xgb.cv model_xgb.fit(X_train, y) # Output: XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0, # learning_rate=0.1, max_delta_step=0, max_depth=2, # min_child_weight=1, missing=None, n_estimators=360, nthread=-1, # objective='reg:linear', reg_alpha=0, reg_lambda=1, # scale_pos_weight=1, seed=0, silent=True, subsample=1)
def xgb_r2_score(preds, dtrain):
    # Custom feval for xgb.cv.
    # NOTE(review): the metric NAME is 'rmse' but the VALUE is an R² score
    # (hence maximize=True in the cv call below) — the label is misleading;
    # renaming it would change the cv result's column names, so it is left
    # as-is here.
    labels = dtrain.get_label()
    return 'rmse', r2_score(labels, preds)


# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)
# xgboost, cross-validation with a custom objective (logcoshobj, defined
# elsewhere) and the R² feval above, maximized.
cv_result = xgb.cv(xgb_params,
                   dtrain,
                   obj=logcoshobj,
                   feval=xgb_r2_score,
                   num_boost_round=10000,  # increase to have better results (~700)
                   early_stopping_rounds=100,
                   verbose_eval=50,
                   show_stdv=False,
                   maximize=True
                   )
best_iteration = cv_result.shape[0] - 1
print(best_iteration)
# Positional columns 2/3 hold the test-metric mean/std in this layout.
cv_mean = cv_result.iloc[-1, 2]
cv_std = cv_result.iloc[-1, 3]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
#num_boost_rounds = len(cv_result)
#print('num_boost_rounds=' + str(num_boost_rounds))
# train model
model = xgb.train(dict(xgb_params, silent=1), dtrain, obj=logcoshobj,
                  num_boost_round=best_iteration)
def train_test_xgb(dataset='data/dataset_for_analysis_test.csv',
                   finetuning=False,
                   force_data_prep=True):
    """Train and test an xgboost regressor on the ovitrap dataset.

    1) define target and predictors, prepare dataset
    2) OPTIONAL: hyper-parameter tuning (grid search over xgb.cv)
    3) train and test model, save CV-scores
    4) print scores and feature importance

    Args:
        dataset: path of the prepared CSV; (re)built when missing or
            force_data_prep is True.
        finetuning: when True, grid-search hyperparameters with 5-fold CV;
            otherwise use a fixed, previously-tuned set.
        force_data_prep: always rebuild the dataset from raw inputs.
    """
    # 1) define target and predictors, prepare dataset
    # notes: FLDAS updates after 2 months
    # alternatives
    # precipitation: NASA/GPM_L3/IMERG_V06
    # define labels of predictors and target
    X_labels_base = [
        'MODIS_006_MOD11A1_LST_Day_1km',
        'MODIS_006_MOD11A1_LST_Night_1km',
        # 'MODIS_006_MYD13A1_EVI',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Qair_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Rainf_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_SoilMoi00_10cm_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_SoilTemp00_10cm_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Wind_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Tair_f_tavg',
        'JAXA_GPM_L3_GSMaP_v6_operational_hourlyPrecipRateGC'
    ]
    y_label = 'mean_ovi'
    # define number of time steps to use (past observations)
    n_time_steps = 3
    # define labels of predictors at different time steps
    X_labels = []
    for ts in range(n_time_steps):
        X_labels += [x + '_' + str(ts) for x in X_labels_base]
    # prepare dataset
    if not os.path.exists(dataset) or force_data_prep:
        print(
            'dataset not found, preparing from raw data (this might take a wile)'
        )
        # define input data path
        ovitrap_data = 'data/ovitrap_data_month_adm2.csv'
        weather_data = 'data/merged_adm2.csv'
        prepare_dataset_for_training(ovitrap_data,
                                     weather_data,
                                     X_labels_base,
                                     filename=dataset,
                                     n_time_steps=n_time_steps)
    df = pd.read_csv(dataset)
    df = df.reset_index()
    df.date = pd.to_datetime(df.date)
    # save original dataset for later (inference)
    df_uncut = df.copy()
    df_uncut = df_uncut.dropna(subset=X_labels)
    df = df.dropna(subset=[y_label])
    # QA cuts
    # removing a few data points based on poor correlation with observed "good" predictors
    df = df[(df.date.dt.year >= 2013) & (df.date.dt.year <= 2017)]
    bad_provinces = [
        'Nueva Vizcaya', 'Surigao del Norte', 'Sarangani', 'Siquijor',
        'Dinagat Islands', 'Isabela', 'Capiz', 'South Cotabato',
        'Maguindanao', 'Biliran', 'Davao Occidental', 'Quirino', 'Guimaras',
        'Aurora'
    ]
    df = df[~df.adm_level.isin(bad_provinces)]
    df = df[df.count_ovi > 5]

    # 2) OPTIONAL: hyper-parameter tuning
    if finetuning:
        # define training data
        dtrain = xgb.DMatrix(data=df[X_labels], label=df[y_label])
        # define grid of possible hyper-parameter values
        gridsearch_params = [{
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'min_split_loss': min_split_loss,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_lambda': reg_lambda,
            'reg_alpha': reg_alpha,
            'booster': booster
        } for max_depth in [2, 5, 10, 20]
                             for learning_rate in [0.05, 0.1, 0.2]
                             for n_estimators in [1000, 2000, 5000]
                             for min_split_loss in [0., 0.1, 1.]
                             for subsample in [0.5, 0.8, 1.]
                             for colsample_bytree in [0.3, 0.6, 1.]
                             for reg_lambda in [1, 1.5, 2]
                             for reg_alpha in [0., 0.1, 1.]
                             for booster in ['gbtree']]
        # fixed hyper-parameters
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'num_boost_round': 1000
        }
        # loop over grid, find best hyper-parameters
        min_rmse = float("Inf")
        best_params = None
        for params_grid in tqdm(gridsearch_params):
            # Update parameters
            for name, value in params_grid.items():
                params[name] = value
            # Run CV
            cv_results = xgb.cv(dtrain=dtrain,
                                params=params,
                                num_boost_round=params['num_boost_round'],
                                early_stopping_rounds=50,
                                seed=42,
                                nfold=5,
                                as_pandas=True)
            # Update best RMSE
            mean_rmse = cv_results['test-rmse-mean'].min()
            boost_rounds = cv_results['test-rmse-mean'].values.argmin() + 1
            print("\trmse {} for {} / {} rounds".format(
                mean_rmse, boost_rounds, len(cv_results)))
            if mean_rmse < min_rmse:
                min_rmse = mean_rmse
                # Fix: snapshot the dict. The original stored a reference
                # (`best_params = params`) to the dict that keeps being
                # mutated each iteration, so "best" ended up being the
                # LAST grid point, not the best one.
                best_params = dict(params)
        # print results
        print("best params: {}".format(best_params))
        print("min rmse: {}".format(min_rmse))
    else:
        # used fixed hyper-parameters
        best_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'num_boost_round': 1000,
            'max_depth': 20,
            'learning_rate': 0.2,
            'n_estimators': 5000,
            'subsample': 1.0,
            'colsample_bytree': 1.0,
            'reg_lambda': 2,
            'reg_alpha': 1.0,
            'min_split_loss': 0.
        }

    # 3) train and test model, save CV-scores
    nfolds = 5
    X, y = df[X_labels], df[y_label]
    dmdata = xgb.DMatrix(data=X, label=y)
    cv_results = xgb.cv(dtrain=dmdata,
                        params=best_params,
                        nfold=nfolds,
                        metrics=["rmse", "mae"],
                        num_boost_round=best_params['num_boost_round'],
                        early_stopping_rounds=50,
                        as_pandas=True,
                        seed=42)
    boost_rounds = len(cv_results)

    # 4) print scores and feature importance
    # print average performance & feature importance
    print("cv-performance: MAE {}, RMSE {}".format(
        cv_results["test-mae-mean"].tail(1).values[0],
        cv_results["test-rmse-mean"].tail(1).values[0]))
    xg_reg = xgb.train(dtrain=dmdata,
                       params=best_params,
                       num_boost_round=boost_rounds)
    # Predict over the full (uncut) dataset for downstream inference.
    dpredict = xgb.DMatrix(data=df_uncut[X_labels],
                           label=df_uncut[y_label])
    predictions = xg_reg.predict(dpredict)
    df_pred = df_uncut.copy()
    df_pred[y_label] = predictions
    df_pred = inverse_transform(df_pred, dataset, [y_label] + X_labels)
    df_pred.to_csv('output/dataset_predictions.csv')
    xg_reg.save_model('models/best_model.json')
    # Train-set R² restricted to rows with an observed target.
    df_uncut = df_uncut.dropna(subset=[y_label])
    dpredict = xgb.DMatrix(data=df_uncut[X_labels],
                           label=df_uncut[y_label])
    predictions = xg_reg.predict(dpredict)
    print('R2 train', r2_score(df_uncut[y_label].values, predictions))
    xgb.plot_importance(xg_reg)
    plt.tight_layout()
    plt.show()
def main():
    """End-to-end training/prediction pipeline for the regression task.

    Steps:
      1. Load train/test frames via ``data_util.load_dataset`` and split off
         the target column ``y`` and the ``ID`` column.
      2. Run 5 repeated hold-out rounds: for each, use ``xgb.cv`` with early
         stopping to pick a boosting-round count, train a model, and record
         train/validation R^2 scores.
      3. Retrain one XGBoost model on ALL training data using the average of
         the per-round best boosting counts, and predict the test set.
      4. Fit a stacked sklearn pipeline (LassoLarsCV / GradientBoosting) on
         the same data and blend its predictions with XGBoost's
         (0.784 / 0.216 weights) into the final submission CSV.

    Side effects: writes ``model_stacking_result.csv`` and the blended
    submission to ``Configure.submission_path``. Returns None.
    """
    print('load datas...')
    train, test = data_util.load_dataset()

    # Separate target and drop identifier columns; keep test IDs for output.
    y_train_all = train['y']
    del train['ID']
    del train['y']
    id_test = test['ID']
    del test['ID']
    print('train:', train.shape, ', test:', test.shape)

    train_r2_scores = []
    val_r2_scores = []
    num_boost_roundses = []  # best boosting-round count from each CV repeat

    X_test = test
    df_columns = train.columns.values
    dtest = xgb.DMatrix(X_test, feature_names=df_columns)

    xgb_params = {
        'eta': 0.005,
        'max_depth': 4,
        'subsample': 0.93,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }

    # 5 repeated hold-out evaluations with different random splits.
    for i in range(0, 5):
        random_state = 42 + i
        X_train, X_val, y_train, y_val = train_test_split(
            train, y_train_all, test_size=0.25, random_state=random_state)

        dtrain = xgb.DMatrix(X_train, y_train, feature_names=df_columns)
        dval = xgb.DMatrix(X_val, y_val, feature_names=df_columns)

        # base_score = mean(target) centers the initial prediction.
        y_mean = np.mean(y_train)
        cv_result = xgb.cv(
            dict(xgb_params, base_score=y_mean),
            dtrain,
            num_boost_round=2000,  # upper bound; early stopping trims it
            early_stopping_rounds=50,
        )
        # xgb.cv truncates its result at the early-stopped round, so its
        # length is the chosen number of boosting rounds.
        num_boost_rounds = len(cv_result)
        num_boost_roundses.append(num_boost_rounds)

        model = xgb.train(dict(xgb_params, base_score=y_mean), dtrain,
                          num_boost_round=num_boost_rounds)

        train_r2_score = r2_score(dtrain.get_label(), model.predict(dtrain))
        val_r2_score = r2_score(dval.get_label(), model.predict(dval))
        print('perform {} cross-validate: train r2 score = {}, validate r2 score = {}'.format(
            i + 1, train_r2_score, val_r2_score))
        train_r2_scores.append(train_r2_score)
        val_r2_scores.append(val_r2_score)

    print('\naverage train r2 score = {}, average validate r2 score = {}'.format(
        sum(train_r2_scores) / len(train_r2_scores),
        sum(val_r2_scores) / len(val_r2_scores)))

    # Use the mean of the per-repeat best round counts for the final model.
    best_num_boost_rounds = sum(num_boost_roundses) // len(num_boost_roundses)
    print('best_num_boost_rounds =', best_num_boost_rounds)

    # train model on the full training set
    print('training on total training data...')
    dtrain_all = xgb.DMatrix(train, y_train_all, feature_names=df_columns)
    model = xgb.train(dict(xgb_params, base_score=np.mean(y_train_all)),
                      dtrain_all, num_boost_round=best_num_boost_rounds)

    print('predict submit...')
    xgb_result = model.predict(dtest)

    # ===================================== model stacking =================================
    stacked_pipeline = make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(
            estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                loss="huber",
                                                max_depth=3,
                                                max_features=0.55,
                                                min_samples_leaf=18,
                                                min_samples_split=14,
                                                subsample=0.7)),
        LassoLarsCV())
    stacked_pipeline.fit(train.values, y_train_all)
    stack_results = stacked_pipeline.predict(X_test.values)

    # Save the pure-stacking predictions, then the blended final submission.
    df_sub = pd.DataFrame({'ID': id_test, 'y': stack_results})
    df_sub.to_csv('model_stacking_result.csv', index=False)

    y_pred = xgb_result * 0.784 + stack_results * 0.216
    df_sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
    df_sub.to_csv(Configure.submission_path, index=False)
'colsample_bytree': 0.8, 'silent': 1, 'subsample': 0.6, 'learning_rate': 0.01, 'objective': 'reg:linear', 'max_depth': 1, 'num_parallel_tree': 1, 'min_child_weight': 1, 'eval_metric': 'rmse', } res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False, early_stopping_rounds=25, verbose_eval=10, show_stdv=True) best_nrounds = res.shape[0] - 1 cv_mean = res.iloc[-1, 0] cv_std = res.iloc[-1, 1] print('Ensemble-CV: {0}±{1}'.format(cv_mean, cv_std)) gbdt = xgb.train(xgb_params, dtrain, best_nrounds) submission = pd.read_csv(SUBMISSION_FILE) submission.iloc[:, 1] = gbdt.predict(dtest)