Пример #1
0
    def train(self):
        """Preprocess ``self.df``, choose the round count by CV, then fit ``self.clf``.

        Steps performed:
          1. ``self.df`` is replaced by ``self.preprocess(self.df)``.
          2. Every column except the target and id columns becomes a
             predictor (stored in ``self.predictors``).
          3. A 5-fold ``xgb.cv`` run with early stopping is executed and the
             classifier's ``n_estimators`` is set to the number of rows of
             the CV result.
          4. The classifier is fitted on the full frame and training-set
             metrics are printed according to ``self.target_type``
             ('binary' or 'linear'; any other value prints nothing).

        Raises:
            TypeError: if ``xgb.cv`` rejects the call even without any
                verbosity keyword.
        """
        print('#### preprocessing ####')
        self.df = self.preprocess(self.df)

        print('#### training ####')
        excluded = (self.target_column, self.id_column)
        self.predictors = [col for col in self.df.columns if col not in excluded]
        features = self.df[self.predictors]
        target = self.df[self.target_column]

        xgb_param = self.clf.get_xgb_params()
        xgtrain = xgb.DMatrix(features, label=target, missing=np.nan)

        # The keyword that controls progress output apparently differs between
        # xgboost releases (`show_progress` vs `verbose_eval`), so each spelling
        # is tried in turn and finally none at all.  Only TypeError (the error
        # raised for an unexpected keyword) triggers the fallback; a bare
        # `except:` would also swallow KeyboardInterrupt and genuine failures,
        # silently re-running the whole cross-validation.
        verbosity_variants = (
            {'show_progress': self.verbose},
            {'verbose_eval': self.verbose},
            {},
        )
        for verbosity in verbosity_variants:
            try:
                cvresult = xgb.cv(
                    xgb_param, xgtrain,
                    num_boost_round=self.clf.get_params()['n_estimators'],
                    nfold=5,
                    metrics=[self.scoring],
                    early_stopping_rounds=self.early_stopping_rounds,
                    **verbosity)
            except TypeError:
                if not verbosity:
                    # Nothing left to drop: the failure is not keyword-related.
                    raise
                continue
            break

        # One row per boosting round kept after early stopping.
        self.clf.set_params(n_estimators=cvresult.shape[0])
        self.clf.fit(features, target, eval_metric=self.scoring)

        # Predict training set:
        train_df_predictions = self.clf.predict(features)

        if self.target_type == 'binary':
            train_df_predprob = self.clf.predict_proba(features)[:, 1]
            print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_df_predictions))
            print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_df_predprob))
        elif self.target_type == 'linear':
            mse = metrics.mean_squared_error(target.values, train_df_predictions)
            print("Mean squared error: %f" % mse)
            print("Root mean squared error: %f" % np.sqrt(mse))
Пример #2
0
    def test_cv_explicit_fold_indices_labels(self):
        """Check that xgb.cv uses explicitly supplied (train, test) index folds.

        Labels equal the row index, so printing each fold's test labels from
        a callback reveals which rows ended up in the test split.
        """
        n_rows, n_features = 100, 3
        cv_params = {
            'max_depth': 2,
            'eta': 1,
            'verbosity': 0,
            'objective': 'reg:linear',
        }
        dmat = xgb.DMatrix(data=np.random.randn(n_rows, n_features),
                           label=np.arange(n_rows))

        # Each entry is (train indices, test indices).
        explicit_folds = [
            ([1, 3], [5, 8]),
            ([7, 9], [23, 43, 11]),
        ]

        def log_test_labels(env):
            # Emit the test labels of every fold on stdout.
            print([pack.dtest.get_label() for pack in env.cvfolds])

        # Capture stdout while cross validation runs so the callback's
        # output can be compared with the expected fold contents.
        with captured_output() as (out, err):
            xgb.cv(cv_params, dmat,
                   num_boost_round=1,
                   folds=explicit_folds,
                   callbacks=[log_test_labels],
                   as_pandas=False)
            output = out.getvalue().strip()

        expected = ('[array([5., 8.], dtype=float32), '
                    'array([23., 43., 11.], dtype=float32)]')
        assert output == expected
def test_sklearn_nfolds_cv():
    """Compare plain, sklearn-splitter and built-in stratified cross validation.

    All three runs must return the same number of rounds, and the run driven
    by a ``StratifiedKFold`` splitter must end on the same first-column value
    as the run using ``stratified=True``.
    """
    tm._skip_if_no_sklearn()
    from sklearn.datasets import load_digits
    from sklearn.model_selection import StratifiedKFold

    # `n_class` is passed by keyword: newer scikit-learn releases make it
    # keyword-only, while the keyword form also works on older releases.
    digits = load_digits(n_class=3)
    X = digits['data']
    y = digits['target']
    dm = xgb.DMatrix(X, label=y)

    params = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'multi:softprob',
        'num_class': 3,
    }

    seed = 2016
    nfolds = 5
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)

    cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    cv2 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, folds=skf, seed=seed)
    cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
    assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
    assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
Пример #4
0
    def test_cv(self):
        """xgb.cv returns a DataFrame with the four error columns, or an ndarray.

        The DataFrame checks are repeated with the progress log enabled and
        with the standard deviation display disabled; the final call asks for
        a raw (10, 4) array via ``as_pandas=False``.
        """
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')
        params = {'max_depth': 2, 'eta': 1, 'silent': 1,
                  'objective': 'binary:logistic'}

        import pandas as pd

        expected_columns = pd.Index([u'test-error-mean', u'test-error-std',
                                     u'train-error-mean', u'train-error-std'])

        def check_frame(result):
            # Shared assertions for every DataFrame-returning variant.
            assert isinstance(result, pd.DataFrame)
            assert result.columns.equals(expected_columns)

        check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10))

        # show progress log (result is the same as above)
        check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10,
                           show_progress=True))
        check_frame(xgb.cv(params, dm, num_boost_round=10, nfold=10,
                           show_progress=True, show_stdv=False))

        # return np.ndarray
        raw = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(raw, np.ndarray)
        assert raw.shape == (10, 4)
Пример #5
0
	def test_custom_objective(self):
		"""Train and cross-validate with a hand-written logistic objective.

		The objective returns gradient and hessian of the logistic loss on raw
		margins; the metric is the share of labels that disagree with
		``margin > 0``.  Training must reach a test error below 0.1.
		"""
		booster_params = {'max_depth': 2, 'eta': 1, 'silent': 1}
		rounds = 2
		eval_sets = [(dtest, 'eval'), (dtrain, 'train')]

		def logistic_objective(margins, dmat):
			truth = dmat.get_label()
			prob = 1.0 / (1.0 + np.exp(-margins))
			return prob - truth, prob * (1.0 - prob)

		def error_rate(margins, dmat):
			truth = dmat.get_label()
			mismatches = truth != (margins > 0.0)
			return 'error', float(sum(mismatches)) / len(truth)

		# test custom_objective in training
		bst = xgb.train(booster_params, dtrain, rounds, eval_sets,
				logistic_objective, error_rate)
		assert isinstance(bst, xgb.core.Booster)
		preds = bst.predict(dtest)
		labels = dtest.get_label()
		wrong = sum(1 for p, y in zip(preds, labels) if int(p > 0.5) != y)
		err = wrong / float(len(preds))
		assert err < 0.1

		# test custom_objective in cross-validation
		xgb.cv(booster_params, dtrain, rounds, nfold=5, seed=0,
				obj=logistic_objective, feval=error_rate)
Пример #6
0
def test_sklearn_nfolds_cv():
    """Compare plain, sklearn-fold and built-in stratified cross validation.

    All three runs must return the same number of rounds, and the run driven
    by the ``StratifiedKFold`` object must end on the same first-column value
    as the run using ``stratified=True``.
    """
    # `n_class` is passed by keyword: newer scikit-learn releases make it
    # keyword-only, while the keyword form also works on older releases.
    digits = load_digits(n_class=3)
    X = digits['data']
    y = digits['target']
    dm = xgb.DMatrix(X, label=y)

    params = {
        'max_depth': 2,
        'eta': 1,
        'silent': 1,
        'objective': 'multi:softprob',
        'num_class': 3,
    }

    seed = 2016
    nfolds = 5
    # NOTE(review): this call shape (labels first, `n_folds=`) looks like the
    # legacy sklearn.cross_validation API; confirm which StratifiedKFold the
    # file imports before modernising it.
    skf = StratifiedKFold(y, n_folds=nfolds, shuffle=True, random_state=seed)

    # Not referenced by name below; kept because the `.iloc` checks only work
    # if xgb.cv can return DataFrames (presumably requires pandas).
    import pandas as pd
    cv1 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, seed=seed)
    cv2 = xgb.cv(params, dm, num_boost_round=10, folds=skf, seed=seed)
    cv3 = xgb.cv(params, dm, num_boost_round=10, nfold=nfolds, stratified=True, seed=seed)
    assert cv1.shape[0] == cv2.shape[0] and cv2.shape[0] == cv3.shape[0]
    assert cv2.iloc[-1, 0] == cv3.iloc[-1, 0]
Пример #7
0
def xgb_model(all_file, num=200, debug=True):
    """Run 5-fold AUC cross validation of a binary XGBoost model on a CSV file.

    Rows with ``tag == 1`` form the training set.  The columns ``Idx``,
    ``ListingInfo``, ``target`` and ``tag`` are dropped from the features,
    ``target`` is the label, and missing feature values are encoded as -1.

    :param all_file: path of the gb18030-encoded CSV file to read.
    :param num: unused here; only the commented-out feature selection below
        refers to it.  Kept so existing callers keep working.
    :param debug: when true only the first 500 rows of the file are read.
    :return: None; results are reported through xgboost's progress output.
    """
    # nrows=None makes pandas read the whole file.
    all_data = pd.read_csv(all_file, nrows=500 if debug else None, encoding='gb18030')
    train_data = all_data[all_data['tag'] == 1]
    feature_data = train_data.drop(['Idx', 'ListingInfo', 'target', 'tag'], axis=1)
    feature_data.fillna(-1, inplace=True)
    labels = train_data['target']
    # feature_importance = pd.read_csv(features_importance_file)
    # feature_importance_columns = feature_importance['feature'].tolist()
    # feature_importance_columns = feature_importance_columns[:num]
    # final_train_data = feature_data[feature_importance_columns]
    final_train_data = feature_data
    # Parenthesised single-argument print works on both Python 2 and 3.
    print(final_train_data.shape)
    # -1 is the fill value used above, so xgboost treats it as "missing".
    dtrain = xgb.DMatrix(final_train_data, label=labels, missing=-1)
    # xgb_params = {'subsample':0.9, 'min_child_weight': 1, 'eval_metric': 'rmse', 'fit_const': 0.5,
    #               'nthread': 3, 'num_round': 700, 'gamma': 5, 'max_depth': 6, 'eta': 0.01,
    #               'colsample_bytree': 0.6, 'silent': 1, 'objective': 'binary:logistic'}
    # xgb_params = {'num_round': 2200, 'colsample_bytree': 0.4, 'silent': 1, 'eval_metric': 'auc', 'nthread': 3,
    #               'min_child_weight': 1, 'subsample': 0.66, 'eta': 0.006, 'fit_const': 0.6, 'objective': 'binary:logistic',
    #               'max_depth': 6, 'gamma': 0}
    xgb_params = {'num_round': 2400, 'colsample_bytree': 0.5, 'silent': 1, 'eval_metric': 'auc', 'nthread': 3,
                  'min_child_weight': 6, 'subsample': 0.8, 'eta': 0.016, 'fit_const': 0.4, 'objective': 'binary:logistic',
                  'max_depth': 10, 'gamma': 1}

    xgb.cv(xgb_params, dtrain, num_boost_round=2400, nfold=5, metrics={'auc'}, show_progress=True)
    print('finished')
Пример #8
0
	def test_fpreproc(self):
		"""Cross-validate with a per-fold preprocessing hook.

		The hook sets ``scale_pos_weight`` to the negative/positive label ratio
		of the fold's training matrix and hands everything back to xgboost.
		"""
		booster_params = {
			'max_depth': 2,
			'eta': 1,
			'silent': 1,
			'objective': 'binary:logistic',
		}
		rounds = 2

		def rebalance(fold_train, fold_test, fold_params):
			labels = fold_train.get_label()
			negatives = float(np.sum(labels == 0))
			positives = np.sum(labels == 1)
			fold_params['scale_pos_weight'] = negatives / positives
			return (fold_train, fold_test, fold_params)

		xgb.cv(booster_params, dtrain, rounds, nfold=5,
				metrics={'auc'}, seed=0, fpreproc=rebalance)
Пример #9
0
def cross_validation():
    """Print a 2-round, 5-fold error CV once a ``TripType_*`` key exists in ``train_y``.

    Keys of the module-level ``train_y`` are scanned in sorted order; at the
    first key starting with ``TripType_`` a single cross validation is run on
    ``train_X`` and the loop stops.  Nothing happens if no key matches.
    """
    for k in sorted(train_y.keys()):
        if k.startswith('TripType_'):
            # NOTE(review): the whole `train_y` is used as the label rather
            # than `train_y[k]`; confirm that this is intended.
            dtrain = xgboost.DMatrix(train_X, label=train_y)
            params = {
                'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'
            }
            # The round count keyword of xgboost.cv is `num_boost_round`;
            # the previous `num_round=` is not a parameter of that function.
            cv_result = xgboost.cv(params, dtrain, num_boost_round=2, nfold=5,
                                   metrics={'error'}, seed=0)
            # Parenthesised single-argument print works on Python 2 and 3.
            print(cv_result)
            break
def cross_validation():
    """Run 3-fold cross validation on the pre-built offline DMatrix buffer.

    Uses the externally defined ``evalerror`` as custom metric (treated as
    "higher is better") for 1500 rounds.  Nothing is returned; results are
    only reported through xgboost's progress output.
    """
    train_matrix = xgb.DMatrix('dataset_dmatrix/offline_0516_sim.train.buffer')

    booster_params = {
        'max_depth': 5,
        'eta': 0.08,
        'silent': 1,
        'objective': 'binary:logistic',
        'nthread': 8,
        'subsample': 0.5,
    }
    rounds = 1500

    print('running cross validation')
    xgb.cv(
        booster_params, train_matrix, rounds,
        nfold=3,
        feval=evalerror,
        maximize=True,
        seed=0,
        show_progress=True,
        show_stdv=False
    )
Пример #11
0
def cross_validate(args):
    """
    Cross-validate a multi-class model on the data file named by ``args.input``.

    Usage: cv iq_training_data_svm.txt dummy --num_round=1000

    Modelled on:
    https://github.com/dmlc/xgboost/blob/master/demo/kaggle-higgs/higgs-cv.py
    https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cross_validation.py

    :param args: namespace-like object; all of its attributes are passed to
        xgboost as parameters, and ``input``, ``num_round`` and ``nfold`` are
        additionally read here.
    :return: None
    """
    dmatrix = xgb.DMatrix(args.input)
    booster_params = vars(args)
    fold_count = int(args.nfold)
    xgb.cv(
        booster_params,
        dmatrix,
        args.num_round,
        nfold=fold_count,
        metrics={'mlogloss', 'merror'},
        seed=0
    )
Пример #12
0
 def model_param_select(self):
     """
     Select the best parameter combination by k-fold cross validation.

     Every combination from ``ParameterGrid`` over max_depth, eta, subsample,
     objective and silent is cross-validated; the score of a combination is
     the first column of the last row of the CV result.

     :return: tuple ``(best_auc, best_param, best_iter_round)``; stays at
         ``(0, {}, 0)`` when no combination scores above 0.
     """
     params = {'max_depth': self.max_depth,
               'eta': self.eta,
               'subsample': self.subsample,
               'objective': self.objective,
               'silent': self.silent}
     best_auc, best_param, best_iter_round = 0, {}, 0
     for i, param in enumerate(ParameterGrid(params)):
         cv_result = xgb.cv(param, self.train_matrix,
                            num_boost_round=self.num_boost_round,  # max iter round
                            nfold=self.nfold,
                            stratified=self.stratified,
                            metrics=self.metrics,  # metrics focus on
                            early_stopping_rounds=self.early_stopping_rounds)  # stop when metrics not get better
         # `.ix` has been removed from pandas; `.iloc[-1, 0]` reads the same
         # cell (last row, first column) purely by position.
         # NOTE(review): which metric sits in column 0 depends on the column
         # order xgb.cv produces; confirm it is the test AUC mean.
         cur_auc = cv_result.iloc[-1, 0]
         cur_iter_round = len(cv_result)
         if cur_auc > best_auc:
             best_auc, best_param, best_iter_round = cur_auc, param, cur_iter_round
         print('Param select {}, auc: {}, iter_round: {}, params: {}, now best auc: {}'
               .format(i, cur_auc, cur_iter_round, param, best_auc))
     return best_auc, best_param, best_iter_round
 def train_cv(self, dtrain, **args):
     """Cross-validate on ``dtrain`` and hand the result to ``persit_result_cv``.

     ``args`` is translated into ``xgb.cv`` keyword arguments by
     ``self.parse_args(..., method='cv')``; the translated arguments are
     printed before the run.

     :return: whatever ``self.persit_result_cv`` returns.
     """
     cv_kwargs = self.parse_args(args, method='cv')
     print(cv_kwargs)
     cv_output = xgb.cv(dtrain=dtrain, **cv_kwargs)
     return self.persit_result_cv(cv_output, args)
Пример #14
0
def modelfit(alg, data, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    '''
    Fit ``alg`` on ``data['x_train'][predictors]`` and report training metrics.

    A variation of:
    http://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

    When ``useTrainCV`` is true, ``n_estimators`` is first replaced by the
    number of rows of an early-stopped AUC cross validation.  After fitting,
    accuracy and AUC on the training data are printed and the 20 largest
    feature scores are shown as a bar chart.  Returns the fitted ``alg``.
    '''
    train_x = data['x_train'][predictors]
    train_y = data['y_train']

    if useTrainCV:
        cv_history = xgb.cv(
            alg.get_xgb_params(),
            xgb.DMatrix(train_x, label=train_y),
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        alg.set_params(n_estimators=cv_history.shape[0])

    # Fit on the training data, then score the same data.
    alg.fit(train_x, train_y, eval_metric='auc')
    predicted_labels = alg.predict(train_x)
    predicted_probs = alg.predict_proba(train_x)[:, 1]

    # Model report
    accuracy = metrics.accuracy_score(train_y.values, predicted_labels)
    auc = metrics.roc_auc_score(train_y, predicted_probs)
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy)
    print("AUC Score (Train): %f" % auc)

    # Bar chart of the 20 highest feature scores.
    scores = pd.Series(alg.booster().get_fscore())
    ranked = scores.sort_values(ascending=False)
    ranked[:20].plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    return alg
Пример #15
0
	def xgbTuning(self, pX, change = 3):
		"""Random-search a few xgboost settings, then train with the best one.

		``change`` random parameter sets (eta, max_depth, alpha) are scored by
		3-fold AUC cross validation on the weighted full data.  The best set is
		then trained on a 67/33 split with early stopping, and a final booster
		is trained on the training part for the recorded number of rounds.

		:param pX: feature frame; its ``columns`` are used as feature names.
		:param change: number of random parameter sets to try.
		:return: tuple ``(best_score, booster)``.
		"""
		w = self.getWeight(self.y)
		dm = xgb.DMatrix(pX, self.y, weight=w)
		best_auc = 0
		best_params = None
		for _ in range(change):
			randp = np.random.random_sample(3)
			param = {
				'bst:eta': randp[0],
				'max_depth': int(3+6*randp[1]),
				'nthread': 4,
				'silent': 1,
				'alpha': randp[2],
				'eval_metric': 'auc',
				'objective': 'binary:logistic'
			}
			m = xgb.cv(param, dm, metrics='auc', nfold=3, num_boost_round=50, early_stopping_rounds=5)
			auc = m['test-auc-mean'].max()
			if auc > best_auc:
				# Parenthesised single-argument print works on Python 2 and 3.
				print('xgb:' + str(auc))
				best_auc = auc
				best_params = param
		# NOTE(review): best_params is still None here when `change` is 0 or no
		# trial scores above 0; confirm callers never hit that case.
		Xtrain, Xtest, ytrain, ytest = train_test_split(pX, self.y, test_size=.33)
		trainw = self.getWeight(ytrain)
		testw = self.getWeight(ytest)
		dtrain = xgb.DMatrix(Xtrain, label=ytrain, feature_names=Xtrain.columns, weight=trainw)
		dtest = xgb.DMatrix(Xtest, label=ytest, feature_names=Xtest.columns, weight=testw)
		evallist = [(dtrain, 'train'), (dtest, 'eval')]
		booster = xgb.train(best_params, dtrain, evals=evallist, num_boost_round=100, early_stopping_rounds=10)
		# The attributes are converted with int()/float() below before use.
		rounds = booster.attr("best_iteration")
		best_auc = booster.attr("best_score")
		# NOTE(review): if best_iteration is zero-based, int(rounds) trains one
		# round fewer than the best model; confirm.
		return float(best_auc), xgb.train(best_params, dtrain, num_boost_round=int(rounds))
Пример #16
0
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Tune ``n_estimators`` by CV, fit ``alg``, report metrics, export importances.

    The label column is ``dtrain['target']``.  Normalised feature scores are
    written to ``ips.csv`` in the working directory and the raw scores are
    printed in descending order.

    :param alg: xgboost sklearn-style classifier.
    :param dtrain: frame holding the predictor columns and ``target``.
    :param predictors: list of feature column names.
    :param useTrainCV: when false the function does nothing at all.
    :param cv_folds: number of CV folds.
    :param early_stopping_rounds: early-stopping patience for the CV run.
    """
    # NOTE(review): fitting and reporting only happen when useTrainCV is true
    # (the whole body sat under that `if`); confirm that skipping the fit is
    # really intended when it is false.
    if not useTrainCV:
        return

    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
    alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the DataFrame
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]

    # Print model report (parenthesised prints work on Python 2 and 3):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['target'], dtrain_predprob))

    # Fetch the scores once and reuse them for both the CSV and the printout.
    fscore = alg.booster().get_fscore()
    importances = sorted(fscore.items(), key=operator.itemgetter(1), reverse=True)
    df = pd.DataFrame(importances, columns=['feature', 'fscore'])
    # Normalise so the exported scores sum to 1.
    df['fscore'] = df['fscore'] / df['fscore'].sum()

    df.to_csv("ips.csv", index=False)

    feat_imp = pd.Series(fscore).sort_values(ascending=False)
    print(feat_imp)
Пример #17
0
def model_2(train, labels, test):
    """Train a regression booster sized by CV and predict ``test``.

    A 5-fold RMSE cross validation with early stopping sets the number of
    rounds (CV result rows minus one).  The returned predictions are
    ``np.exp`` of the booster output.
    """
    train_matrix = xgb.DMatrix(train, label=labels)
    test_matrix = xgb.DMatrix(test)

    xgb_params = {
        "objective": "reg:linear",
        "eta": 0.1,
        "subsample": 0.7,
        "silent": 1,
        "max_depth": 6,
        'eval_metric': 'rmse',
        'min_child_weight': 5,
        'seed': 22424,
    }

    cv_history = xgb.cv(
        xgb_params, train_matrix,
        num_boost_round=500,
        nfold=5,
        seed=2017,
        stratified=False,
        early_stopping_rounds=25,
        verbose_eval=10,
        show_stdv=True,
    )

    # The first two columns of the final row are reported as mean and std.
    final_row = cv_history.iloc[-1]
    cv_mean, cv_std = final_row.iloc[0], final_row.iloc[1]
    best_nrounds = cv_history.shape[0] - 1

    print('')
    print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))

    booster = xgb.train(xgb_params, train_matrix, best_nrounds)
    return np.exp(booster.predict(test_matrix))
 def cv(self, X, y):
   """Run a 3-fold ``xgb.cv`` with this estimator's settings.

   The data matrix comes from ``self.build_matrix(X, y)`` and the number of
   boosting rounds is ``self.num_round``.  Returns the ``xgb.cv`` result
   unchanged.
   """
   dmatrix = self.build_matrix(X, y)
   booster_params = {
     'silent': int(bool(self.silent)),
     'use_buffer': int(self.use_buffer),
     'num_round': self.num_round,
     'ntree_limit': self.ntree_limit,
     'nthread': self.nthread,
     'booster': self.booster,
     'eta': self.eta,
     'gamma': self.gamma,
     'max_depth': self.max_depth,
     'min_child_weight': self.min_child_weight,
     'subsample': self.subsample,
     'colsample_bytree': self.colsample_bytree,
     'max_delta_step': self.max_delta_step,
     'l': self.l,
     'alpha': self.alpha,
     'lambda_bias': self.lambda_bias,
     'objective': self.objective,
     'eval_metric': self.eval_metric,
     'seed': self.seed,
     'num_class': self.num_class,
   }
   return xgb.cv(booster_params, dmatrix,
                 num_boost_round=self.num_round, nfold=3)
Пример #19
0
def regression_with_xgboost(x_train, y_train, X_test, Y_test, features=None, use_cv=True, use_sklean=False, xgb_params=None):
    """Train an XGBoost regressor and predict ``X_test``.

    :param x_train, y_train: training features and targets.
    :param X_test, Y_test: evaluation features and targets.
    :param features: feature names passed to the feature-importance helpers
        (native-API path only).
    :param use_cv: when true, a 5-fold RMSE cross validation (up to 100
        rounds) sets the number of rounds; otherwise 10 rounds are used.
    :param use_sklean: when true, use the sklearn-style ``XGBRegressor``;
        otherwise the native ``xgb.train`` API.
    :param xgb_params: booster parameters; the caller's dict is not modified.
    :return: ``(model, y_pred)``; the model is the fitted ``XGBRegressor``
        or an ``XGBoostModel`` wrapping the native booster.
    """
    train_data = xgb.DMatrix(x_train, label=y_train, missing=float('nan'))
    test_data = xgb.DMatrix(X_test, Y_test, missing=float('nan'))

    #if xgb_params == None:
    #    xgb_params = get_default_xgboost_params()

    if use_cv:
        cvresult = xgb.cv(xgb_params, train_data, num_boost_round=100, nfold=5,
                          metrics={'rmse'}, show_progress=True)
        # Parenthesised single-argument print works on Python 2 and 3.
        print(cvresult)
        num_rounds = len(cvresult)
    else:
        num_rounds = 10

    if use_sklean:
        # Build a fresh dict so the caller's parameters are left untouched,
        # and unpack it: passing the dict positionally would bind it to the
        # constructor's first parameter instead of setting the options.
        # NOTE(review): keys that only exist in the native API (e.g. 'eta')
        # may not be accepted by XGBRegressor; confirm against the caller.
        sk_params = dict(xgb_params or {}, n_estimators=num_rounds)
        gbdt = xgboost.XGBRegressor(**sk_params)

        gbdt.fit(x_train, y_train)
        y_pred = gbdt.predict(X_test)

        return gbdt, y_pred

    evallist = [(test_data, 'eval'), (train_data, 'train')]
    #gbdt = xgb.train( xgb_params, train_data, num_rounds, evallist, verbose_eval = True, early_stopping_rounds=5)
    gbdt = xgb.train(xgb_params, train_data, num_rounds, evallist, verbose_eval=True)

    ceate_feature_map_for_feature_importance(features)
    show_feature_importance(gbdt, feature_names=features)

    y_pred = gbdt.predict(xgb.DMatrix(X_test, missing=float("nan")))
    return XGBoostModel(gbdt), y_pred
Пример #20
0
def modelfit(alg, train_data, train_label, cv_folds=5, early_stopping_rounds=1,
             model_path="/home/jche/Desktop/xgboost.p"):
    """Tune the round count by CV, fit and pickle ``alg``, report per-cutoff metrics.

    :param alg: xgboost sklearn-style classifier.
    :param train_data: training features.
    :param train_label: training labels.
    :param cv_folds: number of CV folds.
    :param early_stopping_rounds: early-stopping patience for the CV run.
    :param model_path: file the fitted model is pickled to (defaults to the
        previously hard-coded location).

    For every probability cutoff 0.00, 0.01, ..., 0.40 the accuracy, AUC,
    recall and confusion matrix on the training data are printed.
    """
    xgb_param = alg.get_xgb_params()
    xgtrain = xgb.DMatrix(train_data, label=train_label)
    cvresult = xgb.cv(xgb_param,
                      xgtrain,
                      num_boost_round=alg.get_params()['n_estimators'],
                      nfold=cv_folds,
                      metrics=['auc'],
                      early_stopping_rounds=early_stopping_rounds,
                      show_progress=True)
    alg.set_params(n_estimators=cvresult.shape[0])   # Goal of CV is to tune the number of rounds, which is set here

    # Note: can change to a different day to see what happens
    start = time.time()
    alg.fit(train_data,
            train_label,
            eval_metric='auc')
    print("Time to fit: %s" % (time.time() - start))

    # Save model.  Pickle output is bytes, so the file is opened in binary
    # mode, and the `with` block closes the handle even if dumping fails.
    with open(model_path, "wb") as model_file:
        pickle.dump(alg, model_file)

    start = time.time()
    dtrain_predprob = alg.predict_proba(train_data)[:, 1]
    print("Time to predict: %s" % (time.time() - start))

    # AUC depends only on the probabilities, not on the cutoff: compute once.
    train_auc = metrics.roc_auc_score(train_label, dtrain_predprob)

    for cutoff in range(0, 41):
        cut = cutoff / float(100)   # Cutoff in decimal form
        dtrain_predictions = dtrain_predprob > cut   # If y values are greater than the cutoff
        # Print model report:
        print("\nModel Report for cutoff %s" % cut)
        print("Accuracy : %.4g" % metrics.accuracy_score(train_label, dtrain_predictions))
        print("AUC Score (Train): %f" % train_auc)
        print("Recall is: %s" % metrics.recall_score(train_label, dtrain_predictions))
        print(metrics.confusion_matrix(train_label, dtrain_predictions))
Пример #21
0
 def fit(self, X, y):
     """Cross-validate on ``(X, y)`` and store the outcome in ``self.results``.

     Labels are encoded with ``XGBLabelEncoder``; ``classes_``, ``n_classes_``
     and ``_le`` are set on the instance.  With more than two classes,
     ``self.param_map`` is updated in place to use ``multi:softprob`` with
     the matching ``num_class``.
     """
     self.classes_ = np.unique(y)
     self.n_classes_ = len(self.classes_)
     self._le = XGBLabelEncoder().fit(y)
     training_labels = self._le.transform(y)
     xgdmat = xgb.DMatrix(X, label=training_labels)
     if self.n_classes_ > 2:
         self.param_map.update({'num_class': self.n_classes_,
                                'objective': 'multi:softprob'})
     # Every option is passed by keyword: a long positional list silently
     # binds values to the wrong parameters if the xgb.cv signature order
     # ever differs from the order assumed here.
     self.results = xgb.cv(self.param_map,
                           xgdmat,
                           num_boost_round=self.num_boost_round,
                           nfold=self.nfold,
                           stratified=self.stratified,
                           folds=self.folds,
                           metrics=self.metrics,
                           obj=self.obj,
                           feval=self.feval,
                           maximize=self.maximize,
                           early_stopping_rounds=self.early_stopping_rounds,
                           fpreproc=self.fpreproc,
                           as_pandas=self.as_pandas,
                           verbose_eval=self.verbose_eval,
                           show_stdv=self.show_stdv,
                           seed=self.seed,
                           callbacks=self.callbacks)
Пример #22
0
def modelfit(alg, dtrain, predictors, target, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain[predictors]`` against ``dtrain[target]`` and report.

    With ``useTrainCV`` the estimator's ``n_estimators`` is first replaced by
    the number of rows of an early-stopped AUC cross validation.  Training
    accuracy and AUC are printed, then all feature scores are plotted as a
    bar chart.
    """
    features = dtrain[predictors]
    labels = dtrain[target]

    if useTrainCV:
        cv_history = xgb.cv(
            alg.get_xgb_params(),
            xgb.DMatrix(features.values, label=labels.values),
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        alg.set_params(n_estimators=cv_history.shape[0])

    # Fit on the data, then score the same data.
    alg.fit(features, labels, eval_metric='auc')
    predicted_labels = alg.predict(features)
    predicted_probs = alg.predict_proba(features)[:, 1]

    # Model report
    accuracy = metrics.accuracy_score(labels.values, predicted_labels)
    auc = metrics.roc_auc_score(labels, predicted_probs)
    print("\nModel Report")
    print("Accuracy : %.4g" % accuracy)
    print("AUC Score (Train): %f" % auc)

    scores = pd.Series(alg.booster().get_fscore())
    ranked = scores.sort_values(ascending=False)
    ranked.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
Пример #23
0
def do_compute(x):
    """Cross-validate the hyper-parameters stored in row ``x`` of the global ``grid``.

    Runs a 4-fold RMSE cross validation with early stopping on the global
    ``dtrain``, writes the final-round mean/std RMSE and the number of rounds
    back into ``grid``, and dumps the whole grid to a per-process CSV file.

    :param x: row selector; used positionally to read the row and as a label
        to write the results (the two agree for a default integer index —
        NOTE(review): confirm ``grid`` has one).
    :return: mean test RMSE of the last CV round.
    """
    row = grid.iloc[x, :]
    xgb_pars = {'min_child_weight': row['min_child_weight'],
                'eta': row['eta'],
                'colsample_bytree': row['colsample_bytree'],
                'max_depth': int(row['max_depth']),
                'subsample': row['subsample'],
                'lambda': row['lambda'],
                'nthread': -1,
                'booster': 'gbtree',
                'silent': 1,
                'eval_metric': 'rmse',
                'objective': 'reg:linear'}
    #print(xgb_pars)
    model = xgb.cv(xgb_pars, dtrain, 100000, nfold=4, early_stopping_rounds=50,
                   maximize=False, verbose_eval=10)
    # The grid's own 'nround' value is never used as input; the round count
    # is whatever the early-stopped CV run kept.
    nround = model.shape[0]
    rmse_cv_mean = model['test-rmse-mean'].iloc[-1]
    rmse_cv_std = model['test-rmse-std'].iloc[-1]
    # Store the outcome of this parameter set back into the grid.
    grid.loc[x, 'rmse_cv_mean'] = rmse_cv_mean
    grid.loc[x, 'rmse_cv_std'] = rmse_cv_std
    grid.loc[x, 'nround'] = nround
    # One output file per worker process, keyed by its pid.
    grid.to_csv('base_grid_xgb_40perc__' + str(os.getpid()) + '.csv', index=False)
    return rmse_cv_mean
def xgbCV(dmatrix,  nfolds, eta_list, gamma_list, num_rounds = 500):
	"""Grid-search ``eta`` and ``gamma`` with early-stopped cross validation.

	:param dmatrix: training data passed to ``xgb.cv``.
	:param nfolds: number of CV folds.
	:param eta_list: learning rates to try.
	:param gamma_list: gamma values to try.
	:param num_rounds: maximum number of boosting rounds per run.
	:return: DataFrame with one row per (eta, gamma) pair and the columns
		eta, gamma, num_iter, mean_cv_error and std_cv_error, where the
		errors belong to the round with the smallest mean test error.
	"""
	base_params = {'objective': 'binary:logistic', 'verbose': 3,
				'max_depth': 20, 'subsample': .75, 'colsample_bytree': .75}

	vals = {'eta': [], 'gamma': [], 'num_iter': [], 'mean_cv_error': [], 'std_cv_error': []}

	for e in eta_list:
		for g in gamma_list:
			params = dict(base_params, eta=e, gamma=g)

			vals['eta'].append(e)
			vals['gamma'].append(g)

			print('Training the booster with a learning rate of', e, "and gamma of ", g)
			bst = xgb.cv(params, dmatrix, num_rounds, nfolds, early_stopping_rounds = 2)
			print('Stopped after', len(bst.index), "rounds.")

			# nsmallest(1, ...) yields a one-row frame; take scalars out of it
			# so the result columns hold numbers rather than 1-element Series.
			best_iter = bst.nsmallest(1, 'test-error-mean')
			vals['num_iter'].append(best_iter.index[0])
			vals['mean_cv_error'].append(best_iter['test-error-mean'].iloc[0])
			vals['std_cv_error'].append(best_iter['test-error-std'].iloc[0])

	cv_df = pd.DataFrame.from_dict(vals)

	return(cv_df)
def modelfit(alg, train, target, test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` on ``train``/``target`` and return class-1 probabilities for ``test``.

    With ``useTrainCV`` the estimator's ``n_estimators`` is first replaced by
    the number of rows of an early-stopped cross validation.  Training
    accuracy and AUC are printed and a feature-score bar chart is drawn (but
    not shown).

    :return: second column of ``alg.predict_proba(test)``.
    """
    if useTrainCV:
        xgboost_params = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(train.values, label=target.values)
        cvresult = xgb.cv(xgboost_params, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, early_stopping_rounds=early_stopping_rounds) #metrics='auc',show_progress=False
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(train, target, eval_metric='auc')

    # Predict training set:
    train_preds = alg.predict(train)
    train_predprob = alg.predict_proba(train)[:, 1]

    # Print model report (parenthesised prints work on Python 2 and 3):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(target.values, train_preds))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(target, train_predprob))

    # Make a prediction:
    print('Predicting......')
    test_predprob = alg.predict_proba(test)[:, 1]

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    #plt.show()
    return test_predprob
Пример #26
0
def testInstance(population,i,dtrain):
    """Score individual ``i`` of ``population`` by 5-fold cross validation.

    The individual's hyper-parameters are read from the per-attribute
    sequences on ``population``; the custom objective ``RMSPE_objective`` and
    metric ``rmspe_xg`` are used.

    :return: the last value of the ``test-rmspe-mean`` column.
    """
    rounds = int(population.nRound[i])
    params = {"objective": "reg:linear",
          "eta": population.eta[i],
          "max_depth": population.depth[i],
          "subsample": population.subsample[i],
          "colsample_bytree": population.colsample_bytree[i],
          "num_boost_round": rounds,
          "lambda": population.lamda[i],
          "alpha": population.alpha[i],
          "gamma": population.gamma[i],
          "min_child_weight": population.min_child_weight[i],
          "silent": 1,
          #"seed": 1301
          }
    history = xgb.cv(
        params,
        dtrain,
        #early_stopping_rounds=30, #no early stopping in Python yet!!!
        num_boost_round=rounds,
        nfold=5, # number of CV folds
        #nthread=12, # number of CPU threads
        show_progress=False,
        feval=rmspe_xg, # custom evaluation metric
        obj=RMSPE_objective
        #maximize=0 # the lower the evaluation score the better
        )
    # `Series.iget` has been removed from pandas; `.iloc[-1]` is the
    # positional equivalent for "last element".
    return history["test-rmspe-mean"].iloc[-1]
Пример #27
0
def modelfit(alg, dtrain, predictors, dtest=None, dscore=None, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit ``alg`` with log-loss, optionally score extra frames, and report.

    The label column is ``'target'`` in every frame.

    :param alg: xgboost sklearn-style classifier.
    :param dtrain: training frame.
    :param predictors: list of feature column names.
    :param dtest: optional DataFrame whose log-loss is printed as well.
    :param dscore: optional DataFrame whose class-1 probabilities are written
        to ``XGBoost_pred_raw.csv``.
    :param useTrainCV: when true, ``n_estimators`` is first replaced by the
        number of rows of an early-stopped log-loss cross validation.
    :param cv_folds: number of CV folds.
    :param early_stopping_rounds: early-stopping patience for the CV run.
    """
    has_test = isinstance(dtest, pd.DataFrame)

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        # The label column is addressed by the literal 'target', exactly as in
        # the rest of this function; the previous bare name `target` is not a
        # parameter or local of this function.
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain['target'].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics=['logloss'], early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])

    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['target'], eval_metric='logloss')

    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    if has_test:
        dtest_predprob = alg.predict_proba(dtest[predictors])[:, 1]
    if isinstance(dscore, pd.DataFrame):
        dscore_predprob = alg.predict_proba(dscore[predictors])[:, 1]
        np.savetxt('XGBoost_pred_raw.csv', dscore_predprob, delimiter=",")

    #Print model report (parenthesised prints work on Python 2 and 3):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions))
    print("Metric Score (Train): %f" % metrics.log_loss(dtrain['target'], dtrain_predprob))
    if has_test:
        print("Metric Score (Test): %f" % metrics.log_loss(dtest['target'], dtest_predprob))

    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
	def train_crossV(self, train_x, train_y, nfold=3, early_stopping_rounds=300, metrics=['auc']):
		"""Cross-validate a binary gbtree model with this object's settings.

		:param train_x: training features; -9999 marks missing values.
		:param train_y: training labels.
		:param nfold: number of CV folds.
		:param early_stopping_rounds: early-stopping patience.
		:param metrics: metrics forwarded to ``xgb.cv`` (the default list is
			never modified here).
		:return: the ``xgb.cv`` result.
		"""
		xgmat_train = xgb.DMatrix(train_x, label=train_y, missing=-9999)

		params = {
			'booster': 'gbtree',
			'objective': 'binary:logistic',
			'silent': self.silent,
			'eta': self.eta,
			'gamma': self.gamma,
			'max_depth': self.max_depth,
			# Key spelled the way xgboost expects it; with the former
			# 'min_chile_weitght' the value never reached the booster.
			# (The attribute name is defined elsewhere and left as is.)
			'min_child_weight': self.min_chile_weight,
			'subsample': self.subsample,
			'lambda': self.lambda_,
			'scale_pos_weight': self.scale_pos_weight,
			"colsample_bytree": self.colsample_bytree,
			# Was misspelled 'eval_metirc', so it was not recognised either.
			'eval_metric': 'auc',
			'seed': 2014,
			'nthread': self.threads
		}

		num_round = self.num_boost_round

		cv_result = xgb.cv(params, xgmat_train, num_boost_round=num_round,
				early_stopping_rounds=early_stopping_rounds, nfold=nfold,
				seed=1024, show_progress=True, metrics=metrics)

		return cv_result
Пример #29
0
    def test_cv(self):
        """With ``as_pandas=False`` xgb.cv must return a (10, 4) ndarray."""
        train_matrix = xgb.DMatrix(dpath + 'agaricus.txt.train')
        booster_params = {
            'max_depth': 2,
            'eta': 1,
            'silent': 1,
            'objective': 'binary:logistic',
        }

        # 10 rounds -> 10 rows; 4 columns of results.
        result = xgb.cv(booster_params, train_matrix,
                        num_boost_round=10, nfold=10, as_pandas=False)
        assert isinstance(result, np.ndarray)
        assert result.shape == (10, 4)
Пример #30
0
    def test_cv_no_shuffle(self):
        """Without shuffling and with ``as_pandas=False`` xgb.cv returns a 4-entry dict."""
        train_matrix = xgb.DMatrix(dpath + 'agaricus.txt.train')
        booster_params = {
            'max_depth': 2,
            'eta': 1,
            'silent': 1,
            'objective': 'binary:logistic',
        }

        # The raw (non-pandas) result is expected to be a dict here.
        result = xgb.cv(booster_params, train_matrix,
                        num_boost_round=10, nfold=10,
                        shuffle=False, as_pandas=False)
        assert isinstance(result, dict)
        assert len(result) == 4
Пример #31
0
import numpy as np
import xgboost as xgb

### load data and do training
dtrain = xgb.DMatrix('data/aga.train')
param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
num_round = 2

print('running cross validation')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value+std_value
# std_value is standard deviation of the metric
xgb.cv(param,
       dtrain,
       num_round,
       nfold=5,
       metrics={'error'},
       seed=0,
       callbacks=[xgb.callback.print_evaluation(show_stdv=True)])

print('running cross validation, disable standard deviation display')
# do cross validation, this will print result out as
# [iteration]  metric_name:mean_value
res = xgb.cv(param,
             dtrain,
             num_boost_round=10,
             nfold=5,
             metrics={'error'},
             seed=0,
             callbacks=[
                 xgb.callback.print_evaluation(show_stdv=False),
Пример #32
0
    def test_cv_as_pandas(self):
        """Check the DataFrame returned by ``xgb.cv`` across metric settings.

        Scenarios, in order: default metric (quiet, verbose, verbose without
        std-dev display); ``eval_metric`` given in params as str and as list;
        the same with early stopping; metric given via the ``metrics``
        argument; ``metrics`` overriding ``eval_metric``; and params supplied
        as a list of pairs. Where asserted, params must not lose their
        ``eval_metric`` entry.
        """
        dm = xgb.DMatrix(dpath + 'agaricus.txt.train')

        def make_params(**extra):
            # Fresh dict per scenario so one run cannot leak into the next.
            settings = {
                'max_depth': 2,
                'eta': 1,
                'verbosity': 0,
                'objective': 'binary:logistic'
            }
            settings.update(extra)
            return settings

        def assert_error_frame(frame):
            # The default metric must yield the four error mean/std columns.
            assert isinstance(frame, pd.DataFrame)
            expected = pd.Index([
                u'test-error-mean', u'test-error-std', u'train-error-mean',
                u'train-error-std'
            ])
            assert len(frame.columns.intersection(expected)) == 4

        # Default metric: plain call, progress log, progress log without stdv.
        # The same params dict is reused for all three runs.
        params = make_params()
        logging_variants = (
            {},
            {'verbose_eval': True},
            {'verbose_eval': True, 'show_stdv': False},
        )
        for logging_kwargs in logging_variants:
            cv = xgb.cv(params,
                        dm,
                        num_boost_round=10,
                        nfold=10,
                        **logging_kwargs)
            assert_error_frame(cv)

        # eval_metric supplied through params, as a string and as a list.
        for metric_spec in ('auc', ['auc']):
            params = make_params(eval_metric=metric_spec)
            cv = xgb.cv(params,
                        dm,
                        num_boost_round=10,
                        nfold=10,
                        as_pandas=True)
            assert 'eval_metric' in params
            assert 'auc' in cv.columns[0]

        # Early stopping must cut the run short of the 10 requested rounds.
        params = make_params(eval_metric=['auc'])
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    as_pandas=True,
                    early_stopping_rounds=1)
        assert 'eval_metric' in params
        assert 'auc' in cv.columns[0]
        assert cv.shape[0] < 10

        # Metric supplied through the ``metrics`` argument only.
        for metric_arg in ('auc', ['auc']):
            params = make_params()
            cv = xgb.cv(params,
                        dm,
                        num_boost_round=10,
                        nfold=10,
                        as_pandas=True,
                        metrics=metric_arg)
            assert 'auc' in cv.columns[0]

        # ``metrics`` takes precedence over params['eval_metric']; the same
        # params dict is deliberately shared by both runs.
        params = make_params(eval_metric=['auc'])
        for metric_arg in ('error', ['error']):
            cv = xgb.cv(params,
                        dm,
                        num_boost_round=10,
                        nfold=10,
                        as_pandas=True,
                        metrics=metric_arg)
            assert 'eval_metric' in params
            assert 'auc' not in cv.columns[0]
            assert 'error' in cv.columns[0]

        # Params given as a list of (key, value) pairs must stay a list.
        params = list(params.items())
        cv = xgb.cv(params,
                    dm,
                    num_boost_round=10,
                    nfold=10,
                    as_pandas=True,
                    metrics=['error'])
        assert isinstance(params, list)
        assert 'auc' not in cv.columns[0]
        assert 'error' in cv.columns[0]
    'subsample': 0.8,
    'learning_rate': 0.06,
    'colsample_bytree': 0.8,
    'eta': 0.02,
    'objective': 'binary:logistic',
    'sample_type': 'uniform',
    'normalize': 'tree',
    'rate_drop': 0.15,  # 0.1
    'skip_drop': 0.85,  # 0.9
    'nthread': -1,
}

dtrain = xgb.DMatrix(X_train, label=y_train)
xgb.cv(xgb_params,
       dtrain,
       num_boost_round=10000,
       nfold=5,
       verbose_eval=5,
       early_stopping_rounds=100)

model = xgb.train(xgb_params,
                  dtrain=dtrain,
                  num_boost_round=300,
                  evals=[(dtrain, 'train')],
                  verbose_eval=5)

dtest = xgb.DMatrix(X_test)
preds = model.predict(dtest)

print dtest

StackingSubmission = pd.DataFrame({'score': preds})
Пример #34
0
    def _train_model(self):
        """Tune (or load) hyperparameters and fit one booster per label.

        Steps:
          1. Build per-observation weights: when ``self.date_obs_weights`` is
             set they scale linearly with ``disp_date`` from 0 (earliest) to
             1 (latest); otherwise every observation gets weight 1.
          2. If ``{code_dir}/models/tune_logs.json`` does not exist, run
             ``bayes_opt_xgb`` for every label column and write the log
             there; otherwise load the existing log.
          3. Train one model per label using the logged parameters, and log
             either test-set results or CV results when there is no test
             data.
          4. Compute model properties via ``get_model_props`` and, when
             ``self.refit_full_model`` is set, fit the models again.

        Returns:
            dict with keys ``'models'`` (label name -> fitted booster) and
            ``'model_props'`` (the result of ``get_model_props``).
        """
        self.log.info("Training models...")

        train_data = self.data['train']['data']
        train_labels = self.data['train']['labels']

        if self.date_obs_weights:
            date_min = min(train_data['disp_date'])
            date_max = max(train_data['disp_date'])
            # NOTE(review): if every row shares one disp_date the span is
            # zero and this division fails -- confirm callers rule that out.
            span = date_max - date_min
            obs_weights = [(i - date_min) / span
                           for i in train_data['disp_date']]
        else:
            obs_weights = [1] * len(train_data.index)

        # Tune Hyperparameters ----------------------------------------------------------------

        tune_log_path = f"{self.code_dir}/models/tune_logs.json"

        if not os.path.exists(tune_log_path):
            self.log.info(
                "No hyperparameter tuning logs found! Finding optimal hyperparameters... (this will take a while)"
            )

            log_params = {}
            # For each label in the training dataset...
            # (.items() replaces .iteritems(), which newer pandas removed)
            for name, values in train_labels.items():
                # Generate dmatrix for xgb model
                train_dmatrix = xgboost.DMatrix(train_data,
                                                label=values,
                                                weight=obs_weights)
                # Optimize parameters and save to log
                log_params[name] = bayes_opt_xgb(
                    dmatrix=train_dmatrix,
                    log=self.log,
                    opt_fun=xgb_cv_fun,
                    opt_rounds=self.opt_rounds,
                    init_rounds=self.init_rounds,
                    params_ranges=self.params_ranges,
                    max_estimators=self.max_estimators)
            # Save log to file; the context manager guarantees the handle is
            # flushed and closed even if serialisation fails.
            with open(tune_log_path, "w") as log_file:
                json.dump(log_params, log_file, indent=4)

        else:
            # If a file already exists, load it
            self.log.info("Hyperparameter tuning logs found, loading...")
            with open(tune_log_path, "r") as log_file:
                log_params = json.load(log_file)

        # Train models -------------------------------------------------------------------------

        fits = {}
        for name, values in train_labels.items():

            self.log.info(f"Training {name}")

            train_dmatrix = xgboost.DMatrix(
                train_data,
                label=values,
                weight=obs_weights,
                feature_names=train_data.columns)

            # Get best hyperparameters for label from log: the entry stored
            # under the largest key.
            # NOTE(review): keys loaded from JSON are strings, so max()
            # compares them lexicographically -- confirm that is intended.
            log_best = log_params[name][max(log_params[name].keys())]
            best_params = log_best['params']

            fits[name] = xgboost.train(
                best_params,
                train_dmatrix,
                num_boost_round=log_best['fit_props']['n_estimators'])

            if len(self.data['test']['data'].index) > 0:
                # Print some quick "sanity check" results in test data
                test_dmatrix = xgboost.DMatrix(
                    self.data['test']['data'],
                    label=self.data['test']['labels'][name])
                self.log.info("Mean pred: " +
                              str(np.mean(fits[name].predict(test_dmatrix))) +
                              " (" +
                              str(np.mean(self.data['test']['labels'][name])) +
                              ")"
                              " Test AUC: " + str(
                                  metrics.roc_auc_score(
                                      self.data['test']['labels'][name],
                                      fits[name].predict(test_dmatrix))))
            else:
                # If no test data, check results using CV
                cv_results = xgboost.cv(
                    best_params,
                    train_dmatrix,
                    metrics='auc',
                    num_boost_round=log_best['fit_props']['n_estimators'])
                self.log.info(cv_results.iloc[-1])

        model_props = get_model_props(fits,
                                      data=self.data,
                                      out_weights=self.out_weights,
                                      instrument_trans=self.instrument_trans,
                                      log=self.log)

        if self.refit_full_model:

            for name, values in train_labels.items():

                self.log.info(f"Training {name} on training and test data")

                # NOTE(review): the message above mentions test data, but the
                # matrix below is built from self.data['train'] only --
                # confirm whether the test rows are merged in elsewhere.
                train_dmatrix = xgboost.DMatrix(
                    train_data,
                    label=values,
                    weight=obs_weights,
                    feature_names=train_data.columns)

                # Get best hyperparameters for label from log
                log_best = log_params[name][max(log_params[name].keys())]
                best_params = log_best['params']

                fits[name] = xgboost.train(
                    best_params,
                    train_dmatrix,
                    num_boost_round=log_best['fit_props']['n_estimators'])

        model_dict = {'models': fits, 'model_props': model_props}

        return model_dict
Пример #35
0
    def train(self):
        """Grid-search tree parameters with CV, then train and save the model.

        Every combination of ``self.max_depth`` x ``self.subsample`` x
        ``self.colsample_bytree`` is cross-validated on ``self.dtrain``; the
        combination with the lowest mean test mlogloss is used to train the
        final model, which is evaluated on ``self.dtest`` with early
        stopping and saved to ``self.out + 'xgb_repeats.model'``.

        Side effects:
            Sets ``self.model`` and ``self.boost_rounds`` and writes the
            model file.

        Raises:
            ValueError: if the grid is empty or no combination yields a
                comparable (non-NaN) CV score.
        """
        params = {
            'eta': self.eta,
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'nthread': self.threads,
            'num_class': len(self.incl)
        }

        print('\033[92m' + 'Using the following parameters:' + '\033[0m')
        print('eta: {}'.format(self.eta))
        print('num_rounds: {}'.format(self.num_rounds))
        print('early_stopping_rounds: {}'.format(self.early_stop))

        grid_params = [(max_depth, subsample, colsample_bytree)
                       for max_depth in self.max_depth
                       for subsample in self.subsample
                       for colsample_bytree in self.colsample_bytree]

        print('\033[92m' +
              'Cross-validating with {} folds:'.format(self.nfold) + '\033[0m')

        min_mlogloss = float("Inf")
        best_params = None

        for max_depth, subsample, colsample_bytree in grid_params:
            print("CV with max_depth={}, subsample={}, colsample_bytree={}".
                  format(max_depth, subsample, colsample_bytree))

            params['max_depth'] = max_depth
            params['subsample'] = subsample
            params['colsample_bytree'] = colsample_bytree

            cv_results = xgb.cv(params,
                                self.dtrain,
                                num_boost_round=self.num_rounds,
                                seed=self.rnd_seed,
                                nfold=self.nfold,
                                metrics={'mlogloss'},
                                early_stopping_rounds=self.early_stop)
            # Update best mlogloss
            test_scores = cv_results['test-mlogloss-mean']
            mean_mlogloss = test_scores.min()
            # argmin() is a 0-based row position and row i holds the score
            # after i + 1 boosting rounds, hence the + 1.
            boost_rounds = int(test_scores.argmin()) + 1

            print("\tmlogloss {} for {} rounds".format(mean_mlogloss,
                                                       boost_rounds))

            if mean_mlogloss < min_mlogloss:
                min_mlogloss = mean_mlogloss
                best_params = (max_depth, subsample, colsample_bytree,
                               boost_rounds)

        if best_params is None:
            # Fail with a clear message instead of "'NoneType' object is not
            # subscriptable" a few lines further down.
            raise ValueError(
                'grid search produced no usable result: the parameter grid '
                'is empty or every CV score was NaN')

        print("Best params: {}, {}, {}, mlogloss: {}".format(
            best_params[0], best_params[1], best_params[2], min_mlogloss))

        print('\033[92m' + 'Training final model' + '\033[0m')

        params['max_depth'] = best_params[0]
        params['subsample'] = best_params[1]
        params['colsample_bytree'] = best_params[2]

        # The final fit uses the full round budget and relies on early
        # stopping against the test set to pick the iteration.
        self.model = xgb.train(params,
                               self.dtrain,
                               num_boost_round=self.num_rounds,
                               evals=[(self.dtest, "Test")],
                               early_stopping_rounds=self.early_stop)

        self.boost_rounds = self.model.best_iteration

        self.model.save_model(self.out + 'xgb_repeats.model')
Пример #36
0
# Grid search over (max_depth, min_child_weight): keep the pair with the
# lowest cross-validated mean absolute error.
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))

    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain=dt,
        num_boost_round=300,
        seed=2,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )

    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a 0-based row position; row i holds the score after
    # i + 1 boosting rounds, so add one to report a round count.
    boost_rounds = int(cv_results['test-mae-mean'].argmin()) + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)

print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))
# Best (11,4)
Пример #37
0
                                  reg_alpha=0.005,
                                  learning_rate=0.21,
                                  max_depth=1,
                                  alpha=10,
                                  n_estimators=1000,
                                  min_child_weight=10,
                                  gamma=0.2,
                                  nthread=4,
                                  scale_pos_weight=1)
        datamatrix = xgb.DMatrix(data=X, label=Y)
        xgpara = xg_reg.get_xgb_params()
        cv_results = xgb.cv(
            dtrain=datamatrix,
            params=xgpara,
            nfold=6,
            num_boost_round=xg_reg.get_params()['n_estimators'],
            early_stopping_rounds=30,
            metrics="rmse",
            as_pandas=True,
            seed=123)
        xg_reg.set_params(n_estimators=cv_results.shape[0])
        sc = MinMaxScaler(feature_range=(0, 1200))
        ta = sc.fit_transform(ta[:, :])
        xg_reg.fit(ta[:, :-1], ta[:, -1])

        #Getting the Predictions For Non-Linear Residue
        xgpred = xg_reg.predict(ta[:, :-1])
        xg_reg.fit(ta1[:, :-1], ta1[:, -1])
        xgpred1 = xg_reg.predict(ta1[:, :-1])
        arimaPreds1 = arima_predictions[start:end, 1]
        arimaPreds2 = arimaPreds1.reshape((arimaPreds1.shape[0], 1))
Пример #38
0
# Search over the learning rate (eta): keep the value with the lowest
# cross-validated mean absolute error.
min_mae = float("Inf")
best_params = None
params['silent'] = 1

for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))

    # We update our parameters
    params['eta'] = eta

    # Run CV
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )

    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    # argmin() is a 0-based row position; row i holds the score after
    # i + 1 boosting rounds, so add one to report a round count.
    boost_rounds = int(cv_results['test-mae-mean'].argmin()) + 1
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta

print("Best params: {}, MAE: {}".format(best_params, min_mae))
print('================\n')
Пример #39
0
    'eta': .01,
    'colsample_bytree': .8,
    'subsample': .8,
    'seed': 0,
    'nthread': 16,
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',
    'num_class': 3,
    'silent': 0
}

dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

# Cross-validate to find the boosting round with the lowest test mlogloss.
cv_history = xgb.cv(params,
                    dtrain,
                    10000,
                    NFOLDS,
                    early_stopping_rounds=50,
                    verbose_eval=25)
# argmin gives a 0-based row position, and row i is the score after i + 1
# rounds. Using the bare position trained one round too few (and zero rounds
# when the very first round was the best), hence the + 1.
best_rounds = int(np.argmin(cv_history['test-mlogloss-mean'].values)) + 1
bst = xgb.train(params, dtrain, best_rounds)

bst.save_model('xgboostreduced2.model')
preds = bst.predict(dtest)
preds = pd.DataFrame(preds)
# One probability column per class, in this order.
cols = ['high', 'medium', 'low']
preds.columns = cols
preds['listing_id'] = test.listing_id.values

preds.to_csv('xgboostreduced2.csv', index=None)
Пример #40
0
# Grid search over (max_depth, min_child_weight) scored by cross-validated
# multi-class log loss; the whole search is timed.
best_params = None
t = time.time()
#search for max depth and min_child_weight
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
        max_depth, min_child_weight))

    # Start from a fresh default parameter set for every combination.
    params = initializeDefaultParamsGPU()

    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight

    cv_result = xgb.cv(params,
                       train_dmat,
                       num_boost_round=num_boost_round_default,
                       seed=seed,
                       nfold=nfold,
                       metrics=metrics,
                       early_stopping_rounds=early_stopping_round_default)

    mean_mlogloss = cv_result['test-mlogloss-mean'].min()
    # argmin() is a 0-based row position; row i holds the score after
    # i + 1 boosting rounds, so add one to report a round count.
    boost_rounds = int(cv_result['test-mlogloss-mean'].argmin()) + 1
    # The value printed is mlogloss; the message used to label it "MAE".
    print("\tmlogloss {} for {} rounds".format(mean_mlogloss, boost_rounds))
    if mean_mlogloss < min_mlogloss:
        min_mlogloss = mean_mlogloss
        best_params = (max_depth, min_child_weight)

print("Best max_depth and min_child_weight: {}, mlogloss: {}".format(
    best_params, min_mlogloss))
print("Time: {}".format(time.time() - t))
max_depth = best_params[0]
    'normalize': 'tree',
    'rate_drop': 0.1,
    'skip_drop': 0.9,
    'seed': 87,
    'nthread': 12,
    'slice': 0
}

watchlist = [(dtrain, 'train')]

print 'cv'
# Use CV to find the best number of boosting rounds
cv_log = xgb.cv(params,
                dtrain,
                num_boost_round=1000,
                nfold=5,
                metrics='rmse',
                early_stopping_rounds=50,
                seed=1024)
bst_rmse = cv_log['test-rmse-mean'].min()
cv_log['nb'] = cv_log.index
cv_log.index = cv_log['test-rmse-mean']
bst_nb = cv_log.nb.to_dict()[bst_rmse]

# watchlist  = [(dtrain,'train')]
model = xgb.train(params, dtrain, num_boost_round=bst_nb + 50, evals=watchlist)

#predict test set
test_y = model.predict(dtest)
print test_y
#
Пример #42
0
X_train_b = X_train[cols_b]
X_test_a = X_test[cols_a]
X_test_b = X_test[cols_b]
dtrain = xgb.DMatrix(X_train,y)
dtest= xgb.DMatrix(X_test)
dtrain_b = xgb.DMatrix(X_train_b,y)
dtest_b = xgb.DMatrix(X_test_b)

param_a = {'subsample':0.55, 'eta':0.05, 'seed': 10, 'max_depth': 4, 'gamma': 0.75,'objective':'reg:linear','colsample_bytree':0.7,'eval_metric': 'rmse','nthread': 8, 'min_child_weight': 4.0 ,'early_stopping_rounds':10,'verbose_eval':10,'booster':'gbtree'}
num_round_a = 3000
n = int(0.3*74067)
df_train_train = df_train.iloc[:n]
df_train_test = df_train.iloc[n:]
id_test = df_train_test['id'].astype(int)

clf_a = xgb.cv(param_a, dtrain, num_round_a,nfold = 5,metrics={'rmse'}, seed = 0)
clf_a[clf_a['test-rmse-mean']==min(clf_a['test-rmse-mean'])].index.tolist()

param_b = {'subsample':0.55, 'eta':0.05, 'seed': 10, 'max_depth': 4, 'gamma': 0.75,'objective':'reg:linear','colsample_bytree':0.7,'eval_metric': 'rmse','nthread': 8, 'min_child_weight': 4.0 ,'early_stopping_rounds':10,'verbose_eval':10,'booster':'gblinear'}
num_round_b = 8000
clf_b = xgb.cv(param_b, dtrain_b, num_round_b,nfold = 5,metrics={'rmse'}, seed = 0)
clf_b[clf_b['test-rmse-mean']==min(clf_b['test-rmse-mean'])].index.tolist()

########################## xgboost model 1 ##################################
num_round = 8000
bst = xgb.train(param_a,dtrain_a,num_round_a)
print("predict...")
y_pred = bst.predict(dtest_a)
for i in range(len(y_pred)):
    if y_pred[i]<1.0:
        y_pred[i] = 1.0
Пример #43
0
clf=xgb.train(params, dtrain, num_boost_round=10, evals=[], obj=None, feval=None, maximize=False, early_stopping_rounds=None, evals_result=None, verbose_eval=True, xgb_model=None, callbacks=None, learning_rates=None)
'''
num_boost_round:boost迭代次数
evals:一对对 (DMatrix, string)组成的列表,培训期间将评估哪些指标的验证集列表。验证指标将帮助我们跟踪模型的性能。用evallist = [(dtest, 'eval'), (dtrain, 'train')]指定。
obj
feval:自定义评价函数
maximize
early_stopping_rounds:验证指标需要至少在每轮early_stopping_rounds中改进一次才能继续训练,例如early_stopping_rounds=200表示每200次迭代将会检查验证指标是否有改进,如果没有就会停止训练,如果有多个指标,则只判断最后一个指标
evals_result
verbose_eval:取值可以是bool型也可以是整数,当取值为True时,表示每次迭代都显示评价指标,当取值为整数时,表示每该取值次数轮迭代后显示评价指标
xgb_model
callbacks
learning_rates
'''

xgb.cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None, metrics=(), obj=None, feval=None, maximize=False, early_stopping_rounds=None, fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True, seed=0, callbacks=None, shuffle=True)
'''
model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)
model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot()
'''

bst.save_model('0001.model')

ypred = clf.predict(data, output_margin=False, ntree_limit=None, validate_features=True)
'''
ntree_limit:限制预测中的树数;如果定义了最佳树数限制,则默认为最佳树数限制,否则为0(使用所有树)
'''

xgb.plot_tree(bst, num_trees=2)
xgb.to_graphviz(bst, num_trees=2)
Пример #44
0
    mdl = xgb.train(params, d_train, 1600, watchlist, early_stopping_rounds=150, maximize=True, verbose_eval=100)
    
    # Add model to the list of models (one for each fold)
    models_by_fold.append(mdl)


# In[ ]:


n_folds = 5
early_stopping = 10
params = {'eta': 0.02, 'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.7, 'objective': 'binary:logistic', 'seed': 99, 'silent': 1, 'eval_metric':'auc', 'nthread':4}

xg_train = xgb.DMatrix(Xtrain_hashed, label=y_train);

cv = xgb.cv(params, xg_train, 5000, nfold=n_folds, early_stopping_rounds=early_stopping, verbose_eval=1)


# ## Work in progess-K folds

# In[ ]:


K = 5
kf = KFold(n_splits = K, random_state = 3228, shuffle = True)


# In[ ]:


for train_index, test_index in kf.split(train):
Пример #45
0
        help=
        "path to CSV file with labels reflecting relevances of pairs (theorem, premise)"
    )
    parser.add_argument(
        "output_directory",
        help=
        "path to directory where performance of tested model should be saved")
    args = parser.parse_args()

    y = read_csv(os.path.abspath(args.y), type_of_records="int")
    X = load_obj(os.path.abspath(args.X))
    output_directory = os.path.abspath(args.output_directory)

    dtrain = xgb.DMatrix(X, label=y)
    params = {
        "max_depth": p["max_depth"],
        "eta": p["eta"],
        "gamma": p["gamma"],
        "objective": "binary:logistic"
    }
    x = xgb.cv(params=params,
               dtrain=dtrain,
               num_boost_round=p["num_boost_round"],
               early_stopping_rounds=p["early_stopping_rounds"],
               nfold=4,
               metrics={"error", "auc", "logloss"})

    output_name = os.path.join(output_directory,
                               "_".join(map(str, list(p.values()))) + ".pkl")
    save_obj({"params": p, "stats": x}, output_name)
# Compute the accuracy of the predictions: accuracy
accuracy = float(np.sum(y_pred_4 == y_test)) / y_test.shape[0]
print("accuracy:", accuracy)

# Create the DMatrix: churn_dmatrix
churn_dmatrix = xgb.DMatrix(data=X, label=y)

# Create the parameter dictionary: params
params = {"objective": "reg:logistic", "max_depth": 3}

# Perform cross-validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix,
                    params=params,
                    nfold=3,
                    num_boost_round=5,
                    metrics="error",
                    as_pandas=True,
                    seed=123)

# Print cv_results
print(cv_results)

# Print the accuracy
print(((1 - cv_results["test-error-mean"]).iloc[-1]))

# Perform cross_validation: cv_results
cv_results = xgb.cv(dtrain=churn_dmatrix,
                    params=params,
                    nfold=3,
                    num_boost_round=5,
# get the parameters for xgboost
file_name = sys.argv[1]
folder_name = file_name.split('.')[0]
with open('../data/params_0119/' + file_name) as fread:
    jj = 0
    for line in fread.readlines():
        params = line.split(',')
        params[-1] = params[-1].strip()
        jj += 1
        # get the parameters for xgboost
        plst = get_params(eta=params[0],
                          min_child_weight=params[1],
                          subsample=params[2],
                          colsample_bytree=params[3],
                          max_depth=params[4])

        # print 'file: '+file_name+",params: "+str(jj)

        # train model
        model = xgb.cv(plst,
                       xgtrain0,
                       num_boost_round=xgb_num_rounds,
                       metrics=['auc'],
                       show_progress=True,
                       show_stdv=True)

        # get preds
        # train_validation_preds = model.predict(xgtrain_validation, ntree_limit=model.best_iteration)
        # print 'file: '+file_name+",params: "+str(jj)+' Train score is:', eval_wrapper(train_validation_preds, train_validation['Response'])
 def kfold(self, x_train, y_train, nfold=5):
     """Cross-validate with the custom MAE metric and return the last CV row.

     Uses ``self.params`` and ``self.num_boost_round``; the metric is
     minimised and training stops after 10 rounds without improvement.
     """
     cv_matrix = xgb.DMatrix(x_train, y_train)
     cv_options = {
         'params': self.params,
         'dtrain': cv_matrix,
         'num_boost_round': self.num_boost_round,
         'nfold': nfold,
         'feval': xg_eval_mae,
         'maximize': False,
         'early_stopping_rounds': 10,
     }
     history = xgb.cv(**cv_options)
     # The final row holds the scores of the last retained boosting round.
     return history.iloc[-1, :]
def xgb_evaluate(max_depth,
                 min_child_weight,
                 colsample_bytree,
                 subsample,
                 gamma,
                 colsample_bylevel,
                 max_delta_step,
                 eta,
                 reg_alpha,
                 reg_lambda
         ):
    """Cross-validate one hyperparameter setting and return its validation Gini.

    The raw arguments are coerced before use: ``max_depth``,
    ``min_child_weight`` and ``max_delta_step`` are truncated to int
    (``max_delta_step`` floored at 0); ``colsample_bytree``, ``subsample``,
    ``colsample_bylevel``, ``eta`` and ``reg_lambda`` are clamped to
    [0, 1]; ``reg_alpha`` is floored at 0; ``gamma`` is passed unchanged.

    Reads the module-level names ``dtrain``, ``nrounds``, ``folds``,
    ``log_file`` and ``log_file_bestparam``. Updates the globals ``AUCbest``
    and ``ITERbest`` when the validation AUC improves, and writes progress
    to both log files.

    Returns:
        ``2 * val_auc - 1``, where ``val_auc`` is the mean test AUC of the
        last row of the CV result.
    """

    global AUCbest
    global ITERbest

    params={}
    params['booster'] = 'gbtree'
    params['min_child_weight'] = int(min_child_weight)
    # Key was previously misspelled 'cosample_bytree', so the searched value
    # was never applied by XGBoost.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = gamma
    params['colsample_bylevel'] = max(min(colsample_bylevel, 1), 0)
    params['max_delta_step']=max(int(max_delta_step),0)
    params['eta']=max(min(eta,1), 0)
    params['reg_alpha'] = max(reg_alpha, 0)
    params['reg_lambda']=max(min(reg_lambda, 1), 0)
    params['eval_metric']='auc'
    params['silent']=True
    params['objective']='binary:logistic'
    params['seed'] =42

    print("\n Search parameters (%d-fold validation):\n %s" % (folds, params), file=log_file )
    log_file.flush()

    xgbc = xgb.cv(
                    params,
                    dtrain,
                    num_boost_round = nrounds,
                    stratified = True,
                    nfold = folds,
                    early_stopping_rounds = 100,
                    metrics = 'auc',
                    show_stdv = True
               )

    # Scores of the last retained boosting round.
    val_score = xgbc['test-auc-mean'].iloc[-1]
    train_score = xgbc['train-auc-mean'].iloc[-1]
    print(' Stopped after %d iterations with train-auc = %f val-auc = %f ( diff = %f ) train-gini = %f val-gini = %f' 
          % ( len(xgbc), train_score, val_score, (train_score - val_score), (train_score*2-1),
(val_score*2-1)) , file=log_file)
    if ( val_score > AUCbest ):
        # New best: remember the score and round count, and record the
        # winning parameters in the dedicated best-parameter log.
        AUCbest = val_score
        ITERbest = len(xgbc)
        print('\n\nBest Valid AUC changed to %f'%AUCbest, file=log_file)
        log_file.flush()
        #
        print("\n Best parameters (%d-fold validation):\n %s" % (folds, params), file=log_file_bestparam )
        print('\n Best Valid AUC changed to %f'%AUCbest, file=log_file_bestparam)
        print('\n Train AUC is %f'%train_score, file=log_file_bestparam)
        log_file_bestparam.flush()
        #
    # Drop the CV frame and force a collection before returning.
    del xgbc
    gc.collect()
    return (val_score*2) - 1
Пример #50
0
test_data_features = vectorizer.transform(clean_test_reviews)
test_data_features = test_data_features.toarray()
##################################################################
## Random Forest; takes about 7 minutes
# Let's try a random forest with the features we just created.
rf_clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)  # Initialize a Random Forest classifier with 100 trees
# Use cross validation to evaluate the performance of Random Forest
rf_clf_error = 1 - cross_val_score(rf_clf, train_data_features, train['sentiment'], cv=5, scoring='accuracy', n_jobs=-1).mean()
print(rf_clf_error)  # Random Forest cross-validated error: 0.1573
##################################################################
## XGBoost; much slower than the Random Forest above
# Let's try a XGBoost with the features we created.
dtrain = xgb.DMatrix(train_data_features, label=train['sentiment'])  # Create xgb training set and parameters
params = {'silent': 1, 'nthread': -1, 'eval_metric': 'error'}
print('The cross validation may take a while...')  # Use cross validation to evaluate the performance of XGBoost
xgb_cv_results = xgb.cv(params, dtrain, num_boost_round=100, nfold=5, show_stdv=False, seed=0)  # very slow; the author ended up killing the run
# NOTE(review): this averages the test error over all boosting rounds, not
# just the final one -- confirm that is the intended comparison figure.
xgb_error = xgb_cv_results['test-error-mean'].mean()
print(xgb_error)  # XGBoost cross-validated error: 0.1829
# It seems that Random Forest out-performed XGBoost. Thus we'll create a submission file with Random Forest.
##################################################################
## Creating a Submission of Random Forest; after the comparison above, Random Forest is the final choice
# Fit the forest to the training set, using the bag of words as features and the sentiment labels as labels
rf_clf.fit(train_data_features, train['sentiment'])  # This may take a few minutes to run, much faster than the XGBoost run above
# Use the random forest to make sentiment label predictions
result = rf_clf.predict(test_data_features); print(result)
# Copy the results to a pandas dataframe with an "id" column and a "sentiment" column
output = pd.DataFrame(data={"id":test["id"], "sentiment":result})
##################################################################
## Save the results for submission to the competition page
# Use pandas to write the comma-separated output file
output.to_csv("tmp-Bag_of_Words_rf_clf_results.csv", index=False, quoting=3)
# Fitting the ANN to the Training set
# (20% of the training rows are held out for validation by the fit call itself;
# `classifier` is defined earlier in the file — presumably a Keras model, TODO confirm.)
model_history=classifier.fit(X_train.values, y_train.values,validation_split=0.20, batch_size = 10, epochs = 760)

import xgboost as xgb

# Same training data wrapped as a DMatrix for the native xgb.cv call below.
data_dmatrix = xgb.DMatrix(data=X_train,label=y_train)

# Scikit-learn style regressor, fitted on the full training set.
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,max_depth = 5, alpha = 10, n_estimators = 500)

xg_reg.fit(X_train,y_train)

# NOTE(review): this y_pred is overwritten further down by classifier.predict(test),
# so the XGBoost predictions never reach the submission.
y_pred = xg_reg.predict(X_test)

# Native-API parameter dict mirroring the regressor settings above.
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

# 3-fold CV, at most 50 rounds, stopping after 10 rounds without RMSE improvement.
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
                    num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)

# Bare expression: only displays something in a notebook cell.
cv_results.head()

# Final predictions come from the ANN, not from xg_reg.
y_pred=classifier.predict(test)

# Build the submission frame: 'Id' from the sample file, predictions as 'SalePrice'.
pred=pd.DataFrame(y_pred)
sub_df=pd.read_csv('sample_submission.csv')
datasets=pd.concat([sub_df['Id'],pred],axis=1)
datasets.columns=['Id','SalePrice']
# NOTE(review): this writes over the same 'sample_submission.csv' that was just read.
datasets.to_csv('sample_submission.csv',index=False)

# Bare expression: only displays something in a notebook cell.
datasets.head()
Пример #52
0
# Author: 杨秀隆 sndnyang <*****@*****.**>
#         sndnyang.github.io
# Description:
#
#####################################################

import xgboost as xgb

if __name__ == '__main__':

    # Load the training DMatrix from the file 'train.buffer'.
    dtrain = xgb.DMatrix("train.buffer")
    # Use the plain parameter names: the legacy 'bst:' prefix was only
    # understood by very old xgboost releases, so with a current release
    # max_depth and eta would not be applied.
    param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
    param['nthread'] = 4

    num_round = 10
    # 5-fold CV reporting the classification error. print() with a single
    # argument behaves the same on Python 2 and Python 3.
    print(xgb.cv(param, dtrain, num_round, nfold=5,
                 metrics={'error'}, seed=0))
    
    
    """test_features = pd.read_csv('test/orig_feature.csv')
    test_features = pd.read_csv('test/test_features.csv')
    
    test_features = test_features.sort('enrollment_id')
    columns = test_features.columns[2:]
    test_features[columns].values
    
    #
    print columns

    fp = file('lsvm_submit.csv', 'w')
    prd = clf.predict(test_features[columns].values)
    for eid, cls in zip(test_features['enrollment_id'].values, prd):
#########################################  Modeling  ###############################################

### PROC REG  -- stepwise selection  -- predicting TARGET

### PROC GENMOD  --  using stepwise selected variables  --  log link dist=nb

### PROC Logistic  --  using stepwise selected variables  --

########################################  XGBoost  ###############################################
# Features are every column from the fourth onward; the target is the first column.
X = data.iloc[:, 3:]
y = data.iloc[:, 0]
dTrain = xgb.DMatrix(data=X, label=y)
params = {'objective': 'count:poisson', 'max_depth': 4}
# 4-fold CV over 10 boosting rounds.
# NOTE(review): 'error' is a classification error metric; confirm it is meaningful
# together with the 'count:poisson' objective before trusting the "Accuracy" below.
cv_results = xgb.cv(dtrain=dTrain,
                    params=params,
                    nfold=4,
                    num_boost_round=10,
                    metrics='error',
                    as_pandas=True)
# Reported value is 1 - mean test error of the last CV round.
print("Accuracy: %f" % ((1 - cv_results["test-error-mean"]).iloc[-1]))
# No num_boost_round is passed here, so the library default is used.
bst = xgb.train(params, dTrain)
# Predictions are made on the training data itself, so this is an in-sample RMSE.
preds = bst.predict(dTrain)
print("RMSE: %f" % np.sqrt(mean_squared_error(y, preds)))

# Second experiment: same features/target, with an 80/20 train/test split.
X = data.iloc[:, 3:]
y = data.iloc[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=123)
xg_reg = xgb.XGBRegressor(objective='reg:linear', n_estimators=10, seed=123)
xg_reg.fit(X_train, y_train)
Пример #54
0
    subsample=0.65,
    colsample_bytree=0.7,
    learning_rate=0.01,
    objective='multi:softmax',  #需要被最小化的损失函数,选的是多分类预测类别
    num_class=5,  #指定类别数目
    gamma=0,  #惩罚参数
    reg_alpha=0.05,
    reg_lambda=0.05,
    nthread=4,
    seed=27)
# Use native xgb.cv with the classifier's own parameters to pick the number of trees.
xgtrain = xgb.DMatrix(X, label=y)
xgb_param = clf.get_xgb_params()
# Stratified 5-fold CV on multiclass log-loss; stops after 50 rounds without improvement.
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=5000,
                  nfold=5,
                  metrics=['mlogloss'],
                  early_stopping_rounds=50,
                  stratified=True,
                  seed=1301)

# The number of rows in cvresult is taken as the number of trees to use.
print('Best number of trees = {}'.format(cvresult.shape[0]))
clf.set_params(n_estimators=cvresult.shape[0],
               use_label_encoder=False)  # set clf to the tree count selected by CV
clf.fit(X, y, eval_metric='merror')
# NOTE(review): dtest_x is not used below; predictions are made on X1 directly.
dtest_x = xgb.DMatrix(X1)
pre = clf.predict(X1)
# w_score is defined elsewhere in the file; its return value is discarded here.
w_score(y1, pre)
#--------------------------------------------------------------
# Approach two:
# hyper-parameter tuning with sklearn's cross-validation
param_test1 = {
Пример #55
0
                         subsample=0.77,
                         colsample_bytree=0.7,
                         objective='reg:linear',
                         nthread=4,
                         alpha=.1,
                         lamda=1)

# ### Build xgboost model

# In[21]:

# Use native xgb.cv with the model's own parameters to choose the number of trees.
xgb_param = model.get_xgb_params()
xgtrain = xgb.DMatrix(features_filtered, label=y)
# 5-fold CV capped at the model's configured n_estimators; stops after 50 rounds
# without improvement.
cvresult = xgb.cv(xgb_param,
                  xgtrain,
                  num_boost_round=model.get_params()['n_estimators'],
                  nfold=5,
                  early_stopping_rounds=50)
# The number of rows in cvresult becomes the final n_estimators.
model.set_params(n_estimators=cvresult.shape[0])
model.fit(features_filtered, y)
# Bare expression on the training features: only displays something in a notebook cell.
model.predict(features_filtered)

# In[30]:

# Hard-coded location of the test files on the author's machine.
path = 'C:\\Users\\DALab\\Desktop\\data contest\\test data'
# Pre-allocated, NaN-filled frame of 75000 rows for the test data.
test = pd.DataFrame(np.nan,
                    index=range(75000),
                    columns=['id', 'time', '1st', '2nd', '3rd', '4th'])
index = 0
for file in listdir(path):
    temp = pd.read_excel(path + '\\' + file, header=None)
Пример #56
0
plt.show()

# The residual plot looks pretty good. To wrap it up let's predict on the test set
# and submit on the leaderboard:

# Adding an xgboost model:

# Let's add an xgboost model to our linear model to see if we can improve our score:

import xgboost as xgb

# Wrap the train/test features as DMatrix objects for the native API.
dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)

params = {"max_depth": 2, "eta": 0.1}
# Despite its name, `model` holds the xgb.cv evaluation history, not a trained booster;
# it is indexed like a DataFrame below.
model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)

# Skip the first 30 rounds when plotting.
model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot(
)  # will plot train and test errors
plt.show()

model_xgb = xgb.XGBRegressor(
    n_estimators=360, max_depth=2,
    learning_rate=0.1)  #the params were tuned using xgb.cv
model_xgb.fit(X_train, y)

# Output: XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
#        learning_rate=0.1, max_delta_step=0, max_depth=2,
#        min_child_weight=1, missing=None, n_estimators=360, nthread=-1,
#        objective='reg:linear', reg_alpha=0, reg_lambda=1,
#        scale_pos_weight=1, seed=0, silent=True, subsample=1)
Пример #57
0

def xgb_r2_score(preds, dtrain):
    """Custom xgboost evaluation function scoring predictions with R^2.

    Args:
        preds: predictions to score.
        dtrain: object exposing ``get_label()`` with the true targets.

    Returns:
        A ``(name, value)`` tuple. The name is the literal ``'rmse'`` even
        though the value is ``r2_score(labels, preds)``.
    """
    return 'rmse', r2_score(dtrain.get_label(), preds)

# form DMatrices for Xgboost training
dtrain = xgb.DMatrix(train.drop('y', axis=1), y_train)
dtest = xgb.DMatrix(test)

# xgboost, cross-validation with a custom objective and the R^2 evaluation
# function; maximize=True because a higher R^2 is better.
cv_result = xgb.cv(xgb_params,
                  dtrain,
                  obj=logcoshobj,
                  feval=xgb_r2_score,
                  num_boost_round=10000, # increase to have better results (~700)
                  early_stopping_rounds=100,
                  verbose_eval=50,
                  show_stdv=False,
                  maximize=True
                 )
# cv_result keeps one row per retained boosting round, so the 0-based index of
# the best round is the row count minus one.
best_iteration = cv_result.shape[0] - 1
print(best_iteration)
# NOTE(review): positional columns 2 and 3 are assumed to be the test mean/std;
# the column layout depends on the xgboost version and configured metrics.
cv_mean = cv_result.iloc[-1, 2]
cv_std = cv_result.iloc[-1, 3]
print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))
#num_boost_rounds = len(cv_result)
#print('num_boost_rounds=' + str(num_boost_rounds))

# train model
# best_iteration is a 0-based index, so the number of rounds to train is
# best_iteration + 1; passing best_iteration itself would drop the best round.
model = xgb.train(dict(xgb_params, silent=1), dtrain, obj=logcoshobj, num_boost_round=best_iteration + 1)
Пример #58
0
def train_test_xgb(dataset='data/dataset_for_analysis_test.csv',
                   finetuning=False,
                   force_data_prep=True):
    """Train and test an xgboost regression model on the ovitrap dataset.

    Steps:
    1) define target and predictors, prepare dataset
    2) OPTIONAL: hyper-parameter tuning (grid search scored by 5-fold CV RMSE)
    3) cross-validate with the selected parameters, then train on all rows
    4) print scores, save predictions and model, plot feature importance

    Args:
        dataset: path of the prepared CSV. It is (re)built from the raw
            ovitrap and weather CSVs when the file is missing or when
            ``force_data_prep`` is true.
        finetuning: when true, run the grid search; otherwise use the fixed
            parameter set defined below.
        force_data_prep: rebuild ``dataset`` even if the file already exists.

    Side effects:
        Writes 'output/dataset_predictions.csv' and 'models/best_model.json',
        prints scores and shows the feature-importance plot.
    """

    # 1) define target and predictors, prepare dataset

    # note: FLDAS updates after 2 months
    # alternatives
    # precipitation: NASA/GPM_L3/IMERG_V06

    # define labels of predictors and target
    X_labels_base = [
        'MODIS_006_MOD11A1_LST_Day_1km',
        'MODIS_006_MOD11A1_LST_Night_1km',
        # 'MODIS_006_MYD13A1_EVI',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Qair_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Rainf_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_SoilMoi00_10cm_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_SoilTemp00_10cm_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Wind_f_tavg',
        'NASA_FLDAS_NOAH01_C_GL_M_V001_Tair_f_tavg',
        'JAXA_GPM_L3_GSMaP_v6_operational_hourlyPrecipRateGC'
    ]
    y_label = 'mean_ovi'
    # define number of time steps to use (past observations)
    n_time_steps = 3
    # define labels of predictors at different time steps:
    # every base label gets a '_<ts>' suffix for ts = 0 .. n_time_steps - 1
    X_labels = []
    for ts in range(n_time_steps):
        X_labels += [x + '_' + str(ts) for x in X_labels_base]

    # prepare dataset
    if not os.path.exists(dataset) or force_data_prep:
        print(
            'dataset not found, preparing from raw data (this might take a wile)'
        )
        # define input data path
        ovitrap_data = 'data/ovitrap_data_month_adm2.csv'
        weather_data = 'data/merged_adm2.csv'
        prepare_dataset_for_training(ovitrap_data,
                                     weather_data,
                                     X_labels_base,
                                     filename=dataset,
                                     n_time_steps=n_time_steps)
    df = pd.read_csv(dataset)
    df = df.reset_index()
    df.date = pd.to_datetime(df.date)
    # save original dataset for later (inference): keeps rows without a target
    # as long as all predictors are present
    df_uncut = df.copy()
    df_uncut = df_uncut.dropna(subset=X_labels)
    # training data needs a target value
    df = df.dropna(subset=[y_label])

    # QA cuts
    # removing a few data points based on poor correlation with observed "good" predictors
    df = df[(df.date.dt.year >= 2013) & (df.date.dt.year <= 2017)]
    bad_provinces = [
        'Nueva Vizcaya', 'Surigao del Norte', 'Sarangani', 'Siquijor',
        'Dinagat Islands', 'Isabela', 'Capiz', 'South Cotabato', 'Maguindanao',
        'Biliran', 'Davao Occidental', 'Quirino', 'Guimaras', 'Aurora'
    ]
    df = df[~df.adm_level.isin(bad_provinces)]
    df = df[df.count_ovi > 5]

    # 2) OPTIONAL: hyper-parameter tuning

    if finetuning:
        # define training data
        dtrain = xgb.DMatrix(data=df[X_labels], label=df[y_label])
        # define grid of possible hyper-parameter values
        gridsearch_params = [{
            'max_depth': max_depth,
            'learning_rate': learning_rate,
            'n_estimators': n_estimators,
            'min_split_loss': min_split_loss,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            'reg_lambda': reg_lambda,
            'reg_alpha': reg_alpha,
            'booster': booster
        } for max_depth in [2, 5, 10, 20]
                             for learning_rate in [0.05, 0.1, 0.2]
                             for n_estimators in [1000, 2000, 5000]
                             for min_split_loss in [0., 0.1, 1.]
                             for subsample in [0.5, 0.8, 1.]
                             for colsample_bytree in [0.3, 0.6, 1.]
                             for reg_lambda in [1, 1.5, 2]
                             for reg_alpha in [0., 0.1, 1.]
                             for booster in ['gbtree']]
        # fixed hyper-parameters
        # NOTE(review): 'num_boost_round' (and the grid's 'n_estimators') are
        # also handed to xgboost inside params; the round count actually used
        # is the explicit num_boost_round argument below. Confirm the extra
        # keys are harmless for the installed xgboost version.
        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'num_boost_round': 1000
        }
        # loop over grid, find best hyper-parameters
        min_rmse = float("Inf")
        best_params = None
        for params_grid in tqdm(gridsearch_params):
            # Update parameters (params is mutated in place on every iteration)
            for name, value in params_grid.items():
                params[name] = value
            # Run CV
            cv_results = xgb.cv(dtrain=dtrain,
                                params=params,
                                num_boost_round=params['num_boost_round'],
                                early_stopping_rounds=50,
                                seed=42,
                                nfold=5,
                                as_pandas=True)
            # Update best RMSE
            mean_rmse = cv_results['test-rmse-mean'].min()
            boost_rounds = cv_results['test-rmse-mean'].values.argmin() + 1
            print("\trmse {} for {} / {} rounds".format(
                mean_rmse, boost_rounds, len(cv_results)))
            if mean_rmse < min_rmse:
                min_rmse = mean_rmse
                # Store a snapshot: params is overwritten by the next grid
                # point, so keeping a reference would always end up holding
                # the LAST combination instead of the best one.
                best_params = dict(params)
        # print results
        print("best params: {}".format(best_params))
        print("min rmse: {}".format(min_rmse))
    else:
        # used fixed hyper-parameters
        best_params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'booster': 'gbtree',
            'num_boost_round': 1000,
            'max_depth': 20,
            'learning_rate': 0.2,
            'n_estimators': 5000,
            'subsample': 1.0,
            'colsample_bytree': 1.0,
            'reg_lambda': 2,
            'reg_alpha': 1.0,
            'min_split_loss': 0.
        }

    # 3) train and test model, save CV-scores

    nfolds = 5
    X, y = df[X_labels], df[y_label]
    dmdata = xgb.DMatrix(data=X, label=y)
    cv_results = xgb.cv(dtrain=dmdata,
                        params=best_params,
                        nfold=nfolds,
                        metrics=["rmse", "mae"],
                        num_boost_round=best_params['num_boost_round'],
                        early_stopping_rounds=50,
                        as_pandas=True,
                        seed=42)
    # number of rounds kept by the cross-validation; reused for the final fit
    boost_rounds = len(cv_results)

    # 4) print scores and feature importance

    # print average performance & feature importance
    print("cv-performance: MAE {}, RMSE {}".format(
        cv_results["test-mae-mean"].tail(1).values[0],
        cv_results["test-rmse-mean"].tail(1).values[0]))

    # final model: trained on all QA-filtered rows
    xg_reg = xgb.train(dtrain=dmdata,
                       params=best_params,
                       num_boost_round=boost_rounds)

    # predict for every row that has complete predictors (target may be missing)
    dpredict = xgb.DMatrix(data=df_uncut[X_labels], label=df_uncut[y_label])
    predictions = xg_reg.predict(dpredict)
    df_pred = df_uncut.copy()
    df_pred[y_label] = predictions
    df_pred = inverse_transform(df_pred, dataset, [y_label] + X_labels)
    df_pred.to_csv('output/dataset_predictions.csv')
    xg_reg.save_model('models/best_model.json')

    # R2 on the rows that do have a target value
    df_uncut = df_uncut.dropna(subset=[y_label])
    dpredict = xgb.DMatrix(data=df_uncut[X_labels], label=df_uncut[y_label])
    predictions = xg_reg.predict(dpredict)
    print('R2 train', r2_score(df_uncut[y_label].values, predictions))

    xgb.plot_importance(xg_reg)
    plt.tight_layout()
    plt.show()
def main():
    """Train xgboost and a stacked sklearn pipeline, then write the blended submission.

    Workflow:
    1) load the train/test frames and split off the 'y' target and 'ID' columns
    2) for five random 75/25 splits, pick the number of boosting rounds with
       xgb.cv, train, and record train/validation R^2
    3) retrain xgboost on all training rows with the average round count
    4) fit the stacked pipeline, write its predictions to
       'model_stacking_result.csv', and write the weighted blend
       (0.784 xgboost + 0.216 stack) to ``Configure.submission_path``

    All messages are printed through single-argument ``print(...)`` calls so
    the function produces the same output on Python 2 and Python 3.
    """
    print('load datas...')
    train, test = data_util.load_dataset()
    y_train_all = train['y']
    del train['ID']
    del train['y']
    id_test = test['ID']
    del test['ID']

    print('train: {} , test: {}'.format(train.shape, test.shape))

    train_r2_scores = []
    val_r2_scores = []
    num_boost_roundses = []

    X_test = test
    df_columns = train.columns.values
    dtest = xgb.DMatrix(X_test, feature_names=df_columns)

    xgb_params = {
        'eta': 0.005,
        'max_depth': 4,
        'subsample': 0.93,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }

    # Repeat the split/CV/train cycle with five different seeds.
    for i in range(0, 5):
        random_state = 42 + i
        X_train, X_val, y_train, y_val = train_test_split(
            train, y_train_all, test_size=0.25, random_state=random_state)

        dtrain = xgb.DMatrix(X_train, y_train, feature_names=df_columns)
        dval = xgb.DMatrix(X_val, y_val, feature_names=df_columns)

        y_mean = np.mean(y_train)

        cv_result = xgb.cv(
            dict(xgb_params,
                 base_score=y_mean),  # base prediction = mean(target)
            dtrain,
            num_boost_round=2000,  # increase to have better results (~700)
            early_stopping_rounds=50,
        )

        # One row per retained round, so the row count is the round count.
        num_boost_rounds = len(cv_result)
        num_boost_roundses.append(num_boost_rounds)
        model = xgb.train(dict(xgb_params, base_score=y_mean),
                          dtrain,
                          num_boost_round=num_boost_rounds)
        train_r2_score = r2_score(dtrain.get_label(), model.predict(dtrain))
        val_r2_score = r2_score(dval.get_label(), model.predict(dval))
        print('perform {} cross-validate: train r2 score = {}, validate r2 score = {}'.format(
            i + 1, train_r2_score, val_r2_score))
        train_r2_scores.append(train_r2_score)
        val_r2_scores.append(val_r2_score)

    print('\naverage train r2 score = {}, average validate r2 score = {}'.format(
        sum(train_r2_scores) / len(train_r2_scores),
        sum(val_r2_scores) / len(val_r2_scores)))

    # Integer average of the per-split round counts.
    best_num_boost_rounds = sum(num_boost_roundses) // len(num_boost_roundses)
    print('best_num_boost_rounds = {}'.format(best_num_boost_rounds))
    # train model
    print('training on total training data...')
    dtrain_all = xgb.DMatrix(train, y_train_all, feature_names=df_columns)
    model = xgb.train(dict(xgb_params, base_score=np.mean(y_train_all)),
                      dtrain_all,
                      num_boost_round=best_num_boost_rounds)

    print('predict submit...')
    xgb_result = model.predict(dtest)

    # ===================================== model stacking =================================
    stacked_pipeline = make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        StackingEstimator(
            estimator=GradientBoostingRegressor(learning_rate=0.001,
                                                loss="huber",
                                                max_depth=3,
                                                max_features=0.55,
                                                min_samples_leaf=18,
                                                min_samples_split=14,
                                                subsample=0.7)), LassoLarsCV())

    stacked_pipeline.fit(train.values, y_train_all)
    stack_results = stacked_pipeline.predict(X_test.values)

    # Stack-only predictions are saved separately from the blend.
    df_sub = pd.DataFrame({'ID': id_test, 'y': stack_results})
    df_sub.to_csv('model_stacking_result.csv', index=False)

    # Weighted blend of the xgboost and stacked predictions.
    y_pred = xgb_result * 0.784 + stack_results * 0.216
    df_sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
    df_sub.to_csv(Configure.submission_path, index=False)
Пример #60
0
    'colsample_bytree': 0.8,
    'silent': 1,
    'subsample': 0.6,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'max_depth': 1,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'eval_metric': 'rmse',
}

# 4-fold CV with early stopping after 25 rounds without improvement.
res = xgb.cv(xgb_params,
             dtrain,
             num_boost_round=1000,
             nfold=4,
             seed=SEED,
             stratified=False,
             early_stopping_rounds=25,
             verbose_eval=10,
             show_stdv=True)

# res keeps one row per retained boosting round, so the row count itself is
# the number of rounds to train (subtracting one would drop the best round).
best_nrounds = res.shape[0]
# Read the test score by column name: the positional column order of the CV
# frame is not the same across xgboost releases. The names follow from
# 'eval_metric': 'rmse' in xgb_params.
cv_mean = res['test-rmse-mean'].iloc[-1]
cv_std = res['test-rmse-std'].iloc[-1]

print('Ensemble-CV: {0}±{1}'.format(cv_mean, cv_std))

# Final model trained on all of dtrain with the CV-selected round count.
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)

# Fill the second column of the submission template with the predictions.
submission = pd.read_csv(SUBMISSION_FILE)
submission.iloc[:, 1] = gbdt.predict(dtest)