def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
    cvresult = None  # returned as-is when useTrainCV is False
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)  # IPython/notebook helper; use print(cvresult) in a plain script
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistics")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
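A minimal usage sketch for the function above, with the imports it implicitly relies on; the DataFrame `df` and its `target` column are hypothetical stand-ins for the caller's data:

import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from IPython.display import display  # only needed for display(cvresult) above

# Hypothetical input: a DataFrame `df` with numeric feature columns and a binary `target` column.
X = df.drop(columns=['target'])
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

cvresult, model = xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True)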
Example #2
def xgb():
    print("Training an XGB Classifier")

    params = {
        "max_depth": 8,
        "n_estimators": 400,
        "learning_rate": 0.05,
        "n_jobs": -1,
        "subsample": 0.8,
        "nthread": 4,
    }

    trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)

    gbm = XGBClassifier(**params)
    print(gbm.get_xgb_params())

    gbm.fit(trX_, trY_, eval_set=[(tvX_, tvY_)], verbose=True)

    # Find training accuracy
    trP = classes[gbm.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    tsP = classes[gbm.predict(tsX)]
    write_csv("xgb_d5_n150.csv", tsP)
Example #3
def xgb_cv(X, y):
    # Instantiate XGBoost
    n_estimators = 100
    dtrain = xgb.DMatrix(X, y)

    # XGBoost was tuned on the raw data.
    bst = XGBClassifier(n_estimators=100, #70
                        max_depth=3, 
                        min_child_weight=5, 
                        gamma=0.5, 
                        learning_rate=0.05, 
                        subsample=0.7, 
                        colsample_bytree=0.7, 
                        reg_alpha=0.001,
                        seed=1)

    # Cross-validate XGBoost
    params = bst.get_xgb_params() # Extract parameters from XGB instance to be used for CV
    num_boost_round = bst.get_params()['n_estimators'] # XGB-CV has different names than sklearn

    cvresult = xgb.cv(params, dtrain, num_boost_round=num_boost_round, 
                      nfold=10, metrics=['logloss', 'auc'], seed=1)

    print("="*80)
    print("\nXGBoost results for 10-fold cross-validation:")
    print(cvresult)
    print("="*80)

    # XGBoost summary
    print("="*80)
    print("\nXGBoost summary for 100 rounds of 10-fold cross-validation:")
    print("\nBest mean log-loss: %.4f" % cvresult['test-logloss-mean'].min())
    print("\nBest mean AUC: %.4f" % cvresult['test-auc-mean'].max())
    print("="*80)
Example #4
def xgmethod(X,Y):
  
    # split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)  # assign the result; transform() does not scale in place
    X_test = scaler.transform(X_test)    # apply the same scaling to the test set
    # XGtrain matrix
    xgtrain = xgb.DMatrix(X_train, label=y_train)

    model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=100, objective='binary:logistic')
    xgb_param = model.get_xgb_params()
    
    print ('Start cross validation')
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10, metrics=['auc'],
     early_stopping_rounds=50, stratified=True, seed=1301)
    print('Best number of trees = {}'.format(cvresult.shape[0]))
    
    model.set_params(n_estimators=cvresult.shape[0])
    print('Fit on the training data')
    model.fit(X_train, y_train, eval_metric='auc')
   
    pred = model.predict(X_test, ntree_limit=cvresult.shape[0])
    
  
    # make predictions for test data
    predictions = [round(value) for value in pred]
   
    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy
Example #5
    def training(self):
        """
        Training is done at each max_depth loop.
        XGBoost's cv is used to find the optimum number of tree (estimators) at each depth, up to 1000 trees.
        Once traning result doesn't improve for 50 epochs, training will stop. The tree number used in the last epoch
        will be used to fit the train and test set again. Metrics will then be measured again this XGB model.
        """

        max_depth = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        best_depth = 0
        best_estimator = 0
        max_score = 0
        for md in max_depth:
            model = XGBClassifier(learning_rate=0.3, n_estimators=1000, max_depth=md, min_child_weight=1,
                                  gamma=1, subsample=1, colsample_bytree=0.1, reg_lambda=0, reg_alpha=1,
                                  random_state=42)
            xgb_param = model.get_xgb_params()
            xgtrain = xgboost.DMatrix(self.Xtrain.values, label=self.ytrain.values)

            cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=1000, early_stopping_rounds=50,
                                  nfold=8, metrics='auc', stratified=True, shuffle=True, seed=42,
                                  verbose_eval=False)
            print("There are {} trees in the XGB model. CV-mean: {:.4f}, CV-std: {:.4f}.".format(
                cvresult.shape[0], cvresult.iloc[cvresult.shape[0] - 1, 0],
                cvresult.iloc[cvresult.shape[0] - 1, 1]))
            n = cvresult.shape[0]
            model.set_params(n_estimators=n)
            model.fit(self.Xtrain,
                      self.ytrain,
                      eval_metric=self._metric,
                      eval_set=[(self.Xtrain, self.ytrain), (self.Xtest, self.ytest)],
                      verbose=False)
            y_pred = model.predict(self.Xtest)
            score = accuracy_score(self.ytest, y_pred)
            mse = mean_squared_error(self.ytest, y_pred)

            if score > max_score:
                max_score = score
                min_mse = mse
                best_depth = md
                best_estimator = n
                self.best_xgb = model
            print("Accuracy score: " + str(round(score, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
            print("Mean square error: " + str(round(mse, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
        print("Best score: " + str(round(max_score, 4)) + " Best MSE: " + str(round(min_mse, 4)) + " at depth: " + str(
            best_depth) + " and estimator of " + str(best_estimator))
Example #6
def opt_BDT(input, output, params, show, names):

    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: %.2f%%; AUC = %.4f%" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
Example #7
    def xgb_cv_param(X_train, y_train, early_stopping_rounds=50):
        cv_param = 'n_estimators'
        # cv_param = 'gamma'
        DTrain = xgb.DMatrix(X_train.values, label=y_train.values.ravel())

        # StratifiedKFold: stratified cross-validation splits keep each fold's class proportions the same as in the original dataset.
        SKFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)

        xgb_beta = XGBClassifier(
            learning_rate=0.1,
            n_estimators=70,
            max_depth=6,
            min_child_weight=2,
            # gamma=0,
            # subsample=0.6,
            # colsample_bytree=0.4,
            objective='multi:softmax',
            # reg_lambda=0.1
        )

        xgb_param = xgb_beta.get_xgb_params()
        xgb_param['num_class'] = 2
        # Cross-validation
        print('Running cross-validation......')
        time_cv_start = time.perf_counter()  # time.clock() was removed in Python 3.8
        cv_result = xgb.cv(xgb_param,
                           DTrain,
                           num_boost_round=xgb_param[cv_param],
                           folds=SKFold,
                           metrics='mlogloss',
                           early_stopping_rounds=early_stopping_rounds)
        print('Cross-validation finished!')
        print('Early-stopping round count for the tuned parameter:', cv_result.shape[0])
        time_cv_end = time.perf_counter()
        time_cv_cost = (time_cv_end - time_cv_start)
        print('Time elapsed:', time_cv_cost)
        # print('cv_result:\n', cv_result)
        cv_result.to_csv('data/result/ee_smote_cv_n_estimators_result.csv',
                         index_label='n_estimators')
        print('File written successfully!')
Example #8
xgb_clf = XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=9,
 min_child_weight=1,
 gamma=0.2,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27,
 reg_alpha=1e-05)

xgb_param = xgb_clf.get_xgb_params()
xgtrain = xgb.DMatrix(x_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_clf.get_params()['n_estimators'], nfold=5,
                          metrics='auc', early_stopping_rounds=50)
xgb_clf.set_params(n_estimators=cvresult.shape[0])
xgb_clf.fit(x_train, y_train)
y_pred_xgb=xgb_clf.predict(x_test)
y_pred_xgb_test_data=xgb_clf.predict(test)
score = accuracy_score(y_test, y_pred_xgb)
f1_score_xgboost=f1_score(y_test,y_pred_xgb)

print(cvresult.shape[0])


print("\nModel Report")
Example #9
clf_org_xgb = XGBClassifier(
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='auc',
    # base_score = proportion_2j,
    n_jobs=cpu_n_jobs,
    random_state=42,
    silent=True)

clf_org_lgb = LGBMClassifier(n_estimators=1000,
                             learning_rate=0.1,
                             objective='binary',
                             n_jobs=cpu_n_jobs,
                             random_state=42,
                             silent=True)

xgb_params = clf_org_xgb.get_xgb_params()

lgb_params = clf_org_lgb.get_params()
lgb_params.pop('n_estimators')
lgb_params.pop('silent')

xgb_cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='xgb')

lgb_cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='lgb')

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import StratifiedKFold
import scipy.stats as sp_stats
Example #10
def xgb_classifier(X_train,
                   X_test,
                   y_train,
                   y_test,
                   useTrainCV=True,
                   cv_folds=5,
                   early_stopping_rounds=50):
    """
    关于现在这个模型
    准确率 : 0.9995
    AUC : 0.887708
    F1 Score : 0.847584
    ----------------------------------->
    关于现在这个模型
    准确率 : 0.9996
    AUC 得分 (训练集): 0.977480
    F1 Score 得分 (训练集): 0.858209
    ---------------------------------->
    关于现在这个模型
    ['V14', 'V4', 'V17', 'V10', 'V12', 'V20', 'Amount', 'V21', 'V26', 'V28', 'V11', 'V19', 'V8', 'V7', 'V13']
    准确率 : 0.9996
    AUC 得分 (训练集): 0.978563
    F1 Score 得分 (训练集): 0.859259
    ---------------------------------->
    # {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3} 0.862920874517388
    # {'colsample_bytree': 1.0, 'gamma': 0.2} 0.871
    # {'gamma': 0.2, 'scale_pos_weight': 1} 0.8702009952422571
    # {'subsample': 0.6} 0.864310306628855
    """
    alg = XGBClassifier(learning_rate=0.1,
                        n_estimators=140,
                        max_depth=5,
                        min_child_weight=3,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=1.0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the model
    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    # param_test1 = {}
    # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
    #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
    #                                                 colsample_bytree=1.0,
    #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
    #                                                 seed=27),
    #                         param_grid=param_test1,
    #                         scoring='f1',
    #                         n_jobs=4, iid=False, cv=5)
    # gsearch1.fit(X_train, y_train)
    # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

    # Predict on the test set
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Print some results for the model
    print("\nAbout the current model")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    # clf.best_estimator_.booster().get_fscore()
    res_df = pd.DataFrame({
        'Features': feat,
        'Importance': feat_imp
    }).sort_values(by='Importance', ascending=False)
    res_df.plot('Features',
                'Importance',
                kind='bar',
                title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
Example #11
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]

    # evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))

features = sorted(list(enumerate(model.feature_importances_)),
                  key=lambda x: x[1],
                  reverse=True)
for f in features[0:25]:
    print("%d\t%f\t%s" % (f[0], f[1], cntizer.get_feature_names()[f[0]]))

# Save xgb_params
default_get_xgb_params = model.get_xgb_params()

# setup parameters for xgboost
param = {}

param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2

# Training type indicators separately
for l in range(len(type_indicators)):
    print("%s ..." % (type_indicators[l]))

    Y = list_personality[:, l]
Example #12
    y,
    test_size=0.20,
    random_state=42,
)

#%%

clf = XGBClassifier(n_estimators=n_estimators,
                    learning_rate=0.1,
                    objective='binary:logistic',
                    eval_metric='auc',
                    n_jobs=cpu_n_jobs,
                    random_state=42,
                    silent=True)

params = clf.get_xgb_params()

cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='xgb')

Dmatrix_train = xgboost.DMatrix(X_train, label=y_train)

#%%

# run k-fold CV with XGB
cvres = xgboost.cv(
    params,
    Dmatrix_train,
    num_boost_round=num_boost_round,
    nfold=n_fold,
    # metrics = metrics_xgb,
Example #13
early_stopping_rounds = 100

# for i in range(1):
for i in range(train_preds_all.shape[0]):

    params = { 'tree_method':'gpu_hist', 'predictor':'gpu_predictor' }
    alg = XGBClassifier(learning_rate=0.01, n_estimators=500, max_depth=8,
                    min_child_weight=1.0, gamma=0.2, subsample=0.6, colsample_bytree=0.2,
                    objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27, **params)

    X_train = train_preds_all[i].transpose([1,0])
    y_train = train_y

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc', verbose=True)
    y_prob = alg.predict_proba(X_train)
    threshold = threshold_search(y_train, y_prob[:,1])

    # print("Start Predicting")
    X_test = np.array(test_local_pred_models).transpose([1,2,0])[i]
    y_test = np.array(test_local_target_models).transpose([1,2,0])[i,:,0]
    pred_proba = alg.predict_proba(X_test)[:, 1]
Example #14
class XGBoostClassifier(ClassifierBase):
    def __init__(self, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        super(XGBoostClassifier, self).__init__()
        self.useTrainCV = useTrainCV
        self.cv_folds = cv_folds
        self.early_stopping_rounds = early_stopping_rounds
        self.clf = XGBClassifier(learning_rate=0.1,
                                 n_estimators=140,
                                 max_depth=5,
                                 min_child_weight=3,
                                 gamma=0.2,
                                 subsample=0.6,
                                 colsample_bytree=1.0,
                                 objective='binary:logistic',
                                 n_jobs=6,
                                 scale_pos_weight=1,
                                 seed=27)

    def train(self, X_train, y_train):
        if self.useTrainCV:
            print("Start Feeding Data for Cross Validation")
            xgb_param = self.clf.get_xgb_params()
            xgtrain = xgb.DMatrix(X_train, label=y_train)
            cvresult = xgb.cv(
                xgb_param,
                xgtrain,
                num_boost_round=self.clf.get_params()['n_estimators'],
                nfold=self.cv_folds,
                early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])  # keep the round count chosen by CV
            # param_test1 = {}
            # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
            #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
            #                                                 colsample_bytree=1.0,
            #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
            #                                                 seed=27),
            #                         param_grid=param_test1,
            #                         scoring='f1',
            #                         n_jobs=4, iid=False, cv=5)
            # gsearch1.fit(X_train, y_train)
            # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

        self.clf.fit(X_train, y_train, eval_metric='auc')

    def predict(self, X_test, y_test=None):

        y_pred_proba = self.clf.predict_proba(X_test)[:, 1]
        if y_test is not None:
            print("Score: ", self.clf.score(X_test, y_test))
            y_pred = self.clf.predict(X_test)
            print("Acc : %.4g" % metrics.accuracy_score(y_test, y_pred))
            print("F1 score is: {}".format(f1_score(y_test, y_pred)))
            print("AUC Score is: {}".format(roc_auc_score(
                y_test, y_pred_proba)))
        return y_pred_proba

    def printFeatureImportance(self, X_train):
        feat_imp = self.clf.feature_importances_
        feat = X_train.columns.tolist()
        #res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
        #res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
        #plt.ylabel('Feature Importance Score')
        #plt.show()
        #print(res_df)
        #print(res_df["Features"].tolist())
        print('Importance feats:', feat)

    def save(self, path):
        dump(self.clf, os.path.join(path, 'clf.joblib'))

    def load(self, path):
        self.clf = load(os.path.join(path, 'clf.joblib'))
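A hedged usage sketch for the wrapper class above; it assumes pandas DataFrames for the feature matrices (printFeatureImportance reads .columns) and binary labels, and the hypothetical names features_df/labels and the split parameters are illustrative only:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_df, labels, test_size=0.2, random_state=27)

clf = XGBoostClassifier(useTrainCV=True, cv_folds=5, early_stopping_rounds=50)
clf.train(X_train, y_train)
proba = clf.predict(X_test, y_test)      # prints accuracy/F1/AUC when y_test is passed, returns probabilities
clf.printFeatureImportance(X_train)
clf.save('.')                            # writes clf.joblib into the given directory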
Example #15
best_model.fit(train_feature_2019, train_label_2019)
xg_2020_pred = best_model.predict_proba(test_feature_2020)[:, 1]
xg_2020_evaluation = valid.evaluate(
    test_label_2020, xg_2020_pred, save_path="../data/xg(2020)_evaluation.json"
)
plot_evaluation(test_label_2020, xg_2020_pred, "../figure", method="XG_2020")

#%%
# additive learning for xgboost
import xgboost as xgb

glimse_index = list(
    np.random.choice(list(test_feature_2020.index), 1000, replace=False)
)
test_index = list(set(test_feature_2020.index) - set(glimse_index))
params = best_model.get_xgb_params()
xg_2020_train = xgb.DMatrix(
    test_feature_2020.loc[glimse_index, :], label=test_label_2020[glimse_index]
)
xg_2020_test = xgb.DMatrix(
    test_feature_2020.loc[test_index, :], label=test_label_2020[test_index]
)
best_model.save_model("../data/xg_2019.model")
additive_xg = xgb.train(params, xg_2020_train, 5, xgb_model="../data/xg_2019.model")
additive_xg_pred = additive_xg.predict(xg_2020_test)
additive_xg_evaluation = valid.evaluate(
    test_label_2020[test_index],
    additive_xg_pred,
    save_path="../data/additive_xg(2020)_evaluation.json",
)
plot_evaluation(
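The additive-learning block above continues boosting through the native xgb.train interface; a hedged sketch of the same continuation via the sklearn wrapper's xgb_model argument (exact behaviour depends on the installed xgboost version):

# Continue boosting from the saved 2019 model with the sklearn wrapper instead of xgb.train.
continued = xgb.XGBClassifier(**best_model.get_params())
continued.set_params(n_estimators=5)                      # 5 extra rounds, matching the xgb.train call above
continued.fit(test_feature_2020.loc[glimse_index, :],
              test_label_2020[glimse_index],
              xgb_model="../data/xg_2019.model")          # resume from the saved booster
additive_pred = continued.predict_proba(test_feature_2020.loc[test_index, :])[:, 1]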
Example #16
clf = XGBClassifier(learning_rate = 0.01,
					n_estimators = 5000, 
					reg_alpha = 0.025, 
					colsample_bytree = 0.8, 
					silent	= 1, 
					scale_pos_weight = 0, 					
					nthread = 4, 
					min_child_weight = 1, 
					subsample= 0.8, 
					seed = 1337, 
					objective= 'multi:softprob', 
					max_depth = 7, 
					gamma= .2)

# use the xgb interface
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
cvresult = xgb.cv(xgb_param, 
				  Xg_train, 
 				  num_boost_round = clf.get_params()['n_estimators'],
 				  nfold = 5,
 				  verbose_eval = True,  # show_progress in very old xgboost releases
				  early_stopping_rounds = 100)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()

try:
Example #17
def xgboost_train():
    train_file_list = fetch_file_list(data_dir=TRAINING_DATA_DIR, portion=1)
    tg = build_numpy(file_list=train_file_list,
                     num_samples=None,
                     xcolumns=X_COLUMNS,
                     ycolumns=Y_COLUMNS,
                     ytx=None,
                     skip_header=1,
                     shuffle=False,
                     is_csv=IS_CSV)
    val_file_list = fetch_file_list(data_dir=VALIDATION_DATA_DIR, portion=1)
    vg = build_numpy(file_list=val_file_list,
                     num_samples=None,
                     xcolumns=X_COLUMNS,
                     ycolumns=Y_COLUMNS,
                     ytx=None,
                     skip_header=1,
                     shuffle=False,
                     is_csv=IS_CSV)

    x_train = copy.deepcopy(tg[0])
    y_train = copy.deepcopy(tg[1].reshape(-1))
    x_val = copy.deepcopy(vg[0])
    y_val = copy.deepcopy(vg[1].reshape(-1))
    del tg
    del vg

    count = np.sum(y_train)
    print("Number of Positive Training Windows: {}".format(count))
    print(
        "Number of Negative Training Windows: {}".format(len(y_train) - count))

    eval_set = [(x_train, y_train), (x_val, y_val)]
    my_model = XGBClassifier(base_score=0.5,
                             booster='gbtree',
                             colsample_bylevel=1,
                             colsample_bynode=1,
                             colsample_bytree=0.8,
                             eta=0.03,
                             gamma=0.1,
                             learning_rate=0.1,
                             max_delta_step=0,
                             max_depth=6,
                             min_child_weight=3,
                             missing=None,
                             n_estimators=600,
                             n_jobs=1,
                             nthread=None,
                             objective='binary:logistic',
                             random_state=0,
                             reg_alpha=0,
                             reg_lambda=1,
                             scale_pos_weight=XGBOOST_POSITIVE_WEIGHT,
                             seed=1234,
                             subsample=0.8,
                             verbosity=2,
                             tree_method='hist')
    my_model.get_xgb_params()
    # logloss here equivalent to CategoricalCrossEntropy in tensorflow
    trained = my_model.fit(x_train,
                           y_train,
                           early_stopping_rounds=15,
                           eval_metric=["logloss", "error"],
                           eval_set=eval_set,
                           verbose=True)

    key = "xgboost-withClassWeight"
    file_path = MODEL_CHECKPOINT + key + datetime.datetime.now().strftime(
        "%Y%m%d-%H%M%S")
    trained.save_model(file_path)
    return trained
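A hedged sketch of restoring the classifier saved above; file_path stands for the checkpoint written inside xgboost_train(), x_val for validation features built the same way that function builds them, and the constructor arguments need not match since load_model restores the trained booster:

from xgboost import XGBClassifier

restored = XGBClassifier()
restored.load_model(file_path)                    # file_path: the checkpoint produced by xgboost_train()
val_scores = restored.predict_proba(x_val)[:, 1]  # illustrative reuse of the validation windows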
Example #18
fig.savefig("QPPS6-logreg-Importance-Variables-2goups.png", bbox_inches="tight", dpi=600)
##############################################################

#########################################################################
# XGBOOST  
##########################################################################
# xgboost with standard default parameters

myXGBoost = XGBClassifier().fit(X_train, y_train)
print("Training set score: {:.3f}".format(myXGBoost.score(X_train, y_train)))
print("Test set score: {:.3f}".format(myXGBoost.score(X_test, y_test)))



# for reference: default parameters
myXGBoost.get_xgb_params()

##########################################################################
# THANK YOU for your attention!
##########################################################################
# staying in the IDE
#if __name__ == '__main__':
#  main()







Example #19
xgtrain = xgboost.DMatrix(X_1, label=y.values)

xgb = XGBClassifier(
                     learning_rate =0.01,
                     n_estimators=1000,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     n_jobs=-1,
                     random_state=42
                    )

xgb_param = xgb.get_xgb_params()
xgb_param

cvresult = xgboost.cv(xgb_param, xgtrain, 
                  num_boost_round=xgb.get_params()['n_estimators'], 
                  nfold=5,
                  metrics='auc', 
                  early_stopping_rounds=50,
                  seed=42
                  )

cvresult.head()

cvresult.shape

xgb_best_param = {'n_estimators': cvresult.shape[0]}
Example #20
xgb1 = XGBClassifier(
                    silent=1,  # silent=0 prints running messages (default); silent=1 suppresses them

                    subsample=0.8,  # fraction of the training data used per tree; guards against overfitting. Default 1, typical 0.5-1.
                    colsample_bytree=0.8,  # fraction of features used per tree; guards against overfitting. Default 1, typical 0.5-1.
                    colsample_bylevel=0.7,

                    learning_rate=0.01,  # step size of each boosting update; smaller values train more slowly. Default 0.3, typical 0.01-0.2.
                    n_estimators=1000000,  # total boosting rounds (trees); a large value is fine because cv returns a suitable n_estimators
                    max_depth=5,  # tree depth; default 6, typical 3-10
                    min_child_weight=2,  # larger values tend toward underfitting, smaller toward overfitting (larger values keep the model from fitting overly local samples). Default 1
                    gamma=0,  # penalty term: the minimum loss reduction required to make a split
                    objective='multi:softprob',
                    )

if useTrainCV:
    xgb_param = xgb1.get_xgb_params()

    xgtrain = xgb.DMatrix(X_train, label=y_train)

    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], folds=cv_folds,
                      metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)

    n_estimators = cvresult.shape[0]
    xgb1.set_params(n_estimators=n_estimators)
    # print(cvresult)
# Fit the algorithm on the data
xgb1.fit(X_train, y_train, eval_metric='mlogloss')
# Predict training set:
train_predprob = xgb1.predict_proba(X_train)
logloss = metrics.log_loss(y_train, train_predprob)