def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistics")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
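# A minimal usage sketch (assumed, not from the source) for xgb_classifier above,
# on synthetic data. The function expects pandas inputs because it reads
# X_train.columns, and display() assumes a notebook (IPython) environment.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

Xmat, yvec = make_classification(n_samples=500, n_features=8, random_state=0)
X = pd.DataFrame(Xmat, columns=['f%d' % i for i in range(8)])
y = pd.Series(yvec)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
cvres, model = xgb_classifier(X_tr, X_te, y_tr, y_te)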
def xgb():
    print("Training an XGB Classifier")
    params = {
        "max_depth": 8,
        "n_estimators": 400,
        "learning_rate": 0.05,
        "n_jobs": -1,
        "subsample": 0.8,
        "nthread": 4,
    }
    trX_, tvX_, trY_, tvY_ = train_test_split(trX, trYi, test_size=0.3)

    gbm = XGBClassifier(**params)
    print(gbm.get_xgb_params())
    gbm.fit(trX_, trY_, eval_set=[(tvX_, tvY_)], verbose=True)

    # Find training accuracy
    trP = classes[gbm.predict(trX)]
    print("Training Accuracy: ", 100 * accuracy(trY, trP))

    # Dump test labels
    tsP = classes[gbm.predict(tsX)]
    write_csv("xgb_d5_n150.csv", tsP)
def xgb_cv(X, y):
    # Instantiate XGBoost
    n_estimators = 100
    dtrain = xgb.DMatrix(X, y)
    # XGBoost was tuned on the raw data.
    bst = XGBClassifier(n_estimators=100,  # 70
                        max_depth=3,
                        min_child_weight=5,
                        gamma=0.5,
                        learning_rate=0.05,
                        subsample=0.7,
                        colsample_bytree=0.7,
                        reg_alpha=0.001,
                        seed=1)

    # Cross-validate XGBoost
    params = bst.get_xgb_params()  # Extract parameters from the XGB instance to be used for CV
    num_boost_round = bst.get_params()['n_estimators']  # xgb.cv uses different names than sklearn
    cvresult = xgb.cv(params, dtrain, num_boost_round=num_boost_round, nfold=10,
                      metrics=['logloss', 'auc'], seed=1)

    print("=" * 80)
    print("\nXGBoost results for 10-fold cross-validation:")
    print(cvresult)
    print("=" * 80)

    # XGBoost summary
    print("=" * 80)
    print("\nXGBoost summary for 100 rounds of 10-fold cross-validation:")
    print("\nBest mean log-loss: %.4f" % cvresult['test-logloss-mean'].min())
    print("\nBest mean AUC: %.4f" % cvresult['test-auc-mean'].max())
    print("=" * 80)
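# A small follow-up sketch (assumed, not from the source): the same cvresult
# DataFrame also yields the boosting round that achieved each best score, which
# is useful when choosing n_estimators afterwards. This would sit at the end of
# xgb_cv above, where cvresult is in scope.
best_auc_round = cvresult['test-auc-mean'].idxmax() + 1          # rows are 0-indexed rounds
best_logloss_round = cvresult['test-logloss-mean'].idxmin() + 1
print("AUC peaks at round %d; log-loss bottoms out at round %d"
      % (best_auc_round, best_logloss_round))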
def xgmethod(X, Y):
    # Split data into train and test sets
    seed = 7
    test_size = 0.3
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

    # Scale features. transform() returns a new array, so the result must be
    # assigned, and the scaler fitted on the train set must also be applied to
    # the test set so that predictions see consistently scaled inputs.
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # XGtrain matrix
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    model = XGBClassifier(max_depth=6, learning_rate=0.3, n_estimators=100, objective='binary:logistic')
    xgb_param = model.get_xgb_params()

    print('Start cross validation')
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=500, nfold=10, metrics=['auc'],
                      early_stopping_rounds=50, stratified=True, seed=1301)
    print('Best number of trees = {}'.format(cvresult.shape[0]))
    model.set_params(n_estimators=cvresult.shape[0])

    print('Fit on the training data')
    model.fit(X_train, y_train, eval_metric='auc')
    pred = model.predict(X_test, ntree_limit=cvresult.shape[0])

    # Make predictions for test data
    predictions = [round(value) for value in pred]

    # Evaluate predictions
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    return accuracy
def training(self):
    """
    Training is done at each max_depth loop. XGBoost's cv is used to find the
    optimum number of trees (estimators) at each depth, up to 1000 trees. Once
    the training result doesn't improve for 50 epochs, training stops. The tree
    count from the last epoch is then used to refit on the train and test set,
    and metrics are measured against this XGB model.
    """
    max_depth = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    best_depth = 0
    best_estimator = 0
    max_score = 0
    min_mse = 0  # initialized so the summary print below is always defined
    for md in max_depth:
        model = XGBClassifier(learning_rate=0.3, n_estimators=1000, max_depth=md,
                              min_child_weight=1, gamma=1, subsample=1, colsample_bytree=0.1,
                              reg_lambda=0, reg_alpha=1, random_state=42)
        xgb_param = model.get_xgb_params()
        xgtrain = xgboost.DMatrix(self.Xtrain.values, label=self.ytrain.values)
        cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=1000, early_stopping_rounds=50,
                              nfold=8, metrics='auc', stratified=True, shuffle=True, seed=42,
                              verbose_eval=False)
        print("There are {} trees in the XGB model. CV-mean: {:.4f}, CV-std: {:.4f}.".format(
            cvresult.shape[0],
            cvresult.iloc[cvresult.shape[0] - 1, 0],
            cvresult.iloc[cvresult.shape[0] - 1, 1]))
        n = cvresult.shape[0]
        model.set_params(n_estimators=n)
        model.fit(self.Xtrain, self.ytrain, eval_metric=self._metric,
                  eval_set=[(self.Xtrain, self.ytrain), (self.Xtest, self.ytest)], verbose=False)
        y_pred = model.predict(self.Xtest)
        score = accuracy_score(self.ytest, y_pred)
        mse = mean_squared_error(self.ytest, y_pred)
        if score > max_score:
            max_score = score
            min_mse = mse
            best_depth = md
            best_estimator = n
            self.best_xgb = model
        print("Accuracy score: " + str(round(score, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
        print("Mean square error: " + str(round(mse, 4)) + " at depth: " + str(md) + " and estimator " + str(n))
    print("Best score: " + str(round(max_score, 4)) + " Best MSE: " + str(round(min_mse, 4)) +
          " at depth: " + str(best_depth) + " and estimator of " + str(best_estimator))
def opt_BDT(input, output, params, show, names):
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input, output, test_size=0.2, random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))
    if show:
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
def xgb_cv_param(X_train, y_train, early_stopping_rounds=50):
    cv_param = 'n_estimators'
    # cv_param = 'gamma'
    DTrain = xgb.DMatrix(X_train.values, label=y_train.values.ravel())
    # StratifiedKFold: stratified splits keep each fold's class proportions the
    # same as in the original dataset.
    SKFold = StratifiedKFold(n_splits=5, shuffle=True, random_state=666)
    xgb_beta = XGBClassifier(
        learning_rate=0.1,
        n_estimators=70,
        max_depth=6,
        min_child_weight=2,
        # gamma=0,
        # subsample=0.6,
        # colsample_bytree=0.4,
        objective='multi:softmax',
        # reg_lambda=0.1
    )
    xgb_param = xgb_beta.get_xgb_params()
    xgb_param['num_class'] = 2

    # Cross-validation
    print('Running cross-validation...')
    time_cv_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    cv_result = xgb.cv(xgb_param, DTrain, num_boost_round=xgb_param[cv_param], folds=SKFold,
                       metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)
    print('Cross-validation finished!')
    print('Early-stopped round count:', cv_result.shape[0])
    time_cv_end = time.perf_counter()
    time_cv_cost = time_cv_end - time_cv_start
    print('Elapsed time:', time_cv_cost)
    # print('cv_result:\n', cv_result)
    cv_result.to_csv('data/result/ee_smote_cv_n_estimators_result.csv', index_label='n_estimators')
    print('File written successfully!')
xgb_clf = XGBClassifier(learning_rate=0.1,
                        n_estimators=1000,
                        max_depth=9,
                        min_child_weight=1,
                        gamma=0.2,
                        subsample=0.8,
                        colsample_bytree=0.8,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27,
                        reg_alpha=1e-05)
xgb_param = xgb_clf.get_xgb_params()
xgtrain = xgb.DMatrix(x_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_clf.get_params()['n_estimators'],
                  nfold=5, metrics='auc', early_stopping_rounds=50)
xgb_clf.set_params(n_estimators=cvresult.shape[0])
xgb_clf.fit(x_train, y_train)
y_pred_xgb = xgb_clf.predict(x_test)
y_pred_xgb_test_data = xgb_clf.predict(test)
score = accuracy_score(y_test, y_pred_xgb)
f1_score_xgboost = f1_score(y_test, y_pred_xgb)
print(cvresult.shape[0])
print("\nModel Report")
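# The snippet above follows the idiom that recurs throughout this file: run
# xgb.cv once so early stopping picks the tree count, write it back with
# set_params, then refit the sklearn estimator. A minimal self-contained sketch
# of that idiom on synthetic data (all names here are illustrative):
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features=10, random_state=0)
clf = XGBClassifier(n_estimators=500, learning_rate=0.1, objective='binary:logistic')
cv = xgb.cv(clf.get_xgb_params(), xgb.DMatrix(X, label=y),
            num_boost_round=clf.get_params()['n_estimators'],
            nfold=5, metrics='auc', early_stopping_rounds=50)
clf.set_params(n_estimators=cv.shape[0])  # remaining rows == early-stopped round count
clf.fit(X, y)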
# The opening of this XGBClassifier(...) call is elided in the source; the
# variable name clf_org_xgb is inferred from its use further down.
clf_org_xgb = XGBClassifier(  # preceding keyword arguments elided in the source
    learning_rate=0.1,
    objective='binary:logistic',
    eval_metric='auc',
    # base_score=proportion_2j,
    n_jobs=cpu_n_jobs,
    random_state=42,
    silent=True)
clf_org_lgb = LGBMClassifier(n_estimators=1000,
                             learning_rate=0.1,
                             objective='binary',
                             n_jobs=cpu_n_jobs,
                             random_state=42,
                             silent=True)

xgb_params = clf_org_xgb.get_xgb_params()
lgb_params = clf_org_lgb.get_params()
lgb_params.pop('n_estimators')
lgb_params.pop('silent')

xgb_cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='xgb')
lgb_cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='lgb')

from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_auc_score
# from sklearn.model_selection import StratifiedKFold
import scipy.stats as sp_stats
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """
    Current model:
        Accuracy : 0.9995
        AUC      : 0.887708
        F1 score : 0.847584
    ----------------------------------->
    Current model:
        Accuracy : 0.9996
        AUC score (train set) : 0.977480
        F1 score (train set)  : 0.858209
    ---------------------------------->
    Current model:
        ['V14', 'V4', 'V17', 'V10', 'V12', 'V20', 'Amount', 'V21', 'V26', 'V28', 'V11', 'V19', 'V8', 'V7', 'V13']
        Accuracy : 0.9996
        AUC score (train set) : 0.978563
        F1 score (train set)  : 0.859259
    ---------------------------------->
    # {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3} 0.862920874517388
    # {'colsample_bytree': 1.0, 'gamma': 0.2} 0.871
    # {'gamma': 0.2, 'scale_pos_weight': 1} 0.8702009952422571
    # {'subsample': 0.6} 0.864310306628855
    """
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the model
    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    # param_test1 = {}
    # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
    #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
    #                                                 colsample_bytree=1.0,
    #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
    #                                                 seed=27),
    #                         param_grid=param_test1,
    #                         scoring='f1',
    #                         n_jobs=4, iid=False, cv=5)
    # gsearch1.fit(X_train, y_train)
    # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

    # Predict on the test split
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Report model results (computed on the test split)
    print("\nCurrent model:")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    # clf.best_estimator_.booster().get_fscore()
    res_df = pd.DataFrame({
        'Features': feat,
        'Importance': feat_imp
    }).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
# Make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]

# Evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("* %s Accuracy: %.2f%%" % (type_indicators[l], accuracy * 100.0))

features = sorted(list(enumerate(model.feature_importances_)), key=lambda x: x[1], reverse=True)
for f in features[0:25]:
    print("%d\t%f\t%s" % (f[0], f[1], cntizer.get_feature_names()[f[0]]))

# Save xgb_params
default_get_xgb_params = model.get_xgb_params()

# Set up parameters for xgboost
param = {}
param['n_estimators'] = 200
param['max_depth'] = 2
param['nthread'] = 8
param['learning_rate'] = 0.2

# Train each type indicator separately
for l in range(len(type_indicators)):
    print("%s ..." % (type_indicators[l]))
    Y = list_personality[:, l]
# The opening of this train_test_split(...) call is elided in the source; the
# left-hand side and the feature matrix X are inferred from the names used below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
#%%
clf = XGBClassifier(n_estimators=n_estimators,
                    learning_rate=0.1,
                    objective='binary:logistic',
                    eval_metric='auc',
                    n_jobs=cpu_n_jobs,
                    random_state=42,
                    silent=True)
params = clf.get_xgb_params()
cv_early_stopping = CV_EarlyStoppingTrigger(
    stopping_rounds=early_stopping_rounds, maximize_score=True, method='xgb')
Dmatrix_train = xgboost.DMatrix(X_train, label=y_train)
#%%
# Run k-fold CV with XGB
cvres = xgboost.cv(
    params,
    Dmatrix_train,
    num_boost_round=num_boost_round,
    nfold=n_fold,
    # metrics=metrics_xgb,
early_stopping_rounds = 100
# for i in range(1):
for i in range(train_preds_all.shape[0]):
    params = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor'
    }
    alg = XGBClassifier(learning_rate=0.01, n_estimators=500, max_depth=8,
                        min_child_weight=1.0, gamma=0.2, subsample=0.6, colsample_bytree=0.2,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
                        **params)
    X_train = train_preds_all[i].transpose([1, 0])
    y_train = train_y

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train, label=y_train)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc', verbose=True)
    y_prob = alg.predict_proba(X_train)
    threshold = threshold_search(y_train, y_prob[:, 1])

    # print("Start Predicting")
    X_test = np.array(test_local_pred_models).transpose([1, 2, 0])[i]
    y_test = np.array(test_local_target_models).transpose([1, 2, 0])[i, :, 0]
    pred_proba = alg.predict_proba(X_test)[:, 1]
class XGBoostClassifier(ClassifierBase):

    def __init__(self, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        super(XGBoostClassifier, self).__init__()
        self.useTrainCV = useTrainCV
        self.cv_folds = cv_folds
        self.early_stopping_rounds = early_stopping_rounds
        self.clf = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                                 min_child_weight=3, gamma=0.2, subsample=0.6,
                                 colsample_bytree=1.0, objective='binary:logistic',
                                 n_jobs=6, scale_pos_weight=1, seed=27)

    def train(self, X_train, y_train):
        if self.useTrainCV:
            print("Start Feeding Data for Cross Validation")
            xgb_param = self.clf.get_xgb_params()
            xgtrain = xgb.DMatrix(X_train, label=y_train)
            cvresult = xgb.cv(
                xgb_param, xgtrain,
                num_boost_round=self.clf.get_params()['n_estimators'],
                nfold=self.cv_folds,
                early_stopping_rounds=self.early_stopping_rounds)
            # cvresult is a DataFrame with one row per boosting round, so its
            # row count is the early-stopped tree count; unpacking the whole
            # DataFrame into set_params() would fail.
            self.clf.set_params(n_estimators=cvresult.shape[0])

        # param_test1 = {}
        # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
        #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
        #                                                 colsample_bytree=1.0,
        #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
        #                                                 seed=27),
        #                         param_grid=param_test1,
        #                         scoring='f1',
        #                         n_jobs=4, iid=False, cv=5)
        # gsearch1.fit(X_train, y_train)
        # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

        self.clf.fit(X_train, y_train, eval_metric='auc')

    def predict(self, X_test, y_test=None):
        y_pred_proba = self.clf.predict_proba(X_test)[:, 1]
        if y_test is not None:
            print("Score: ", self.clf.score(X_test, y_test))
            y_pred = self.clf.predict(X_test)
            print("Acc : %.4g" % metrics.accuracy_score(y_test, y_pred))
            print("F1 score is: {}".format(f1_score(y_test, y_pred)))
            print("AUC Score is: {}".format(roc_auc_score(y_test, y_pred_proba)))
        return y_pred_proba

    def printFeatureImportance(self, X_train):
        feat_imp = self.clf.feature_importances_
        feat = X_train.columns.tolist()
        # res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
        # res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
        # plt.ylabel('Feature Importance Score')
        # plt.show()
        # print(res_df)
        # print(res_df["Features"].tolist())
        print('Importance feats:', feat)

    def save(self, path):
        dump(self.clf, os.path.join(path, 'clf.joblib'))

    def load(self, path):
        self.clf = load(os.path.join(path, 'clf.joblib'))
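# A hedged usage sketch (assumed, not from the source) for the wrapper class
# above. It presumes ClassifierBase is importable, X_train/X_test are pandas
# DataFrames (printFeatureImportance reads .columns), and 'model_dir' is an
# illustrative existing directory.
clf = XGBoostClassifier(useTrainCV=True, cv_folds=5, early_stopping_rounds=50)
clf.train(X_train, y_train)
proba = clf.predict(X_test, y_test)   # prints accuracy / F1 / AUC when labels are given
clf.printFeatureImportance(X_train)
clf.save('model_dir')                 # persists clf.joblib via joblib.dump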
best_model.fit(train_feature_2019, train_label_2019)
xg_2020_pred = best_model.predict_proba(test_feature_2020)[:, 1]
xg_2020_evaluation = valid.evaluate(
    test_label_2020, xg_2020_pred, save_path="../data/xg(2020)_evaluation.json"
)
plot_evaluation(test_label_2020, xg_2020_pred, "../figure", method="XG_2020")
#%%
# Additive (continued) learning for xgboost
import xgboost as xgb

glimse_index = list(
    np.random.choice(list(test_feature_2020.index), 1000, replace=False)
)
test_index = list(set(test_feature_2020.index) - set(glimse_index))
params = best_model.get_xgb_params()
xg_2020_train = xgb.DMatrix(
    test_feature_2020.loc[glimse_index, :], label=test_label_2020[glimse_index]
)
xg_2020_test = xgb.DMatrix(
    test_feature_2020.loc[test_index, :], label=test_label_2020[test_index]
)
best_model.save_model("../data/xg_2019.model")
additive_xg = xgb.train(params, xg_2020_train, 5, xgb_model="../data/xg_2019.model")
additive_xg_pred = additive_xg.predict(xg_2020_test)
additive_xg_evaluation = valid.evaluate(
    test_label_2020[test_index],
    additive_xg_pred,
    save_path="../data/additive_xg(2020)_evaluation.json",
)
plot_evaluation(
clf = XGBClassifier(learning_rate=0.01,
                    n_estimators=5000,
                    reg_alpha=0.025,
                    colsample_bytree=0.8,
                    silent=1,
                    scale_pos_weight=0,
                    nthread=4,
                    min_child_weight=1,
                    subsample=0.8,
                    seed=1337,
                    objective='multi:softprob',
                    max_depth=7,
                    gamma=.2)

# Use the xgb interface
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
cvresult = xgb.cv(xgb_param, Xg_train,
                  num_boost_round=clf.get_params()['n_estimators'],
                  nfold=5,
                  verbose_eval=True,  # show_progress was renamed verbose_eval in later xgboost releases
                  early_stopping_rounds=100)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()
try:
def xgboost_train():
    train_file_list = fetch_file_list(data_dir=TRAINING_DATA_DIR, portion=1)
    tg = build_numpy(file_list=train_file_list, num_samples=None, xcolumns=X_COLUMNS,
                     ycolumns=Y_COLUMNS, ytx=None, skip_header=1, shuffle=False, is_csv=IS_CSV)
    val_file_list = fetch_file_list(data_dir=VALIDATION_DATA_DIR, portion=1)
    vg = build_numpy(file_list=val_file_list, num_samples=None, xcolumns=X_COLUMNS,
                     ycolumns=Y_COLUMNS, ytx=None, skip_header=1, shuffle=False, is_csv=IS_CSV)
    x_train = copy.deepcopy(tg[0])
    y_train = copy.deepcopy(tg[1].reshape(-1))
    x_val = copy.deepcopy(vg[0])
    y_val = copy.deepcopy(vg[1].reshape(-1))
    del tg
    del vg

    count = np.sum(y_train)
    print("Number of Positive Training Windows: {}".format(count))
    print("Number of Negative Training Windows: {}".format(len(y_train) - count))

    eval_set = [(x_train, y_train), (x_val, y_val)]
    my_model = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                             colsample_bynode=1, colsample_bytree=0.8, eta=0.03, gamma=0.1,
                             learning_rate=0.1, max_delta_step=0, max_depth=6,
                             min_child_weight=3, missing=None, n_estimators=600, n_jobs=1,
                             nthread=None, objective='binary:logistic', random_state=0,
                             reg_alpha=0, reg_lambda=1, scale_pos_weight=XGBOOST_POSITIVE_WEIGHT,
                             seed=1234, subsample=0.8, verbosity=2, tree_method='hist')
    my_model.get_xgb_params()

    # logloss here is equivalent to CategoricalCrossEntropy in tensorflow
    trained = my_model.fit(x_train, y_train, early_stopping_rounds=15,
                           eval_metric=["logloss", "error"], eval_set=eval_set, verbose=True)

    key = "xgboost-withClassWeight"
    file_path = MODEL_CHECKPOINT + key + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    trained.save_model(file_path)
    return trained
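# A hedged follow-up sketch (assumed, not from the source): reloading the
# checkpoint written by xgboost_train() for inference. file_path stands for
# whatever path the training run produced, and x_val for held-out features.
loaded = XGBClassifier()
loaded.load_model(file_path)               # illustrative path from the training run
val_probs = loaded.predict_proba(x_val)[:, 1]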
fig.savefig("QPPS6-logreg-Importance-Variables-2goups.png", bbox_inches="tight", dpi=600) ############################################################## ######################################################################### # XGBOOST ########################################################################## #xgboost avec parametres standards par défaut myXGBoost = XGBClassifier().fit(X_train,y_train) print("Training set score: {:.3f}".format(myXGBoost.score(X_train,y_train))) print("Test set score: {:.3f}".format(myXGBoost.score(X_test,y_test))) #pour info : parametres par défaut myXGBoost.get_xgb_params() ########################################################################## # MERCI pour votre attention ! ########################################################################## #on reste dans l'IDE #if __name__ == '__main__': # main()
xgtrain = xgboost.DMatrix(X_1, label=y.values)
xgb = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)
xgb_param = xgb.get_xgb_params()
xgb_param

cvresult = xgboost.cv(xgb_param,
                      xgtrain,
                      num_boost_round=xgb.get_params()['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=50,
                      seed=42)
cvresult.head()
cvresult.shape

xgb_best_param = {'n_estimators': cvresult.shape[0]}
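# A short follow-up sketch (assumed, not from the source): the tuned tree count
# collected in xgb_best_param can be written back onto the estimator before the
# final fit, mirroring the set_params idiom used elsewhere in this file.
xgb.set_params(**xgb_best_param)
xgb.fit(X_1, y)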
# The opening of this XGBClassifier(...) call is elided in the source; the
# variable name xgb1 is inferred from its use further down.
xgb1 = XGBClassifier(  # preceding keyword arguments elided in the source
    silent=1,                    # silent=1 suppresses intermediate training output
    subsample=0.8,               # fraction of the training set used per tree; guards against overfitting. Default 1, typical 0.5-1.
    colsample_bytree=0.8,        # fraction of features used per tree; guards against overfitting. Default 1, typical 0.5-1.
    colsample_bylevel=0.7,
    learning_rate=0.01,          # step size per boosting iteration; smaller trains more slowly. Default 0.3, typical 0.01-0.2.
    n_estimators=1000000,        # total boosting rounds (trees); a large value is fine because cv returns a suitable n_estimators
    max_depth=5,                 # tree depth; default 6, typical 3-10.
    min_child_weight=2,          # larger values underfit, smaller values overfit (a larger value keeps the model from fitting overly local patterns). Default 1.
    gamma=0,                     # regularization term: minimum loss reduction required to make a split.
    objective='multi:softprob',
)

if useTrainCV:
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'],
                      folds=cv_folds, metrics='mlogloss',
                      early_stopping_rounds=early_stopping_rounds)
    n_estimators = cvresult.shape[0]
    xgb1.set_params(n_estimators=n_estimators)
    # print(cvresult)

# Fit the algorithm on the data
xgb1.fit(X_train, y_train, eval_metric='mlogloss')

# Predict on the training set:
train_predprob = xgb1.predict_proba(X_train)
logloss = metrics.log_loss(y_train, train_predprob)