def runXGBoost(x_train, y_train, x_test, y_test, p):
    """Fit an XGBClassifier configured by ``p`` and report its accuracy.

    The classifier is fitted on ``(x_train, y_train)``. Accuracy on the test
    and training data is written to ``sys.stderr``.

    Args:
        x_train: Training features passed to ``clf.fit``.
        y_train: Training labels.
        x_test: Test features passed to ``clf.predict``.
        y_test: Test labels compared against the test predictions.
        p: Mapping of estimator parameters forwarded to ``clf.set_params``.

    Returns:
        Tuple ``(train_score, test_score)`` of accuracy scores.
    """
    clf = XGBClassifier()
    clf.set_params(**p)
    clf.fit(x_train, y_train)

    # Predict and score the test set once; both messages below report the
    # same number, so there is no need to run the model over it twice.
    test_predictions = clf.predict(x_test)
    test_score = accuracy_score(y_test, test_predictions)
    print("XGB classification accuracy on test data is " + str(test_score),
          file=sys.stderr)
    print("accuracy score on test data: " + str(test_score), file=sys.stderr)

    train_score = accuracy_score(y_train, clf.predict(x_train))
    print("accuracy score on training data: " + str(train_score),
          file=sys.stderr)
    return (train_score, test_score)
def modelfit(useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBClassifier built from the module-level ``params`` and report metrics.

    Reads the module-level names ``params``, ``data``, ``xgtrain``, ``X`` and
    ``y``. When ``useTrainCV`` is true, ``xgb.cv`` with early stopping picks
    the number of boosting rounds, which is written back to both the
    estimator and ``params['n_estimators']``.

    Metrics are printed for a random 30% sample of ``data``.
    NOTE(review): if ``X``/``y`` are derived from ``data`` these are
    training-set metrics, not held-out ones -- confirm against the caller.

    Args:
        useTrainCV: Whether to tune ``n_estimators`` with ``xgb.cv`` first.
        cv_folds: Number of folds passed to ``xgb.cv``.
        early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    alg = XGBClassifier(**params)
    df = data.sample(frac=0.3)
    pX = df.drop('LABEL', axis=1)
    py = df['LABEL']

    if useTrainCV:
        print("start use cv")
        xgb_param = alg.get_xgb_params()
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=xgb_param['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        best_rounds = cvresult.shape[0]
        print(best_rounds)
        alg.set_params(n_estimators=best_rounds)
        params['n_estimators'] = best_rounds
        print("best tree size is {}".format(best_rounds))

    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')

    y_pred = alg.predict(pX)
    # AUC is a ranking metric: score it on the positive-class probability,
    # not on thresholded labels (which collapses the ROC curve to one point).
    y_score = alg.predict_proba(pX)[:, 1]

    accuracy = metrics.accuracy_score(py, y_pred)
    print("精确率Accuracy: %.2f%%" % (accuracy * 100.0))
    print('auc:', metrics.roc_auc_score(py, y_score))
    train_report = metrics.classification_report(py, y_pred)
    print(train_report)

    feat_imp = pd.Series(
        alg.get_booster().get_fscore()).sort_values(ascending=False)
    print(feat_imp)
    return alg
def Create_Model(X_train, X_test, y_train, y_test, learning_rate,
                 n_estimators, max_depth, min_child_weight, gamma, subsample,
                 colsample_bytree, reg_alpha, eval_metric):
    """Build an XGBClassifier, run 5-fold ``xgb.cv``, then fit on the training data.

    ``X_test`` and ``y_test`` are accepted but not used by this function.

    Args:
        X_train: Training features; ``.values`` is read, so an object such as
            a pandas DataFrame is expected.
        y_train: Training labels; ``.values`` is read as well.
        learning_rate, n_estimators, max_depth, min_child_weight, gamma,
        subsample, colsample_bytree, reg_alpha: Forwarded to ``XGBClassifier``.
        eval_metric: Metric name(s) passed to ``xgb.cv`` as ``metrics``.

    Returns:
        The fitted ``XGBClassifier``.
    """
    tuned = dict(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
    )
    model = XGBClassifier(objective='binary:logistic',
                          nthread=4,
                          seed=12,
                          **tuned)

    booster_params = model.get_xgb_params()
    dtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cv_history = xgb.cv(booster_params,
                        dtrain,
                        num_boost_round=model.get_params()['n_estimators'],
                        nfold=5,
                        metrics=eval_metric)

    # One row of CV history per boosting round that was run.
    model.set_params(n_estimators=cv_history.shape[0])
    model.fit(X_train, y_train)
    return model
def predict_xgboost(param_grid, train, test, col_type, nthread=3, seed=1):
    """Fit an XGBClassifier on ``train`` and return positive-class probabilities for ``test``.

    Args:
        param_grid: Mapping of parameter name to a sequence of candidate
            values; only the first candidate of each is used.
        train: DataFrame holding the feature and target columns.
        test: DataFrame holding the feature and ID columns.
        col_type: Mapping with keys ``'target'``, ``'features'`` and ``'ID'``
            naming the respective column(s).
        nthread: Thread count passed to the estimator.
        seed: Value used for both ``random_state`` and ``seed``.

    Returns:
        DataFrame with the ID column of ``test`` and a ``pred_xgboost``
        column, carrying the same index as ``test``.
    """
    target = col_type['target']
    features = col_type['features']
    ID = col_type['ID']

    params = {key: value[0] for key, value in param_grid.items()}
    params.update(
        objective='binary:logistic',
        nthread=nthread,
        random_state=seed,
        seed=seed,
        silent=True,
    )

    xgb_model = XGBClassifier()
    xgb_model.set_params(**params)

    X_train = train[features].values
    y_train = train[target].values
    X_test = test[features].values

    # Fit the algorithm on train data
    xgb_model.fit(X_train, y_train, eval_metric='auc')

    # Predict on test data
    proba = xgb_model.predict_proba(X_test)[:, 1]

    # concat(axis=1) aligns on index labels; give the predictions the index
    # of ``test`` so rows stay paired even when ``test`` is not indexed 0..n-1.
    pred = pd.Series(proba, index=test.index, name='pred_xgboost')
    return pd.concat([test.loc[:, [ID]], pred], axis=1)
def eval_fn(params):
    """Score one hyper-parameter setting by cross-validated log-loss.

    Relies on names from the enclosing/module scope: ``n_estimators_max``,
    ``learning_rate``, ``seed``, ``skf``, ``X_train``, ``y_train``,
    ``n_folds``, ``n_estimators_lst``, ``X_valid``, ``y_valid``,
    ``score_valid`` and ``verbose``.

    For each ``(tr, va)`` pair from ``skf`` the model is fitted with early
    stopping; ``best_score`` and ``best_iteration`` are averaged over
    ``n_folds``. The averaged tree count is appended to ``n_estimators_lst``.
    When ``X_valid`` is not None the model is refitted on all of ``X_train``
    with that tree count and its validation log-loss is appended to
    ``score_valid``.

    Args:
        params: Mapping of estimator parameters applied via ``set_params``.

    Returns:
        The mean of ``model.best_score`` over the folds.
    """
    model = XGBClassifier(n_estimators=n_estimators_max,
                          learning_rate=learning_rate,
                          seed=seed)
    score = 0
    n_estimators = 0
    for tr, va in skf:
        X_tr, y_tr = X_train[tr], y_train[tr]
        X_va, y_va = X_train[va], y_train[va]
        model.set_params(**params)
        model.fit(X_tr,
                  y_tr,
                  eval_set=[(X_va, y_va)],
                  eval_metric='logloss',
                  early_stopping_rounds=50,
                  verbose=False)
        score += model.best_score
        n_estimators += model.best_iteration

    score /= n_folds
    # Floor division keeps the tree count an integer on Python 3 as well
    # (it is later assigned to ``model.n_estimators``).
    n_estimators //= n_folds
    n_estimators_lst.append(n_estimators)

    result_str = "train:%.4f ntree:%5d " % (score, n_estimators)
    if X_valid is not None:
        model.n_estimators = n_estimators
        model.fit(X_train, y_train)
        pr = model.predict_proba(X_valid)[:, 1]
        sc_valid = log_loss(y_valid, pr)
        score_valid.append(sc_valid)
        result_str += "valid:%.4f" % sc_valid

    if verbose:
        # Function-call form works on both Python 2 and Python 3.
        print(result_str)
    return score
def xgb_model(x1, y1):
    """Fit a class-weighted XGBClassifier and select features by permutation importance.

    Splits ``(x1, y1)`` 70/30, optionally rebalances the training part
    (controlled by the module-level flags ``subsample`` and ``smote``), tunes
    the number of trees with 5-fold ``xgb.cv``, fits the model and computes
    importances on the test part with ``importances``.

    Args:
        x1: Feature DataFrame.
        y1: Label DataFrame containing a ``"psych_hosp"`` column.

    Returns:
        Tuple ``(imp, feats)``: the importance table with its index reset,
        and the list of feature names whose importance is >= 0.0001.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)

    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    feature_cols = X_train.columns

    # Weight rescale: number of negatives per positive in the training labels.
    n_negative = np.sum(y_train["psych_hosp"].values == 0)
    n_positive = np.sum(y_train["psych_hosp"].values == 1)
    ratio = float(n_negative / n_positive)

    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )

    booster_params = xgb1.get_xgb_params()
    dtrain = xgb.DMatrix(
        X_train[feature_cols].values, label=y_train["psych_hosp"].values
    )
    cv_history = xgb.cv(
        booster_params,
        dtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cv_history.shape[0])

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    imp = importances(xgb1, X_test, y_test)  # permutation
    imp = imp.reset_index()
    kept = imp[imp["Importance"] >= 0.0001]
    feats = list(kept["Feature"])
    return imp, feats
def XGBoost_training_single(Training, label, XGBoost_param):
    """Fit a single XGBClassifier with row and column subsampling fixed at 0.8.

    Args:
        Training: Training features passed to ``fit``.
        label: Training labels passed to ``fit``.
        XGBoost_param: Mapping of estimator parameters. ``subsample`` and
            ``colsample_bytree`` are overridden with 0.8; the mapping itself
            is left untouched.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Work on a copy so the caller's dict is not silently modified.
    model_params = dict(XGBoost_param)
    model_params['subsample'] = 0.8
    model_params['colsample_bytree'] = 0.8

    bst = XGBClassifier()
    bst.set_params(**model_params)
    bst.fit(Training, label)
    return bst
def test_param(params, X_train, y_train, X_test, y_test, seed, verbose=True):
    """Train and evaluate a 9-class XGBClassifier for one parameter setting.

    Args:
        params: Mapping of estimator parameters applied via ``set_params``.
        X_train, y_train: Data handed to ``modelfit`` for training.
        X_test, y_test: Data handed to ``evaluate``.
        seed: Random seed for the classifier.
        verbose: Forwarded to ``modelfit``.

    Returns:
        Tuple ``(model, score)`` where ``score`` is whatever ``evaluate``
        returns for the test data.
    """
    # Build a model with the requested parameters.
    model = XGBClassifier(objective='multi:softmax', num_class=9, seed=seed)
    model.set_params(**params)

    # Train on the training portion of the dataset.
    modelfit(model, X_train, y_train, verbose=verbose)

    # Evaluate on the test portion.
    score = evaluate(model, X_test, y_test)
    return model, score
def grid_search(xgb_model, param_search, dtrain):
    """Run a 5-fold grid search over ``param_search`` and print the results.

    Args:
        xgb_model: Estimator whose ``get_params()`` are copied onto the
            classifier that is actually searched.
        param_search: Parameter grid passed to ``GridSearchCV``.
        dtrain: Object with a 2-D ``.values`` array; column 0 is used as the
            label and the remaining columns as features.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    search_estimator = XGBClassifier(booster='gbtree',
                                     learning_rate=0.1,
                                     n_estimators=300,
                                     max_depth=6,
                                     reg_alpha=0.05,
                                     min_child_weight=1,
                                     gamma=0,
                                     subsample=0.8,
                                     colsample_bytree=1,
                                     objective='multi:softmax',
                                     num_class=10,
                                     scale_pos_weight=1)
    # Parameters reported by ``xgb_model`` take precedence over the values above.
    search_estimator.set_params(**xgb_model.get_params())

    table = dtrain.values
    features, labels = table[:, 1:], table[:, 0]

    gsearch = GridSearchCV(estimator=search_estimator,
                           param_grid=param_search,
                           cv=5)
    gsearch.fit(features, labels)

    for report in (gsearch.cv_results_, gsearch.best_score_,
                   gsearch.best_params_):
        print(report)
    return gsearch
def Blend(Number_of_algo, L, Phi, Chi, N, X, y):
    """Train the blended classifier on a dataset augmented with the F and G matrices.

    For each of ``L`` levels a random 70% of the rows is used to build base
    models (``Build_Models``); the remaining rows are extended with the
    ``G`` and ``F`` matrices computed from those models. The per-level
    datasets are stacked and an ``XGBClassifier`` configured with
    ``Phi['params']`` is fitted on the result.

    Args:
        Number_of_algo: Forwarded to ``G`` and ``F``.
        L: Number of levels (resampling rounds).
        Phi: Mapping whose ``'params'`` entry configures the blended classifier.
        Chi: Forwarded to ``Build_Models``.
        N: Forwarded to ``Build_Models``.
        X: Feature array, indexed by a list of row indices.
        y: Label array, indexed the same way.

    Returns:
        The fitted ``XGBClassifier``.
    """
    rho = 0.7
    level_datasets = []
    for _ in range(L):
        sample_size = int(rho * len(X))  # rows sampled for this level
        indices = range(0, len(X))
        sampled = random.sample(indices, sample_size)
        held_out = list(set(indices) - set(sampled))  # rows not sampled

        D_dash = X[sampled]
        Labels_dash = y[sampled]
        D_complement = X[held_out]
        # Column vector so the labels can be concatenated along axis 1.
        Labels_complement = y[held_out].reshape(-1, 1)

        models = Build_Models(N, Chi, D_dash, Labels_dash)
        model_count = len(models)
        G_Matrix = G(models, D_complement, model_count, Number_of_algo, 3)
        F_Matrix = F(G_Matrix, D_complement, model_count, Number_of_algo)

        # Row layout: held-out features | G | F | label.
        level_datasets.append(
            np.concatenate(
                (D_complement, G_Matrix, F_Matrix, Labels_complement),
                axis=1))

    # Column count of D_fw: d (data columns) + Nc (models * classes)
    # + dNc (columns * models * classes) + 1 (label column).
    D_fw = np.concatenate(level_datasets, axis=0)  # stack the levels row-wise

    blended = XGBClassifier()
    blended.set_params(**Phi['params'])
    # The last column holds the labels; everything before it is a feature.
    blended.fit(D_fw[:, :-1], D_fw[:, [-1]])
    return blended
def main():
    """Fit an XGBClassifier on the training CSV and predict class probabilities for the test CSV.

    Reads the module-level ``args`` (``train_dataset`` / ``test_dataset``
    paths) and ``best_dicts`` (estimator parameters).

    Returns:
        Tuple ``(clf, results)``: the fitted classifier and a DataFrame with
        the test ``Id`` column followed by ``Prediction1`` .. ``Prediction9``.
    """
    train_frame = pd.read_csv(args.train_dataset)
    X_train = train_frame.drop(['Id', 'Class'], axis=1)
    y_train = train_frame.loc[:, 'Class']

    test_frame = pd.read_csv(args.test_dataset)
    X_test = test_frame.drop(['Id'], axis=1)
    ids = test_frame.loc[:, 'Id']

    clf = XGBClassifier()
    clf.set_params(**best_dicts)
    clf.fit(X_train, y_train)

    # One probability column per class, named Prediction1..Prediction9.
    proba_columns = ['Prediction%d' % i for i in range(1, 10)]
    proba = pd.DataFrame(clf.predict_proba(X_test), columns=proba_columns)

    results = pd.concat([ids, proba], axis=1)
    return (clf, results)
def learn_xgboost(train, predictors, class_target):
    """Tune the number of trees by cross-validation and fit an XGBClassifier.

    ``xgb.cv`` (10 folds, early stopping after 20 rounds, ``mlogloss``) picks
    the number of boosting rounds for ``params``; the returned classifier is
    built from those same ``params`` so the round count is valid for the
    model that is actually fitted.

    Args:
        train: DataFrame holding the predictor and target columns.
        predictors: Column name(s) used as features.
        class_target: Column name of the label.

    Returns:
        The fitted ``XGBClassifier``.
    """
    xgb_train = xgb.DMatrix(train[predictors].values,
                            label=train[class_target].values)

    # Tuning parameters
    cv_folds = 10
    early_stopping_rounds = 20
    params = {
        'reg_alpha': 0,
        'colsample_bytree': 0.6,
        'silent': 1,
        'colsample_bylevel': 1,
        'scale_pos_weight': 1,
        'learning_rate': 0.1,
        'missing': -1,
        'max_delta_step': 0,
        'nthread': 4,
        'base_score': 0.5,
        'n_estimators': 500,
        'subsample': 0.7,
        'reg_lambda': 1,
        'seed': 0,
        'min_child_weight': 1,
        'objective': 'multi:softprob',
        'max_depth': 6,
        'gamma': 0,
        'num_class': 2,
    }

    # Cross-validation
    cvresult = xgb.cv(params,
                      xgb_train,
                      num_boost_round=params['n_estimators'],
                      nfold=cv_folds,
                      metrics=['mlogloss'],
                      early_stopping_rounds=early_stopping_rounds)

    # Build the estimator from the configuration that was cross-validated;
    # a default-constructed classifier would ignore every setting above.
    # NOTE(review): 'missing': -1 now also applies to the fitted model,
    # whereas the CV DMatrix above uses the default missing marker -- confirm.
    xgb1 = XGBClassifier(**params)
    xgb1.set_params(n_estimators=cvresult.shape[0])
    xgb1.fit(train[predictors], train[class_target], eval_metric=['mlogloss'])
    return xgb1
def XGBoost_training(Training, label, XGBoost_param, cv, threads=6):
    """Grid-search an XGBoost classifier by ROC AUC and return the best model.

    Args:
        Training: Training features.
        label: Training labels.
        XGBoost_param: Parameter grid passed to ``GridSearchCV``.
        cv: Cross-validation strategy passed to ``GridSearchCV``.
        threads: Number of parallel jobs for the search.

    Returns:
        The best ``XGBClassifier``, refitted on all of ``Training``.
    """
    # Classifier: XGBoost
    XGBoost_gs = GridSearchCV(estimator=XGBClassifier(subsample=0.8,
                                                      colsample_bytree=0.8),
                              param_grid=XGBoost_param,
                              scoring='roc_auc',
                              n_jobs=threads,
                              iid=False,
                              cv=cv)
    XGBoost_gs.fit(Training, label)

    # GridSearchCV (refit=True by default) has already retrained the winning
    # configuration on the full data. Rebuilding a model from best_params_
    # alone would drop the subsample/colsample_bytree settings that were in
    # effect during the search.
    bst = XGBoost_gs.best_estimator_
    return bst
def main():
    """Tune an XGBClassifier on the agaricus data set, one parameter group at a time.

    Order: ``max_depth``/``min_child_weight``, then ``gamma``, then
    ``subsample``/``colsample_bytree``. After each grid search (``tune``) the
    best values are set on the estimator and ``modelfit`` is run again.
    ``tune`` is expected to return a sequence whose element 1 is the mapping
    of best parameters (that is how its result is indexed here).
    """
    # Load data for XGBClassifier
    dtrain = load_svmlight_file("../data/agaricus.txt.train")
    dtest = load_svmlight_file("../data/agaricus.txt.test")

    # Load data for the native xgboost API
    xgtrain = xgb.DMatrix("../data/agaricus.txt.train")

    xgb0 = XGBClassifier(learning_rate=0.1,
                         n_estimators=2,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=1,
                         seed=27)

    base(xgb0, dtrain, dtest, xgtrain)

    param_test1 = {
        'max_depth': list(range(3, 10, 2)),
        'min_child_weight': list(range(1, 6, 2)),
    }
    best1 = tune(xgb0, param_test=param_test1, dtrain=dtrain)[1]
    xgb0.set_params(max_depth=best1['max_depth'],
                    min_child_weight=best1['min_child_weight'])
    res1 = modelfit(xgb0, dtrain, dtest, xgtrain)
    print(res1)

    # Divide by 10.0, as the subsample grid below does, so the candidates are
    # 0.0 .. 0.4 even where ``/`` on two ints is integer division.
    param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}
    best2 = tune(xgb0, param_test=param_test2, dtrain=dtrain)[1]
    xgb0.set_params(gamma=best2['gamma'])
    modelfit(xgb0, dtrain, dtest, xgtrain)

    param_test3 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)],
    }
    best3 = tune(xgb0, param_test=param_test3, dtrain=dtrain)[1]
    xgb0.set_params(subsample=best3['subsample'],
                    colsample_bytree=best3['colsample_bytree'])
    modelfit(xgb0, dtrain, dtest, xgtrain)
def train(self, train_set, dev_set):
    """Extract (or load cached) model features and grid-search an XGBoost classifier.

    Features for each data set are taken from ``self.model.features`` in
    mini-batches and cached with ``np.save`` at the module-level paths
    ``train_features_file`` / ``dev_features_file``; an existing cache file
    is loaded instead of recomputing. Each parameter combination is fitted
    on the training features and scored on the dev features; the
    best-scoring classifier is stored in ``self.clf``.

    Args:
        train_set: Training examples, consumed through ``self.get_minibatch``.
        dev_set: Development examples, consumed the same way.
    """

    def collect(dataset, cache_path):
        """Return ``(features, labels)`` for ``dataset``, using ``cache_path`` as a cache."""
        if os.path.exists(cache_path):
            features = np.load(cache_path)
            _, _, labels, _, _ = self.get_minibatch(dataset, 0, len(dataset))
            return features, labels

        labels = []
        chunks = []
        # Number of batches, rounded up. ``//`` keeps it an int so that
        # ``range`` accepts it on Python 3 as well.
        total_batch = (len(dataset) - 1) // self.batch_size + 1
        for i in tqdm(range(total_batch)):
            (premise_vectors, hypothesis_vectors, batch_labels,
             prem_dep, hypo_dep) = self.get_minibatch(
                 dataset, i * self.batch_size, (i + 1) * self.batch_size)
            feed_dict = {
                self.model.premise_x: premise_vectors,
                self.model.hypothesis_x: hypothesis_vectors,
                self.model.y: batch_labels,
                self.model.keep_rate_ph: 1.0,
            }
            if 'dep_avg' in self.model_type:
                feed_dict[self.model.prem_dep] = prem_dep
                feed_dict[self.model.hypo_dep] = hypo_dep
            batch_features = self.sess.run([self.model.features], feed_dict)
            chunks.append(batch_features[0])
            labels += batch_labels

        # Concatenate once at the end instead of re-copying the growing
        # array after every batch.
        features = np.concatenate(chunks) if chunks else None
        np.save(cache_path, features)
        return features, labels

    logger.log('Get features from training set')
    train_features, train_labels = collect(train_set, train_features_file)

    logger.log('Get features from dev set')
    dev_features, dev_labels = collect(dev_set, dev_features_file)

    tuned_parameters = {'max_depth': [4, 6, 8], 'n_estimators': [100, 200]}
    best_score = 0.
    best_params = []
    best_clf = None
    for g in ParameterGrid(tuned_parameters):
        clf = XGBClassifier(nthread=24)
        clf.set_params(**g)
        clf.fit(train_features, train_labels)
        score = clf.score(dev_features, dev_labels)
        logger.log('%s: %f' % (str(g), score))
        if best_score < score:
            best_score = score
            best_params = g
            best_clf = clf

    # Keep the classifier that achieved the best dev score; fall back to the
    # last one fitted only if no candidate scored above zero.
    self.clf = best_clf if best_clf is not None else clf
    logger.log('Best score: %s %f' % (str(best_params), best_score))
# Random forest on the final training set.
from sklearn.ensemble import RandomForestClassifier as rfc

# NOTE: from here on ``rfc`` names the estimator instance, not the class.
rfc = rfc()
rfc.set_params(n_estimators=1000,
               min_samples_split=5,
               min_samples_leaf=1,
               max_features='sqrt',
               max_depth=60,
               random_state=42,
               class_weight={0: .2, 1: .8})

FinalTrainX = FinalTrain.drop('readmitted', axis=1)
# Recode the target: 2 -> 1 and 1 -> 0.
FinalTrainY = FinalTrain['readmitted'].replace([2, 1], [1, 0])
rfc.fit(FinalTrainX, FinalTrainY)

# Gradient-boosted trees on the same data.
from xgboost.sklearn import XGBClassifier as xgb

# NOTE: from here on ``xgb`` names the estimator instance, not the class.
xgb = xgb()
xgb.set_params(n_estimators=500,
               min_child_weight=10,
               max_depth=5,
               gamma=5,
               colsample_bytree=0.6,
               max_delta_step=5,
               random_state=42,
               scale_pos_weight=1)
xgb.fit(FinalTrainX, FinalTrainY)

# Now, import the new cleaned test set and run it through our model:
TestDF = pd.read_csv(filename)
TestDF = TestDF.drop('IsTrain', axis=1)

# Columns removed before scoring with the ``lgr`` model.
TestDFLR = TestDF.drop(
    [
        'diabfeat_neurologic',
        'race_AfricanAmerican',
        'A1Cresult_>7',
        'primarydiag_injury',
        'number_diagnoses',
        'med_glimepiride',
        'med_insulin',
        'diag_infection',
        'medical_specialty_Orthopedics',
        'med_nateglinide',
        'discharge_disposition_leftAMA',
        'admission_source_id_3',
        'change_Ch',
        'diag_circulatory',
        'medical_specialty_Gastroenterology',
        'medical_specialty_Surgery',
        'primarydiag_infection',
        'primarydiag_mentaldis',
    ],
    axis=1,
)
predictprobsLR = lgr.predict_proba(TestDFLR)[:, 1]
# Hold out a random 30% of the data for validation (test_size=0.3).
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.3, random_state=1981
)
gc.collect()
print("loaded data")

parameters = dict(
    learning_rate=0.02,
    max_depth=4,
    min_child_weight=1,
    subsample=0.9,
    colsample_bytree=0.9,
    objective='binary:logistic',
    scale_pos_weight=1,
    random_state=1987,
    silent=True,
)
model = XGBClassifier()
model.set_params(**parameters)

# Every column except the label and the row identifier is a predictor.
predictors = [x for x in train_df.columns if x != 'target' and x != 'id']

# Alternative entry points kept for reference:
#   single_run(parameters, d_train, d_valid, d_test)
#   multiple_run(model, train_df, predictors)
grid_search(model, train_df, predictors)
def xgb_model2(x1, y1, ft):
    """Fit the XGBoost model on the features ``ft`` and report test-set performance.

    Splits the data 70/30, optionally rebalances the training part
    (module-level flags ``subsample`` / ``smote``), tunes the number of trees
    with ``xgb.cv``, runs 10-fold CV AUC on the training part, fits the
    model, prints test metrics and saves the feature-dependence,
    permutation-importance and ROC figures under ``output/`` (file names end
    with the module-level ``outfile``).

    Args:
        x1: Feature DataFrame.
        y1: Label DataFrame containing a ``"psych_hosp"`` column.
        ft: Feature names to keep from ``x1``.

    Returns:
        Tuple ``(imp, feats)``: the permutation-importance table with its
        index reset, and the list of features with negative importance.
    """
    ## Remove features that negatively impact the model - used after xgb_model2 has already been run once
    ## Copy results from XGB2_FEATS into 'unwanted'
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i', 'wbc_disease', 'proc_124',
    #             'acute_bronch', 'hemorrhoid', 'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract',
    #             'biliary_tract', 'other_bone_disease', 'med_antifungal', 'spondylosis', 'secndry_malig',
    #             'other_joint', 'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform', 'mood',
    #             'nonmalig_breast', 'schizo', 'suicide', 'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")
    x1 = x1.loc[:, ft]

    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )

    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)

    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)

    columns = X_train.columns

    # Weight rescale: number of negatives per positive in the training labels.
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )

    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])

    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )

    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")

    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)

    xgb_predict = xgb1.predict(X_test)
    # AUC must be computed from scores, not thresholded labels; using the
    # same probabilities as the ROC curve below keeps the reported area
    # consistent with the plotted curve.
    xgb_proba = xgb1.predict_proba(X_test)[:, 1]
    xgb_roc_auc = roc_auc_score(y_test, xgb_proba)

    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(xgb_roc_auc, "\n")

    imp = importances(xgb1, X_test, y_test)  # permutation
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()
    # Features whose permutation importance is negative.
    imp_ = imp[imp["Importance"] < 0.00000]
    feats = list(imp_["Feature"])

    fpr, tpr, _ = roc_curve(y_test, xgb_proba)
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()
    return imp, feats
# Hyper-parameter grid explored for the XGBoost model.
Parmt_XGBoost = dict(
    n_estimators=[50, 100],
    max_depth=[3, 5],
    learning_rate=[0.01, 0.1, 0.3],
    colsample_bytree=[0.5, 1],
    gamma=[0],
)

# Grid search scored by ROC AUC on the inner CV splits.
Parmt_model_XGBoost = GridSearchCV(
    estimator=XGBoost_model,
    param_grid=Parmt_XGBoost,
    scoring='roc_auc',
    n_jobs=-1,
    cv=cv_inner,
)
Parmt_model_XGBoost = Parmt_model_XGBoost.fit(train_data, train_target)
best_parameters = Parmt_model_XGBoost.best_params_

# Apply the best parameters and train the optimized model on the train data.
XGBoost_model.set_params(**best_parameters)
XGBoost_model.fit(train_data, train_target)

# Train data results
prob_train[train_idx, ncv_idx] = XGBoost_model.predict_proba(train_data)[:, 1]
aucs_train[ncv_idx] = metrics.roc_auc_score(
    train_target, prob_train[train_idx, ncv_idx]
)

# Test data results
prob_test[test_idx, ncv_idx] = XGBoost_model.predict_proba(test_data)[:, 1]
aucs_test[ncv_idx] = metrics.roc_auc_score(
    test_target, prob_test[test_idx, ncv_idx]
)
# Drop column(s) ``x`` from both frames in place.
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID', 'TARGET'], axis=1).values

# NOTE: despite its name, ``y_test`` holds the test IDs (used for the
# submission file), not labels.
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=600,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6815,
    colsample_bytree=0.701,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)

# Choose the number of boosting rounds by 5-fold CV with early stopping.
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(
    xgb1.get_xgb_params(),
    xgtrain,
    num_boost_round=xgb1.get_params()['n_estimators'],
    nfold=5,
    metrics=['auc'],
    early_stopping_rounds=50,
    show_progress=False,
)
xgb1.set_params(n_estimators=cvresult.shape[0])

xgb1.fit(X_train, y_train, eval_metric='auc')
output = xgb1.predict_proba(X_test)[:, 1]

# Write the positive-class probability per test ID.
submission = pd.DataFrame({"ID": y_test, "TARGET": output})
submission.to_csv("submission.csv", index=False)
class ParamTuner:
    """Step-wise hyper-parameter tuner around a single ``XGBClassifier``.

    The wrapped classifier is updated in place by each tuning step: the
    number of boosting rounds via ``xgb.cv`` and other parameters via
    ``GridSearchCV`` (scored by ROC AUC). Uses the module-level constants
    ``NUM_BOOST_ROUND``, ``CV_FOLDS`` and ``EARLY_STOPPING_ROUNDS``.
    """

    def __init__(self, X_train, y_train):
        """Create the baseline classifier and keep the training data.

        Args:
            X_train: Training features (also wrapped in an ``xgb.DMatrix``).
            y_train: Training labels.
        """
        self._clf = XGBClassifier(learning_rate=0.01,
                                  n_estimators=1000,
                                  max_depth=5,
                                  min_child_weight=1,
                                  gamma=0,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective='binary:logistic',
                                  scale_pos_weight=1,
                                  seed=0)
        self._dtrain = xgb.DMatrix(X_train, label=y_train)
        self._X_train = X_train
        self._y_train = y_train

    @property
    def clf(self):
        """The classifier being tuned."""
        return self._clf

    def show_params(self):
        """Log the classifier's current parameters."""
        logging.info("-" * 40)
        logging.info("current params:\n" + str(self._clf.get_params()))
        logging.info("-" * 40)

    def get_param(self, name):
        """Return the current value of parameter ``name``."""
        return self._clf.get_params()[name]

    def set_param(self, name, value):
        """Set a single parameter on the classifier."""
        self._clf.set_params(**{name: value})

    def set_params(self, params):
        """Set several parameters from the mapping ``params``."""
        self._clf.set_params(**params)

    def tune_num_boost_round(self):
        """Pick ``n_estimators`` by cross-validation with early stopping."""
        logging.info("turn num_boost_round")
        # get_xgb_params() yields the booster-level parameters expected by
        # the native ``xgb.cv`` API, matching how the other helpers in this
        # file call it.
        history = xgb.cv(self._clf.get_xgb_params(),
                         dtrain=self._dtrain,
                         num_boost_round=NUM_BOOST_ROUND,
                         nfold=CV_FOLDS,
                         metrics='auc',
                         early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                         show_stdv=True)
        logging.info("tail of history:\n" + str(history.tail(1)))
        logging.info("learning rate: %f, best boosting num: %d" %
                     (self.get_param('learning_rate'), history.shape[0]))
        self.set_param('n_estimators', history.shape[0])
        self.show_params()

    def grid_search(self, param_grid):
        """Grid-search ``param_grid`` and apply the best combination.

        Args:
            param_grid: Mapping of parameter name to candidate values.
        """
        logging.info("grid search on %s" % param_grid.keys())
        gs = GridSearchCV(estimator=self._clf,
                          param_grid=param_grid,
                          scoring='roc_auc',
                          n_jobs=-1,
                          iid=False,
                          cv=CV_FOLDS)
        gs.fit(X=self._X_train, y=self._y_train)

        # ``grid_scores_`` no longer exists in scikit-learn; ``cv_results_``
        # carries the same per-candidate information.
        results = gs.cv_results_
        score_lines = [
            "mean: %.5f, std: %.5f, params: %s" % (mean, std, candidate)
            for mean, std, candidate in zip(results['mean_test_score'],
                                            results['std_test_score'],
                                            results['params'])
        ]
        logging.info("grid_scores:\n" + '\n'.join(score_lines))
        logging.info("best_params: " + str(gs.best_params_))
        logging.info("best_score: " + str(gs.best_score_))
        self.set_params(gs.best_params_)
        self.show_params()
def train_model_xgb_cv(
        X_train,
        X_test,
        y_train,
        y_test,
        fig_path='C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png'):
    """Tune, fit and evaluate an XGBoost classifier, then plot feature scores.

    Steps: choose ``n_estimators`` with 5-fold ``xgb.cv`` (AUC, early
    stopping after 5 rounds), fit the classifier, print test AUC and error,
    grid-search ``max_depth`` with 2-fold CV (results are printed only), and
    save a horizontal bar chart of the booster's feature scores.

    Args:
        X_train, y_train: Training data.
        X_test, y_test: Test data used for the AUC and error figures.
        fig_path: Where the feature-score figure is written. Defaults to the
            location this function has always used.
    """
    # Settings shared by the tuned model and the grid-search estimator.
    base_settings = dict(learning_rate=0.1,
                         n_estimators=300,
                         min_child_weight=1,
                         gamma=0.3,
                         subsample=0.6,
                         colsample_bytree=0.7,
                         objective='binary:logistic',
                         nthread=4,
                         seed=27,
                         reg_lambda=0.01)

    dtrain = xgb.DMatrix(X_train, label=y_train)
    xgb_sklearn = XGBClassifier(max_depth=3, **base_settings)

    # Booster-level parameters for the native CV API, as in the other
    # helpers of this file.
    cvresult = xgb.cv(xgb_sklearn.get_xgb_params(),
                      dtrain,
                      num_boost_round=xgb_sklearn.get_params()['n_estimators'],
                      nfold=5,
                      metrics='auc',
                      early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')

    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]

    # auc
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)

    # error
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)

    # grid search (max_depth is left unset on the estimator: it is searched)
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(estimator=XGBClassifier(**base_settings),
                         param_grid=params,
                         cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)

    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(fig_path, dpi=300)
    plt.show()
# learning_rate
# Makes the model more robust by shrinking the weights on each step
# Typical final values to be used: 0.01-0.2
xgb1 = XGBClassifier(  # baseline model
    **{
        'learning_rate': 0.1,
        'n_estimators': 1000,
        'max_depth': 5,
        'min_child_weight': 1,
        'gamma': 0,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'binary:logistic',
        'nthread': -1,
        'scale_pos_weight': 1,
        'seed': 27,
    }
)
best_iter = modelfit(xgb1, x, y)  # plots, finds the best iteration, etc.
xgb1.set_params(n_estimators=best_iter)  # apply the best iteration count

#%% In the cells below, each cell tunes one or two parameters; they all follow the same pattern.
# Some param-grid ranges are very narrow because this is the final code. The
# ranges started out wide; they were narrowed based on the results and re-run,
# so what remains is the final, narrow-range version.

#%% Tune max_depth and min_child_weight
# Records the whole tuning history. Each tuned parameter adds one item
# (xgb, para), so the final best model is gsearch[-1][0].best_estimator_.
# This makes it easy to get back the model from any earlier step.
gsearch = []

# min_child_weight [default=1]
# Defines the minimum sum of weights of all observations required in a child.
# This is similar to min_child_leaf in GBM but not exactly. This refers to min
# "sum of weights" of observations while GBM has min "number of observations".
# Used to control over-fitting. Higher values prevent a model from learning
# relations which might be highly specific to the particular sample selected
# for a tree.
# Too high values can lead to under-fitting hence, it should be tuned using CV.

# max_depth [default=6]
# The maximum depth of a tree, same as GBM.
class XGBoostModel:
    """Thin wrapper selecting a pre-tuned ``XGBClassifier`` per data set.

    Supported configurations: ``data_str == 'voice'`` with more than two
    classes, ``'heart'`` and ``'ads'``. For anything else an error message
    is printed and ``self.model`` is ``None``.
    """

    # Parameters found by tuning on voice data with 400 points.
    _VOICE_PARAMS = {
        "base_score": 0.5,
        "booster": "gbtree",
        "colsample_bylevel": 1,
        "colsample_bynode": 1,
        "colsample_bytree": 0.25,
        "gamma": 0.0,
        "learning_rate": 0.01,
        "max_delta_step": 0,
        "max_depth": 2,
        "min_child_weight": 1,
        "n_estimators": 3600,
        "nthread": 4,
        "objective": "multi:softprob",
        "reg_alpha": 1e-05,
        "reg_lambda": 1,
        "scale_pos_weight": 1,
        "seed": 0,
        "subsample": 0.75,
        "verbosity": 1,
    }

    def __init__(self, n_classes=2, data_str=None):
        """Build the classifier for ``data_str``.

        Args:
            n_classes: Number of target classes; the ``'voice'`` setup is
                only used when this is greater than 2.
            data_str: Data-set identifier (``'voice'``, ``'heart'`` or ``'ads'``).
        """
        # Always define the attribute so a failed setup is detectable.
        self.model = None
        # TODO: decide what nthread should be?
        if n_classes > 2 and data_str == 'voice':
            # The tuned dictionary specifies every setting that matters, so
            # it is applied on top of a default-constructed estimator.
            self.model = XGBClassifier()
            self.model.set_params(**self._VOICE_PARAMS)
        elif data_str == 'heart':
            self.model = XGBClassifier(learning_rate=0.1,
                                       n_estimators=20,
                                       max_depth=2,
                                       min_child_weight=0.3,
                                       gamma=0,
                                       subsample=0.65,
                                       colsample_bytree=0.15,
                                       reg_alpha=0,
                                       objective='binary:logistic',
                                       nthread=4,
                                       scale_pos_weight=1)
        elif data_str == 'ads':
            print('TODO: tune xgboost on ads! not done yet?')
            # TODO: tune model on heart and ads data too
            self.model = XGBClassifier(learning_rate=0.1,
                                       n_estimators=1000,
                                       max_depth=5,
                                       min_child_weight=1,
                                       gamma=0,
                                       subsample=0.8,
                                       colsample_bytree=0.8,
                                       objective='binary:logistic',
                                       nthread=4,
                                       scale_pos_weight=1,
                                       seed=0)
        else:
            print(
                'ERROR in xgBoostModel, only data_str in (voice, heart, ads) supported. Please add tuning parameters to file'
            )

    def _require_model(self):
        """Return the classifier, failing with a clear message if none was configured."""
        if self.model is None:
            # AttributeError matches what callers saw before, when the
            # attribute was simply missing in this situation.
            raise AttributeError(
                'XGBoostModel has no classifier: unsupported n_classes/data_str '
                'combination passed to the constructor')
        return self.model

    def fit(self, x, y, with_tuning=False):
        """Fit the classifier on ``(x, y)``. ``with_tuning`` is currently ignored."""
        self._require_model().fit(x, y)

    def predict(self, x):
        """Return class predictions for ``x``."""
        return self._require_model().predict(x)

    def predict_proba(self, x):
        """Return class probabilities for ``x``."""
        return self._require_model().predict_proba(x)
def set_parameters(set_name, golden_set, input_file):
    """Run the repeated XGBoost planet-host simulation for one data set.

    Loads ``params.yaml`` from ``<working_dir>/<set_name>``, reads the
    catalogue in ``input_file`` and, for ``N_iterations`` rounds, trains an
    ``XGBClassifier`` on a balanced random sample (``N_samples`` giant-planet
    hosts plus ``N_samples`` non-hosts), then predicts on every star that was
    not part of that round's training sample. Aggregated results are printed
    and written below the ``set_name`` directory (CSV files under
    ``figures/``, pickles and ``.npy`` files at its top level).

    Args:
        set_name: name of the run directory. It is joined to ``working_dir``
            to locate ``params.yaml`` and used as-is (relative path) as the
            output directory.
        golden_set: string flag parsed by ``str_to_bool``. When true, 10
            random known giant-planet hosts are relabelled as non-hosts
            before training and their predictions are saved separately.
        input_file: path of the CSV catalogue. The code reads the columns
            ``HIP``, ``Exo``, ``Multi``, ``MaxPMass`` and the feature columns
            listed in the parameters file.

    Returns:
        None. All results are printed or written to disk.
    """
    golden = str_to_bool(golden_set)

    #-------------------------------------------------------------------------
    # directory that is being run
    data_dir = set_name

    # read in the parameters file and load it; the context manager makes sure
    # the handle is closed (it used to be left open)
    full_path = os.path.join(working_dir, "{0}".format(data_dir),
                             'params.yaml')
    with open(full_path, 'r') as stream:
        parameters = yaml.load(stream, Loader=yaml.FullLoader)

    # read in Hypatia data as pandas dataframe (2D structure)
    df = pd.read_csv(input_file)
    set_number = set_name

    #-------------------------------------------------------------------------
    if golden:
        # Relabel 10 random giant-planet hosts as non-hosts and remember which
        # rows (and HIP numbers) were changed.
        df2 = df.copy()
        giant_hosts = df2[(df2['Exo'] == 1) &
                          (df2['MaxPMass'] > parameters['gas_giant_mass'])]
        df2.loc[giant_hosts.sample(
            10, random_state=np.random.RandomState()).index, 'Exo'] = 0
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if ind not in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()

    #-------------------------------------------------------------------------
    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  # category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    # ``np.number`` is an abstract type that NumPy resolved to float64; use the
    # concrete type directly because the abstract conversion is deprecated.
    df['MaxPMass'] = df['MaxPMass'].astype(float)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    # HIP now lives in the index; ``axis`` must be a keyword in pandas >= 2.0
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')
    for key, value in parameters.items():
        print('{0} = {1}'.format(key, value))

    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']
    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']

    # Redefine dataframe with the "relevant columns" and remove nans if
    # dropnans==True in yaml
    if parameters['dropnans']:
        df = df[relevant_columns].dropna()

    print('Number of samples used in simulation: {0}'.format(df.shape[0]))
    print('')

    # Define the confusion matrix and other accumulators
    cfm = np.zeros((2, 2))
    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    # Bound up front so the save step below cannot hit an unbound name when
    # N_iterations is 0.
    feat_imp_train_normal = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)

    print('iteration \t estimators')
    print('---------------------------')

    #---------------------------XGBOOST LOOP----------------------------------
    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):
        # random sample of N_samples hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1) &
                              (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        # random sample of N_samples non hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())
        # training set = both samples together
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]

        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (indexed by HIP)
        Y = df_train.Exo

        # Note: Using gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,  # def=0.3, prevents overfitting and makes feature weight conservative
            n_estimators=1000,  # number of boosted trees to fit
            max_depth=6,  # def=6, max depth of tree/complexity
            min_child_weight=1,  # def=1, min weight needed to continue leaf partitioning
            gamma=0,  # def=0, minimum loss reduction required to make partition on a leaf
            subsample=0.8,  # def=1, subsample ratio of the training set
            colsample_bytree=0.8,  # def=1, subsample ratio of columns when making each tree
            objective='binary:logistic',  # def=linear, logistic regression for binary classification, output probability
            nthread=1,  # originally = 8, but issue on laptop...def=max, number of parallel threads used to run xgboost
            scale_pos_weight=1,  # def=1, balance positive and neg weights
            seed=27)  # def=0, random number seed

        # get input parameters of algorithm
        xgb_param = alg.get_xgb_params()
        # construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)
        # cross validation (CV) of xgboost to avoid overfitting; the number of
        # rows in the result is used as the number of trees
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])

        alg.fit(X[features], Y, eval_metric='auc')
        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]
        feat_imp = alg.get_booster().get_fscore()

        # See how the algorithm performs on the Exo training data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)

        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 - df_train[features].isnull().sum() /
             df_train[features].count()) * pd.Series(feat_imp),
            columns=[iteration]).T

        # accumulate the per-iteration feature scores
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        # NOTE(review): this rebinds the frame every iteration, so only the
        # last iteration's normalized row is kept next to all raw rows.
        # Behaviour preserved; confirm whether accumulation was intended.
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])

        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score

        # Predict once on the held-out stars and reuse the result.
        held_out_predictions = alg.predict(df_predict[features])
        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index, 'Predicted'] += held_out_predictions
        # NOTE(review): 'Prob' stores hard predictions (predict, not
        # predict_proba), and rows used for training keep whatever value an
        # earlier iteration left there. Behaviour preserved; confirm intent.
        df.loc[df_predict.index, 'Prob'] = held_out_predictions

        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total,
             pd.Series(values, name=str(iteration))],
            axis=1)
        # checkpoint every 10th iteration
        if not iteration % 10:
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))

    #-------------------------------------------------------------------------
    # Average the confusion matrix over iterations and normalize each row
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()

    # Print confusion matrix
    print(np.round(cfm, 3))

    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets----------------#####
    # Find the stars with >90% probability of hosting a planet, with the
    # Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90) & (df.Exo == 0)][[
        'Sampled', 'Predicted', 'Prob'
    ]]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))

    # Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    with open(name, 'w') as outfile:
        planetprobs.to_csv(outfile)

    # Create a second list with all non-host stars with non-zero probability
    planets2 = df[(df.Prob > .0) & (df.Exo == 0)][[
        'Sampled', 'Predicted', 'Prob'
    ]]

    if golden:  # if 10 stars were randomly taken out
        # DataFrame.append was removed in pandas 2.0; pd.concat is what it
        # delegated to.
        changeddf = pd.DataFrame([])
        for star in changedhips:  # the relabelled known planet hosts
            star_rows = planets2.loc[planets2.index == star]
            changeddf = pd.concat([changeddf, star_rows])
            if star_rows.empty:
                # catch for when a known planet host was cut (bc of abunds):
                # add a blank row with the star name as index
                placeholder = pd.DataFrame(
                    [[np.nan, np.nan, np.nan]],
                    index=[star],
                    columns=['Sampled', 'Predicted', 'Prob'])
                changeddf = pd.concat([changeddf, placeholder])
        # Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")

    # Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    with open(name2, 'w') as outfile2:
        planetprobs2.to_csv(outfile2)

    ###########------------------------Save Files-----------------------#####
    print('Saving data files')
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))
    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)

    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
# To stay compatible with sklearn's GridSearchCV interface, xgboost provides a
# model class. Its parameters are basically the same as the native ones, but
# three of them are named differently, so rename them here.
params['n_estimators'] = boost_num
params['learning_rate'] = params.pop('eta')
params['reg_alpha'] = params.pop('alpha')
params['reg_lambda'] = params.pop('lambda')

# Parameters to search over and their candidate values.
search_param = dict(max_depth=[3, 5], min_child_weight=[2, 4])

# The searched parameters must not also be present in the fixed parameters.
for key in search_param:
    del params[key]

model = XGBClassifier()
model.set_params(**params)

gridsearch = GridSearchCV(
    estimator=model,
    param_grid=search_param,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
    iid=False,
)
gridsearch.fit(train_data, train_label)

for report in (gridsearch.cv_results_, gridsearch.best_params_,
               gridsearch.best_score_):
    print(report)

# Persist the best parameter combination.
with open('best_param.pkl', 'wb') as f:
    pickle.dump(gridsearch.best_params_, f)
train_size=n_train, random_state=123) for idx, ignore in sss_train: X_train = X[train_idx][idx] y_train = target[train_idx][idx] # # 2. sss_train_inner = StratifiedShuffleSplit(y_train, n_iter=n_iter_cv, test_size=.1, random_state=456) model = XGBClassifier(n_estimators=1000, max_depth=10, subsample=.8, seed=987) params_lst_optimized = [] for params in xgb_params_lst: n_estimators = 0 for tr, va in sss_train_inner: X_tr, y_tr = X_train[tr], y_train[tr] X_va, y_va = X_train[va], y_train[va] model.set_params(**params) model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric="mlogloss", early_stopping_rounds=50, verbose=False) n_estimators += model.best_iteration sc = params.copy() sc.update({'n_estimators':n_estimators / n_iter_cv}) params_lst_optimized.append(sc) print 'Step 2 Done.', datetime.now() - t0 # 3. model = XGBClassifier(max_depth=10, subsample=.8) for params in params_lst_optimized: for seed_train in range(100, 100+n_iter_pred): params.update({'seed':seed_train}) model.set_params(**params) model.fit(X_train, y_train) pr = model.predict_proba(X_test)
def get_XgbClassifer(train_data,
                     train_target,
                     test_data,
                     feature_names,
                     parameters,
                     early_stopping_rounds,
                     num_folds,
                     eval_metric,
                     model_name='model',
                     stratified=True):
    '''
    Train an XGBClassifier with K-fold cross-validation and save the results.

    For every fold the classifier is fitted on the training part with early
    stopping monitored on the validation part. Out-of-fold probabilities and
    fold-averaged test probabilities are written to ``./cv``, the arg-max
    class predictions to ``./sub``, and the per-fold feature importances are
    handed to ``save_importances``.

    :param train_data: training features; indexed positionally with index
        arrays (the original note says it must be a numpy array)
    :param train_target: training labels, indexed the same way
    :param test_data: test features
    :param feature_names: one name per feature column
    :param parameters: dict of parameters passed to ``clf.set_params``
    :param early_stopping_rounds: forwarded to ``clf.fit``
    :param num_folds: number of CV folds
    :param eval_metric: custom callable or built-in metric name, forwarded to
        ``clf.fit``
    :param model_name: tag used in the output file names
    :param stratified: use StratifiedKFold when True, KFold otherwise
    :return: the classifier as fitted on the last fold
    '''
    # Per the original note: putting num_class into `parameters` raises an
    # unexplained "parameter does not exist" error, so it is passed here.
    clf = XGBClassifier(num_class=n_class)
    clf.set_params(**parameters)

    # Accumulators
    oof_preds = np.zeros((train_data.shape[0], n_class))
    sub_preds = np.zeros((test_data.shape[0], n_class))
    fold_importances = []

    # K-fold
    if stratified:
        folds = StratifiedKFold(n_splits=num_folds,
                                shuffle=True,
                                random_state=1234)
    else:
        folds = KFold(n_splits=num_folds, shuffle=True, random_state=1234)

    for n_flod, (train_index,
                 val_index) in enumerate(folds.split(train_data,
                                                     train_target)):
        train_X = train_data[train_index]
        val_X = train_data[val_index]
        train_Y = train_target[train_index]
        val_Y = train_target[val_index]

        # Early stopping watches the LAST entry of eval_set, so the
        # validation fold has to be in the watchlist. It used to contain only
        # the training fold, which made early stopping track training error.
        watchlist = [(train_X, train_Y), (val_X, val_Y)]
        clf.fit(train_X,
                train_Y,
                early_stopping_rounds=early_stopping_rounds,
                eval_set=watchlist,
                eval_metric=eval_metric)

        # Fill in the out-of-fold predictions for this fold
        oof_preds[val_index] = clf.predict_proba(val_X)
        # Sum the test predictions; they are divided by the fold count below
        sub_preds += clf.predict_proba(test_data)

        # Score of this fold. The value is a mean absolute error on the
        # predicted labels; the message used to call it "macro-f1".
        result = mean_absolute_error(val_Y, clf.predict(val_X))
        print('Fold %2d MAE : %.6f' % (n_flod + 1, result))
        gc.collect()

        # Feature importance of this fold. Per the original note the default
        # importance type is gain; change importance_type in the parameters
        # to use another one.
        gain = clf.feature_importances_
        fold_importances.append(
            pd.DataFrame({
                'feature': feature_names,
                'gain': 100 * gain / gain.sum(),
                'fold': n_flod,
            }).sort_values('gain', ascending=False))

    feature_importance_df = pd.concat(fold_importances, axis=0)

    # Save the probabilities
    sub_preds = sub_preds / folds.n_splits
    os.makedirs('./cv', exist_ok=True)
    class_columns = ['class_' + str(i) for i in range(n_class)]
    pd.DataFrame(oof_preds, columns=class_columns).to_csv(
        './cv/val_prob_{}.csv'.format(model_name),
        index=False,
        float_format='%.4f')
    pd.DataFrame(sub_preds, columns=class_columns).to_csv(
        './cv/test_prob_{}.csv'.format(model_name),
        index=False,
        float_format='%.4f')

    # Save the arg-max class predictions
    oof_labels = [np.argmax(x) for x in oof_preds]
    sub_labels = [np.argmax(x) for x in sub_preds]
    os.makedirs('./sub', exist_ok=True)
    pd.DataFrame(oof_labels, columns=['class']).to_csv(
        './sub/val_{}.csv'.format(model_name), index=False)
    pd.DataFrame(sub_labels, columns=['class']).to_csv(
        './sub/test_{}.csv'.format(model_name), index=False)

    save_importances(feature_importance_df, model_name)
    return clf
# %%
# 4.2. tuning parameters
# Every column except the label is a predictor.
predictors = [col for col in df.columns if col not in ('label',)]
clf = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
)
modelfit(clf, df2, predictors, cv_folds=10)

# %%
clf.set_params(n_estimators=193)

# %%
# Grid over tree depth and minimum child weight.
param1 = dict(max_depth=range(3, 12, 2), min_child_weight=range(1, 6, 2))
gs1 = GridSearchCV(
    estimator=clf,
    param_grid=param1,
    scoring='f1_weighted',
    n_jobs=-1,
    iid=False,
    cv=10,
)
gs1.fit(X, y)
print(gs1.best_params_)
print(gs1.best_score_)

# %%
clf.set_params(max_depth=7, min_child_weight=1)
def fit_xgboost(param_grid,
                param_table,
                train,
                col_type,
                find_n_estimator=False,
                cv_iterations=5,
                cv_folds=5,
                nthread=3,
                seed=1,
                verbose=0):
    """Score every parameter row with repeated stratified K-fold CV.

    For each row of ``param_table`` an ``XGBClassifier`` is configured with
    that row's values, optionally gets its ``n_estimators`` from ``xgb.cv``
    with early stopping, and is then evaluated with ``cv_iterations``
    repetitions of ``cv_folds``-fold stratified CV (AUC). The table is
    updated in place with ``Score``, ``Score_Std`` and ``Score_Weighted``
    (and ``n_estimators`` when searched), and ``param_grid`` is narrowed in
    place to the best row.

    Args:
        param_grid: dict mapping parameter name to its candidate values;
            every entry is replaced by ``[best_value]``.
        param_table: DataFrame with one parameter combination per row; its
            columns are passed to ``XGBClassifier.set_params``.
        train: DataFrame holding the ID, feature and target columns.
        col_type: dict with the keys ``'target'``, ``'features'`` and
            ``'ID'`` naming those columns.
        find_n_estimator: when true, pick ``n_estimators`` with ``xgb.cv``.
        cv_iterations: number of CV repetitions (the repetition index seeds
            the fold shuffling).
        cv_folds: number of folds per repetition.
        nthread: thread count passed to the model.
        seed: random seed passed to the model and to ``xgb.cv``.
        verbose: ``1`` prints each table row after it has been scored.

    Returns:
        Tuple ``(param_grid, pred_return)`` where ``pred_return`` maps
        ``'Pred_<row index>'`` to the per-ID mean out-of-fold probability.
    """
    target = col_type['target']
    features = col_type['features']
    ID = col_type['ID']

    pred_return = {}
    for params in param_table.itertuples(index=True, name='NamedTuple'):
        params = params._asdict()
        index = params.pop('Index')  # row label, not a model parameter
        params['objective'] = 'binary:logistic'
        params['nthread'] = nthread
        params['random_state'] = seed
        params['seed'] = seed
        params['silent'] = True

        xgb_model = XGBClassifier()
        xgb_model.set_params(**params)

        if find_n_estimator:
            xgb_train = xgb.DMatrix(train[features], label=train[target])
            cv_result = xgb.cv(xgb_model.get_xgb_params(),
                               xgb_train,
                               num_boost_round=int(params['n_estimators']),
                               nfold=cv_folds,
                               metrics='auc',
                               early_stopping_rounds=50,
                               seed=seed)
            # number of rows in the CV result is used as the tree count
            best_n_estimator = cv_result.shape[0]
            param_table.at[index, 'n_estimators'] = best_n_estimator
            xgb_model.set_params(n_estimators=best_n_estimator)

        scores = []
        pred_frames = []
        for cv_index in range(cv_iterations):
            # Own copy of the ID column plus an empty prediction column.
            pred = train.loc[:, [ID]].copy()
            pred['Pred'] = np.nan
            pred_col = pred.columns.get_loc('Pred')

            # k-fold cross validation
            skf = StratifiedKFold(n_splits=cv_folds,
                                  random_state=cv_index,
                                  shuffle=True)
            for train_index, dev_index in skf.split(train[features].values,
                                                    train[target].values):
                X_train = train[features].iloc[train_index].values
                y_train = train[target].iloc[train_index].values
                X_dev = train[features].iloc[dev_index].values
                y_dev = train[target].iloc[dev_index].values

                # Fit the algorithm on train folds
                xgb_model.fit(X_train, y_train, eval_metric='auc')

                # Predict on dev fold. ``dev_index`` holds positions, so write
                # positionally; ``.at`` is scalar-only and label-based.
                pred_dev = xgb_model.predict_proba(X_dev)[:, 1]
                pred.iloc[dev_index, pred_col] = pred_dev

                # Compute the score
                scores.append(metrics.roc_auc_score(y_dev, pred_dev))

            pred_frames.append(pred)

        pred_all = pd.concat(pred_frames, axis=0)
        # avg predict_proba for each ID
        pred_mean = pred_all.groupby(ID)['Pred'].mean()
        # use avg pred to compute auc score (groupby sorts by ID, so the
        # targets are sorted by ID as well)
        score = metrics.roc_auc_score(
            train.sort_values(ID)[target].values, pred_mean)
        # store the pred result for use in stacking
        pred_return['Pred_' + str(index)] = pred_mean

        param_table.at[index, 'Score'] = score
        param_table.at[index, 'Score_Std'] = np.std(scores)

        if verbose == 1:
            # ``index`` is a row label, so look it up by label
            print('{} : {}'.format(index, param_table.loc[index, :]))

    param_table["Score_Weighted"] = (param_table["Score"] -
                                     0.1 * param_table["Score_Std"])

    # update_param_grid
    best_param_index = param_table["Score_Weighted"].idxmax()
    print("Param_grid size: {}".format(param_table.shape[0]))
    print("Current Score: {}, Score_Std: {}".format(
        param_table.loc[best_param_index, "Score"],
        param_table.loc[best_param_index, "Score_Std"]))
    print("--------------------------")
    for param in param_grid:
        best_param = param_table.loc[best_param_index, param]
        candidates = param_grid[param]
        if isinstance(candidates, list):
            # only report list entries that were actually narrowed or changed
            if len(candidates) > 1 or (len(candidates) == 1
                                       and candidates[0] != best_param):
                print("{}: tuned to {}".format(param, best_param))
        else:
            print("{}: tuned to {}".format(param, best_param))
        param_grid[param] = [best_param]

    return param_grid, pred_return
def modelfit(train,
             labels,
             test,
             features,
             useTrainCV=True,
             cv_folds=5,
             early_stopping_rounds=50):
    """Fit an XGBoost classifier and return positive-class test probabilities.

    ``train``/``labels`` are split 80/20; the 80% part is used for training
    (and for choosing the tree count via ``xgb.cv`` when ``useTrainCV`` is
    true), the 20% part for printing a hold-out ROC AUC. A bar chart of the
    feature importances is drawn on the current matplotlib figure.

    The model is cross-validated, fitted and applied on the ``features``
    columns only, so that all three steps see the same inputs.

    Args:
        train: DataFrame with the training rows.
        labels: targets aligned with ``train``.
        test: DataFrame to predict on.
        features: list of column names to use as model inputs.
        useTrainCV: when true, choose ``n_estimators`` with ``xgb.cv`` and
            early stopping; otherwise keep the initial 1000 trees.
        cv_folds: number of folds for ``xgb.cv``.
        early_stopping_rounds: patience for ``xgb.cv``.

    Returns:
        Array with the predicted probability of the positive class for every
        row of ``test``.
    """
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)

    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(
        train, labels, test_size=test_percent, random_state=23)

    if useTrainCV:
        xgb_param = model.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train[features], y_train)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=model.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print("n_estimators=")
        print(cvresult.shape[0])
        # number of rows in the CV result is used as the tree count
        model.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the same columns the CV above was run on
    model.fit(X_train[features], y_train)

    # hold-out predictions
    proba = model.predict_proba(X_test[features])
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    # ``booster()`` was replaced by ``get_booster()`` in newer xgboost
    # releases; support both.
    booster = (model.get_booster()
               if hasattr(model, 'get_booster') else model.booster())
    feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    # plt.show() is left to the caller

    # test predictions
    test_proba = model.predict_proba(test[features])
    test_preds = test_proba[:, 1]
    return test_preds
'colsample_bytree': 0.7, # 生成树时进行的列采样 'min_child_weight': 3, # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言 # ,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。 # 这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。 'silent': 0, # 设置成1则没有运行信息输出,最好是设置为0. # //无效'eta': 0.007, # 如同学习率 'seed': 1000, 'nthread': 7, # cpu 线程数 # 'eval_metric': 'auc' } # 训练模型 model = XGBClassifier() # 构建模型 model.get_params() #获取参数 model.set_params(**params) # 设置参数 # 开始训练 model.fit(aTrain_X, aTrain_Y, eval_metric='auc') # 保存模型 score0 = 0 # model.score(aTrain_X, aTrain_Y) score1 = model.score(aTest_X, aTest_Y) if score1 > 0.745: pickle.dump( model, open( '{}/qa_data/pre_trained_models/xgboost_qaquality_21_60dz_s{}.pkl' .format(cur_dir, round(score1, 3)), 'wb')) print('====> yes found good xgboost model') # print(i+1, score) # 打印每轮训练的准确率 # 打印准确率 和 召回率
colsample_bytree=0.8, seed=1) xgb_param = xgb1.get_xgb_params() xgtrain = xgb.DMatrix(x_train_ss, label=y_train) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], nfold=5, metrics='auc', early_stopping_rounds=50, verbose_eval=10) print('n_estimators', cvresult.shape[0]) print('test-auc:', cvresult.iloc[cvresult.shape[0] - 1, 0]) xgb1.set_params(n_estimators=cvresult.shape[0]) print('model', xgb1) #n_estimators 137 #test-auc: 0.8731962 tuned_parameters = { 'max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'min_child_weight': [1, 2, 3, 4, 5, 6] } xgb2 = XGBClassifier(learning_rate=0.1, n_estimators=137, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8,
xgb_param, xgtrain, num_boost_round=1000, #model.get_params()['n_estimators'], nfold=5, metrics='merror', early_stopping_rounds=50, stratified=True) print('\ntraining error') print(cvresult['train-merror-mean']) print('\nvalidation error') print(cvresult['test-merror-mean']) cvresult[['train-merror-mean', 'test-merror-mean']].plot() model.set_params(n_estimators=cvresult.shape[0]) #Fit the algorithm on the data model.fit(X_train, y_train, eval_metric='merror') #Predict training set: predictions = model.predict(X_test) predprob = model.predict_proba(X_test)[:, 1] # Print model report: print("\nModel Report") print("Training Accuracy : %.4g" % metrics.accuracy_score(y_train, model.predict(X_train))) print("Testing Accuracy : %.4g" % metrics.accuracy_score(y_test, model.predict(X_test)))