def train_xgb(model=False):
    '''Train an XGBoost classifier with grid-searched parameters.

    input:  model -- if True, return the unfitted classifier instead of training
    output: writes the fitted model to xgb-model.pkl
    '''
    print('train_xgb')
    global log
    params = grid_search_xgb(True)
    clf = XGBClassifier().set_params(**params)
    if model:
        return clf
    params = clf.get_params()
    # Record the chosen hyperparameters in the module-level log string
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'
    model = train(clf)
    with open('xgb-model.pkl', 'wb') as file:
        pickle.dump(model, file)
    print('train_xgb end')
    return
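# The function above relies on module-level helpers that are not shown. A
# hypothetical minimal scaffold, purely an assumption so the snippet can run
# (the real grid_search_xgb/train/log live elsewhere in the project):
import pickle
from xgboost import XGBClassifier
from sklearn.datasets import make_classification

log = ''
X_demo, y_demo = make_classification(n_samples=200, random_state=0)

def grid_search_xgb(full=True):
    # stand-in: would normally run a grid search and return the best parameters
    return {'learning_rate': 0.100, 'n_estimators': 100, 'max_depth': 5,
            'min_child_weight': 1, 'gamma': 0.0, 'subsample': 0.8,
            'colsample_bytree': 0.8}

def train(clf):
    # stand-in: would normally fit on the project's training set
    return clf.fit(X_demo, y_demo)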
def Create_Model(X_train, X_test, y_train, y_test, learning_rate, n_estimators,
                 max_depth, min_child_weight, gamma, subsample, colsample_bytree,
                 reg_alpha, eval_metric):
    ROCforest = XGBClassifier(learning_rate=learning_rate,
                              n_estimators=n_estimators,
                              max_depth=max_depth,
                              min_child_weight=min_child_weight,
                              gamma=gamma,
                              subsample=subsample,
                              colsample_bytree=colsample_bytree,
                              reg_alpha=reg_alpha,
                              objective='binary:logistic',
                              nthread=4,
                              seed=12)
    cv_folds = 5
    # Cross-validate to pick the effective number of boosting rounds
    xgb_param = ROCforest.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=ROCforest.get_params()['n_estimators'],
                      nfold=cv_folds, metrics=eval_metric)
    ROCforest.set_params(n_estimators=cvresult.shape[0])
    ROCforest.fit(X_train, y_train)
    return ROCforest
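# Hypothetical usage sketch for Create_Model above: the synthetic data, split, and
# parameter values are assumptions for illustration, not values from the original
# source.
import pandas as pd
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, n_features=10, random_state=12)
X = pd.DataFrame(X)   # Create_Model calls .values on the inputs
y = pd.Series(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=12)
model = Create_Model(X_train, X_test, y_train, y_test,
                     learning_rate=0.1, n_estimators=200, max_depth=4,
                     min_child_weight=1, gamma=0, subsample=0.8,
                     colsample_bytree=0.8, reg_alpha=0, eval_metric='auc')
print(model.score(X_test, y_test))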
def xgb_model(x1, y1):
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )
    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)
    columns = X_train.columns
    # Weight rescale: ratio of controls to cases for scale_pos_weight
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )
    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )
    # Cross-validate to choose the number of boosting rounds
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")
    imp = importances(xgb1, X_test, y_test)  # permutation importances
    imp = imp.reset_index()
    # Keep only features whose permutation importance is at least 0.0001
    imp_ = imp[imp["Importance"] >= 0.0001]
    feats = list(imp_["Feature"])
    return imp, feats
def cv(self, cache=False):
    if not cache:
        rows = get_db_data(300000)
        features = rows[:, :-1]
        ys = rows[:, -1]
        try:
            dump_svmlight_file(features, ys, 'catarse.txt.all')
        except Exception as inst:
            print(inst)
        dtrain = xgb.DMatrix(features, label=ys)
        X = features
        y = ys
    else:
        # Load from text file (also binary buffer generated by xgboost)
        dtrain = xgb.DMatrix('catarse_recommender/common/catarse.txt.all')
        data = load_svmlight_file('catarse_recommender/common/catarse.txt.all')
        X = data[0]
        y = data[1]
    xgb1 = XGBClassifier(learning_rate=0.01,
                         n_estimators=800,
                         max_depth=4,
                         nthread=8,
                         objective='binary:logistic',
                         seed=27)
    # Cross-validate to pick the number of boosting rounds
    xgb_param = xgb1.get_xgb_params()
    cvresult = xgb.cv(xgb_param, dtrain,
                      num_boost_round=xgb1.get_params()['n_estimators'],
                      nfold=5,
                      metrics=['logloss', 'error'],
                      early_stopping_rounds=20,
                      stratified=True,
                      shuffle=True)
    print(cvresult)
    with open("catarse_recommender/common/cv_result.obj", "wb") as filehandler:
        pickle.dump(cvresult, filehandler)
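# Minimal sketch (an assumption, not part of the original class) showing how the
# pickled cv result written above could be read back to inspect the chosen
# boosting round count.
import pickle

with open("catarse_recommender/common/cv_result.obj", "rb") as fh:
    cvresult = pickle.load(fh)
print("best num_boost_round:", cvresult.shape[0])
print(cvresult.tail(1))  # final train/test logloss and error rows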
def train_xgb(model=False):
    global log
    params = grid_search_xgb(True)
    clf = XGBClassifier().set_params(**params)
    if model:
        return clf
    params = clf.get_params()
    # Record the chosen hyperparameters in the module-level log string
    log += 'xgb'
    log += ', learning_rate: %.3f' % params['learning_rate']
    log += ', n_estimators: %d' % params['n_estimators']
    log += ', max_depth: %d' % params['max_depth']
    log += ', min_child_weight: %d' % params['min_child_weight']
    log += ', gamma: %.1f' % params['gamma']
    log += ', subsample: %.1f' % params['subsample']
    log += ', colsample_bytree: %.1f' % params['colsample_bytree']
    log += '\n\n'
    return train(clf)
d_train = xgb.DMatrix(x_train, y_train)
d_test = xgb.DMatrix(x_test, y_test)
watchlist = [(d_train, "train"), (d_test, "test")]

clf = XGBClassifier(learning_rate=0.1,
                    n_estimators=100,
                    objective="binary:logistic",
                    eval_metric="logloss",
                    min_child_weight=1,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    reg_alpha=0,
                    reg_lambda=1,
                    n_jobs=-1,
                    nthread=-1,
                    seed=3)
params = clf.get_params()

# Train with the native API so the watchlist and early stopping can be used
evals_res = {}
model_sklearn = xgb.train(params=params,
                          dtrain=d_train,
                          evals=watchlist,
                          evals_result=evals_res,
                          early_stopping_rounds=10,
                          verbose_eval=True)
y_hat = model_sklearn.predict(d_test)

# Plot the train/test logloss curves collected in evals_result
df_evals = pd.DataFrame({
    "loss_train": evals_res.get("train").get("logloss"),
    "loss_test": evals_res.get("test").get("logloss")
})
df_evals.plot()

# Threshold the predicted probabilities at 0.5
y_pred = np.where(y_hat <= 0.5, 0, 1)
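# Hedged follow-up sketch (not in the original): evaluating the thresholded
# predictions above with sklearn metrics; y_test is assumed to hold the true 0/1
# labels used to build d_test.
from sklearn.metrics import accuracy_score, confusion_matrix

print("accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))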
def modelfit(train, labels, test, features, useTrainCV=True, cv_folds=5,
             early_stopping_rounds=50):
    model = XGBClassifier(learning_rate=0.2,
                          n_estimators=1000,
                          max_depth=5,
                          min_child_weight=1,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          objective='binary:logistic',
                          scale_pos_weight=1,
                          seed=27)
    test_percent = 0.2
    X_train, X_test, y_train, y_test = train_test_split(train, labels,
                                                        test_size=test_percent,
                                                        random_state=23)
    # Cross-validate to choose the number of boosting rounds
    xgb_param = model.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[features], y_train)
    cvresult = xgb.cv(xgb_param, xgtrain,
                      num_boost_round=model.get_params()['n_estimators'],
                      nfold=cv_folds, metrics='auc',
                      early_stopping_rounds=early_stopping_rounds)
    print("n_estimators=")
    print(cvresult.shape[0])
    model.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    model.fit(X_train, y_train)

    # Held-out predictions
    proba = model.predict_proba(X_test)
    preds = proba[:, 1]
    score = roc_auc_score(y_test, preds)
    print("Area under ROC {0}".format(score))

    # Plot feature importances
    feat_imp = pd.Series(
        model.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    # plt.show()

    # Test predictions
    test_proba = model.predict_proba(test)
    test_preds = test_proba[:, 1]
    return test_preds
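# Hypothetical usage sketch for modelfit above on synthetic data; every name and
# value here is an assumption, not from the original pipeline. The imports cover
# both this sketch and the helper above.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, n_features=8, random_state=23)
features = ['f%d' % i for i in range(8)]
train = pd.DataFrame(X, columns=features)
labels = pd.Series(y)
test = pd.DataFrame(np.random.RandomState(23).normal(size=(50, 8)),
                    columns=features)

test_preds = modelfit(train, labels, test, features)  # P(class=1) per test row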
def xgb_model2(x1, y1, ft):
    ## Remove features that negatively impact the model - used after xgb_model2
    ## has already run once. Copy results from XGB2_FEATS into 'unwanted':
    # unwanted = {'fever_unknown', 'other_ext_injury', 'med_angiotensin_ii_i', 'wbc_disease', 'proc_124', 'acute_bronch', 'hemorrhoid', 'chf', 'poison_psycho', 'eye_inflam', 'lower_limb_fract', 'biliary_tract', 'other_bone_disease', 'med_antifungal', 'spondylosis', 'secndry_malig', 'other_joint', 'neoplasm_unspec', 'chest_pain_nos', 'acq_foot_deform', 'mood', 'nonmalig_breast', 'schizo', 'suicide', 'osteo_arth', 'other_connective', 'medical_eval'}
    # ft = [e for e in ft if e not in unwanted]
    print("XGB Features:\n", ft, "\n")
    x1 = x1.loc[:, ft]
    X_train, X_test, y_train, y_test = train_test_split(
        x1, y1, test_size=0.3, random_state=SEED
    )
    # Down-sample controls in training set, [1:1] case:control
    if subsample is True:
        X_train, y_train = subsample_df(X_train, y_train)
    # Implement SMOTE to balance training set, [1:1] case:control
    if smote is True:
        X_train, y_train = smote_sample(X_train, y_train)
    columns = X_train.columns
    # Weight rescale: ratio of controls to cases for scale_pos_weight
    ratio = float(
        np.sum(y_train["psych_hosp"].values == 0)
        / np.sum(y_train["psych_hosp"].values == 1)
    )
    # Instantiate the XGBClassifier and specify parameters
    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=500,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="binary:logistic",
        nthread=4,
        scale_pos_weight=ratio,
        seed=SEED,
    )
    # Cross-validate to choose the number of boosting rounds
    xgb_param = xgb1.get_xgb_params()
    xgtrain = xgb.DMatrix(X_train[columns].values, label=y_train["psych_hosp"].values)
    cvresult = xgb.cv(
        xgb_param,
        xgtrain,
        num_boost_round=xgb1.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=50,
    )
    xgb1.set_params(n_estimators=cvresult.shape[0])
    xgb_cv_score = cross_val_score(
        xgb1, X_train, np.ravel(y_train), cv=10, scoring="roc_auc"
    )
    # Fit the algorithm on the data
    xgb1.fit(X_train, np.ravel(y_train), eval_metric="auc")
    # Feature-dependence heatmap
    D = feature_dependence_matrix(X_train)
    viz1 = plot_dependence_heatmap(D, figsize=(11, 10))
    viz1.save("output/Psych_XGB_feat_depend_" + outfile)
    xgb_predict = xgb1.predict(X_test)
    print("=== All AUC Scores [CV - Train] ===")
    print(xgb_cv_score, "\n")
    print("=== Mean AUC Score [CV - Train] ===")
    print(xgb_cv_score.mean(), "\n")
    print("=== Confusion Matrix [Test] ===")
    print(confusion_matrix(y_test, xgb_predict), "\n")
    print("=== Classification Report [Test] ===")
    print(classification_report(y_test, xgb_predict), "\n")
    print("=== AUC Score [Test] ===")
    print(roc_auc_score(y_test, xgb_predict), "\n")
    imp = importances(xgb1, X_test, y_test)  # permutation importances
    viz2 = plot_importances(imp)
    viz2.save("output/Psych_XGB_feat_imp_" + outfile)
    imp = imp.reset_index()
    # Collect features with negative permutation importance (candidates to drop)
    imp_ = imp[imp["Importance"] < 0.00000]
    feats = list(imp_["Feature"])
    # ROC curve on the held-out test set
    xgb_roc_auc = roc_auc_score(y_test, xgb_predict)
    fpr, tpr, thresholds = roc_curve(y_test, xgb1.predict_proba(X_test)[:, 1])
    plt.figure()
    plt.plot(fpr, tpr, label="XGB Classifier (area = %0.3f)" % xgb_roc_auc)
    plt.plot([0, 1], [0, 1], "r--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver operating characteristic: Clinical Data Only [XGB]")
    plt.legend(loc="lower right")
    plt.savefig("output/ROC_Psych_XGB_" + outfile)
    plt.show()
    return imp, feats
train.drop(x, axis=1, inplace=True)
test.drop(x, axis=1, inplace=True)

y_train = train['TARGET'].values
X_train = train.drop(['ID', 'TARGET'], axis=1).values
y_test = test['ID']
X_test = test.drop(['ID'], axis=1).values

xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=600,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.6815,
                     colsample_bytree=0.701,
                     objective='binary:logistic',
                     nthread=4,
                     scale_pos_weight=1,
                     seed=27)

# Cross-validate to choose the number of boosting rounds
xgtrain = xgb.DMatrix(X_train, label=y_train)
cvresult = xgb.cv(xgb1.get_xgb_params(), xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics=['auc'],
                  early_stopping_rounds=50,
                  verbose_eval=False)
xgb1.set_params(n_estimators=cvresult.shape[0])
xgb1.fit(X_train, y_train, eval_metric='auc')

# Predict probabilities for the positive class and write the submission
output = xgb1.predict_proba(X_test)[:, 1]
submission = pd.DataFrame({"ID": y_test, "TARGET": output})
submission.to_csv("submission.csv", index=False)
class ParamTuner:
    def __init__(self, X_train, y_train):
        self._clf = XGBClassifier(learning_rate=0.01,
                                  n_estimators=1000,
                                  max_depth=5,
                                  min_child_weight=1,
                                  gamma=0,
                                  subsample=0.8,
                                  colsample_bytree=0.8,
                                  objective='binary:logistic',
                                  scale_pos_weight=1,
                                  seed=0)
        self._dtrain = xgb.DMatrix(X_train, label=y_train)
        self._X_train = X_train
        self._y_train = y_train

    @property
    def clf(self):
        return self._clf

    def show_params(self):
        logging.info("-" * 40)
        logging.info("current params:\n" + str(self._clf.get_params()))
        logging.info("-" * 40)

    def get_param(self, name):
        return self._clf.get_params()[name]

    def set_param(self, name, value):
        self._clf.set_params(**{name: value})

    def set_params(self, params):
        self._clf.set_params(**params)

    def tune_num_boost_round(self):
        logging.info("tune num_boost_round")
        history = xgb.cv(self._clf.get_params(),
                         dtrain=self._dtrain,
                         num_boost_round=NUM_BOOST_ROUND,
                         nfold=CV_FOLDS,
                         metrics='auc',
                         early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                         show_stdv=True)
        logging.info("tail of history:\n" + str(history.tail(1)))
        logging.info("learning rate: %f, best boosting num: %d" %
                     (self.get_param('learning_rate'), history.shape[0]))
        self.set_param('n_estimators', history.shape[0])
        self.show_params()

    def grid_search(self, param_grid):
        logging.info("grid search on %s" % param_grid.keys())
        gs = GridSearchCV(estimator=self._clf,
                          param_grid=param_grid,
                          scoring='roc_auc',
                          n_jobs=-1,
                          cv=CV_FOLDS)
        gs.fit(X=self._X_train, y=self._y_train)
        logging.info("cv_results:\n" + str(gs.cv_results_))
        logging.info("best_params: " + str(gs.best_params_))
        logging.info("best_score: " + str(gs.best_score_))
        self.set_params(gs.best_params_)
        self.show_params()
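# Hypothetical tuning sequence for ParamTuner above, on synthetic data; the
# constants and grid values are assumptions for illustration, not from the source.
import logging
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV

logging.basicConfig(level=logging.INFO)
NUM_BOOST_ROUND = 1000
CV_FOLDS = 5
EARLY_STOPPING_ROUNDS = 50

X_train, y_train = make_classification(n_samples=300, n_features=10, random_state=0)
tuner = ParamTuner(X_train, y_train)
tuner.tune_num_boost_round()                  # fix n_estimators via xgb.cv
tuner.grid_search({'max_depth': [3, 5, 7],    # then sweep tree complexity
                   'min_child_weight': [1, 3, 5]})
final_clf = tuner.clf.fit(X_train, y_train)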
def train_model_xgb_cv(X_train, X_test, y_train, y_test):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    xgb_sklearn = XGBClassifier(learning_rate=0.1,
                                n_estimators=300,
                                max_depth=3,
                                min_child_weight=1,
                                gamma=0.3,
                                subsample=0.6,
                                colsample_bytree=0.7,
                                objective='binary:logistic',
                                nthread=4,
                                seed=27,
                                reg_lambda=0.01)
    # Cross-validate to choose the number of boosting rounds
    xgb_params = xgb_sklearn.get_params()
    cvresult = xgb.cv(xgb_params, dtrain,
                      num_boost_round=xgb_params['n_estimators'],
                      nfold=5, metrics='auc', early_stopping_rounds=5)
    n_estimators = cvresult.shape[0]
    print("n_estimators: ", n_estimators)
    xgb_sklearn.set_params(n_estimators=n_estimators)
    xgb_sklearn.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    pred_y = xgb_sklearn.predict(X_test)
    pred_y_prob = xgb_sklearn.predict_proba(X_test)[:, 1]
    # AUC
    auc = roc_auc_score(y_test, pred_y_prob)
    print('AUC: ', auc)
    # Error rate
    score = xgb_sklearn.score(X_test, y_test)
    print('error: ', 1 - score)
    # Grid search over tree depth
    params = {'max_depth': [2, 3, 4, 5, 6, 7, 8]}
    model = GridSearchCV(
        estimator=XGBClassifier(
            learning_rate=0.1,
            n_estimators=300,
            # max_depth=3,
            min_child_weight=1,
            gamma=0.3,
            subsample=0.6,
            colsample_bytree=0.7,
            objective='binary:logistic',
            nthread=4,
            seed=27,
            reg_lambda=0.01),
        param_grid=params,
        cv=2)
    model.fit(np.array(X_train), np.array(y_train), eval_metric='auc')
    print(model.cv_results_, model.best_params_, model.best_score_)
    # Plot feature scores; requires a feature-map file (see sketch below)
    feat_imp = pd.Series(xgb_sklearn.get_booster().get_fscore(
        fmap='xgb.fmap')).sort_values(ascending=True)
    feat_imp.plot(kind='barh', color='black', legend=False, figsize=(10, 6))
    plt.ylabel('Feature name')
    plt.xlabel('Feature score')
    plt.savefig(
        'C:/Users/Administrator.USER-20161227PQ/Desktop/paper figure/figure5.png',
        dpi=300)
    plt.show()
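# The fmap='xgb.fmap' argument above expects a feature-map file on disk. A minimal
# sketch (an assumption, not in the original source) of writing one; 'q' marks a
# quantitative feature, and the names are assumed to come from the training
# DataFrame's columns.
def create_feature_map(features, path='xgb.fmap'):
    with open(path, 'w') as f:
        for i, feat in enumerate(features):
            f.write('{0}\t{1}\tq\n'.format(i, feat))

create_feature_map(X_train.columns)  # run before train_model_xgb_cv plots importances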
def set_parameters(set_name, golden_set, input_file):
    golden = str_to_bool(golden_set)
    #-------------------------------------------------------------------------
    # Read in the directory that is being run
    data_dir = set_name
    # Read in the parameters file and load it
    full_path = os.path.join(working_dir, "{0}".format(data_dir), 'params.yaml')
    stream = open(full_path, 'r')
    parameters = yaml.load(stream, Loader=yaml.FullLoader)
    # Read in Hypatia data as pandas dataframe (2D structure), drop HIP numbers
    df = pd.read_csv(input_file)
    set_number = set_name
    #-------------------------------------------------------------------------
    if golden:
        # Randomly relabel 10 known gas-giant hosts as non-hosts ("golden set")
        df2 = df.copy()
        df2.loc[df2[(df2['Exo'] == 1)
                    & (df2['MaxPMass'] > parameters['gas_giant_mass'])].
                sample(10, random_state=np.random.RandomState()).index,
                'Exo'] = 0
        yy = df2.loc[df2['Exo'] == 0].index
        zz = df.loc[df['Exo'] == 0].index
        changed = [ind for ind in yy if ind not in zz]
        changedhips = [df['HIP'][ind] for ind in changed]
        df = df2.copy()
        yy2 = df2.loc[df2['Exo'] == 0].index
        zz2 = df.loc[df['Exo'] == 0].index
        changed2 = [ind for ind in yy2 if ind not in zz2]  # always empty: df is now a copy of df2
    #-------------------------------------------------------------------------
    df.index = df['HIP']
    df['Exo'] = df['Exo'].astype('category')  # category = limited possibilities
    df['Multi'] = df['Multi'].astype('category')
    df['MaxPMass'] = df['MaxPMass'].astype(float)
    df['Sampled'] = np.zeros((df.shape[0]))
    df['Predicted'] = np.zeros((df.shape[0]))
    df = df.drop(['HIP'], axis=1)

    # Print a bunch of stuff in terminal
    print('Parameters used in simulation:')
    print('------------------------------')
    print('')
    for key in parameters.keys():
        print('{0} = {1}'.format(key, parameters[key]))
    cv_folds = parameters['cv_folds']
    early_stopping_rounds = parameters['early_stopping_rounds']
    N_iterations = parameters['N_iterations']
    N_samples = parameters['N_samples']
    gas_giant_mass = parameters['gas_giant_mass']
    features = parameters['features']
    relevant_columns = features + ['Exo', 'MaxPMass', 'Sampled', 'Predicted']
    # Redefine dataframe with the "relevant columns" and remove nans if dropnans==True in yaml
    if (parameters['dropnans']):
        df = df[relevant_columns].dropna()
    print('Number of samples used in simulation: {0}'.format(df.shape[0]))
    print('')

    # Define the confusion matrix and other arrays
    cfm = np.zeros((2, 2))
    auc_score_train = []
    precision_score_train = []
    feat_imp_train = pd.DataFrame(columns=features)
    probabilities_total = pd.DataFrame(index=df.index)
    print('iteration \t estimators')
    print('---------------------------')
    #---------------------------XGBOOST LOOP----------------------------------
    # Loop for all of the iterations (defined in yaml)
    for iteration in range(0, N_iterations):
        # Dataframe of N_samples (200 in the original runs) random hosts with giant planets
        df_iter_with_exo = df[(df['Exo'] == 1)
                              & (df['MaxPMass'] > gas_giant_mass)].sample(
                                  N_samples,
                                  random_state=np.random.RandomState())
        # Dataframe of N_samples random non-hosts
        df_iter_none_exo = df[df['Exo'] == 0].sample(
            N_samples, random_state=np.random.RandomState())
        # Make a new dataframe of the 400-star subset
        df_train = pd.concat([df_iter_with_exo, df_iter_none_exo], axis=0)
        # Make a dataframe of those stars NOT in the training set (to predict on)
        df_predict = df[~df.index.isin(df_train.index)]
        # The train dataframe with everything but the Exo column
        X = df_train.drop(['Exo'], axis=1)
        # The Exo column (and hips)
        Y = df_train.Exo
        # Note: using the gbtree booster
        alg = XGBClassifier(
            learning_rate=0.1,  # def=0.3, prevents overfitting and makes feature weights conservative
            n_estimators=1000,  # number of boosted trees to fit
            max_depth=6,  # def=6, max depth of tree/complexity
            min_child_weight=1,  # def=1, min weight needed to continue leaf partitioning
            gamma=0,  # def=0, minimum loss reduction required to make partition on a leaf
            subsample=0.8,  # def=1, subsample ratio of the training set
            colsample_bytree=0.8,  # def=1, subsample ratio of columns when making each tree
            objective='binary:logistic',  # def=linear, logistic regression for binary classification, outputs probability
            nthread=1,  # originally 8, but issue on laptop... def=max, number of parallel threads used to run xgboost
            scale_pos_weight=1,  # def=1, balance positive and negative weights
            seed=27)  # def=0, random number seed
        # Get input parameters of algorithm
        xgb_param = alg.get_xgb_params()
        # Construct training set matrix
        xgtrain = xgb.DMatrix(X[features].values, label=Y)
        # Cross-validation (CV) of xgboost to avoid overfitting
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
        print(iteration, '\t \t', cvresult.shape[0])
        alg.fit(X[features], Y, eval_metric='auc')
        dtrain_predictions = alg.predict(X[features])
        dtrain_predprob = alg.predict_proba(X[features])[:, 1]
        feat_imp = alg.get_booster().get_fscore()
        # See how the algorithm performs on the Exo data
        auc_score = metrics.roc_auc_score(Y, dtrain_predprob)
        precision_score = metrics.precision_score(Y, dtrain_predictions)
        metric_score = metrics.confusion_matrix(Y, dtrain_predictions)
        # Weighting function to ignore the null values
        normalized_features = pd.DataFrame(
            (1 - df_train[features].isnull().sum() / df_train[features].count())
            * pd.Series(alg.get_booster().get_fscore()),
            columns=[iteration]).T
        # Accumulate the feature importances and confusion matrix
        feat_imp_train = pd.concat([
            feat_imp_train,
            pd.DataFrame(feat_imp, columns=features, index=[iteration])
        ])
        feat_imp_train_normal = pd.concat(
            [feat_imp_train, normalized_features])
        auc_score_train.append(auc_score)
        precision_score_train.append(precision_score)
        cfm += metric_score
        df.loc[df_predict.index, 'Sampled'] += np.ones(len(df_predict.index))
        df.loc[df_predict.index, 'Predicted'] += alg.predict(df_predict[features])
        # Note: 'Prob' here stores this iteration's 0/1 prediction per star
        df.loc[df_predict.index, 'Prob'] = alg.predict(df_predict[features])
        values = df['Prob']
        probabilities_total = pd.concat(
            [probabilities_total, pd.Series(values, name=str(iteration))],
            axis=1)
        if (not iteration % 10):
            probabilities_total.to_pickle(
                '{0}/probabilities_total.pkl'.format(data_dir))
    #-------------------------------------------------------------------------
    # Calculate the confusion matrix
    cfm /= N_iterations
    cfm[0] /= cfm[0].sum()
    cfm[1] /= cfm[1].sum()
    # Print confusion matrix
    print(np.round(cfm, 3))
    df['Prob'] = df['Predicted'] / df['Sampled']

    ###########-------------------Output List of Planets------------------##########
    # Find the stars with >90% probability of hosting a planet, with the
    # Sampled, Predicted, and Prob columns
    planets = df[(df.Prob > .90) & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    print('Number of most probable planet hosts: {0}'.format(planets.shape[0]))
    # Sort the stars with predicted planets and save that file
    planetprobs = planets.sort_values(by='Prob', ascending=False)
    name = data_dir + '/figures/planet_probabilities' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name = data_dir+'/figures/planet_probabilities.csv'
    outfile = open(name, 'w')
    planetprobs.to_csv(outfile)
    outfile.close()
    # Create a second list with all stars in Hypatia and the probabilities
    planets2 = df[(df.Prob > .0) & (df.Exo == 0)][['Sampled', 'Predicted', 'Prob']]
    if golden:  # if 10 stars were randomly taken out
        changeddf = pd.DataFrame([])  # make empty dataframe
        for star in changedhips:  # loop over the 10 known planet hosts (defined at top)
            changeddf = pd.concat(
                [changeddf, planets2.loc[planets2.index == star]])
            if planets2.loc[planets2.index == star].empty:
                # Catch for when a known planet host was cut (because of abundances)
                temp = pd.Series([np.nan, np.nan, np.nan],
                                 index=['Sampled', 'Predicted', 'Prob'])
                temp.name = star
                # Append a blank row (with the star name as index)
                changeddf = pd.concat([changeddf, temp.to_frame().T])
        # Save golden set as a separate file with the date and time as a tag
        filename = '{0}/figures/goldenSetProbabilities' + str(
            datetime.today().strftime('-%h%d-%H%M')) + '.csv'
        changeddf.to_csv(filename.format(set_number), na_rep=" ")
    # Save the file with all of the probabilities
    planetprobs2 = planets2.sort_values(by='Prob', ascending=False)
    name2 = data_dir + '/figures/planet_probabilitiesAll' + str(
        datetime.today().strftime('-%h%d-%H%M')) + '.csv'
    #name2 = data_dir+'/figures/planet_probabilitiesAll.csv'
    outfile2 = open(name2, 'w')
    planetprobs2.to_csv(outfile2)
    outfile2.close()

    ###########------------------------Save Files------------------------##########
    print('Saving data files')
    # Save files
    feat_imp_train.to_pickle('{0}/features_train.pkl'.format(data_dir))
    feat_imp_train_normal.to_pickle(
        '{0}/features_train_normal.pkl'.format(data_dir))
    probabilities_total.to_pickle(
        '{0}/probabilities_total.pkl'.format(data_dir))
    df.to_pickle('{0}/df_info_all.pkl'.format(data_dir))
    np.save('{0}/auc_score_train.npy'.format(data_dir),
            np.array(auc_score_train))
    np.save('{0}/precision_score_train.npy'.format(data_dir),
            np.array(precision_score_train))
    np.save('{0}/cfm.npy'.format(data_dir), cfm)
    print('Simulation completed successfully.')
    if golden:
        print("Changed indices and HIP numbers:")
        print(changed)
        print(changedhips)
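# Hypothetical invocation of set_parameters above; the set name, golden-set flag
# string, and CSV path are placeholders, and working_dir plus the helper
# str_to_bool are assumed to be defined at module level as in the original pipeline.
if __name__ == '__main__':
    set_parameters('my_run', 'False', 'hypatia_data.csv')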
label_encoder = LabelEncoder()
encoded_y_train = label_encoder.fit_transform(y_train)

xgb = XGBClassifier(
    max_depth=args.max_depth,
    learning_rate=args.learning_rate,
    n_estimators=args.n_estimators,
    objective="multi:softprob",
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=args.subsample,
    colsample_bytree=args.colsample_bytree,
    colsample_bylevel=args.colsample_bylevel,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.5,
    missing=None,
    silent=True,
    nthread=-1,
    seed=42
)  # note: `xgb` shadows the usual `import xgboost as xgb` alias

# Modern sklearn KFold; shuffle must be enabled for random_state to apply
kf = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(xgb, x_train, encoded_y_train, cv=kf, scoring=ndcg_scorer)
print(xgb.get_params(), score.mean())
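# Hedged follow-up sketch (not in the original): after cross-validation, fit on the
# encoded target and map predictions back to the original labels; x_test is an
# assumed held-out matrix.
xgb.fit(x_train, encoded_y_train)
pred_labels = label_encoder.inverse_transform(xgb.predict(x_test))
print(pred_labels[:10])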
params = {
    # (keys preceding 'subsample' were truncated in the source snippet)
    'subsample': 0.7,  # randomly subsample the training instances
    'colsample_bytree': 0.7,  # column subsampling when building each tree
    'min_child_weight': 3,  # defaults to 1: the minimum sum of instance hessians (h) in a leaf.
    # For 0-1 classification with unbalanced classes, if h is around 0.01,
    # min_child_weight=1 means a leaf needs at least 100 samples.
    # This parameter strongly affects results: it bounds the minimum sum of
    # second derivatives in a leaf, and smaller values overfit more easily.
    'silent': 0,  # set to 1 to suppress run-time messages; best left at 0
    # (inactive) 'eta': 0.007,  # acts like a learning rate
    'seed': 1000,
    'nthread': 7,  # number of CPU threads
    # 'eval_metric': 'auc'
}

# Train the model
model = XGBClassifier()  # build the model
model.get_params()  # get the parameters
model.set_params(**params)  # set the parameters

# Start training
model.fit(aTrain_X, aTrain_Y, eval_metric='auc')

# Save the model if it scores well enough on the test set
score0 = 0  # model.score(aTrain_X, aTrain_Y)
score1 = model.score(aTest_X, aTest_Y)
if score1 > 0.745:
    pickle.dump(
        model,
        open(
            '{}/qa_data/pre_trained_models/xgboost_qaquality_21_60dz_s{}.pkl'
            .format(cur_dir, round(score1, 3)), 'wb'))
    print('====> yes found good xgboost model')
# print(i+1, score)  # print the accuracy of each training round
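# Hedged sketch (an assumption, not in the original): reloading one of the pickled
# models saved above for inference; the score suffix in the filename is unknown at
# load time, so glob is used to find the saved files.
import glob
import pickle

paths = glob.glob('{}/qa_data/pre_trained_models/xgboost_qaquality_21_60dz_s*.pkl'
                  .format(cur_dir))
with open(sorted(paths)[-1], 'rb') as f:  # lexicographically last ~ highest score tag
    loaded_model = pickle.load(f)
print(loaded_model.score(aTest_X, aTest_Y))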
encoded_y_train = label_encoder.fit_transform(y_train)
xgb = XGBClassifier(max_depth=args.max_depth,
                    learning_rate=args.learning_rate,
                    n_estimators=args.n_estimators,
                    objective="multi:softprob",
                    gamma=0,
                    min_child_weight=1,
                    max_delta_step=0,
                    subsample=args.subsample,
                    colsample_bytree=args.colsample_bytree,
                    colsample_bylevel=args.colsample_bylevel,
                    reg_alpha=0,
                    reg_lambda=1,
                    scale_pos_weight=1,
                    base_score=0.5,
                    missing=None,
                    silent=True,
                    nthread=-1,
                    seed=42)
# Modern sklearn KFold; shuffle must be enabled for random_state to apply
kf = KFold(n_splits=10, shuffle=True, random_state=42)
score = cross_val_score(xgb, x_train, encoded_y_train, cv=kf, scoring=ndcg_scorer)
print(xgb.get_params(), score.mean())
class XGBOOST(BaseEstimator):
    """
    This class inherits from BaseEstimator and wraps the XGBoost
    XGBClassifier or XGBRegressor estimator

    ...

    Attributes
    ----------
    estimator_parameters : dict
        parameter values
    name : string
        name of the estimator
    tune_parameters: dict
        Hyperparameter optimization settings

    Methods
    -------
    build(X)
        Instantiate the estimator, optimizing it if tune=true.
    """

    def __init__(self, X, Y, parameters, conveyor):
        # Initialize parent class
        try:
            BaseEstimator.__init__(self, X, Y, parameters, conveyor)
            LOG.debug('Initialize BaseEstimator parent class')
        except Exception as e:
            self.conveyor.setError(
                f'Error initializing BaseEstimator parent class with exception: {e}')
            LOG.error(
                f'Error initializing BaseEstimator parent class with exception: {e}')
            return

        try:
            import xgboost as xgb
            xgb.set_config(verbosity=0)
        except:
            LOG.error('XGboost not found, please revise your environment')

        # Load estimator parameters
        self.estimator_parameters = self.param.getDict('XGBOOST_parameters')

        # Load tune parameters
        self.tune_parameters = self.param.getDict('XGBOOST_optimize')

        if self.param.getVal('quantitative'):
            self.estimator_parameters['objective'] = 'reg:squarederror'
            self.name = "XGB-Regressor"
        else:
            self.estimator_parameters['objective'] = 'binary:logistic'
            self.name = "XGB-Classifier"

        # Missing value must be defined. Otherwise it returns 'nan', which cannot
        # be converted to JSON and produces trouble at different points
        self.estimator_parameters['missing'] = -99.99999

    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices'''
        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception as e:
            return False, 'XGboost not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))
        results.append(('model', 'model type', 'XGBOOST'))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):
            LOG.info("Optimizing XGBOOST estimator")
            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                else:
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    params = self.estimator.get_params()
                    params['num_class'] = 2
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'
        else:
            try:
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative XGBOOST model")
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                else:
                    LOG.info("Building Qualitative XGBOOST model")
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                # self.estimator.fit(X, Y)
                # self.feature_importances = self.estimator.feature_importances_
                self.regularBuild(X, Y)
            except Exception as e:
                return False, f'Exception building XGBOOST estimator with exception {e}'

        if not self.param.getVal('conformal'):
            return True, results

        self.estimator_temp = self.estimator
        success, error = self.conformalBuild(X, Y)
        if success:
            return True, results
        else:
            return False, error
# (tail of a truncated XGBClassifier(...) call that defines xgb4)
#     seed=27)

Xtrain = Xtrain.tocsr()
# Hold out a random 25% of rows as a watchlist set
mask = np.random.choice([False, True], Xtrain.shape[0], p=[0.75, 0.25])
not_mask = ~mask
#kf = list(StratifiedKFold(y, n_folds=10, shuffle=True, random_state=4242))[0]
#Xtr, Xte = Xtrain[kf[0], :], Xtrain[kf[1], :]
#ytr, yte = y[kf[0]], y[kf[1]]
#print('Training set: ' + str(Xtr.shape))
#print('Validation set: ' + str(Xte.shape))
dtrain = xgb.DMatrix(Xtrain[not_mask], label=y[not_mask])
dtrain_watch = xgb.DMatrix(Xtrain[mask], label=y[mask])
dtest = xgb.DMatrix(Xtest)
evallist = [(dtrain, 'train'), (dtrain_watch, 'eval')]
# Note: evallist still references the 75% subset DMatrix; rebinding dtrain below
# means training uses the full matrix, so the 'eval' fold overlaps the training data
dtrain = xgb.DMatrix(Xtrain, label=y)

params = xgb4.get_params()
params['num_class'] = 12
model = xgb.train(params=params, dtrain=dtrain, evals=evallist,
                  early_stopping_rounds=4, verbose_eval=1, num_boost_round=100)
#model = xgb.train(params=params, dtrain=dtrain, verbose_eval=1, num_boost_round=100)
preds = pd.DataFrame(model.predict(dtest),
                     index=gatest.index,
                     columns=targetencoder.classes_)
preds.to_csv('LT_pred_xgboost2.csv', index=True)
#model = modelfit(xgb4, y, predictors)
#dtest = xgb.DMatrix(Xtest)
#pred1 = pd.DataFrame(model.predict_proba(dtest), index=gatest.index, columns=targetencoder.classes_)
class XGBOOST(BaseEstimator):
    """
    This class inherits from BaseEstimator and wraps the XGBoost
    XGBClassifier or XGBRegressor estimator

    ...

    Attributes
    ----------
    estimator_parameters : dict
        parameter values
    name : string
        name of the estimator
    tune_parameters: dict
        Hyperparameter optimization settings

    Methods
    -------
    build(X)
        Instantiate the estimator, optimizing it if tune=true.
    """

    def __init__(self, X, Y, parameters, conveyor):
        # Initialize parent class
        try:
            BaseEstimator.__init__(self, X, Y, parameters, conveyor)
            LOG.debug('Initialize BaseEstimator parent class')
        except Exception as e:
            self.conveyor.setError(
                f'Error initializing BaseEstimator parent class with exception: {e}')
            LOG.error(
                f'Error initializing BaseEstimator parent class with exception: {e}')
            return

        # Load estimator parameters
        self.estimator_parameters = self.param.getDict('XGBOOST_parameters')

        # Load tune parameters
        self.tune_parameters = self.param.getDict('XGBOOST_optimize')

        if self.param.getVal('quantitative'):
            self.estimator_parameters['objective'] = 'reg:squarederror'
            self.name = "XGB-Regressor"
        else:
            self.estimator_parameters['objective'] = 'binary:logistic'
            self.name = "XGB-Classifier"

        # Missing value must be defined. Otherwise it returns 'nan', which cannot
        # be converted to JSON and produces trouble at different points
        self.estimator_parameters['missing'] = -99.99999

    def build(self):
        '''Build a new XGBOOST model with the X and Y numpy matrices'''
        try:
            from xgboost.sklearn import XGBClassifier
            from xgboost.sklearn import XGBRegressor
        except Exception as e:
            return False, 'XGboost not found, please revise your environment'

        # Make a copy of data matrices
        X = self.X.copy()
        Y = self.Y.copy()

        results = []
        results.append(('nobj', 'number of objects', self.nobj))
        results.append(('nvarx', 'number of predictor variables', self.nvarx))
        results.append(('model', 'model type', 'XGBOOST'))

        # If tune then call gridsearch to optimize the estimator
        if self.param.getVal('tune'):
            LOG.info("Optimizing XGBOOST estimator")
            try:
                # Check type of model
                if self.param.getVal('quantitative'):
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
                else:
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                    params = self.estimator.get_params()
                    params['num_class'] = 2
                    self.optimize(X, Y, self.estimator, self.tune_parameters)
            except Exception as e:
                return False, f'Exception optimizing XGBOOST estimator with exception {e}'
        else:
            try:
                if self.param.getVal('quantitative'):
                    LOG.info("Building Quantitative XGBOOST model")
                    self.estimator = XGBRegressor(**self.estimator_parameters)
                else:
                    LOG.info("Building Qualitative XGBOOST model")
                    self.estimator = XGBClassifier(**self.estimator_parameters)
                self.estimator.fit(X, Y)
                LOG.debug(self.estimator)
            except Exception as e:
                return False, f'Exception building XGBOOST estimator with exception {e}'

        if not self.param.getVal('conformal'):
            return True, results

        self.estimator_temp = self.estimator
        success, error = self.conformalBuild(X, Y)
        if success:
            return True, results
        else:
            return False, error

    ## Overriding of parent methods
    # def CF_quantitative_validation(self):
    #     ''' performs validation for conformal quantitative models '''

    # def CF_qualitative_validation(self):
    #     ''' performs validation for conformal qualitative models '''

    # def quantitativeValidation(self):
    #     ''' performs validation for quantitative models '''

    # def qualitativeValidation(self):
    #     ''' performs validation for qualitative models '''

    # def validate(self):
    #     ''' Validates the model and computes suitable model quality scoring values '''

    # def optimize(self, X, Y, estimator, tune_parameters):
    #     ''' optimizes a model using a grid search over a range of values for diverse parameters '''

    # def regularProject(self, Xb, results):
    #     ''' projects a collection of query objects in a regular model, for obtaining predictions '''

    # def conformalProject(self, Xb, results):
    #     ''' projects a collection of query objects in a conformal model, for obtaining predictions '''

    # def project(self, Xb, results):
    #     ''' Uses the X matrix provided as argument to predict Y '''
print('train.shape', x_train.shape, 'test.shape', x_test.shape)
xgb1 = XGBClassifier(learning_rate=0.1,
                     n_estimators=500,
                     max_depth=5,
                     min_child_weight=1,
                     gamma=0,
                     subsample=0.8,
                     colsample_bytree=0.8,
                     seed=1)
# Cross-validate to choose the number of boosting rounds
xgb_param = xgb1.get_xgb_params()
xgtrain = xgb.DMatrix(x_train_ss, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain,
                  num_boost_round=xgb1.get_params()['n_estimators'],
                  nfold=5, metrics='auc',
                  early_stopping_rounds=50, verbose_eval=10)
print('n_estimators', cvresult.shape[0])
print('test-auc:', cvresult.iloc[cvresult.shape[0] - 1, 0])
xgb1.set_params(n_estimators=cvresult.shape[0])
print('model', xgb1)
#n_estimators 137
#test-auc: 0.8731962

tuned_parameters = {
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
xgb1 = XGBClassifier(  # (arguments preceding n_estimators are truncated in the source)
    n_estimators=20000,
    max_depth=9,
    min_child_weight=15,
    gamma=0,
    subsample=0.9,
    colsample_bylevel=0.7,
    colsample_bytree=0.7,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)

# Tune n_estimators
modelfit(xgb1, train, predictors)
params = xgb1.get_params()
print(params)

# Tuning max_depth and min_child_weight
# param_test1 = {
#     'max_depth': range(3, 10, 2),
#     'min_child_weight': range(1, 6, 2)
# }
# gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.01, n_estimators=563, max_depth=5,
#                                                 min_child_weight=1, gamma=0, subsample=0.9,
#                                                 colsample_bytree=0.7, colsample_bylevel=0.7,
#                                                 objective='binary:logistic', nthread=4,
#                                                 scale_pos_weight=1, seed=27),
#                         param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
# gsearch1.fit(train[predictors], train[target])
# print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)