def xgb_diagnostics(X_train, X_test, y_train, y_test, params, n_repeats=30, plot_title=''):
    """Refit an XGBoost classifier with different seeds and collect test AUC-PR.

    The model is trained ``n_repeats`` times with ``random_state`` set to
    0, 1, ..., ``n_repeats - 1``. After each fit the ``aucpr`` value of the
    last boosting round on the test set is recorded.

    Args:
        X_train, y_train: training data; also registered as the first
            evaluation set.
        X_test, y_test: held-out data, registered as the second evaluation
            set.
        params: keyword arguments forwarded to ``XGBClassifier``.
        n_repeats: number of differently seeded fits.
        plot_title: accepted for interface compatibility; not used by this
            function.

    Returns:
        list: the final-round test ``aucpr`` of every repeat, in seed order.
    """
    # Both values are the same for every repeat, so build them once.
    metric = 'aucpr'
    eval_set = [(X_train, y_train), (X_test, y_test)]

    aucs_test = []
    for seed in range(n_repeats):
        model = XGBClassifier(**params, random_state=seed, n_jobs=-1)
        model.fit(X_train, y_train, eval_metric=[metric], eval_set=eval_set,
                  verbose=False)
        # 'validation_1' is the second entry of eval_set, i.e. the test data.
        aucs_test.append(model.evals_result()['validation_1'][metric][-1])
    return aucs_test
def xgb_clf(training, training_label, test, test_label):
    """Train an XGBoost classifier, plot its log-loss curves and score it.

    The model is fitted twice: once for the full number of rounds to record
    the complete train/test log-loss curves, and once more with early
    stopping. The curves of the first fit are saved to
    ``evolutionXGB<rounds>.pdf`` with a dashed vertical line at the
    ``best_iteration`` found by the second fit.

    Args:
        training, training_label: training features and labels.
        test, test_label: evaluation features and labels.

    Returns:
        tuple: ``(model, acc)`` - the early-stopped model and its accuracy
        on ``test``.
    """
    # Hyperparameters set with inspiration from:
    # https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
    rounds = 400
    eta = 0.2  # 0.01-2
    max_depth = 7  # 3-10 default: 6
    gamma = 0.1  # default 0, but should be tuned

    # Evaluation sets: index 0 is the training data, index 1 the test data.
    eval_set = [(training, training_label), (test, test_label)]

    # `verbose` is an argument of fit(), not a constructor parameter, so it is
    # passed to both fit() calls below instead of to XGBClassifier().
    model = XGBClassifier(seed=42, eta=eta, max_depth=max_depth, gamma=gamma,
                          n_estimators=rounds)

    # First pass: all rounds, to obtain the full metric history.
    model.fit(training, training_label, eval_metric='logloss',
              eval_set=eval_set, verbose=False)
    results = model.evals_result()

    # Second pass: early stopping; this is the model that is returned.
    model.fit(training, training_label, eval_metric='logloss',
              eval_set=eval_set, early_stopping_rounds=5, verbose=False)

    EPOCHS = len(results["validation_0"]['logloss'])
    iterations = np.arange(0, EPOCHS)

    # Saving evolution of metrics throughout the iterations
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(iterations, results["validation_0"]['logloss'],
            label="Train log loss", color='darkblue')
    ax.plot(iterations, results["validation_1"]['logloss'],
            label="Test log loss", color='darkorange')
    # Mark the round selected by early stopping.
    ax.plot([model.best_iteration, model.best_iteration], [0, 1], '--r')
    ax.set(xlabel="Iteration", ylabel=("Logarithmic loss"))
    ax.grid(True)
    ax.legend()
    fig.savefig('evolutionXGB' + str(EPOCHS) + '.pdf')

    y_pred = model.predict(test)
    predictions = [round(value) for value in y_pred]
    acc = accuracy_score(test_label, predictions)
    return model, acc
def hyperopt_train_test(params):
    """Objective for hyper-parameter search: best validation log loss.

    Fits an ``XGBClassifier`` on the module-level ``train`` frame with early
    stopping monitored on ``valid`` and returns the validation log loss of
    the best boosting round.

    Args:
        params: keyword arguments for ``XGBClassifier``; ``max_depth`` is
            cast to ``int`` before use.

    Returns:
        float: validation ``logloss`` at ``best_iteration``.
    """
    # Work on a copy so the caller's dictionary is not modified.
    params = dict(params)
    params['max_depth'] = int(params['max_depth'])

    eval_set = [(train[X_vars], train[y_var]),
                (valid[X_vars], valid[y_var])]
    xgb = XGBClassifier(**params)
    xgb.fit(train[X_vars], train[y_var],
            early_stopping_rounds=8,
            eval_metric='logloss',
            eval_set=eval_set)

    # Index by best_iteration rather than a fixed offset from the end: the
    # number of rounds recorded after the best one depends on whether early
    # stopping triggered, and a fixed [-8] fails when fewer than 8 rounds ran.
    valid_logloss = xgb.evals_result()['validation_1']['logloss']
    return valid_logloss[xgb.best_iteration]
def test_classifier(self):
    """Fit a small multi-class classifier and check that metrics are recorded."""
    n_samples, n_features, n_classes = 100, 28, 10

    # Random features and integer labels in [0, n_classes).
    X_train = np.random.random((n_samples, n_features))
    y_train = np.random.randint(n_classes, size=(n_samples, 1))
    X_test = np.random.random((n_samples, n_features))
    y_test = np.random.randint(n_classes, size=(n_samples, 1))

    watchlist = [(X_train, y_train), (X_test, y_test)]
    classifier = XGBClassifier(n_estimators=3, use_label_encoder=False)
    classifier.fit(X_train, y_train, eval_set=watchlist, eval_metric='mlogloss')

    # The first evaluation set is reported under the key "validation_0".
    history = classifier.evals_result()
    self.assertIn("validation_0", history)
def performance(model: XGBClassifier):
    """Show the log-loss history and the feature importance of a fitted model.

    Expects ``model`` to have been fitted with two evaluation sets and the
    ``logloss`` metric, since ``validation_0`` and ``validation_1`` are read.
    """
    # Metric history recorded during fit().
    history = model.evals_result()
    train_loss = history['validation_0']['logloss']
    rounds = range(0, len(train_loss))

    # Log-loss curves for both evaluation sets.
    fig, ax = plt.subplots()
    ax.plot(rounds, train_loss, label='Train')
    ax.plot(rounds, history['validation_1']['logloss'], label='Test')
    ax.legend()
    plt.ylabel('Log Loss')
    plt.title('XGBoost Log Loss')
    plt.show()

    # Model summary followed by the feature-importance chart.
    print(model)
    plot_importance(model)
    plt.show()
def XGB_learning(data, labels):
    """Fit a default XGBoost classifier on an 80/20 split of the data.

    Returns:
        tuple: ``(accuracy, learn_curve)`` where ``accuracy`` is measured on
        the held-out 20% and ``learn_curve`` is ``1 - error`` on that split
        after every boosting round.
    """
    split = train_test_split(data, labels, test_size=0.2, random_state=7)
    data_train, data_test, labels_train, labels_test = split

    # Fit the model on the training part, tracking both splits.
    model = XGBClassifier()
    model.fit(
        data_train,
        labels_train,
        eval_metric=["error", "logloss"],
        eval_set=[(data_train, labels_train), (data_test, labels_test)],
        verbose=False,
    )

    predictions = [round(value) for value in model.predict(data_test)]
    results = model.evals_result()
    accuracy = accuracy_score(labels_test, predictions)

    # 'validation_1' is the held-out split (second entry of eval_set).
    test_error = np.array(results['validation_1']['error'])
    learn_curve = 1 - test_error
    return accuracy, learn_curve
def final_xgb(X_train, y_train, X_test, y_test, scale_pos_weight, best_params, analysis):
    """Fit the final XGBoost model and save its error / AUC training curves.

    Args:
        X_train, y_train: training data (first evaluation set).
        X_test, y_test: validation data (second evaluation set).
        scale_pos_weight: value for the ``scale_pos_weight`` parameter.
        best_params: keyword arguments used to construct ``XGBClassifier``.
        analysis: label inserted into the name of the saved figure.

    Returns:
        The fitted ``XGBClassifier``.

    Side effects:
        Writes ``<fig_dir>/<analysis>_final_xgb_model.png`` (``fig_dir`` is a
        module-level name).
    """
    xgb = XGBClassifier(**best_params)
    # The parameter is spelled `n_jobs`; the former `njobs` did not set the
    # number of threads.
    xgb.set_params(n_jobs=4, random_state=0, objective='binary:logistic',
                   scale_pos_weight=scale_pos_weight)

    eval_set = [(X_train, y_train), (X_test, y_test)]
    eval_metric = ["error", "auc"]
    xgb.fit(X_train, y_train, eval_metric=eval_metric, eval_set=eval_set,
            verbose=0)
    results = xgb.evals_result()

    # One panel per metric: (metric key, train label, validation label,
    # title, y-axis label).
    panels = (
        ('error', 'Train Error', 'Validation Error',
         "Final XGBoost Error", "Error"),
        ('auc', 'Train AUC-ROC', 'Validation AUC-ROC',
         "Final XGBoost AUC-ROC", "AUC"),
    )
    fig1, axes1 = plt.subplots(figsize=(10, 8), nrows=1, ncols=2)
    for axis, (metric, train_label, valid_label, title, ylabel) in zip(axes1, panels):
        axis.plot(results['validation_0'][metric], label=train_label)
        axis.plot(results['validation_1'][metric], label=valid_label)
        axis.set_title(title)
        axis.set_xlabel("Iteration")
        axis.set_ylabel(ylabel)
        axis.legend()

    fig1.tight_layout()
    fig1.savefig(fig_dir + '/{}_final_xgb_model.png'.format(analysis),
                 format='png', dpi=300, transparent=False)
    return xgb
# Train a classifier with early stopping, report its scores and save it.
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.1)

# Metric options noted by the author: rmse, mae, logloss, error, auc
xgb.fit(
    x_train,
    y_train,
    verbose=True,
    eval_metric="rmse",
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=20,
)

y_pre = xgb.predict(x_test)
acc = accuracy_score(y_test, y_pre)
score = xgb.score(x_test, y_test)
result = xgb.evals_result()

print(__file__)
print(result)
print("acc")
print(acc)
print("score")
print(score)

# The standard-library pickle module could be used for saving as well;
# joblib is used here.
import joblib

joblib.dump(xgb, "./model/xgb_save/cancer.joblib.dat")
print("start")
X_train, y_train, test_size=test_size, random_state=seed) num_round = 100 bst = XGBClassifier(max_depth=2, learning_rate=0.1, n_estimators=num_round, silent=True, objective='binary:logistic') eval_set = [(X_train_part, y_train_part), (X_validate, y_validate)] bst.fit(X_train_part, y_train_part, eval_metric=["error", "logloss"], eval_set=eval_set, verbose=False) results = bst.evals_result() # epochs = len(results['validation_0']['error']) x_axis = range(0, num_round) # plot log loss fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['logloss'], label='Train') ax.plot(x_axis, results['validation_1']['logloss'], label='Test') ax.legend() pyplot.ylabel('Log Loss') pyplot.title('XGBoost Log Loss') pyplot.show() # plot classification error fig, ax = pyplot.subplots() ax.plot(x_axis, results['validation_0']['error'], label='Train') ax.plot(x_axis, results['validation_1']['error'], label='Test')
# Breast-cancer classification with XGBoost; the fitted model is pickled.
# y = dataset.target
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=66)

# model = XGBRegressor(n_estimators=5,learning_rate=0.1)
model = XGBClassifier(n_estimators=5, learning_rate=0.1)

# Metric options noted by the author: rmse, mae, logloss, error, auc
# (author's open question: does 'error' correspond to accuracy?)
model.fit(x_train, y_train, verbose=True, eval_metric='rmse',
          eval_set=[(x_train, y_train), (x_test, y_test)])

result = model.evals_result()  # per-round evaluation history
# print(f'result : {result}')

y_pred = model.predict(x_test)
# accuracy_score expects (y_true, y_pred).
acc = accuracy_score(y_test, y_pred)
print(f'acc1 : {acc}')
# score = model.score(x_test,y_test)
# print(f"r2 : {score}")

import pickle

# Use a context manager so the file is flushed and closed after the dump.
with open('./model/xgb_save/cancer.pickle.dat', 'wb') as model_file:
    pickle.dump(model, model_file)
print("save complete!!")
class XGBooster(object):
    """
        The main class to train/encode/explain XGBoost models.

        An instance is created from one of three sources: a dataset
        (from_data), a previously saved model (from_model), or a previously
        saved encoding (from_encoding).
    """

    def __init__(self, options, from_data=None, from_model=None,
                 from_encoding=None):
        """
            Constructor.

            options: command-line options object. The attributes read in
                this class are seed, use_categorical, n_estimators,
                maxdepth, testsplit, output, files, verb and smallest.
            from_data: dataset object providing `samps` and `names` and,
                when categorical features are used, `categorical_features`,
                `categorical_names` and `class_names`.
            from_model: file name of a model stored by save_datainfo().
            from_encoding: file name of an encoding read via SMTEncoder.

            Side effects: seeds numpy's global RNG, creates the output
            directories and prints a round-trip check of the feature
            encoding.
        """

        assert from_data or from_model or from_encoding, \
                'At least one input file should be specified'

        self.init_stime = resource.getrusage(resource.RUSAGE_SELF).ru_utime
        self.init_ctime = resource.getrusage(resource.RUSAGE_CHILDREN).ru_utime

        # saving command-line options
        self.options = options
        self.seed = self.options.seed
        np.random.seed(self.seed)

        if from_data:
            self.use_categorical = self.options.use_categorical
            # saving data
            self.data = from_data
            dataset = np.asarray(self.data.samps, dtype=np.float32)

            # split data into X and y (the last column is the label)
            self.feature_names = self.data.names[:-1]
            self.nb_features = len(self.feature_names)

            self.X = dataset[:, 0:self.nb_features]
            self.Y = dataset[:, self.nb_features]
            self.num_class = len(set(self.Y))
            self.target_name = list(range(self.num_class))

            param_dist = {
                'n_estimators': self.options.n_estimators,
                'max_depth': self.options.maxdepth
            }
            if self.num_class == 2:
                param_dist['objective'] = 'binary:logistic'

            self.model = XGBClassifier(**param_dist)

            # split data into train and test sets
            self.test_size = self.options.testsplit
            if self.test_size > 0:
                self.X_train, self.X_test, self.Y_train, self.Y_test = \
                        train_test_split(self.X, self.Y,
                                         test_size=self.test_size,
                                         random_state=self.seed)
            else:
                self.X_train = self.X
                self.X_test = []  # need a fix
                self.Y_train = self.Y
                self.Y_test = []  # need a fix

            # check if we have info about categorical features
            if self.use_categorical:
                self.categorical_features = from_data.categorical_features
                self.categorical_names = from_data.categorical_names
                self.target_name = from_data.class_names

                ####################################
                # this is a set of checks to make sure that we use the same
                # as anchor encoding
                cat_names = sorted(self.categorical_names.keys())
                assert (cat_names == self.categorical_features)

                # one fitted one-hot encoder per categorical feature index
                self.encoder = {}
                for i in self.categorical_features:
                    self.encoder.update(
                        {i: OneHotEncoder(categories='auto', sparse=False)})
                    self.encoder[i].fit(self.X[:, [i]])
            else:
                self.categorical_features = []
                self.categorical_names = []
                self.encoder = []

        elif from_model:
            # load model and the data stored next to it
            self.load_datainfo(from_model)
            if (self.use_categorical is False) and (self.options.use_categorical is True):
                print(
                    "Error: Note that the model is trained without categorical features info. Please do not use -c option for predictions"
                )
                exit()

        elif from_encoding:
            # encoding, feature names, and number of classes
            # are read from an input file
            # NOTE(review): this branch does not set use_categorical,
            # nb_features or X_train, which mapping_features() and
            # test_encoding_transformes() below read - confirm this path
            # is exercised.
            enc = SMTEncoder(None, None, None, self, from_encoding)
            self.enc, self.intvs, self.imaps, self.ivars, self.feature_names, \
                    self.num_class = enc.access()

        # create the output directory (no error if it already exists)
        os.makedirs(options.output, exist_ok=True)

        self.mapping_features()
        #################
        self.test_encoding_transformes()

        # create extra file names
        bench_name = os.path.splitext(os.path.basename(options.files[0]))[0]
        bench_dir_name = options.output + "/" + bench_name
        os.makedirs(bench_dir_name, exist_ok=True)

        self.basename = (os.path.join(
            bench_dir_name, bench_name + "_nbestim_" +
            str(options.n_estimators) + "_maxdepth_" +
            str(options.maxdepth) + "_testsplit_" + str(options.testsplit)))

        self.modfile = self.basename + '.mod.pkl'
        self.mod_plainfile = self.basename + '.mod.txt'
        self.resfile = self.basename + '.res.txt'
        self.encfile = self.basename + '.enc.txt'
        self.expfile = self.basename + '.exp.txt'

    def form_datefile_name(self, modfile):
        """
            Return the name of the data file stored next to a model file.
        """
        data_suffix = '.splitdata.pkl'
        return modfile + data_suffix

    def pickle_save_file(self, filename, data):
        """
            Pickle `data` into `filename`; print a message and exit on
            failure.
        """
        try:
            # the context manager closes the file even if dumping fails
            with open(filename, "wb") as f:
                pickle.dump(data, f)
        except Exception:
            print("Cannot save to file", filename)
            exit()

    def pickle_load_file(self, filename):
        """
            Load and return a pickled object from `filename`; print a
            message and exit on failure.

            NOTE: unpickling executes code embedded in the file, so only
            files written by this tool should be loaded.
        """
        try:
            with open(filename, "rb") as f:
                return pickle.load(f)
        except Exception:
            print("Cannot load from file", filename)
            exit()

    def save_datainfo(self, filename):
        """
            Save the model to `filename` and the data/encoder information
            to the companion file given by form_datefile_name().
        """
        print("saving model to ", filename)
        self.pickle_save_file(filename, self.model)

        filename_data = self.form_datefile_name(filename)
        print("saving data to ", filename_data)
        samples = {
            "X": self.X,
            "Y": self.Y,
            "X_train": self.X_train,
            "Y_train": self.Y_train,
            "X_test": self.X_test,
            "Y_test": self.Y_test,
            "feature_names": self.feature_names,
            "target_name": self.target_name,
            "num_class": self.num_class,
            "categorical_features": self.categorical_features,
            "categorical_names": self.categorical_names,
            "encoder": self.encoder,
            "use_categorical": self.use_categorical,
        }
        self.pickle_save_file(filename_data, samples)

    def load_datainfo(self, filename):
        """
            Restore the model from `filename` and the data/encoder
            information from its companion data file.
        """
        print("loading model from ", filename)
        self.model = self.pickle_load_file(filename)

        datafile = self.form_datefile_name(filename)
        print("loading data from ", datafile)
        loaded_data = self.pickle_load_file(datafile)
        self.X = loaded_data["X"]
        self.Y = loaded_data["Y"]
        self.X_train = loaded_data["X_train"]
        self.X_test = loaded_data["X_test"]
        self.Y_train = loaded_data["Y_train"]
        self.Y_test = loaded_data["Y_test"]
        self.feature_names = loaded_data["feature_names"]
        self.target_name = loaded_data["target_name"]
        self.num_class = loaded_data["num_class"]
        self.nb_features = len(self.feature_names)
        self.categorical_features = loaded_data["categorical_features"]
        self.categorical_names = loaded_data["categorical_names"]
        self.encoder = loaded_data["encoder"]
        self.use_categorical = loaded_data["use_categorical"]

    def train(self, outfile=None):
        """
            Train a tree ensemble using XGBoost.
        """

        return self.build_xgbtree(outfile)

    def encode(self, test_on=None):
        """
            Encode a tree ensemble trained previously.

            If `test_on` is given, the encoder is additionally tested on
            that sample. The encoding is saved to self.encfile.
        """

        encoder = SMTEncoder(self.model, self.feature_names, self.num_class,
                             self)
        self.enc, self.intvs, self.imaps, self.ivars = encoder.encode()

        if test_on:
            encoder.test_sample(np.array(test_on))

        encoder.save_to(self.encfile)

    def explain(self, sample, use_lime=False, use_anchor=False,
                use_shap=False, expl_ext=None, prefer_ext=False, nof_feats=5):
        """
            Explain a prediction made for a given sample with a previously
            trained tree ensemble.

            `use_lime`, `use_anchor` and `use_shap`, when truthy, are called
            as functions producing the explanation; they are tried in that
            order. Otherwise the SMT-based explainer is used (created on
            first use and cached in self.x).
        """

        if use_lime:
            expl = use_lime(self, sample=sample, nb_samples=5,
                            nb_features_in_exp=nof_feats)
        elif use_anchor:
            expl = use_anchor(self, sample=sample, nb_samples=5,
                              nb_features_in_exp=nof_feats, threshold=0.95)
        elif use_shap:
            expl = use_shap(self, sample=sample,
                            nb_features_in_exp=nof_feats)
        else:
            if not hasattr(self, 'x'):
                self.x = SMTExplainer(self.enc, self.intvs, self.imaps,
                                      self.ivars, self.feature_names,
                                      self.num_class, self.options, self)

            expl = self.x.explain(np.array(sample), self.options.smallest,
                                  expl_ext, prefer_ext)

        # returning the explanation
        return expl

    def validate(self, sample, expl):
        """
            Make an attempt to show that a given explanation is optimistic.
        """

        # there must exist an encoding
        if not hasattr(self, 'enc'):
            encoder = SMTEncoder(self.model, self.feature_names,
                                 self.num_class, self)
            self.enc, _, _, _ = encoder.encode()

        if not hasattr(self, 'v'):
            self.v = SMTValidator(self.enc, self.feature_names,
                                  self.num_class, self)

        # try to compute a counterexample
        return self.v.validate(np.array(sample), expl)

    def transform(self, x):
        """
            Map samples to the model's input space: categorical columns are
            replaced by their one-hot encoding, other columns are kept.
            A 1-D sample is treated as a single row. Empty input and the
            non-categorical case return `x` unchanged.
        """
        if len(x) == 0:
            return x
        if len(x.shape) == 1:
            x = np.expand_dims(x, axis=0)
        if not self.use_categorical:
            return x

        assert (self.encoder != [])
        tx = []
        for i in range(self.nb_features):
            if i in self.categorical_features:
                # encoders exist only for categorical features, so the
                # encoder is looked up inside this branch
                self.encoder[i].drop = None
                tx_aux = self.encoder[i].transform(x[:, [i]])
                tx_aux = np.vstack(tx_aux)
                tx.append(tx_aux)
            else:
                tx.append(x[:, [i]])
        return np.hstack(tx)

    def transform_inverse(self, x):
        """
            Inverse of transform(): map one-hot encoded rows back to one
            value per original feature. In the categorical case a list of
            1-D arrays is returned, one per input row.
        """
        if len(x) == 0:
            return x
        if len(x.shape) == 1:
            x = np.expand_dims(x, axis=0)
        if not self.use_categorical:
            return x

        assert (self.encoder != [])
        inverse_x = []
        for xi in x:
            inverse_xi = np.zeros(self.nb_features)
            # consume the encoded row from the left, feature by feature
            for f in range(self.nb_features):
                if f in self.categorical_features:
                    nb_values = len(self.categorical_names[f])
                    v = np.expand_dims(xi[:nb_values], axis=0)
                    iv = self.encoder[f].inverse_transform(v)
                    # the decoded result holds a single element; store it
                    # as a scalar
                    inverse_xi[f] = np.asarray(iv).item()
                    xi = xi[nb_values:]
                else:
                    inverse_xi[f] = xi[0]
                    xi = xi[1:]
            inverse_x.append(inverse_xi)
        return inverse_x

    def transform_inverse_by_index(self, idx):
        """
            Return the (feature name, value index) pair of encoded column
            `idx`, or None (with a warning) if the index is unknown.
        """
        if idx in self.extended_feature_names:
            return self.extended_feature_names[idx]
        print("Warning there is no feature {} in the internal mapping".
              format(idx))
        return None

    def transform_by_value(self, feat_value_pair):
        """
            Return the encoded column index of the first entry equal to
            `feat_value_pair`, or None (with a warning) if there is none.
        """
        for key, value in self.extended_feature_names.items():
            if value == feat_value_pair:
                return key
        print("Warning there is no value {} in the internal mapping".format(
            feat_value_pair))
        return None

    def mapping_features(self):
        """
            Build the mapping from encoded column index to
            (feature name, value index) and the matching "f<i>" /
            "f<i>_<j>" column names.
        """
        self.extended_feature_names = {}
        self.extended_feature_names_as_array_strings = []
        counter = 0
        for i in range(self.nb_features):
            if self.use_categorical and i in self.categorical_features:
                # one encoded column per category of the feature
                for j, _ in enumerate(self.encoder[i].categories_[0]):
                    self.extended_feature_names.update(
                        {counter: (self.feature_names[i], j)})
                    self.extended_feature_names_as_array_strings.append(
                        "f{}_{}".format(i, j))
                    counter = counter + 1
            else:
                self.extended_feature_names.update(
                    {counter: (self.feature_names[i], None)})
                self.extended_feature_names_as_array_strings.append(
                    "f{}".format(i))
                counter = counter + 1

    def readable_sample(self, x):
        """
            Return `x` with categorical values replaced by their names.
        """
        readable_x = [
            self.categorical_names[i][int(v)]
            if i in self.categorical_features else v
            for i, v in enumerate(x)
        ]
        return np.asarray(readable_x)

    def test_encoding_transformes(self):
        """
            Check on the first training sample that transform() followed by
            transform_inverse() restores the sample; print the steps.
        """
        # test encoding
        X = self.X_train[[0], :]

        print("Sample of length", len(X[0]), " : ", X)
        enc_X = self.transform(X)
        print("Encoded sample of length", len(enc_X[0]), " : ", enc_X)
        inv_X = self.transform_inverse(enc_X)
        print("Back to sample", inv_X)
        print("Readable sample", self.readable_sample(inv_X[0]))
        assert ((inv_X == X).all())

        if self.options.verb > 1:
            for i in range(len(self.extended_feature_names)):
                print(i, self.transform_inverse_by_index(i))
            for value in self.extended_feature_names.values():
                print(value, self.transform_by_value(value))

    def transfomed_sample_info(self, i):
        """
            Print the categories of the encoder of feature `i`.
        """
        # NOTE(review): the original referenced an undefined name `enc`;
        # the encoder of feature i is presumably what was meant - confirm.
        print(self.encoder[i].categories_)

    @staticmethod
    def _accuracy_from_evals(evals_result, eval_name):
        """
            Return 1 - (last recorded error) of evaluation set `eval_name`,
            rounded to two digits, preferring 'merror' over 'error'.
            Return None if neither metric is available for that set.
        """
        metrics = evals_result.get(eval_name, {})
        for metric in ('merror', 'error'):
            values = metrics.get(metric)
            if values is not None and len(values) > 0:
                return round(1 - values[-1], 2)
        return None

    def build_xgbtree(self, outfile=None):
        """
            Build an ensemble of trees.

            Fits the model on the transformed training data, saves model,
            data and a plain-text dump, checks the TreeEnsemble
            re-implementation against the model on the first ten training
            samples and writes one LaTeX table row of results.

            Returns (train_accuracy, test_accuracy, model).
        """

        if outfile is None:
            outfile = self.modfile
        else:
            self.datafile = self.form_datefile_name(outfile)

        # fit model on training data; the test set is evaluated only if
        # there is one
        eval_set = [(self.transform(self.X_train), self.Y_train)]
        if len(self.X_test) > 0:
            eval_set.append((self.transform(self.X_test), self.Y_test))

        print("start xgb")
        self.model.fit(self.transform(self.X_train), self.Y_train,
                       eval_set=eval_set, verbose=self.options.verb)
        print("end xgb")

        evals_result = self.model.evals_result()

        ########## saving model
        self.save_datainfo(outfile)
        print("saving plain model to ", self.mod_plainfile)
        self.model.get_booster().dump_model(self.mod_plainfile)

        # sanity check of the TreeEnsemble predictions against the model
        ensemble = TreeEnsemble(self.model,
                                self.extended_feature_names_as_array_strings,
                                nb_classes=self.num_class)

        y_pred_prob = self.model.predict_proba(
            self.transform(self.X_train[:10]))
        y_pred_prob_compute = ensemble.predict(
            self.transform(self.X_train[:10]), self.num_class)

        assert (np.absolute(y_pred_prob_compute - y_pred_prob).sum() <
                0.01 * len(y_pred_prob))

        ### accuracy
        train_accuracy = self._accuracy_from_evals(evals_result,
                                                   'validation_0')
        if train_accuracy is None:
            # same exception type as the former `assert (False)`, but not
            # stripped when running with -O
            raise AssertionError("no error metric recorded for training data")

        test_accuracy = self._accuracy_from_evals(evals_result,
                                                  'validation_1')
        if test_accuracy is None:
            print("no results test data")
            test_accuracy = 0

        #### saving
        print("saving results to ", self.resfile)
        with open(self.resfile, 'w') as f:
            f.write("{} & {} & {} &{} &{} & {} \\\\ \n \\hline \n".format(
                os.path.basename(self.options.files[0]).replace("_", "-"),
                train_accuracy, test_accuracy, self.options.n_estimators,
                self.options.maxdepth, self.options.testsplit))

        print("Train accuracy: %.2f%%" % (train_accuracy * 100.0))
        print("Test accuracy: %.2f%%" % (test_accuracy * 100.0))

        return train_accuracy, test_accuracy, self.model
is_final, args.model, args.scale, args.drop, args.remarks or args.neighbors) if not args.final: eval_predicted_proba = model.predict_proba(eval_data) eval_predicted = model.predict(eval_data) # Splits into classes from 0-10 (11 classes) onehot = to_categorical(eval_labels).astype(int) eval_onehot = onehot[:, 1:] # Trim unnecessary first column (class "0") ll = log_loss(eval_onehot, eval_predicted_proba) acc = accuracy_score(eval_labels, eval_predicted) print("Validation log-loss and accuracy: {:.5f} {:.5f}".format(ll, acc)) # Plot if args.model in ["XGBoost"]: train_metrics = model.evals_result()['validation_0'] test_metrics = model.evals_result()['validation_1'] epochs = len(train_metrics['merror']) x_axis = range(0, epochs) # plot log loss fig, ax = plt.subplots() ax.plot(x_axis, train_metrics['mlogloss'], label='Train') ax.plot(x_axis, test_metrics['mlogloss'], label='Test') ax.legend() plt.ylabel('Log Loss') plt.title('{} - Log Loss'.format(args.model)) plt.savefig("img/logloss_{}.png".format(uid)) plt.show() # plot classification error fig, ax = plt.subplots() ax.plot(x_axis, train_metrics['merror'], label='Train')
# Train an XGBoost classifier on a hold-out split and report its log loss.
currentDT = datetime.datetime.now()
print(currentDT.strftime("%I:%M:%S %p"))

X = x_train3[model_features]
y = x_train3.passed
X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=1)

# NOTE: `cv=10` was previously passed here. XGBClassifier has no such
# parameter and performs no cross-validation, so it has been removed; the
# model is evaluated on the single hold-out split below.
xgbc_model = XGBClassifier(n_estimators=2500, max_depth=6, learning_rate=.01,
                           n_jobs=-1)
xgbc_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
               eval_metric='logloss', verbose=False)

prediction = xgbc_model.predict_proba(X_valid)
result = xgbc_model.evals_result()
xgbcloss = log_loss(y_valid, prediction)
# Elapsed minutes since `start_time` (set elsewhere), then the log loss.
print((time.time() - start_time)/60, ': ', f'log loss: {xgbcloss:.3f}')

# Author's tuning notes:
# Best so far: .334-.337 with n_est: 1450, learn_r: .02
# Best so far (7/14): .237-.241 with updated violations_score
# 35050, 0.18083770657518244
# 35100, 0.18083742930751404
# Best so far (7/30): .146-.148 with added one hot encode on extreme words,
# .122-.124 with adjusted percentage of split
# Back to .243

# In[33]:
# Fit a classifier while tracking three metrics on the train and test sets,
# print its scores and draw the log-loss curves.
# Metric options noted by the author: rmse, mae, logloss, error, auc
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1)
xgb.fit(
    x_train,
    y_train,
    verbose=True,
    eval_metric=["logloss", "rmse", "auc"],
    eval_set=[(x_train, y_train), (x_test, y_test)],
    early_stopping_rounds=100,
)

y_pre = xgb.predict(x_test)
r2 = r2_score(y_test, y_pre)
score = xgb.score(x_test, y_test)
results = xgb.evals_result()

print(__file__)
print(results)
print("r2")
print(r2)
print("score")
print(score)

# Log loss per boosting round; validation_0 = train, validation_1 = test.
fig, ax = plt.subplots()
epochs = len(results["validation_0"]["logloss"])
x_axis = range(epochs)
ax.plot(x_axis, results["validation_0"]["logloss"], label="Train")
ax.plot(x_axis, results["validation_1"]["logloss"], label="Test")
ax.legend()
# eval set
# Wine classification with XGBoost, evaluated on the train and test splits
# after every boosting round.
import numpy as np
from sklearn.datasets import load_wine
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor

x, y = load_wine(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66
)

model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.01,
    n_jobs=8,
    eval_metric='mlogloss',
)
model.fit(
    x_train,
    y_train,
    verbose=1,
    eval_set=[(x_train, y_train), (x_test, y_test)],
)

# Mean accuracy reported by the estimator itself.
aaa = model.score(x_test, y_test)
print(aaa)

# The same figure computed explicitly from the predictions.
y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc :", acc)

print("==============================")
results = model.evals_result()
print(results)

# Output recorded by the author:
# 1.0
# acc : 1.0
def train_and_generate_model():
    """Build the training matrix, train the XGBoost model and save it.

    Steps:
      1. Load the cached feature/label lists from ``tr_input_mat.pickle`` /
         ``tr_angle_mat.pickle`` if the former exists; otherwise compute
         them from ``exchange_rates`` and ``reverse_exchange_rates`` and
         write the cache.
      2. Split them into training and (optional) validation arrays, which
         are stored in module-level globals.
      3. If ``is_param_tune_with_optuna`` is set, run the Optuna study,
         log its result and exit the process.
      4. Otherwise train an ``XGBClassifier``, save it to ``./xgb.model``
         and ``./xgb_model.raw.txt`` and log per-round errors and feature
         importances to a time-stamped progress log.

    All configuration is read from module-level names.
    """
    global log_fd_opt
    global tr_input_arr
    global tr_angle_arr
    global val_input_arr
    global val_angle_arr

    def feature_row(rates, i):
        """Return the feature vector for position ``i`` of a rate series."""
        return [
            rates[i],
            (rates[i] - rates[i - 1]) / rates[i - 1],
            get_rsi(rates, i),
            get_ma(rates, i),
            get_ma_kairi(rates, i),
            get_bb_1(rates, i),
            get_bb_2(rates, i),
            get_ema(rates, i),
            get_ema_rsi(rates, i),
            get_cci(rates, i),
            get_mo(rates, i),
            get_lw(rates, i),
            get_ss(rates, i),
            get_dmi(rates, i),
            get_vorarity(rates, i),
            get_macd(rates, i),
            str(judge_chart_type(rates[i - CHART_TYPE_JDG_LEN:i])),
        ]

    def direction_label(rates, i):
        """Return 1 if the rate OUTPUT_LEN steps ahead is not lower, else 0."""
        return 1 if rates[i + OUTPUT_LEN] - rates[i] >= 0 else 0

    data_len = len(exchange_rates)

    log_path = ("./train_progress_log_" +
                dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt")
    # The context manager closes the log on every exit path, including the
    # exit() after parameter tuning and any exception during training.
    with open(log_path, mode="w") as log_fd_tr:

        # inner logger function for the training log
        def logfile_writeln_tr(log_str):
            log_fd_tr.write(log_str + "\n")
            log_fd_tr.flush()

        print("data size of rates: " + str(data_len))
        print("num of rate datas for tarin: " +
              str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))
        print("input features sets for tarin: " +
              str(COMPETITION_TRAIN_DATA_NUM))

        logfile_writeln_tr("data size of rates: " + str(data_len))
        logfile_writeln_tr("num of rate datas for tarin: " +
                           str(COMPETITION_TRAIN_DATA_NUM_AT_RATE_ARR))

        tr_input_mat = []
        tr_angle_mat = []
        is_loaded_input_mat = False

        if os.path.exists("./tr_input_mat.pickle"):
            # Local cache written by a previous run of this function.
            with open('./tr_input_mat.pickle', 'rb') as f:
                tr_input_mat = pickle.load(f)
            with open('./tr_angle_mat.pickle', 'rb') as f:
                tr_angle_mat = pickle.load(f)
            is_loaded_input_mat = True
        else:
            for i in range(DATA_HEAD_ASOBI,
                           len(exchange_rates) - DATA_HEAD_ASOBI - OUTPUT_LEN,
                           SLIDE_IDX_NUM_AT_GEN_INPUTS_AND_COLLECT_LABELS):
                # Each position contributes two samples: one from the rate
                # series and one from the reversed series, in that order.
                tr_input_mat.append(feature_row(exchange_rates, i))
                tr_input_mat.append(feature_row(reverse_exchange_rates, i))
                tr_angle_mat.append(direction_label(exchange_rates, i))
                tr_angle_mat.append(
                    direction_label(reverse_exchange_rates, i))

        if not is_loaded_input_mat:
            with open('tr_input_mat.pickle', 'wb') as f:
                pickle.dump(tr_input_mat, f)
            with open('tr_angle_mat.pickle', 'wb') as f:
                pickle.dump(tr_angle_mat, f)

        #log output for tensorboard
        #configure("logs/xgboost_trade_cpu_1")

        tr_input_arr = np.array(tr_input_mat[0:COMPETITION_TRAIN_DATA_NUM])
        tr_angle_arr = np.array(tr_angle_mat[0:COMPETITION_TRAIN_DATA_NUM])

        # Validation samples directly follow the training samples.
        split_idx = COMPETITION_TRAIN_DATA_NUM + int(
            (len(tr_input_mat) - COMPETITION_TRAIN_DATA_NUM) *
            VALIDATION_DATA_RATIO)
        if VALIDATION_DATA_RATIO != 0.0:
            val_input_arr = np.array(
                tr_input_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
            val_angle_arr = np.array(
                tr_angle_mat[COMPETITION_TRAIN_DATA_NUM:split_idx])
            watchlist = [(tr_input_arr, tr_angle_arr),
                         (val_input_arr, val_angle_arr)]
        else:
            watchlist = [(tr_input_arr, tr_angle_arr)]

        start = time.time()
        if is_param_tune_with_optuna:
            log_fd_opt = open("./tune_progress_log_" +
                              dt.now().strftime("%Y-%m-%d_%H-%M-%S") + ".txt",
                              mode="w")
            if is_use_db_at_tune:
                study = optuna.Study(study_name='fxsystrade',
                                     storage='sqlite:///../fxsystrade.db')
            else:
                study = optuna.create_study()

            parallel_num = RAPTOP_THREAD_NUM * 2
            if is_colab_cpu or is_exec_at_mba:
                parallel_num = COLAB_CPU_AND_MBA_THREAD_NUM * 2
            if special_optuna_parallel_num != -1:
                parallel_num = special_optuna_parallel_num
            study.optimize(opt, n_trials=OPTUNA_TRIAL_NUM,
                           n_jobs=parallel_num)
            process_time = time.time() - start
            logfile_writeln_opt("best_params: " + str(study.best_params))
            logfile_writeln_opt("best_value: " + str(study.best_value))
            logfile_writeln_opt("best_trial: " + str(study.best_trial))
            logfile_writeln_opt("excecution time of tune: " +
                                str(process_time))
            log_fd_opt.flush()
            log_fd_opt.close()
            # Tuning runs end here; no model is trained.
            exit()

        param = {}
        n_thread = RAPTOP_THREAD_NUM
        if is_use_gpu:
            param['tree_method'] = 'gpu_hist'
            param['max_bin'] = 16
            param['gpu_id'] = 0
            n_thread = COLAB_CPU_AND_MBA_THREAD_NUM
        if is_colab_cpu or is_exec_at_mba:
            n_thread = COLAB_CPU_AND_MBA_THREAD_NUM

        logfile_writeln_tr("training parameters are below...")
        logfile_writeln_tr(str(param))
        logfile_writeln_tr("num_round: " + str(NUM_ROUND))

        # The thread count is passed as `n_jobs`; the former `n_thread`
        # keyword is not an XGBClassifier parameter, so the value was never
        # applied.
        clf = XGBClassifier(max_depth=MAX_DEPTH,
                            random_state=42,
                            n_estimators=NUM_ROUND,
                            min_child_weight=18,
                            subsample=0.9,
                            colsample_bytree=0.6,
                            eta=ETA,
                            objective='binary:logistic',
                            verbosity=0,
                            n_jobs=n_thread,
                            **param)

        # Per-round console output is disabled on GPU / Colab runs.
        verbosity = not (is_use_gpu or is_colab_cpu)
        clf.fit(tr_input_arr, tr_angle_arr, eval_set=watchlist,
                verbose=verbosity)

        process_time = time.time() - start
        logfile_writeln_tr("excecution time of training: " +
                           str(process_time))

        clf.save_model('./xgb.model')
        booster = clf.get_booster()
        booster.dump_model('./xgb_model.raw.txt')

        # One CSV-like line per round: index, train error[, validation error].
        eval_result_dic = clf.evals_result()
        train_errors = eval_result_dic['validation_0']['error']
        for ii, train_error in enumerate(train_errors):
            if VALIDATION_DATA_RATIO != 0.0:
                logfile_writeln_tr(
                    str(ii) + "," + str(train_error) + "," +
                    str(eval_result_dic['validation_1']['error'][ii]))
            else:
                logfile_writeln_tr(str(ii) + "," + str(train_error))

        # Feature Importance
        fti = clf.feature_importances_
        logfile_writeln_tr('Feature Importances:')
        for i, feat in enumerate(FEATURE_NAMES):
            logfile_writeln_tr('\t{0:20s} : {1:>.6f}'.format(feat, fti[i]))

    print("finished training and saved model.")
# Disabled: ROC curve on the training sample.
# # fpr, tpr, _ = roc_curve(train.target, pred)
# fpr, tpr, sara = roc_curve(train.target, pred)
# plt.plot(fpr, tpr, label='BDT', color='b')
#
# plt.legend(loc='best')
# plt.grid()
# plt.title('ROC')
# plt.tight_layout()
# plt.savefig('results/roc_train_%s.pdf' %(tag))

##########################################################################################
#####   OVERTRAINING SCORE
##########################################################################################

# Start from an empty figure, then draw the AUC recorded after every tree
# for the first (train) and second (test) evaluation sets.
plt.clf()

auc_train = clf.evals_result()['validation_0']['auc']
auc_test = clf.evals_result()['validation_1']['auc']

n_estimators = np.arange(len(auc_train))

plt.plot(n_estimators, auc_train, color='r', label='AUC train')
plt.plot(n_estimators, auc_test, color='b', label='AUC test')

plt.xlabel('# tree')
plt.ylabel('Area Under ROC')

plt.xscale('log')
plt.grid()

# plt.xlim([1, 1000])
# plt.ylim([0.985, 1.0])
def _read_hdf5_groups(filename):
    """Load every dataset of an HDF5 store as a list of DataFrames.

    Each key of the store is read into a DataFrame that gets an extra
    'group' column holding that key, so the samples can still be told apart
    after they are concatenated.  The store is closed even if a read fails.

    Returns a list with one DataFrame per key (empty if the store has no keys).
    """
    frames = []
    with pd.HDFStore(filename) as store:
        for key in store.keys():
            df_sample = store[key]
            df_sample['group'] = key
            frames.append(df_sample)
    return frames


def main():
    """Command-line entry point of the signal/background classification study.

    Depending on the options, this either

    * prepares the HDF5 input files (``-p``) and returns, or
    * reads the prepared HDF5 files (``-r``), splits the events into a
      training and a test half by even/odd ``RandomRunNumber``, trains or
      loads an XGBoost BDT (``-x``) or a Keras neural network (``-n``), and
      produces the output histograms, feature importances, ROC curves and
      the expected signal significance, or
    * plots validation / learning curves (``-V`` / ``-L``).

    Everything written to stdout is also copied to ``<output prefix>.log``.
    The function publishes ``df_sig_feat``, ``df_bkg_feat``, ``df_data_feat``,
    ``model`` and ``cut_optimal`` as module globals.
    """
    global df_sig_feat, df_bkg_feat, df_data_feat, model, cut_optimal

    # Start timer
    t_start = time.time()

    # ------------------------------------------------------------------
    # Command line options
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser()

    group_model = parser.add_mutually_exclusive_group()
    group_model.add_argument('-x', '--xgboost', action='store_true', help='Run gradient BDT')
    group_model.add_argument('-n', '--nn', action='store_true', help='Run neural network')
    group_model.add_argument('-p', '--prepare_hdf5', type=str, nargs='?', default='',
                             help='Prepare input datasets for ML and store in HDF5 file; options: "2L2J" or "2L3J+"')

    group_read_dataset = parser.add_mutually_exclusive_group()
    group_read_dataset.add_argument('-r', '--read_hdf5', action='store_true',
                                    help='Read prepared datasets from HDF5 file')
    # NOTE: a '-d/--direct_read' option (read unprepared datasets from ROOT
    # file) is intentionally not registered; it is not available at the moment.

    parser.add_argument('-l', '--load_pretrained_model', action='store_true',
                        help='Load pre-trained classifier model, i.e. only run on test data')
    parser.add_argument('-s', '--signal_region', type=str, nargs='?', default='int',
                        help='Choose signal region: low-2J, int-2J, high-2J, low-3J+, int-3J+, high-3J+')
    parser.add_argument('-b', '--balanced', type=int, nargs='?', default=-1,
                        help='Balance dataset for training; 0: oversample signal, 1: undersample background')
    parser.add_argument('-m', '--multiclass', action='store_true',
                        help='Use multiple background classes in addition to the signal class')
    parser.add_argument('-w', '--event_weight', action='store_true',
                        help='Apply event weights during training')
    parser.add_argument('-c', '--class_weight', action='store_true',
                        help='Apply class weights to account for unbalanced dataset')
    parser.add_argument('-t', '--do_train', action='store_true', help='Train the classifier')
    parser.add_argument('-T', '--do_test', action='store_true',
                        help='Test the classifier on data it has not been trained on')
    parser.add_argument('-e', '--train_even', action='store_true',
                        help='Use even run numbers for training and odd run numbers for testing')
    parser.add_argument('-o', '--train_odd', action='store_true',
                        help='Use odd run numbers for training and even run numbers for testing')
    parser.add_argument('-C', '--doCV', action='store_true',
                        help='Perform a k-fold cross-validation on the training set during training')
    parser.add_argument('-O', '--oversample', action='store_true',
                        help='Balance imbalanced dataset using oversampling')
    parser.add_argument('-U', '--undersample', action='store_true',
                        help='Balance imbalanced dataset using undersampling')

    # Neural network hyperparameters
    parser.add_argument('--n_nodes', type=int, nargs='?', default=20,
                        help='Number of nodes in each hidden neural network layer')
    parser.add_argument('--n_hidden_layers', type=int, nargs='?', default=1,
                        help='Number of hidden layers in the neural network')
    parser.add_argument('--dropout', type=float, nargs='?', default=0.,
                        help='Use dropout regularization on neural network layers to reduce overfitting')
    parser.add_argument('--L1', type=float, nargs='?', default=0.,
                        help='Use L1 regularization on neural network weights to reduce overfitting')
    parser.add_argument('--L2', type=float, nargs='?', default=0.,
                        help='Use L2 regularization (weights decay) on neural network weights to reduce overfitting')
    parser.add_argument('--lr', type=float, nargs='?', default=0.001,
                        help='Set learning rate for the neural network or BDT optimizer')
    parser.add_argument('--batch_size', type=int, nargs='?', default=32,
                        help='Number of events to use for each weight update')
    parser.add_argument('--epochs', type=lambda x: int(float(x)), nargs='?', default=1,
                        help='Number of passes through the training set')

    # BDT hyperparameters
    parser.add_argument('--max_depth', type=int, nargs='?', default=3, help='Maximum tree depth for BDT')
    parser.add_argument('--n_estimators', type=lambda x: int(float(x)), nargs='?', default=100,
                        help='Number of trees in BDT ensemble')
    parser.add_argument('--gamma', type=float, nargs='?', default=0,
                        help='Minimum loss reduction required to make a further partition on a leaf node of the XGBoost tree')
    parser.add_argument('--min_child_weight', type=float, nargs='?', default=1,
                        help='Minimum sum of instance weight(hessian) needed in a child')
    parser.add_argument('--max_delta_step', type=float, nargs='?', default=0,
                        help='Maximum delta step we allow each tree’s weight estimation to be')
    parser.add_argument('--subsample', type=float, nargs='?', default=1,
                        help='Subsample ratio of the training instance')
    parser.add_argument('--colsample_bytree', type=float, nargs='?', default=1,
                        help='Subsample ratio of columns when constructing each tree')
    parser.add_argument('--colsample_bylevel', type=float, nargs='?', default=1,
                        help='Subsample ratio of columns for each level')
    parser.add_argument('--colsample_bynode', type=float, nargs='?', default=1,
                        help='Subsample ratio of columns for each node')

    parser.add_argument('-G', '--doGridSearchCV', action='store_true',
                        help='Perform a grid search for optimal hyperparameter values using cross-validation')
    parser.add_argument('-V', '--plot_validation_curve', action='store_true',
                        help='Calculate and plot performance score for different values of a chosen hyperparameter')
    parser.add_argument('-L', '--plot_learning_curve', action='store_true',
                        help='Calculate and plot performance score as function of number of training events')

    args = parser.parse_args()

    # Reject option combinations that would otherwise only fail much later
    # (after the datasets have been read) with a NameError/TypeError.
    if not args.prepare_hdf5:
        if not (args.train_even or args.train_odd):
            parser.error('choose the training half with -e/--train_even or -o/--train_odd')
        if args.balanced is None or args.balanced > 1:
            parser.error('-b/--balanced must be 0 (oversample signal) or 1 (undersample background)')

    # Set which sample types to prepare HDF5s for
    use_sig = 1
    use_bkg = 1
    use_data = 0

    # ------------------------------------------------------------------
    # Where to put preprocessed datasets
    # ------------------------------------------------------------------
    preproc_dir = 'preprocessed_datasets/'
    preproc_suffix = ''
    if args.prepare_hdf5:
        preproc_suffix = '_group_{}_preprocessed.h5'.format(args.prepare_hdf5)
    elif '2J' in args.signal_region:
        preproc_suffix = '_group_2L2J_preprocessed.h5'
    elif '3J+' in args.signal_region:
        preproc_suffix = '_group_2L3J+_preprocessed.h5'

    filename_sig_low_preprocessed = preproc_dir + 'sig_low' + preproc_suffix
    filename_sig_int_preprocessed = preproc_dir + 'sig_int' + preproc_suffix
    filename_sig_high_preprocessed = preproc_dir + 'sig_high' + preproc_suffix
    filename_bkg_preprocessed = preproc_dir + 'bkg' + preproc_suffix
    filename_data_preprocessed = preproc_dir + 'data' + preproc_suffix

    d_sig_infile = {'low': filename_sig_low_preprocessed,
                    'int': filename_sig_int_preprocessed,
                    'high': filename_sig_high_preprocessed}

    # ------------------------------------------------------------------
    # Where to put output: both strings are filename *prefixes* that grow
    # with the chosen model, signal region and training half.
    # ------------------------------------------------------------------
    output_dir = 'output/'
    trained_model_dir = output_dir
    trained_model_xgb_suffix = '2LJets_trained_model.joblib'
    trained_model_nn_suffix = '2LJets_trained_model.h5'

    prefix = ''
    if args.xgboost:
        prefix += 'xgboost/latest/xgb_'
    elif args.nn:
        prefix += 'neural_network/latest/nn_'

    if 'low' in args.signal_region:
        prefix += 'low_'
    elif 'int' in args.signal_region:
        prefix += 'int_'
    elif 'high' in args.signal_region:
        prefix += 'high_'

    if args.train_even:
        prefix += 'trainEven_'
    elif args.train_odd:
        prefix += 'trainOdd_'

    output_dir += prefix
    trained_model_dir += prefix

    if args.xgboost:
        trained_model_path = trained_model_dir + trained_model_xgb_suffix
    elif args.nn:
        trained_model_path = trained_model_dir + trained_model_nn_suffix

    class Logger(object):
        """Tee for sys.stdout: echo every message to the terminal and a log file."""

        def __init__(self):
            self.terminal = sys.stdout
            self.log = open(output_dir + ".log", "w")

        def write(self, message):
            self.terminal.write(message)
            self.log.write(message)

        def flush(self):
            # Forward the flush so the log file is up to date even if the
            # process is interrupted.
            self.terminal.flush()
            self.log.flush()

    sys.stdout = Logger()

    # ------------------------------------------------------------------
    # Input datasets
    # ------------------------------------------------------------------
    if args.prepare_hdf5:
        # Read input dataset in chunks, select features and perform cuts,
        # before storing DataFrame in HDF5 file

        # Prepare and store signal dataset
        if use_sig:
            for region in ('low', 'int', 'high'):
                prepareHDF5(d_sig_infile[region], d_sig[region], sample_type='sig',
                            selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

        # Prepare and store background dataset
        if use_bkg:
            prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg',
                        selection=args.prepare_hdf5, chunk_size=1e6, n_chunks=None, entrystart=0)

        # Prepare and store real dataset
        if use_data:
            prepareHDF5(filename_data_preprocessed, l_data, sample_type='data',
                        selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

        return

    elif args.read_hdf5:
        if use_sig:
            # Read in preprocessed signal DataFrame from HDF5 file
            sig_frames = []
            for key_sig, value_sig_infile in d_sig_infile.items():
                if key_sig in args.signal_region:
                    print("\nReading in file:", value_sig_infile)
                    sig_frames += _read_hdf5_groups(value_sig_infile)
                    print("Closed store")
            df_sig_feat = pd.concat(sig_frames) if sig_frames else pd.DataFrame({})
            if 'mTl3' in df_sig_feat:
                df_sig_feat = df_sig_feat.drop(columns='mTl3')
            print("\ndf_sig_feat.head():\n", df_sig_feat.head())

        if use_bkg:
            # Read in preprocessed background DataFrame from HDF5 file
            print("\nReading in file:", filename_bkg_preprocessed)
            bkg_frames = _read_hdf5_groups(filename_bkg_preprocessed)
            print("Closed store")
            df_bkg_feat = pd.concat(bkg_frames) if bkg_frames else pd.DataFrame({})
            if 'mTl3' in df_bkg_feat:
                df_bkg_feat = df_bkg_feat.drop(columns='mTl3')
            print("\ndf_bkg_feat.head():\n", df_bkg_feat.head())

        if use_data:
            # Read in preprocessed DataFrame of real data from HDF5 file
            with pd.HDFStore(filename_data_preprocessed) as data_store:
                df_data_feat = data_store['data']
            print("\ndf_data_feat.head():\n", df_data_feat.head())
            print("Closed store")

    elif getattr(args, 'direct_read', False):
        # Read the input dataset for direct use, without reading in chunks
        # and storing to output file.  The option is not registered at the
        # moment, hence the getattr() instead of a plain attribute access.
        print("Not available at the moment")
        return

    else:
        parser.error('no input dataset selected: use -r/--read_hdf5 or -p/--prepare_hdf5')

    if 'low' in args.signal_region:
        # Cross-section correction of the event weights of DSID 396210
        is_396210 = df_sig_feat.DatasetNumber == 396210
        print('\nBefore xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n',
              df_sig_feat.loc[is_396210, 'eventweight'].head())
        df_sig_feat.loc[is_396210, 'eventweight'] = df_sig_feat.loc[is_396210, 'eventweight'] * 0.08836675497457203
        print('\nAfter xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n',
              df_sig_feat.loc[is_396210, 'eventweight'].head())

    # Preselection cuts ('met_Sign > 2', 'mt2leplsp_0 > 10') are currently disabled.

    print("\n======================================")
    print("df_sig_feat.shape =", df_sig_feat.shape)
    print("df_bkg_feat.shape =", df_bkg_feat.shape)
    print("======================================")

    # make array of features
    df_X = pd.concat([df_bkg_feat, df_sig_feat], axis=0)
    print("\ndf_X.isna().sum().sum()", df_X.isna().sum().sum())

    # make array of labels: 0 = background, 1 = signal
    y_bkg = np.zeros(len(df_bkg_feat))
    y_sig = np.ones(len(df_sig_feat))
    y = np.concatenate((y_bkg, y_sig), axis=0).astype(int)
    df_X['ylabel'] = y

    if args.multiclass:
        df_X.loc[df_X.group == 'Zjets', 'ylabel'] = 2
        df_X.loc[df_X.group == 'diboson', 'ylabel'] = 3
        df_X = df_X.query('group=="diboson" | group=="Zjets" | ylabel==1')
        Y = df_X.ylabel
        # encode class values as integers
        encoder = LabelEncoder()
        encoder.fit(Y)
        encoded_Y = encoder.transform(Y)
        # convert integers to dummy variables (i.e. one hot encoded)
        # NOTE(review): y_multi is not used further down - confirm whether
        # the neural network was meant to be trained on it.
        y_multi = np_utils.to_categorical(encoded_Y)

    # ------------------------------------------------------------------
    # Split the dataset in train and test sets by even/odd run number
    # ------------------------------------------------------------------
    seed = 42
    df_X_even = shuffle(df_X.query("RandomRunNumber % 2 == 0"))
    df_X_odd = shuffle(df_X.query("RandomRunNumber % 2 == 1"))

    if args.train_even:
        X_train = df_X_even
        X_test = df_X_odd
    else:  # args.train_odd, guaranteed by the option check above
        X_train = df_X_odd
        X_test = df_X_even

    # Balance dataset by resampling: equal number of signal and background events
    if args.balanced >= 0:
        if args.balanced == 0:
            # Oversample signal up to the number of background events
            N_train_sig = len(X_train.query('ylabel==0'))
        else:
            # Undersample background down to the number of signal events
            N_train_sig = len(X_train.query('ylabel==1'))
        N_train_bkg = N_train_sig

        # Draw balanced training datasets where the number of signal and background events are equal
        X_train_sig = resample(X_train.query('ylabel==1'), replace=True, n_samples=N_train_sig, random_state=seed)
        X_train_bkg = resample(X_train.query('ylabel==0'), replace=True, n_samples=N_train_bkg, random_state=seed)
        X_train = pd.concat([X_train_bkg, X_train_sig], axis=0)

        print("\n---------- After balancing ----------")
        print("args.balanced =", args.balanced)
        print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==1').shape)
        print("X_train.query('ylabel==0').shape =", X_train.query('ylabel==0').shape)
        print("---------------------------------------")

    # Draw validation set as subsample of test set, for quicker evaluation of
    # validation loss during training.  Sampling is without replacement, so
    # the size must be an integer no larger than the test set itself.
    n_val_samples = min(int(1e5), len(X_test))
    X_val = resample(X_test, replace=False, n_samples=n_val_samples, random_state=seed, stratify=X_test.ylabel)

    y_val = X_val.ylabel
    y_train = X_train.ylabel
    y_test = X_test.ylabel

    # Making a copy of the DFs with only feature columns
    l_non_features = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel']
    X_train_feat_only = X_train.drop(columns=l_non_features)
    X_test_feat_only = X_test.drop(columns=l_non_features)
    X_val_feat_only = X_val.drop(columns=l_non_features)

    print("\nX_train_feat_only:", X_train_feat_only.columns)
    print("X_test_feat_only:", X_test_feat_only.columns)
    print("X_val_feat_only:", X_val_feat_only.columns)
    print("\nX_train_feat_only:", X_train_feat_only.shape)
    print("X_test_feat_only:", X_test_feat_only.shape)
    print("X_val_feat_only:", X_val_feat_only.shape)

    # Feature scaling: standardise with the mean/std of the training set only
    scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
    print("\nscaler.fit_transform(X_train_feat_only)")
    X_train_scaled = scaler.fit_transform(X_train_feat_only)
    print("scaler.transform(X_test_feat_only)")
    X_test_scaled = scaler.transform(X_test_feat_only)
    print("scaler.transform(X_val_feat_only)")
    X_val_scaled = scaler.transform(X_val_feat_only)

    print("\n\n//////////////////// ML part ////////////////////////")

    # ------------------------------------------------------------------
    # Event and class weights
    # ------------------------------------------------------------------
    n_bkg_train = len(X_train[X_train.ylabel == 0])
    n_sig_train = len(X_train[X_train.ylabel == 1])

    scale_pos_weight = 1
    event_weight = None
    class_weight = None

    if args.event_weight:
        event_weight = X_train.eventweight

    if args.class_weight:
        if args.xgboost:
            # XGBoost: Scale signal events up by a factor n_bkg_train_events / n_sig_train_events
            scale_pos_weight = n_bkg_train / n_sig_train
        else:
            # scikit-learn: Scale overrepresented sample down (bkg) and underrepresented sample up (sig)
            class_weight = "balanced"

    print("\n# bkg train events / # sig train events = {0:d} / {1:d}".format(n_bkg_train, n_sig_train))
    print("scale_pos_weight =", scale_pos_weight)

    classes = np.unique(y)
    # 'classes' and 'y' are keyword-only in current scikit-learn releases
    class_weight_vect = compute_class_weight(class_weight, classes=classes, y=y)
    class_weight_dict = {0: class_weight_vect[0], 1: class_weight_vect[1]}

    # List of training set sizes for plotting of learning curve
    train_sizes = [0.5, 0.75, 1.0]

    # ------------------------------------------------------------------
    # Parameter values for hyperparameter grid search / validation curve
    # ------------------------------------------------------------------
    d_param_grid_xgb = {'max_depth': [5, 6, 8, 10],
                        'n_estimators': [50, 100, 200, 500, 1000],
                        'learning_rate': [0.001, 0.01, 0.1, 0.5, 1.0],
                        'reg_alpha': [0, 0.001, 0.01, 0.1, 1.],
                        'reg_lambda': [0, 0.001, 0.01, 0.1, 1.]
                        }
    # Specify one of the above parameter lists to plot validation curve for
    param_name_xgb = 'max_depth'
    param_range_xgb = d_param_grid_xgb[param_name_xgb]

    # Neural network: only the number of hidden layers is scanned for now
    d_param_grid_nn = {'n_hidden_layers': [1]}
    # Specify one of the above parameter lists to plot validation curve for
    param_name_nn = 'n_hidden_layers'
    param_range_nn = d_param_grid_nn[param_name_nn]

    if args.xgboost:
        param_range = param_range_xgb
        param_name = param_name_xgb
    elif args.nn:
        param_range = param_range_nn
        param_name = param_name_nn

    # Grid search / plain training only happen when no curve was requested
    train_and_evaluate = not args.plot_validation_curve and not args.plot_learning_curve

    # ------------------------------------------------------------------
    # Run XGBoost BDT
    # ------------------------------------------------------------------
    if args.xgboost:
        if args.multiclass:
            objective = 'multi:softmax'
            eval_metric = 'mlogloss'
        else:
            objective = 'binary:logistic'
            eval_metric = 'logloss'

        if not args.load_pretrained_model:
            model = XGBClassifier(max_depth=args.max_depth,
                                  learning_rate=args.lr,
                                  n_estimators=args.n_estimators,
                                  verbosity=1,
                                  objective=objective,
                                  n_jobs=-1,
                                  gamma=args.gamma,
                                  min_child_weight=args.min_child_weight,
                                  max_delta_step=args.max_delta_step,
                                  subsample=args.subsample,
                                  colsample_bytree=args.colsample_bytree,
                                  colsample_bylevel=args.colsample_bylevel,
                                  colsample_bynode=args.colsample_bynode,
                                  reg_alpha=args.L1,   # L1 regularization
                                  reg_lambda=args.L2,  # L2 regularization
                                  scale_pos_weight=scale_pos_weight)
            print("\nmodel.get_params()\n", model.get_params())

            if train_and_evaluate:
                if args.doGridSearchCV:
                    model = GridSearchCV(model, d_param_grid_xgb, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

                print("\nTraining XGBoost BDT...")
                if args.doCV:
                    # NOTE(review): cross_validate() fits clones, so 'model'
                    # itself stays unfitted on this path - confirm that -C is
                    # not meant to be combined with the evaluation below.
                    cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring='roc_auc',
                                                n_jobs=-1, verbose=1, return_train_score=True)
                    valid_score = cv_results['test_score']
                    train_score = cv_results['train_score']
                    fit_time = cv_results['fit_time']
                    score_time = cv_results['score_time']
                    print("\nCross-validation ROC AUC: train = {0:.4f} +/- {1:.4f}, "
                          "validation = {2:.4f} +/- {3:.4f}".format(np.mean(train_score), np.std(train_score),
                                                                    np.mean(valid_score), np.std(valid_score)))
                    print("Mean fit time = {0:.2f} s, mean score time = {1:.2f} s".format(np.mean(fit_time),
                                                                                          np.mean(score_time)))
                else:
                    model.fit(X_train_scaled, y_train,
                              sample_weight=event_weight,
                              eval_set=[(X_train_scaled, y_train), (X_val_scaled, y_val)],
                              eval_metric=eval_metric,
                              early_stopping_rounds=20,
                              verbose=True)

                    # GridSearchCV does not forward evals_result(); the loss
                    # history lives on the refitted best estimator.
                    fitted_bdt = model.best_estimator_ if args.doGridSearchCV else model
                    evals_result = fitted_bdt.evals_result()
                    train_loss = evals_result['validation_0'][eval_metric]
                    valid_loss = evals_result['validation_1'][eval_metric]

                    sns.set()
                    ax = sns.lineplot(x=range(0, len(train_loss)), y=train_loss, label='Training loss')
                    ax = sns.lineplot(x=range(0, len(valid_loss)), y=valid_loss, label='Validation loss')
                    ax.set(xlabel='Epochs', ylabel='Loss')
                    plt.show()

                print("\nTraining done!")

                if args.doGridSearchCV:
                    joblib.dump(model.best_estimator_, trained_model_path)
                else:
                    joblib.dump(model, trained_model_path)
                print("\nSaving the trained XGBoost BDT:", trained_model_path)

        else:
            print("\nReading in pre-trained XGBoost BDT:", trained_model_path)
            model = joblib.load(trained_model_path)

    # ------------------------------------------------------------------
    # Run neural network
    # ------------------------------------------------------------------
    elif args.nn:
        if not args.load_pretrained_model:
            print("\nBuilding and training neural network")

            es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

            model = KerasClassifier(build_fn=create_model,
                                    n_inputs=X_train_scaled.shape[1],
                                    n_hidden_layers=args.n_hidden_layers,
                                    n_nodes=args.n_nodes,
                                    dropout_rate=args.dropout,
                                    l1=args.L1,
                                    l2=args.L2,
                                    lr=args.lr,
                                    batch_size=args.batch_size,
                                    epochs=args.epochs,
                                    verbose=1,
                                    )

            if train_and_evaluate:
                if args.doGridSearchCV:
                    model = GridSearchCV(estimator=model, param_grid=d_param_grid_nn, cv=3,
                                         scoring='roc_auc', n_jobs=-1, verbose=1)

                history = model.fit(X_train_scaled, y_train,
                                    sample_weight=event_weight,
                                    class_weight=class_weight_dict,
                                    verbose=1,
                                    callbacks=[es],
                                    validation_data=(X_val_scaled, y_val)
                                    )

                # With a grid search the Keras model sits on the best estimator
                keras_model = model.best_estimator_.model if args.doGridSearchCV else model.model
                print("\nmodel.model.summary()\n")
                keras_model.summary()

                if not args.doGridSearchCV:
                    d_val_loss = {'Training loss': history.history['loss'],
                                  'Validation loss': history.history['val_loss']}
                    df_val_loss = pd.DataFrame(d_val_loss)

                    sns.set()
                    ax = sns.lineplot(data=df_val_loss)
                    ax.set(xlabel='Epochs', ylabel='Loss')
                    plt.show()

                keras_model.save(trained_model_path)
                print("\nSaving the trained neural network:", trained_model_path)

        else:
            print("\nReading in pre-trained neural network:", trained_model_path)
            model = load_model(trained_model_path)

    # ------------------------------------------------------------------
    # Evaluation of the trained / loaded model
    # ------------------------------------------------------------------
    if train_and_evaluate:

        # Print results of grid search
        if args.doGridSearchCV:
            print("Best parameters set found on development set:")
            print("")
            print("model.best_params_", model.best_params_)
            print("")
            print("Grid scores on development set:")
            means = model.cv_results_['mean_test_score']
            stds = model.cv_results_['std_test_score']
            for mean, std, params in zip(means, stds, model.cv_results_['params']):
                print("{0:0.3f} (+/-{1:0.03f}) for {2!r}".format(mean, std, params))
            print("")
            df = pd.DataFrame.from_dict(model.cv_results_)
            print("pandas DataFrame of cv results")
            print(df)
            print("")

        # Get predicted signal probabilities for train and test sets
        output_train = model.predict_proba(X_train_scaled)
        output_test = model.predict_proba(X_test_scaled)

        if args.multiclass:
            output_test = output_test.reshape(output_test.shape[0], 3)
            print("output_train", len(output_train[0]))
            for i_output in range(len(output_train[0])):
                X_train["output" + str(i_output)] = output_train[:, i_output]
                X_test["output" + str(i_output)] = output_test[:, i_output]
        elif output_train.shape[1] == 2:
            # Two columns: keep the probability of the signal class (column 1)
            print("output_train[:10,1]", output_train[:10, 1])
            X_train["output"] = output_train[:, 1]
            X_test["output"] = output_test[:, 1]
        else:
            X_train["output"] = output_train
            X_test["output"] = output_test

        print("\n\n//////////////////// Plotting part ////////////////////////\n")

        if not args.multiclass:
            print("len(X_train.query('ylabel==0').loc[:,'eventweight'])",
                  len(X_train.query('ylabel==0').loc[:, 'eventweight']))
            print("len(X_train.query('ylabel==0').loc[:,'output'])",
                  len(X_train.query('ylabel==0').loc[:, 'output']))
            print("X_train.query('ylabel==0').loc[:,'eventweight']",
                  X_train.query("ylabel==0").loc[:, "eventweight"].head())
            print("X_train.query('ylabel==0').loc[:,'output']",
                  X_train.query("ylabel==0").loc[:, "output"].head())
            print("X_train[['eventweight', 'output']].min(): \n", X_train[['eventweight', 'output']].min())
            print("X_train[['eventweight', 'output']].max(): \n", X_train[['eventweight', 'output']].max())

        def split_by_background(df_events):
            """Return (outputs, event weights) of df_events keyed by background name."""
            d_outputs, d_weights = {}, {}
            for i_bkg in l_bkg:
                df_bkg_i = df_events.query('group=="/bkg/' + i_bkg + '"')
                d_outputs[i_bkg] = df_bkg_i.filter(like='output')
                d_weights[i_bkg] = df_bkg_i.loc[:, 'eventweight']
            return d_outputs, d_weights

        d_X_train_bkg, d_ew_train_bkg = split_by_background(X_train)
        d_X_test_bkg, d_ew_test_bkg = split_by_background(X_test)

        # Plot weighted train and test output, with test set multiplied by 2 to match number of events in training set
        plt.figure()
        plotTrainTestOutput(d_X_train_bkg, d_ew_train_bkg,
                            X_train.query("ylabel==1").filter(like='output'),
                            X_train.query("ylabel==1").loc[:, "eventweight"],
                            d_X_test_bkg, d_ew_test_bkg,
                            X_test.query("ylabel==1").filter(like='output'),
                            X_test.query("ylabel==1").loc[:, "eventweight"],
                            args.signal_region)
        plt.savefig(output_dir + 'hist_train_test_weighted_comparison.pdf')

        # Plot final signal vs background estimate for test set, scaled to 10.6/fb
        if 'low' in args.signal_region:
            df_sig_point = X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)")
            plt.figure()
            plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg,
                                df_sig_point.filter(like='output'),
                                df_sig_point.loc[:, "eventweight"],
                                args.signal_region,
                                figure_text='(200, 100) GeV')
            plt.savefig(output_dir + 'hist_test_392330_396210_C1N2_WZ_2L2J_200_100_weighted.pdf')
        elif 'int' in args.signal_region:
            df_sig_point = X_test.query("ylabel==1 & DatasetNumber==392325")
            plt.figure()
            plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg,
                                df_sig_point.loc[:, "output"],
                                df_sig_point.loc[:, "eventweight"],
                                args.signal_region,
                                figure_text='(500, 200) GeV')
            plt.savefig(output_dir + 'hist_test_392325_C1N2_WZ_2L2J_500_200_weighted.pdf')
        elif 'high' in args.signal_region:
            df_sig_point = X_test.query("ylabel==1 & DatasetNumber==392356")
            plt.figure()
            plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg,
                                df_sig_point.loc[:, "output"],
                                df_sig_point.loc[:, "eventweight"],
                                args.signal_region,
                                figure_text='(600, 0) GeV')
            plt.savefig(output_dir + 'hist5_test_392356_C1N2_WZ_2L2J_600_0_weighted.pdf')

        if args.xgboost and not args.doGridSearchCV:
            # Plot feature importance
            print("model.feature_importances_", model.feature_importances_)
            print("np.sum(model.feature_importances_)", np.sum(model.feature_importances_))

            # Everything that is not a training feature, including the
            # classifier output column(s) added above
            l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel']
            if args.multiclass:
                l_feat_drop += ['output0', 'output1', 'output2']
            else:
                l_feat_drop += ['output']

            feature_columns = X_train.drop(l_feat_drop, axis=1).columns
            s_feat_importance = pd.Series(model.feature_importances_, index=feature_columns)
            print("X_train.drop(l_feat_drop, axis=1).columns\n", feature_columns)
            s_feat_importance.sort_values(ascending=False, inplace=True)

            plt.figure()
            sns.set(style="ticks", color_codes=True)
            n_top_feat_importance = 20
            ax = sns.barplot(x=s_feat_importance[:n_top_feat_importance] * 100,
                             y=s_feat_importance[:n_top_feat_importance].index)
            ax.set(xlabel="Feature importance [%]")
            plt.savefig(output_dir + 'feature_importance.pdf')

        if not args.multiclass:
            # Plot ROC curve; fpr is the background efficiency, tpr the signal efficiency
            fpr, tpr, thresholds = metrics.roc_curve(X_test.loc[:, "ylabel"], X_test.loc[:, "output"])
            auc = metrics.roc_auc_score(X_test.loc[:, "ylabel"], X_test.loc[:, "output"])

            plt.figure()
            ax = sns.lineplot(x=tpr, y=1 - fpr, estimator=None, label='ROC curve: AUC = %0.2f' % auc)
            plt.plot([1, 0], [0, 1], linestyle="--")
            ax.set(xlabel="Signal efficiency", ylabel="1 - background efficiency")
            plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_1minBkgEff.pdf')

            plt.figure()
            # The first ROC points have fpr == 0; skip them to avoid 1/0
            has_bkg = fpr > 0
            ax = sns.lineplot(x=tpr[has_bkg], y=1 / fpr[has_bkg], estimator=None,
                              label='ROC curve: AUC = %0.2f' % auc)
            ax.set(xlabel="Signal efficiency", ylabel="Background rejection = 1/(bkg eff.)", yscale='log')
            plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_bkgRej.pdf')
            plt.show()

        # --------------------------------------------------------------
        # Signal significance
        # --------------------------------------------------------------
        print("\n///////////////// Signal significance /////////////////")

        def significance(cut_string_sig, cut_string_bkg, rel_unc=0.3):
            """Return [S_exp, B_exp, Z_N_exp] of the test set for the given cuts."""
            sig_exp = np.sum(X_test.query("ylabel == 1 & " + cut_string_sig).loc[:, "eventweight"])
            bkg_exp = np.sum(X_test.query("(ylabel == 0 | ylabel == 2 | ylabel == 3) & " + cut_string_bkg).loc[:, "eventweight"])
            Z_N_exp = RooStats.NumberCountingUtils.BinomialExpZ(sig_exp, bkg_exp, rel_unc)
            return [sig_exp, bkg_exp, Z_N_exp]

        if 'low' in args.signal_region:
            key = '(200, 100)'
            cut_string_DSID = '(DatasetNumber == 392330 | DatasetNumber == 396210)'
        elif 'int' in args.signal_region:
            key = '(500, 200)'
            cut_string_DSID = 'DatasetNumber == 392325'
        elif 'high' in args.signal_region:
            key = '(600, 0)'
            cut_string_DSID = 'DatasetNumber == 392356'

        def report_significance(cut, is_optimal=False):
            """Print the expected yields for an output cut and return [S, B, Z_N]."""
            if args.multiclass:
                cut_string_SR = 'output0 > {:f}'.format(cut)
            else:
                cut_string_SR = 'output > {:f}'.format(cut)
            cut_string_bkg = cut_string_SR
            cut_string_sig = cut_string_SR + " & " + cut_string_DSID
            print('\ncut_string_sig:', cut_string_sig)
            print('cut_string_bkg:', cut_string_bkg)

            [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3)
            print("---", key)
            if is_optimal:
                print("Optimal cut =", cut)
            print("S_exp =", sig_exp)
            print("B_exp =", bkg_exp)
            # Break the background yield down per background sample
            for i_bkg in l_bkg:
                l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(i_bkg), cut_string_bkg]
                B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:, "eventweight"])
                print("  {0}: {1}".format(i_bkg, B_exp_i))
            print("Z_N_exp =", Z_N_exp)
            return [sig_exp, bkg_exp, Z_N_exp]

        l_cuts = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]

        cut_optimal = 0
        Z_N_optimal = 0
        for cut in l_cuts:
            [sig_exp, bkg_exp, Z_N_exp] = report_significance(cut)
            # Only cuts leaving at least 3 signal and 1 background event compete
            if sig_exp >= 3 and bkg_exp >= 1 and Z_N_exp > Z_N_optimal:
                Z_N_optimal = Z_N_exp
                cut_optimal = cut

        # Print the optimal SR values
        report_significance(cut_optimal, is_optimal=True)

    # ------------------------------------------------------------------
    # Validation curve: score versus one hyperparameter
    # ------------------------------------------------------------------
    if args.plot_validation_curve:
        print("\nCalculating validation curve...")
        train_scores, valid_scores = validation_curve(model, X_train_scaled, y_train,
                                                      param_name=param_name,
                                                      param_range=param_range,
                                                      cv=3,
                                                      scoring='roc_auc',
                                                      n_jobs=-1,
                                                      verbose=11)
        train_scores_vc_mean = np.mean(train_scores, axis=1)
        train_scores_vc_std = np.std(train_scores, axis=1)
        valid_scores_vc_mean = np.mean(valid_scores, axis=1)
        valid_scores_vc_std = np.std(valid_scores, axis=1)

        # Plot validation curves
        figF, axsF = plt.subplots()
        # Training score
        axsF.plot(param_range, train_scores_vc_mean, 'o-', label="Training score", color="darkorange", lw=2)
        axsF.fill_between(param_range,
                          train_scores_vc_mean - train_scores_vc_std,
                          train_scores_vc_mean + train_scores_vc_std,
                          alpha=0.2, color="darkorange", lw=2)
        # Test score
        axsF.plot(param_range, valid_scores_vc_mean, 'o-', label="Cross-validation score", color="navy", lw=2)
        axsF.fill_between(param_range,
                          valid_scores_vc_mean - valid_scores_vc_std,
                          valid_scores_vc_mean + valid_scores_vc_std,
                          alpha=0.2, color="navy", lw=2)
        axsF.set_xlabel(param_name)
        axsF.set_ylabel('Score')
        axsF.legend(loc="best")
        axsF.set_title('Validation curves')
        plt.savefig(output_dir + 'validation_curve_{}.pdf'.format(param_name))
        plt.show()

    # ------------------------------------------------------------------
    # Learning curve: score versus training set size
    # ------------------------------------------------------------------
    if args.plot_learning_curve:
        print("\nCalculating learning curve...")
        train_sizes, train_scores, valid_scores = learning_curve(model, X_train_scaled, y_train,
                                                                 train_sizes=train_sizes,
                                                                 cv=3,
                                                                 scoring='roc_auc',
                                                                 n_jobs=1,
                                                                 verbose=3)
        train_scores_lc_mean = np.mean(train_scores, axis=1)
        train_scores_lc_std = np.std(train_scores, axis=1)
        valid_scores_lc_mean = np.mean(valid_scores, axis=1)
        valid_scores_lc_std = np.std(valid_scores, axis=1)

        # Plot learning curves
        figG, axsG = plt.subplots()
        # One standard deviation bands
        axsG.fill_between(train_sizes,
                          train_scores_lc_mean - train_scores_lc_std,
                          train_scores_lc_mean + train_scores_lc_std,
                          alpha=0.2, color="r", lw=2)
        axsG.fill_between(train_sizes,
                          valid_scores_lc_mean - valid_scores_lc_std,
                          valid_scores_lc_mean + valid_scores_lc_std,
                          alpha=0.2, color="g", lw=2)
        # Training and validation scores
        axsG.plot(train_sizes, train_scores_lc_mean, 'o-', label="Training score", color="r", lw=2)
        axsG.plot(train_sizes, valid_scores_lc_mean, 'o-', label="Cross-validation score", color="g", lw=2)
        axsG.set_xlabel("Training examples")
        axsG.set_ylabel('Score')
        axsG.legend(loc="best")
        axsG.set_title('Learning curves')
        plt.savefig(output_dir + 'learning_curve.pdf')
        plt.show()

    # Stop timer
    t_end = time.time()
    print("\nProcess time: {:4.2f} s".format(t_end - t_start))
def objective_xgb(trial, X_train, X_valid, y_train, y_valid):
    """Fit one XGBoost classifier with trial-sampled settings and score it.

    Hyper-parameters are drawn from ``trial`` through its ``suggest_*``
    methods.  The model is fitted with ``early_stopping_rounds=50`` while
    both the training set (``validation_0``) and the validation set
    (``validation_1``) are evaluated under the module-level ``EVAL_METRIC``.

    :param trial: object providing ``suggest_categorical``, ``suggest_float``
        and ``suggest_int``.
    :param X_train: training features.
    :param X_valid: validation features.
    :param y_train: training labels; must expose ``to_numpy()``.
    :param y_valid: validation labels; must expose ``to_numpy()``.
    :return: the ``"auc"`` entry recorded for ``validation_1`` at the model's
        best iteration (``EVAL_METRIC`` therefore has to include ``"auc"``).
    """
    booster = trial.suggest_categorical("booster", ["gbtree", "dart"])
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        "n_estimators": 1000,
        # Exact split finding, chosen by the author for small datasets.
        "tree_method": "exact",
        "booster": booster,
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # Row sampling ratio of the training data.
        "subsample": trial.suggest_float("subsample", 0.2, 1.0),
        # Column sampling ratio per tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
    }

    # Tree-booster settings (both offered boosters are tree based).
    if booster in ("gbtree", "dart"):
        param.update({
            # Tree depth: 3, 5, 7 or 9.
            "max_depth": trial.suggest_int("max_depth", 3, 9, step=2),
            # Larger values make the trees more conservative.
            "min_child_weight": trial.suggest_int("min_child_weight", 2, 10),
            "eta": trial.suggest_float("eta", 1e-8, 1e-1, log=True),
            # Minimum loss reduction required to split further.
            "gamma": trial.suggest_float("gamma", 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical(
                "grow_policy", ["depthwise", "lossguide"]),
        })

    # Dropout settings exist only for the DART booster.
    if booster == "dart":
        param.update({
            "sample_type": trial.suggest_categorical(
                "sample_type", ["uniform", "weighted"]),
            "normalize_type": trial.suggest_categorical(
                "normalize_type", ["tree", "forest"]),
            "rate_drop": trial.suggest_float("rate_drop", 1e-8, 1.0, log=True),
            "skip_drop": trial.suggest_float("skip_drop", 1e-8, 1.0, log=True),
        })

    xgb = XGBClassifier(**param)
    # Labels are flattened to 1-D arrays before being handed to xgboost.
    train_labels = y_train.to_numpy().reshape(-1)
    valid_labels = y_valid.to_numpy().reshape(-1)
    xgb.fit(
        X_train,
        train_labels,
        early_stopping_rounds=50,
        eval_set=[(X_train, train_labels), (X_valid, valid_labels)],
        eval_metric=EVAL_METRIC,
        verbose=True,
    )

    results = xgb.evals_result()
    best_iteration = xgb.best_iteration
    print(f"Best Iteration: {best_iteration}")

    # Keep only the metric values observed at the best iteration.
    res = {}
    for eval_name, values in results.items():
        res[eval_name] = {
            key: val[best_iteration] for key, val in values.items()
        }
    logger.info(res)

    return res["validation_1"]["auc"]
# Cast to int and map the encoded values back through T2.
Y_tt = T2.inverse_transform(Y_tt.astype(int))
Y_tp = T2.inverse_transform(Y_tp.astype(int))

print("\n-------------------- Classification report for PART 2 (XGBoost) -----------------------\n")
# The report text is printed in two slices; the offsets are hard-coded
# (presumably the header row and the trailing summary rows - TODO confirm).
REPORT = classification_report(Y_tt, Y_tp, digits=3)
print(REPORT[0:57])
print(REPORT[-175:])
print("\n---------------------------------------------------------------------------------------\n")

# Per-iteration curves recorded for the first evaluation set of MODEL2.
plt.rcParams["figure.figsize"] = (5, 4)
RES = MODEL2.evals_result()
NUM = len(RES['validation_0']['merror'])
G1 = range(0, NUM)
GRAPH, G2 = plt.subplots()
for metric_key, legend_label in (('mlogloss', 'Log Loss'), ('merror', 'Error')):
    G2.plot(G1, RES['validation_0'][metric_key], label=legend_label)
G2.legend()
plt.ylabel('Error/Log Loss value')
plt.xlabel('Epochs')
plt.title('XGBoost Error and Loss Values')
plt.show()

# Larger canvas for the figures that follow.
plt.rcParams["figure.figsize"] = (10, 10)
cm1 = confusion_matrix(Y_tt, Y_tp)
labelsi = np.unique(Y_tt)
def extract_xgboost_eval(model: xgboost.XGBClassifier) -> pd.DataFrame:
    """Return the ``validation_0`` evaluation history of *model* as a table.

    One column is created per metric recorded under ``"validation_0"`` (one
    row per boosting iteration), followed by three bookkeeping columns:
    ``iteration`` (1-based row counter), ``eval_set`` (always ``"val"``) and
    ``model`` (always ``"xgboost"``).
    """
    history = model.evals_result()["validation_0"]
    frame = pd.DataFrame(history)
    frame["iteration"] = list(range(1, frame.shape[0] + 1))
    frame["eval_set"] = "val"
    frame["model"] = "xgboost"
    return frame
# One pass per threshold (original note: "runs as many times as there are
# columns" - presumably one threshold per feature importance; verify upstream).
for thresh in thresholds:
    # Keep only the features of the already-fitted `model` that pass `thresh`.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    # Small model retrained on the reduced feature set, monitored on both splits.
    selection_model = XGBClassifier(n_estimators=5, n_jobs=-1)
    fit_options = dict(
        verbose=True,
        eval_metric=['error', 'logloss'],
        eval_set=[(select_x_train, y_train), (select_x_test, y_test)],
        early_stopping_rounds=100,
    )
    selection_model.fit(select_x_train, y_train, **fit_options)

    results = selection_model.evals_result()

    y_predict = selection_model.predict(select_x_test)
    score = r2_score(y_test, y_predict)

    n_selected = select_x_train.shape[1]
    print("Thresh=%.3f, n=%d, R2: %.2f%%" % (thresh, n_selected, score * 100.0))
# Columns 0-7 are the inputs, column 8 is the target.
X = dataset[:, :8]
Y = dataset[:, 8]

# Split data into train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=7)

# Fit the model on the training data, tracking both splits while training.
model = XGBClassifier()
eval_set = [(X_train, y_train), (X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_metric=["error", "logloss"],
    eval_set=eval_set,
    verbose=True,
)

# Make predictions for the test data.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Evaluate predictions.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Retrieve the per-iteration metrics: validation_0 = train, validation_1 = test.
results = model.evals_result()
epochs = len(results['validation_0']['error'])
x_axis = range(epochs)

# Plot log loss.
fig, ax = pyplot.subplots()
for split_key, split_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[split_key]['logloss'], label=split_label)
ax.legend()
pyplot.ylabel('Log Loss')
pyplot.title('XGBoost Log Loss')
pyplot.show()

# Plot classification error.
fig, ax = pyplot.subplots()
for split_key, split_label in (('validation_0', 'Train'), ('validation_1', 'Test')):
    ax.plot(x_axis, results[split_key]['error'], label=split_label)
ax.legend()
train_size=0.8, random_state=1) model = XGBClassifier(n_estimators=1000, learning_rate=0.1) # model.fit(x_train, y_train, verbose=True, eval_metric= "error", # eval_set=[(x_train, y_train), (x_test, y_test)]) model.fit(x_train, y_train, verbose=True, eval_metric=["logloss", "loss"], eval_set=[(x_train, y_train), (x_test, y_test)], early_stopping_rounds=20) # rmse, mae, logloss, error, auc result = model.evals_result() print(result) y_pred = model.predict(x_test) r2 = r2_score(y_pred, y_test) print(f"r2: {r2}") thresholds = np.sort(model.feature_importances_) print(thresholds) for thresh in thresholds: selection = SelectFromModel(model, threshold=thresh, prefit=True) parameter = { 'n_estimators': [100, 200, 400],
class Xgboost(object):
    """Convenience wrapper around xgboost's scikit-learn style estimators.

    Builds an ``XGBClassifier`` or ``XGBRegressor`` (stored in ``self.model``)
    and exposes helpers for training, prediction, plotting, evaluation and
    saving.
    """

    def __init__(self,
                 task="cla",
                 module_type="performance",
                 compute_task="cpu",
                 **params):
        """
        :param task: "cla" builds an ``XGBClassifier``, "reg" an ``XGBRegressor``.
        :param module_type: thread budget - "debug" uses 1 thread,
            "performance" uses ``cpu_count()``, "balance" uses
            ``cpu_count() // 2``.
        :param compute_task: "gpu" selects ``tree_method="gpu_hist"``,
            "cpu" selects ``tree_method="hist"``.
        :param params: optional overrides read by key: ``learning_rate``,
            ``n_estimators``, ``max_depth``, ``min_child_weight``, ``gamma``,
            ``lambda``, ``alpha``, ``subsample``, ``colsample_bytree``,
            ``objective``, ``booster``, ``scale_pos_weight``,
            ``random_state``.  ``n_jobs`` and ``tree_method`` are always
            derived from ``module_type`` / ``compute_task``.
        """
        assert task in ["cla", "reg"]
        assert module_type in ["debug", "performance", "balance"]
        assert compute_task in ["cpu", "gpu"]

        self.task = task
        self.module_type = module_type
        self.compute_task = compute_task

        # Number of worker threads depends on the selected mode.
        if self.module_type == "debug":
            n_jobs = 1
        elif self.module_type == "performance":
            n_jobs = cpu_count()  # all CPU cores
        else:  # "balance": half of the cores
            n_jobs = cpu_count() // 2

        # Histogram tree builder, on GPU or CPU.
        tree_method = "gpu_hist" if self.compute_task == "gpu" else "hist"

        # Settings shared by the regressor and the classifier.
        estimator_kwargs = dict(
            learning_rate=params.get("learning_rate", 0.3),
            n_estimators=params.get("n_estimators", 100),  # number of trees
            max_depth=params.get("max_depth", 6),  # tree depth
            min_child_weight=params.get("min_child_weight", 1),  # minimum leaf weight
            n_jobs=n_jobs,  # threads
            gamma=params.get("gamma", 0),  # penalty on the number of leaves
            reg_lambda=params.get("lambda", 1),  # L2 regularization
            reg_alpha=params.get("alpha", 0),  # L1 regularization
            tree_method=tree_method,
            subsample=params.get("subsample", 1),  # fraction of rows per tree
            # Fraction of features per tree; previously fixed at 1 and not
            # overridable, now read from params with the same default.
            colsample_bytree=params.get("colsample_bytree", 1),
            booster=params.get("booster", "gbtree"),
            # Class-imbalance weight and seed keep their former fixed defaults.
            scale_pos_weight=params.get("scale_pos_weight", 1),
            random_state=params.get("random_state", 27),
        )

        # Objective reference (see
        # https://xgboost.readthedocs.io/en/latest/parameter.html):
        #   reg:squarederror     squared-error regression
        #   reg:squaredlogerror  squared log error regression
        #   reg:logistic         logistic regression
        #   binary:logistic      binary logistic regression, outputs probabilities
        #   binary:logitraw      binary logistic regression, outputs raw scores
        #   binary:hinge         hinge loss, outputs 0/1 instead of probabilities
        #   multi:softmax        multi-class, needs num_class
        #   multi:softprob       multi-class, outputs an ndata x nclass
        #                        probability matrix
        if self.task == "reg":
            self.model = XGBRegressor(
                objective=params.get("objective", "reg:squarederror"),
                **estimator_kwargs)
        else:
            self.model = XGBClassifier(
                objective=params.get("objective", "multi:softmax"),
                **estimator_kwargs)

    def train(self,
              x_train,
              y_train=None,
              sample_weight=None,
              base_margin=None,
              eval_set=None,
              eval_metric=None,
              early_stopping_rounds=None,
              verbose=True,
              sample_weight_eval_set=None):
        """Fit ``self.model``; every argument is forwarded to its ``fit``.

        :param x_train: feature matrix.
        :param y_train: labels / targets.
        :param eval_set: list of ``(X, y)`` pairs to evaluate while training;
            required whenever ``eval_metric`` is given.
        :param eval_metric: a metric name or a list of metric names
            (e.g. "rmse", "mae", "logloss", "error", "merror", "mlogloss",
            "auc", "aucpr").
        :param early_stopping_rounds: forwarded unchanged (``None`` by default).
        :param verbose: forwarded unchanged.
        :return: None
        """
        if eval_metric:
            # Metrics can only be computed against an evaluation set.
            assert eval_set
        self.model.fit(X=x_train,
                       y=y_train,
                       sample_weight=sample_weight,
                       base_margin=base_margin,
                       eval_set=eval_set,
                       eval_metric=eval_metric,
                       early_stopping_rounds=early_stopping_rounds,
                       verbose=verbose,
                       sample_weight_eval_set=sample_weight_eval_set)

    def plot_loss(self, metric="rmse"):
        """Plot the recorded curve of ``metric`` for the first two eval sets.

        :param metric: key to read from the evaluation history; it must have
            been passed as ``eval_metric`` to :meth:`train`.  Defaults to
            "rmse", the only metric this method supported before.
        """
        result = self.model.evals_result()
        train_curve = result["validation_0"][metric]
        test_curve = result["validation_1"][metric]
        x_axis = range(0, len(train_curve))

        figure, ax = plt.subplots()
        ax.plot(x_axis, train_curve, label="Train")
        ax.plot(x_axis, test_curve, label="Test")
        ax.legend()
        plt.ylabel("loss")
        plt.title("Xgboost Log Loss")
        plt.show()

    def predict(self, x_test):
        """
        :param x_test: data to predict on (e.g. np.array or scipy.sparse).
        :return: the model's predictions.
        """
        # The data is passed positionally because the keyword name of the
        # first parameter differs between xgboost releases.
        my_pred = self.model.predict(x_test,
                                     output_margin=False,
                                     validate_features=True,
                                     base_margin=None)
        return my_pred

    def plt_importance(self, figure_path=None, ifsave=True):
        """Plot feature importances (at most 64 features).

        :param figure_path: where to save the figure; a default path is used
            when empty.
        :param ifsave: whether to save the figure before showing it.
        :return: None
        """
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance(self.model, height=0.5, ax=ax, max_num_features=64)
        if ifsave:
            if not figure_path:
                plt.savefig(
                    "../model/XGBboost_model/Xgboost_featute_importance_before.png"
                )
            else:
                plt.savefig(figure_path)
        plt.show()

    def _plt_importance_v1(self, columns_name, figure_path=None, ifsave=True):
        """Plot feature importances labelled with the real column names.

        :param columns_name: column names handed to ``plot_importance_v1``.
        :param figure_path: where to save the figure; a default path is used
            when empty.
        :param ifsave: whether to save the figure before showing it.
        """
        fig, ax = plt.subplots(figsize=(15, 15))
        plot_importance_v1(self.model,
                           model_name="xgb",
                           columns_name=columns_name,
                           height=0.5,
                           ax=ax,
                           max_num_features=64)
        if ifsave:
            if not figure_path:
                plt.savefig(
                    "../model/XGBboost_model/Xgboost_featute_importance_after.png"
                )
            else:
                plt.savefig(figure_path)
        plt.show()

    def plt_tree(self, num_tree):
        """Draw one tree of the model.

        :param num_tree: index of the tree to draw.
        :return: whatever ``plot_tree`` returns (previously discarded).
        """
        return plot_tree(booster=self.model, num_trees=num_tree)

    def plot_graphviz(self, num_tree):
        """Build the graphviz representation of one tree.

        :param num_tree: index of the tree to render.
        :return: the object produced by ``to_graphviz``; it used to be
            dropped, which made the call useless to the caller.
        """
        return to_graphviz(self.model, num_trees=num_tree)

    def get_importance(self):
        """Return ``feature_importances_`` of the wrapped model."""
        return self.model.feature_importances_

    def evaluate(self, y_test, my_pred, evalue_fun="mse"):
        """Compute one metric, print it and return it.

        :param y_test: ground-truth values.
        :param my_pred: predictions (labels, scores or probabilities,
            depending on the metric).
        :param evalue_fun: one of "acc", "auc", "mae", "median_ae",
            "r2_score", "evs", "aps", "bsl", "cmt", "f1_score", "log_loss",
            "precision_score", "recall_score", "roc_auc_score", "roc_curve";
            any other value (including the default "mse") gives the mean
            squared error.
        :return: the metric value; a confusion matrix for "cmt"; a
            ``(fpr, tpr, thresholds)`` tuple for "roc_curve" (not printed).
        """
        if evalue_fun == "acc":  # accuracy, classification
            result = accuracy_score(y_true=y_test, y_pred=my_pred)
            print("accuarcy:%.2f" % (result * 100.0))
        elif evalue_fun == "auc":  # AUC, classification
            result = roc_auc_score(y_true=y_test, y_score=my_pred)
            print("auc:%.2f" % (result))
        elif evalue_fun == "mae":  # mean absolute error, regression
            result = mean_absolute_error(y_true=y_test, y_pred=my_pred)
            print("mae:%.2f" % (result))
        elif evalue_fun == "median_ae":  # median absolute error, regression
            result = median_absolute_error(y_true=y_test, y_pred=my_pred)
            print("median_ae:%.2f" % (result))
        elif evalue_fun == "r2_score":  # R squared, regression
            result = r2_score(y_true=y_test, y_pred=my_pred)
            print("r2_score:%.2f" % (result))
        elif evalue_fun == "evs":  # explained variance, regression
            result = explained_variance_score(y_true=y_test, y_pred=my_pred)
            print("explained_variance_score:%.2f" % (result))
        elif evalue_fun == "aps":  # average precision from scores, classification
            # "macro" was misspelled "maco", which scikit-learn rejects.
            result = average_precision_score(y_true=y_test,
                                             y_score=my_pred,
                                             average="macro",
                                             sample_weight=None)
            print("average_precision_score:%.2f" % (result))
        elif evalue_fun == "bsl":  # Brier score, classification
            # Probabilities are passed positionally: the keyword name of the
            # second parameter differs between scikit-learn releases.
            result = brier_score_loss(y_test,
                                      my_pred,
                                      sample_weight=None,
                                      pos_label=None)
            print("brier_score_loss:%.2f" % (result))
        elif evalue_fun == "cmt":  # confusion matrix, classification
            result = confusion_matrix(y_true=y_test,
                                      y_pred=my_pred,
                                      labels=None,
                                      sample_weight=None)
            # A matrix cannot go through "%.2f" (that raised TypeError);
            # print its text form instead.
            print("confusion_matrix:\n%s" % (result,))
        elif evalue_fun == "f1_score":  # F1, classification
            result = f1_score(y_true=y_test,
                              y_pred=my_pred,
                              labels=None,
                              pos_label=1,
                              average="binary",
                              sample_weight=None)
            print("f1_score:%.2f" % (result))
        elif evalue_fun == "log_loss":  # cross-entropy loss, classification
            result = log_loss(y_true=y_test,
                              y_pred=my_pred,
                              eps=1e-15,
                              normalize=True,
                              sample_weight=None,
                              labels=None)
            print("log_loss:%.2f" % (result))
        elif evalue_fun == "precision_score":  # precision, classification
            result = precision_score(y_true=y_test,
                                     y_pred=my_pred,
                                     labels=None,
                                     pos_label=1,
                                     average="binary")
            print("precision_score:%.2f" % (result))
        elif evalue_fun == "recall_score":  # recall, classification
            result = recall_score(y_true=y_test,
                                  y_pred=my_pred,
                                  labels=None,
                                  pos_label=1,
                                  average="binary",
                                  sample_weight=None)
            print("recall_score:%.2f" % (result))
        elif evalue_fun == "roc_auc_score":  # area under the ROC curve
            result = roc_auc_score(y_true=y_test,
                                   y_score=my_pred,
                                   average="macro",
                                   sample_weight=None)
            print("roc_auc_score:%.2f" % (result))
        elif evalue_fun == "roc_curve":  # points of the ROC curve
            fpr, tpr, thresholds = roc_curve(y_true=y_test,
                                             y_score=my_pred,
                                             pos_label=None,
                                             sample_weight=None,
                                             drop_intermediate=True)
            result = (fpr, tpr, thresholds)
        else:  # mean squared error, regression
            result = mean_squared_error(y_true=y_test, y_pred=my_pred)
            print("mse:%.2f" % (result))
        return result

    def save_model(self, save_params):
        """Save the wrapped model.

        :param save_params: mapping; its optional ``"fname"`` entry is the
            target file path (a default path is used when absent).
        """
        self.model.save_model(
            fname=save_params.get(
                "fname",
                "../model/XGBboost_model/XGboostmodel.model")
        )
# model2 outputs one score per class and y_test is indexed per class as well,
# so both are reduced to class indices with argmax before comparing.
y_pred_model2 = model2.predict(X_test)
y22 = np.argmax(y_pred_model2, axis=1)
y_test22 = np.argmax(y_test, axis=1)
# Number of test samples model2 gets right.
count = int(np.sum(y22 == y_test22))

from xgboost import XGBClassifier

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    feature_all, y, test_size=0.3, random_state=20)
model3 = XGBClassifier()
# evals_result() raises unless fit() was given an eval_set, so both splits
# are monitored here (silently, to keep the console output unchanged).
model3.fit(X_train2, y_train2,
           eval_set=[(X_train2, y_train2), (X_test2, y_test2)],
           verbose=False)
evals_result3 = model3.evals_result()
cv_scores3 = cross_val_score(model3, X_train2, y_train2, cv=5)

# Predict on the hold-out split that belongs to y_test2; the earlier code
# predicted on X_test, which comes from a different split.
y_pred3 = model3.predict(X_test2)
# Positional comparison: np.asarray avoids label-based lookups in case
# y_test2 is a pandas object with a shuffled index.
count = int(np.sum(np.asarray(y_pred3) == np.asarray(y_test2).ravel()))
# clf = RandomForestClassifier(n_estimators=60,max_features=8,max_depth=None,min_samples_split=3,bootstrap=True,random_state=35)
# clf = clf.fit(X_train, y_train)
# #scores = cross_val_score(clf, X_train, y_train, cv=5)
# #print(scores.mean())
# y_pred = clf.predict(X_test)
# for i in range(np.shape(y_test))
def gen_sub_by_para():
    """Train the XGBoost model on the 'age' column and store its outputs.

    Rows of the feature table with a non-null ``sex`` form the labelled set,
    the remaining rows the set to predict.  The labelled set is split into
    train / hold-out parts, a multi-class ``XGBClassifier`` is fitted with
    early stopping, and the class probabilities for both the labelled and the
    unlabelled rows (indexed by ``device``) are handed to
    ``save_result_for_ensemble`` together with the encoded labels.
    """
    # Taken before any other local is bound; the dict is embedded in the
    # debug message and in the result name below.
    args = locals()
    logger.debug(f'Run train dnn:{args}')

    feature_label = get_stable_feature('1011')
    labelled = feature_label['sex'].notnull()
    train = feature_label[labelled]
    test = feature_label[~labelled]

    # Columns that are not model inputs.
    non_feature_cols = ['sex', 'age', 'sex_age', 'device']
    X = train.drop(non_feature_cols, axis=1)
    Y_CAT = pd.Categorical(train['age'])

    X_train, X_test, y_train, y_test = train_test_split(X, Y_CAT.codes)

    model_settings = dict(
        objective='multi:softprob',
        eval_metric='mlogloss',
        num_class=22,
        max_depth=3,
        reg_alpha=10,
        reg_lambda=10,
        subsample=0.7,
        colsample_bytree=0.6,
        n_estimators=20000,
        learning_rate=0.01,
        seed=1,
        missing=None,
        # The remaining entries were marked "useless" by the original author.
        silent=True,
        gamma=0,
        max_delta_step=0,
        min_child_weight=1,
        colsample_bylevel=1,
        scale_pos_weight=1,
    )
    gbm = XGBClassifier(**model_settings, **gpu_params)

    gbm.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        early_stopping_rounds=100,
        verbose=True,
    )

    # Best (lowest) hold-out log-loss and the 1-based round it occurred in.
    holdout_loss = np.array(gbm.evals_result()['validation_1']['mlogloss'])
    best_epoch = holdout_loss.argmin() + 1
    best_score = holdout_loss.min()

    pre_x = test.drop(non_feature_cols, axis=1)

    print_imp_list(X_train, gbm)

    # Save result for ensemble.
    train_bk = pd.DataFrame(gbm.predict_proba(X),
                            index=train.device,
                            columns=Y_CAT.categories)
    test_bk = pd.DataFrame(gbm.predict_proba(pre_x),
                           index=test.device,
                           columns=Y_CAT.categories)
    label_bk = pd.DataFrame({'label': Y_CAT.codes}, index=train.device)

    save_result_for_ensemble(
        f'{best_score}_{best_epoch}_xgb_age_{args}',
        train=train_bk,
        test=test_bk,
        label=label_bk,
    )
y = datasets.target

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66)

# 2. Modeling
model = XGBClassifier(n_estimators=100, learning_rate=0.01, n_jobs=-1)

# 3. Training - both splits are evaluated on every boosting round.
monitored_sets = [(x_train, y_train), (x_test, y_test)]
model.fit(
    x_train,
    y_train,
    verbose=1,
    eval_metric=['merror', 'mlogloss'],
    eval_set=monitored_sets,
)

# 4. Evaluation
result1 = model.score(x_test, y_test)
print("result1 : ", result1)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc : ", acc)

# Per-round metric history collected during fit().
result2 = model.evals_result()
print("result2 : ", result2)

# Output recorded by the original author:
# result1 : 0.9722222222222222
# acc : 0.9722222222222222
def modeling():
    """Train two XGBoost classifiers on ``./train.csv`` and return them.

    The first 10000 rows of the file are read, rows with zero ``weight`` are
    dropped, and the binary target ``action`` is set to 1 where
    ``weight * resp > 0``.  All columns whose name contains ``feature`` are
    used as inputs; missing values are replaced by -999 before the data is
    split 80/20, so the training and the test part are prepared identically.
    Each model is fitted once while monitoring AUC on both parts, and its
    test accuracy is printed.

    :return: tuple ``(model1, model3)`` of fitted ``XGBClassifier`` objects.
    """
    print("开始建模")
    train = pd.read_csv("./train.csv", nrows=10000)
    # .copy() gives an independent frame, so adding the target column does
    # not write into a slice of the frame that was read.
    train = train[train['weight'] != 0].copy()
    train['action'] = (
        (train['weight'].values * train['resp'].values) > 0).astype('int')

    # Fill missing values BEFORE splitting: filling only the training part
    # left NaNs in the test part, i.e. train and test were encoded differently.
    features = train.loc[:, train.columns.str.contains('feature')].fillna(-999)
    target = train.loc[:, 'action']
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        random_state=666,
                                                        test_size=0.2)
    del train, features, target

    tm = "auto"
    params1 = {
        'max_depth': 8,
        'n_estimators': 500,
        'learning_rate': 0.01,
        'subsample': 0.9,
        'tree_method': tm,
        'random_state': 666
    }
    params3 = {
        'max_depth': 10,
        'n_estimators': 500,
        'learning_rate': 0.03,
        'subsample': 0.9,
        'colsample_bytree': 0.7,
        'tree_method': tm,
        'random_state': 666
    }

    def _fit_and_report(params, banner):
        """Fit one classifier, print ``banner`` and its test accuracy."""
        model = XGBClassifier(**params)
        # A single fit: the earlier extra fit without eval_set was thrown
        # away by the second call and only doubled the training time.
        model.fit(X_train,
                  y_train,
                  eval_set=[(X_train, y_train), (X_test, y_test)],
                  eval_metric='auc',
                  verbose=False)
        print(banner)
        y_pred = model.predict(X_test)
        print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred))
        return model

    start_time = time.time()
    model1 = _fit_and_report(params1, "模型1评分")
    model3 = _fit_and_report(params3, "模型3评分")
    end_time = time.time()
    print("建模时间:%.2f秒" % (end_time - start_time))
    return (model1, model3)
class Classifier:
    """Incrementally collects samples and trains / queries an XGBClassifier.

    Training samples are added one at a time with :meth:`input_train`, test
    samples with :meth:`input_test` (and optionally :meth:`label`).  The
    collected data is kept in plain lists and converted to numpy arrays when
    the model is trained or queried.
    """

    # for initializing train and test sets, classifier and accuracy score
    # Change method to gpu_hist if you want xgboost to run on a GPU
    def __init__(self, params=None):
        """
        :param params: keyword arguments for ``XGBClassifier``; defaults to
            ``{'objective': 'reg:squarederror', 'verbosity': 0}``.
        """
        if params is None:
            # Built per call so instances never share one default dict.
            params = {'objective': 'reg:squarederror', 'verbosity': 0}
        self.X_train = []
        self.X_labels = []
        self.test = []
        self.test_labels = []
        self.model = XGBClassifier(**params)
        self.prediction = 0
        self.error = 0

    def size(self):
        """Return the number of collected training samples.

        ``len`` is used for lists and arrays alike; ``ndarray.size`` would
        count every feature value instead of the samples once the data has
        been converted by :meth:`train`.
        """
        return len(self.X_train)

    # adding the data points
    def input_train(self, features, feature):
        """Append one training sample.

        :param features: feature vector of the sample.
        :param feature: label of the sample.
        """
        # train()/train_eval() leave arrays behind; go back to lists so that
        # append works (also when the arrays are empty).
        if isinstance(self.X_train, np.ndarray):
            self.X_train = self.X_train.tolist()
        if isinstance(self.X_labels, np.ndarray):
            self.X_labels = self.X_labels.tolist()
        self.X_train.append(features)
        self.X_labels.append(feature)

    # train the data
    def train(self):
        """Fit the model on everything collected so far."""
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        self.model.fit(self.X_train, self.X_labels)

    def train_eval(self, metric='error'):
        """Fit on a random 67/33 split and summarise the evaluation history.

        :param metric: xgboost ``eval_metric`` name.
        :return: for ``'error'``: 1 minus the average, over both evaluation
            sets, of the mean error across all boosting rounds; for any other
            metric: a list with the last recorded value per evaluation set
            (training part first, held-out part second).
        """
        self.X_train = np.asarray(self.X_train)
        self.X_labels = np.asarray(self.X_labels)
        X_train, X_test, y_train, y_test = train_test_split(self.X_train,
                                                            self.X_labels,
                                                            test_size=0.33)
        self.model.fit(X_train,
                       y_train,
                       eval_set=[(X_train, y_train), (X_test, y_test)],
                       eval_metric=metric)
        evals_result = self.model.evals_result()

        if metric == 'error':
            mean_errors = []
            for history in evals_result.values():
                errors = history.get("error")
                mean_errors.append(sum(errors) / len(errors))
            return 1 - (sum(mean_errors) / len(mean_errors))

        return [history.get(metric)[-1] for history in evals_result.values()]

    # input test labels if you want to check accuracy
    def label(self, label):
        """Append the true label of a test sample."""
        # check_error() turns test_labels into an array; convert it back so
        # that further labels can still be appended.
        if isinstance(self.test_labels, np.ndarray):
            self.test_labels = self.test_labels.tolist()
        self.test_labels.append(label)

    def input_test(self, features):
        """Append the feature vector of one test sample."""
        if isinstance(self.test, np.ndarray):
            self.test = self.test.tolist()
        self.test.append(features)

    def _test_array(self):
        """Convert the collected test samples to an array (once) and return it."""
        if not isinstance(self.test, np.ndarray):
            self.test = np.asarray(self.test)
        return self.test

    # test data
    def predict(self):
        """Predict the collected test samples; the result is also stored in
        ``self.prediction``."""
        self.prediction = self.model.predict(self._test_array())
        return self.prediction

    def predict_proba(self):
        """Like :meth:`predict`, but using the model's ``predict_proba``."""
        self.prediction = self.model.predict_proba(self._test_array())
        return self.prediction

    # if you have the test labels you can check the error rate (you want error close to 0)
    def check_error(self):
        """Return the mean absolute error between the stored test labels and
        the last prediction; the value is also stored in ``self.error``."""
        self.test_labels = np.asarray(self.test_labels)
        self.error = metrics.mean_absolute_error(self.test_labels,
                                                 self.prediction)
        return self.error

    # save classifier
    def save_classifier(self, file):
        """Save the model to ``file``."""
        self.model.save_model(file)

    # open saved classifier
    def open_classifier(self, file):
        """Load a model from ``file``."""
        self.model.load_model(file)

    # removes all training data
    def clean_train(self):
        """Discard all collected training samples and labels."""
        self.X_train = []
        self.X_labels = []

    # removes all testing data
    def clean_test(self):
        """Discard all collected test samples and labels."""
        self.test = []
        self.test_labels = []
y_train, # labels (Y=1 signal, Y=0 background) sample_weight=w_train, # instance weights eval_set=[ (x_train, y_train), (x_val, y_val) ], # a list of (X,y) tuple pairs to use as validation sets ---> validation_0=train, validation_1=validation sample_weight_eval_set=[ w_train, w_val ], # list of arrays storing instances weights for the i-th validation set eval_metric=[ 'auc', 'error' ], # list of parameters under eval_metric: https://xgboost.readthedocs.io/en/latest/parameter.html#learning-task-parameters early_stopping_rounds= 300, # validation metric needs to improve at least once in every early_stopping_rounds round(s) verbose=100) results = model.evals_result() # takes the results from the BDT training above n_estimators = len(results['validation_0'] ['error']) # number of rounds used for the BDT training auc_train = results['validation_0']['auc'] # subsample: auc for training auc_val = results['validation_1']['auc'] # subsample: auc for validation error_train = results['validation_0']['error'] # subsample: error for training error_val = results['validation_1']['error'] # subsample: error for validation plt.figure(figsize=(15, 5)) # --- plot auc for training and validation plt.subplot(121) plt.plot(range(0, n_estimators), auc_train, c='blue', label='train') plt.plot(range(0, n_estimators), auc_val, c='orange', label='validation') ymin = min(min(auc_train), min(auc_val)) ymax = max(max(auc_train), max(auc_val))