def test_HistGradientBoostingClassifier_proba():
    """Check TreeExplainer additivity in probability space for HGB classifiers.

    The per-sample SHAP values plus the expected value must reconstruct the
    model's class-0 predicted probabilities.
    """
    from sklearn.experimental import enable_hist_gradient_boosting
    from sklearn.ensemble import HistGradientBoostingClassifier

    # Fit a histogram gradient boosting model on the adult census data.
    X, y = shap.datasets.adult()
    clf = HistGradientBoostingClassifier(max_iter=10, max_depth=6).fit(X, y)

    # Explain in probability space against a 10-row background sample.
    background = shap.sample(X, 10)
    explainer = shap.TreeExplainer(clf, background, model_output="predict_proba")
    contribs = explainer.shap_values(X)

    # Additivity check: reconstructed probabilities must match predict_proba
    # for class 0 to within a small tolerance.
    reconstructed = contribs[0].sum(1) + explainer.expected_value[0]
    residual = np.abs(reconstructed - clf.predict_proba(X)[:, 0])
    assert np.max(residual) < 1e-4
min_samples_leaf=10) clf2 = ExtraTreesClassifier(n_estimators=50, max_depth=10, min_samples_leaf=10) clf3 = HistGradientBoostingClassifier(l2_regularization=1, min_samples_leaf=17, max_iter=215) clf4 = HistGradientBoostingClassifier(l2_regularization=1) clf5 = KNeighborsClassifier(n_neighbors=20) clf6 = DecisionTreeClassifier(splitter='random', min_samples_split=20) runTest(clf1, every) runTest(clf2, every) runTest(clf3, every, True) runTest(clf4, every) runTest(clf6, every, lower=0.01, upper=0.99) sub = clf4.predict_proba(fin[every]) # take just the `id` and `n_violations` columns (since that's all we need) submission = fin[['id']].copy() tmp = [] for i in sub: if i[1] > 1: tmp.append(0.99) elif i[1] < 0: tmp.append(0.01) else: tmp.append(i[1]) submission['Predicted'] = tmp # IMPORTANT: Kaggle expects you to name the columns `Id` and `Predicted`, so let's make sure here
column for column in categorical_columns if column != target_column_name ] model = CatBoostClassifier(cat_features=categorical_columns, grow_policy='Lossguide', learning_rate=0.1, n_estimators=100, num_leaves=255, train_dir='data/catboost_info', verbose=False) model.fit(features_train, labels_train, silent=True) # Make predictions on the test data. if args.library == 'h2o': predictions_proba = model.predict(data_test).as_data_frame()['Y'] else: predictions_proba = model.predict_proba(features_test)[:, 1] # Compute metrics. auc_roc = roc_auc_score(labels_test, predictions_proba) # Compute memory usage. f = open("/proc/self/status", "r") for line in f.readlines(): if line.startswith("VmHWM"): memory = line.split(":")[1].strip() print(json.dumps({ 'auc_roc': auc_roc, 'memory': memory, }))
print(": SGD - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro"))) hgbc_model = HistGradientBoostingClassifier( l2_regularization=1.766059063693552, learning_rate=0.10675193678150449, max_bins=128, max_depth=31, max_leaf_nodes=185, random_state=2021 ) hgbc_model.fit( hgbc_x_train, y_train, ) train_oof_preds = hgbc_model.predict_proba(hgbc_x_valid)[:,-1] test_oof_preds = hgbc_model.predict_proba(test[hgbc_features])[:,-1] hgbc_train_preds[test_index] = train_oof_preds hgbc_test_preds += test_oof_preds / n_folds print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro"))) print("") print("--> Overall metrics") print(": XGB - ROC AUC Score = {}".format(roc_auc_score(target, xgb_train_preds, average="micro"))) print(": LGB - ROC AUC Score = {}".format(roc_auc_score(target, lgb_train_preds, average="micro"))) print(": CB - ROC AUC Score = {}".format(roc_auc_score(target, cb_train_preds, average="micro"))) print(": Ridge - ROC AUC Score = {}".format(roc_auc_score(target, ridge_train_preds, average="micro"))) print(": SGD - ROC AUC Score = {}".format(roc_auc_score(target, sgd_train_preds, average="micro"))) print(": HGBC - ROC AUC Score = {}".format(roc_auc_score(target, hgbc_train_preds, average="micro"))) # !SECTION cross validation
# Benchmark chunk: fit a scikit-learn HistGradientBoostingClassifier and,
# optionally, the equivalent LightGBM estimator on the same split, reporting
# fit time, ROC AUC and accuracy for each.
# NOTE(review): n_samples, n_features, lr, n_trees, max_bins, n_leaf_nodes,
# the data/target splits and `args` are assumed to be defined earlier in the
# script — confirm against the surrounding file.
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
# early_stopping is disabled so both libraries run exactly n_trees iterations.
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     early_stopping=False,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
# ROC AUC is computed from the positive-class probability column.
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    # Mirror the sklearn estimator's hyper-parameters in LightGBM so the
    # two scores are directly comparable.
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
def test_same_predictions_multiclass_classification(
    seed, min_samples_leaf, n_samples, max_leaf_nodes
):
    """Compare HistGradientBoostingClassifier with the equivalent LightGBM
    estimator on a 3-class problem.

    With a single boosting iteration (max_iter=1) and a learning rate of 1,
    both libraries should grow (nearly) identical trees, so predictions,
    probabilities and accuracies must agree within loose tolerances.
    """
    # Same as test_same_predictions_regression but for classification
    pytest.importorskip("lightgbm")

    rng = np.random.RandomState(seed=seed)
    max_iter = 1
    max_bins = 255
    lr = 1

    X, y = make_classification(
        n_samples=n_samples,
        n_classes=3,
        n_features=5,
        n_informative=5,
        n_redundant=0,
        n_clusters_per_class=1,
        random_state=0,
    )

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(n_bins=max_bins + 1).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    # early_stopping is disabled so both models run the same single iteration.
    est_sklearn = HistGradientBoostingClassifier(
        loss="categorical_crossentropy",
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        early_stopping=False,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes,
    )
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib="lightgbm")

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > 0.89

        # BUG FIX: this branch compares the models on the held-out set, but
        # previously recomputed the probabilities on X_train, so test-set
        # probabilities were never actually checked. Use X_test here, matching
        # the class predictions above.
        proba_lightgbm = est_lightgbm.predict_proba(X_test)
        proba_sklearn = est_sklearn.predict_proba(X_test)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > 0.75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
def test_same_predictions_multiclass_classification(
        seed, min_samples_leaf, n_samples, max_leaf_nodes):
    # Same as test_same_predictions_regression but for classification
    #
    # Compares a single-iteration HistGradientBoostingClassifier against the
    # equivalent LightGBM estimator on a 3-class problem: with max_iter=1 and
    # learning_rate=1 both libraries should grow (nearly) the same trees and
    # thus agree on most predictions and probabilities.
    rng = np.random.RandomState(seed=seed)
    n_samples = n_samples  # NOTE(review): no-op self-assignment, kept as-is
    max_iter = 1  # single boosting iteration so the trees are comparable
    max_bins = 256
    lr = 1

    X, y = make_classification(n_samples=n_samples, n_classes=3, n_features=5,
                               n_informative=5, n_redundant=0,
                               n_clusters_per_class=1, random_state=0)

    if n_samples > 255:
        # bin data and convert it to float32 so that the estimator doesn't
        # treat it as pre-binned
        X = _BinMapper(max_bins=max_bins).fit_transform(X).astype(np.float32)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    # n_iter_no_change=None disables early stopping so both models run the
    # same (single) iteration.
    est_sklearn = HistGradientBoostingClassifier(
        loss='categorical_crossentropy',
        max_iter=max_iter,
        max_bins=max_bins,
        learning_rate=lr,
        n_iter_no_change=None,
        min_samples_leaf=min_samples_leaf,
        max_leaf_nodes=max_leaf_nodes)
    est_lightgbm = get_equivalent_estimator(est_sklearn, lib='lightgbm')

    est_lightgbm.fit(X_train, y_train)
    est_sklearn.fit(X_train, y_train)

    # We need X to be treated an numerical data, not pre-binned data.
    X_train, X_test = X_train.astype(np.float32), X_test.astype(np.float32)

    pred_lightgbm = est_lightgbm.predict(X_train)
    pred_sklearn = est_sklearn.predict(X_train)
    assert np.mean(pred_sklearn == pred_lightgbm) > .89

    proba_lightgbm = est_lightgbm.predict_proba(X_train)
    proba_sklearn = est_sklearn.predict_proba(X_train)
    # assert more than 75% of the predicted probabilities are the same up to
    # the second decimal
    assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

    acc_lightgbm = accuracy_score(y_train, pred_lightgbm)
    acc_sklearn = accuracy_score(y_train, pred_sklearn)
    np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)

    if max_leaf_nodes < 10 and n_samples >= 1000:
        pred_lightgbm = est_lightgbm.predict(X_test)
        pred_sklearn = est_sklearn.predict(X_test)
        assert np.mean(pred_sklearn == pred_lightgbm) > .89

        # NOTE(review): the class predictions above use X_test, but the
        # probabilities below are recomputed on X_train — this looks like a
        # copy-paste slip (presumably should be X_test); confirm upstream
        # before changing test behavior.
        proba_lightgbm = est_lightgbm.predict_proba(X_train)
        proba_sklearn = est_sklearn.predict_proba(X_train)
        # assert more than 75% of the predicted probabilities are the same up
        # to the second decimal
        assert np.mean(np.abs(proba_lightgbm - proba_sklearn) < 1e-2) > .75

        acc_lightgbm = accuracy_score(y_test, pred_lightgbm)
        acc_sklearn = accuracy_score(y_test, pred_sklearn)
        np.testing.assert_almost_equal(acc_lightgbm, acc_sklearn, decimal=2)
for beta in np.arange(0.2, 1.01, 0.2): all_sample_weights = getWeights(data, alpha * wPep_t, beta * wXL_t) clf = HistGradientBoostingClassifier( scoring="f1", monotonic_cst=monotonic_cst, tol=1e-7, random_state=42, validation_fraction=None) #, early_stopping=True) clf.fit(X, y, sample_weight=all_sample_weights) print("alpha,beta: " + str(alpha) + "\t" + str(beta)) print("Loss on all data (sample weight): {:.2f}".format( clf.score(X, y, sample_weight=all_sample_weights))) p = clf.predict_proba( data.loc[:, data.columns != 'Label'])[:, 1] # prob for class=1 (target) p = pd.DataFrame({'p-value': p}) data.reset_index(drop=True, inplace=True) p.reset_index(drop=True, inplace=True) data2 = pd.concat([data, p], axis=1) data2 = calcQ(data2, scoreColName="p-value") data2["Rank"] = 1 # store best fit nXLauc, XLauc = evalXL(data2, plot=False, maxQ=0.1) print("pAUC(peptides), pAUC(XLs): " + str(nXLauc) + "\t" + str(XLauc)) print("sum(pAUC): " + str(nXLauc + XLauc)) print("Confusion matrix:") print(confusion_matrix(y, clf.predict(X))) if nXLauc + 10.0 * XLauc > best_nXLauc + 10.0 * best_XLauc: # we weight XL auc higher than peptide auc
n_jobs = -1) start_time = time.time() grid_search = grid_search.fit(X_train, y_train) print('Training time: {} minutes'.format(round((time.time() - start_time)/60, 2))) grid_search.best_params_, grid_search.best_score_ # last step clf_hgb = grid_search.best_estimator_ clf_hgb.fit(X_train, y_train) y_pred = clf_hgb.predict(X_test) print(classification_report(y_test, y_pred)) y_pred = clf_hgb.predict_proba(X_test)[:, 1] print('HGB AUC_ROC: %.3f' % roc_auc_score(y_test, y_pred)) # KF & RS parameters = {'learning_rate': uniform(0,0.1), 'max_depth':sp_randint(3, 11), 'max_leaf_nodes':sp_randint(2, 32), 'min_samples_leaf':sp_randint(1, 11), 'max_iter':[400,600,800,1000,1200], 'l2_regularization':uniform(0,0.1)} rand_search = RandomizedSearchCV(estimator = clf_hgb, param_distributions = parameters, scoring='roc_auc', n_iter=100,
# model = HistGradientBoostingClassifier(max_iter=300) # GDSCV = GridSearchCV(estimator=model, param_grid=parameters, cv=5, scoring='roc_auc', n_jobs=-1) # GDSCV.fit(X_train, y_train) # print(GDSCV.best_params_) # cv_score = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc', n_jobs=-1) # print("Cross-validation score is {score:.3f}," # " standard deviation is {err:.3f}" # .format(score = cv_score.mean(), err = cv_score.std())) model = HistGradientBoostingClassifier(max_iter=300, l2_regularization=params[i][0], learning_rate=params[i][1], max_depth=int(params[i][2])) model = model.fit(X_train, y_train) prob = model.predict_proba(X_test) prob = np.array(prob[:, 1]) y_pred[label] = prob print(label, ': finished!\n') # Task 3 for i in range(11, len(labels)): label = labels[i] y_train = df_label[label] # # grid search # parameters = { # 'learning_rate':[0.05, 0.10, 0.15, 0.20], # 'max_depth':[3, 4, 5, 6, 7, 8, 9],
# Benchmark chunk (legacy sklearn API): fit a HistGradientBoostingClassifier
# and, optionally, the equivalent LightGBM estimator on the same split,
# reporting fit time, ROC AUC and accuracy for each.
# NOTE(review): n_samples, n_features, lr, n_trees, max_bins, n_leaf_nodes,
# the data/target splits and `args` are assumed to be defined earlier in the
# script — confirm against the surrounding file.
print(f"Training set with {n_samples} records with {n_features} features.")

print("Fitting a sklearn model...")
tic = time()
# n_iter_no_change=None disables early stopping so the model runs exactly
# n_trees boosting iterations.
est = HistGradientBoostingClassifier(loss='binary_crossentropy',
                                     learning_rate=lr,
                                     max_iter=n_trees,
                                     max_bins=max_bins,
                                     max_leaf_nodes=n_leaf_nodes,
                                     n_iter_no_change=None,
                                     random_state=0,
                                     verbose=1)
est.fit(data_train, target_train)
toc = time()
predicted_test = est.predict(data_test)
predicted_proba_test = est.predict_proba(data_test)
# ROC AUC is computed from the positive-class probability column.
roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
acc = accuracy_score(target_test, predicted_test)
print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")

if args.lightgbm:
    print("Fitting a LightGBM model...")
    tic = time()
    # Mirror the sklearn estimator's hyper-parameters in LightGBM so the
    # two scores are directly comparable.
    lightgbm_est = get_equivalent_estimator(est, lib='lightgbm')
    lightgbm_est.fit(data_train, target_train)
    toc = time()
    predicted_test = lightgbm_est.predict(data_test)
    predicted_proba_test = lightgbm_est.predict_proba(data_test)
    roc_auc = roc_auc_score(target_test, predicted_proba_test[:, 1])
    acc = accuracy_score(target_test, predicted_test)
    print(f"done in {toc - tic:.3f}s, ROC AUC: {roc_auc:.4f}, ACC: {acc :.4f}")
class tuned_HGB(BaseEstimator):
    """ Scikit-learn histogram gradient-boosted tree models, tuned with
    nested cross-validation to minimize the error on a unseen table.

    Parameters
    ----------
    task : str
        The estimation task to perform, either 'salary', 'quantile', or 'sex'.
    learning_rate : None or float
        The learning rate of the model. If None, a nested cross-validation
        procedure is used to determine the best one.
    fit_on : str
        If fit_on = 'all', all the validation data is used to compute the
        validation error. Set fit_on = 'seen' or 'unseen' to optimize the
        learning rate for unseen or seen categories only.
    """

    def __init__(self, task, learning_rate=None, fit_on='all'):
        # NOTE: no-op trailing `return` statements removed; behavior unchanged.
        self.task = task
        self.learning_rate = learning_rate
        self.fit_on = fit_on

    def param_tuning(self, X1, y1):
        """Select the best learning rate on a small log-spaced grid via
        nested cross-validation and store it in ``self.learning_rate``.

        NOTE(review): relies on ``self.X1_nem``, ``self.X1_mem`` and
        ``self.groups1`` being set elsewhere before this is called — confirm
        against callers.
        """
        D_var = make_D_var(self.X1_nem, self.X1_mem, n_jobs=1)
        n_var = n_variants(self.X1_nem, self.X1_mem, y1, self.groups1,
                           n_splits=None, test_size=None, D_var=D_var,
                           n_jobs=1, nested_cross_val=True)
        lr_list = np.logspace(-2, -0.5, 4)
        res = np.zeros(len(lr_list))
        for k in range(len(lr_list)):
            if self.task == "salary":
                self2 = HistGradientBoostingRegressor(learning_rate=lr_list[k])
            else:
                self2 = HistGradientBoostingClassifier(
                    learning_rate=lr_list[k])
            cv_err = cv_errors(self.task, self2, X1, self.X1_nem, self.X1_mem,
                               y1, self.groups1, n_splits=None,
                               test_size=None, n_jobs=1,
                               nested_cross_val=True)
            if self.task != 'quantile':
                # Square the residuals so non-quantile tasks are scored by
                # squared error.
                cv_err = cv_err**2
            # Average the CV error over the requested category subset.
            if self.fit_on == 'unseen':
                res[k] = cv_err[n_var == 0].mean()
            elif self.fit_on == 'seen':
                res[k] = cv_err[n_var >= 1].mean()
            else:
                res[k] = cv_err.mean()
        self.learning_rate = lr_list[np.argmin(res)]
        # Debug output: fraction of unseen categories, truncated to 2 decimals.
        print(int(sum(n_var == 0) / len(n_var) * 100) / 100)

    def fit(self, X1, y1):
        """Tune the learning rate if not provided, then fit on all train data."""
        # Parameter tuning. FIX: use `is None` instead of `== None` — identity
        # comparison is the correct (PEP 8) singleton check and avoids
        # surprises with objects overriding __eq__.
        if self.learning_rate is None:
            self.param_tuning(X1, y1)
            print(self.learning_rate)
        # Fit on all train data with tuned params
        if self.task == "salary":
            self.model = HistGradientBoostingRegressor(
                learning_rate=self.learning_rate)
        else:
            self.model = HistGradientBoostingClassifier(
                learning_rate=self.learning_rate)
        self.model.fit(X1, y1)

    def predict(self, X2):
        """Predict with the fitted underlying model."""
        return self.model.predict(X2)

    def predict_proba(self, X2):
        """Class probabilities from the fitted underlying classifier."""
        return self.model.predict_proba(X2)
def main():
    """End-to-end Higgs-challenge pipeline.

    Trains a BDT on all events plus three DNNs split by jet multiplicity
    (0, 1, >=2 jets), blends BDT and DNN scores with a logistic regression,
    then plots AMS scores and the final score distributions.
    """
    #====================================================
    # DATA PREPARATION
    #====================================================
    # Let's have a look at the dataset:
    data_full = pd.read_csv('dataset_higgs_challenge.csv')
    # Only the "t" (training), "b" (validation) and "v" (test) KaggleSet
    # subsets are used for this classification:
    print('Total number of events: ', len(data_full), '\n')
    for KaggleSetID in ['t', 'b', 'v', 'u']:
        print('Number of events in the {} KaggleSet: {}'.format(
            KaggleSetID,
            len(data_full['KaggleSet'][data_full['KaggleSet'] == KaggleSetID])))

    # The splitting below:
    # 1) splits the dataset into train, validation and test sets;
    # 2) extracts the event weights of the validation and test sets;
    # 3) extracts the binary target arrays for the networks;
    # 4) extracts the binary target arrays for the BDT.
    # Feature engineering is applied per subset: the "phi" variables have a
    # signal distribution very similar to the background one, so their linear
    # combinations (differences) are used to make them useful.
    X, df_empty, y_train, y_train_BDT = splitting(data_full, "t")
    X_val, weights_val, y_val, y_val_BDT = splitting(data_full, "b")
    X_test, weights_test, y_test, y_test_BDT = splitting(data_full, "v")
    del (data_full)

    #====================================================
    # BDT
    #====================================================
    # Let's first scale my data:
    standard = StandardScaler()
    standard.fit(X)
    X_standard = standard.transform(X)
    X_val_standard = standard.transform(X_val)
    X_test_standard = standard.transform(X_test)

    # BDT classification:
    BDT = HistGradientBoostingClassifier(max_iter=90,
                                         verbose=1,
                                         l2_regularization=0.5,
                                         learning_rate=.1,
                                         max_leaf_nodes=50,
                                         random_state=45,
                                         max_depth=15,
                                         max_bins=50)
    BDT.fit(X_standard, y_train_BDT)
    y_pred_val = BDT.predict_proba(X_val_standard)
    y_pred_test = BDT.predict_proba(X_test_standard)
    del X_standard, X_val_standard, X_test_standard

    # Split the BDT results by jet multiplicity so they can be combined with
    # the (per-multiplicity) DNN results later:
    BDT_0jets_val = y_pred_val[X_val['PRI_jet_num'] == 0]
    BDT_1jet_val = y_pred_val[X_val['PRI_jet_num'] == 1]
    BDT_2jets_val = y_pred_val[X_val['PRI_jet_num'] >= 2]
    y_pred_BDT_val = np.concatenate(
        (BDT_0jets_val, BDT_1jet_val, BDT_2jets_val))
    BDT_0jets_test = y_pred_test[X_test['PRI_jet_num'] == 0]
    BDT_1jet_test = y_pred_test[X_test['PRI_jet_num'] == 1]
    BDT_2jets_test = y_pred_test[X_test['PRI_jet_num'] >= 2]
    y_pred_BDT_test = np.concatenate(
        (BDT_0jets_test, BDT_1jet_test, BDT_2jets_test))

    #====================================================
    # DATA PROCESSING
    #====================================================
    # Let's construct the data for the case with 0 jets:
    X_0jets, y_train_0jets, empty_0 = splitting_jets(X, y_train, df_empty, 0)
    X_val_0jets, y_val_0jets, weights_0jets_val = splitting_jets(
        X_val, y_val, weights_val, 0)
    X_test_0jets, y_test_0jets, weights_0jets_test = splitting_jets(
        X_test, y_test, weights_test, 0)
    # Let's construct the data for the case with 1 jets:
    X_1jet, y_train_1jet, empty_1 = splitting_jets(X, y_train, df_empty, 1)
    X_val_1jet, y_val_1jet, weights_1jet_val = splitting_jets(
        X_val, y_val, weights_val, 1)
    X_test_1jet, y_test_1jet, weights_1jet_test = splitting_jets(
        X_test, y_test, weights_test, 1)
    # Let's construct the data for the case with 2 jets:
    X_2jets, y_train_2jets, empty_2 = splitting_jets(X, y_train, df_empty, 2)
    X_val_2jets, y_val_2jets, weights_2jets_val = splitting_jets(
        X_val, y_val, weights_val, 2)
    X_test_2jets, y_test_2jets, weights_2jets_test = splitting_jets(
        X_test, y_test, weights_test, 2)
    del empty_0, empty_1, empty_2

    #====================================================
    # 2-JETS DNN
    #====================================================
    # Scaling data:
    standard_2jets = StandardScaler()
    standard_2jets.fit(X_2jets)
    X_2jets_standard = standard_2jets.transform(X_2jets)
    X_val_2jets_standard = standard_2jets.transform(X_val_2jets)
    X_test_2jets_standard = standard_2jets.transform(X_test_2jets)

    # DNN (layer sizes, activation, dropout, optimizer, regularization):
    np.random.seed(42)
    DNN_2jets = make_model([64, 128, 64, 64, 32, 8], 'relu', 0.1, 'Adam',
                           'L2', 0.0001, X_2jets.shape[-1])
    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
    history = DNN_2jets.fit(X_2jets_standard,
                            y_train_2jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_2jets_standard,
                                             y_val_2jets),
                            callbacks=[early_stopping],
                            class_weight=None)
    y_pred_2jets_val = DNN_2jets.predict(X_val_2jets_standard)
    y_pred_2jets_test = DNN_2jets.predict(X_test_2jets_standard)
    # Free memory before training the next network.
    del X_2jets_standard, X_val_2jets_standard, X_2jets, X_val_2jets, X_test_2jets_standard, X_test_2jets

    #====================================================
    # 1-JET DNN
    #====================================================
    # Scaling data:
    standard_1jet = StandardScaler()
    standard_1jet.fit(X_1jet)
    X_1jet_standard = standard_1jet.transform(X_1jet)
    X_val_1jet_standard = standard_1jet.transform(X_val_1jet)
    X_test_1jet_standard = standard_1jet.transform(X_test_1jet)

    # DNN:
    np.random.seed(42)
    DNN_1jet = make_model([64, 64, 64, 32, 8], 'relu', 0.1, 'Adagrad', 'L1',
                          0.0001, X_1jet.shape[-1])
    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
    history = DNN_1jet.fit(X_1jet_standard,
                           y_train_1jet,
                           batch_size=256,
                           epochs=50,
                           verbose=1,
                           validation_data=(X_val_1jet_standard, y_val_1jet),
                           callbacks=[early_stopping],
                           class_weight=None)
    y_pred_1jet_val = DNN_1jet.predict(X_val_1jet_standard)
    y_pred_1jet_test = DNN_1jet.predict(X_test_1jet_standard)
    del X_1jet_standard, X_val_1jet_standard, X_1jet, X_val_1jet, X_test_1jet_standard, X_test_1jet

    #====================================================
    # 0-JET DNN
    #====================================================
    # Scaling data:
    standard_0jets = StandardScaler()
    standard_0jets.fit(X_0jets)
    X_0jets_standard = standard_0jets.transform(X_0jets)
    X_val_0jets_standard = standard_0jets.transform(X_val_0jets)
    X_test_0jets_standard = standard_0jets.transform(X_test_0jets)

    # DNN:
    np.random.seed(42)
    DNN_0jets = make_model([32, 64, 128, 64, 32, 8], 'elu', 0.1, 'Adagrad',
                           'L1', 0.0001, X_0jets.shape[-1])
    early_stopping = EarlyStopping(monitor='val_accuracy',
                                   min_delta=0,
                                   patience=10,
                                   verbose=0,
                                   mode='auto',
                                   baseline=None,
                                   restore_best_weights=True)
    history = DNN_0jets.fit(X_0jets_standard,
                            y_train_0jets,
                            batch_size=256,
                            epochs=50,
                            verbose=1,
                            validation_data=(X_val_0jets_standard,
                                             y_val_0jets),
                            callbacks=[early_stopping],
                            class_weight=None)
    y_pred_0jets_val = DNN_0jets.predict(X_val_0jets_standard)
    y_pred_0jets_test = DNN_0jets.predict(X_test_0jets_standard)
    del X_0jets_standard, X_val_0jets_standard, X_0jets, X_val_0jets, X_test_0jets_standard, X_test_0jets

    #====================================================
    # TOTAL AMS SCORE OF DNNs
    #====================================================
    # Recombine the per-multiplicity predictions, targets and weights
    # (ordering matches the BDT recombination above: 0 jets, 1 jet, >=2 jets):
    y_pred_DNN_val = np.concatenate(
        (y_pred_0jets_val, y_pred_1jet_val, y_pred_2jets_val))
    y_val_total = np.concatenate((y_val_0jets, y_val_1jet, y_val_2jets))
    weights_total_val = np.concatenate(
        (weights_0jets_val, weights_1jet_val, weights_2jets_val))
    y_pred_DNN_test = np.concatenate(
        (y_pred_0jets_test, y_pred_1jet_test, y_pred_2jets_test))
    y_test_total = np.concatenate((y_test_0jets, y_test_1jet, y_test_2jets))
    weights_total_test = np.concatenate(
        (weights_0jets_test, weights_1jet_test, weights_2jets_test))

    #====================================================
    # COMBINING DNNs AND BDT AMS
    #====================================================
    # Stack the two signal-probability columns side by side and learn a
    # logistic-regression blend on the validation set.
    dataset_blend_val = np.append(y_pred_DNN_val[:, 1].reshape(-1, 1),
                                  y_pred_BDT_val[:, 1].reshape(-1, 1),
                                  axis=1)
    dataset_blend_test = np.append(y_pred_DNN_test[:, 1].reshape(-1, 1),
                                   y_pred_BDT_test[:, 1].reshape(-1, 1),
                                   axis=1)
    blend = LogisticRegression(solver='lbfgs')
    blend.fit(dataset_blend_val, y_val_total[:, 1])
    blended_val = blend.predict_proba(dataset_blend_val)
    blended_test = blend.predict_proba(dataset_blend_test)

    #====================================================
    # FINAL RESULTS
    #====================================================
    print('DNN:')
    plot_AMS(y_pred_DNN_test, y_test_total, weights_total_test)
    print('BDT:')
    plot_AMS(y_pred_BDT_test, y_test_total, weights_total_test)
    print('Combination:')
    plot_AMS(blended_test, y_test_total, weights_total_test)
    plt.legend(['DNN', 'BDT', 'DNN + BDT'])
    plt.ylim(2.8, )
    plt.savefig('AMS_total.png', dpi=300)
    plt.show()
    # Final score distributions, unweighted and event-weighted:
    plot_distributions_final(blended_val, blended_test, y_val_total, 50,
                             False, weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_unweighted.png', dpi=300)
    plt.show()
    plot_distributions_final(blended_val, blended_test, y_val_total, 50,
                             True, weights_total_val, weights_total_test)
    plt.savefig('Final_distribution_weighted.png', dpi=300)
    plt.show()
class Network():
    """Represent a network and let us operate on it.

    Despite the name, this wraps a HistGradientBoostingClassifier whose
    hyper-parameters (``network_params``) are sampled from
    ``nn_param_choices``; it also tunes a decision threshold that maximizes
    the validation F-beta score (beta=0.25).
    """

    def __init__(
            self,
            nn_param_choices=None,
    ):
        self.accuracy = 0.
        self.nn_param_choices = nn_param_choices
        self.network_params = {}  # (dic): represents MLP network parameters
        self.model = None
        self.best_threshold = 0.5  # decision threshold on predict_proba[:, 1]

    def compile_model(self, bFinal=False):
        # Build the underlying classifier. The final model (bFinal=True)
        # trains for more iterations than the search-phase model.
        max_iter = 150 if bFinal else 60
        # NOTE(review): max_features is computed but never used below —
        # likely a leftover from the RandomForest variant; confirm and remove.
        max_features = None if bFinal else "auto"
        self.best_threshold = 0.5
        #self.model = RandomForestClassifier(n_estimators=n_estimators, verbose=2)
        # Internal scoring uses a precision-leaning F-beta (beta=0.125).
        f_scorer = make_scorer(fbeta_score, beta=0.125)
        self.model = HistGradientBoostingClassifier(
            scoring=f_scorer,  #learning_rate=0.1,max_bins=50,
            max_depth=3,n_iter_no_change=10,
            max_iter=max_iter,
            verbose=2)  #,
        #validation_fraction=0.08)

    def create_random(self):
        # Sample one value per hyper-parameter from the allowed choices.
        for key in self.nn_param_choices:
            self.network_params[key] = random.choice(
                self.nn_param_choices[key])

    def create_set(self, network):
        # Adopt an externally supplied hyper-parameter dict.
        self.network_params = network

    def train(self, dataset_dict):
        # Train only once; accuracy != 0 marks an already-trained network.
        if self.accuracy == 0.:
            self.accuracy = self.train_net(dataset_dict)

    def print_network(self):
        logging.info(self.network_params)
        logging.info("RF threshold: %.2f%%" % (self.best_threshold))
        logging.info("RF accuracy: %.2f%%" % (self.accuracy * 100))

    def update_best_threshold(self, y_val_proba, y_validation, y_train_proba,
                              y_train):
        """Scan thresholds in [0.5, 0.8) and keep the one maximizing the
        validation F-beta score (beta=0.25). Train-side checks are disabled
        (commented out) — only validation performance drives the choice.
        """
        self.best_threshold = 0.5
        best_fbeta_score_valid = 0
        best_fbeta_score_train = 0
        beta = 0.25
        for threshold in np.arange(0.5, 0.8, 0.001):
            y_val_pred = np.where(y_val_proba[:, 1] > threshold, 1, 0)
            # y_train_pred = np.where(y_train_proba[:, 1] > threshold, 1, 0)
            curr_validation_beta_score = fbeta_score(y_validation,
                                                     y_val_pred,
                                                     beta=beta)
            # curr_train_beta_score = fbeta_score(y_train, y_train_pred, beta=beta)
            if curr_validation_beta_score >= best_fbeta_score_valid:  # and curr_train_beta_score >= best_fbeta_score_train:
                best_fbeta_score_valid = curr_validation_beta_score
                # best_fbeta_score_train = curr_train_beta_score
                self.best_threshold = threshold
        header_note = "#" * 80
        print(header_note)
        print(f'#### improve thres:{self.best_threshold} With:')
        print(f'validation f-beta-{beta} score {best_fbeta_score_valid}')
        # print(f'train f-beta-{beta} score {best_fbeta_score_train}')
        print(header_note)

    def train_net(self, dataset_dict):
        """Fit the search-phase model on a random subsample of the training
        rows, tune the decision threshold, print metrics, and return the
        validation F-beta score (used as this network's fitness).

        NOTE(review): assumes dataset_dict holds numpy arrays under the keys
        X_train / y_train / X_validation / y_validation — confirm at callers.
        """
        self.compile_model(False)
        # Subsample size is itself an evolved hyper-parameter.
        num_of_rows = self.network_params["Network_train_sample_size"]
        rows_index = np.random.choice(dataset_dict["X_train"].shape[0],
                                      size=num_of_rows,
                                      replace=False)
        print(f"train_net with param{self.network_params}")
        self.model.fit(dataset_dict["X_train"][rows_index, :],
                       dataset_dict["y_train"][rows_index])
        y_val_proba = self.model.predict_proba(dataset_dict["X_validation"])
        y_train_proba = self.model.predict_proba(
            dataset_dict["X_train"][rows_index, :])
        self.update_best_threshold(y_val_proba, dataset_dict["y_validation"],
                                   y_train_proba,
                                   dataset_dict["y_train"][rows_index])
        # Binarize with the tuned threshold before computing metrics.
        y_train_pred = np.where(y_train_proba[:, 1] > self.best_threshold, 1,
                                0)
        y_val_pred = np.where(y_val_proba[:, 1] > self.best_threshold, 1, 0)
        print(
            'Train accuracy',
            accuracy_score(dataset_dict["y_train"][rows_index], y_train_pred))
        print('Validation accuracy',
              accuracy_score(dataset_dict["y_validation"], y_val_pred))
        print(
            'Train precision',
            precision_score(dataset_dict["y_train"][rows_index],
                            y_train_pred))
        print('Validation precision',
              precision_score(dataset_dict["y_validation"], y_val_pred))
        print('Train recall',
              recall_score(dataset_dict["y_train"][rows_index], y_train_pred))
        print('Validation recall',
              recall_score(dataset_dict["y_validation"], y_val_pred))
        print(
            'Train f-beta score',
            fbeta_score(dataset_dict["y_train"][rows_index],
                        y_train_pred,
                        beta=0.25))
        validation_beta_score = fbeta_score(dataset_dict["y_validation"],
                                            y_val_pred,
                                            beta=0.25)
        print(f'Validation f-beta score {validation_beta_score}')
        return validation_beta_score

    def train_final_net(self, dataset_dict):
        """Refit the final (longer-training) model on ALL training rows,
        re-tune the threshold, print metrics, and store/return the
        validation F-beta score.
        """
        str_header = "#" * 80
        print(str_header)
        print(f"best RF.. train_final_net with param{self.network_params}")
        print(str_header)
        self.compile_model(bFinal=True)
        self.model.fit(dataset_dict["X_train"], dataset_dict["y_train"])
        y_val_proba = self.model.predict_proba(dataset_dict["X_validation"])
        y_train_proba = self.model.predict_proba(dataset_dict["X_train"])
        self.update_best_threshold(y_val_proba, dataset_dict["y_validation"],
                                   y_train_proba, dataset_dict["y_train"])
        y_train_pred = np.where(y_train_proba[:, 1] > self.best_threshold, 1,
                                0)
        y_val_pred = np.where(y_val_proba[:, 1] > self.best_threshold, 1, 0)
        print(str_header)
        print(str_header)
        print('Train accuracy',
              accuracy_score(dataset_dict["y_train"], y_train_pred))
        print('Validation accuracy',
              accuracy_score(dataset_dict["y_validation"], y_val_pred))
        print('Train precision',
              precision_score(dataset_dict["y_train"], y_train_pred))
        print('Validation precision',
              precision_score(dataset_dict["y_validation"], y_val_pred))
        print('Train recall',
              recall_score(dataset_dict["y_train"], y_train_pred))
        print('Validation recall',
              recall_score(dataset_dict["y_validation"], y_val_pred))
        print('Train f-beta score',
              fbeta_score(dataset_dict["y_train"], y_train_pred, beta=0.25))
        validation_beta_score = fbeta_score(dataset_dict["y_validation"],
                                            y_val_pred,
                                            beta=0.25)
        print(f'Validation f-beta score {validation_beta_score}')
        print(str_header)
        print(str_header)
        self.accuracy = validation_beta_score
        return validation_beta_score

    def WriteModelToFile(self):
        # Currently only logs the model summary; persistence is not
        # implemented yet.
        print("save net to model")
        print("Network accuracy: %.2f%%" % (self.accuracy * 100))
        print(self.network_params)
        self.print_network()
        # TODO: use pickle
        # self.model.save("model.h5")

    def WriteResToFile(self, ds_class, file_name):
        """Write binarized test-set predictions to ``file_name``, one 0/1
        label per line, using the tuned decision threshold.

        Args:
            ds_class (dict): dataset dict providing "X_test".
            file_name (str): output path for np.savetxt.
        """
        print(f"Write tests results to File {file_name}..")
        y_test_pred = np.where(
            self.model.predict_proba(ds_class["X_test"])[:, 1] >
            self.best_threshold, 1, 0)
        np.savetxt(file_name,
                   y_test_pred.astype(int),
                   fmt='%i',
                   delimiter='\n')