plt.savefig('../opt_comp.png', dpi = 100, facecolor = 'white') plt.close() ################################################################################## # BEST HYPERPARAMETERS FOR EACH METHOD names = ['Opt_test_OPTUNA', 'Opt_test_BAYES', 'Opt_test_DEFAULT', 'Opt_test_PbPb'] if False: for name in names: model_hdl = ModelHandler() model_hdl.load_model_handler('../analysis_results/' + name + '/model/model_hdl') print(name) print(model_hdl.get_model_params()) print('\n---------------\n') ################################################################################## # PLOT SUPERIMPOSED ROC ''' plt.close() objects = [] for n in names: with (open('../analysis_results/' + n + '/images/training/ROC_AUC_train_test.pickle', "rb")) as openfile: while True: try: objects.append(pickle.load(openfile)) except EOFError:
# write test set data frame train_test_data_cent[2]['model_output'] = test_y_score train_test_data_cent[2][ 'y_true'] = train_test_data_cent[3] train_test_data_cent_tmp = train_test_data_cent[2].query( f'y_true > 0.5 and ct >= {ct_bins_df[0]} and ct < {ct_bins_df[1]}' ) train_test_data_cent_tmp.to_parquet(f'df/mc_{bin_df}', compression='gzip') # get the model hyperparameters if DUMP_HYPERPARAMS and TRAIN: if not os.path.isdir('hyperparams'): os.mkdir('hyperparams') model_params_dict = model_hdl.get_model_params() with open(f'hyperparams/model_params_{bin}.yml', 'w') as outfile: yaml.dump(model_params_dict, outfile, default_flow_style=False) # save roc-auc del train_test_data_cent ############################################################## if COMPUTE_SCORES_FROM_EFF and TRAIN: pickle.dump(score_eff_arrays_dict, open("file_score_eff_dict", "wb")) # apply model to data if APPLICATION:
def train_xgboost_model(signal, background, filename_dict, params, params_range,
                        flag_dict, training_variables='', testsize=0.5):
    '''
    Train an XGBoost classifier with hipe4ml and save diagnostic plots.

    Parameters
    ----------
    signal, background : samples used to build the labelled train/test data
        (signal -> label 1, background -> label 0) via train_test_generator.
    filename_dict : dict with an 'analysis_path' key; figures are written
        under <analysis_path>/images/training and the Optuna study under
        <analysis_path>/model.
    params : dict of model hyperparameters, applied unless
        flag_dict['use_default_param'] is true. String-valued entries are
        treated as fixed choices and copied into params_range before
        Optuna optimization.
    params_range : dict of hyperparameter search ranges for the optimizers.
    flag_dict : dict of switches/settings read here: 'use_default_param',
        'benchmark_opt', 'optimize_bayes', 'optimize_optuna', 'plot_optim',
        'timeout', 'n_jobs'.
    training_variables : feature names to train on; '' (default) means use
        every column of the training dataframe.
    testsize : fraction of the sample reserved for the test set.

    Returns
    -------
    (train_test_data, y_pred_test, model_hdl)
    '''
    print('Training XGBOOST model')
    training_fig_path = filename_dict['analysis_path'] + "/images/training"
    train_test_data = train_test_generator([signal, background], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        # default: train on every available column
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)
    if not flag_dict['use_default_param']:
        model_hdl.set_model_params(params)

    if flag_dict['benchmark_opt']:
        # One-shot comparison of the sklearn-based Bayes optimizer against
        # Optuna on the same train/test split.
        print('Benchmarking optimizers\n')
        import time
        from sklearn.metrics import roc_auc_score

        times_sk = []
        roc_sk = []
        for _ in range(1):
            start = time.time()
            model_hdl.optimize_params_bayes(train_test_data, params_range,
                                            'roc_auc', njobs=-1)
            model_hdl.train_test_model(train_test_data)
            # model output on the test set, used to evaluate performance
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_sk.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_sk.append(time.time() - start)

        # FIX: the Optuna timing list was previously named `time`, shadowing
        # the time module imported above, and nothing was ever appended to
        # it, so the final np.mean() was computed on an empty list (nan).
        times_optuna = []
        roc_optuna = []
        for _ in range(1):
            # string-valued params are fixed choices, not ranges to scan
            for key in params:
                if isinstance(params[key], str):
                    params_range[key] = params[key]
            start = time.time()
            model_hdl.optimize_params_optuna(train_test_data, params_range,
                                             'roc_auc',
                                             timeout=flag_dict['timeout'],
                                             n_jobs=flag_dict['n_jobs'])
            model_hdl.train_test_model(train_test_data)
            y_pred_test = model_hdl.predict(train_test_data[2], True)
            roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))
            times_optuna.append(time.time() - start)

        # FIX: the sklearn summary block was printed twice verbatim
        # (copy-paste); each optimizer's summary is now printed once.
        print('\nBAYES OPTIMIZATION WITH SKLEARN')
        print('Mean time : ' + str(np.mean(times_sk)))
        print('Mean ROC : ' + str(np.mean(roc_sk)))
        print('--------------\n')
        print('OPTUNA')
        # "Fixed" because Optuna runs against a wall-clock timeout
        print('Fixed time : ' + str(np.mean(times_optuna)))
        print('Mean ROC : ' + str(np.mean(roc_optuna)))
        print('--------------\n')

    if flag_dict['optimize_bayes']:
        import time
        print('Doing Bayes optimization of hyperparameters\n')
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range,
                                        'roc_auc', n_iter=700,
                                        njobs=flag_dict['n_jobs'])
        print('Elapsed time: ' + str(time.time() - start))

    if flag_dict['optimize_optuna']:
        print('Doing Optuna optimization of hyperparameters\n')
        # string-valued params are fixed choices, not ranges to scan
        for key in params:
            if isinstance(params[key], str):
                params_range[key] = params[key]
        study = model_hdl.optimize_params_optuna(train_test_data, params_range,
                                                 scoring='roc_auc',
                                                 timeout=flag_dict['timeout'],
                                                 n_jobs=flag_dict['n_jobs'],
                                                 n_trials=None)
        print('Parameters optimization done!\n')

        if flag_dict['plot_optim']:
            print('Saving optimization plots')
            fig = optuna.visualization.plot_slice(study)
            fig.write_image(training_fig_path + '/optuna_slice.png')
            fig = optuna.visualization.plot_optimization_history(study)
            fig.write_image(training_fig_path + '/optuna_history.png')
            # NOTE: plot_param_importances / plot_contour were disabled in
            # the original (left as a dead string literal) — kept disabled.
            print('Done\n')

        import joblib
        # FIX: the path previously read analysis_path + "model/study.pkl",
        # missing the '/' separator used by every other path in this file.
        joblib.dump(study, filename_dict['analysis_path'] + "/model/study.pkl")

    model_hdl.train_test_model(train_test_data)
    print(model_hdl.get_model_params())

    print('Predicting values on training and test datas')
    y_pred_train = model_hdl.predict(train_test_data[0], True)
    y_pred_test = model_hdl.predict(train_test_data[2], True)  # used to evaluate model performance
    print('Prediction done\n')

    plt.rcParams["figure.figsize"] = (10, 7)
    leg_labels = ['background', 'signal']

    print('Saving Output comparison plot')
    plt.figure()
    # return value (figure) is not needed — we save the current pyplot figure
    plot_utils.plot_output_train_test(model_hdl, train_test_data, 100, True,
                                      leg_labels, True, density=False)
    plt.savefig(training_fig_path + '/output_train_test.png', dpi=300,
                facecolor='white')
    plt.close()
    print('Done\n')

    print('Saving ROC AUC plot')
    plt.figure()
    roc_train_test_fig = plot_utils.plot_roc_train_test(
        train_test_data[3], y_pred_test, train_test_data[1], y_pred_train,
        None, leg_labels)  # ROC AUC plot
    plt.savefig(training_fig_path + '/ROC_AUC_train_test.png', dpi=300,
                facecolor='white')
    import pickle
    # persist the figure object so ROC curves from several trainings can be
    # superimposed later (see the comparison script that unpickles these)
    with open(training_fig_path + '/ROC_AUC_train_test.pickle', 'wb') as f:
        pickle.dump(roc_train_test_fig, f)
    plt.close()
    print('Done\n')

    print('Saving feature importance plots')
    plt.figure()
    feat_imp_1, feat_imp_2 = plot_utils.plot_feature_imp(
        train_test_data[2], train_test_data[3], model_hdl, approximate=True)
    feat_imp_1.savefig(training_fig_path + '/feature_importance_HIPE4ML_violin.png',
                       dpi=300, facecolor='white')
    feat_imp_2.savefig(training_fig_path + '/feature_importance_HIPE4ML_bar.png',
                       dpi=300, facecolor='white')
    plt.close()
    print('Done\n')

    efficiency_score_conversion(train_test_data, y_pred_test, filename_dict)
    return train_test_data, y_pred_test, model_hdl