def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Exercise the main TreeHandler features and compare with stored references.

    The test loads the same samples from ROOT and parquet inputs, checks that
    single- and multi-file loading agree, then runs shuffling, preselections,
    eval, sub-sampling and slicing, collecting summary quantities in a
    dictionary that is finally compared with a pickled reference.
    """
    # the workspace is created next to this test file
    workdir = Path(__file__).resolve().parent

    # set up the inputs and the reference files
    inputs, refs = init_tree_handler_test_workspace(workdir)

    # one handler per input file, plus handlers built from lists of files
    root_data = TreeHandler(inputs[0], 'treeMLDplus')
    root_prompt = TreeHandler(inputs[1], 'treeMLDplus')
    pq_data = TreeHandler(inputs[2])
    pq_prompt = TreeHandler(inputs[3])
    root_multi = TreeHandler(inputs[:2], 'treeMLDplus')
    pq_multi = TreeHandler(inputs[2:])

    # load the reference objects
    ref_data_slice = pd.read_pickle(refs[0])
    ref_prompt_slice = pd.read_pickle(refs[1])
    with open(refs[2], 'rb') as ref_file:
        ref_info = pickle.load(ref_file)

    # everything needed is in memory now: the workspace can be removed
    terminate_tree_handler_test_workspace(workdir)

    # ROOT and parquet inputs must give identical data frames
    data_matches = root_data.get_data_frame().equals(
        pq_data.get_data_frame())
    assert data_matches, \
        'data Dataframe from parquet file differs from the root file one!'
    prompt_matches = root_prompt.get_data_frame().equals(
        pq_prompt.get_data_frame())
    assert prompt_matches, \
        'prompt Dataframe from parquet file differs from the root file one!'

    # loading a list of files must be equivalent to concatenating the frames
    expected_root = pd.concat(
        [root_data.get_data_frame(), root_prompt.get_data_frame()],
        ignore_index=True)
    assert root_multi.get_data_frame().equals(expected_root), \
        'loading of multiple root files not working!'
    expected_pq = pd.concat(
        [pq_data.get_data_frame(), pq_prompt.get_data_frame()],
        ignore_index=True)
    assert pq_multi.get_data_frame().equals(expected_pq), \
        'loading of multiple parquet files not working!'

    # summary quantities that will be compared with the reference dictionary
    # NOTE(review): both variable lists are read from the prompt handler, as
    # in the original test -- confirm this is what the reference expects
    info = {
        'n_data': root_data.get_n_cand(),
        'n_prompt': root_prompt.get_n_cand(),
        'data_var_list': root_prompt.get_var_names(),
        'prompt_var_list': root_prompt.get_var_names(),
    }

    # shuffling: the in-place and the not in-place variants must agree
    shuffled = root_data.shuffle_data_frame(
        size=10, random_state=5, inplace=False)
    clone = copy.deepcopy(root_data)
    clone.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert clone.get_data_frame().equals(shuffled.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after shuffling'

    # preselections
    data_cuts = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    prompt_cuts = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'
    preselected = root_data.apply_preselections(data_cuts, inplace=False)
    root_data.apply_preselections(data_cuts)
    assert root_data.get_data_frame().equals(preselected.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after the preselections'
    root_prompt.apply_preselections(prompt_cuts)

    # candidates surviving the preselections and the stored selection strings
    info.update({
        'n_data_preselected': root_data.get_n_cand(),
        'n_prompt_preselected': root_prompt.get_n_cand(),
        'data_preselections': root_data.get_preselections(),
        'prompt_preselections': root_prompt.get_preselections(),
    })

    # add a derived column through eval() on the underlying data frame
    new_column = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    evaluated = root_data.eval_data_frame(new_column, inplace=False)
    root_data.eval_data_frame(new_column)
    assert root_data.get_data_frame().equals(evaluated.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after eval'
    root_prompt.eval_data_frame(new_column)

    # variable lists after the eval (again both taken from the prompt handler)
    info.update({
        'data_new_var_list': root_prompt.get_var_names(),
        'prompt_new_var_list': root_prompt.get_var_names(),
    })

    # random subsets of the samples
    root_data = root_data.get_subset(size=3000, rndm_state=SEED)
    root_prompt = root_prompt.get_subset(size=55, rndm_state=SEED)

    # slice both samples in pT
    pt_bins = [[0, 2], [2, 10], [10, 25]]
    root_data.slice_data_frame('pt_cand', pt_bins)
    root_prompt.slice_data_frame('pt_cand', pt_bins)

    # projection variable and binning
    info.update({
        'data_proj_variable': root_data.get_projection_variable(),
        'prompt_proj_variable': root_prompt.get_projection_variable(),
        'data_binning': root_data.get_projection_binning(),
        'prompt_binning': root_prompt.get_projection_binning(),
    })

    # look at a single slice of each sample
    data_slice = root_data.get_slice(2)
    prompt_slice = root_prompt.get_slice(2)
    info['n_data_slice'] = len(data_slice)
    info['n_prompt_slice'] = len(prompt_slice)

    # the collected information must reproduce the reference
    assert info == ref_info, \
        'dictionary with the data info differs from the reference!'

    # the sliced data frames must reproduce the references
    assert data_slice.equals(ref_data_slice), \
        'data sliced DataFrame differs from the reference!'
    assert prompt_slice.equals(ref_prompt_slice), \
        'prompt sliced DataFrame differs from the reference!'
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):
    """
    Compare the Bayesian and the Optuna hyper-parameter optimisers.

    The MC signal and the training background are loaded, a train/test set is
    built, and an XGBoost model is optimised, trained and scored (ROC AUC on
    the test set) first with ``optimize_params_bayes`` and then with
    ``optimize_params_optuna``. The Optuna optimisation is given as timeout
    the mean wall time measured for the Bayesian runs. Results are printed.

    Parameters
    ----------
    filename_dict: dict
        Must provide 'data_path', 'MC_signal_filename', 'MC_signal_table',
        'train_bckg_filename' and 'train_bckg_table'.
    params:
        Not used by this function; kept for interface compatibility.
    params_range:
        Hyper-parameter ranges forwarded to both optimisers.
    flag_dict:
        Not used by this function; kept for interface compatibility.
    presel_dict: dict
        Must provide 'train_bckg_presel', the preselection applied to the
        background sample.
    training_variables: list or str
        Training columns. If it is the empty string, all the columns of the
        training set are used.
    testsize: float
        Forwarded to ``train_test_generator`` as ``test_size``.
    """
    import time
    from sklearn.metrics import roc_auc_score

    n_run = 1

    data_path = filename_dict['data_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    # keep at most four background candidates per signal candidate
    background_ls.shuffle_data_frame(
        size=min(background_ls.get_n_cand(), mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    # --- Bayesian optimisation: measure both the time and the ROC AUC
    times = []
    roc_bayes = []
    for _ in range(n_run):
        start = time.time()

        model_hdl.optimize_params_bayes(train_test_data, params_range,
                                        'roc_auc', njobs=-1)
        model_hdl.train_test_model(train_test_data)

        # used to evaluate model performance
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc_bayes.append(roc_auc_score(train_test_data[3], y_pred_test))

        times.append(time.time() - start)

    # mean over the measured durations (the original averaged the `time`
    # module by mistake, which raises)
    mean_time = np.mean(times)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(mean_time))
    print('Mean ROC : ' + str(np.mean(roc_bayes)))
    print('--------------\n')

    # --- Optuna optimisation, limited to the mean time taken by the Bayesian one
    # a separate list is used so that the Bayesian scores do not leak into
    # the Optuna average
    roc_optuna = []
    for _ in range(n_run):
        model_hdl.optimize_params_optuna(train_test_data, params_range,
                                         'roc_auc', timeout=mean_time,
                                         njobs=-1)
        model_hdl.train_test_model(train_test_data)

        # used to evaluate model performance
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc_optuna.append(roc_auc_score(train_test_data[3], y_pred_test))

    print('OPTUNA')
    print('Fixed time : ' + str(mean_time))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
PROMPT_FILE_PATH = DATA_DIR.joinpath('Prompt_Dpluspp7TeV_pT_1_50.root') # define dictionary for storing reference for the tests INFO_DICT = {} # preliminar check if not REFERENCE_DIR.is_dir(): sys.exit( "No 'references' dir was found, so no reference data were produced!") # instantiate tree handler objects DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus') PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus') # store number of candidates in the original data sample INFO_DICT['n_data'] = DATA_HDLR.get_n_cand() INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand() # store original variable list INFO_DICT['data_var_list'] = PROMPT_HDLR.get_var_names() INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names() # apply preselections PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)' PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)' DATA_HDLR.apply_preselections(PRESEL_DATA) PROMPT_HDLR.apply_preselections(PRESEL_PROMPT) # store number of selcted data INFO_DICT['n_data_preselected'] = DATA_HDLR.get_n_cand()
) #df_signal_cent_ct = df_signal_cent_ct[TRAINING_COLUMNS_LIST] #df_background_cent_ct = df_background_cent_ct[TRAINING_COLUMNS_LIST] # define tree handlers signal_tree_handler = TreeHandler() background_tree_handler_full = TreeHandler() signal_tree_handler.set_data_frame(df_signal_cent_ct) background_tree_handler_full.set_data_frame( df_background_cent_ct) del df_signal_cent_ct del df_background_cent_ct # downscale background background_tree_handler = background_tree_handler_full.get_subset( size=int(0.8 * signal_tree_handler.get_n_cand()), rndm_state=RANDOM_STATE) del background_tree_handler_full # features plot leg_labels = ['background', 'signal'] # second condition needed because of issue with Qt libraries if MAKE_FEATURES_PLOTS and not MAKE_PRESELECTION_EFFICIENCY: if not os.path.isdir(f'{PLOT_DIR}/features'): os.mkdir(f'{PLOT_DIR}/features') plot_utils.plot_distr( [background_tree_handler, signal_tree_handler], TRAINING_COLUMNS_LIST, bins=50, labels=leg_labels,
class Optimiserhipe4mltree:
    """
    Train, optimise and plot an XGBoost model through hipe4ml for one pT bin.

    On construction the signal and data trees are loaded, a background sample
    is extracted from the data, the train/test set is generated and a
    ModelHandler wrapping an ``xgb.XGBClassifier`` is created. The ``do_*``
    methods then run the hyper-parameter optimisation, the training and the
    plotting, writing their outputs to the directories given in the
    configuration.
    """
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel, hyper_pars):
        """
        Read the configuration, prepare the samples and create the model.

        Parameters
        ----------
        data_param: dict
            Configuration. The keys read here are ``["ml"]["mlout"]``,
            ``["ml"]["mlplot"]``, ``["ml"]["rnd_splt"]``,
            ``["ml"]["test_frac"]``, the optional
            ``["ml"]["multiclass_labels"]`` and, under ``["hipe4ml"]``,
            ``roc_auc_average``, ``roc_auc_approach``, ``raw_output``,
            ``train_test_log`` and the ``hyper_par_opt`` entries
            ``bayes_opt_config``, ``nfolds``, ``initpoints``, ``niter``
            and ``njobs``.
        binmin, binmax:
            Edges of the pT bin; only used to label the output files.
        training_var:
            Not used: the training variables are taken from the signal tree
            (all its columns except 'inv_mass' and 'pt_cand').
        bkg_sel: str
            Selection applied to the data to extract the background, e.g.
            "inv_mass < 1.82 or 1.92 < inv_mass < 2.00".
        hyper_pars:
            Model hyper-parameters applied by ``set_hipe4ml_modelpar``.
        """
        self.logger = get_logger()

        # output directories
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]

        # NOTE(review): the input trees are hard-coded, user-specific paths
        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel
        # number of background candidates requested per signal candidate
        self.v_bkgoversigfrac = 3
        # labels assigned to the signal and background classes
        self.v_sig = 1
        self.v_bkg = 0

        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        # declared here so that the attribute exists even before
        # preparesample() assigns it
        self.signalhandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        # load the trees and build the train/test set
        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        # requires the signal handler created by preparesample()
        self.load_hipe4mlmodel()

        hipe4ml_cfg = data_param["hipe4ml"]
        hyper_opt_cfg = hipe4ml_cfg["hyper_par_opt"]
        self.bayesoptconfig_hipe4ml = hyper_opt_cfg["bayes_opt_config"]
        self.average_method_hipe4ml = hipe4ml_cfg["roc_auc_average"]
        self.nfold_hipe4ml = hyper_opt_cfg["nfolds"]
        self.init_points = hyper_opt_cfg["initpoints"]
        self.n_iter_hipe4ml = hyper_opt_cfg["niter"]
        self.njobs_hipe4ml = hyper_opt_cfg["njobs"]
        self.roc_method_hipe4ml = hipe4ml_cfg["roc_auc_approach"]
        self.raw_output_hipe4ml = hipe4ml_cfg["raw_output"]
        self.train_test_log_hipe4ml = hipe4ml_cfg["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels", None)

        self.logger.info("Using the following training variables: %s", self.v_train)

    def preparesample(self):
        """
        Load the signal and data trees and generate the train/test set.

        The background is a subset of the data passing ``s_selbkgml`` with
        ``v_bkgoversigfrac`` times the number of signal candidates requested.
        """
        self.logger.info("Prepare Sample for hipe4ml")

        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(
            self.s_selbkgml, size=nsigcand * self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator(
            [self.signalhandler, self.bkghandler],
            [self.v_sig, self.v_bkg],
            test_size=self.test_frac,
            random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        """
        Define the training variables and create the ModelHandler.

        The training variables are all the variables of the signal handler
        except 'inv_mass' and 'pt_cand'.
        """
        self.logger.info("Loading hipe4ml model")

        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')

        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        """Apply the hyper-parameters given at construction to the model."""
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        """
        Run the Bayesian hyper-parameter optimisation.

        Everything printed while the optimisation runs is written to
        ``HyperParOpt_pT_<binmin>_<binmax>.txt`` in ``dirmlout``. The previous
        ``sys.stdout`` is restored and the file is closed even if the
        optimisation raises.
        """
        # local import: keeps this method self-contained
        from contextlib import redirect_stdout

        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted'] and
                self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        # context managers guarantee that the file is closed and that stdout
        # is put back to what it was, whatever happens in the optimisation
        with open(hypparsfile, 'wt') as outfilehyppars, \
                redirect_stdout(outfilehyppars):
            self.p_hipe4ml_model.optimize_params_bayes(
                self.traintestdata, self.bayesoptconfig_hipe4ml, metric,
                self.nfold_hipe4ml, self.init_points, self.n_iter_hipe4ml,
                self.njobs_hipe4ml)

        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        """
        Train and test the model, then store predictions and model files.

        The predictions on the training and test sets are kept in
        ``ypredtrain_hipe4ml`` and ``ypredtest_hipe4ml``. The ModelHandler
        (.pkl) and the original model (.model) are written to ``dirmlout``.
        """
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        """
        Produce the standard set of plots and save them as PDF in ``dirmlplot``.

        The plots are: variable distributions, correlation matrices, model
        output for train and test, ROC curves (test only and train vs test),
        precision-recall and feature importance. ``do_hipe4mltrain`` must have
        been called before, since the stored predictions are used.
        """
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]
        # common suffix of all the output file names
        binlabel = f'pT_{self.p_binmin}_{self.p_binmax}'

        # _____________________________________________
        # distributions of the training variables
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                            hspace=0.55, wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_{binlabel}.pdf'
        plt.savefig(figname)
        plt.close('all')

        # _____________________________________________
        # correlation matrices, one figure per class
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            # make the figure current so that subplots_adjust acts on it
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_{binlabel}.pdf'
            figg.savefig(figname)

        # _____________________________________________
        # model output on the train and test sets
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model, self.traintestdata, 80,
            self.raw_output_hipe4ml, leglabels, self.train_test_log_hipe4ml,
            density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_{binlabel}.pdf'
        mloutputfig.savefig(figname)

        # _____________________________________________
        # ROC curve on the test set
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(
            self.traintestdata[3], self.ypredtest_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_{binlabel}.pdf'
        roccurvefig.savefig(figname)

        # _____________________________________________
        # ROC curves, train vs test
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_{binlabel}.pdf'
        roccurvettfig.savefig(figname)

        # _____________________________________________
        # precision-recall on the test set
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_{binlabel}.pdf'
        precisionrecallfig.savefig(figname)

        # _____________________________________________
        # feature importance, one file per returned figure
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'{binlabel}.pdf')
            fig.savefig(figname)