train_test_data = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
if CREATE_TRAIN_TEST and (COMPUTE_SCORES_FROM_EFF or TRAIN):
    df_signal_ct = df_signal.query(
        f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3 and isReconstructed '
        'and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3')
    df_background_ct = df_background.query(
        f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3 '
        'and (mass < 1.1 or mass > 1.13) and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3')

    # define tree handlers
    signal_tree_handler = TreeHandler()
    background_tree_handler = TreeHandler()
    signal_tree_handler.set_data_frame(df_signal_ct)
    background_tree_handler.set_data_frame(df_background_ct)
    del df_signal_ct, df_background_ct

    # split data into training and test set
    train_test_data = train_test_generator(
        [signal_tree_handler, background_tree_handler], [1, 0],
        test_size=0.5, random_state=RANDOM_STATE)
    train_test_data[0]['y_true'] = train_test_data[1]
    train_test_data[2]['y_true'] = train_test_data[3]
    train_test_data[0].to_parquet(
        f'df/train_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip', compression='gzip')
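# Sketch (not from the source) of reading the saved training set back:
# train_test_generator returns [train_df, y_train, test_df, y_test], and the
# labels were stored above in a 'y_true' column. The bin edges 0 and 2 below
# are placeholders for one entry of CT_BINS.
import pandas as pd

df_train = pd.read_parquet('df/train_data_0_2.parquet.gzip')
y_train = df_train['y_true']
x_train = df_train.drop(columns=['y_true'])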
def main():  # pylint: disable=too-many-statements, too-many-branches
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    OutputLabels = [inputCfg['output']['out_labels']['Bkg'],
                    inputCfg['output']['out_labels']['Prompt']]
    if inputCfg['output']['out_labels']['FD'] is not None:
        OutputLabels.append(inputCfg['output']['out_labels']['FD'])
    ColumnsToSave = inputCfg['appl']['column_to_save_list']
    if not isinstance(ColumnsToSave, list):
        print('\033[91mERROR: column_to_save_list must be defined!\033[0m')
        sys.exit()
    ModelList = inputCfg['ml']['saved_models']
    ModelHandls = []
    for iBin in range(len(PtBins)):
        ModelPath = ModelList[iBin]
        if not isinstance(ModelPath, str):
            print('\033[91mERROR: path to model not correctly defined!\033[0m')
            sys.exit()
        ModelPath = os.path.expanduser(ModelPath)
        print(f'Loaded saved model: {ModelPath}')
        ModelHandl = ModelHandler()
        ModelHandl.load_model_handler(ModelPath)
        ModelHandls.append(ModelHandl)

    for inputFile, outName in zip(inputCfg['standalone_appl']['inputs'],
                                  inputCfg['standalone_appl']['output_names']):
        print(f'Loading and preparing data file {inputFile}: ...', end='\r')
        DataHandler = TreeHandler(inputFile)
        DataHandler.slice_data_frame('pt_cand', PtBins, True)
        print(f'Loading and preparing data file {inputFile}: Done!')

        print('Applying ML model to dataframes: ...', end='\r')
        for iBin, PtBin in enumerate(PtBins):
            OutPutDirPt = os.path.join(
                os.path.expanduser(inputCfg['standalone_appl']['output_dir']),
                f'pt{PtBin[0]}_{PtBin[1]}')
            if os.path.isdir(OutPutDirPt):
                print((f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                       ' possibly overwriting ongoing output!\033[0m'))
            else:
                os.makedirs(OutPutDirPt)
            DataDfPtSel = DataHandler.get_slice(iBin)
            yPred = ModelHandls[iBin].predict(DataDfPtSel, inputCfg['ml']['raw_output'])
            # work on a copy, otherwise remove() below would mutate the shared
            # list across input files and pT bins
            ColumnsToSaveFinal = list(ColumnsToSave)
            if 'inv_mass' not in ColumnsToSaveFinal:
                print('\033[93mWARNING: inv_mass is not going to be saved in the output dataframe!\033[0m')
            if 'pt_cand' not in ColumnsToSaveFinal:
                print('\033[93mWARNING: pt_cand is not going to be saved in the output dataframe!\033[0m')
            if 'pt_B' in ColumnsToSaveFinal and 'pt_B' not in DataDfPtSel.columns:
                ColumnsToSaveFinal.remove('pt_B')  # only in MC
            DataDfPtSel = DataDfPtSel.loc[:, ColumnsToSaveFinal]
            if ModelHandls[iBin].get_n_classes() < 3:
                DataDfPtSel['ML_output'] = yPred
            else:
                for Pred, Lab in enumerate(OutputLabels):
                    DataDfPtSel[f'ML_output_{Lab}'] = yPred[:, Pred]
            DataDfPtSel.to_parquet(
                f'{OutPutDirPt}/{outName}_pT_{PtBin[0]}_{PtBin[1]}_ModelApplied.parquet.gzip')
            del DataDfPtSel
        print('Applying ML model to dataframes: Done!')
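# Illustrative sketch of the configuration keys read above, written as a Python
# dict (the real file is YAML); all values are placeholders, not from the source.
input_cfg_sketch = {
    'pt_ranges': {'min': [1, 2], 'max': [2, 4]},
    'output': {'out_labels': {'Bkg': 'Bkg', 'Prompt': 'Prompt', 'FD': None}},
    'appl': {'column_to_save_list': ['inv_mass', 'pt_cand']},
    'ml': {'saved_models': ['pt1_2/ModelHandler.pickle', 'pt2_4/ModelHandler.pickle'],
           'raw_output': False},
    'standalone_appl': {'inputs': ['data.root'],
                        'output_names': ['Data'],
                        'output_dir': '~/ml_application'},
}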
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent
    # initialize TreeHandler test
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open reference objects
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)
    terminate_tree_handler_test_workspace(test_dir)

    # test that data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data DataFrame from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt DataFrame from parquet file differs from the root file one!'

    # test loading from multiple files
    merged_df = pd.concat([data_hdlr.get_data_frame(), prompt_hdlr.get_data_frame()],
                          ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(merged_df), \
        'loading of multiple root files not working!'
    merged_pq_df = pd.concat([data_pq_hdlr.get_data_frame(), prompt_pq_hdlr.get_data_frame()],
                             ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(merged_pq_df), \
        'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()
    # get the original variable list
    info_dict['data_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle dataframes
    new_hndl = data_hdlr.shuffle_data_frame(size=10, random_state=5, inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'in-place shuffled dataframe differs from the not in-place one!'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'
    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'in-place preselected dataframe differs from the not in-place one!'
    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()
    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply dummy eval() on the underlying data frame
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'in-place evaluated dataframe differs from the not in-place one!'
    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    info_dict['data_new_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both data and prompt data frames with respect to pT
    bins = [[0, 2], [2, 10], [10, 25]]
    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)
    # store projection variable and binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()
    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get info from a single data slice
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)
    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test info_dict reproduction
    assert info_dict == reference_dict, \
        'dictionary with the data info differs from the reference!'
    # test sliced data frames reproduction
    assert data_slice_df.equals(reference_data_slice_df), \
        'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(reference_prompt_slice_df), \
        'prompt sliced DataFrame differs from the reference!'
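# Sketch (not part of the test above) of the slicing API it exercises: after
# slice_data_frame, get_slice(i) returns the DataFrame restricted to the i-th
# bin. File and tree names below are placeholders.
from hipe4ml.tree_handler import TreeHandler

hdlr = TreeHandler('Bkg_Dpluspp7TeV_pT_1_50.root', 'treeMLDplus')
hdlr.slice_data_frame('pt_cand', [[0, 2], [2, 10], [10, 25]])
low_pt_df = hdlr.get_slice(0)  # candidates with 0 < pt_cand < 2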
def main():  # pylint: disable=too-many-statements
    # read config file
    parser = argparse.ArgumentParser(description='Arguments to pass')
    parser.add_argument('cfgFileName', metavar='text', default='cfgFileNameML.yml',
                        help='config file name for ml')
    parser.add_argument('--train', help='perform only training and testing', action='store_true')
    parser.add_argument('--apply', help='perform only application', action='store_true')
    args = parser.parse_args()

    print('Loading analysis configuration: ...', end='\r')
    with open(args.cfgFileName, 'r') as ymlCfgFile:
        inputCfg = yaml.load(ymlCfgFile, yaml.FullLoader)
    print('Loading analysis configuration: Done!')

    print('Loading and preparing data files: ...', end='\r')
    PromptHandler = TreeHandler(inputCfg['input']['prompt'], inputCfg['input']['treename'])
    FDHandler = None if inputCfg['input']['FD'] is None else TreeHandler(
        inputCfg['input']['FD'], inputCfg['input']['treename'])
    DataHandler = TreeHandler(inputCfg['input']['data'], inputCfg['input']['treename'])

    if inputCfg['data_prep']['filt_bkg_mass']:
        BkgHandler = DataHandler.get_subset(inputCfg['data_prep']['filt_bkg_mass'], frac=1.,
                                            rndm_state=inputCfg['data_prep']['seed_split'])
    else:
        BkgHandler = DataHandler

    PtBins = [[a, b] for a, b in zip(inputCfg['pt_ranges']['min'],
                                     inputCfg['pt_ranges']['max'])]
    PromptHandler.slice_data_frame('pt_cand', PtBins, True)
    if FDHandler is not None:
        FDHandler.slice_data_frame('pt_cand', PtBins, True)
    DataHandler.slice_data_frame('pt_cand', PtBins, True)
    BkgHandler.slice_data_frame('pt_cand', PtBins, True)
    print('Loading and preparing data files: Done!')

    for iBin, PtBin in enumerate(PtBins):
        print(f'\n\033[94mStarting ML analysis --- {PtBin[0]} < pT < {PtBin[1]} GeV/c\033[0m')

        OutPutDirPt = os.path.join(os.path.expanduser(inputCfg['output']['dir']),
                                   f'pt{PtBin[0]}_{PtBin[1]}')
        if os.path.isdir(OutPutDirPt):
            print((f'\033[93mWARNING: Output directory \'{OutPutDirPt}\' already exists,'
                   ' possibly overwriting ongoing output!\033[0m'))
        else:
            os.makedirs(OutPutDirPt)

        # data preparation
        # _____________________________________________
        FDDfPt = pd.DataFrame() if FDHandler is None else FDHandler.get_slice(iBin)
        TrainTestData, PromptDfSelForEff, FDDfSelForEff = data_prep(
            inputCfg, iBin, PtBin, OutPutDirPt, PromptHandler.get_slice(iBin),
            FDDfPt, BkgHandler.get_slice(iBin))
        if args.apply and inputCfg['data_prep']['test_fraction'] < 1.:
            print('\033[93mWARNING: Using only a fraction of the MC for the application! '
                  'Are you sure?\033[0m')

        # training, testing
        # _____________________________________________
        if not args.apply:
            ModelHandl = train_test(inputCfg, PtBin, OutPutDirPt, TrainTestData, iBin)
        else:
            ModelList = inputCfg['ml']['saved_models']
            ModelPath = ModelList[iBin]
            if not isinstance(ModelPath, str):
                print('\033[91mERROR: path to model not correctly defined!\033[0m')
                sys.exit()
            ModelPath = os.path.expanduser(ModelPath)
            print(f'Loaded saved model: {ModelPath}')
            ModelHandl = ModelHandler()
            ModelHandl.load_model_handler(ModelPath)

        # model application
        # _____________________________________________
        if not args.train:
            appl(inputCfg, PtBin, OutPutDirPt, ModelHandl,
                 DataHandler.get_slice(iBin), PromptDfSelForEff, FDDfSelForEff)

        # delete dataframes to release memory (deleting only the loop variable,
        # as before, would not free the underlying frames)
        del TrainTestData, PromptDfSelForEff, FDDfSelForEff
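# Typical invocations of the script above (the script name is assumed for
# illustration; the flags come from the argparse setup):
#   python MLClassification.py cfgFileNameML.yml            # train, test and apply
#   python MLClassification.py cfgFileNameML.yml --train    # training and testing only
#   python MLClassification.py cfgFileNameML.yml --apply    # application with saved models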
# PLOT FEATURES DISTRIBUTIONS AND CORRELATIONS
######################################################
df_background = uproot.open(os.path.expandvars(BKG_PATH))['LambdaTree'].arrays(library="pd")
df_prompt_ct = df_signal.query('pt > 0 and pt < 3.5 and flag==1')  # pt cut?
df_non_prompt_ct = df_signal.query('pt > 0 and pt < 3.5 and flag==2')  # pt cut?
df_background_ct = df_background.query('pt > 0 and pt < 3.5')  # pt cut?
# print(df_prompt_ct.keys())
# print(df_background_ct.keys())

# define tree handlers
prompt_tree_handler = TreeHandler()
non_prompt_tree_handler = TreeHandler()
background_tree_handler = TreeHandler()
prompt_tree_handler.set_data_frame(df_prompt_ct)
non_prompt_tree_handler.set_data_frame(df_non_prompt_ct)
background_tree_handler.set_data_frame(df_background_ct)
del df_prompt_ct, df_non_prompt_ct, df_background_ct

if not os.path.isdir(f'{PLOT_DIR}/features'):
    os.mkdir(f'{PLOT_DIR}/features')
leg_labels = ['background', 'non-prompt', 'prompt']
plot_distr = plot_utils.plot_distr(
    [background_tree_handler, non_prompt_tree_handler, prompt_tree_handler],
    # the snippet was truncated after the handler list; the remaining arguments
    # are completed here following the analogous two-class call further below
    TRAINING_COLUMNS_LIST, bins=40, labels=leg_labels, log=True, density=True,
    figsize=(10, 12), alpha=0.5, grid=False)
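# Correlation matrices usually accompany these distributions; a minimal sketch
# using the same handlers (plot_utils.plot_corr is used elsewhere in these
# scripts; TRAINING_COLUMNS_LIST is assumed defined as above):
corr_figs = plot_utils.plot_corr(
    [background_tree_handler, non_prompt_tree_handler, prompt_tree_handler],
    TRAINING_COLUMNS_LIST, leg_labels)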
def benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict,
                                    presel_dict, training_variables='', testsize=0.75):
    import time
    from sklearn.metrics import roc_auc_score

    N_run = 1
    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    background_ls.shuffle_data_frame(
        size=min(background_ls.get_n_cand(), mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)
    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []
    for i in range(N_run):
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1)
        model_hdl.train_test_model(train_test_data)
        y_pred_test = model_hdl.predict(train_test_data[2], True)  # used to evaluate model performance
        roc.append(roc_auc_score(train_test_data[3], y_pred_test))
        times.append(time.time() - start)
    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(np.mean(times)))  # was np.mean(time): 'time' is the module
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    roc = []  # reset, otherwise the Optuna scores would be mixed with the Bayes ones
    for i in range(N_run):
        model_hdl.optimize_params_optuna(train_test_data, params_range, 'roc_auc',
                                         timeout=np.mean(times), njobs=-1)
        model_hdl.train_test_model(train_test_data)
        y_pred_test = model_hdl.predict(train_test_data[2], True)  # used to evaluate model performance
        roc.append(roc_auc_score(train_test_data[3], y_pred_test))
    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')
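# Hypothetical driver for the benchmark above; the dict keys mirror what the
# function reads, while file names, selections and ranges are placeholders.
filename_dict = {'data_path': './data/', 'analysis_path': './analysis/',
                 'MC_signal_filename': 'SignalTable.root', 'MC_signal_table': 'SignalTable',
                 'train_bckg_filename': 'DataTable.root', 'train_bckg_table': 'DataTable'}
presel_dict = {'train_bckg_presel': 'mass < 2.98 or mass > 3.005'}
params_range = {'max_depth': (2, 8), 'learning_rate': (0.01, 0.3)}
benchmark_hyperparam_optimizers(filename_dict, {}, params_range, {}, presel_dict)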
# make dataframe directory
if not os.path.isdir('df'):
    os.mkdir('df')

score_eff_arrays_dict = dict()

for ct_bins in CT_BINS:
    train_test_data = [pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()]
    if CREATE_TRAIN_TEST and (COMPUTE_SCORES_FROM_EFF or TRAIN):
        df_prompt_ct = df_signal.query(
            f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 '
            'and isReconstructed and (flag==1) and tpcClV0Pi > 69 and tpcClV0Pr > 69 '
            'and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 '
            'and eta < 0.8 and eta > -0.8')
        df_nonprompt_ct = df_signal.query(
            f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 '
            'and isReconstructed and (flag==2 or flag==4) and tpcClV0Pi > 69 and tpcClV0Pr > 69 '
            'and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 '
            'and eta < 0.8 and eta > -0.8')
        df_background_ct = df_background.query(
            f'ct > {ct_bins[0]} and ct < {ct_bins[1]} and pt > 0.5 and pt < 3.5 '
            'and tpcClV0Pi > 69 and tpcClV0Pr > 69 '
            'and radius > 3 and radius < 50 and dcaPrPV < 10 and dcaPiPV < 10 '
            'and eta < 0.8 and eta > -0.8')

        # define tree handlers
        prompt_tree_handler = TreeHandler()
        nonprompt_tree_handler = TreeHandler()
        background_tree_handler = TreeHandler()
        prompt_tree_handler.set_data_frame(df_prompt_ct)
        nonprompt_tree_handler.set_data_frame(df_nonprompt_ct)
        background_tree_handler.set_data_frame(df_background_ct)
        del df_prompt_ct, df_nonprompt_ct, df_background_ct

        # split data into training and test set
        train_test_data = train_test_generator(
            [background_tree_handler, nonprompt_tree_handler, prompt_tree_handler],
            [0, 1, 2], test_size=0.2, random_state=RANDOM_STATE)
        train_test_data[0]['y_true'] = train_test_data[1]
        train_test_data[2]['y_true'] = train_test_data[3]
        train_test_data[0].to_parquet(
            f'df/train_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip', compression='gzip')
        train_test_data[2].to_parquet(
            f'df/test_data_{ct_bins[0]}_{ct_bins[1]}.parquet.gzip', compression='gzip')
        # continue
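        # In the three-class split above the integer labels are the ones passed
        # to train_test_generator: 0 = background, 1 = non-prompt, 2 = prompt.
        # A quick class-balance check (sketch, not from the source; numpy is
        # assumed imported as np, as elsewhere in these scripts):
        labels, counts = np.unique(train_test_data[1], return_counts=True)
        print(dict(zip(labels, counts)))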
# %%
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import uproot
import os

from hipe4ml.tree_handler import TreeHandler

# %%
data = TreeHandler(
    os.path.abspath(os.getcwd()) + '/data/SignalTable_pp13TeV_mtexp.root',
    "SignalTable").get_data_frame()

# %%
training_variables = [
    "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi",
    "tpc_nsig_de", "tpc_nsig_pr", "tpc_nsig_pi",
    "dca_de_pr", "dca_de_pi", "dca_pr_pi",
    "dca_de_sv", "dca_pr_sv", "dca_pi_sv", "chi2"
]
print(list(data.columns))
data.head(10)

# %%
# sns.pairplot(data.query('gReconstructed > 0').sample(1000), hue='gReconstructed',
#              plot_kws={'alpha': 0.1}, corner=True)
# data.query('gReconstructed > 0').hist(bins=50, figsize=(20, 20));

# %%
rec = data.query('gReconstructed > 0').copy()
rec_rej_acc = rec.query('rej_accept > 0').copy()
print(len(rec) / len(data))
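# %%
# Sketch (not in the original notebook): a quick look at one of the training
# variables for the reconstructed candidates selected above.
rec['cos_pa'].hist(bins=50)
plt.xlabel('cos_pa')
plt.show()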
del df_generated_cent

######################################################################
# second condition needed because of issue with Qt libraries
if MAKE_FEATURES_PLOTS and not MAKE_PRESELECTION_EFFICIENCY and not TRAIN:
    ######################################################
    # PLOT FEATURES DISTRIBUTIONS AND CORRELATIONS
    ######################################################
    df_background = uproot.open(os.path.expandvars(BKG_PATH))['LambdaTree'].arrays(library="pd")
    df_signal_ct = df_signal.query('pt > 0.5 and pt < 3 and ((flag & 1) == 1) and isReconstructed')  # pt cut?
    df_background_ct = df_background.query('pt > 0.5 and pt < 3 and ( mass < 1.105 or mass > 1.13 )')  # pt cut?

    # define tree handlers
    signal_tree_handler = TreeHandler()
    background_tree_handler = TreeHandler()
    signal_tree_handler.set_data_frame(df_signal_ct)
    background_tree_handler.set_data_frame(df_background_ct)
    del df_signal_ct, df_background_ct

    if not os.path.isdir(f'{PLOT_DIR}/features'):
        os.mkdir(f'{PLOT_DIR}/features')
    leg_labels = ['background', 'signal']
    plot_distr = plot_utils.plot_distr(
        [background_tree_handler, signal_tree_handler], TRAINING_COLUMNS_LIST,
        bins=40, labels=leg_labels, log=True, density=True, figsize=(10, 12),
        alpha=0.5, grid=False)
    plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                        hspace=0.50, wspace=0.50)
    plt.tight_layout()
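    # presumably followed by saving into the directory created above; a hedged
    # sketch of that step (the file name is a placeholder):
    plt.savefig(f'{PLOT_DIR}/features/features_distributions.pdf')
    plt.close('all')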
bin = f'{split}_{cent_bins[0]}_{cent_bins[1]}_{ct_bins[0]}_{ct_bins[1]}'  # note: shadows the builtin bin()

##############################################################
# TRAINING AND TEST SET PREPARATION
##############################################################
df_signal_cent_ct = df_signal.query(
    f'ArmenterosAlpha {split_ineq_sign} 0 and centrality > {cent_bins[0]} '
    f'and centrality < {cent_bins[1]} and ct > {ct_bins[0]} and ct < {ct_bins[1]}')
df_background_cent_ct = df_background.query(
    f'ArmenterosAlpha {split_ineq_sign} 0 and centrality > {cent_bins[0]} '
    f'and centrality < {cent_bins[1]} and ct > {ct_bins[0]} and ct < {ct_bins[1]}')
# df_signal_cent_ct = df_signal_cent_ct[TRAINING_COLUMNS_LIST]
# df_background_cent_ct = df_background_cent_ct[TRAINING_COLUMNS_LIST]

# define tree handlers
signal_tree_handler = TreeHandler()
background_tree_handler_full = TreeHandler()
signal_tree_handler.set_data_frame(df_signal_cent_ct)
background_tree_handler_full.set_data_frame(df_background_cent_ct)
del df_signal_cent_ct
del df_background_cent_ct

# downscale background
background_tree_handler = background_tree_handler_full.get_subset(
    size=int(0.8 * signal_tree_handler.get_n_cand()), rndm_state=RANDOM_STATE)
del background_tree_handler_full

# features plot (see the sketch below)
leg_labels = ['background', 'signal']
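# The features plot itself is not part of this snippet; a minimal sketch along
# the lines of the other scripts here, assuming TRAINING_COLUMNS_LIST is
# defined as elsewhere in this file:
plot_utils.plot_distr(
    [background_tree_handler, signal_tree_handler], TRAINING_COLUMNS_LIST,
    bins=40, labels=leg_labels, log=True, density=True, figsize=(10, 12),
    alpha=0.5, grid=False)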
REFERENCE_DIR = BASE_DIR.joinpath('references')

# define paths for files
DATA_FILE_PATH = DATA_DIR.joinpath('Bkg_Dpluspp7TeV_pT_1_50.root')
PROMPT_FILE_PATH = DATA_DIR.joinpath('Prompt_Dpluspp7TeV_pT_1_50.root')

# define dictionary for storing reference for the tests
INFO_DICT = {}

# preliminary check
if not REFERENCE_DIR.is_dir():
    sys.exit("No 'references' dir was found, so no reference data can be produced!")

# instantiate tree handler objects
DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus')
PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus')

# store number of candidates in the original data sample
INFO_DICT['n_data'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand()
# store original variable list
INFO_DICT['data_var_list'] = PROMPT_HDLR.get_var_names()
INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names()

# apply preselections
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'
DATA_HDLR.apply_preselections(PRESEL_DATA)
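# Presumably the dict is then pickled into REFERENCE_DIR for the test shown
# above; a hedged sketch of that step (the file name is a guess):
import pickle

with open(REFERENCE_DIR.joinpath('reference_dict.pickle'), 'wb') as handle:
    pickle.dump(INFO_DICT, handle)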
class Optimiserhipe4mltree:
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel, hyper_pars):
        self.logger = get_logger()

        # directory
        # self.do_mlprefilter = datap.get("doml_asprefilter", None)
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]
        # if self.do_mlprefilter is True:
        #     self.dirmodel = self.dirmodel + "/prefilter"
        #     self.dirmlplot = self.dirmlplot + "/prefilter"
        # if self.do_mlprefilter is False:
        #     self.dirmodel = self.dirmodel + "/analysis"
        #     self.dirmlplot = self.dirmlplot + "/analysis"

        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  # "inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        self.prompthandler = None
        self.signalhandler = None  # set in preparesample()
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None
        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels", None)

        self.logger.info("Using the following training variables: %s", self.v_train)

    def preparesample(self):
        self.logger.info("Prepare Sample for hipe4ml")
        self.signalhandler = TreeHandler(self.inputtreemc, 'treeMLDplus')
        nsigcand = self.signalhandler.get_n_cand()
        self.datahandler = TreeHandler(self.inputtreedata, 'treeMLDplus')
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=nsigcand * self.v_bkgoversigfrac)
        self.traintestdata = train_test_generator([self.signalhandler, self.bkghandler],
                                                  [self.v_sig, self.v_bkg],
                                                  test_size=self.test_frac,
                                                  random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        self.logger.info("Loading hipe4ml model")
        self.v_train = self.signalhandler.get_var_names()
        self.v_train.remove('inv_mass')
        self.v_train.remove('pt_cand')
        model_xgboost = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(model_xgboost, self.v_train)

    def set_hipe4ml_modelpar(self):
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted'] and
                self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        outfilehyppars = open(hypparsfile, 'wt')
        sys.stdout = outfilehyppars
        self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                   self.bayesoptconfig_hipe4ml,
                                                   metric, self.nfold_hipe4ml,
                                                   self.init_points, self.n_iter_hipe4ml,
                                                   self.njobs_hipe4ml)
        outfilehyppars.close()
        sys.stdout = sys.__stdout__
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(self.traintestdata[0],
                                                               self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(self.traintestdata[2],
                                                              self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06, bottom=0.06, right=0.99, top=0.96,
                            hspace=0.55, wspace=0.55)
        figname = f'{self.dirmlplot}/DistributionsAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        plt.savefig(figname)
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr([self.bkghandler, self.signalhandler],
                                             self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figname = f'{self.dirmlplot}/CorrMatrix{labb}_pT_{self.p_binmin}_{self.p_binmax}.pdf'
            figg.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(self.p_hipe4ml_model,
                                                        self.traintestdata, 80,
                                                        self.raw_output_hipe4ml,
                                                        leglabels,
                                                        self.train_test_log_hipe4ml,
                                                        density=True)
        figname = f'{self.dirmlplot}/MLOutputDistr_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        mloutputfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3], self.ypredtest_hipe4ml,
                                          None, leglabels, self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvefig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(self.traintestdata[3],
                                                       self.ypredtest_hipe4ml,
                                                       self.traintestdata[1],
                                                       self.ypredtrain_hipe4ml, None,
                                                       leglabels,
                                                       self.average_method_hipe4ml,
                                                       self.roc_method_hipe4ml)
        figname = f'{self.dirmlplot}/ROCCurveTrainTest_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        roccurvettfig.savefig(figname)
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(self.traintestdata[3],
                                                              self.ypredtest_hipe4ml,
                                                              leglabels)
        figname = f'{self.dirmlplot}/PrecisionRecallAll_pT_{self.p_binmin}_{self.p_binmax}.pdf'
        precisionrecallfig.savefig(figname)
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            figname = (f'{self.dirmlplot}/FeatureImportanceOpt{i}_'
                       f'pT_{self.p_binmin}_{self.p_binmax}.pdf')
            fig.savefig(figname)
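# Hypothetical driver for the class above, assuming a data_param dict with the
# keys read in __init__ (all argument values are placeholders):
opt = Optimiserhipe4mltree(data_param, binmin=2, binmax=4, training_var=None,
                           bkg_sel='inv_mass < 1.82 or 1.92 < inv_mass < 2.00',
                           hyper_pars={'max_depth': 3, 'n_estimators': 500})
opt.set_hipe4ml_modelpar()
opt.do_hipe4mltrain()
opt.do_hipe4mlplot()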
MODEL_PARAMS = params['XGBOOST_PARAMS']
HYPERPARAMS = params['HYPERPARAMS']

path_to_data = "../Tables/"
results_ml_path = "../Results/PlotsML"
ml_model_path = "../Utils/Models"
utils_path = "../Utils"
efficiencies_path = "../Utils/Efficiencies"
selected_df_path = "../Utils/ReducedDataFrames"

print("---------------------------------------------")
print("Data loading...")

if training:
    signalH = TreeHandler(path_to_data + signal_table_name, "SignalTable")
    bkgH = TreeHandler(path_to_data + bkg_table_name, "DataTable")
    if bkg_fraction is not None:
        bkgH.shuffle_data_frame(size=bkg_fraction * len(signalH), inplace=True,
                                random_state=52)
    train_test_data = au.train_test_generator([signalH, bkgH], [1, 0],
                                              test_size=0.5, random_state=42)
    if pp_mode:
        signalH.apply_preselections("pt>0 and rej_accept==True")
    training_columns = [
        "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi",
        # snippet truncated here in the source
    ]
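    # The usual continuation in these scripts (an assumption, not from the
    # source): build the hipe4ml model handler from the parameters loaded above
    # (xgb and ModelHandler assumed imported as elsewhere in this file).
    model_clf = xgb.XGBClassifier(**MODEL_PARAMS)
    model_hdl = ModelHandler(model_clf, training_columns, HYPERPARAMS)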