    # body of an `if` branch whose condition is not shown in this snippet:
    # here the pre-filtered dataset is read back from parquet
    df_data = pd.read_parquet('df/data_dataset')
    # df_data = uproot.open('../data/AnalysisResults.root')['LambdaTree'].arrays(library="pd")
    # df_data = df_data.append(df_data_r, ignore_index=True)
    df_data_cent = df_data.query(
        f'matter {split_ineq_sign} and centrality > {cent_bins[0]} and centrality < {cent_bins[1]} '
        f'and pt > 0.5 and pt < 3 and ct > {ct_bins[0]} and ct < {ct_bins[1]} '
        'and tpcClV0Pi > 69 and tpcClV0Pr > 69 and radius > 3')
    del df_data
    # apply the trained BDT and keep only candidates above the loosest score cut
    data_y_score = model_hdl.predict(df_data_cent)
    df_data_cent['model_output'] = data_y_score
    df_data_cent = df_data_cent.query(
        f'model_output > {score_eff_arrays_dict[bin][len(eff_array) - 1]}')
    df_data_cent.to_parquet(f'df/{bin}.parquet.gzip', compression='gzip')
else:
    # read the tree in chunks with hipe4ml instead of loading the full file at once
    df_data = TreeHandler()
    df_data.get_handler_from_large_file(
        DATA_PATH, "LambdaTree",
        preselection=(f'matter {split_ineq_sign} and centrality > {cent_bins[0]} '
                      f'and centrality < {cent_bins[1]} and pt > 0.5 and pt < 3 '
                      f'and ct > {ct_bins[0]} and ct < {ct_bins[1]}'),
        max_workers=8)
    df_data.apply_model_handler(model_hdl)
    df_data.apply_preselections(
        f'model_output > {score_eff_arrays_dict[bin][len(eff_array) - 1]}')
    df_data.write_df_to_parquet_files(bin, "df/")
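# Sketch of how an efficiency-to-score map like score_eff_arrays_dict can be
# built with hipe4ml's score_from_efficiency_array helper. The labels and
# scores below are toy data; in the analysis they would come from the test set
# used to train model_hdl in this ct bin.
import numpy as np
from hipe4ml import analysis_utils

rng = np.random.default_rng(42)
y_toy = rng.integers(0, 2, 1000)                  # toy truth labels (0 = bkg, 1 = sig)
score_toy = y_toy + rng.normal(0, 0.5, 1000)      # toy BDT scores
eff_array_toy = np.arange(0.10, 0.91, 0.01)       # target BDT efficiencies
# one score threshold per target efficiency
score_cuts = analysis_utils.score_from_efficiency_array(y_toy, score_toy, eff_array_toy)
# the last entry (highest efficiency) is the loosest cut, i.e. the threshold
# used in the model_output selections above
loosest_cut = score_cuts[len(eff_array_toy) - 1]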
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent

    # initialize the TreeHandler test workspace
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open the reference objects
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)
    terminate_tree_handler_test_workspace(test_dir)

    # test that the data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data DataFrame from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt DataFrame from parquet file differs from the root file one!'

    # test loading from multiple files
    merged_df = pd.concat(
        [data_hdlr.get_data_frame(), prompt_hdlr.get_data_frame()], ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(
        merged_df), 'loading of multiple root files not working!'
    merged_pq_df = pd.concat(
        [data_pq_hdlr.get_data_frame(), prompt_pq_hdlr.get_data_frame()], ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(
        merged_pq_df), 'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()

    # get the original variable list
    info_dict['data_var_list'] = data_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle the dataframes
    new_hndl = data_hdlr.shuffle_data_frame(size=10, random_state=5, inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after shuffling'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'
    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after the preselections'
    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()

    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply a dummy eval() on the underlying data frame
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after eval'
    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    info_dict['data_new_var_list'] = data_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both the data and the prompt data frame with respect to pT
    bins = [[0, 2], [2, 10], [10, 25]]
    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)

    # store the projection variable and the binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()
    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get the info from a single data slice
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)
    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test the info_dict reproduction
    assert info_dict == reference_dict, 'dictionary with the data info differs from the reference!'

    # test the sliced data frames reproduction
    assert data_slice_df.equals(
        reference_data_slice_df), 'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(
        reference_prompt_slice_df), 'prompt sliced DataFrame differs from the reference!'
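# Minimal sketch of the slicing API exercised above, runnable on a toy
# DataFrame. It assumes the set_data_frame setter available in recent hipe4ml
# versions; the leading underscore keeps pytest from collecting it.
def _slicing_api_sketch():
    toy_hdlr = TreeHandler()
    toy_hdlr.set_data_frame(pd.DataFrame({'pt_cand': [0.5, 1.5, 3.0, 12.0]}))
    toy_hdlr.slice_data_frame('pt_cand', [[0, 2], [2, 10], [10, 25]])
    print(toy_hdlr.get_projection_binning())  # [[0, 2], [2, 10], [10, 25]]
    print(toy_hdlr.get_slice(0))              # the two candidates with pt_cand between 0 and 2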
DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus')
PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus')

# store the number of candidates in the original data sample
INFO_DICT['n_data'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand()

# store the original variable list
INFO_DICT['data_var_list'] = DATA_HDLR.get_var_names()
INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names()

# apply preselections
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'
DATA_HDLR.apply_preselections(PRESEL_DATA)
PROMPT_HDLR.apply_preselections(PRESEL_PROMPT)

# store the number of selected data
INFO_DICT['n_data_preselected'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt_preselected'] = PROMPT_HDLR.get_n_cand()

# store the preselections
INFO_DICT['data_preselections'] = DATA_HDLR.get_preselections()
INFO_DICT['prompt_preselections'] = PROMPT_HDLR.get_preselections()

# apply a dummy eval() on the underlying data frame
DATA_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')
PROMPT_HDLR.eval_data_frame('d_len_z = sqrt(d_len**2 - d_len_xy**2)')

# store the new variable list
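# eval_data_frame is essentially a thin wrapper around pandas.DataFrame.eval,
# so the dummy expression above is equivalent to this plain-pandas sketch
# (kept as a comment so it does not run as part of this script):
#
#     df = pd.DataFrame({'d_len': [2.0, 5.0], 'd_len_xy': [1.0, 3.0]})
#     df.eval('d_len_z = sqrt(d_len**2 - d_len_xy**2)', inplace=True)
#     df['d_len_z'].tolist()  # [1.732..., 4.0], i.e. sqrt(3) and sqrt(16)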
def benchmark_hyperparam_optimizers(filename_dict, params, params_range, flag_dict,
                                    presel_dict, training_variables='', testsize=0.75):
    import time

    from sklearn.metrics import roc_auc_score

    n_run = 1
    data_path = filename_dict['data_path']
    analysis_path = filename_dict['analysis_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    # keep at most four background candidates per signal candidate
    background_ls.shuffle_data_frame(
        size=min(background_ls.get_n_cand(), mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)
    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    times = []
    roc = []
    for _ in range(n_run):
        start = time.time()
        model_hdl.optimize_params_bayes(train_test_data, params_range, 'roc_auc', njobs=-1)
        model_hdl.train_test_model(train_test_data)
        # predictions on the test set are used to evaluate the model performance
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc.append(roc_auc_score(train_test_data[3], y_pred_test))
        times.append(time.time() - start)
    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')

    # reset the ROC list so the Optuna results are not mixed with the Bayes ones
    roc = []
    for _ in range(n_run):
        # give Optuna the same time budget that the Bayesian optimization took on average
        model_hdl.optimize_params_optuna(train_test_data, params_range, 'roc_auc',
                                         timeout=np.mean(times), njobs=-1)
        model_hdl.train_test_model(train_test_data)
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        roc.append(roc_auc_score(train_test_data[3], y_pred_test))
    print('OPTUNA')
    print('Fixed time : ' + str(np.mean(times)))
    print('Mean ROC : ' + str(np.mean(roc)))
    print('--------------\n')
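# Hypothetical invocation of the benchmark above. Every path, file name, table
# name and hyper-parameter range below is a placeholder, not a value from the
# analysis; params and flag_dict are accepted but unused in the body shown.
if __name__ == '__main__':
    filename_dict = {
        'data_path': '../data/',
        'analysis_path': '../analysis/',
        'MC_signal_filename': 'SignalTable.root',
        'MC_signal_table': 'SignalTable',
        'train_bckg_filename': 'DataTable.root',
        'train_bckg_table': 'DataTable',
    }
    params_range = {
        'max_depth': (2, 4),
        'learning_rate': (0.02, 0.2),
        'n_estimators': (200, 1000),
    }
    presel_dict = {'train_bckg_presel': 'pt > 0.5'}  # placeholder selection
    benchmark_hyperparam_optimizers(filename_dict, {}, params_range, {}, presel_dict)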
if training:
    signalH = TreeHandler(path_to_data + signal_table_name, "SignalTable")
    bkgH = TreeHandler(path_to_data + bkg_table_name, "DataTable")

    if bkg_fraction is not None:
        # downsample the background to a fixed multiple of the signal size
        bkgH.shuffle_data_frame(size=bkg_fraction * signalH.get_n_cand(),
                                inplace=True, random_state=52)

    if pp_mode:
        # apply the signal preselections before building the training/test
        # sets, otherwise they would have no effect on them
        signalH.apply_preselections("pt > 0 and rej_accept == True")

    train_test_data = au.train_test_generator([signalH, bkgH], [1, 0],
                                              test_size=0.5, random_state=42)

    if pp_mode:
        training_columns = [
            "pt", "cos_pa", "tpc_ncls_de", "tpc_ncls_pr", "tpc_ncls_pi",
            "tpc_nsig_de", "tpc_nsig_pr", "tpc_nsig_pi", "dca_de_pr",
            "dca_de_pi", "dca_pr_pi", "dca_de_sv", "dca_pr_sv", "dca_pi_sv",
            "chi2"
        ]
    else:
        training_columns = [
            'TPCnSigmaHe3', 'ct', 'V0CosPA', 'ProngsDCA', 'He3ProngPvDCA',
            'PiProngPvDCA', 'He3ProngPvDCAXY', 'PiProngPvDCAXY',
            'NpidClustersHe3', 'TPCnSigmaPi'
        ]

    if not os.path.exists(results_ml_path):
        os.makedirs(results_ml_path)
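    # Minimal continuation sketch: the snippet above stops before the actual
    # training. The XGBoost hyper-parameters and the output file name are
    # placeholders, not values from the analysis.
    import xgboost as xgb
    from hipe4ml.model_handler import ModelHandler

    model_clf = xgb.XGBClassifier(max_depth=3, n_estimators=500, learning_rate=0.05)
    model_hdl = ModelHandler(model_clf, training_columns)
    model_hdl.train_test_model(train_test_data)
    # persist the trained handler next to the other ML results
    model_hdl.dump_model_handler(os.path.join(results_ml_path, 'model_hdl.pkl'))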