예제 #1
0
def test_tree_handler():  # pylint: disable=too-many-statements
    """
    Test the TreeHandler class functionalities.

    The test builds handlers from the files prepared by
    init_tree_handler_test_workspace, checks that the same content is
    obtained from the different input files (single and multiple), and
    verifies that the in-place and not-in-place versions of shuffling,
    preselection and eval give identical data frames. It then collects a
    set of quantities in an info dictionary and compares that dictionary
    and one slice per sample with the pickled references.

    The handlers are modified in place along the way, so the order of the
    steps below matters for the comparison with the references.
    """
    # define the working directory
    test_dir = Path(__file__).resolve().parent

    # initialize TreeHandler test
    test_data, references = init_tree_handler_test_workspace(test_dir)

    # instantiate tree handler objects: entries 0 and 1 are opened with an
    # explicit tree name, entries 2 and 3 without one (according to the
    # assertion messages below these are the root and parquet inputs)
    data_hdlr = TreeHandler(test_data[0], 'treeMLDplus')
    prompt_hdlr = TreeHandler(test_data[1], 'treeMLDplus')
    data_pq_hdlr = TreeHandler(test_data[2])
    prompt_pq_hdlr = TreeHandler(test_data[3])
    mult_hdlr = TreeHandler(test_data[:2], 'treeMLDplus')
    mult_pq_hdlr = TreeHandler(test_data[2:])

    # open reference objects: two pickled data frames and one pickled dict
    reference_data_slice_df = pd.read_pickle(references[0])
    reference_prompt_slice_df = pd.read_pickle(references[1])
    with open(references[2], 'rb') as handle:
        reference_dict = pickle.load(handle)

    # everything needed has been loaded at this point, so the workspace is
    # released before running the assertions
    terminate_tree_handler_test_workspace(test_dir)

    # test that data is the same in root and parquet
    assert data_hdlr.get_data_frame().equals(data_pq_hdlr.get_data_frame()), \
        'data Dataframe from parquet file differs from the root file one!'
    assert prompt_hdlr.get_data_frame().equals(prompt_pq_hdlr.get_data_frame()), \
        'prompt Dataframe from parquet file differs from the root file one!'

    # test loading from multiple files: the result must equal the
    # concatenation (with a fresh index) of the single-file data frames
    merged_df = pd.concat(
        [data_hdlr.get_data_frame(),
         prompt_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_hdlr.get_data_frame().equals(
        merged_df), 'loading of multiple root files not working!'
    merged_pq_df = pd.concat(
        [data_pq_hdlr.get_data_frame(),
         prompt_pq_hdlr.get_data_frame()],
        ignore_index=True)
    assert mult_pq_hdlr.get_data_frame().equals(
        merged_pq_df), 'loading of multiple parquet files not working!'

    # define the info dict that will be compared with the reference
    info_dict = {}

    # get the number of candidates in the original data sample
    info_dict['n_data'] = data_hdlr.get_n_cand()
    info_dict['n_prompt'] = prompt_hdlr.get_n_cand()

    # get the original variable list
    # NOTE(review): 'data_var_list' is read from prompt_hdlr, not data_hdlr.
    # The pickled reference may have been produced the same way, so confirm
    # before changing it.
    info_dict['data_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_var_list'] = prompt_hdlr.get_var_names()

    # shuffle dataframes: the in-place shuffle is done on a deep copy so
    # that data_hdlr itself is left untouched for the following steps
    new_hndl = data_hdlr.shuffle_data_frame(size=10,
                                            random_state=5,
                                            inplace=False)
    copied_hndl = copy.deepcopy(data_hdlr)
    copied_hndl.shuffle_data_frame(size=10, random_state=5, inplace=True)
    assert copied_hndl.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after shuffling'

    # apply preselections
    preselections_data = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
    preselections_prompt = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

    # the not-in-place call must come first: the second call modifies
    # data_hdlr itself
    new_hndl = data_hdlr.apply_preselections(preselections_data, inplace=False)
    data_hdlr.apply_preselections(preselections_data)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after the preselections'

    prompt_hdlr.apply_preselections(preselections_prompt)

    # get the number of selected data
    info_dict['n_data_preselected'] = data_hdlr.get_n_cand()
    info_dict['n_prompt_preselected'] = prompt_hdlr.get_n_cand()

    # get the preselections
    info_dict['data_preselections'] = data_hdlr.get_preselections()
    info_dict['prompt_preselections'] = prompt_hdlr.get_preselections()

    # apply dummy eval() on the underlying data frame (adds the d_len_z column)
    d_len_z_def = 'd_len_z = sqrt(d_len**2 - d_len_xy**2)'
    new_hndl = data_hdlr.eval_data_frame(d_len_z_def, inplace=False)
    data_hdlr.eval_data_frame(d_len_z_def)
    assert data_hdlr.get_data_frame().equals(new_hndl.get_data_frame()), \
        'Inplaced dataframe differs from the not inplaced one after eval'

    prompt_hdlr.eval_data_frame(d_len_z_def)

    # get the new variable list
    # NOTE(review): as above, 'data_new_var_list' is read from prompt_hdlr.
    info_dict['data_new_var_list'] = prompt_hdlr.get_var_names()
    info_dict['prompt_new_var_list'] = prompt_hdlr.get_var_names()

    # get a random subset of the original data (the handlers are rebound to
    # the subsets, seeded with SEED)
    data_hdlr = data_hdlr.get_subset(size=3000, rndm_state=SEED)
    prompt_hdlr = prompt_hdlr.get_subset(size=55, rndm_state=SEED)

    # slice both data and prompt data frames with respect to the pT
    bins = [[0, 2], [2, 10], [10, 25]]

    data_hdlr.slice_data_frame('pt_cand', bins)
    prompt_hdlr.slice_data_frame('pt_cand', bins)

    # store projection variable and binning
    info_dict['data_proj_variable'] = data_hdlr.get_projection_variable()
    info_dict['prompt_proj_variable'] = prompt_hdlr.get_projection_variable()

    info_dict['data_binning'] = data_hdlr.get_projection_binning()
    info_dict['prompt_binning'] = prompt_hdlr.get_projection_binning()

    # get info from a single data slice (index 2)
    data_slice_df = data_hdlr.get_slice(2)
    prompt_slice_df = prompt_hdlr.get_slice(2)

    info_dict['n_data_slice'] = len(data_slice_df)
    info_dict['n_prompt_slice'] = len(prompt_slice_df)

    # test info_dict reproduction

    assert info_dict == reference_dict, 'dictionary with the data info differs from the reference!'

    # test sliced data frames reproduction
    assert data_slice_df.equals(
        reference_data_slice_df
    ), 'data sliced DataFrame differs from the reference!'
    assert prompt_slice_df.equals(
        reference_prompt_slice_df
    ), 'prompt sliced DataFrame differs from the reference!'
예제 #2
0
def benchmark_hyperparam_optimizers(filename_dict,
                                    params,
                                    params_range,
                                    flag_dict,
                                    presel_dict,
                                    training_variables='',
                                    testsize=0.75):
    """Compare the Bayesian and the Optuna hyper-parameter optimisers.

    A signal and a background sample are loaded, split into train and test
    sets, and an XGBoost model is optimised first with
    ``optimize_params_bayes`` and then with ``optimize_params_optuna``. The
    Optuna optimisation is given, as timeout, the mean wall time measured
    for the Bayesian runs. For each optimiser the mean ROC AUC on the test
    set is printed.

    Args:
        filename_dict: mapping providing ``'data_path'``,
            ``'MC_signal_filename'``, ``'MC_signal_table'``,
            ``'train_bckg_filename'`` and ``'train_bckg_table'``.
        params: unused, kept for interface compatibility.
        params_range: hyper-parameter ranges forwarded to both optimisers.
        flag_dict: unused, kept for interface compatibility.
        presel_dict: mapping providing ``'train_bckg_presel'``, the
            preselection applied to the background sample.
        training_variables: names of the training columns; when left to
            ``''`` all the columns of the training set are used.
        testsize: forwarded as ``test_size`` to ``train_test_generator``.

    Returns:
        None. The results are only printed.
    """
    import time
    from sklearn.metrics import roc_auc_score

    n_run = 1

    data_path = filename_dict['data_path']

    print('Loading MC signal')
    mc_signal = TreeHandler()
    mc_signal.get_handler_from_large_file(
        file_name=data_path + filename_dict['MC_signal_filename'],
        tree_name=filename_dict['MC_signal_table'])
    print('MC signal loaded\n')

    print('Loading background data for training')
    background_ls = TreeHandler()
    background_ls.get_handler_from_large_file(
        file_name=data_path + filename_dict['train_bckg_filename'],
        tree_name=filename_dict['train_bckg_table'])
    background_ls.apply_preselections(presel_dict['train_bckg_presel'])
    # keep at most four background candidates per signal candidate
    background_ls.shuffle_data_frame(size=min(background_ls.get_n_cand(),
                                              mc_signal.get_n_cand() * 4))
    print('Done\n')

    train_test_data = train_test_generator([mc_signal, background_ls], [1, 0],
                                           test_size=testsize)

    if training_variables == '':
        training_variables = train_test_data[0].columns.tolist()

    model_clf = xgb.XGBClassifier()
    model_hdl = ModelHandler(model_clf, training_variables)

    def train_and_score():
        # Train with the currently set hyper-parameters and return the
        # ROC AUC evaluated on the test set.
        model_hdl.train_test_model(train_test_data)
        y_pred_test = model_hdl.predict(train_test_data[2], True)
        return roc_auc_score(train_test_data[3], y_pred_test)

    times = []
    roc_bayes = []

    for _ in range(n_run):
        start = time.time()

        model_hdl.optimize_params_bayes(train_test_data,
                                        params_range,
                                        'roc_auc',
                                        njobs=-1)
        roc_bayes.append(train_and_score())

        times.append(time.time() - start)

    # Average over the measured durations (the list), not the time module.
    mean_time = np.mean(times)

    print('BAYES OPTIMIZATION WITH SKLEARN')
    print('Mean time : ' + str(mean_time))
    print('Mean ROC : ' + str(np.mean(roc_bayes)))
    print('--------------\n')

    # Separate list, so that the Optuna average is not mixed with the
    # scores of the Bayesian runs.
    roc_optuna = []

    for _ in range(n_run):
        model_hdl.optimize_params_optuna(train_test_data,
                                         params_range,
                                         'roc_auc',
                                         timeout=mean_time,
                                         njobs=-1)
        roc_optuna.append(train_and_score())

    print('OPTUNA')
    print('Fixed time : ' + str(mean_time))
    print('Mean ROC : ' + str(np.mean(roc_optuna)))
    print('--------------\n')
예제 #3
0
PROMPT_FILE_PATH = DATA_DIR.joinpath('Prompt_Dpluspp7TeV_pT_1_50.root')

# define dictionary for storing the references for the tests
INFO_DICT = {}

# preliminary check: stop right away if there is no directory to write to
if not REFERENCE_DIR.is_dir():
    sys.exit(
        "No 'references' dir was found, so no reference data were produced!")

# instantiate tree handler objects
DATA_HDLR = TreeHandler(DATA_FILE_PATH, 'treeMLDplus')
PROMPT_HDLR = TreeHandler(PROMPT_FILE_PATH, 'treeMLDplus')

# store number of candidates in the original data sample
INFO_DICT['n_data'] = DATA_HDLR.get_n_cand()
INFO_DICT['n_prompt'] = PROMPT_HDLR.get_n_cand()

# store original variable list
# NOTE(review): 'data_var_list' is read from PROMPT_HDLR, not DATA_HDLR.
# Any code comparing against this reference has to do the same, so confirm
# before changing it.
INFO_DICT['data_var_list'] = PROMPT_HDLR.get_var_names()
INFO_DICT['prompt_var_list'] = PROMPT_HDLR.get_var_names()

# apply preselections (in place, on both handlers)
PRESEL_DATA = '(pt_cand > 1.30 and pt_cand < 42.00) and (inv_mass > 1.6690 and inv_mass < 2.0690)'
PRESEL_PROMPT = '(pt_cand > 1.00 and pt_cand < 25.60) and (inv_mass > 1.8320 and inv_mass < 1.8940)'

DATA_HDLR.apply_preselections(PRESEL_DATA)
PROMPT_HDLR.apply_preselections(PRESEL_PROMPT)

# store number of selected data
INFO_DICT['n_data_preselected'] = DATA_HDLR.get_n_cand()
예제 #4
0
                )
                #df_signal_cent_ct = df_signal_cent_ct[TRAINING_COLUMNS_LIST]
                #df_background_cent_ct = df_background_cent_ct[TRAINING_COLUMNS_LIST]

                # define tree handlers
                signal_tree_handler = TreeHandler()
                background_tree_handler_full = TreeHandler()
                signal_tree_handler.set_data_frame(df_signal_cent_ct)
                background_tree_handler_full.set_data_frame(
                    df_background_cent_ct)
                del df_signal_cent_ct
                del df_background_cent_ct

                # downscale background
                background_tree_handler = background_tree_handler_full.get_subset(
                    size=int(0.8 * signal_tree_handler.get_n_cand()),
                    rndm_state=RANDOM_STATE)
                del background_tree_handler_full

                # features plot
                leg_labels = ['background', 'signal']
                # second condition needed because of issue with Qt libraries
                if MAKE_FEATURES_PLOTS and not MAKE_PRESELECTION_EFFICIENCY:
                    if not os.path.isdir(f'{PLOT_DIR}/features'):
                        os.mkdir(f'{PLOT_DIR}/features')

                    plot_utils.plot_distr(
                        [background_tree_handler, signal_tree_handler],
                        TRAINING_COLUMNS_LIST,
                        bins=50,
                        labels=leg_labels,
class Optimiserhipe4mltree:
    """Train, optimise and plot a hipe4ml model for one [binmin, binmax] bin.

    The constructor prepares the signal/background sample and the model
    handler; the ``do_*`` methods then run the hyper-parameter
    optimisation, the training and the plotting, writing their outputs to
    the directories taken from the configuration.
    """
    # Class Attribute
    species = "optimiser_hipe4mltree"

    def __init__(self, data_param, binmin, binmax, training_var, bkg_sel,
                 hyper_pars):
        """Read the configuration, prepare the sample and create the model.

        Besides storing the configuration, the constructor already calls
        ``preparesample()`` and ``load_hipe4mlmodel()``, so the input trees
        are read when the object is created.

        Args:
            data_param: nested configuration mapping. The section
                ``data_param["ml"]`` must provide ``mlout``, ``mlplot``,
                ``rnd_splt`` and ``test_frac`` (``multiclass_labels`` is
                optional). The section ``data_param["hipe4ml"]`` must
                provide ``roc_auc_average``, ``roc_auc_approach``,
                ``raw_output``, ``train_test_log`` and, under
                ``hyper_par_opt``, ``bayes_opt_config``, ``nfolds``,
                ``initpoints``, ``niter`` and ``njobs``.
            binmin: lower edge of the bin, used in the output file names.
            binmax: upper edge of the bin, used in the output file names.
            training_var: currently not used; the training variables are
                derived from the signal tree in ``load_hipe4mlmodel()``.
            bkg_sel: selection used to extract the background candidates
                from the data tree.
            hyper_pars: hyper-parameters applied to the model by
                ``set_hipe4ml_modelpar()``.
        """
        self.logger = get_logger()

        # output directories
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]

        # NOTE(review): the input files are hard-coded absolute paths
        self.inputtreedata = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/data.root"
        self.inputtreemc = "/Users/lvermunt/cernbox/Analyses/ML/input/hipe4mlTTree/prompt.root"
        self.v_train = None
        self.p_binmin = binmin
        self.p_binmax = binmax

        self.s_selsigml = ""
        self.s_selbkgml = bkg_sel  #"inv_mass < 1.82 or 1.92 < inv_mass < 2.00"
        self.v_bkgoversigfrac = 3
        self.v_sig = 1
        self.v_bkg = 0
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]

        # Every attribute filled by preparesample() is declared here, so
        # that all of them exist even when that call fails half-way.
        self.prompthandler = None
        self.signalhandler = None
        self.datahandler = None
        self.bkghandler = None
        self.traintestdata = None
        self.ypredtrain_hipe4ml = None
        self.ypredtest_hipe4ml = None

        self.preparesample()

        self.p_hipe4ml_model = None
        self.v_hipe4ml_pars = hyper_pars
        self.load_hipe4mlmodel()

        self.bayesoptconfig_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"][
            "bayes_opt_config"]
        self.average_method_hipe4ml = data_param["hipe4ml"]["roc_auc_average"]
        self.nfold_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["nfolds"]
        self.init_points = data_param["hipe4ml"]["hyper_par_opt"]["initpoints"]
        self.n_iter_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["niter"]
        self.njobs_hipe4ml = data_param["hipe4ml"]["hyper_par_opt"]["njobs"]
        self.roc_method_hipe4ml = data_param["hipe4ml"]["roc_auc_approach"]
        self.raw_output_hipe4ml = data_param["hipe4ml"]["raw_output"]
        self.train_test_log_hipe4ml = data_param["hipe4ml"]["train_test_log"]

        self.multiclass_labels = data_param["ml"].get("multiclass_labels",
                                                      None)

        self.logger.info("Using the following training variables: %s",
                         self.v_train)

    def preparesample(self):
        """Load the input trees and build the train/test sample.

        The signal handler is read from the MC tree and the data handler
        from the data tree. The background handler is a subset of the data
        selected with ``s_selbkgml`` whose requested size is
        ``v_bkgoversigfrac`` times the number of signal candidates. Signal
        and background are finally passed to ``train_test_generator``.
        """
        self.logger.info("Prepare Sample for hipe4ml")

        tree_name = 'treeMLDplus'

        self.signalhandler = TreeHandler(self.inputtreemc, tree_name)
        n_signal = self.signalhandler.get_n_cand()

        self.datahandler = TreeHandler(self.inputtreedata, tree_name)
        n_background = n_signal * self.v_bkgoversigfrac
        self.bkghandler = self.datahandler.get_subset(self.s_selbkgml,
                                                      size=n_background)

        handlers = [self.signalhandler, self.bkghandler]
        labels = [self.v_sig, self.v_bkg]
        self.traintestdata = train_test_generator(handlers,
                                                  labels,
                                                  test_size=self.test_frac,
                                                  random_state=self.rnd_splt)

    def load_hipe4mlmodel(self):
        """Define the training variables and create the model handler.

        The training variables are the variable names of the signal handler
        with 'inv_mass' and 'pt_cand' removed; a ValueError is raised by
        ``remove`` if one of the two is not in the list.
        """
        self.logger.info("Loading hipe4ml model")

        self.v_train = self.signalhandler.get_var_names()
        for excluded in ('inv_mass', 'pt_cand'):
            self.v_train.remove(excluded)

        classifier = xgb.XGBClassifier()
        self.p_hipe4ml_model = ModelHandler(classifier, self.v_train)

    def set_hipe4ml_modelpar(self):
        self.logger.info("Setting hipe4ml hyperparameters")
        self.p_hipe4ml_model.set_model_params(self.v_hipe4ml_pars)

    def do_hipe4mlhyperparopti(self):
        self.logger.info("Optimising hipe4ml hyperparameters (Bayesian)")

        if not (self.average_method_hipe4ml in ['macro', 'weighted']
                and self.roc_method_hipe4ml in ['ovo', 'ovr']):
            self.logger.fatal("Selected ROC configuration is not valid!")

        if self.average_method_hipe4ml == 'weighted':
            metric = f'roc_auc_{self.roc_method_hipe4ml}_{self.average_method_hipe4ml}'
        else:
            metric = f'roc_auc_{self.roc_method_hipe4ml}'

        hypparsfile = f'{self.dirmlout}/HyperParOpt_pT_{self.p_binmin}_{self.p_binmax}.txt'
        outfilehyppars = open(hypparsfile, 'wt')
        sys.stdout = outfilehyppars
        self.p_hipe4ml_model.optimize_params_bayes(self.traintestdata,
                                                   self.bayesoptconfig_hipe4ml,
                                                   metric, self.nfold_hipe4ml,
                                                   self.init_points,
                                                   self.n_iter_hipe4ml,
                                                   self.njobs_hipe4ml)
        outfilehyppars.close()
        sys.stdout = sys.__stdout__
        self.logger.info("Performing hyper-parameters optimisation: Done!")

    def do_hipe4mltrain(self):
        self.logger.info("Training + testing hipe4ml model")
        t0 = time.time()

        self.p_hipe4ml_model.train_test_model(self.traintestdata,
                                              self.average_method_hipe4ml,
                                              self.roc_method_hipe4ml)
        self.ypredtrain_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[0], self.raw_output_hipe4ml)
        self.ypredtest_hipe4ml = self.p_hipe4ml_model.predict(
            self.traintestdata[2], self.raw_output_hipe4ml)

        modelhandlerfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.pkl'
        self.p_hipe4ml_model.dump_model_handler(modelhandlerfile)
        modelfile = f'{self.dirmlout}/ModelHandler_pT_{self.p_binmin}_{self.p_binmax}.model'
        self.p_hipe4ml_model.dump_original_model(modelfile)

        self.logger.info("Training + testing hipe4ml: Done!")
        self.logger.info("Time elapsed = %.3f", time.time() - t0)

    def do_hipe4mlplot(self):
        """Produce the diagnostic plots of the model and save them as PDF.

        The following plots are written to the ML plot directory, each file
        name ending with ``_pT_<binmin>_<binmax>.pdf``: distributions of
        the training variables, correlation matrices (one per sample), ML
        output for train and test sets, ROC curve on the test set, ROC
        curves of train and test sets, precision-recall curve and feature
        importance.

        The method changes ``plt.rcParams["figure.figsize"]`` and closes
        all the open pyplot figures, also at the end, so that the figures
        created here are not kept in memory after they have been saved.
        """
        self.logger.info("Plotting hipe4ml model")

        leglabels = ["Background", "Prompt signal"]
        outputlabels = ["Bkg", "SigPrompt"]

        def figpath(stem):
            # every output shares the directory and the bin suffix
            return f'{self.dirmlplot}/{stem}_pT_{self.p_binmin}_{self.p_binmax}.pdf'

        # _____________________________________________
        plot_utils.plot_distr([self.bkghandler, self.signalhandler],
                              self.v_train, 100, leglabels)
        plt.subplots_adjust(left=0.06,
                            bottom=0.06,
                            right=0.99,
                            top=0.96,
                            hspace=0.55,
                            wspace=0.55)
        plt.savefig(figpath('DistributionsAll'))
        plt.close('all')
        # _____________________________________________
        corrmatrixfig = plot_utils.plot_corr(
            [self.bkghandler, self.signalhandler], self.v_train, leglabels)
        for figg, labb in zip(corrmatrixfig, outputlabels):
            # make the figure current, so that subplots_adjust acts on it
            plt.figure(figg.number)
            plt.subplots_adjust(left=0.2, bottom=0.25, right=0.95, top=0.9)
            figg.savefig(figpath(f'CorrMatrix{labb}'))
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 7)
        mloutputfig = plot_utils.plot_output_train_test(
            self.p_hipe4ml_model,
            self.traintestdata,
            80,
            self.raw_output_hipe4ml,
            leglabels,
            self.train_test_log_hipe4ml,
            density=True)
        mloutputfig.savefig(figpath('MLOutputDistr'))
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvefig = plot_utils.plot_roc(self.traintestdata[3],
                                          self.ypredtest_hipe4ml, None,
                                          leglabels,
                                          self.average_method_hipe4ml,
                                          self.roc_method_hipe4ml)
        roccurvefig.savefig(figpath('ROCCurveAll'))
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (10, 9)
        roccurvettfig = plot_utils.plot_roc_train_test(
            self.traintestdata[3], self.ypredtest_hipe4ml,
            self.traintestdata[1], self.ypredtrain_hipe4ml, None, leglabels,
            self.average_method_hipe4ml, self.roc_method_hipe4ml)
        roccurvettfig.savefig(figpath('ROCCurveTrainTest'))
        # _____________________________________________
        precisionrecallfig = plot_utils.plot_precision_recall(
            self.traintestdata[3], self.ypredtest_hipe4ml, leglabels)
        precisionrecallfig.savefig(figpath('PrecisionRecallAll'))
        # _____________________________________________
        plt.rcParams["figure.figsize"] = (12, 7)
        featuresimportancefig = plot_utils.plot_feature_imp(
            self.traintestdata[2][self.v_train], self.traintestdata[3],
            self.p_hipe4ml_model, leglabels)
        for i, fig in enumerate(featuresimportancefig):
            fig.savefig(figpath(f'FeatureImportanceOpt{i}'))
        # _____________________________________________
        # All the figures have been saved: release them, since pyplot keeps
        # a reference to every figure until it is closed.
        plt.close('all')