Exemplo n.º 1
0
 def applymodel(self, file_index):
     """Apply the trained ML model (or the standard selection) to one skimmed file per pt bin.

     For each pt bin, loads the skimmed reco dataframe for ``file_index``, applies
     either the ML model with the preselection probability cuts or the standard
     ``isstd == 1`` selection, and pickles the selected dataframe to the output path.
     Bins whose output file already exists and is non-empty are skipped.

     Raises:
         FileNotFoundError: if ``self.doml`` is True and the model file for a bin is missing.
     """
     for ipt in range(self.p_nptbins):
         # Skip bins already processed (non-empty output file present).
         if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
             if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
                 continue
         dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
         if self.p_mask_values:
             mask_df(dfrecosk, self.p_mask_values)
         if self.doml is True:
             # Fail fast with a clear message instead of crashing inside pickle.load
             # (the original only printed a warning and then crashed anyway).
             if os.path.isfile(self.lpt_model[ipt]) is False:
                 raise FileNotFoundError(
                     f"Model file {self.lpt_model[ipt]} not present in bin {ipt}")
             mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
             if self.mltype == "MultiClassification":
                 dfrecoskml = apply(self.mltype, [self.p_modelname], [mod],
                                    dfrecosk, self.v_train[ipt], self.multiclass_labels)
                 # Multi-class: upper cut on the first class probability,
                 # lower cut on the second.
                 prob0 = "y_test_prob" + self.p_modelname + self.multiclass_labels[0]
                 prob1 = "y_test_prob" + self.p_modelname + self.multiclass_labels[1]
                 dfrecoskml = dfrecoskml.loc[(dfrecoskml[prob0] <= self.lpt_probcutpre[ipt][0]) &
                                             (dfrecoskml[prob1] >= self.lpt_probcutpre[ipt][1])]
             else:
                 dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                                    dfrecosk, self.v_train[ipt])
                 probvar = "y_test_prob" + self.p_modelname
                 dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         else:
             # No ML: keep candidates passing the standard selection flag.
             dfrecoskml = dfrecosk.query("isstd == 1")
         pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                     protocol=4)
Exemplo n.º 2
0
 def do_apply(self):
     """Attach the trained model's predictions to the data and MC dataframes
     and persist both as pickle files (protocol 4)."""
     applied_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                          self.df_data, self.v_train)
     applied_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                        self.df_mc, self.v_train)
     for dframe, outpath in ((applied_data, self.f_reco_applieddata),
                             (applied_mc, self.f_reco_appliedmc)):
         pickle.dump(dframe, openfile(outpath, "wb"), protocol=4)
Exemplo n.º 3
0
 def do_apply(self):
     """Attach the trained model's predictions to the data and MC dataframes
     and write both out as ROOT trees under the ML output directory."""
     applied_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                          self.df_data, self.v_train)
     applied_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                        self.df_mc, self.v_train)
     # Output paths follow the <dir>/<sample>_<suffix>_mldecision.root convention.
     path_data = self.dirmlout + "/data_%s_mldecision.root" % (self.s_suffix)
     path_mc = self.dirmlout + "/mc_%s_mldecision.root" % (self.s_suffix)
     write_tree(path_data, self.n_treetest, applied_data)
     write_tree(path_mc, self.n_treetest, applied_mc)
Exemplo n.º 4
0
    def do_apply(self):
        """Run the model-application step.

        Prepares the data/MC samples, skips the step entirely if it was already
        done, makes sure training has run, then attaches the model predictions
        to the data and MC dataframes and pickles both (protocol 4).
        """
        self.prepare_data_mc_mcgen()

        # Nothing to do if the application step has already completed.
        if self.step_done("application"):
            return

        self.do_train()

        self.logger.info("Application")

        targets = ((self.df_data, self.f_reco_applieddata),
                   (self.df_mc, self.f_reco_appliedmc))
        for dframe, outpath in targets:
            applied = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                            dframe, self.v_train)
            pickle.dump(applied, openfile(outpath, "wb"), protocol=4)
Exemplo n.º 5
0
 def applymodel(self, file_index):
     """Apply the binary-classification model to one skimmed file per pt bin.

     For each pt bin, loads the skimmed reco dataframe for ``file_index`` and the
     trained model, applies the model, keeps candidates above the preselection
     probability cut, and pickles the selected dataframe to the output path.
     """
     for ipt in range(self.p_nptbins):
         # Context managers so the pickle file handles are closed promptly
         # (the original used bare open() and leaked the handles).
         with open(self.mptfiles_recosk[ipt][file_index], "rb") as skfile:
             dfrecosk = pickle.load(skfile)
         with open(self.lpt_model[ipt], 'rb') as modfile:
             mod = pickle.load(modfile)
         dfrecoskml = apply("BinaryClassification", [self.p_modelname],
                            [mod], dfrecosk, self.v_train)
         probvar = "y_test_prob" + self.p_modelname
         dfrecoskml = dfrecoskml.loc[
             dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         dfrecoskml.to_pickle(self.mptfiles_recoskmldec[ipt][file_index])
Exemplo n.º 6
0
 def applymodel(self, file_index):
     """Apply the binary-classification model to one skimmed file per pt bin.

     For each pt bin, loads the skimmed reco dataframe for ``file_index`` and the
     trained model, applies the model, keeps candidates above the preselection
     probability cut, and pickles the selected dataframe (protocol 4).

     Raises:
         FileNotFoundError: if the model file for a bin is missing.
     """
     for ipt in range(self.p_nptbins):
         dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
         # Fail fast with a clear message instead of crashing inside pickle.load
         # (the original only printed a warning and then crashed anyway).
         if os.path.isfile(self.lpt_model[ipt]) is False:
             raise FileNotFoundError(
                 f"Model file {self.lpt_model[ipt]} not present in bin {ipt}")
         mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
         dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                            dfrecosk, self.v_train)
         probvar = "y_test_prob" + self.p_modelname
         dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                     protocol=4)
        savemodels(names, trainedmodels, mlout, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test,
                          var_training, var_signal)
        df_ml_test_to_df = mlout + "/testsample_%s_mldecision.pkl" % (suffix)
        df_ml_test_to_root = mlout + "/testsample_%s_mldecision.root" % (
            suffix)
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, tree_name, df_ml_test)
        #plot_overtraining(names, classifiers, suffix, mlplot, x_train, y_train, x_test, y_test)

    if applytodatamc == 1:
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = mlout + "/data_%s_mldecision.root" % (suffix)
        df_mc_to_root = mlout + "/mc_%s_mldecision.root" % (suffix)
        write_tree(df_data_to_root, tree_name, df_data)
        write_tree(df_mc_to_root, tree_name, df_mc)

    if docrossvalidation == 1:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
                names, classifiers, x_train, y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train,
                                             y_train, nkfolds, ncores)
        plot_cross_validation_mse(names, df_scores, suffix, mlplot)
Exemplo n.º 8
0
def doclassification_regression(conf):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """Run the full ML classification/regression pipeline driven by *conf*.

    Reads the run and model configuration, loads and filters the signal/background
    (and optionally data/MC) samples, then executes the steps enabled by the
    run-config flags: correlations, standardisation, PCA, training, testing,
    application to data/MC, cross validation, learning curves, ROC, decision
    boundaries, feature importance, regression plots, grid search and
    significance optimisation.

    Args:
        conf: configuration object exposing get_run_config() and get_model_config().

    NOTE(review): several locals (df_ml_test, x_train, df_data, ...) are only
    defined when the corresponding earlier flag is set — the flag combinations
    are assumed to be consistent in the config; verify upstream.
    """
    logger = get_logger()
    # Plain string: the original used an f-string with no placeholders.
    logger.info("Start classification_regression run")

    run_config = conf.get_run_config()
    model_config = conf.get_model_config()

    # --- unpack run-config flags and parameters ---
    mltype = run_config['mltype']
    mlsubtype = run_config['mlsubtype']
    case = run_config['case']
    loadsampleoption = run_config['loadsampleoption']
    binmin = run_config['binmin']
    binmax = run_config['binmax']
    rnd_shuffle = run_config['rnd_shuffle']
    nevt_sig = run_config['nevt_sig']
    nevt_bkg = run_config['nevt_bkg']
    test_frac = run_config['test_frac']
    rnd_splt = run_config['rnd_splt']
    docorrelation = run_config['docorrelation']
    dostandard = run_config['dostandard']
    dopca = run_config['dopca']
    dotraining = run_config['dotraining']
    dotesting = run_config['dotesting']
    applytodatamc = run_config['applytodatamc']
    docrossvalidation = run_config['docrossvalidation']
    dolearningcurve = run_config['dolearningcurve']
    doROC = run_config['doROC']
    doboundary = run_config['doboundary']
    doimportance = run_config['doimportance']
    dopltregressionxy = run_config['dopltregressionxy']
    dogridsearch = run_config['dogridsearch']
    dosignifopt = run_config['dosignifopt']
    nkfolds = run_config['nkfolds']
    ncores = run_config['ncores']

    # --- per-case dataset and variable definitions ---
    data = get_database_ml_parameters()
    filesig, filebkg = data[case]["sig_bkg_files"]
    filedata, filemc = data[case]["data_mc_files"]
    trename = data[case]["tree_name"]
    var_all = data[case]["var_all"]
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    var_training = data[case]["var_training"]
    var_target = data[case]["var_target"]
    var_corr_x, var_corr_y = data[case]["var_correlation"]
    var_boundaries = data[case]["var_boundaries"]
    var_binning = data[case]['var_binning']
    presel_reco = data[case]["presel_reco"]

    summary_string = f"#sig events: {nevt_sig}\n#bkg events: {nevt_bkg}\nmltype: {mltype}\n" \
                     f"mlsubtype: {mlsubtype}\ncase: {case}"
    logger.debug(summary_string)

    # --- output locations, tagged with the run configuration ---
    string_selection = createstringselection(var_binning, binmin, binmax)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"
    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    classifiers = []
    classifiers_scikit = []
    classifiers_xgboost = []
    classifiers_keras = []

    names = []
    names_scikit = []
    names_xgboost = []
    names_keras = []

    filesig = os.path.join(DATA_PREFIX, filesig)
    filebkg = os.path.join(DATA_PREFIX, filebkg)
    filedata = os.path.join(DATA_PREFIX, filedata)
    filemc = os.path.join(DATA_PREFIX, filemc)

    trainedmodels = []

    if loadsampleoption == 1:
        # Load signal and background trees, apply the reco preselection and the
        # binning-variable window, then build the train/test ML samples.
        df_sig = getdataframe(filesig, trename, var_all)
        df_bkg = getdataframe(filebkg, trename, var_all)
        if presel_reco is not None:
            df_sig = df_sig.query(presel_reco)
            df_bkg = df_bkg.query(presel_reco)
        df_sig = filterdataframe_singlevar(df_sig, var_binning, binmin, binmax)
        df_bkg = filterdataframe_singlevar(df_bkg, var_binning, binmin, binmax)
        _, df_ml_test, df_sig_train, df_bkg_train, _, _, \
        x_train, y_train, x_test, y_test = \
            create_mlsamples(df_sig, df_bkg, sel_signal, sel_bkg, rnd_shuffle,
                             var_signal, var_training,
                             nevt_sig, nevt_bkg, test_frac, rnd_splt)

    if docorrelation == 1:
        do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x,
                       var_corr_y, plotdir)

    if dostandard == 1:
        x_train = getdataframe_standardised(x_train)

    if dopca == 1:
        n_pca = 9
        x_train, pca = get_pcadataframe_pca(x_train, n_pca)
        plotvariance_pca(pca, plotdir)

    # --- build the classifier zoo from the model configuration ---
    classifiers_scikit, names_scikit = getclf_scikit(model_config)

    classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)

    classifiers_keras, names_keras = getclf_keras(model_config,
                                                  len(x_train.columns))

    classifiers = classifiers_scikit + classifiers_xgboost + classifiers_keras
    names = names_scikit + names_xgboost + names_keras

    if dotraining == 1:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test,
                          var_training, var_signal)
        df_ml_test_to_df = output + "/testsample_%s_mldecision.pkl" % (suffix)
        df_ml_test_to_root = output + "/testsample_%s_mldecision.root" % (
            suffix)
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, trename, df_ml_test)

    if applytodatamc == 1:
        df_data = getdataframe(filedata, trename, var_all)
        df_mc = getdataframe(filemc, trename, var_all)
        if presel_reco is not None:
            df_mc = df_mc.query(presel_reco)
            df_data = df_data.query(presel_reco)
        df_data = filterdataframe_singlevar(df_data, var_binning, binmin,
                                            binmax)
        df_mc = filterdataframe_singlevar(df_mc, var_binning, binmin, binmax)
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = output + "/data_%s_mldecision.root" % (suffix)
        df_mc_to_root = output + "/mc_%s_mldecision.root" % (suffix)
        write_tree(df_data_to_root, trename, df_data)
        write_tree(df_mc_to_root, trename, df_mc)

    if docrossvalidation == 1:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(
                names, classifiers, x_train, y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train,
                                             y_train, nkfolds, ncores)
        plot_cross_validation_mse(names, df_scores, suffix, plotdir)

    if dolearningcurve == 1:
        #         confusion(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)
        npoints = 10
        plot_learning_curves(names, classifiers, suffix, plotdir, x_train,
                             y_train, npoints)

    if doROC == 1:
        precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds,
                         plotdir)

    if doboundary == 1:
        # Two-variable decision-boundary study on the test sample.
        # Pass model_config (not mltype) for consistency with the calls above.
        classifiers_scikit_2var, names_2var = getclf_scikit(model_config)
        classifiers_keras_2var, names_keras_2var = getclf_keras(
            model_config, 2)
        classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
        names_2var = names_2var + names_keras_2var
        x_test_boundary = x_test[var_boundaries]
        trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary,
                                 y_test)
        decisionboundaries(names_2var, trainedmodels_2var, suffix + "2var",
                           x_test_boundary, y_test, plotdir)

    if doimportance == 1:
        importanceplotall(var_training, names_scikit + names_xgboost,
                          classifiers_scikit + classifiers_xgboost, suffix,
                          plotdir)

    if dopltregressionxy == 1:
        plotdistributiontarget(names, df_ml_test, var_target, suffix, plotdir)
        plotscattertarget(names, df_ml_test, var_target, suffix, plotdir)

    if dogridsearch == 1:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        perform_plot_gridsearch(names_cv, dfscore, par_grid_cv,
                                par_grid_cv_keys, var_param, plotdir, suffix,
                                0.1)

    if dosignifopt == 1:
        logger.info("Doing significance optimization")
        # Significance optimisation needs the trained models, the test sample
        # and the applied data/MC dataframes, hence all three flags.
        if dotraining and dotesting and applytodatamc:
            if (mlsubtype == "HFmeson") and case in ("Dsnew", "Lcnew", "Dzero",
                                                     "Dplus", "Dstar"):
                df_data_opt = df_data.query(sel_bkg)
                df_data_opt = shuffle(df_data_opt, random_state=rnd_shuffle)
                study_signif(case, names, [binmin, binmax], filemc, filedata,
                             df_mc, df_ml_test, df_data_opt, suffix, plotdir)
            else:
                logger.error(
                    "Optimisation is not implemented for this classification problem."
                )
        else:
            logger.error(
                "Training, testing and applytodata flags must be set to 1")
Exemplo n.º 9
0
            #preselection on pid and track vars using bitmap
            df = filter_df_cand(df, data[case], 'presel_track_pid')
            #apply standard cuts from file
            for icutvar in std_cuts_map:
                if icutvar != "var_binning":
                    array_var = df.loc[:, std_cuts_map[icutvar]["name"]].values
                    is_selected = selectcand_lincut(array_var, \
                            std_cuts_map[icutvar]["min"][ibin_std_cuts], \
                            std_cuts_map[icutvar]["max"][ibin_std_cuts], \
                            std_cuts_map[icutvar]["isabsval"])
                    df = df[is_selected]
            df.to_pickle(namefiledf_std)
    elif useml == 1:
        df = filter_df_cand(df, data[case], 'presel_track_pid')
        mod = pickle.load(open(model, 'rb'))
        df = apply("BinaryClassification", [modelname], [mod], df, var_training)
        array_prob = df.loc[:, "y_test_prob" + modelname].values
        is_selected = selectcandidateml(array_prob, probcut)
        df = df[is_selected]
        df.to_pickle(namefiledf_ml)

def selectcandidatesall(data, listdf, listdfout_ml, listdfout_std, pt_var, ptmin, ptmax,
                        useml, modelname, model, probcut, case, std_cuts_map=None, \
                            ibin_std_cuts=None):
    """Run selectcandidates over every input dataframe, one process per dataframe.

    Each multiprocessing.Process forwards its per-index inputs/outputs plus the
    shared selection parameters (pt window, ML model/probability cut or standard
    cuts map) to selectcandidates.
    """
    # One worker process per entry in listdf; the index picks the matching
    # input/output slots from the parallel lists.
    processes = [mp.Process(target=selectcandidates, \
                 args=(data, listdf[index], listdfout_ml[index], \
                       listdfout_std[index], pt_var, ptmin, ptmax, \
                       useml, modelname, model, probcut, case, std_cuts_map, ibin_std_cuts))
                 for index, _ in enumerate(listdf)]
    # NOTE(review): only start() is visible here — a matching join() loop is not
    # in view; confirm the processes are joined (or deliberately detached) below.
    for p in processes:
        p.start()