def test():
    """Quick check of the ML sample preparation chain on the Lc case."""
    data = get_database_ml_parameters()
    nevt_sig = 10000
    nevt_bkg = 10000
    mltype = "BinaryClassification"
    mlsubtype = "HFmeson"
    case = "Lc"
    var_skimming = ["pt_cand_ML"]
    varmin = [2]
    varmax = [4]
    test_frac = 0.2
    rnd_splt = 12
    rnd_shuffle = 12

    logger = get_logger()
    summary_string = f"#sg events: {nevt_sig}\n#bkg events: {nevt_bkg}\nmltype: {mltype}\n" \
                     f"mlsubtype: {mlsubtype}\ncase: {case}"
    logger.debug(summary_string)

    var_all = data[case]["var_all"]
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    # var_training = data[case]["var_training"]
    # var_target = data[case]["var_target"]
    # var_corr_x, var_corr_y = data[case]["var_correlation"]
    loadsampleoption = 1

    if loadsampleoption == 1:
        filesig, filebkg = data[case]["sig_bkg_files"]
        trename = data[case]["tree_name"]
        df_sig = getdataframe(filesig, trename, var_all)
        df_bkg = getdataframe(filebkg, trename, var_all)
        df_sig = filterdataframe(df_sig, var_skimming, varmin, varmax)
        df_bkg = filterdataframe(df_bkg, var_skimming, varmin, varmax)
        # DataFrame.query takes no random_state argument; the seed belongs to
        # the shuffle step below.
        df_sig = df_sig.query(sel_signal)
        df_bkg = df_bkg.query(sel_bkg)
        df_sig = shuffle(df_sig, random_state=rnd_shuffle)
        df_bkg = shuffle(df_bkg, random_state=rnd_shuffle)
        df_ml_train, df_ml_test = \
            prep_mlsamples(df_sig, df_bkg, var_signal, nevt_sig, nevt_bkg,
                           test_frac, rnd_splt)
        df_sig_train, df_bkg_train = split_df_sigbkg(df_ml_train, var_signal)
        df_sig_test, df_bkg_test = split_df_sigbkg(df_ml_test, var_signal)
        logger.info("events for ml train %d and test %d",
                    len(df_ml_train), len(df_ml_test))
        logger.info("events for signal train %d and test %d",
                    len(df_sig_train), len(df_sig_test))
        logger.info("events for bkg train %d and test %d",
                    len(df_bkg_train), len(df_bkg_test))
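# A minimal sketch of what split_df_sigbkg (imported from elsewhere in this
# package) is assumed to do: split the merged ML sample back into signal and
# background via the signal-flag column. Illustrative only.
#
#     def split_df_sigbkg(df_ml, var_signal):
#         df_sig = df_ml[df_ml[var_signal] == 1]
#         df_bkg = df_ml[df_ml[var_signal] == 0]
#         return df_sig, df_bkg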
def post_form(req):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    # Collect configuration in a dictionary for further processing
    run_config = {}
    mltype = "BinaryClassification"
    run_config["mltype"] = mltype
    case = get_form(req, "case")
    run_config["case"] = case
    filesig = get_form(req, "filesig")
    filebkg = get_form(req, "filebkg")
    trename = get_form(req, "tree_name")
    var_all_str = get_form(req, "var_all")
    var_all = var_all_str.split(',')
    var_signal = get_form(req, "var_signal")
    sel_signal = get_form(req, "sel_signal")

    # Decode the background selection: the form encodes spaces as commas
    # (see post_continue); '<' and '>' pass through unchanged.
    sel_bkg_str = get_form(req, "sel_bkg")
    sel_bkg = sel_bkg_str.replace(',', ' ')

    var_training_str = get_form(req, "var_training")
    var_training = var_training_str.split(',')
    var_corr_x_str = get_form(req, "var_correlation_x")
    var_corr_y_str = get_form(req, "var_correlation_y")
    var_corr_x = var_corr_x_str.split(',')
    var_corr_y = var_corr_y_str.split(',')
    # var_binning_str = get_form(req, "var_binning")
    # var_binning = var_binning_str.split(',')
    # var_binning_min_str = get_form(req, "var_binning_min_str")
    # varmin = [int(i) for i in var_binning_min_str.split(',')]
    # var_binning_max_str = get_form(req, "var_binning_max_str")
    # varmax = [int(i) for i in var_binning_max_str.split(',')]
    var_binning = get_form(req, "var_binning")
    var_binning_min = float(get_form(req, 'var_binning_min', var_type=float))
    var_binning_max = float(get_form(req, 'var_binning_max', var_type=float))
    run_config["binmin"] = var_binning_min
    run_config["binmax"] = var_binning_max

    presel_reco_str = get_form(req, "presel_reco")
    if presel_reco_str == 'None':
        presel_reco = None
    else:
        # Decode as above: commas stand in for spaces.
        presel_reco = presel_reco_str.replace(',', ' ')

    activate_scikit = get_form(req, 'activate_scikit', var_type=bool)
    activate_xgboost = get_form(req, 'activate_xgboost', var_type=bool)
    activate_keras = get_form(req, 'activate_keras', var_type=bool)
    docorrelation = get_form(req, 'docorrelation', var_type=bool)
    run_config["docorrelation"] = docorrelation
    dotraining = get_form(req, 'dotraining', var_type=bool)
    run_config["dotraining"] = dotraining
    doROC = get_form(req, 'doROC', var_type=bool)
    run_config["doROC"] = doROC
    dolearningcurve = get_form(req, 'dolearningcurve', var_type=bool)
    run_config["dolearningcurve"] = dolearningcurve
    docrossvalidation = get_form(req, 'docrossvalidation', var_type=bool)
    run_config["docrossvalidation"] = docrossvalidation
    doimportance = get_form(req, 'doimportance', var_type=bool)
    run_config["doimportance"] = doimportance
    dogridsearch = get_form(req, 'dogridsearch', var_type=bool)
    run_config["dogridsearch"] = dogridsearch
    rnd_shuffle = int(get_form(req, 'rnd_shuffle', var_type=int))
    run_config["rnd_shuffle"] = rnd_shuffle
    nevt_sig = int(get_form(req, 'nevt_sig', var_type=int))
    run_config["nevt_sig"] = nevt_sig
    nevt_bkg = int(get_form(req, 'nevt_bkg', var_type=int))
    run_config["nevt_bkg"] = nevt_bkg
    test_frac = float(get_form(req, 'test_frac', var_type=float))
    run_config["test_frac"] = test_frac
    rnd_splt = int(get_form(req, 'rnd_splt', var_type=int))
    run_config["rnd_splt"] = rnd_splt
    nkfolds = int(get_form(req, 'nkfolds', var_type=int))
    run_config["nkfolds"] = nkfolds
    ncores = int(get_form(req, 'ncores', var_type=int))
    run_config["ncores"] = ncores

    data = get_database_ml_parameters()

    # Construct Configuration object from run_config
    conf = Configuration(run_config_input=run_config)
    conf.configure()
    model_config = conf.get_model_config()

    string_selection = createstringselection(var_binning, var_binning_min, var_binning_max)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"
    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    classifiers = []
    classifiers_scikit = []
    classifiers_xgboost = []
    classifiers_keras = []
    names = []
    names_scikit = []
    names_xgboost = []
    names_keras = []
    trainedmodels = []

    df_sig = getdataframe(filesig, trename, var_all)
    df_bkg = getdataframe(filebkg, trename, var_all)
    if presel_reco is not None:
        df_sig = df_sig.query(presel_reco)
        df_bkg = df_bkg.query(presel_reco)
    df_sig = filterdataframe_singlevar(df_sig, var_binning, var_binning_min, var_binning_max)
    df_bkg = filterdataframe_singlevar(df_bkg, var_binning, var_binning_min, var_binning_max)

    # Output images
    imageIO_vardist: BytesIO = None
    imageIO_scatterplot: BytesIO = None
    imageIO_corr_sig: BytesIO = None
    imageIO_corr_bkg: BytesIO = None
    imageIO_precision_recall: BytesIO = None
    imageIO_ROC: BytesIO = None
    imageIO_plot_learning_curves: BytesIO = None
    img_scoresRME: BytesIO = None
    img_import: BytesIO = None
    img_gridsearch: BytesIO = None

    # pylint: disable=unused-variable
    _, _, df_sig_train, df_bkg_train, _, _, x_train, y_train, x_test, y_test = \
        create_mlsamples(df_sig, df_bkg, sel_signal, data[case], sel_bkg, rnd_shuffle,
                         var_signal, var_training, nevt_sig, nevt_bkg, test_frac, rnd_splt)

    if docorrelation:
        imageIO_vardist, imageIO_scatterplot, imageIO_corr_sig, imageIO_corr_bkg = \
            do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x, var_corr_y, plotdir)

    # Using the activate_* flags is for now a work-around
    if activate_scikit:
        classifiers_scikit, names_scikit = getclf_scikit(model_config)
        classifiers = classifiers + classifiers_scikit
        names = names + names_scikit
    if activate_xgboost:
        classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)
        classifiers = classifiers + classifiers_xgboost
        names = names + names_xgboost
    if activate_keras:
        classifiers_keras, names_keras = getclf_keras(model_config, len(x_train.columns))
        classifiers = classifiers + classifiers_keras
        names = names + names_keras

    if dotraining:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if doROC:
        imageIO_precision_recall, imageIO_ROC = \
            precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)

    if docrossvalidation:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(names, classifiers, x_train,
                                                        y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train, y_train,
                                             nkfolds, ncores)
        img_scoresRME = plot_cross_validation_mse(names, df_scores, suffix, plotdir)

    if doimportance:
        img_import = importanceplotall(var_training, names_scikit + names_xgboost,
                                       classifiers_scikit + classifiers_xgboost,
                                       suffix, plotdir)

    if dolearningcurve:
        npoints = 10
        imageIO_plot_learning_curves = plot_learning_curves(names, classifiers, suffix,
                                                            plotdir, x_train, y_train,
                                                            npoints)

    if dogridsearch:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        img_gridsearch = perform_plot_gridsearch(names_cv, dfscore, par_grid_cv,
                                                 par_grid_cv_keys, var_param,
                                                 plotdir, suffix, 0.1)

    return JENV.get_template("display.html").render(
        imageIO_vardist=imageIO_vardist,
        imageIO_scatterplot=imageIO_scatterplot,
        imageIO_corr_sig=imageIO_corr_sig,
        imageIO_corr_bkg=imageIO_corr_bkg,
        imageIO_precision_recall=imageIO_precision_recall,
        imageIO_ROC=imageIO_ROC,
        imageIO_plot_learning_curves=imageIO_plot_learning_curves,
        img_scoresRME=img_scoresRME,
        img_import=img_import,
        img_gridsearch=img_gridsearch)
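# post_form leans on get_form for every field. A hypothetical sketch of the
# coercion it is assumed to perform (the real helper is defined elsewhere and
# may differ, in particular for checkboxes):
#
#     def get_form(req, key, var_type=str):
#         value = req.form.get(key, '')
#         if var_type is bool:
#             # a checkbox is submitted (e.g. as 'on') only when ticked
#             return value.lower() in ('1', 'true', 'on', 'yes')
#         return var_type(value)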
def post_continue(req):  # pylint: disable=unused-argument
    """Serve the configuration page."""
    subtype = get_form(req, "slct1")
    case = get_form(req, "slct2")

    data = get_database_ml_parameters()
    filesig, filebkg = data[case]["sig_bkg_files"]
    filesig = os.path.join(DATA_PREFIX, filesig)
    filebkg = os.path.join(DATA_PREFIX, filebkg)
    trename = data[case]["tree_name"]
    var_all = data[case]["var_all"]
    var_all_str = ','.join(var_all)
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    # Encode the selection for the form: spaces become commas (decoded back
    # in post_form); '<' and '>' pass through unchanged.
    sel_bkg_str = sel_bkg.replace(' ', ',')
    var_training = data[case]["var_training"]
    var_training_str = ','.join(var_training)
    var_corr_x, var_corr_y = data[case]["var_correlation"]
    var_corr_x_str = ','.join(var_corr_x)
    var_corr_y_str = ','.join(var_corr_y)
    # var_binning = [data[case]["var_binning"]]
    # var_binning_str = ','.join(var_binning)
    # varmin = ['0']
    # var_binning_min_str = ','.join(varmin)
    # varmax = ['100']
    # var_binning_max_str = ','.join(varmax)
    var_binning = data[case]["var_binning"]
    var_binning_min = 2
    var_binning_max = 3
    presel_reco = data[case]["presel_reco"]
    presel_reco_str = None
    if presel_reco is not None:
        presel_reco_str = presel_reco.replace(' ', ',')

    return JENV.get_template("test.html").render(
        subtype=subtype, case=case, filesig=filesig, filebkg=filebkg,
        trename=trename, var_all_str=var_all_str, var_signal=var_signal,
        sel_signal=sel_signal, sel_bkg_str=sel_bkg_str,
        var_training_str=var_training_str, var_corr_x_str=var_corr_x_str,
        var_corr_y_str=var_corr_y_str, var_binning=var_binning,
        var_binning_min=var_binning_min, var_binning_max=var_binning_max,
        presel_reco_str=presel_reco_str)
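# Worked example of the form round-trip used above: post_continue encodes
# spaces as commas and post_form decodes them back (selection strings contain
# no literal commas, so the mapping is unambiguous).
#
#     sel_bkg = "inv_mass > 2.436 or inv_mass < 2.136"    # hypothetical value
#     sel_bkg.replace(' ', ',')  # -> "inv_mass,>,2.436,or,inv_mass,<,2.136"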
    plt.savefig(f'{plot_dir}/FONLL_curve_{suffix}.png')


# pylint: disable=too-many-statements, too-many-locals
def study_signif(case, names, bin_lim, file_mc_gen, file_data_evt_ml, file_data_evt_tot,
                 df_mc_reco, df_ml_test, df_data_dec, suffix, plot_dir):
    """
    Study the efficiency and the expected signal significance as a function
    of the threshold value on a ML model output.
    """
    logger = get_logger()
    gROOT.SetBatch(True)
    gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")

    gen_dict = get_database_ml_parameters()[case]
    mass = gen_dict["mass"]
    mass_fit_lim = gen_dict['mass_fit_lim']
    bin_width = gen_dict['bin_width']
    var_bin = gen_dict['variables']['var_binning']
    sopt_dict = gen_dict['signif_opt']
    bkg_fract = sopt_dict['bkg_data_fraction']
    save_fit = sopt_dict['save_fit']

    df_mc_gen = pd.read_pickle(file_mc_gen)
    df_mc_gen = df_mc_gen.query(gen_dict['presel_gen'])
    df_mc_gen = filterdataframe_singlevar(df_mc_gen, var_bin, bin_lim[0], bin_lim[1])

    # Portion of data events used for ML
    df_evt_ml = pd.read_pickle(file_data_evt_ml)
    n_events_ml = len(df_evt_ml.query(sopt_dict['sel_event']))
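# filterdataframe_singlevar, used here and throughout this module, is assumed
# to be a one-variable range cut along these lines (the inclusive/exclusive
# bound convention is that of the real helper; this is only a sketch):
#
#     def filterdataframe_singlevar(df, var, vmin, vmax):
#         return df[(df[var] >= vmin) & (df[var] < vmax)]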
def doclassification_regression(conf):  # pylint: disable=too-many-locals, too-many-statements, too-many-branches
    """Run the full classification/regression chain for the given Configuration."""
    logger = get_logger()
    logger.info("Start classification_regression run")

    run_config = conf.get_run_config()
    model_config = conf.get_model_config()

    mltype = run_config['mltype']
    mlsubtype = run_config['mlsubtype']
    case = run_config['case']
    loadsampleoption = run_config['loadsampleoption']
    binmin = run_config['binmin']
    binmax = run_config['binmax']
    rnd_shuffle = run_config['rnd_shuffle']
    nevt_sig = run_config['nevt_sig']
    nevt_bkg = run_config['nevt_bkg']
    test_frac = run_config['test_frac']
    rnd_splt = run_config['rnd_splt']
    docorrelation = run_config['docorrelation']
    dostandard = run_config['dostandard']
    dopca = run_config['dopca']
    dotraining = run_config['dotraining']
    dotesting = run_config['dotesting']
    applytodatamc = run_config['applytodatamc']
    docrossvalidation = run_config['docrossvalidation']
    dolearningcurve = run_config['dolearningcurve']
    doROC = run_config['doROC']
    doboundary = run_config['doboundary']
    doimportance = run_config['doimportance']
    dopltregressionxy = run_config['dopltregressionxy']
    dogridsearch = run_config['dogridsearch']
    dosignifopt = run_config['dosignifopt']
    nkfolds = run_config['nkfolds']
    ncores = run_config['ncores']

    data = get_database_ml_parameters()
    filesig, filebkg = data[case]["sig_bkg_files"]
    filedata, filemc = data[case]["data_mc_files"]
    trename = data[case]["tree_name"]
    var_all = data[case]["var_all"]
    var_signal = data[case]["var_signal"]
    sel_signal = data[case]["sel_signal"]
    sel_bkg = data[case]["sel_bkg"]
    var_training = data[case]["var_training"]
    var_target = data[case]["var_target"]
    var_corr_x, var_corr_y = data[case]["var_correlation"]
    var_boundaries = data[case]["var_boundaries"]
    var_binning = data[case]['var_binning']
    presel_reco = data[case]["presel_reco"]

    summary_string = f"#sg events: {nevt_sig}\n#bkg events: {nevt_bkg}\nmltype: {mltype}\n" \
                     f"mlsubtype: {mlsubtype}\ncase: {case}"
    logger.debug(summary_string)

    string_selection = createstringselection(var_binning, binmin, binmax)
    suffix = f"nevt_sig{nevt_sig}_nevt_bkg{nevt_bkg}_" \
             f"{mltype}{case}_{string_selection}"
    dataframe = f"dataframes_{suffix}"
    plotdir = f"plots_{suffix}"
    output = f"output_{suffix}"
    checkdir(dataframe)
    checkdir(plotdir)
    checkdir(output)

    classifiers = []
    classifiers_scikit = []
    classifiers_xgboost = []
    classifiers_keras = []
    names = []
    names_scikit = []
    names_xgboost = []
    names_keras = []

    filesig = os.path.join(DATA_PREFIX, filesig)
    filebkg = os.path.join(DATA_PREFIX, filebkg)
    filedata = os.path.join(DATA_PREFIX, filedata)
    filemc = os.path.join(DATA_PREFIX, filemc)

    trainedmodels = []

    if loadsampleoption == 1:
        df_sig = getdataframe(filesig, trename, var_all)
        df_bkg = getdataframe(filebkg, trename, var_all)
        if presel_reco is not None:
            df_sig = df_sig.query(presel_reco)
            df_bkg = df_bkg.query(presel_reco)
        df_sig = filterdataframe_singlevar(df_sig, var_binning, binmin, binmax)
        df_bkg = filterdataframe_singlevar(df_bkg, var_binning, binmin, binmax)

        _, df_ml_test, df_sig_train, df_bkg_train, _, _, \
            x_train, y_train, x_test, y_test = \
            create_mlsamples(df_sig, df_bkg, sel_signal, sel_bkg, rnd_shuffle,
                             var_signal, var_training, nevt_sig, nevt_bkg,
                             test_frac, rnd_splt)

    if docorrelation == 1:
        do_correlation(df_sig_train, df_bkg_train, var_all, var_corr_x, var_corr_y, plotdir)

    if dostandard == 1:
        x_train = getdataframe_standardised(x_train)

    if dopca == 1:
        n_pca = 9
        x_train, pca = get_pcadataframe_pca(x_train, n_pca)
        plotvariance_pca(pca, plotdir)

    classifiers_scikit, names_scikit = getclf_scikit(model_config)
    classifiers_xgboost, names_xgboost = getclf_xgboost(model_config)
    classifiers_keras, names_keras = getclf_keras(model_config, len(x_train.columns))
    classifiers = classifiers_scikit + classifiers_xgboost + classifiers_keras
    names = names_scikit + names_xgboost + names_keras

    if dotraining == 1:
        trainedmodels = fit(names, classifiers, x_train, y_train)
        savemodels(names, trainedmodels, output, suffix)

    if dotesting == 1:
        # The model predictions are added to the test dataframe
        df_ml_test = test(mltype, names, trainedmodels, df_ml_test, var_training, var_signal)
        df_ml_test_to_df = output + "/testsample_%s_mldecision.pkl" % (suffix)
        df_ml_test_to_root = output + "/testsample_%s_mldecision.root" % (suffix)
        df_ml_test.to_pickle(df_ml_test_to_df)
        write_tree(df_ml_test_to_root, trename, df_ml_test)

    if applytodatamc == 1:
        df_data = getdataframe(filedata, trename, var_all)
        df_mc = getdataframe(filemc, trename, var_all)
        if presel_reco is not None:
            df_mc = df_mc.query(presel_reco)
            df_data = df_data.query(presel_reco)
        df_data = filterdataframe_singlevar(df_data, var_binning, binmin, binmax)
        df_mc = filterdataframe_singlevar(df_mc, var_binning, binmin, binmax)
        # The model predictions are added to the dataframes of data and MC
        df_data = apply(mltype, names, trainedmodels, df_data, var_training)
        df_mc = apply(mltype, names, trainedmodels, df_mc, var_training)
        df_data_to_root = output + "/data_%s_mldecision.root" % (suffix)
        df_mc_to_root = output + "/mc_%s_mldecision.root" % (suffix)
        write_tree(df_data_to_root, trename, df_data)
        write_tree(df_mc_to_root, trename, df_mc)

    if docrossvalidation == 1:
        df_scores = []
        if mltype == "Regression":
            df_scores = cross_validation_mse_continuous(names, classifiers, x_train,
                                                        y_train, nkfolds, ncores)
        if mltype == "BinaryClassification":
            df_scores = cross_validation_mse(names, classifiers, x_train, y_train,
                                             nkfolds, ncores)
        plot_cross_validation_mse(names, df_scores, suffix, plotdir)

    if dolearningcurve == 1:
        # confusion(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)
        npoints = 10
        plot_learning_curves(names, classifiers, suffix, plotdir, x_train, y_train, npoints)

    if doROC == 1:
        precision_recall(names, classifiers, suffix, x_train, y_train, nkfolds, plotdir)

    if doboundary == 1:
        # Build 2-variable classifiers for the decision-boundary plots
        classifiers_scikit_2var, names_2var = getclf_scikit(model_config)
        classifiers_keras_2var, names_keras_2var = getclf_keras(model_config, 2)
        classifiers_2var = classifiers_scikit_2var + classifiers_keras_2var
        names_2var = names_2var + names_keras_2var
        x_test_boundary = x_test[var_boundaries]
        trainedmodels_2var = fit(names_2var, classifiers_2var, x_test_boundary, y_test)
        decisionboundaries(names_2var, trainedmodels_2var, suffix + "2var",
                           x_test_boundary, y_test, plotdir)

    if doimportance == 1:
        importanceplotall(var_training, names_scikit + names_xgboost,
                          classifiers_scikit + classifiers_xgboost, suffix, plotdir)

    if dopltregressionxy == 1:
        plotdistributiontarget(names, df_ml_test, var_target, suffix, plotdir)
        plotscattertarget(names, df_ml_test, var_target, suffix, plotdir)

    if dogridsearch == 1:
        datasearch = get_database_ml_gridsearch()
        analysisdb = datasearch[mltype]
        names_cv, clf_cv, par_grid_cv, refit_cv, var_param, \
            par_grid_cv_keys = read_grid_dict(analysisdb)
        _, _, dfscore = do_gridsearch(names_cv, clf_cv, par_grid_cv, refit_cv,
                                      x_train, y_train, nkfolds, ncores)
        perform_plot_gridsearch(names_cv, dfscore, par_grid_cv, par_grid_cv_keys,
                                var_param, plotdir, suffix, 0.1)

    if dosignifopt == 1:
        logger.info("Doing significance optimization")
        if dotraining and dotesting and applytodatamc:
            if (mlsubtype == "HFmeson") and case in ("Dsnew", "Lcnew", "Dzero", "Dplus", "Dstar"):
                df_data_opt = df_data.query(sel_bkg)
                df_data_opt = shuffle(df_data_opt, random_state=rnd_shuffle)
                study_signif(case, names, [binmin, binmax], filemc, filedata,
                             df_mc, df_ml_test, df_data_opt, suffix, plotdir)
            else:
                logger.error("Optimisation is not implemented for this classification problem.")
        else:
            logger.error("Training, testing and applytodata flags must be set to 1")
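# Typical driver usage, assuming the Configuration flow shown in post_form
# (run_config keys abbreviated):
#
#     run_config = {"mltype": "BinaryClassification", "case": "Lc", ...}
#     conf = Configuration(run_config_input=run_config)
#     conf.configure()
#     doclassification_regression(conf)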