def run_mva(client, parameters, df):
    """Train/evaluate/plot MVA classifiers for each analysis channel.

    Pops the MVA-specific settings out of ``parameters`` (so later pipeline
    stages do not see them), then for every requested channel builds a
    ``Trainer`` on that channel's rows and optionally runs training,
    evaluation (which writes ``<model>_score`` columns back into ``df``),
    and score plotting.

    Args:
        client: Dask client used for distributed training/evaluation.
        parameters: analysis configuration dict. Mutated in place: the
            ``mva_*`` keys are popped and ``plots_path`` is overwritten
            per channel.
        df: dataframe with a ``channel`` column; evaluation adds one
            ``<model>_score`` column per model, in place.
    """
    mva_path = parameters.pop("mva_path", "./")
    mkdir(mva_path)
    mva_models = parameters.pop("mva_models", {})
    saved_models = parameters.pop("saved_models", {})
    training_datasets = parameters.pop("training_datasets", {})
    features = parameters.pop("training_features", [])
    do_training = parameters.pop("mva_do_training", False)
    do_evaluation = parameters.pop("mva_do_evaluation", False)
    do_plotting = parameters.pop("mva_do_plotting", False)
    # NOTE(review): .get (not .pop) — "mva_channels" intentionally stays in
    # parameters; confirm downstream stages rely on it remaining.
    channels_to_use = parameters.get("mva_channels", ["ggh_0jets"])

    for channel in channels_to_use:
        out_dir = f"{mva_path}/{channel}"
        mkdir(out_dir)
        parameters["plots_path"] = out_dir
        trainer = Trainer(
            df=df[df.channel == channel],
            channel=channel,
            ds_dict=training_datasets,
            features=features,
            out_path=out_dir,
            training_cut="(dimuon_mass > 110) & (dimuon_mass < 150)",
        )

        if do_training:
            # NOTE(review): the whole mva_models dict is passed here, but the
            # plotting branch below indexes it per channel
            # (mva_models[channel]) — one of the two usages looks structurally
            # wrong; confirm the expected layout against Trainer.add_models.
            trainer.add_models(mva_models.copy())
            trainer.run_training(client)

        # Truthy-dict check replaces the original
        # `channel in saved_models.keys()` + `len(...) > 0` pair.
        if saved_models.get(channel):
            trainer.add_saved_models(saved_models[channel])

        if do_evaluation:
            trainer.run_evaluation(client)
            trainer.shape_in_bins(shape_of="dimuon_mass", nbins=6)
            # Copy each model's score column back into the caller's frame;
            # pandas aligns the assignment on the row index.
            for model_name in trainer.models:
                score_name = f"{model_name}_score"
                df.loc[df.channel == channel, score_name] = trainer.df.loc[
                    :, score_name
                ]

        if do_plotting:
            trainer.plot_roc_curves()
            # Shallow copy: nested objects (e.g. variables_lookup) are still
            # shared with the original parameters dict and get mutated below.
            parameters_tmp = parameters.copy()
            parameters_tmp["hist_vars"] = []
            parameters_tmp["plot_vars"] = []
            parameters_tmp["regions"] = ["h-peak", "h-sidebands"]
            parameters_tmp["channels"] = [channel]

            all_models = []
            if channel in mva_models:
                all_models += list(mva_models[channel].keys())
            if channel in saved_models:
                all_models += list(saved_models[channel].keys())
            # sorted() instead of list(set(...)): deterministic plot order.
            for model_name in sorted(set(all_models)):
                score_name = f"{model_name}_score"
                parameters_tmp["hist_vars"].append(score_name)
                parameters_tmp["plot_vars"].append(score_name)
                # Score distributions are histogrammed in 50 bins on [0, 1].
                parameters_tmp["variables_lookup"][score_name] = Variable(
                    score_name, score_name, 50, 0, 1
                )
            hist_df = to_histograms(client, parameters_tmp, trainer.df)
            plotter(client, parameters_tmp, hist_df)
parameters["hist_vars"] += ["score_" + m for m in parameters["bdt_models"]] # parameters['plot_vars'] = ['dimuon_mass'] parameters["plot_vars"] = parameters["hist_vars"] parameters["datasets"] = datasets all_paths = {} for year in parameters["years"]: all_paths[year] = {} for dataset in datasets: paths = glob.glob( f"{parameters['path']}/" f"{year}_{parameters['label']}/" f"{dataset}/*.parquet" ) all_paths[year][dataset] = paths if args.remake_hists: for year in parameters["years"]: print(f"Processing {year}") for dataset, path in tqdm.tqdm(all_paths[year].items()): if len(path) == 0: continue df = load_dataframe(client, parameters, inputs=[path]) if not isinstance(df, dd.DataFrame): continue to_histograms(client, parameters, df=df) if args.plot: yields = plotter(client, parameters)
"signals": ["ggh_powheg", "vbf_powheg"], } if __name__ == "__main__": tick = time.time() client = Client(processes=True, n_workers=1, threads_per_worker=1, memory_limit="4GB") file_name = "dy_delphes_stage1_output.parquet" path = f"{os.getcwd()}/tests/samples/{file_name}" out_df = load_dataframe(client, parameters, inputs=[path]) out_hist = to_histograms(client, parameters, df=out_df) out_plot = plotter(client, parameters, hist_df=out_hist) out_tmp = to_templates(client, parameters, hist_df=out_hist) elapsed = round(time.time() - tick, 3) print(f"Finished everything in {elapsed} s.") slicer = { "region": "h-peak", "channel": "ggh_0jets", "val_sumw2": "value", "dimuon_mass": slice(None), } assert almost_equal(out_hist["hist"][0][slicer].sum(), 3349.189725131393) assert almost_equal(sum(out_plot), 3349.189725131393) assert almost_equal(sum(out_tmp), 3349.189725131393)