예제 #1
0
def run_mva(client, parameters, df):
    """Run MVA training, evaluation and plotting channel by channel.

    For every channel listed under ``parameters["mva_channels"]`` (default
    ``["ggh_0jets"]``) a ``Trainer`` is built on the rows of ``df`` that
    belong to that channel, with ``<mva_path>/<channel>`` as its output
    directory.  Depending on the ``mva_do_training``, ``mva_do_evaluation``
    and ``mva_do_plotting`` flags the trainer then trains the configured
    models, evaluates them (copying every ``<model>_score`` column back into
    ``df``) and produces ROC curves plus score histograms and plots.

    Args:
        client: Handle forwarded unchanged to the trainer, ``to_histograms``
            and ``plotter``.
        parameters: Configuration mapping.  NOTE: it is modified in place --
            ``mva_path``, ``mva_models``, ``saved_models``,
            ``training_datasets``, ``training_features`` and the three
            ``mva_do_*`` flags are popped, and ``plots_path`` is overwritten
            for every processed channel.
        df: pandas-style DataFrame with a ``channel`` column; score columns
            are written into it in place during evaluation.

    Returns:
        None.
    """
    mva_path = parameters.pop("mva_path", "./")
    mkdir(mva_path)
    mva_models = parameters.pop("mva_models", {})
    saved_models = parameters.pop("saved_models", {})
    training_datasets = parameters.pop("training_datasets", {})
    features = parameters.pop("training_features", [])
    do_training = parameters.pop("mva_do_training", False)
    do_evaluation = parameters.pop("mva_do_evaluation", False)
    do_plotting = parameters.pop("mva_do_plotting", False)
    channels_to_use = parameters.get("mva_channels", ["ggh_0jets"])

    for channel in channels_to_use:
        out_dir = f"{mva_path}/{channel}"
        mkdir(out_dir)
        parameters["plots_path"] = out_dir

        trainer = Trainer(
            df=df[df.channel == channel],
            channel=channel,
            ds_dict=training_datasets,
            features=features,
            out_path=out_dir,
            training_cut="(dimuon_mass > 110) & (dimuon_mass < 150)",
        )

        if do_training:
            # Hand the trainer its own (shallow) copy of the model mapping.
            trainer.add_models(mva_models.copy())
            trainer.run_training(client)

        # Only register saved models when this channel has a non-empty entry.
        channel_saved_models = saved_models.get(channel)
        if channel_saved_models:
            trainer.add_saved_models(channel_saved_models)

        if do_evaluation:
            trainer.run_evaluation(client)
            trainer.shape_in_bins(shape_of="dimuon_mass", nbins=6)

            # Propagate the per-channel scores back into the full dataframe.
            for model_name in trainer.models.keys():
                score_name = f"{model_name}_score"
                df.loc[df.channel == channel, score_name] = trainer.df.loc[
                    :, score_name
                ]

        if do_plotting:
            trainer.plot_roc_curves()
            # Shallow copy (assuming ``parameters`` is a plain dict): the
            # top-level keys below are overridden for this channel only, but
            # nested objects such as ``variables_lookup`` stay shared with
            # the caller's ``parameters``.
            parameters_tmp = parameters.copy()
            parameters_tmp["hist_vars"] = []
            parameters_tmp["plot_vars"] = []
            parameters_tmp["regions"] = ["h-peak", "h-sidebands"]
            parameters_tmp["channels"] = [channel]

            # De-duplicate while keeping first-seen order (trained models
            # first, then saved ones).  A set would make the histogram and
            # plot order depend on string hashing and vary between runs.
            model_names = dict.fromkeys(
                [*mva_models.get(channel, {}), *saved_models.get(channel, {})]
            )
            for model_name in model_names:
                score_name = f"{model_name}_score"
                parameters_tmp["hist_vars"].append(score_name)
                parameters_tmp["plot_vars"].append(score_name)
                # Register the score as a plottable variable: 50 bins on [0, 1].
                parameters_tmp["variables_lookup"][score_name] = Variable(
                    score_name, score_name, 50, 0, 1
                )

            hist_df = to_histograms(client, parameters_tmp, trainer.df)
            plotter(client, parameters_tmp, hist_df)
예제 #2
0
    parameters["hist_vars"] += ["score_" + m for m in parameters["bdt_models"]]

    # parameters['plot_vars'] = ['dimuon_mass']
    parameters["plot_vars"] = parameters["hist_vars"]
    parameters["datasets"] = datasets

    all_paths = {}
    for year in parameters["years"]:
        all_paths[year] = {}
        for dataset in datasets:
            paths = glob.glob(
                f"{parameters['path']}/"
                f"{year}_{parameters['label']}/"
                f"{dataset}/*.parquet"
            )
            all_paths[year][dataset] = paths

    if args.remake_hists:
        for year in parameters["years"]:
            print(f"Processing {year}")
            for dataset, path in tqdm.tqdm(all_paths[year].items()):
                if len(path) == 0:
                    continue
                df = load_dataframe(client, parameters, inputs=[path])
                if not isinstance(df, dd.DataFrame):
                    continue
                to_histograms(client, parameters, df=df)

    if args.plot:
        yields = plotter(client, parameters)
    "signals": ["ggh_powheg", "vbf_powheg"],
}

if __name__ == "__main__":
    # End-to-end smoke test: load one sample file, histogram it, plot it and
    # build templates, then compare the resulting yields to a reference value.
    tick = time.time()

    # Single-worker, single-thread local cluster with a 4GB memory cap.
    client = Client(
        processes=True, n_workers=1, threads_per_worker=1, memory_limit="4GB"
    )

    file_name = "dy_delphes_stage1_output.parquet"
    path = f"{os.getcwd()}/tests/samples/{file_name}"

    # Each stage consumes the output of the previous one.
    out_df = load_dataframe(client, parameters, inputs=[path])
    out_hist = to_histograms(client, parameters, df=out_df)
    out_plot = plotter(client, parameters, hist_df=out_hist)
    out_tmp = to_templates(client, parameters, hist_df=out_hist)

    elapsed = round(time.time() - tick, 3)
    print(f"Finished everything in {elapsed} s.")

    # Selection applied to the first histogram before summing it.
    slicer = dict(
        region="h-peak",
        channel="ggh_0jets",
        val_sumw2="value",
        dimuon_mass=slice(None),
    )

    # Histogram, plot and template yields are all checked against the same
    # reference number.
    expected_yield = 3349.189725131393
    assert almost_equal(out_hist["hist"][0][slicer].sum(), expected_yield)
    assert almost_equal(sum(out_plot), expected_yield)
    assert almost_equal(sum(out_tmp), expected_yield)