def main():
    """
    This is used as the entry point for fitting.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--database-analysis", "-d", dest="database_analysis",
                        help="analysis database to be used", required=True)
    parser.add_argument("--analysis", "-a", dest="type_ana",
                        help="choose type of analysis", required=True)
    parser.add_argument("--period-number", "-p", dest="period_number", type=int,
                        help="choose data-taking period (0: 2016, 1: 2017, 2: 2018, "
                             "-1: all merged (default))", default=-1)
    parser.add_argument("--output", "-o", default="simple_fit",
                        help="result output directory")
    args = parser.parse_args()

    configure_logger(False, None)

    # Extract database as dictionary
    data = parse_yaml(args.database_analysis)
    data = data[list(data.keys())[0]]

    # Run the chain
    do_simple_fit(data, args.type_ana, args.period_number, args.output)
def load_fit(save_dir):
    yaml_path = join(save_dir, "meta.yaml")
    meta_info = parse_yaml(yaml_path)

    yaml_path = join(save_dir, "init_pars.yaml")

    #pylint: disable=import-outside-toplevel
    import machine_learning_hep.fitting.fitters as search_module
    #pylint: enable=import-outside-toplevel
    fit_classes = {f[0]: getattr(search_module, f[0])
                   for f in inspect.getmembers(search_module, inspect.isclass)
                   if f[1].__module__ == search_module.__name__}

    fit = None
    if meta_info["fit_class"] in fit_classes:
        fit = fit_classes[meta_info["fit_class"]](parse_yaml(yaml_path))
    else:
        get_logger().fatal("Fit class %s is invalid", meta_info["fit_class"])

    yaml_path = join(save_dir, "fit_pars.yaml")
    fit.fit_pars = parse_yaml(yaml_path)

    root_file_name = join(save_dir, "root_objects.root")
    root_file = TFile.Open(root_file_name, "READ")
    keys = root_file.GetListOfKeys()
    root_objects = {}
    for k in keys:
        if k.GetName() == "kernel":
            fit.kernel = k.ReadObj()
            continue
        obj = k.ReadObj()
        obj.SetDirectory(0)
        root_objects[k.GetName()] = obj
    root_file.Close()

    fit.set_root_objects(root_objects)
    fit.success = meta_info["success"]
    fit.init_fit()

    if "annotations" not in meta_info:
        return fit
    return fit, meta_info["annotations"]
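# Usage sketch (illustrative only): reload a saved fit and inspect it. The directory name
# and the attribute access below are assumptions about how a caller would use load_fit();
# only the files read above (meta.yaml, init_pars.yaml, fit_pars.yaml, root_objects.root)
# are taken from the code itself.
#
#   loaded = load_fit("fits/pt_2_4")
#   fit, annotations = loaded if isinstance(loaded, tuple) else (loaded, None)
#   if fit.success:
#       print(fit.fit_pars)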
def __read_successful_trials(self):
    save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir,
                     "successful_trials.yaml")
    if not exists(save_path):
        print(f"Cannot load successful trials. File {save_path} doesn't (yet) exist.")
        print("Do full syst in 10s...")
        sleep(10)
        return []
    return parse_yaml(save_path)["successful_trials"]
def __load_working_points(self):
    save_path = join(self.nominal_analyzer_merged.d_resultsallpdata, self.syst_out_dir,
                     "working_points.yaml")
    if not exists(save_path):
        print(f"Cannot load working points. File {save_path} doesn't exist")
        sys.exit(1)
    read_yaml = parse_yaml(save_path)
    self.cent_cv_cut = read_yaml["central"]
    self.min_cv_cut = read_yaml["lower_limits"]
    self.max_cv_cut = read_yaml["upper_limits"]
    self.ml_wps = read_yaml["working_points"]
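# Illustrative sketch of a working_points.yaml as consumed above. Only the four top-level
# keys come from the code; the per-pT-bin list structure and the numbers are assumptions:
#
#   central: [0.85, 0.80, 0.75]
#   lower_limits: [0.70, 0.65, 0.60]
#   upper_limits: [0.95, 0.92, 0.90]
#   working_points:
#     - [0.70, 0.75, 0.80, 0.85, 0.90]
#     - [0.65, 0.70, 0.75, 0.80, 0.85]
#     - [0.60, 0.65, 0.70, 0.75, 0.80]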
def read(self, yaml_errors, extra_errors=None):
    """
    Read everything from YAML
    """
    error_dict = parse_yaml(yaml_errors)

    for name, errors in error_dict.items():
        if name == "names":
            self.names = errors.copy()
        else:
            self.add_errors(name, errors)

    if extra_errors is not None:
        self.errors.update(extra_errors)
        for key in extra_errors:
            self.names.append(key)
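# Illustrative sketch of a YAML file consumed by read(): a "names" list plus one entry per
# error source, each forwarded to add_errors(). Only the "names" key is taken from the code
# above; the source names and value nesting are assumptions:
#
#   names: [yield_extraction, cut_variation]
#   yield_extraction: [[0.02, 0.03], [0.04, 0.05]]
#   cut_variation: [[0.01, 0.01], [0.02, 0.03]]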
def multi_mergeml_allinone(self):
    for ipt in range(self.p_nptbins):
        merge_method(self.lptper_recoml[ipt], self.lpt_recoml_mergedallp[ipt])
        if self.mcordata == "mc":
            merge_method(self.lptper_genml[ipt], self.lpt_genml_mergedallp[ipt])

    count_evt = 0
    count_evtorig = 0
    for evt_count_file in self.lper_evt_count_ml:
        count_dict = parse_yaml(evt_count_file)
        count_evt += count_dict["evt"]
        count_evtorig += count_dict["evtorig"]

    dump_yaml_from_dict({"evt": count_evt, "evtorig": count_evtorig}, self.f_evtml_count)
def make_distributions(args, inv_mass, inv_mass_window):  # pylint: disable=too-many-statements
    config = parse_yaml(args.config)

    database_path = config["database"]
    data_or_mc = config["data_or_mc"]
    analysis_name = config["analysis"]
    distribution = config["distribution"]
    distribution_x_range = config["x_range"]
    out_file = config["out_file"]
    # whether or not to slice and derive weights in these slices
    period_cuts = config.get("period_cuts", None)
    slice_cuts = config.get("slice_cuts", None)
    required_columns = config.get("required_columns", None)
    query_all = config.get("query_all", None)
    use_ml_selection = config.get("use_ml_selection", True)
    use_mass_window = config.get("use_mass_window", True)

    # Now open database
    _, database = read_database(database_path)
    analysis_config = database["analysis"][analysis_name]
    inv_mass[0] = database["mass"]
    inv_mass_window[0] = config.get("mass_window", 0.02)

    # required column names
    column_names = ["ev_id", "ev_id_ext", "run_number"]
    column_names.append(distribution)

    # Add column names required by the user
    if required_columns:
        for rcn in required_columns:
            if rcn not in column_names:
                column_names.append(rcn)

    periods = database["multi"][data_or_mc]["period"]

    # is this ML or STD?
    is_ml = database["doml"]

    # No cuts for specific input file
    file_names_cut_map = None

    # Set where to read data from and set overall selection query
    column_names.append("inv_mass")
    trigger_sel = analysis_config["triggersel"][data_or_mc]
    in_top_dirs = database["mlapplication"][data_or_mc]["pkl_skimmed_dec"]

    if trigger_sel:
        if query_all:
            query_all += f" and {trigger_sel}"
        else:
            query_all = trigger_sel

    in_file_name_gen = database["files_names"]["namefile_reco"]
    in_file_name_gen = in_file_name_gen[:in_file_name_gen.find(".")]

    if is_ml:
        pkl_extension = ""
        if use_ml_selection:
            model_name = database["mlapplication"]["modelname"]
            ml_sel_column = f"y_test_prob{model_name}"
            column_names.append(ml_sel_column)
            ml_sel_pt = database["mlapplication"]["probcutoptimal"]
            pt_bins_low = database["sel_skim_binmin"]
            pt_bins_up = database["sel_skim_binmax"]
            in_file_names = [f"{in_file_name_gen}{ptl}_{ptu}"
                             for ptl, ptu in zip(pt_bins_low, pt_bins_up)]
            file_names_cut_map = {ifn: f"{ml_sel_column} > {cut}"
                                  for ifn, cut in zip(in_file_names, ml_sel_pt)}
    else:
        pkl_extension = "_std"

    in_file_name_gen = in_file_name_gen + "*"

    # Now make the directory path right
    in_top_dirs = [f"{itd}{pkl_extension}" for itd in in_top_dirs]

    derive(periods, in_top_dirs, in_file_name_gen, column_names, use_mass_window,
           distribution, distribution_x_range, file_names_cut_map, out_file, period_cuts,
           query_all, slice_cuts)
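# Illustrative sketch of a configuration YAML for make_distributions(). The keys are the
# ones read from "config" above; the values (paths, variable names, ranges) are assumptions:
#
#   database: data/database_ml_parameters.yml
#   data_or_mc: data
#   analysis: MBvspt_ntrkl
#   distribution: n_tracklets
#   x_range: [0, 120]
#   out_file: distributions.root
#   mass_window: 0.02
#   use_ml_selection: true
#   use_mass_window: true
#   query_all: null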
def assert_model_config(self):  # pylint: disable=R0912
    """
    Validate the configuration for ML models and store it in self.model_config.
    Loading some models can depend on the run configuration, e.g. if
    run_config["activate_keras"] == 0 the keras config does not need to be
    checked and loaded.
    """
    logger = get_logger()
    logger.debug("Check sanity of user configs")

    user_config = {}
    if isinstance(self.model_config_input, str):
        user_config = parse_yaml(os.path.expanduser(self.model_config_input))
    elif isinstance(self.model_config_input, dict):
        user_config = self.model_config_input

    # At this point the asserted_config dict is just the one with defaults
    asserted_config = Configuration.get_meta_config("models")[self.run_config["mltype"]]
    user_config = user_config.get(self.run_config["mltype"], {})

    # Could probably be merged with the former loop; however, keep it separate to catch
    # e.g. typos. Steering a run wanting keras - but writing kras - could cost a lot of
    # time when it needs to be done again.
    if self.run_config["mltype"] in self.run_config["activate_models"]:
        for backend, model in \
                self.run_config["activate_models"][self.run_config["mltype"]].items():
            if backend not in asserted_config:
                logger.critical("Unknown backend %s.", backend)
            if model is None:
                logger.critical("No models specified for backend %s.", backend)
            for name, activate in model.items():
                if name not in asserted_config[backend]:
                    logger.critical("Unknown model %s for backend %s.", name, backend)
                if name in asserted_config[backend]:
                    if activate is None or not isinstance(activate, bool):
                        logger.critical("Activation value of model %s for backend %s "
                                        "must be specified as boolean value.", name, backend)
                    asserted_config[backend][name]["activate"] = activate

    # Pop deactivated models
    for backend in list(asserted_config.keys()):
        for model in list(asserted_config[backend].keys()):
            if not asserted_config[backend][model]["activate"]:
                del asserted_config[backend][model]
            else:
                asserted_config[backend][model] = asserted_config[backend][model]["default"]
                if backend in user_config and model in user_config[backend]:
                    if len(user_config[backend][model]) != len(asserted_config[backend][model]):
                        logger.critical("Parameter list for %s model %s differs",
                                        backend, model)
                    for u in asserted_config[backend][model]:
                        asserted_config[backend][model][u] = \
                            user_config[backend][model].get(u, asserted_config[backend][model][u])

    self.model_config = asserted_config
def assert_run_config(self):
    """
    Validate the run configuration and store it in self.run_config.
    """
    logger = get_logger()
    logger.debug("Check sanity of user configs")

    user_run_config = {}
    if isinstance(self.run_config_input, str):
        user_run_config = parse_yaml(os.path.expanduser(self.run_config_input))
    elif isinstance(self.run_config_input, dict):
        user_run_config = self.run_config_input

    # At this point the asserted_config dict is just the one with defaults
    run_config = Configuration.get_meta_config("run")
    asserted_config = {k: run_config[k]["default"] for k in run_config}
    choices_config = {k: run_config[k]["choices"]
                      for k in run_config if "choices" in run_config[k]}
    depends_config = {k: run_config[k]["depends"]
                      for k in run_config if "depends" in run_config[k]}
    types_config = {k: run_config[k]["type_as"]
                    for k in run_config if "type_as" in run_config[k]}

    # Check for unknown parameters and abort, since running the entire machinery with a
    # wrong setting (e.g. 'dotaining' instead of 'dotraining', which might happen just by
    # accident) could be just overhead.
    for k in user_run_config:
        if k not in asserted_config:
            logger.critical("Unknown parameter %s in config", k)
        elif user_run_config[k] is None:
            logger.critical("Missing value for parameter %s in config", k)

    # Replace all defaults if user specified parameter
    for k in asserted_config:
        asserted_config[k] = user_run_config.get(k, asserted_config[k])
        # If parameter is already set, check if consistent
        if k in choices_config and asserted_config[k] not in choices_config[k]:
            logger.critical("Invalid value %s for parameter %s. Must be one of %s",
                            str(user_run_config[k]), k, str(choices_config[k]))
        if k in types_config:
            check_types = [type(t) for t in types_config[k]]
            if not isinstance(asserted_config[k], tuple(check_types)):
                logger.critical("Invalid value type %s of parameter %s. Must be of type %s",
                                str(type(asserted_config[k])), k, str(check_types))

    # Can so far only depend on one parameter; change to a combination
    # of parameters. Do we need to check for circular dependencies?
    for k in depends_config:
        if (asserted_config[depends_config[k]["parameter"]] == depends_config[k]["value"]
                and asserted_config[k] != depends_config[k]["set"]):
            asserted_config[k] = depends_config[k]["set"]
            logger.info("Parameter %s = %s enforced since it is required for %s == %s",
                        k, str(depends_config[k]["set"]),
                        str(depends_config[k]["parameter"]), str(depends_config[k]["value"]))

    self.run_config = asserted_config
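# Illustrative sketch of two entries of the "run" meta configuration consumed above. The
# per-parameter keys (default, choices, type_as, depends with parameter/value/set) are the
# ones read by this method; the parameter names and values are assumptions:
#
#   mltype:
#     default: BinaryClassification
#     choices: [BinaryClassification, Regression]
#     type_as: ["a string"]
#   doml:
#     default: true
#     depends:
#       parameter: dotraining
#       value: true
#       set: true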
def test_yaml():
    assert isinstance(parse_yaml(YAML_PATH), dict)
def __plot_summary(self, out_dir, from_yaml=None, from_pickle=None):  # pylint: disable=too-many-statements
    """Plot results

    Results are plotted to out_dir/results.png

    Args:
        out_dir: str
            output directory where results.png will be saved
        from_yaml: str
            path to YAML file to read and plot results from
    """
    results_tmp = self.results
    scores_tmp = list(self.scoring.keys())
    score_opt_tmp = self.scoring_opt
    if from_yaml:
        read_yaml = parse_yaml(from_yaml)
        results_tmp = read_yaml["cv"]
        scores_tmp = read_yaml["score_names"]
        score_opt_tmp = read_yaml["score_opt_name"]
    elif from_pickle:
        read_yaml = pickle.load(open(from_pickle, "rb"))
        results_tmp = read_yaml["cv"]
        scores_tmp = read_yaml["score_names"]
        score_opt_tmp = read_yaml["score_opt_name"]

    # Re-arrange such that always the optimisation score is on top
    score_names = list(scores_tmp)
    del score_names[score_names.index(score_opt_tmp)]
    score_names.insert(0, score_opt_tmp)

    # Prepare figure and axes
    figsize = (35, 18 * len(score_names))
    fig, axes = plt.subplots(len(score_names), 1, sharex=True,
                             gridspec_kw={"hspace": 0.05}, figsize=figsize)

    # If only one score is given, need to make it iterable
    try:
        iter(axes)
    except TypeError:
        axes = [axes]

    markerstyles = ["o", "+"]
    markersize = 20

    for axi, (sn, ax) in enumerate(zip(score_names, axes)):
        ax.set_ylabel(f"CV mean {sn}", fontsize=20)
        ax.get_yaxis().set_tick_params(labelsize=20)

        # Get means of scores and plot with their std
        means = {}
        for i, tt in enumerate(("train", "test")):
            markerstyle = markerstyles[i % len(markerstyles)]
            means[tt] = [r[f"{tt}_{sn}"] for r in results_tmp]
            stds = [r[f"{tt}_{sn}_std"] for r in results_tmp]
            ax.errorbar(range(len(means[tt])), means[tt], yerr=stds, ls="",
                        marker=markerstyle, markersize=markersize, label=f"{sn} ({tt})")

        # Relative deviations between test and train
        index_high_score = means["test"].index(max(means["test"]))
        dev_high_score = \
            abs(means["test"][index_high_score] - means["train"][index_high_score]) \
            / means["test"][index_high_score]
        index_low_score = means["test"].index(min(means["test"]))
        dev_low_score = \
            abs(means["test"][index_low_score] - means["train"][index_low_score]) \
            / means["test"][index_low_score]
        dev_min = [abs(test - train) / test
                   for train, test in zip(means["train"], means["test"])]
        index_min = dev_min.index(min(dev_min))
        dev_min = min(dev_min)

        ax.axvline(index_high_score, color="red")
        y_coord = (means["test"][index_high_score] + means["train"][index_high_score]) / 2
        ax.text(index_high_score, y_coord, f"{dev_high_score:.4f}", color="red", fontsize=20)
        ax.axvline(index_low_score, color="blue")
        y_coord = (means["test"][index_low_score] + means["train"][index_low_score]) / 2
        ax.text(index_low_score, y_coord, f"{dev_low_score:.4f}", color="blue", fontsize=20)
        ax.axvline(index_min, color="green")
        y_coord = (means["test"][index_min] + means["train"][index_min]) / 2
        ax.text(index_min, y_coord, f"{dev_min:.4f}", color="green", fontsize=20)

        leg = ax.legend(loc="upper right", fontsize=20)

        if axi == 0:
            # Add another legend for highest, lowest score and min. rel. deviation between
            # test and train score
            handles = [Line2D([0], [0], color="red"),
                       Line2D([0], [0], color="blue"),
                       Line2D([0], [0], color="green")]
            labels = ["highest test score", "lowest test score", "min. rel deviation"]
            ax.legend(handles, labels, bbox_to_anchor=(0., 1.02, 1., .102),
                      loc='lower left', ncol=3, mode="expand", borderaxespad=0., fontsize=20)
            # Add back first legend
            ax.add_artist(leg)

    axes[-1].set_xticks(range(len(results_tmp)))
    axes[-1].set_xticklabels(range(len(results_tmp)), fontsize=20)
    axes[-1].set_xlabel("# trial", fontsize=20)

    fig.suptitle("Bayesian model optimisation", fontsize=35)

    fig.tight_layout()
    out_file = join(out_dir, "results.png")
    fig.savefig(out_file)
    plt.close(fig)
def perform_plot_gridsearch(names, out_dirs):
    '''
    Function for grid scores plotting (working with scikit 0.20)
    '''
    logger = get_logger()
    for name, out_dir in zip(names, out_dirs):

        # Read written results
        gps = parse_yaml(osjoin(out_dir, "parameters.yaml"))
        score_obj = pickle.load(openfile(osjoin(out_dir, "results.pkl"), "rb"))

        param_keys = [f"param_{key}" for key in gps["params"].keys()]
        if not param_keys:
            logger.warning("Add at least 1 parameter (even just 1 value)")
            continue

        # Re-arrange scoring such that the refitted one is always on top
        score_names = gps["scoring"]
        refit_score = gps["refit"]
        del score_names[score_names.index(refit_score)]
        score_names.insert(0, refit_score)

        # Extract scores
        x_labels = []
        y_values = {}
        y_errors = {}
        for sn in score_names:
            y_values[sn] = {"train": [], "test": []}
            y_errors[sn] = {"train": [], "test": []}

        # Get indices of values to put on x-axis and identify parameter combination
        values_indices = [range(len(values)) for values in gps["params"].values()]
        y_axis_mins = {sn: 9999 for sn in score_names}
        y_axis_maxs = {sn: -9999 for sn in score_names}
        for indices, case in zip(itertools.product(*values_indices),
                                 itertools.product(*list(gps["params"].values()))):
            df_case = score_obj.copy()
            for i_case, i_key in zip(case, param_keys):
                df_case = df_case.loc[df_case[i_key] == df_case[i_key].dtype.type(i_case)]
            x_labels.append(",".join([str(i) for i in indices]))
            # As we just nailed it down to one value
            for sn in score_names:
                for tt in ("train", "test"):
                    y_values[sn][tt].append(df_case[f"mean_{tt}_{sn}"].values[0])
                    y_errors[sn][tt].append(df_case[f"std_{tt}_{sn}"].values[0])
                    y_axis_mins[sn] = min(y_axis_mins[sn], y_values[sn][tt][-1])
                    y_axis_maxs[sn] = max(y_axis_maxs[sn], y_values[sn][tt][-1])

        # Prepare text for parameters
        text_parameters = "\n".join([f"{key}: {values}" for key, values in gps["params"].items()])

        # To determine fontsizes later
        figsize = (35, 18 * len(score_names))
        fig, axes = plt.subplots(len(score_names), 1, sharex=True,
                                 gridspec_kw={"hspace": 0.05}, figsize=figsize)
        ax_plot = dict(zip(score_names, axes))
        # The axes to put the parameter list
        ax_main = axes[-1]
        # The axes with the title being on top
        ax_top = axes[0]
        points_per_inch = 72

        markerstyles = ["o", "+"]
        markersize = 20
        for sn in score_names:
            ax = ax_plot[sn]
            ax_min = y_axis_mins[sn] - (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.
            ax_max = y_axis_maxs[sn] + (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.
            ax.set_ylim(ax_min, ax_max)
            ax.set_ylabel(f"mean {sn}", fontsize=20)
            ax.get_yaxis().set_tick_params(labelsize=20)
            for j, tt in enumerate(("train", "test")):
                markerstyle = markerstyles[j % len(markerstyles)]
                ax.errorbar(range(len(x_labels)), y_values[sn][tt], yerr=y_errors[sn][tt],
                            ls="", marker=markerstyle, markersize=markersize,
                            label=f"{sn} ({tt})")
                # Add values to points
                ylim = ax.get_ylim()
                plot_labels_offset = (ylim[1] - ylim[0]) / 40
                for x, y in enumerate(y_values[sn][tt]):
                    ax.text(x, y - plot_labels_offset, f"{y:.4f}", fontsize=20)

        ax_main.set_xlabel("parameter indices", fontsize=20)
        ax_top.set_title(f"Grid search {name}", fontsize=30)
        ax_main.get_xaxis().set_tick_params(labelsize=20)
        ax_main.set_xticks(range(len(x_labels)))
        ax_main.set_xticklabels(x_labels, rotation=45)

        text_point_size = int(4 * fig.dpi / points_per_inch * figsize[1] / len(gps["params"]))
        xlim = ax_main.get_xlim()
        ylim = ax_main.get_ylim()
        xlow = xlim[0] + (xlim[1] - xlim[0]) / 100
        ylow = ylim[0] + (ylim[1] - ylim[0]) / 3
        ax_main.text(xlow, ylow, text_parameters, fontsize=text_point_size)

        for ax in ax_plot.values():
            ax.legend(loc="center right", fontsize=20)

        plotname = osjoin(out_dir, "GridSearchResults.png")
        plt.savefig(plotname)
        plt.close(fig)
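# Illustrative sketch of the parameters.yaml expected above (together with results.pkl,
# presumably a pickled DataFrame built from scikit-learn's cv_results_, given the
# "param_*" and "mean_*"/"std_*" columns accessed in the code). The key names (params,
# scoring, refit) come from the code; the parameter grid and values are assumptions:
#
#   params:
#     max_depth: [2, 4, 6]
#     n_estimators: [100, 200]
#   scoring: [AUC, Accuracy]
#   refit: AUC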
def do_significance(self):
    if self.step_done("significance"):
        return

    self.do_apply()
    self.do_test()

    self.logger.info("Doing significance optimization")
    gROOT.SetBatch(True)
    gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")

    # First extract the number of data events in the ML sample.
    # This might need a revisit; for now just extract the numbers from the ML merged
    # event count (i.e. from a YAML, since the actual events are not needed).
    # Before, the ML count was always taken from the ML merged event df while the total
    # number was taken from the event counter. But the latter has basically not been used
    # for a long time because "dofullevtmerge" is mostly "false" in the DBs.
    count_dict = parse_yaml(self.f_evt_count_ml)
    self.p_nevttot = count_dict["evtorig"]
    self.p_nevtml = count_dict["evt"]
    self.logger.debug("Number of data events used for ML: %d", self.p_nevtml)
    self.logger.debug("Total number of data events: %d", self.p_nevttot)

    # Calculate the acceptance correction. We use in this case all the signal from the
    # MC sample, without limiting to the number of signal events used for training.
    denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
    numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
    acc, acc_err = calc_eff(numacc, denacc)
    self.logger.debug("Acceptance: %.3e +/- %.3e", acc, acc_err)

    # Calculation of the expected FONLL signals
    delta_pt = self.p_binmax - self.p_binmin
    if self.is_fonll_from_root:
        df_fonll = TFile.Open(self.f_fonll)
        df_fonll_Lc = df_fonll.Get(self.p_fonllparticle + "_" + self.p_fonllband)
        bin_min = df_fonll_Lc.FindBin(self.p_binmin)
        bin_max = df_fonll_Lc.FindBin(self.p_binmax)
        prod_cross = df_fonll_Lc.Integral(bin_min, bin_max) * self.p_fragf * 1e-12 / delta_pt
        signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa * self.p_br \
                       / (self.p_sigmamb * self.p_fprompt)
        # Now we plot the FONLL expectation
        cFONLL = TCanvas('cFONLL', 'The FONLL expectation')
        df_fonll_Lc.GetXaxis().SetRangeUser(0, 16)
        df_fonll_Lc.Draw("")
        cFONLL.SaveAs("%s/FONLL_curve_%s.png" % (self.dirmlplot, self.s_suffix))
    else:
        df_fonll = pd.read_csv(self.f_fonll)
        df_fonll_in_pt = \
            df_fonll.query('(pt >= @self.p_binmin) and (pt < @self.p_binmax)')[self.p_fonllband]
        prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / delta_pt
        signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                       / (self.p_sigmamb * self.p_fprompt)
        # Now we plot the FONLL expectation
        fig = plt.figure(figsize=(20, 15))
        plt.subplot(111)
        plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
        plt.xlabel('P_t [GeV/c]', fontsize=20)
        plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
        plt.title("FONLL cross section " + self.p_case, fontsize=20)
        plt.semilogy()
        plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
        plt.close(fig)

    self.logger.debug("Expected signal yield: %.3e", signal_yield)
    signal_yield = self.p_raahp * signal_yield
    self.logger.debug("Expected signal yield x RAA hp: %.3e", signal_yield)

    df_data_sideband = self.df_data.query(self.s_selbkgml)
    df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
    df_data_sideband = df_data_sideband.tail(round(len(df_data_sideband) * self.p_bkgfracopt))

    hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
    mass_array = df_mc_signal[self.v_invmass].values
    for mass_value in np.nditer(mass_array):
        hmass.Fill(mass_value)

    gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    gaus_fit.SetParameter(0, hmass.Integral())
    gaus_fit.SetParameter(1, self.p_mass)
    gaus_fit.SetParameter(2, 0.02)
    self.logger.debug("To fit the signal a Gaussian function is used")
    fitsucc = hmass.Fit("gaus_fit", "RQ")

    if int(fitsucc) != 0:
        self.logger.warning("Problem in signal peak fit")
        sigma = 0.

    sigma = gaus_fit.GetParameter(2)
    self.logger.debug("Mean of the Gaussian: %.3e", gaus_fit.GetParameter(1))
    self.logger.debug("Sigma of the Gaussian: %.3e", sigma)
    sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]

    fig_signif_pevt = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
    #plt.title("Significance Per Event vs Threshold", fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    fig_signif = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
    #plt.title("Significance vs Threshold", fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]

    for name in self.p_classname:
        eff_array, eff_err_array, x_axis = calc_sigeff_steps(self.p_nstepsign, df_sig, name)
        bkg_array, bkg_err_array, _ = calc_bkg(df_data_sideband, name, self.p_nstepsign,
                                               self.p_mass_fit_lim, self.p_bkg_func,
                                               self.p_bin_width, sig_region, self.p_savefit,
                                               self.dirmlplot, [self.p_binmin, self.p_binmax],
                                               self.v_invmass)
        sig_array = [eff * signal_yield for eff in eff_array]
        sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
        bkg_array = [bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array]
        bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml)
                         for bkg_err in bkg_err_array]
        signif_array, signif_err_array = calc_signif(sig_array, sig_err_array,
                                                     bkg_array, bkg_err_array)
        plt.figure(fig_signif_pevt.number)
        plt.errorbar(x_axis, signif_array, yerr=signif_err_array, label=f'{name}',
                     elinewidth=2.5, linewidth=5.0)
        signif_array_ml = [sig * sqrt(self.p_nevtml) for sig in signif_array]
        signif_err_array_ml = [sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml,
                     label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=5.0)
        plt.text(0.7, 0.95,
                 f" ${self.p_binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$",
                 verticalalignment="center", transform=fig_signif.gca().transAxes, fontsize=30)
        #signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
        #signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array]
        #plt.figure(fig_signif.number)
        #plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot,
        #             label=f'{name}_Tot', elinewidth=2.5, linewidth=5.0)

    plt.figure(fig_signif_pevt.number)
    plt.legend(loc="upper left", prop={'size': 30})
    plt.savefig(f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
    plt.figure(fig_signif.number)
    plt.legend(loc="upper left", prop={'size': 30})

    mpl.rcParams.update({"text.usetex": True})
    plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
    mpl.rcParams.update({"text.usetex": False})

    with open(f'{self.dirmlplot}/Significance_{self.s_suffix}.pickle', 'wb') as out:
        pickle.dump(fig_signif, out)
    plt.close(fig_signif_pevt)
    plt.close(fig_signif)
class Optimiser:  # pylint: disable=too-many-public-methods
    # Class Attribute
    species = "optimiser"

    def __init__(self, data_param, case, typean, model_config, binmin, binmax, raahp,
                 training_var, index):

        self.logger = get_logger()

        dirmcml = data_param["multi"]["mc"]["pkl_skimmed_merge_for_ml_all"]
        dirdataml = data_param["multi"]["data"]["pkl_skimmed_merge_for_ml_all"]
        self.v_bin = data_param["var_binning"]
        # directories
        self.dirmlout = data_param["ml"]["mlout"]
        self.dirmlplot = data_param["ml"]["mlplot"]

        # Check here which steps have been done already
        self.steps_done = None
        self.file_steps_done = os.path.join(self.dirmlout, "steps_done.yaml")
        if os.path.exists(self.file_steps_done):
            self.steps_done = parse_yaml(self.file_steps_done)["done"]
        if self.steps_done is None \
                and (os.listdir(self.dirmlout) or os.listdir(self.dirmlplot)):
            # Backwards compatible
            print(f"rm -r {self.dirmlout}")
            print(f"rm -r {self.dirmlplot}")
            self.logger.fatal("Please remove the directories listed above first and "
                              "run again")

        # ml file names
        self.n_reco = data_param["files_names"]["namefile_reco"]
        self.n_reco = self.n_reco.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_evt = data_param["files_names"]["namefile_evt"]
        self.n_evt_count_ml = data_param["files_names"].get("namefile_evt_count",
                                                            "evtcount.yaml")
        self.n_gen = data_param["files_names"]["namefile_gen"]
        self.n_gen = self.n_gen.replace(".pkl", "_%s%d_%d.pkl" % (self.v_bin, binmin, binmax))
        self.n_treetest = data_param["files_names"]["treeoutput"]
        self.n_reco_applieddata = data_param["files_names"]["namefile_reco_applieddata"]
        self.n_reco_appliedmc = data_param["files_names"]["namefile_reco_appliedmc"]
        # ml files
        self.f_gen_mc = os.path.join(dirmcml, self.n_gen)
        self.f_reco_mc = os.path.join(dirmcml, self.n_reco)
        self.f_evt_mc = os.path.join(dirmcml, self.n_evt)
        self.f_reco_data = os.path.join(dirdataml, self.n_reco)
        self.f_evt_count_ml = os.path.join(dirdataml, self.n_evt_count_ml)
        self.f_reco_applieddata = os.path.join(self.dirmlout, self.n_reco_applieddata)
        self.f_reco_appliedmc = os.path.join(self.dirmlout, self.n_reco_appliedmc)

        # variables
        self.v_all = data_param["variables"]["var_all"]
        self.v_train = training_var
        self.v_selected = data_param["variables"].get("var_selected", None)
        if self.v_selected:
            self.v_selected = self.v_selected[index]
        self.v_bound = data_param["variables"]["var_boundaries"]
        self.v_sig = data_param["variables"]["var_signal"]
        self.v_invmass = data_param["variables"]["var_inv_mass"]
        self.v_cuts = data_param["variables"].get("var_cuts", [])
        self.v_corrx = data_param["variables"]["var_correlation"][0]
        self.v_corry = data_param["variables"]["var_correlation"][1]
        self.v_isstd = data_param["bitmap_sel"]["var_isstd"]
        self.v_ismcsignal = data_param["bitmap_sel"]["var_ismcsignal"]
        self.v_ismcprompt = data_param["bitmap_sel"]["var_ismcprompt"]
        self.v_ismcfd = data_param["bitmap_sel"]["var_ismcfd"]
        self.v_ismcbkg = data_param["bitmap_sel"]["var_ismcbkg"]

        # parameters
        self.p_case = case
        self.p_typean = typean
        self.p_nbkg = data_param["ml"]["nbkg"]
        self.p_nsig = data_param["ml"]["nsig"]
        self.p_tagsig = data_param["ml"]["sampletagforsignal"]
        self.p_tagbkg = data_param["ml"]["sampletagforbkg"]
        self.p_binmin = binmin
        self.p_binmax = binmax
        self.p_npca = None
        self.p_mltype = data_param["ml"]["mltype"]
        self.p_nkfolds = data_param["ml"]["nkfolds"]
        self.p_ncorescross = data_param["ml"]["ncorescrossval"]
        self.rnd_shuffle = data_param["ml"]["rnd_shuffle"]
        self.rnd_splt = data_param["ml"]["rnd_splt"]
        self.test_frac = data_param["ml"]["test_frac"]
        self.p_plot_options = data_param["variables"].get("plot_options", {})
        self.p_dofullevtmerge = data_param["dofullevtmerge"]

        self.p_evtsel = data_param["ml"]["evtsel"]
        self.p_triggersel_mc = data_param["ml"]["triggersel"]["mc"]
        self.p_triggersel_data = data_param["ml"]["triggersel"]["data"]

        # dataframes
        self.df_mc = None
        self.df_mcgen = None
        self.df_data = None
        self.arraydf = None
        self.df_sig = None
        self.df_bkg = None
        self.df_ml = None
        self.df_mltest = None
        self.df_mltrain = None
        self.df_sigtrain = None
        self.df_sigtest = None
        self.df_bkgtrain = None
        self.df_bktest = None
        self.df_xtrain = None
        self.df_ytrain = None
        self.df_xtest = None
        self.df_ytest = None

        # selections
        self.s_selbkgml = data_param["ml"]["sel_bkgml"]
        self.s_selsigml = data_param["ml"]["sel_sigml"]
        self.p_equalise_sig_bkg = data_param["ml"].get("equalise_sig_bkg", False)

        # model param
        self.db_model = model_config
        self.p_class = None
        self.p_classname = None
        self.p_trainedmod = None
        self.s_suffix = None

        # significance
        self.is_fonll_from_root = data_param["ml"]["opt"]["isFONLLfromROOT"]
        self.f_fonll = data_param["ml"]["opt"]["filename_fonll"]
        if self.is_fonll_from_root and "fonll_particle" not in data_param["ml"]["opt"]:
            self.logger.fatal("Attempt to read FONLL from ROOT file but field "
                              "\"fonll_particle\" not provided in database")
        self.p_fonllparticle = data_param["ml"]["opt"].get("fonll_particle", "")
        self.p_fonllband = data_param["ml"]["opt"]["fonll_pred"]
        self.p_fragf = data_param["ml"]["opt"]["FF"]
        self.p_sigmamb = data_param["ml"]["opt"]["sigma_MB"]
        self.p_taa = data_param["ml"]["opt"]["Taa"]
        self.p_br = data_param["ml"]["opt"]["BR"]
        self.p_fprompt = data_param["ml"]["opt"]["f_prompt"]
        self.p_bkgfracopt = data_param["ml"]["opt"]["bkg_data_fraction"]
        self.p_nstepsign = data_param["ml"]["opt"]["num_steps"]
        self.p_bkg_func = data_param["ml"]["opt"]["bkg_function"]
        self.p_savefit = data_param["ml"]["opt"]["save_fit"]
        self.p_nevtml = None
        self.p_nevttot = None
        self.p_presel_gen_eff = data_param["ml"]["opt"]["presel_gen_eff"]
        # Potentially mask certain values (e.g. nsigma TOF of -999)
        self.p_mask_values = data_param["ml"].get("mask_values", None)
        self.p_mass_fit_lim = data_param["analysis"][self.p_typean]['mass_fit_lim']
        self.p_bin_width = data_param["analysis"][self.p_typean]['bin_width']
        self.p_num_bins = int(round((self.p_mass_fit_lim[1] - self.p_mass_fit_lim[0]) /
                                    self.p_bin_width))
        self.p_mass = data_param["mass"]
        self.p_raahp = raahp
        self.create_suffix()
        self.preparesample()
        self.loadmodels()

        self.df_evt_data = None
        self.df_evttotsample_data = None

        self.f_reco_applieddata = \
            self.f_reco_applieddata.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_reco_appliedmc = \
            self.f_reco_appliedmc.replace(".pkl", "%s.pkl" % self.s_suffix)
        self.f_df_ml_test_to_df = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl"
        self.f_mltest_applied = f"{self.dirmlout}/testsample_{self.s_suffix}_mldecision.pkl"
        self.df_mltest_applied = None

        self.logger.debug("Training variables: %s", training_var)