def do_significance(self):
    """Estimate and plot the expected significance for each classifier.

    The method
      1. loads the pickled event dataframes and stores their lengths in
         ``self.p_nevtml`` and ``self.p_nevttot``;
      2. computes the acceptance from the prompt entries of ``self.df_mc``
         and ``self.df_mcgen`` via ``self.calceff``;
      3. derives an expected signal yield from the FONLL CSV file
         ``self.f_fonll`` and saves a plot of the FONLL curve;
      4. fits a gaussian to the MC signal invariant mass to define a
         3-sigma signal region around ``self.p_mass``;
      5. for every name in ``self.p_classname`` combines the signal
         efficiencies and the sideband background estimates into a
         significance and draws it on two figures that are saved in
         ``self.dirmlplot``.

    Returns:
        None. Results are written as PNG files and as attributes on ``self``.
    """
    self.logger.info("Doing significance optimization")

    # First extract the number of data events in the ML sample.
    # The handles are opened in ``with`` blocks so they are closed right
    # after unpickling (assumes ``openfile`` returns a regular file-like
    # object supporting the context-manager protocol -- TODO confirm).
    # NOTE(review): pickle.load must only be used on trusted, self-produced
    # files.
    with openfile(self.f_evt_data, 'rb') as evt_file:
        self.df_evt_data = pickle.load(evt_file)
    if self.p_dofullevtmerge is True:
        with openfile(self.f_evttotsample_data, 'rb') as evttot_file:
            self.df_evttotsample_data = pickle.load(evttot_file)
    else:
        # The message is built by implicit concatenation: a backslash
        # continuation inside the literal leaks indentation (or an invalid
        # escape sequence) into the log output.
        self.logger.info("The total merged event dataframe was not merged "
                         "for space limits")
        with openfile(self.f_evt_data, 'rb') as evt_file:
            self.df_evttotsample_data = pickle.load(evt_file)

    # ... and the total number of events.
    self.p_nevttot = len(self.df_evttotsample_data)
    self.p_nevtml = len(self.df_evt_data)
    self.logger.info("Number of data events used for ML: %d", self.p_nevtml)
    self.logger.info("Total number of data events: %d", self.p_nevttot)

    # Calculate the acceptance correction. All the signal of the MC sample is
    # used here, without limiting to the number of signal events used for
    # training.
    denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
    numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
    acc, acc_err = self.calceff(numacc, denacc)
    self.logger.info("Acceptance: %.3e +/- %.3e", acc, acc_err)

    # Calculation of the expected FONLL signal.
    # ``ptmin``/``ptmax`` must keep these names: the query string refers to
    # them through the ``@`` prefix.
    df_fonll = pd.read_csv(self.f_fonll)
    ptmin = self.p_binmin
    ptmax = self.p_binmax
    df_fonll_in_pt = df_fonll.query('(pt >= @ptmin) and (pt < @ptmax)')[self.p_fonllband]
    # Average of the selected FONLL points, scaled by the fragmentation
    # fraction (the 1e-12 factor presumably converts pb to b -- the plot
    # below labels the curve in pb/GeV).
    prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / len(df_fonll_in_pt)
    delta_pt = ptmax - ptmin
    signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                   / (self.p_sigmamb * self.p_fprompt)
    self.logger.info("Expected signal yield: %.3e", signal_yield)
    signal_yield = self.p_raahp * signal_yield
    self.logger.info("Expected signal yield x RAA hp: %.3e", signal_yield)

    # Now plot the FONLL expectation; the figure is closed once saved so it
    # does not stay registered in pyplot.
    fig_fonll = plt.figure(figsize=(20, 15))
    plt.subplot(111)
    plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
    plt.xlabel('P_t [GeV/c]', fontsize=20)
    plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
    plt.title("FONLL cross section " + self.p_case, fontsize=20)
    plt.semilogy()
    plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
    plt.close(fig_fonll)

    # Background sample: shuffled selection of the data, of which only the
    # last ``p_bkgfracopt`` fraction is kept.
    df_data_sideband = self.df_data.query(self.s_selbkgml)
    df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
    df_data_sideband = df_data_sideband.tail(round(len(df_data_sideband) * self.p_bkgfracopt))

    # Invariant-mass histogram of the MC signal, used to extract the peak width.
    hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
    mass_array = df_mc_signal['inv_mass'].values
    for mass_value in np.nditer(mass_array):
        hmass.Fill(mass_value)

    gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    # Seed each parameter individually. ``SetParameters(i, value)`` would
    # instead assign the *leading* parameters (p0 = i, p1 = value), so the
    # intended start values (constant, mean, sigma) were never applied.
    gaus_fit.SetParameter(0, hmass.Integral())
    gaus_fit.SetParameter(1, self.p_mass)
    gaus_fit.SetParameter(2, 0.02)
    self.logger.info("To fit the signal a gaussian function is used")
    fitsucc = hmass.Fit("gaus_fit", "RQ")

    if int(fitsucc) != 0:
        # Only a warning: as before, the width of the (failed) fit is still
        # used below. A former ``sigma = 0.`` assignment here was a dead
        # store, immediately overwritten, and has been dropped.
        self.logger.warning("Problem in signal peak fit")

    sigma = gaus_fit.GetParameter(2)

    self.logger.info("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1))
    self.logger.info("Sigma of the gaussian: %.3e", sigma)
    sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]

    fig_signif_pevt = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
    plt.title("Significance Per Event vs Threshold", fontsize=20)
    fig_signif = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
    plt.title("Significance vs Threshold", fontsize=20)

    # The prompt test sample does not depend on the classifier: select it once.
    df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]

    for name in self.p_classname:
        eff_array, eff_err_array, x_axis = self.calc_sigeff_steps(self.p_nstepsign,
                                                                  df_sig, name)
        bkg_array, bkg_err_array, _ = calc_bkg(df_data_sideband, name, self.p_nstepsign,
                                               self.p_mass_fit_lim, self.p_bin_width,
                                               sig_region, self.p_savefit, self.dirmlplot)
        sig_array = [eff * signal_yield for eff in eff_array]
        sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
        # Normalise the background to one event of the ML sample.
        bkg_norm = self.p_bkgfracopt * self.p_nevtml
        bkg_array = [bkg / bkg_norm for bkg in bkg_array]
        bkg_err_array = [bkg_err / bkg_norm for bkg_err in bkg_err_array]
        signif_array, signif_err_array = calc_signif(sig_array, sig_err_array, bkg_array,
                                                     bkg_err_array)

        # Per-event significance.
        plt.figure(fig_signif_pevt.number)
        plt.errorbar(x_axis, signif_array, yerr=signif_err_array, alpha=0.3,
                     label=f'{name}', elinewidth=2.5, linewidth=4.0)

        # Significance scaled by sqrt(N) for the ML sample ...
        sqrt_nevtml = sqrt(self.p_nevtml)
        signif_array_ml = [sig * sqrt_nevtml for sig in signif_array]
        signif_err_array_ml = [sig_err * sqrt_nevtml for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml, alpha=0.3,
                     label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=4.0)

        # ... and for the total number of events.
        sqrt_nevttot = sqrt(self.p_nevttot)
        signif_array_tot = [sig * sqrt_nevttot for sig in signif_array]
        signif_err_array_tot = [sig_err * sqrt_nevttot for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, alpha=0.3,
                     label=f'{name}_Tot', elinewidth=2.5, linewidth=4.0)

    plt.figure(fig_signif_pevt.number)
    plt.legend(loc="lower left", prop={'size': 18})
    plt.savefig(f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
    plt.figure(fig_signif.number)
    plt.legend(loc="lower left", prop={'size': 18})
    plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')

    # Release the figures: pyplot keeps every figure alive until it is closed.
    plt.close(fig_signif_pevt)
    plt.close(fig_signif)
def do_significance(self):
    """Estimate and plot the expected significance for each classifier.

    Does nothing when ``self.step_done("significance")`` is true. Otherwise
    the method runs ``self.do_apply()`` and ``self.do_test()`` and then
      1. reads the event counts from the YAML file ``self.f_evt_count_ml``
         into ``self.p_nevttot`` and ``self.p_nevtml``;
      2. computes the acceptance from the prompt entries of ``self.df_mc``
         and ``self.df_mcgen`` via ``calc_eff``;
      3. derives an expected signal yield from FONLL, read either from a
         ROOT histogram (``self.is_fonll_from_root``) or from a CSV file,
         and saves a plot of the FONLL curve;
      4. fits a gaussian to the MC signal invariant mass to define a
         3-sigma signal region around ``self.p_mass``;
      5. for every name in ``self.p_classname`` combines the signal
         efficiencies and the sideband background estimates into a
         significance and draws it on two figures that are saved (PNG, plus
         a pickle of the scaled-significance figure) in ``self.dirmlplot``.

    Returns:
        None. Results are written to files and as attributes on ``self``.
    """
    if self.step_done("significance"):
        return

    self.do_apply()
    self.do_test()

    self.logger.info("Doing significance optimization")
    gROOT.SetBatch(True)
    gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")

    # First extract the number of data events in the ML sample.
    # This might need a revisit, for now just extract the numbers from the ML
    # merged event count (aka from a YAML since the actual events are not
    # needed). Before, the ML count was always taken from the ML merged event
    # df while the total number was taken from the event counter. But the
    # latter has basically not been used for a long time because
    # "dofullevtmerge" is mostly "false" in the DBs.
    count_dict = parse_yaml(self.f_evt_count_ml)
    self.p_nevttot = count_dict["evtorig"]
    self.p_nevtml = count_dict["evt"]
    self.logger.debug("Number of data events used for ML: %d", self.p_nevtml)
    self.logger.debug("Total number of data events: %d", self.p_nevttot)

    # Calculate the acceptance correction. All the signal of the MC sample is
    # used here, without limiting to the number of signal events used for
    # training.
    denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
    numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
    acc, acc_err = calc_eff(numacc, denacc)
    self.logger.debug("Acceptance: %.3e +/- %.3e", acc, acc_err)

    # Calculation of the expected FONLL signal.
    delta_pt = self.p_binmax - self.p_binmin
    if self.is_fonll_from_root:
        df_fonll = TFile.Open(self.f_fonll)
        df_fonll_Lc = df_fonll.Get(self.p_fonllparticle + "_" + self.p_fonllband)
        bin_min = df_fonll_Lc.FindBin(self.p_binmin)
        bin_max = df_fonll_Lc.FindBin(self.p_binmax)
        # NOTE(review): Integral(bin_min, bin_max) includes the bin that
        # contains p_binmax, while the CSV branch below uses pt < p_binmax --
        # confirm this matches the histogram binning.
        prod_cross = df_fonll_Lc.Integral(bin_min, bin_max) \
                     * self.p_fragf * 1e-12 / delta_pt
        signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa * self.p_br \
                       / (self.p_sigmamb * self.p_fprompt)
        # Now plot the FONLL expectation.
        cFONLL = TCanvas('cFONLL', 'The FONLL expectation')
        df_fonll_Lc.GetXaxis().SetRangeUser(0, 16)
        df_fonll_Lc.Draw("")
        cFONLL.SaveAs(f"{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png")
        # Release the canvas first, then the input file: the histogram is
        # not used past this point and the file was previously left open.
        cFONLL.Close()
        df_fonll.Close()
    else:
        df_fonll = pd.read_csv(self.f_fonll)
        df_fonll_in_pt = \
            df_fonll.query('(pt >= @self.p_binmin) and (pt < @self.p_binmax)')\
            [self.p_fonllband]
        prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / delta_pt
        signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                       / (self.p_sigmamb * self.p_fprompt)
        # Now plot the FONLL expectation.
        fig = plt.figure(figsize=(20, 15))
        plt.subplot(111)
        plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
        plt.xlabel('P_t [GeV/c]', fontsize=20)
        plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
        plt.title("FONLL cross section " + self.p_case, fontsize=20)
        plt.semilogy()
        plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
        plt.close(fig)

    self.logger.debug("Expected signal yield: %.3e", signal_yield)
    signal_yield = self.p_raahp * signal_yield
    self.logger.debug("Expected signal yield x RAA hp: %.3e", signal_yield)

    # Background sample: shuffled selection of the data, of which only the
    # last ``p_bkgfracopt`` fraction is kept.
    df_data_sideband = self.df_data.query(self.s_selbkgml)
    df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
    df_data_sideband = df_data_sideband.tail(
        round(len(df_data_sideband) * self.p_bkgfracopt))

    # Invariant-mass histogram of the MC signal, used to extract the peak width.
    hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0],
                 self.p_mass_fit_lim[1])
    df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
    mass_array = df_mc_signal[self.v_invmass].values
    for mass_value in np.nditer(mass_array):
        hmass.Fill(mass_value)

    gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    # Seed each parameter individually. ``SetParameters(i, value)`` would
    # instead assign the *leading* parameters (p0 = i, p1 = value), so the
    # intended start values (constant, mean, sigma) were never applied.
    gaus_fit.SetParameter(0, hmass.Integral())
    gaus_fit.SetParameter(1, self.p_mass)
    gaus_fit.SetParameter(2, 0.02)
    self.logger.debug("To fit the signal a gaussian function is used")
    fitsucc = hmass.Fit("gaus_fit", "RQ")

    if int(fitsucc) != 0:
        # Only a warning: as before, the width of the (failed) fit is still
        # used below. A former ``sigma = 0.`` assignment here was a dead
        # store, immediately overwritten, and has been dropped.
        self.logger.warning("Problem in signal peak fit")

    sigma = gaus_fit.GetParameter(2)

    self.logger.debug("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1))
    self.logger.debug("Sigma of the gaussian: %.3e", sigma)
    sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]

    fig_signif_pevt = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    fig_signif = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]

    for name in self.p_classname:
        eff_array, eff_err_array, x_axis = calc_sigeff_steps(
            self.p_nstepsign, df_sig, name)
        bkg_array, bkg_err_array, _ = calc_bkg(
            df_data_sideband, name, self.p_nstepsign, self.p_mass_fit_lim,
            self.p_bkg_func, self.p_bin_width, sig_region, self.p_savefit,
            self.dirmlplot, [self.p_binmin, self.p_binmax], self.v_invmass)
        sig_array = [eff * signal_yield for eff in eff_array]
        sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
        # Normalise the background to one event of the ML sample.
        bkg_norm = self.p_bkgfracopt * self.p_nevtml
        bkg_array = [bkg / bkg_norm for bkg in bkg_array]
        bkg_err_array = [bkg_err / bkg_norm for bkg_err in bkg_err_array]
        signif_array, signif_err_array = calc_signif(
            sig_array, sig_err_array, bkg_array, bkg_err_array)

        # Per-event significance.
        plt.figure(fig_signif_pevt.number)
        plt.errorbar(x_axis, signif_array, yerr=signif_err_array,
                     label=f'{name}', elinewidth=2.5, linewidth=5.0)

        # Significance scaled by sqrt(N) of the ML sample. The curve scaled
        # to the total number of events (self.p_nevttot) is not drawn.
        sqrt_nevtml = sqrt(self.p_nevtml)
        signif_array_ml = [sig * sqrt_nevtml for sig in signif_array]
        signif_err_array_ml = [sig_err * sqrt_nevtml for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml,
                     label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=5.0)

    plt.figure(fig_signif_pevt.number)
    plt.legend(loc="upper left", prop={'size': 30})
    plt.savefig(
        f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')

    plt.figure(fig_signif.number)
    # The pT-range label does not depend on the classifier, so it is drawn
    # once here instead of being stacked once per loop iteration.
    plt.text(
        0.7, 0.95,
        f" ${self.p_binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$",
        verticalalignment="center",
        transform=fig_signif.gca().transAxes, fontsize=30)
    plt.legend(loc="upper left", prop={'size': 30})
    # LaTeX rendering is enabled only for this one figure; the try/finally
    # guarantees it is switched off again even if saving fails.
    mpl.rcParams.update({"text.usetex": True})
    try:
        plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
    finally:
        mpl.rcParams.update({"text.usetex": False})

    # Keep a pickled copy of the figure next to the PNG.
    with open(f'{self.dirmlplot}/Significance_{self.s_suffix}.pickle', 'wb') as out:
        pickle.dump(fig_signif, out)

    plt.close(fig_signif_pevt)
    plt.close(fig_signif)