示例#1
0
    def do_significance(self):
        self.logger.info("Doing significance optimization")
        #first extract the number of data events in the ml sample
        self.df_evt_data = pickle.load(openfile(self.f_evt_data, 'rb'))
        if self.p_dofullevtmerge is True:
            self.df_evttotsample_data = pickle.load(openfile(self.f_evttotsample_data, 'rb'))
        else:
            self.logger.info("The total merged event dataframe was not merged \
                             for space limits")
            self.df_evttotsample_data = pickle.load(openfile(self.f_evt_data, 'rb'))
        #and the total number of events
        self.p_nevttot = len(self.df_evttotsample_data)
        self.p_nevtml = len(self.df_evt_data)
        self.logger.info("Number of data events used for ML: %d", self.p_nevtml)
        self.logger.info("Total number of data events: %d", self.p_nevttot)
        #calculate acceptance correction. we use in this case all
        #the signal from the mc sample, without limiting to the n. signal
        #events used for training
        denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
        numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
        acc, acc_err = self.calceff(numacc, denacc)

        self.logger.info("Acceptance: %.3e +/- %.3e", acc, acc_err)
        #calculation of the expected fonll signals
        df_fonll = pd.read_csv(self.f_fonll)
        ptmin = self.p_binmin
        ptmax = self.p_binmax
        df_fonll_in_pt = df_fonll.query('(pt >= @ptmin) and (pt < @ptmax)')[self.p_fonllband]
        prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / len(df_fonll_in_pt)
        delta_pt = ptmax - ptmin
        signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                       / (self.p_sigmamb * self.p_fprompt)
        self.logger.info("Expected signal yield: %.3e", signal_yield)
        signal_yield = self.p_raahp * signal_yield
        self.logger.info("Expected signal yield x RAA hp: %.3e", signal_yield)

        #now we plot the fonll expectation
        plt.figure(figsize=(20, 15))
        plt.subplot(111)
        plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
        plt.xlabel('P_t [GeV/c]', fontsize=20)
        plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
        plt.title("FONLL cross section " + self.p_case, fontsize=20)
        plt.semilogy()
        plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')

        df_data_sideband = self.df_data.query(self.s_selbkgml)
        df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
        df_data_sideband = df_data_sideband.tail(round(len(df_data_sideband) * self.p_bkgfracopt))
        hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
        mass_array = df_mc_signal['inv_mass'].values
        for mass_value in np.nditer(mass_array):
            hmass.Fill(mass_value)

        gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        gaus_fit.SetParameters(0, hmass.Integral())
        gaus_fit.SetParameters(1, self.p_mass)
        gaus_fit.SetParameters(2, 0.02)
        self.logger.info("To fit the signal a gaussian function is used")
        fitsucc = hmass.Fit("gaus_fit", "RQ")

        if int(fitsucc) != 0:
            self.logger.warning("Problem in signal peak fit")
            sigma = 0.

        sigma = gaus_fit.GetParameter(2)
        self.logger.info("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1))
        self.logger.info("Sigma of the gaussian: %.3e", sigma)
        sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]
        fig_signif_pevt = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
        plt.title("Significance Per Event vs Threshold", fontsize=20)
        fig_signif = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
        plt.title("Significance vs Threshold", fontsize=20)

        for name in self.p_classname:
            df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]
            eff_array, eff_err_array, x_axis = self.calc_sigeff_steps(self.p_nstepsign,
                                                                      df_sig, name)
            bkg_array, bkg_err_array, _ = calc_bkg(df_data_sideband, name, self.p_nstepsign,
                                                   self.p_mass_fit_lim, self.p_bin_width,
                                                   sig_region, self.p_savefit, self.dirmlplot)
            sig_array = [eff * signal_yield for eff in eff_array]
            sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
            bkg_array = [bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array]
            bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) \
                             for bkg_err in bkg_err_array]
            signif_array, signif_err_array = calc_signif(sig_array, sig_err_array, bkg_array,
                                                         bkg_err_array)
            plt.figure(fig_signif_pevt.number)
            plt.errorbar(x_axis, signif_array, yerr=signif_err_array, alpha=0.3, label=f'{name}',
                         elinewidth=2.5, linewidth=4.0)
            signif_array_ml = [sig * sqrt(self.p_nevtml) for sig in signif_array]
            signif_err_array_ml = [sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml, alpha=0.3,
                         label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=4.0)
            signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
            signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, alpha=0.3,
                         label=f'{name}_Tot', elinewidth=2.5, linewidth=4.0)
            plt.figure(fig_signif_pevt.number)
            plt.legend(loc="lower left", prop={'size': 18})
            plt.savefig(f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
            plt.figure(fig_signif.number)
            plt.legend(loc="lower left", prop={'size': 18})
            plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
示例#2
0
    def do_significance(self):
        if self.step_done("significance"):
            return

        self.do_apply()
        self.do_test()

        self.logger.info("Doing significance optimization")
        gROOT.SetBatch(True)
        gROOT.ProcessLine("gErrorIgnoreLevel = kWarning;")
        #first extract the number of data events in the ml sample
        # This might need a revisit, for now just extract the numbers from the ML merged
        # event count (aka from a YAML since the actual events are not needed)
        # Before the ML count was always taken from the ML merged event df while the total
        # number was taken from the event counter. But the latter is basically not used
        # anymore for a long time cause "dofullevtmerge" is mostly "false" in the DBs
        #and the total number of events
        count_dict = parse_yaml(self.f_evt_count_ml)
        self.p_nevttot = count_dict["evtorig"]
        self.p_nevtml = count_dict["evt"]
        self.logger.debug("Number of data events used for ML: %d",
                          self.p_nevtml)
        self.logger.debug("Total number of data events: %d", self.p_nevttot)
        #calculate acceptance correction. we use in this case all
        #the signal from the mc sample, without limiting to the n. signal
        #events used for training
        denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
        numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
        acc, acc_err = calc_eff(numacc, denacc)
        self.logger.debug("Acceptance: %.3e +/- %.3e", acc, acc_err)
        #calculation of the expected fonll signals
        delta_pt = self.p_binmax - self.p_binmin
        if self.is_fonll_from_root:
            df_fonll = TFile.Open(self.f_fonll)
            df_fonll_Lc = df_fonll.Get(self.p_fonllparticle + "_" +
                                       self.p_fonllband)
            bin_min = df_fonll_Lc.FindBin(self.p_binmin)
            bin_max = df_fonll_Lc.FindBin(self.p_binmax)
            prod_cross = df_fonll_Lc.Integral(
                bin_min, bin_max) * self.p_fragf * 1e-12 / delta_pt
            signal_yield = 2. * prod_cross * delta_pt * acc * self.p_taa * self.p_br \
                           / (self.p_sigmamb * self.p_fprompt)
            #now we plot the fonll expectation
            cFONLL = TCanvas('cFONLL', 'The FONLL expectation')
            df_fonll_Lc.GetXaxis().SetRangeUser(0, 16)
            df_fonll_Lc.Draw("")
            cFONLL.SaveAs("%s/FONLL_curve_%s.png" %
                          (self.dirmlplot, self.s_suffix))
        else:
            df_fonll = pd.read_csv(self.f_fonll)
            df_fonll_in_pt = \
                    df_fonll.query('(pt >= @self.p_binmin) and (pt < @self.p_binmax)')\
                    [self.p_fonllband]
            prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / delta_pt
            signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                           / (self.p_sigmamb * self.p_fprompt)
            #now we plot the fonll expectation
            fig = plt.figure(figsize=(20, 15))
            plt.subplot(111)
            plt.plot(df_fonll['pt'],
                     df_fonll[self.p_fonllband] * self.p_fragf,
                     linewidth=4.0)
            plt.xlabel('P_t [GeV/c]', fontsize=20)
            plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
            plt.title("FONLL cross section " + self.p_case, fontsize=20)
            plt.semilogy()
            plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
            plt.close(fig)

        self.logger.debug("Expected signal yield: %.3e", signal_yield)
        signal_yield = self.p_raahp * signal_yield
        self.logger.debug("Expected signal yield x RAA hp: %.3e", signal_yield)

        df_data_sideband = self.df_data.query(self.s_selbkgml)
        df_data_sideband = shuffle(df_data_sideband,
                                   random_state=self.rnd_shuffle)
        df_data_sideband = df_data_sideband.tail(
            round(len(df_data_sideband) * self.p_bkgfracopt))
        hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0],
                     self.p_mass_fit_lim[1])
        df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
        mass_array = df_mc_signal[self.v_invmass].values
        for mass_value in np.nditer(mass_array):
            hmass.Fill(mass_value)

        gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0],
                       self.p_mass_fit_lim[1])
        gaus_fit.SetParameters(0, hmass.Integral())
        gaus_fit.SetParameters(1, self.p_mass)
        gaus_fit.SetParameters(2, 0.02)
        self.logger.debug("To fit the signal a gaussian function is used")
        fitsucc = hmass.Fit("gaus_fit", "RQ")

        if int(fitsucc) != 0:
            self.logger.warning("Problem in signal peak fit")
            sigma = 0.

        sigma = gaus_fit.GetParameter(2)
        self.logger.debug("Mean of the gaussian: %.3e",
                          gaus_fit.GetParameter(1))
        self.logger.debug("Sigma of the gaussian: %.3e", sigma)
        sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]
        fig_signif_pevt = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
        #plt.title("Significance Per Event vs Threshold", fontsize=20)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        fig_signif = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
        #plt.title("Significance vs Threshold", fontsize=20)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)

        df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]

        for name in self.p_classname:
            eff_array, eff_err_array, x_axis = calc_sigeff_steps(
                self.p_nstepsign, df_sig, name)
            bkg_array, bkg_err_array, _ = calc_bkg(
                df_data_sideband, name, self.p_nstepsign, self.p_mass_fit_lim,
                self.p_bkg_func, self.p_bin_width, sig_region, self.p_savefit,
                self.dirmlplot, [self.p_binmin, self.p_binmax], self.v_invmass)
            sig_array = [eff * signal_yield for eff in eff_array]
            sig_err_array = [
                eff_err * signal_yield for eff_err in eff_err_array
            ]
            bkg_array = [
                bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array
            ]
            bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) \
                             for bkg_err in bkg_err_array]
            signif_array, signif_err_array = calc_signif(
                sig_array, sig_err_array, bkg_array, bkg_err_array)
            plt.figure(fig_signif_pevt.number)
            plt.errorbar(x_axis,
                         signif_array,
                         yerr=signif_err_array,
                         label=f'{name}',
                         elinewidth=2.5,
                         linewidth=5.0)
            signif_array_ml = [
                sig * sqrt(self.p_nevtml) for sig in signif_array
            ]
            signif_err_array_ml = [
                sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array
            ]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis,
                         signif_array_ml,
                         yerr=signif_err_array_ml,
                         label=f'{name}_ML_dataset',
                         elinewidth=2.5,
                         linewidth=5.0)
            plt.text(
                0.7,
                0.95,
                f" ${self.p_binmin} < p_\\mathrm{{T}}/(\\mathrm{{GeV}}/c) < {self.p_binmax}$",
                verticalalignment="center",
                transform=fig_signif.gca().transAxes,
                fontsize=30)
            #signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
            #signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array]
            #plt.figure(fig_signif.number)
            #plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot,
            #             label=f'{name}_Tot', elinewidth=2.5, linewidth=5.0)
            plt.figure(fig_signif_pevt.number)
            plt.legend(loc="upper left", prop={'size': 30})
            plt.savefig(
                f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
            plt.figure(fig_signif.number)
            plt.legend(loc="upper left", prop={'size': 30})
            mpl.rcParams.update({"text.usetex": True})
            plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
            mpl.rcParams.update({"text.usetex": False})

            with open(f'{self.dirmlplot}/Significance_{self.s_suffix}.pickle',
                      'wb') as out:
                pickle.dump(fig_signif, out)

            plt.close(fig_signif_pevt)
            plt.close(fig_signif)