Пример #1
0
    def preparesample(self):
        """Load the input dataframes and build the ML train/test samples.

        Reads the pickled reconstructed data, reconstructed MC and generated
        MC dataframes, applies the ``v_bin`` selection between ``p_binmin`` and
        ``p_binmax`` through ``seldf_singlevar``, builds labelled signal and
        background samples, and splits them into training and testing sets.

        Sets ``df_data``, ``df_mc``, ``df_mcgen``, ``df_sig``, ``df_bkg``,
        ``df_ml``, ``df_mltrain``, ``df_mltest``, ``df_sigtrain``,
        ``df_bkgtrain``, ``df_sigtest``, ``df_bkgtest``, ``df_xtrain``,
        ``df_ytrain``, ``df_xtest`` and ``df_ytest``. ``p_nsig`` and ``p_nbkg``
        are clamped to the number of available candidates.
        """

        def _load_pickle(path):
            # Close the handle deterministically instead of leaving it to the
            # garbage collector.
            handle = openfile(path, "rb")
            try:
                return pickle.load(handle)
            finally:
                handle.close()

        self.logger.info("Prepare Sample")
        self.df_data = _load_pickle(self.f_reco_data)
        self.df_mc = _load_pickle(self.f_reco_mc)
        self.df_mcgen = _load_pickle(self.f_gen_mc)
        self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)

        # References to the frames as loaded; the signal/background sources
        # are picked from these by index and receive the bin selection below.
        arraydf = [self.df_data, self.df_mc]
        self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax)

        self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[self.p_tagbkg]
        self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_sig = self.df_sig.query(self.s_selsigml)
        self.df_bkg = self.df_bkg.query(self.s_selbkgml)
        # Background candidates get all MC-truth flags set to zero.
        self.df_bkg["ismcsignal"] = 0
        self.df_bkg["ismcprompt"] = 0
        self.df_bkg["ismcfd"] = 0
        self.df_bkg["ismcbkg"] = 0

        if self.p_nsig > len(self.df_sig):
            self.logger.warning("There are not enough signal events")
        if self.p_nbkg > len(self.df_bkg):
            self.logger.warning("There are not enough background events")

        # Never request more candidates than are available.
        self.p_nsig = min(len(self.df_sig), self.p_nsig)
        self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)

        self.logger.info("Used number of signal events is %d", self.p_nsig)
        self.logger.info("Used number of background events is %d", self.p_nbkg)

        # Shuffle before truncating so the kept candidates are a random subset.
        self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
        self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
        self.df_sig = self.df_sig[:self.p_nsig]
        self.df_bkg = self.df_bkg[:self.p_nbkg]
        # Label column: 1 for signal, 0 for background.
        self.df_sig[self.v_sig] = 1
        self.df_bkg[self.v_sig] = 0
        self.df_ml = pd.concat([self.df_sig, self.df_bkg])
        self.df_mltrain, self.df_mltest = train_test_split(
            self.df_ml, test_size=self.test_frac, random_state=self.rnd_splt)
        self.df_mltrain = self.df_mltrain.reset_index(drop=True)
        self.df_mltest = self.df_mltest.reset_index(drop=True)
        self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain, self.v_sig)
        self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest, self.v_sig)
        self.logger.info("Total number of candidates: train %d and test %d",
                         len(self.df_mltrain), len(self.df_mltest))
        self.logger.info("Number of signal candidates: train %d and test %d",
                         len(self.df_sigtrain), len(self.df_sigtest))
        self.logger.info("Number of bkg candidates: train %d and test %d",
                         len(self.df_bkgtrain), len(self.df_bkgtest))

        # Feature matrices and label vectors for training and testing.
        self.df_xtrain = self.df_mltrain[self.v_train]
        self.df_ytrain = self.df_mltrain[self.v_sig]
        self.df_xtest = self.df_mltest[self.v_train]
        self.df_ytest = self.df_mltest[self.v_sig]
Пример #2
0
    def preparesample(self):
        """Build the ML train/test samples, or reload them from disk.

        If both cached pickle files for the current ``p_binmin``/``p_binmax``
        exist in ``dirmlout`` and ``step_done("preparemlsamples")`` is truthy,
        the training and testing dataframes are read back from them.
        Otherwise the samples are built from the signal and background sources
        in ``arraydf``, labelled, split, and written to those files.

        Sets ``df_mltrain``, ``df_mltest``, ``df_sigtrain``, ``df_bkgtrain``,
        ``df_sigtest``, ``df_bkgtest``, ``df_xtrain``, ``df_ytrain``,
        ``df_xtest`` and ``df_ytest``. When rebuilding, it also sets
        ``df_sig``, ``df_bkg`` and ``df_ml``.
        """

        def _load_pickle(path):
            # Close the handle deterministically instead of leaving it to the
            # garbage collector.
            handle = openfile(path, "rb")
            try:
                return pickle.load(handle)
            finally:
                handle.close()

        def _dump_pickle(obj, path):
            # Closing explicitly guarantees the data is flushed to disk before
            # any later run tries to read the cache back.
            handle = openfile(path, "wb")
            try:
                pickle.dump(obj, handle, protocol=4)
            finally:
                handle.close()

        self.logger.info("Prepare Sample")

        filename_train = \
                os.path.join(self.dirmlout, f"df_train_{self.p_binmin}_{self.p_binmax}.pkl")
        filename_test = \
                os.path.join(self.dirmlout, f"df_test_{self.p_binmin}_{self.p_binmax}.pkl")

        if os.path.exists(filename_train) \
                and os.path.exists(filename_test) \
                and self.step_done("preparemlsamples"):
            # Reuse the samples written by a previous run.
            self.df_mltrain = _load_pickle(filename_train)
            self.df_mltest = _load_pickle(filename_test)

        else:

            self.prepare_data_mc_mcgen()

            self.df_sig, self.df_bkg = self.arraydf[
                self.p_tagsig], self.arraydf[self.p_tagbkg]
            self.df_sig = seldf_singlevar(self.df_sig, self.v_bin,
                                          self.p_binmin, self.p_binmax)
            self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin,
                                          self.p_binmin, self.p_binmax)
            self.df_sig = self.df_sig.query(self.s_selsigml)
            self.df_bkg = self.df_bkg.query(self.s_selbkgml)
            # Background candidates get all MC-truth flags set to zero.
            self.df_bkg["ismcsignal"] = 0
            self.df_bkg["ismcprompt"] = 0
            self.df_bkg["ismcfd"] = 0
            self.df_bkg["ismcbkg"] = 0

            if self.p_equalise_sig_bkg:
                # Cap both requests at the size of the smaller sample.
                self.p_nsig = min(len(self.df_sig), len(self.df_bkg),
                                  self.p_nsig)
                self.p_nbkg = min(len(self.df_sig), len(self.df_bkg),
                                  self.p_nbkg)

            # Shuffle before truncating so the kept candidates are a random
            # subset.
            self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
            self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
            self.df_sig = self.df_sig[:self.p_nsig]
            self.df_bkg = self.df_bkg[:self.p_nbkg]
            # Label column: 1 for signal, 0 for background.
            self.df_sig[self.v_sig] = 1
            self.df_bkg[self.v_sig] = 0
            self.df_ml = pd.concat([self.df_sig, self.df_bkg])
            self.df_mltrain, self.df_mltest = train_test_split(
                self.df_ml, test_size=self.test_frac, random_state=self.rnd_splt)
            self.df_mltrain = self.df_mltrain.reset_index(drop=True)
            self.df_mltest = self.df_mltest.reset_index(drop=True)

            # Write for later usage
            _dump_pickle(self.df_mltrain, filename_train)
            _dump_pickle(self.df_mltest, filename_test)

        # Now continue with extracting signal and background stats and report
        self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(
            self.df_mltrain, self.v_sig)
        self.df_sigtest, self.df_bkgtest = split_df_sigbkg(
            self.df_mltest, self.v_sig)
        self.logger.info("Total number of candidates: train %d and test %d",
                         len(self.df_mltrain), len(self.df_mltest))
        self.logger.info("Number of signal candidates: train %d and test %d",
                         len(self.df_sigtrain), len(self.df_sigtest))
        self.logger.info("Number of bkg candidates: train %d and test %d",
                         len(self.df_bkgtrain), len(self.df_bkgtest))

        self.logger.info("Aim for number of signal events: %d", self.p_nsig)
        self.logger.info("Aim for number of background events: %d",
                         self.p_nbkg)

        # Compare the requested counts with what ended up in train + test.
        if self.p_nsig > (len(self.df_sigtrain) + len(self.df_sigtest)):
            self.logger.warning("There are not enough signal events")
        if self.p_nbkg > (len(self.df_bkgtrain) + len(self.df_bkgtest)):
            self.logger.warning("There are not enough background events")

        if self.p_mask_values:
            self.logger.info("Masking values for training and testing")
            # NOTE(review): the return value is ignored, so mask_df presumably
            # modifies the dataframes in place - confirm.
            mask_df(self.df_mltrain, self.p_mask_values)
            mask_df(self.df_mltest, self.p_mask_values)
        # Final preparation of signal and background samples for training and testing
        self.df_xtrain = self.df_mltrain[self.v_train]
        self.df_ytrain = self.df_mltrain[self.v_sig]
        self.df_xtest = self.df_mltest[self.v_train]
        self.df_ytest = self.df_mltest[self.v_sig]

        # NOTE(review): return value is ignored here; presumably this call
        # records the step as completed - confirm against step_done.
        self.step_done("preparemlsamples")