Exemplo n.º 1
0
 def _n_bins(self, c, idx):
     """Return the number of bins configured for one variable of a combination.

     The fallback value depends on how many entries ``c`` has: the 2d default
     for two entries, the 3d default for three, and the 1d default otherwise.

     :param c: variable combination; only its length and the lookup use it
     :param int idx: index of the variable in c for which to return the binning
     :return: value found in ``self.var_number_of_bins``, or the fallback
     """
     # pick the attribute by name so that only the needed default is accessed
     fallback_attr = {2: '_default_n_bins_2d', 3: '_default_n_bins_3d'}.get(len(c), '_default_n_bins_1d')
     fallback = getattr(self, fallback_attr)
     return root_helper.get_variable_value(self.var_number_of_bins, c, idx, fallback)
    def _ignore_values(self, c, idx=0):
        """Look up the values to ignore for one variable.

        The per-variable setting in ``self.var_ignore_values`` is used when
        present; otherwise ``self.ignore_values`` is the fallback.  A result
        that is not already a list is wrapped in a single-element list.

        :param list c: list of variables, or string variable
        :param int idx: index of the variable in c, for which to return values to ignore
        :return: list of values to ignore
        """
        values = root_helper.get_variable_value(self.var_ignore_values, c, idx, self.ignore_values)
        return values if isinstance(values, list) else [values]
    def _accept_categories(self, c, idx=0):
        """Look up the categories to accept for one variable.

        The per-variable setting in ``self.var_accept_categories`` is used when
        present; otherwise ``self.accept_categories`` is the fallback.

        :param list c: list of variables, or string variable
        :param int idx: index of the variable in c, for which to return categories to accept
        :return: list of categories to accept
        """
        categories = root_helper.get_variable_value(self.var_accept_categories, c, idx, self.accept_categories)
        if isinstance(categories, list):
            return categories
        # a single (non-list) setting is returned as a one-element list
        return [categories]
    def execute(self):
        """Execute the link.

        Steps performed:

        1. Fetch the RooDataSet and its RooArgSet, match the configured columns
           against it, pick up existing report pages and variable binnings from
           the datastore, and allocate the chi2 / correlation / significance
           matrices.
        2. Build the list of column combinations and, for each of them, fill a
           RooDataHist (after applying binning and value/category cuts) and,
           depending on the ``calc_*`` flags, compute the chi2-based
           correlation, the significance and the normalized residuals.
        3. Create the report pages and store the results in the datastore.

        :return: ``StatusCode.Success``
        :raises AssertionError: if a pages key does not refer to a list, if
            ``inproduct`` is set with x/y column lists of different length, or
            if no category remains selected for a RooCategory variable
        """
        ds = process_manager.service(DataStore)

        # 1a. basic checks on contents of the roodataset
        rds = self._get_roodataset()

        # 1b. retrieve read_key_vars rooargset from datastore
        varset = self._get_rooargset(rds)

        # 1c. check provided columns
        #     match all provided columns to varset
        self._match_columns_with_rooargset(varset)

        # 1e. retrieve map_to_original from ds
        self._retrieve_map_to_original()

        # 1f. create report pages
        # data scientist report
        self.pages = []
        if self.pages_key:
            self.pages = ds.get(self.pages_key, [])
            assert isinstance(self.pages, list), 'Pages key "{key}" does not refer to a list'.format(key=self.pages_key)
        # client report
        self.client_pages = []
        if self.client_pages_key:
            self.client_pages = ds.get(self.client_pages_key, [])
            assert isinstance(self.client_pages, list), \
                'Client pages key "{key}" does not refer to a list'.format(key=self.client_pages_key)

        if self.var_binning_key:
            self.var_binning.update(ds.get(self.var_binning_key, {}))

        # 1g. initialize matrices
        #     explicit x/y column lists take precedence over the symmetric 'columns' list
        if self.columns:
            self.nx = len(self.columns)
            self.ny = len(self.columns)
            self.x_cols = self.columns
            self.y_cols = self.columns
        if self.x_columns or self.y_columns:
            self.nx = len(self.x_columns)
            self.ny = len(self.y_columns)
            self.x_cols = self.x_columns
            self.y_cols = self.y_columns
        self.chi2_matrix = np.zeros((self.ny, self.nx))
        self.correlation_matrix = np.ones((self.ny, self.nx))
        self.significance_matrix = np.zeros((self.ny, self.nx))
        self.symmetrize = bool(self.columns)
        self.n_bins = self.nx * self.ny if not self.symmetrize else self.nx * self.nx - self.nx
        self.n_unique = self.n_bins if not self.symmetrize else (self.nx * self.nx - self.nx) / 2

        # 2a. loop over unique column pairs and add to combinations
        for idx, c1 in enumerate(self.columns):
            for c2 in self.columns[idx + 1:]:
                self.combinations.append([c1, c2])
        # add left-right pair combinations
        if self.x_columns and self.inproduct:
            assert len(self.x_columns) == len(self.y_columns)
        for i, c1 in enumerate(self.x_columns):
            if self.inproduct:
                # pair the i-th x column with the i-th y column only
                c2 = self.y_columns[i]
                self.combinations.append([c1, c2])
            else:
                # full outer product of x and y columns
                for c2 in self.y_columns:
                    self.combinations.append([c1, c2])

        # 2b. loop over all combinations: calculate significance and residuals
        n_combos = len(self.combinations)
        self.n_entries = rds.numEntries()
        for i_c, combo in enumerate(self.combinations):
            combo_name = ':'.join(combo)
            # make roodatahist for each combination
            obsset = ROOT.RooArgSet()
            for c in combo:
                obsset.add(varset.find(c))
            # selection string; '1' accepts everything, cuts are and-ed onto it below
            cut_str = '1'
            # set binning of variables, can differ per combo
            c_nbins = []
            for j, var in enumerate(obsset):
                if isinstance(var, ROOT.RooRealVar):
                    if var.GetName() in self.var_binning and isinstance(self.var_binning[var.GetName()], ROOT.RooBinning):
                        binning = self.var_binning[var.GetName()]
                        var.setBinning(binning)
                    else:
                        # -999. / 999. act as "not configured" sentinels for the range
                        var_min = root_helper.get_variable_value(self.var_min_value, combo, j, -999.)
                        # the upper bound comes from the max-value settings, not the min-value ones
                        var_max = root_helper.get_variable_value(self.var_max_value, combo, j, 999.)
                        if var_min != -999.:
                            var.setMin(var_min)
                        if var_max != 999.:
                            var.setMax(var_max)
                        nbins = root_helper.get_variable_value(self.var_number_of_bins, combo, j,
                                                               self.default_number_of_bins)
                        var.setBins(nbins)
                    c_nbins.append(var.numBins())
                    # ignore values
                    ignore_values = self._ignore_values(combo, j)
                    for iv in ignore_values:
                        cut_str += ' && ({}!={})'.format(var.GetName(), iv)
                elif isinstance(var, ROOT.RooCategory):
                    ntypes = var.numTypes()
                    accept_categories = self._accept_categories(combo, j)
                    if accept_categories:
                        # only labels that are valid for this category count
                        ntypes = sum(var.isValidLabel(ic) for ic in accept_categories)
                        if ntypes:
                            # or-chain of accepted labels; the leading 0 makes every term start with '||'
                            select_str = " && (0 "
                            for ic in accept_categories:
                                if not var.isValidLabel(ic):
                                    continue
                                select_str += '|| ({}=={}::{})'.format(var.GetName(), var.GetName(), ic)
                            cut_str += select_str + ')'
                    ignore_categories = self._ignore_categories(combo, j)
                    for ic in ignore_categories:
                        if not var.isValidLabel(ic):
                            continue
                        ntypes -= 1
                        cut_str += ' && ({}!={}::{})'.format(var.GetName(), var.GetName(), ic)
                    assert ntypes>0, 'Number of (selected) categories for category {} is not positive: {}'.format(var.GetName(), ntypes)
                    c_nbins.append(ntypes)
            # remove specific categories (e.g. nan) if this has been requested so
            rdh = ROOT.RooDataHist(combo_name, combo_name, obsset)
            red = rds.reduce(ROOT.RooFit.Cut(cut_str))
            rdh.add(red)
            del red

            self.logger.debug(
                'Now processing combination ({index:d}/{total:d}): '
                '{comb} with {nbins} bins and {nentries} entries.',
                index=i_c + 1, total=n_combos, comb=str(combo), nbins=rdh.numEntries(), nentries=rdh.sumEntries())

            # 0) calculate global correlation of combo
            if self.calc_correlations:
                chi2_value = ROOT.Eskapade.ABCD.Chi2OfUncorrelatedHypothesis(rdh, obsset)
                if len(combo) == 2 and self.nx > 0 and self.ny > 0:
                    rho = correlation.rho_from_chi2(chi2_value, rdh.sumEntries(), c_nbins[0], c_nbins[1])
                    ix = self.x_cols.index(combo[0])
                    iy = self.y_cols.index(combo[1])
                    if ix < self.nx and iy < self.ny:
                        self.chi2_matrix[iy, ix] = chi2_value
                        self.correlation_matrix[iy, ix] = rho
                        if self.symmetrize:
                            self.chi2_matrix[ix, iy] = chi2_value
                            self.correlation_matrix[ix, iy] = rho

            # a) calculate global significance of combo
            if self.calc_significance:
                Zi = ROOT.Eskapade.ABCD.SignificanceOfUncorrelatedHypothesis(rdh, obsset, self.nsims_per_significance)
                self.significance_map[combo_name] = Zi
                self.logger.debug(
                    'Combination {comb!s} has significance: {zi:f}.', comb=combo, zi=Zi)
                if len(combo) == 2 and self.nx > 0 and self.ny > 0:
                    ix = self.x_cols.index(combo[0])
                    iy = self.y_cols.index(combo[1])
                    if ix < self.nx and iy < self.ny:
                        self.significance_matrix[iy, ix] = Zi
                        if self.symmetrize:
                            self.significance_matrix[ix, iy] = Zi

            # b) calculate residuals
            if self.calc_residuals:
                success = ROOT.Eskapade.ABCD.checkInputData(rdh)
                if not success:
                    self.logger.warning('Cannot calculate residuals for combination: {comb!s}. Skipping.', comb=combo)
                    del rdh
                    continue
                residi = ROOT.Eskapade.ABCD.GetNormalizedResiduals(rdh, obsset)
                dfri = data_conversion.rds_to_df(residi)
                del rdh
                del residi
                # do the mapping of roofit categories back to original format
                if self.mto:
                    dfri.replace(self.mto, inplace=True)
                self.residuals_map[combo_name] = dfri

        # below, create report page for each variable in data frame
        # create resulting heatmaps and histograms
        if self.calc_correlations:
            self._make_correlations_report()
        if self.calc_significance:
            self._make_significance_report()
        if self.calc_residuals:
            self._make_residuals_report()

        # 3. storage
        if self.hist_dict_key:
            ds[self.hist_dict_key] = self.hist_dict
        if self.pages_key:
            ds[self.pages_key] = self.pages
        # NOTE(review): self.client_pages is not written back under client_pages_key here;
        # confirm whether that is intentional
        if self.sk_significance_map:
            ds[self.sk_significance_map] = self.significance_map
            self.logger.debug('Stored significance map in data store under key: {key}.', key=self.sk_significance_map)
        if self.sk_residuals_map:
            ds[self.sk_residuals_map] = self.residuals_map
            self.logger.debug('Stored residuals map in data store under key: {key}.', key=self.sk_residuals_map)
        if len(self.sk_residuals_overview) > 0 and len(self.resid_all) > 0:
            ds[self.sk_residuals_overview] = self.resid_all
            self.logger.debug('Stored residuals list in data store under key: {key}.', key=self.sk_residuals_overview)
        # the matrices are shaped (ny, nx): rows follow y_cols, columns follow x_cols
        if self.correlation_key:
            ds[self.correlation_key] = pd.DataFrame(self.correlation_matrix, index=self.y_cols, columns=self.x_cols)
        if self.significance_key:
            ds[self.significance_key] = pd.DataFrame(self.significance_matrix, index=self.y_cols, columns=self.x_cols)

        return StatusCode.Success
Exemplo n.º 5
0
 def _max(self, c, idx):
     """Return the maximum value configured for one variable of a combination.

     :param c: variable combination passed on to the lookup
     :param int idx: index of the variable in c for which to return the maximum
     :return: value found in ``self.var_max_value``, or ``self._default_max``
     """
     lookup = root_helper.get_variable_value
     return lookup(self.var_max_value, c, idx, self._default_max)