def _n_bins(self, c, idx):
    """Return the configured number of bins for variable idx of c.

    The fallback default depends on the dimensionality of the variable
    combination: two variables use the 2d default, three the 3d default,
    anything else the 1d default.

    :param list c: list of variables, or string variable
    :param int idx: index of the variable in c for which to return the number of bins
    :return: number of bins
    """
    dim_defaults = {2: self._default_n_bins_2d, 3: self._default_n_bins_3d}
    fallback = dim_defaults.get(len(c), self._default_n_bins_1d)
    return root_helper.get_variable_value(self.var_number_of_bins, c, idx, fallback)
def _ignore_values(self, c, idx=0):
    """Determine list of values to ignore.

    :param list c: list of variables, or string variable
    :param int idx: index of the variable in c, for which to return values to ignore
    :return: list of values to ignore
    """
    values = root_helper.get_variable_value(self.var_ignore_values, c, idx, self.ignore_values)
    # normalize a scalar setting to a one-element list
    return values if isinstance(values, list) else [values]
def _accept_categories(self, c, idx=0):
    """Determine list of categories to accept.

    :param list c: list of variables, or string variable
    :param int idx: index of the variable in c, for which to return categories to accept
    :return: list of categories to accept
    """
    categories = root_helper.get_variable_value(self.var_accept_categories, c, idx, self.accept_categories)
    # normalize a scalar setting to a one-element list
    return categories if isinstance(categories, list) else [categories]
def execute(self):
    """Execute the link.

    For every configured combination of columns of the input RooDataSet,
    build a RooDataHist (with per-variable binning, range cuts, ignored
    values and accepted/ignored categories applied), then compute the
    requested statistics: chi2/correlation of the uncorrelated hypothesis,
    its significance, and normalized residuals. Results are turned into
    report pages and stored in the datastore under the configured keys.

    :return: status code of execution
    :rtype: StatusCode
    """
    ds = process_manager.service(DataStore)

    # 1a. basic checks on contents of the roodataset
    rds = self._get_roodataset()
    # 1b. retrieve read_key_vars rooargset from datastore
    varset = self._get_rooargset(rds)
    # 1c. check provided columns
    # match all provided columns to varset
    self._match_columns_with_rooargset(varset)
    # 1e. retrieve map_to_original from ds
    self._retrieve_map_to_original()

    # 1f. create report pages
    # data scientist report
    self.pages = []
    if self.pages_key:
        self.pages = ds.get(self.pages_key, [])
        assert isinstance(self.pages, list), 'Pages key "{key}" does not refer to a list'.format(key=self.pages_key)
    # client report
    self.client_pages = []
    if self.client_pages_key:
        self.client_pages = ds.get(self.client_pages_key, [])
        assert isinstance(self.client_pages, list), \
            'Client pages key "{key}" does not refer to a list'.format(key=self.client_pages_key)
    # optional per-variable binning overrides picked up from the datastore
    if self.var_binning_key:
        self.var_binning.update(ds.get(self.var_binning_key, {}))

    # 1g. initialize matrices
    # self.columns means an all-vs-all (symmetric) scan; x/y columns mean a
    # rectangular left-vs-right scan.
    if self.columns:
        self.nx = len(self.columns)
        self.ny = len(self.columns)
        self.x_cols = self.columns
        self.y_cols = self.columns
    if self.x_columns or self.y_columns:
        self.nx = len(self.x_columns)
        self.ny = len(self.y_columns)
        self.x_cols = self.x_columns
        self.y_cols = self.y_columns
    self.chi2_matrix = np.zeros((self.ny, self.nx))
    self.correlation_matrix = np.ones((self.ny, self.nx))
    self.significance_matrix = np.zeros((self.ny, self.nx))
    self.symmetrize = True if self.columns else False
    # number of off-diagonal cells; in the symmetric case each unique pair
    # fills two cells, so the unique count is half of that
    self.n_bins = self.nx * self.ny if not self.symmetrize else self.nx * self.nx - self.nx
    self.n_unique = self.n_bins if not self.symmetrize else (self.nx * self.nx - self.nx) / 2

    # 2a. loop over unique column pairs and add to combinations
    for idx, c1 in enumerate(self.columns):
        for c2 in self.columns[idx + 1:]:
            self.combinations.append([c1, c2])
    # add left-right pair combinations
    if self.x_columns and self.inproduct:
        # inproduct pairs x and y columns element-wise, so lengths must match
        assert len(self.x_columns) == len(self.y_columns)
    for i, c1 in enumerate(self.x_columns):
        if self.inproduct:
            c2 = self.y_columns[i]
            self.combinations.append([c1, c2])
        else:
            for j, c2 in enumerate(self.y_columns):
                self.combinations.append([c1, c2])

    # 2b. loop over all combinations: calculate significance and residuals
    n_combos = len(self.combinations)
    self.n_entries = rds.numEntries()
    for i_c, combo in enumerate(self.combinations):
        combo_name = ':'.join(combo)
        # make roodatahist for each combination
        obsset = ROOT.RooArgSet()
        for c in combo:
            obsset.add(varset.find(c))
        cut_str = '1'
        # set binning of variables, can differ per combo
        c_nbins = []
        for j, var in enumerate(obsset):
            if isinstance(var, ROOT.RooRealVar):
                if var.GetName() in self.var_binning and isinstance(self.var_binning[var.GetName()], ROOT.RooBinning):
                    # explicit RooBinning override takes precedence
                    binning = self.var_binning[var.GetName()]
                    var.setBinning(binning)
                else:
                    # -999./999. act as "not configured" sentinels
                    var_min = root_helper.get_variable_value(self.var_min_value, combo, j, -999.)
                    # FIX: was reading self.var_min_value for the maximum as well,
                    # so a configured maximum was silently ignored.
                    var_max = root_helper.get_variable_value(self.var_max_value, combo, j, 999.)
                    if var_min != -999.:
                        var.setMin(var_min)
                    if var_max != 999.:
                        var.setMax(var_max)
                    nbins = root_helper.get_variable_value(self.var_number_of_bins, combo, j,
                                                           self.default_number_of_bins)
                    var.setBins(nbins)
                c_nbins.append(var.numBins())
                # ignore values
                ignore_values = self._ignore_values(combo, j)
                for iv in ignore_values:
                    cut_str += ' && ({}!={})'.format(var.GetName(), iv)
            elif isinstance(var, ROOT.RooCategory):
                ntypes = var.numTypes()
                accept_categories = self._accept_categories(combo, j)
                if accept_categories:
                    # count only the accept-labels that are valid for this category
                    ntypes = sum([var.isValidLabel(ic) for ic in accept_categories])
                    if ntypes:
                        select_str = " && (0 "
                        for ic in accept_categories:
                            if not var.isValidLabel(ic):
                                continue
                            select_str += '|| ({}=={}::{})'.format(var.GetName(), var.GetName(), ic)
                        cut_str += select_str + ')'
                ignore_categories = self._ignore_categories(combo, j)
                for ic in ignore_categories:
                    if not var.isValidLabel(ic):
                        continue
                    ntypes -= 1
                    cut_str += ' && ({}!={}::{})'.format(var.GetName(), var.GetName(), ic)
                assert ntypes > 0, \
                    'Number of (selected) categories for category {} is not positive: {}'.format(var.GetName(), ntypes)
                c_nbins.append(ntypes)
        # remove specific categories (e.g. nan) if this has been requested so
        rdh = ROOT.RooDataHist(combo_name, combo_name, obsset)
        red = rds.reduce(ROOT.RooFit.Cut(cut_str))
        rdh.add(red)
        del red
        # rdh.add(rds)
        self.logger.debug(
            'Now processing combination ({index:d}/{total:d}): '
            '{comb} with {nbins} bins and {nentries} entries.',
            index=i_c + 1, total=n_combos, comb=str(combo),
            nbins=rdh.numEntries(), nentries=rdh.sumEntries())

        # 0) calculate global correlation of combo
        if self.calc_correlations:
            chi2_value = ROOT.Eskapade.ABCD.Chi2OfUncorrelatedHypothesis(rdh, obsset)
            if len(combo) == 2 and self.nx > 0 and self.ny > 0:
                rho = correlation.rho_from_chi2(chi2_value, rdh.sumEntries(), c_nbins[0], c_nbins[1])
                ix = self.x_cols.index(combo[0])
                iy = self.y_cols.index(combo[1])
                if ix < self.nx and iy < self.ny:
                    self.chi2_matrix[iy, ix] = chi2_value
                    self.correlation_matrix[iy, ix] = rho
                    if self.symmetrize:
                        self.chi2_matrix[ix, iy] = chi2_value
                        self.correlation_matrix[ix, iy] = rho

        # a) calculate global significance of combo
        if self.calc_significance:
            Zi = ROOT.Eskapade.ABCD.SignificanceOfUncorrelatedHypothesis(rdh, obsset, self.nsims_per_significance)
            self.significance_map[combo_name] = Zi
            self.logger.debug('Combination {comb!s} has significance: {zi:f}.', comb=combo, zi=Zi)
            if len(combo) == 2 and self.nx > 0 and self.ny > 0:
                ix = self.x_cols.index(combo[0])
                iy = self.y_cols.index(combo[1])
                if ix < self.nx and iy < self.ny:
                    self.significance_matrix[iy, ix] = Zi
                    if self.symmetrize:
                        self.significance_matrix[ix, iy] = Zi

        # b) calculate residuals
        if self.calc_residuals:
            success = ROOT.Eskapade.ABCD.checkInputData(rdh)
            if not success:
                self.logger.warning('Cannot calculate residuals for combination: {comb!s}. Skipping.', comb=combo)
                del rdh
                continue
            residi = ROOT.Eskapade.ABCD.GetNormalizedResiduals(rdh, obsset)
            dfri = data_conversion.rds_to_df(residi)
            del rdh
            del residi
            # do the mapping of roofit categories back to original format
            if self.mto:
                dfri.replace(self.mto, inplace=True)
            self.residuals_map[combo_name] = dfri

    # below, create report page for each variable in data frame
    # create resulting heatmaps and histograms
    if self.calc_correlations:
        self._make_correlations_report()
    if self.calc_significance:
        self._make_significance_report()
    if self.calc_residuals:
        self._make_residuals_report()

    # 3. storage
    if self.hist_dict_key:
        ds[self.hist_dict_key] = self.hist_dict
    if self.pages_key:
        ds[self.pages_key] = self.pages
    if self.sk_significance_map:
        ds[self.sk_significance_map] = self.significance_map
        self.logger.debug('Stored significance map in data store under key: {key}.', key=self.sk_significance_map)
    if self.sk_residuals_map:
        ds[self.sk_residuals_map] = self.residuals_map
        self.logger.debug('Stored residuals map in data store under key: {key}.', key=self.sk_residuals_map)
    if len(self.sk_residuals_overview) > 0 and len(self.resid_all) > 0:
        ds[self.sk_residuals_overview] = self.resid_all
        self.logger.debug('Stored residuals list in data store under key: {key}.', key=self.sk_residuals_overview)
    if self.correlation_key:
        ds[self.correlation_key] = pd.DataFrame(self.correlation_matrix, index=self.columns, columns=self.columns)
    if self.significance_key:
        ds[self.significance_key] = pd.DataFrame(self.significance_matrix, index=self.columns, columns=self.columns)

    return StatusCode.Success
def _max(self, c, idx):
    """Return the configured maximum value for variable idx of c.

    :param list c: list of variables, or string variable
    :param int idx: index of the variable in c for which to return the maximum
    :return: maximum value
    """
    fallback = self._default_max
    return root_helper.get_variable_value(self.var_max_value, c, idx, fallback)