def test_parse_sel_expr_right_sel(self):
    """Check that an expression with only an upper bound ('x < 0.2') is parsed
    into a selection equivalent to the hand-written one"""
    def expected(dfr):
        return dfr.x < 0.2

    parsed = mh.parse_sel_expr('x < 0.2')
    pdt.assert_frame_equal(apply_selections(self.dfr, parsed),
                           apply_selections(self.dfr, expected))
def test_parse_sel_expr_left_sel(self):
    """Check that an expression with only a lower bound ('0.8 < x') is parsed
    into a selection equivalent to the hand-written one"""
    def expected(dfr):
        return dfr.x > 0.8

    parsed = mh.parse_sel_expr('0.8 < x')
    pdt.assert_frame_equal(apply_selections(self.dfr, parsed),
                           apply_selections(self.dfr, expected))
def samples(toydatadir, scenario, sel=None):
    """
    Load the two toy data samples of a scenario

    Args:
        toydatadir: Base directory containing one sub-directory per sample
        scenario: Pair of sub-directory names (chi1, chi2). Each of them is
            expected to contain a 'toy_data.root' file
        sel: Selection that is passed on to apply_selections for both samples

    Returns:
        tuple: The (chi1, chi2) data after the selection
    """
    def _load_selected(subdir):
        # The file is always called toy_data.root inside the sample directory
        toy_file = '/'.join([toydatadir, subdir, 'toy_data.root'])
        return apply_selections(get_dataframe(toy_file), sel)

    chi1, chi2 = scenario
    return (_load_selected(chi1), _load_selected(chi2))
def get_ratio_mc(dfr, variable, selections, hist_sett):
    """
    Get the chic2 / chic1 ratio for the real mc

    Both states live in the same dataframe, so it is split with
    chic_state_sel first and the two parts are then handed to get_ratio
    together with the remaining arguments.
    """
    chi1_events = apply_selections(dfr, lambda d: chic_state_sel(d, 'chic1'))
    chi2_events = apply_selections(dfr, lambda d: chic_state_sel(d, 'chic2'))

    return get_ratio(chi1_events, chi2_events,
                     variable, selections, hist_sett)
def test_parse_sel_expr_double_sel(self):
    """Test that a double sided expression can be parsed

    Covers a plain variable as well as a variable wrapped in abs(). For each
    case the parsed selection has to select the same rows as the equivalent
    hand-written selection function.
    """
    cases = (
        ('-0.2 < x < 0.3',
         lambda d: (d.x > -0.2) & (d.x < 0.3)),
        ('0.1 < abs(x) < 0.2',
         lambda d: (d.x.abs() > 0.1) & (d.x.abs() < 0.2)),
    )

    for sel_expr, exp_sel in cases:
        sel = mh.parse_sel_expr(sel_expr)
        pdt.assert_frame_equal(apply_selections(self.dfr, sel),
                               apply_selections(self.dfr, exp_sel))
def get_chic2_chic1_ratio(df, var, frame, gen=False):
    """
    Get the chic2 / chic1 ratio for a given variable in a given frame

    The data are split into the two states with chic_state_sel and one
    histogram per state is obtained from get_var_histo. The chic2 histogram is
    divided in place by the chic1 histogram, gets its y-axis title set and is
    returned.
    """
    chi1_events = apply_selections(df, lambda d: chic_state_sel(d, 'chic1'))
    chi2_events = apply_selections(df, lambda d: chic_state_sel(d, 'chic2'))

    denominator = get_var_histo(chi1_events, var, frame, gen)
    ratio = get_var_histo(chi2_events, var, frame, gen)

    ratio.Divide(denominator)
    ratio.SetYTitle('#chi_{c2} / #chi_{c1}')

    return ratio
def main(args):
    """Overlay the data on the correction map and save the plots as pdf

    If the correction map is a TH2 a single overlay plot is stored, otherwise
    one plot per bin of the binning along axis 2 (selected via JpsiPt) is
    stored.
    """
    data = get_dataframe(args.datafile)
    corrmap_file = r.TFile.Open(args.corrmapfile)
    accmap = get_correction_map(corrmap_file, not args.no_pt, args.acceptance)

    cond_mkdir(args.outdir)

    # User supplied plot arguments take precedence over the defaults
    plot_args = {'drawOpt': 'colz'}
    if args.plot_arguments is not None:
        plot_args.update(parse_plot_args(args.plot_arguments.split(';;')))

    if isinstance(accmap, r.TH2):
        plot = make_overlay_plot(accmap, data, **plot_args)
        plot.SaveAs('{}/corrmap_data_overlay_2d.pdf'.format(args.outdir))
        return

    pt_binning = get_binning(accmap, 2)
    for pt_bin in zip(pt_binning[:-1], pt_binning[1:]):
        pdata = apply_selections(data, select_bin('JpsiPt', *pt_bin))
        # The map slice is picked at the center of the pt bin
        pmap = get_pt_bin(accmap, 0.5 * np.sum(pt_bin))

        plot = make_overlay_plot(pmap, pdata, **plot_args)
        plot.SaveAs('{}/corrmap_data_overlay_2d_{}_{}.pdf'.format(
            args.outdir, int(pt_bin[0]), int(pt_bin[1])))
def get_phi_hist(df, frame, selections, gen=False):
    """
    Make a 1D phi hist from the passed data

    The selections are applied first, then column 1 of the costh_phi result is
    filled into a histogram with the settings (10, 0, 90).
    """
    selected = apply_selections(df, selections)
    phi_values = costh_phi(selected, frame, gen)[:, 1]

    return create_histogram(phi_values, (10, 0, 90),
                            x_axis='|#phi^{{{}}}|'.format(frame))
def load_data(scanfile, var_x, var_y, tree='log_like_scan'):
    """Load the dataframe containing the scan results and do some cleanup

    Only var_x, var_y and 'llh' are loaded, plus 'goodFit' if the tree has
    such a branch. In the latter case only rows with goodFit > 0 are kept.
    """
    columns = [var_x, var_y, 'llh']

    # Peek into the file to find out whether the results stem from a fit scan
    scan_file = r.TFile.Open(scanfile)
    has_good_fit = has_branch(scan_file.Get(tree), ['goodFit'])
    scan_file.Close()
    if has_good_fit:
        columns.append('goodFit')

    data = get_dataframe(scanfile, treename=tree, columns=columns)
    n_loaded = data.shape[0]
    logging.info('Loaded %d rows of data', n_loaded)

    if 'goodFit' not in data.columns:
        return data

    # Keep only the scan points that are flagged as a good fit
    data = apply_selections(data, lambda d: d.goodFit > 0)
    logging.info('Removed %d because of no good fit',
                 n_loaded - data.shape[0])

    return data
def get_contour_graph(data, conf_level, var_x, var_y,
                      tf_x='identity', tf_y='identity',
                      bound_x=None, bound_y=None):
    """Get the contour graph of all scan points inside the confidence level

    Scan points are kept if 2 * (llh - llh_min) is below the chi2 quantile for
    conf_level with two degrees of freedom. tf_x and tf_y are the names of
    module-level functions that are applied to the x and y values of the
    surviving points. bound_x and bound_y optionally restrict var_x and var_y
    via select_bin. The points are passed to contour_as_tgraph.
    """
    module_scope = globals()
    trans_f_x = module_scope[tf_x]
    trans_f_y = module_scope[tf_y]

    llh_min = data.llh.min()

    select_funcs = [
        lambda d: 2 * (d.llh - llh_min) < chi2.ppf(conf_level, 2),
        # The minimum itself is dropped: in case it lies outside the scanned
        # range it would "distort" the contour
        lambda d: d.llh != llh_min,
    ]
    for var, bounds in ((var_x, bound_x), (var_y, bound_y)):
        if bounds is not None:
            select_funcs.append(select_bin(var, *bounds))

    sel_data = apply_selections(data, select_funcs)

    points = np.array(sel_data.loc[:, [var_x, var_y]])
    points[:, 0] = trans_f_x(points[:, 0])
    points[:, 1] = trans_f_y(points[:, 1])

    return contour_as_tgraph(points)
def get_costh_binning(dfr, n_bins, full_range=False, selection=None):
    """
    Get a binning in abs(costh_HX) as determined by get_equi_pop_bins

    Args:
        dfr (pandas.DataFrame): DataFrame containing all the data that should
            be considered for the binning
        n_bins (int): Number of bins
        full_range (bool): If True, the upper border of the last bin is set to
            1, otherwise it is left as returned by get_equi_pop_bins
        selection (optional): Selection passed to apply_selections before the
            binning is determined, defaults to None

    Returns:
        list: list of tuples with the bin borders for all the bins. The first
            bin always starts at 0, the last one ends at 1 only if full_range
            is True
    """
    from utils.data_handling import apply_selections

    binning = get_equi_pop_bins(apply_selections(dfr, selection),
                                lambda d: np.abs(d.costh_HX), n_bins)

    # Overwrite the outermost borders, keeping the inner ones untouched
    first_upper = binning[0][1]
    binning[0] = (0, first_upper)
    if full_range:
        last_lower = binning[-1][0]
        binning[-1] = (last_lower, 1)

    return binning
def make_plot_good_fit(data, var_x, var_y, tf_x, tf_y):
    """Make a 2D plot of the scan points for which goodFit > 0

    tf_x and tf_y are the names of module-level functions that are applied to
    the var_x and var_y columns before the binning is determined and the
    histogram is filled.
    """
    # The point with the minimal llh is valid in any case, so it is left out
    scan_data = apply_selections(data, lambda d: d.llh != d.llh.min())

    trans_f_x = globals()[tf_x]
    trans_f_y = globals()[tf_y]
    x_vals = trans_f_x(scan_data.loc[:, var_x])
    y_vals = trans_f_y(scan_data.loc[:, var_y])

    x_binning = get_variable_binning(x_vals)
    y_binning = get_variable_binning(y_vals)

    converged = scan_data.goodFit > 0
    fit_hist = hist2d(x_vals[converged], y_vals[converged],
                      x_hist_sett=(len(x_binning) - 1, x_binning),
                      y_hist_sett=(len(y_binning) - 1, y_binning))

    can = mkplot(to_bw_hist(fit_hist), drawOpt='colz',
                 xRange=[x_binning[0], x_binning[-1]],
                 yRange=[y_binning[0], y_binning[-1]],
                 xLabel=get_var_name(var_x, tf_x),
                 yLabel=get_var_name(var_y, tf_y))

    # Adjust the z-range on the object that is attached to the canvas
    can.pltables[1].GetZaxis().SetRangeUser(0, 10)
    remove_color_bar(can)

    return can
def make_plot_min_chi2(data, var_x, var_y, tf_x, tf_y, gf_only=False):
    """Make a 2D plot of delta chi2 = 2 * (llh - min(llh)) for each scan point

    NOTE: A 'delta_chi2' column is written into the passed data in place.

    The point(s) with delta_chi2 == 0 and duplicate rows are not plotted. If
    gf_only is True only rows with goodFit > 0 are used. tf_x and tf_y are the
    names of module-level functions applied to the var_x and var_y columns.
    """
    data.loc[:, 'delta_chi2'] = 2 * (data.llh - data.llh.min())
    scan_points = apply_selections(data, lambda d: d.delta_chi2 != 0)

    # Duplicates can stem from overlapping scanning when done in parallel
    scan_points = scan_points.drop_duplicates()
    if gf_only:
        scan_points = apply_selections(scan_points, lambda d: d.goodFit > 0)

    # Determine the binning from the transformed values
    trans_f_x = globals()[tf_x]
    trans_f_y = globals()[tf_y]
    x_vals = trans_f_x(scan_points.loc[:, var_x])
    y_vals = trans_f_y(scan_points.loc[:, var_y])
    x_binning = get_variable_binning(x_vals)
    y_binning = get_variable_binning(y_vals)

    # Fill the delta chi2 values into the bins the scan points fall into
    contents = np.zeros((len(x_binning) - 1, len(y_binning) - 1))
    x_bin = find_bin(x_binning, x_vals)
    y_bin = find_bin(y_binning, y_vals)
    contents[x_bin, y_bin] = scan_points.delta_chi2.values

    hist = from_array(contents, np.array([x_binning, y_binning]))

    can = _setup_canvas(None)
    can.SetRightMargin(0.12)
    mkplot(hist, can=can, drawOpt='colz',
           xRange=[x_binning[0], x_binning[-1]],
           yRange=[y_binning[0], y_binning[-1]],
           xLabel=get_var_name(var_x, tf_x),
           yLabel=get_var_name(var_y, tf_y))

    hist.GetZaxis().SetRangeUser(0, 25)
    can.Update()

    return can
def test_array_selection(self):
    """Check that a boolean array is applied like a direct DataFrame indexing"""
    keep = np.ones(self.dfr.shape[0], dtype=bool)
    # Drop the first, the last and the third row
    keep[[0, -1, 2]] = False

    pdt.assert_frame_equal(apply_selections(self.dfr, keep), self.dfr[keep])
def get_costh_hist(df, frame, selections, gen=False):
    """
    Make a 1D costh hist from the passed data

    The selections are applied first, then column 0 of the costh_phi result is
    filled into a histogram using the module-level costh_binning_data as bin
    edges.
    """
    selected = apply_selections(df, selections)
    costh_values = costh_phi(selected, frame, gen)[:, 0]

    # Variable binning taken from costh_binning_data (instead of (8, 0, 1))
    hist_sett = (len(costh_binning_data) - 1, costh_binning_data)

    return create_histogram(costh_values, hist_sett,
                            x_axis='|cos#theta^{{{}}}|'.format(frame))
def get_ratio(dchi1, dchi2, variable, selections, hist_sett, get_weights=None):
    """
    Get the chic2 / chic1 ratio

    Both samples get the same selections, variable, histogram settings and
    weights. The ratio is scaled by N(chic1) / N(chic2), where N is the
    integral of the respective histogram.
    """
    def _selected_hist(dfr):
        return create_hist(apply_selections(dfr, selections), variable,
                           hist_sett, get_weights)

    hchi1 = _selected_hist(dchi1)
    hchi2 = _selected_hist(dchi2)

    # The integrals are taken before the division
    scale = hchi1.Integral() / hchi2.Integral()

    ratio = divide(hchi2, hchi1)
    ratio.Scale(scale)

    return ratio
def test_single_func_selection(self):
    """Check that a single selection function selects the expected rows

    The expected rows are the ones at the positions where dummy_list is 'a'.
    """
    sel_a = lambda df: df.colA_a == 1
    a_positions = [
        i for i, val in enumerate(self.dummy_list) if val == 'a'
    ]

    sel_dfr = apply_selections(self.dfr, sel_a)
    pdt.assert_frame_equal(sel_dfr, self.dfr.iloc[a_positions])
def test_negate(self):
    """Check that negate=True inverts the combination of all selections

    The expected rows are the ones at the positions where NOT both
    dummy_list and dummy_list2 are 'a'.
    """
    sel_a = lambda df: df.colA_a == 1
    sel_b = lambda df: df.colB_a == 1
    nab_positions = [
        i for i, (val, val2) in enumerate(zip(self.dummy_list,
                                              self.dummy_list2))
        if not (val == 'a' and val2 == 'a')
    ]

    sel_dfr = apply_selections(self.dfr, (sel_a, sel_b), negate=True)
    pdt.assert_frame_equal(sel_dfr, self.dfr.iloc[nab_positions])
def main():
    """Make the photon pt distribution plot and store it as pdf in OUTDIR"""
    set_TDR_style()
    right_margin = r.gStyle.GetPadRightMargin()
    r.gStyle.SetPadRightMargin(right_margin + 0.01)

    # Only the columns that are required by the selections are loaded
    load_columns = collect_requirements(SELECTIONS)
    data = apply_selections(get_dataframe(INFILE, columns=load_columns),
                            SELECTIONS)

    can = make_photon_pt_dist_plot(data)
    can.SaveAs(os.path.join(OUTDIR, 'photon_pt_dist.pdf'))
def apply_selection_one_by_one(data, selections):
    """
    Apply the selections cumulatively, one after the other

    Returns:
        tuple: The data after all selections and the list of event numbers
            (from sf.get_n_events), where the first entry is the number before
            any selection and each following entry is the number after the
            corresponding selection
    """
    n_events = [sf.get_n_events(data)]

    for selection in selections:
        data = apply_selections(data, selection)
        n_events.append(sf.get_n_events(data))

    return data, n_events
def create_workspace(model, datafile, binvar, binning, massrange, fitfile,
                     weights=None):
    """
    Create the workspace with the data already imported and the mass and
    binning variables defined. Also writes the bin info json file (next to
    fitfile, with '.root' replaced by '_bin_sel_info.json').

    Args:
        massrange (str): Comma separated lower and upper mass bound
        weights (str, optional): Name of the weight column, if any

    Returns:
        tuple: The workspace and the costh bins
    """
    wsp = r.RooWorkspace('ws_mass_fit')
    massrange = [float(val) for val in massrange.split(',')]

    bin_var = parse_func_var(binvar)

    # Load only what is needed and restrict to the fitting range immediately
    load_columns = [model.mname, bin_var[0]]
    if weights is not None:
        load_columns.append(weights)
    mass_sel = select_bin(model.mname, *massrange)
    data = apply_selections(get_dataframe(datafile, columns=load_columns),
                            mass_sel)

    costh_bins, costh_means = get_costh_bins(binning, bin_var, data)
    create_bin_info_json(fitfile.replace('.root', '_bin_sel_info.json'),
                         costh_bins, costh_means, bin_var[0], datafile)

    # Create the variables in the workspace
    try_factory(wsp, '{}[{}, {}]'.format(model.mname, *massrange))

    # For an abs function the variable range is made symmetric around 0
    if 'abs' in bin_var[1].__name__:
        var_min = -np.max(costh_bins)
    else:
        var_min = np.min(costh_bins)
    try_factory(wsp, '{}[{}, {}]'.format(bin_var[0], var_min,
                                         np.max(costh_bins)))

    dset_vars = r.RooArgSet(get_var(wsp, model.mname),
                            get_var(wsp, bin_var[0]))
    tree = array2tree(data.to_records(index=False))

    if weights is None:
        dataset = r.RooDataSet('full_data', 'full data sample', tree,
                               dset_vars)
    else:
        try_factory(wsp, '{}[0, 1e5]'.format(weights))
        dataset = r.RooDataSet('full_data', 'full data sample', tree,
                               dset_vars, '', weights)

    ws_import(wsp, dataset)

    return wsp, costh_bins
def get_n_events(data, selections=None, weight=None):
    """
    Get the number of events in the dataframe surviving the passed selection

    Without a weight this is the number of rows after the selection. With a
    weight, the sum of what _get_var returns for the selected data and the
    weight is returned instead (the weight is either a column name or a
    function taking the dataframe as only input).
    """
    sel_data = apply_selections(data, selections)

    if weight is not None:
        return _get_var(sel_data, weight).sum()

    return sel_data.shape[0]
def test_get_bin_means_weighted(self):
    """Check the weighted bin means against an explicit calculation

    Four equal-width bins in x between 0 and 1 are used and the expected
    weighted mean of each bin is calculated by hand as sum(w * x) / sum(w).
    """
    # add weights to dataframe for easier handling. One weight per row, so
    # that the test does not depend on the exact size of the fixture
    self.dfr['w'] = np.random.uniform(0, 1, len(self.dfr))

    edges = np.linspace(0, 1, 5)
    binning = list(zip(edges[:-1], edges[1:]))

    exp_means = []
    for lo, hi in binning:
        # The lambda is evaluated immediately, so capturing the loop
        # variables is safe here
        sel_dfr = apply_selections(self.dfr,
                                   lambda d: (d.x > lo) & (d.x < hi))
        w_sum = sel_dfr.w.sum()
        wx_sum = np.sum(sel_dfr.x * sel_dfr.w)
        exp_means.append(wx_sum / w_sum)

    means = mh.get_bin_means(self.dfr, 'x', binning, weights=self.dfr.w)
    npt.assert_allclose(means, np.array(exp_means))
def make_costh_mu_pt_plot(data):
    """Make a comparison of costh distributions for different muon pt cuts

    For each pt cut a histogram of abs(costh_HX_fold) is made after applying
    the corresponding single muon selection. Each histogram is scaled by the
    inverse of the content of its first bin before plotting.
    """
    pt_cuts = [3.5, 4, 4.5, 5, 5.5]
    mu_pt_labels = ['p_{{T}}^{{#mu}} > {:.1f} GeV'.format(p) for p in pt_cuts]

    hists = [
        hist1d(apply_selections(data, single_muon_sel(flat_pt(p, 1.6))).costh_HX_fold.abs(),
               nbins=24, min=0, max=1)
        for p in pt_cuts
    ]
    # Plain loop, since Scale is only called for its side effect
    for hist in hists:
        hist.Scale(1.0 / hist.GetBinContent(1))

    can = mkplot(hists, drawOpt='PE', xRange=[0, 1], xLabel=CTH_LAB,
                 attr=default_attributes(size=1.0, width=2, open_markers=False),
                 legPos=(0.65, 0.50, 0.8, 0.92), legEntries=mu_pt_labels,
                 yLabel='normalized to {} = 0 [a.u.]'.format(CTH_LAB))

    can.pltables[0].SetNdivisions(505, 'X')
    can.attached_tobjects[0].SetTextSize(0.05)

    return can
def main():
    """Make the costh-phi plots in the HX and CS frame as well as the costh
    comparison plot for different muon pt cuts and store them in OUTDIR"""
    set_TDR_style()
    cond_mkdir(OUTDIR)

    data = apply_selections(get_dataframe(INFILE), SELECTIONS)

    # Adapted pad margins that are used for the costh-phi plots
    r.gStyle.SetPadRightMargin(0.129)
    r.gStyle.SetPadLeftMargin(r.gStyle.GetPadLeftMargin() - 0.007)

    costh_phi_plots = (
        ('HX', 'costh_phi_fold_HX_pt_12_18_all.pdf'),
        ('CS', 'costh_phi_fold_CS_pt_12_18_all.pdf'),
    )
    for frame, pdf_name in costh_phi_plots:
        can = make_costh_phi_plot(data, frame)
        can.SaveAs(os.path.join(OUTDIR, pdf_name))

    # Set the style again before the last plot
    set_TDR_style()
    can = make_costh_mu_pt_plot(data)
    can.SaveAs(os.path.join(OUTDIR,
                            'costh_HX_comp_mu_pt_jpsipt_12_18_all.pdf'))
def load_data(filen, model):
    """
    Load the raw data, keeping only the events inside the fit range of the fit
    variable and inside the ranges of all variables returned by
    model.get_load_vars (given as comma separated bounds).

    Besides the folded costh and phi in the HX frame only the columns that
    are required by these selections are loaded.

    NOTE: This still does not beat a proper pre-selection!
    """
    selections = [select_bin(model.fit_var, *model.fit_range)]
    selections.extend(
        select_bin(var, *[float(bound) for bound in bounds.split(',')])
        for var, bounds in model.get_load_vars()
    )

    load_vars = ['{costh,phi}_HX_fold'] + collect_requirements(selections)

    raw_data = get_dataframe(filen, columns=load_vars)
    return apply_selections(raw_data, selections)
def select_phys_data(dfr, plot_vars, do_selection=True):
    """Select the data to plot according to the variables that are plotted.

    For each of lth_chic1 and lth_chic2 that is NOT among the plot_vars the
    corresponding selection (LTH_CHI1_SEL, resp. LTH_CHI2_SEL) is applied.
    A variable that is plotted is not restricted. If do_selection is False
    the data are returned unchanged.
    """
    if not do_selection:
        return dfr

    # Allow passing a single variable without wrapping it into a list
    if not isinstance(plot_vars, (list, tuple)):
        plot_vars = [plot_vars]

    lth_selections = (
        ('lth_chic1', LTH_CHI1_SEL),
        ('lth_chic2', LTH_CHI2_SEL),
    )
    selections = [sel for var, sel in lth_selections if var not in plot_vars]

    # An empty list of selections is passed on as None
    return apply_selections(dfr, selections or None)
def get_corrected_ratio(data, wsp, model, sym_uncer=False, dbg_file=None):
    """
    Get the corrected ratio in all bins

    In each bin the corr_chi1 and corr_chi2 columns are weighted with the
    state fractions from get_state_fractions and summed. The ratio of the two
    sums is the corrected ratio of the bin. If dbg_file is not None, debug
    plots are made for each bin.

    Returns:
        ROOT.TGraphAsymmErrors: Graph with the x values and x errors of the
            uncorrected graph and the corrected ratio as y values
    """
    logging.info('Getting corrected ratio with {} errors'.
                 format('HESSE' if sym_uncer else 'MINOS'))

    corr_ratio = []
    # NOTE: Assuming here that the bins are ordered correctly
    for label, bounds in model.bins.iteritems():
        bin_selections = [
            select_bin(var, *bounds[ivar])
            for ivar, var in enumerate(model.bin_cut_vars)
        ]
        bin_data = apply_selections(data, bin_selections)

        chi1_prob, chi2_prob = get_state_fractions(bin_data, wsp, model, label)
        print_info('chi1', bin_data, chi1_prob)
        print_info('chi2', bin_data, chi2_prob)

        if dbg_file is not None:
            debug_plots('chi1_{}'.format(label), dbg_file, chi1_prob,
                        bin_data.corr_chi1, bin_data.chicMass)
            debug_plots('chi2_{}'.format(label), dbg_file, chi2_prob,
                        bin_data.corr_chi2, bin_data.chicMass)

        chi1_corr = np.sum(bin_data.loc[:, 'corr_chi1'] * chi1_prob)
        chi2_corr = np.sum(bin_data.loc[:, 'corr_chi2'] * chi2_prob)
        corr_ratio.append(chi2_corr / chi1_corr)

    # Assume that the relative uncertainties are unchanged for the corrected and
    # the uncorrected graph and use them to determine the uncertainties of the
    # corrected graph
    uncorr_graph = get_graph(wsp, model, 'r_chic2_chic1', sym_uncer)
    xlo, xhi, err_lo, err_hi = get_errors(uncorr_graph)
    xvals = np.array(uncorr_graph.GetX())
    yvals = np.array(uncorr_graph.GetY())
    corr_ratio = np.array(corr_ratio)

    rel_err_lo = err_lo / yvals
    rel_err_hi = err_hi / yvals

    return r.TGraphAsymmErrors(len(corr_ratio), xvals, corr_ratio, xlo, xhi,
                               rel_err_lo * corr_ratio,
                               rel_err_hi * corr_ratio)
def get_bin_means(dfr, get_var, bins, selection=None, weights=None):
    """
    Get the mean value in all bins from the data

    Args:
        dfr (pandas.DataFrame): DataFrame containing the data
        get_var: Passed to _get_var together with the selected data to obtain
            the values of which the mean is calculated in each bin
        bins (list of tuples): list of tuples containing the (exclusive) lower
            and upper bin border for each bin
        selection (optional): Selection that is valid in a call to
            apply_selections
        weights (optional): If not None, the weighted average will be returned

    Returns:
        list: list of mean values for each bin, where the mean is nan for bins
            without entries or with a vanishing sum of weights

    See also:
        apply_selections, numpy.average
    """
    from utils.data_handling import apply_selections

    var = _get_var(apply_selections(dfr, selection), get_var)

    means = []
    for low, high in bins:
        in_bin = (var > low) & (var < high)
        # NOTE(review): the weights are indexed with a mask that is obtained
        # from the selected data - presumably they only line up if no
        # selection is applied; verify against callers
        w_bin = None if weights is None else weights[in_bin]

        # np.average cannot handle empty bins or vanishing sums of weights.
        # NOTE: np.sum(None) does not compare equal to 0, so unweighted bins
        # are only rejected if they are empty
        if np.sum(w_bin) == 0 or np.sum(in_bin) == 0:
            means.append(np.nan)
            continue

        means.append(np.average(var[in_bin], weights=w_bin))

    return means
def main(args):
    """Apply the preselection to the input file and store the selected data

    The selections are applied either all at once or, if args.hist is set, one
    by one so that the number of events after each of them can be put into a
    histogram. The Q-value based mass 'mQ' is added to the selected data
    before they are stored as 'chic_tuple' in args.outfile.
    """
    global VARIABLES

    # NOTE: sf.chic_mass_sel and a costh_HX_fold.abs() < 0.625 selection are
    # not applied here
    selections = OrderedDict([
        ('trigger', sf.trigger_sel_(args.trigger)),
        ('muon', get_muon_sel(args.muon)),
        ('vtx prob', sf.vtx_prob_sel_(args.vtxprob)),
        ('photon sel', sf.photon_sel_(sf.flat_pt(0.4, 1.5))),
        ('jpsi kin sel', get_jpsi_sel(args.jpsi)),
        ('lifetime cut', get_lt_selection(args.mc, args.lifetime)),
        ('deta cut (MC only)', get_deta_sel(args.deta)),
        ('state_sel', get_state_sel(args.mc, args.state)),
        ('costh_sel', get_costh_sel(args.max_costh)),
    ])

    # Collect all columns that have to be loaded, without duplicates
    VARIABLES.extend(sf.collect_requirements(selections.values()))
    if args.deta:
        VARIABLES.append('gen_photonEta')
    if args.mc:
        VARIABLES.append('pdgId')
    VARIABLES = list(set(VARIABLES))

    data = get_dataframe(args.infile, columns=VARIABLES, where='trigger > 0')

    if args.hist:
        sel_data, n_evts = apply_selection_one_by_one(data,
                                                      selections.values())
        hist = create_store_sel_hist(n_evts, selections.keys())
    else:
        sel_data = apply_selections(data, selections.values())

    # compute the Q-value based mass
    sel_data.loc[:, 'mQ'] = (sel_data.mumugammaMass - sel_data.JpsiMass
                             + m_psiPDG)

    store_dataframe(sel_data, args.outfile, 'chic_tuple')