def gen_jet_pair_mass(df):
    gjmass = None
    gjets = df.GenJet
    gleptons = df.GenPart[
        (abs(df.GenPart.pdgId) == 13)
        | (abs(df.GenPart.pdgId) == 11)
        | (abs(df.GenPart.pdgId) == 15)
    ]
    gl_pair = ak.cartesian({"jet": gjets, "lepton": gleptons}, axis=1, nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    if ak.count(gjets[isolated], axis=None) > 0:
        # TODO: convert only relevant fields!
        gjet1 = ak.to_pandas(gjets[isolated]).loc[
            pd.IndexSlice[:, 0], ["pt", "eta", "phi", "mass"]
        ]
        gjet2 = ak.to_pandas(gjets[isolated]).loc[
            pd.IndexSlice[:, 1], ["pt", "eta", "phi", "mass"]
        ]
        gjet1.index = gjet1.index.droplevel("subentry")
        gjet2.index = gjet2.index.droplevel("subentry")
        gjsum = p4_sum(gjet1, gjet2)
        gjmass = gjsum.mass
    return gjmass
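# p4_sum is used above but not defined in this snippet. A minimal sketch of
# what such a helper might look like, assuming it takes two pandas DataFrames
# with "pt", "eta", "phi", "mass" columns and returns a DataFrame with the
# same columns for the summed four-vector (the name, signature, and behaviour
# are assumptions, not the original implementation):
import numpy as np
import pandas as pd

def p4_sum(obj1, obj2):
    # Convert (pt, eta, phi, mass) to Cartesian components, add them,
    # and convert the sum back to (pt, eta, phi, mass).
    result = pd.DataFrame(index=obj1.index)
    px, py, pz, e = 0.0, 0.0, 0.0, 0.0
    for obj in (obj1, obj2):
        px_ = obj.pt * np.cos(obj.phi)
        py_ = obj.pt * np.sin(obj.phi)
        pz_ = obj.pt * np.sinh(obj.eta)
        e_ = np.sqrt(px_**2 + py_**2 + pz_**2 + obj.mass**2)
        px, py, pz, e = px + px_, py + py_, pz + pz_, e + e_
    result["pt"] = np.sqrt(px**2 + py**2)
    result["phi"] = np.arctan2(py, px)
    result["eta"] = np.arcsinh(pz / result.pt)
    result["mass"] = np.sqrt(np.maximum(e**2 - px**2 - py**2 - pz**2, 0.0))
    return result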
def read_data(paths, ds_predictions, pn_predictions):
    dfs = []
    for path in paths:
        valid_jets = read_nanoaod(path)
        jet_pt = ak.to_pandas(valid_jets.pt)
        gen_jet_pt = ak.to_pandas(valid_jets.matched_gen.pt)
        gen_jet_eta = ak.to_pandas(valid_jets.matched_gen.eta)
        parton_flavour = ak.to_pandas(valid_jets.matched_gen.partonFlavour)
        hadron_flavour = ak.to_pandas(valid_jets.matched_gen.hadronFlavour)
        df = pd.concat(
            (jet_pt, gen_jet_pt, gen_jet_eta, parton_flavour, hadron_flavour),
            axis=1)
        df.columns = [
            'Jet_pt', 'GenJet_pt', 'GenJet_eta', 'GenJet_partonFlavour',
            'GenJet_hadronFlavour'
        ]
        flavour = df.GenJet_hadronFlavour.where(
            df.GenJet_hadronFlavour != 0,
            other=np.abs(df.GenJet_partonFlavour))
        df = df.drop(columns=['GenJet_partonFlavour', 'GenJet_hadronFlavour'])
        df['flavour'] = flavour
        dfs.append(df)
    df = pd.concat(dfs, axis=0)
    df['response'] = df.Jet_pt / df.GenJet_pt
    df['ds_response'] = ds_predictions.flatten() * df.Jet_pt / df.GenJet_pt
    df['pn_response'] = pn_predictions.flatten() * df.Jet_pt / df.GenJet_pt
    return df
def fill_gen_jets(df, output):
    gjets = df.GenJet
    gleptons = df.GenPart[
        (abs(df.GenPart.pdgId) == 13)
        | (abs(df.GenPart.pdgId) == 11)
        | (abs(df.GenPart.pdgId) == 15)
    ]
    gl_pair = ak.cartesian({"jet": gjets, "lepton": gleptons}, axis=1, nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    gjet1 = ak.to_pandas(gjets[isolated]).loc[
        pd.IndexSlice[:, 0], ["pt", "eta", "phi", "mass"]
    ]
    gjet2 = ak.to_pandas(gjets[isolated]).loc[
        pd.IndexSlice[:, 1], ["pt", "eta", "phi", "mass"]
    ]
    gjet1.index = gjet1.index.droplevel("subentry")
    gjet2.index = gjet2.index.droplevel("subentry")
    gjsum = p4_sum(gjet1, gjet2)

    for var in ["pt", "eta", "phi", "mass"]:
        output[f"gjet1_{var}"] = gjet1[var]
        output[f"gjet2_{var}"] = gjet2[var]
        output[f"gjj_{var}"] = gjsum[var]
    output["gjj_dEta"], output["gjj_dPhi"], output["gjj_dR"] = delta_r(
        output.gjet1_eta, output.gjet2_eta, output.gjet1_phi, output.gjet2_phi)
    return output
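# delta_r is used throughout these snippets but not defined here. A minimal
# sketch of the signature implied by the call sites, assuming it returns
# (dEta, dPhi, dR) with the azimuthal difference wrapped into [-pi, pi]
# (this is an assumption about the helper, not the original code):
import numpy as np

def delta_r(eta1, eta2, phi1, phi2):
    deta = abs(eta1 - eta2)
    # Wrap the phi difference into [-pi, pi) before taking the magnitude.
    dphi = abs(np.mod(phi1 - phi2 + np.pi, 2 * np.pi) - np.pi)
    dr = np.sqrt(deta**2 + dphi**2)
    return deta, dphi, dr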
def test():
    simple = ak.Array([0.0, 1.1, 2.2, 3.3, 4.4, 5.5])
    assert ak.to_pandas(simple)["values"].values.tolist() == [
        0.0, 1.1, 2.2, 3.3, 4.4, 5.5,
    ]

    index = ak.layout.Index64(np.array([3, 3, 1, 5], dtype=np.int64))
    indexed = ak.Array(ak.layout.IndexedArray64(index, simple.layout))
    assert indexed.tolist() == [3.3, 3.3, 1.1, 5.5]
    assert ak.to_pandas(indexed)["values"].values.tolist() == [3.3, 3.3, 1.1, 5.5]

    tuples = ak.Array(ak.layout.RecordArray([simple.layout, simple.layout]))
    assert ak.to_pandas(tuples)["1"].values.tolist() == [0.0, 1.1, 2.2, 3.3, 4.4, 5.5]

    offsets = ak.layout.Index64(np.array([0, 1, 1, 3, 4], dtype=np.int64))
    nested = ak.Array(ak.layout.ListOffsetArray64(offsets, indexed.layout))
    assert ak.to_pandas(nested)["values"].values.tolist() == [3.3, 3.3, 1.1, 5.5]

    offsets2 = ak.layout.Index64(np.array([0, 3, 3, 4, 6], dtype=np.int64))
    nested2 = ak.Array(ak.layout.ListOffsetArray64(offsets2, tuples.layout))
    assert ak.to_pandas(nested2)["1"].values.tolist() == [0.0, 1.1, 2.2, 3.3, 4.4, 5.5]

    recrec = ak.Array([{"x": {"y": 1}}, {"x": {"y": 2}}, {"x": {"y": 3}}])
    assert ak.to_pandas(recrec)["x", "y"].values.tolist() == [1, 2, 3]

    recrec2 = ak.Array(
        [
            {"x": {"a": 1, "b": 2}, "y": {"c": 3, "d": 4}},
            {"x": {"a": 10, "b": 20}, "y": {"c": 30, "d": 40}},
        ]
    )
    assert ak.to_pandas(recrec2)["y", "c"].values.tolist() == [3, 30]

    recrec3 = ak.Array(
        [{"x": 1, "y": {"c": 3, "d": 4}}, {"x": 10, "y": {"c": 30, "d": 40}}]
    )
    assert ak.to_pandas(recrec3)["y", "c"].values.tolist() == [3, 30]

    tuptup = ak.Array([(1.0, (1.1, 1.2)), (2.0, (2.1, 2.2)), (3.0, (3.1, 3.2))])
    assert ak.to_pandas(tuptup)["1", "0"].values.tolist() == [1.1, 2.1, 3.1]

    recrec4 = ak.Array(
        [[{"x": 1, "y": {"c": 3, "d": 4}}], [{"x": 10, "y": {"c": 30, "d": 40}}]]
    )
    assert ak.to_pandas(recrec4)["y", "c"].values.tolist() == [3, 30]
def test_broken():
    ex = ak.Array([[1, 2, 3], [], [4, 5]])
    p4 = ak.zip({"x": ex})
    p4c = ak.cartesian({"a": p4, "b": p4})
    df = ak.to_pandas(p4c)
    assert df["a", "x"].values.tolist() == [1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5]
    assert df["b", "x"].values.tolist() == [1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5]
def get_df_subentry2(root_file_name):
    """Returns a dataframe that contains only subentry 2 data.

    This subentry seems to contain all the relevant information.
    """
    df = pd.DataFrame()
    with rt.open(f'{root_file_name}:Hits') as tree:
        df = ak.to_pandas(tree.arrays())
    return df.xs(2, level='subentry')
def get_dat_glob(self):
    dat_glob = ak.to_pandas(
        self.tree.arrays([
            'ebeam', 'emeas', 'lumoff', 'lumofferr', 'runnum', 'finalstate_id'
        ]))
    badruns = np.loadtxt('pylib/badruns.dat')
    dat_glob['badrun'] = dat_glob.runnum.isin(badruns)
    return dat_glob
def fill_softjets(df, output, variables, cutoff):
    saj_df = ak.to_pandas(df.SoftActivityJet)
    saj_df["mass"] = 0.0
    nj_name = f"SoftActivityJetNjets{cutoff}"
    ht_name = f"SoftActivityJetHT{cutoff}"
    res = ak.to_pandas(df[[nj_name, ht_name]])

    res["to_correct"] = output.two_muons | (variables.njets > 0)

    _, _, dR_m1 = delta_r(saj_df.eta, output.mu1_eta, saj_df.phi, output.mu1_phi)
    _, _, dR_m2 = delta_r(saj_df.eta, output.mu2_eta, saj_df.phi, output.mu2_phi)
    _, _, dR_j1 = delta_r(saj_df.eta, variables.jet1_eta, saj_df.phi, variables.jet1_phi)
    _, _, dR_j2 = delta_r(saj_df.eta, variables.jet2_eta, saj_df.phi, variables.jet2_phi)
    saj_df["dR_m1"] = dR_m1 < 0.4
    saj_df["dR_m2"] = dR_m2 < 0.4
    saj_df["dR_j1"] = dR_j1 < 0.4
    saj_df["dR_j2"] = dR_j2 < 0.4
    dr_cols = ["dR_m1", "dR_m2", "dR_j1", "dR_j2"]
    saj_df[dr_cols] = saj_df[dr_cols].fillna(False)
    saj_df["to_remove"] = saj_df[dr_cols].sum(axis=1).astype(bool)

    saj_df_filtered = saj_df[(~saj_df.to_remove) & (saj_df.pt > cutoff)]
    footprint = saj_df[(saj_df.to_remove) & (saj_df.pt > cutoff)]

    res["njets_corrected"] = (
        saj_df_filtered.reset_index().groupby("entry")["subentry"].nunique())
    res["njets_corrected"] = res["njets_corrected"].fillna(0).astype(int)
    res["footprint"] = footprint.pt.groupby(level=[0]).sum()
    res["footprint"] = res["footprint"].fillna(0.0)
    res["ht_corrected"] = res[ht_name] - res.footprint
    res.loc[res.ht_corrected < 0, "ht_corrected"] = 0.0

    res.loc[res.to_correct, nj_name] = res.loc[res.to_correct, "njets_corrected"]
    res.loc[res.to_correct, ht_name] = res.loc[res.to_correct, "ht_corrected"]

    variables[f"nsoftjets{cutoff}"] = res[f"SoftActivityJetNjets{cutoff}"]
    variables[f"htsoft{cutoff}"] = res[f"SoftActivityJetHT{cutoff}"]
def find_cluster(interactions, cluster_size_space, cluster_size_time):
    """
    Function which finds clusters within an event.

    Args:
        interactions (awkward.Array): Subentries of the event; must contain
            the fields x, y, z, ed, t.
        cluster_size_space (float): Max spatial distance between two points
            to be inside a cluster [cm].
        cluster_size_time (float): Max time distance between two points to be
            inside a cluster [ns].

    Returns:
        awkward.Array: interactions with an added cluster_ids record.
    """
    # TODO: is there a better way to get the df?
    df = []
    for key in ['x', 'y', 'z', 'ed', 't']:
        df.append(ak.to_pandas(interactions[key], anonymous=key))
    df = pd.concat(df, axis=1)

    if df.empty:
        # TPC interaction is empty
        return interactions

    # Split into individual events and apply time clustering:
    groups = df.groupby('entry')
    df["time_cluster"] = np.concatenate(
        groups.apply(
            lambda x: simple_1d_clustering(x.t.values, cluster_size_time)))

    # Split into individual events and time clusters, then apply space clustering:
    df['cluster_id'] = np.zeros(len(df.index), dtype=np.int64)
    for evt in df.index.get_level_values(0).unique():
        _df_evt = df.loc[evt]
        _t_clusters = _df_evt.time_cluster.unique()
        add_to_cluster = 0
        for _t in _t_clusters:
            _cl = _find_cluster(_df_evt[_df_evt.time_cluster == _t],
                                cluster_size_space=cluster_size_space)
            df.loc[(df.time_cluster == _t)
                   & (df.index.get_level_values(0) == evt),
                   'cluster_id'] = _cl + add_to_cluster
            add_to_cluster = max(_cl) + add_to_cluster + 1

    ci = df.loc[:, 'cluster_id'].values
    offsets = ak.num(interactions['x'])
    interactions['cluster_ids'] = reshape_awkward(ci, offsets)
    return interactions
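# simple_1d_clustering is called above but not shown in this snippet. A minimal
# sketch of one way such a helper could behave: sort the times, start a new
# cluster whenever the gap to the previous point exceeds the allowed distance,
# then map the cluster ids back to the original row order. This is an
# assumption about its behaviour, not the original implementation:
import numpy as np

def simple_1d_clustering(data, scale):
    order = np.argsort(data)
    sorted_data = data[order]
    # A new cluster starts wherever the gap between neighbours exceeds `scale`.
    cluster_ids_sorted = np.zeros(len(data), dtype=np.int64)
    cluster_ids_sorted[1:] = np.cumsum(np.diff(sorted_data) > scale)
    # Undo the sort so the ids line up with the input order.
    cluster_ids = np.empty_like(cluster_ids_sorted)
    cluster_ids[order] = cluster_ids_sorted
    return cluster_ids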
def get_dat_photons(self):
    arrs = self.tree.arrays(
        ['pt', 'theta', 'phi', 'mass'],
        cut='(nt>=2)&(nks>0)&(phen>0)',
        aliases={
            'pt': 'phen*sin(phth)',
            'theta': 'phth',
            'phi': 'phphi',
            'mass': '0*phen'
        })
    vecs = vector.Array(arrs)

    df = ak.to_pandas(ak.combinations(vecs.px, 2))
    df = df.rename({'0': 'px0', '1': 'px1'}, axis=1)
    df_len = len(df)

    df = df.join(ak.to_pandas(ak.combinations(vecs.py, 2)))
    assert df_len == len(df)
    df_len = len(df)
    df = df.rename({'0': 'py0', '1': 'py1'}, axis=1)

    df = df.join(ak.to_pandas(ak.combinations(vecs.pz, 2)))
    assert df_len == len(df)
    df_len = len(df)
    df = df.rename({'0': 'pz0', '1': 'pz1'}, axis=1)

    df = df.join(ak.to_pandas(ak.combinations(vecs.E, 2)))
    assert df_len == len(df)
    df_len = len(df)
    df = df.rename({'0': 'E0', '1': 'E1'}, axis=1)

    for coord in ('x', 'y', 'z'):
        df[f'P{coord}'] = df[f'p{coord}0'] + df[f'p{coord}1']
    df['P'] = np.sqrt(df['Px']**2 + df['Py']**2 + df['Pz']**2)
    df['E'] = df['E0'] + df['E1']
    M2 = df['E']**2 - df['P']**2
    df['M'] = np.where(M2 > 0, np.sqrt(np.abs(M2)), -np.sqrt(np.abs(M2)))
    return df
def get_dat_tracks(self):
    e0 = self.tree['emeas'].array()[0]
    pidedx = '5.58030e+9 / (tptot + 40.)**3 + 2.21228e+3 - 3.77103e-1 * tptot - tdedx'
    arrs = self.tree.arrays(
        ['tz', 'tptot', 'tdedx', 'tcharge', 'trho', 'tth', 'tphi'],
        f'(nt>=2)&(nks>0)&(tnhit>6)&(abs(pidedx)<{self.cut_dedx})&(tchi2r<20)&(tchi2z<20)&(abs(tz)<{self.cut_z})&(tptot<{e0})&(tptot>40)',
        aliases={'pidedx': pidedx})
    dat_tracks = ak.to_pandas(arrs)
    dat_tracks_groups = dat_tracks.groupby('entry').agg(
        uniques=('tz', 'count'), charge=('tcharge', 'sum'))
    idx = dat_tracks_groups.query('(uniques==2)&(charge==0)').index
    return dat_tracks.loc[idx]  # .drop('tcharge', axis=1)
def fill_gen_jets(df, output):
    features = ["PT", "Eta", "Phi", "Mass"]
    gjets = df.GenJet[features]
    print(df.GenJet.fields)
    gleptons = df.MuonMedium
    # gleptons = df.GenPart[
    #     (abs(df.GenPart.pdgId) == 13)
    #     | (abs(df.GenPart.pdgId) == 11)
    #     | (abs(df.GenPart.pdgId) == 15)
    # ]
    gl_pair = ak.cartesian({"jet": gjets, "lepton": gleptons}, axis=1, nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].Eta,
        gl_pair["lepton"].Eta,
        gl_pair["jet"].Phi,
        gl_pair["lepton"].Phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    gjet1 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 0], features]
    gjet2 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 1], features]
    gjet1.index = gjet1.index.droplevel("subentry")
    gjet2.index = gjet2.index.droplevel("subentry")

    feat_map = {"pt": "PT", "eta": "Eta", "phi": "Phi", "mass": "Mass"}
    for var in ["pt", "eta", "phi", "mass"]:
        gjet1[var] = gjet1[feat_map[var]]
        gjet2[var] = gjet2[feat_map[var]]

    gjsum = p4_sum(gjet1, gjet2)
    for var in ["pt", "eta", "phi", "mass"]:
        output[f"gjet1_{var}"] = gjet1[var]
        output[f"gjet2_{var}"] = gjet2[var]
        output[f"gjj_{var}"] = gjsum[var]
    output["gjj_dEta"], output["gjj_dPhi"], output["gjj_dR"] = delta_r(
        output.gjet1_eta, output.gjet2_eta, output.gjet1_phi, output.gjet2_phi
    )
    return output
def get_dat_kaons(self):
    dlt_mass = 'abs(ksminv-497.6)'
    cuts = f'(nt>=2)&(nks>0)&(ksalign>{self.cut_align})&(dlt_mass<200)'
    dat_kaons = ak.to_pandas(
        self.tree.arrays([
            'ksptot', 'ksminv', 'ksalign', 'dlt_mass', 'ksvind', 'ksdpsi',
            'ksz0', 'kslen', 'ksth', 'ksphi'
        ], cuts, aliases={'dlt_mass': dlt_mass})).loc[:, :, :1]
    dat_kaons = dat_kaons.reset_index().drop(
        'subsubentry', axis=1).set_index(['entry', 'subentry'])
    kaons = dat_kaons.sort_values(
        by=['dlt_mass']).reset_index().drop_duplicates(
            subset=['entry'],
            keep='first').set_index(['entry', 'subentry']).index
    dat_kaons = dat_kaons.loc[kaons]
    return dat_kaons.reset_index().drop(['subentry'], axis=1).rename(
        {'ksvind': 'subentry'}, axis=1).set_index(['entry', 'subentry'])
def cellid_adj_matrix(self):
    # pytorch_geometric adj_matrix format
    # (tensor of connected edges with dim 2 x num_of_edges)
    fnlup = osp.join(self.geometry_dir, "DetIdLUT.root")  # conf["luppath"]
    rf = uproot.open(fnlup)
    arr = rf["analyzer/tree"].arrays()
    keydf = ak.to_pandas(arr[0])
    keydf = keydf.set_index("globalid")

    # load the geometry
    geoyamlpath = osp.join(self.geometry_dir, "geometry.yaml")
    fngeopic = osp.join(self.geometry_dir, "geometry.pickle")  # conf["geoyamlpath"].strip("yaml") + "pickle"
    if os.path.isfile(fngeopic):
        with open(fngeopic, "rb") as f:
            geoD = pickle.load(f)
    else:
        with open(geoyamlpath, "r") as f:
            geoD = yaml.load(f)
        with open(fngeopic, "wb") as f:
            pickle.dump(geoD, f)

    graphpath = osp.join(self.geometry_dir, "edge_index.pt")
    if os.path.isfile(graphpath):
        edge_index = torch.load(graphpath)
    else:
        # Instantiate array
        edgeA = np.empty((2, 0), dtype=int)
        for originid, row in keydf.iterrows():
            for i in range(row.nneighbors + row.ngapneighbors):
                edgeA = np.append(edgeA, [[originid], [row["n" + str(i)]]], axis=1)
        # Prune
        edgeA = edgeA[:, edgeA[0] != 0]
        edge_index = torch.tensor(edgeA, dtype=torch.long)
        torch.save(edge_index, graphpath)
    return keydf.index, edge_index
def process(self, events):
    # corrects for rare case where there isn't a single jet in the event
    events = events[ak.num(events.Jet) > 0]
    output = self.accumulator.identity()

    # we can use a very loose preselection to filter the events.
    # nothing is done with this presel, though
    presel = ak.num(events.Jet) >= 0

    ev = events[presel]
    dataset = ev.metadata['dataset']

    # load the config - probably not needed anymore
    # cfg = loadConfig()

    output['totalEvents']['all'] += len(events)
    output['skimmedEvents']['all'] += len(ev)

    ### For FCNC, we want electron -> tightTTH
    electron = Collections(ev, "Electron", "tightFCNC").get()
    fakeableelectron = Collections(ev, "Electron", "fakeableFCNC").get()

    muon = Collections(ev, "Muon", "tightFCNC").get()
    fakeablemuon = Collections(ev, "Muon", "fakeableFCNC").get()

    ## Jets
    Jets = events.Jet

    ## MET -> can switch to puppi MET
    met_pt = ev.MET.pt
    met_phi = ev.MET.phi

    lepton = fakeablemuon  # ak.concatenate([fakeablemuon, fakeableelectron], axis=1)
    mt_lep_met = mt(lepton.pt, lepton.phi, ev.MET.pt, ev.MET.phi)
    min_mt_lep_met = ak.min(mt_lep_met, axis=1)

    selection = PackedSelection()
    selection.add('MET<20', (ev.MET.pt < 20))
    selection.add('mt<20', min_mt_lep_met < 20)
    # selection.add('MET<19', (ev.MET.pt<19) )
    selection_reqs = ['MET<20', 'mt<20']  # , 'MET<19']
    fcnc_reqs_d = {sel: True for sel in selection_reqs}
    fcnc_selection = selection.require(**fcnc_reqs_d)

    # define the weight
    weight = Weights(len(ev))

    if not dataset == 'MuonEG':
        # generator weight
        weight.add("weight", ev.genWeight)

    jets = getJets(ev, maxEta=2.4, minPt=25, pt_var='pt')
    # & (ak.num(jets[~match(jets, fakeablemuon, deltaRCut=1.0)])>=1)
    single_muon_sel = (ak.num(muon) == 1) & (ak.num(fakeablemuon) == 1) | (
        ak.num(muon) == 0) & (ak.num(fakeablemuon) == 1)
    single_electron_sel = (ak.num(electron) == 1) & (
        ak.num(fakeableelectron) == 1) | (ak.num(electron) == 0) & (
            ak.num(fakeableelectron) == 1)
    fcnc_muon_sel = (ak.num(jets[~match(jets, fakeablemuon, deltaRCut=1.0)]) >= 1
                     ) & fcnc_selection & single_muon_sel
    fcnc_electron_sel = (ak.num(jets[~match(jets, fakeableelectron, deltaRCut=1.0)]) >= 1
                         ) & fcnc_selection & single_electron_sel

    tight_muon_sel = (ak.num(muon) == 1) & fcnc_muon_sel
    loose_muon_sel = (ak.num(fakeablemuon) == 1) & fcnc_muon_sel
    tight_electron_sel = (ak.num(electron) == 1) & fcnc_electron_sel
    loose_electron_sel = (ak.num(fakeableelectron) == 1) & fcnc_electron_sel

    output['single_mu_fakeable'].fill(
        dataset=dataset,
        pt=ak.to_numpy(ak.flatten(fakeablemuon[loose_muon_sel].conePt)),
        eta=np.abs(ak.to_numpy(ak.flatten(fakeablemuon[loose_muon_sel].eta))))
    output['single_mu'].fill(
        dataset=dataset,
        pt=ak.to_numpy(ak.flatten(muon[tight_muon_sel].conePt)),
        eta=np.abs(ak.to_numpy(ak.flatten(muon[tight_muon_sel].eta))))
    output['single_e_fakeable'].fill(
        dataset=dataset,
        pt=ak.to_numpy(ak.flatten(fakeableelectron[loose_electron_sel].conePt)),
        eta=np.abs(
            ak.to_numpy(ak.flatten(fakeableelectron[loose_electron_sel].eta))))
    output['single_e'].fill(
        dataset=dataset,
        pt=ak.to_numpy(ak.flatten(electron[tight_electron_sel].conePt)),
        eta=np.abs(ak.to_numpy(ak.flatten(electron[tight_electron_sel].eta))))

    if self.debug:
        # create pandas dataframe for debugging
        passed_events = ev[tight_muon_sel]
        passed_muons = muon[tight_muon_sel]
        event_p = ak.to_pandas(passed_events[["event"]])
        event_p["MET_PT"] = passed_events["MET"]["pt"]
        event_p["mt"] = min_mt_lep_met[tight_muon_sel]
        event_p["num_tight_mu"] = ak.to_numpy(ak.num(muon)[tight_muon_sel])
        event_p["num_loose_mu"] = ak.num(fakeablemuon)[tight_muon_sel]
        muon_p = ak.to_pandas(
            ak.flatten(passed_muons)[[
                "pt", "conePt", "eta", "dz", "dxy", "ptErrRel",
                "miniPFRelIso_all", "jetRelIsoV2", "jetRelIso", "jetPtRelv2"
            ]])
        # convert to numpy array for the output
        events_array = pd.concat([muon_p, event_p], axis=1)

        events_to_add = [6886009]
        for e in events_to_add:
            tmp_event = ev[ev.event == e]
            added_event = ak.to_pandas(tmp_event[["event"]])
            added_event["MET_PT"] = tmp_event["MET"]["pt"]
            added_event["mt"] = min_mt_lep_met[ev.event == e]
            added_event["num_tight_mu"] = ak.to_numpy(ak.num(muon)[ev.event == e])
            added_event["num_loose_mu"] = ak.to_numpy(
                ak.num(fakeablemuon)[ev.event == e])
            add_muon = ak.to_pandas(
                ak.flatten(muon[ev.event == e])[[
                    "pt", "conePt", "eta", "dz", "dxy", "ptErrRel",
                    "miniPFRelIso_all", "jetRelIsoV2", "jetRelIso", "jetPtRelv2"
                ]])
            add_concat = pd.concat([add_muon, added_event], axis=1)
            events_array = pd.concat([events_array, add_concat], axis=0)

        output['muons_df'] += processor.column_accumulator(events_array.to_numpy())

    return output
def test_union_to_record(): recordarray1 = ak.Array([{"x": 1, "y": 1.1}, {"x": 3, "y": 3.3}]).layout recordarray2 = ak.Array([{"y": 2.2, "z": 999}]).layout tags = ak.layout.Index8(np.array([0, 1, 0], dtype=np.int8)) index = ak.layout.Index64(np.array([0, 0, 1], dtype=np.int64)) unionarray = ak.layout.UnionArray8_64(tags, index, [recordarray1, recordarray2]) assert ak.to_list(unionarray) == [ { "x": 1, "y": 1.1 }, { "y": 2.2, "z": 999 }, { "x": 3, "y": 3.3 }, ] converted = ak._util.union_to_record(unionarray, "values") assert isinstance(converted, ak.layout.RecordArray) assert ak.to_list(converted) == [ { "x": 1, "y": 1.1, "z": None }, { "x": None, "y": 2.2, "z": 999 }, { "x": 3, "y": 3.3, "z": None }, ] otherarray = ak.Array(["one", "two"]).layout tags2 = ak.layout.Index8(np.array([0, 2, 1, 2, 0], dtype=np.int8)) index2 = ak.layout.Index64(np.array([0, 0, 0, 1, 1], dtype=np.int64)) unionarray2 = ak.layout.UnionArray8_64( tags2, index2, [recordarray1, recordarray2, otherarray]) assert ak.to_list(unionarray2) == [ { "x": 1, "y": 1.1 }, "one", { "y": 2.2, "z": 999 }, "two", { "x": 3, "y": 3.3 }, ] converted2 = ak._util.union_to_record(unionarray2, "values") assert isinstance(converted2, ak.layout.RecordArray) assert ak.to_list(converted2) == [ { "x": 1, "y": 1.1, "z": None, "values": None }, { "x": None, "y": None, "z": None, "values": "one" }, { "x": None, "y": 2.2, "z": 999, "values": None }, { "x": None, "y": None, "z": None, "values": "two" }, { "x": 3, "y": 3.3, "z": None, "values": None }, ] df_unionarray = ak.to_pandas(unionarray) np.testing.assert_array_equal(df_unionarray["x"].values, np.array([1, np.nan, 3])) np.testing.assert_array_equal(df_unionarray["y"].values, np.array([1.1, 2.2, 3.3])) np.testing.assert_array_equal(df_unionarray["z"].values, np.array([np.nan, 999, np.nan])) df_unionarray2 = ak.to_pandas(unionarray2) np.testing.assert_array_equal(df_unionarray2["x"].values, [1, np.nan, np.nan, np.nan, 3]) np.testing.assert_array_equal(df_unionarray2["y"].values, [1.1, np.nan, 2.2, np.nan, 3.3]) np.testing.assert_array_equal(df_unionarray2["z"].values, [np.nan, np.nan, 999, np.nan, np.nan]) np.testing.assert_array_equal(df_unionarray2["values"].values, ["nan", "one", "nan", "two", "nan"])
import os
import pickle

import yaml
import awkward as ak
import numpy as np
import uproot
import torch

from ..config import conf
from ..utils.logger import logger
from torch_geometric.data import Data

# load the root table
fnlup = conf["luppath"]
rf = uproot.open(fnlup)
arr = rf["analyzer/tree"].arrays()
keydf = ak.to_pandas(arr[0])
keydf = keydf.set_index("globalid")

# load the geometry
fngeopic = conf["geoyamlpath"].strip("yaml") + "pickle"
if os.path.isfile(fngeopic):
    with open(fngeopic, "rb") as f:
        geoD = pickle.load(f)
else:
    with open(conf["geoyamlpath"], "r") as f:
        geoD = yaml.load(f)
    with open(fngeopic, "wb") as f:
        pickle.dump(geoD, f)

if os.path.isfile(conf["graphpath"]):
def write_to_df(self, events, output_name):
    df = awkward.to_pandas(events)
    df.to_pickle(output_name)
    return
def process(self, df):
    # print(df.fields)
    # numevents = len(df)
    # dataset = df.metadata["dataset"]
    output = pd.DataFrame({"event": df.Event.Number})
    output.index.name = "entry"
    output["dataset"] = df.metadata["dataset"]
    regions = df.metadata["regions"]
    # channels = df.metadata['channels']
    output["lumi_wgt"] = float(df.metadata["lumi_wgt"])
    output["mc_wgt"] = ak.to_pandas(df.Event.Weight)
    # There are multiple weights per event - need to figure this out
    # output['lhe_wgt'] = ak.to_pandas(df.Weight.Weight)
    output["year"] = "snowmass"

    # Select muons
    muons = df[parameters["muon_branch"]]
    muon_filter = ((muons.pt > parameters["muon_pt_cut"])
                   & (abs(muons.eta) < parameters["muon_eta_cut"])
                   & (muons.IsolationVar < parameters["muon_iso_cut"]))
    nmuons = ak.to_pandas(ak.count(muons[muon_filter].pt, axis=1))

    mu_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Charge": "charge"}
    muon_columns = ["PT", "Eta", "Phi", "Charge", "IsolationVar"]

    # Convert one column at a time to preserve event indices in Pandas
    muon_feature_list = []
    for col in muon_columns:
        muon_feature = df[parameters["muon_branch"]][col]
        val = ak.to_pandas(muon_feature[muon_filter])
        muon_feature_list.append(val)
    muons = pd.concat(muon_feature_list, axis=1)
    muons.columns = muon_columns
    muons.rename(columns=mu_map, inplace=True)

    mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
    mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
    mu1.index = mu1.index.droplevel("subentry")
    mu2.index = mu2.index.droplevel("subentry")
    pass_leading_pt = mu1.pt > parameters["muon_leading_pt"]

    fill_muons(output, mu1, mu2)
    output.mm_charge = output.mu1_charge * output.mu2_charge

    # Select electrons
    electrons = df[parameters["electron_branch"]]
    electrons = electrons[
        (electrons.pt > parameters["electron_pt_cut"])
        & (abs(electrons.eta) < parameters["electron_eta_cut"])]
    nelectrons = ak.to_pandas(ak.count(electrons.pt, axis=1))

    # Select jets
    jets = df[parameters["jet_branch"]]
    mu_for_clean = df[parameters["muon_branch"]]
    mu_for_clean = mu_for_clean[
        (mu_for_clean.pt > parameters["muon_pt_cut"])
        & (mu_for_clean.IsolationVar < parameters["muon_iso_cut"])]
    _, jet_mu_dr = jets.nearest(mu_for_clean, return_metric=True)
    jet_filter = (
        ak.fill_none(jet_mu_dr > parameters["min_dr_mu_jet"], True)
        & (jets.pt > parameters["jet_pt_cut"])
        & (abs(jets.eta) < parameters["jet_eta_cut"]))
    njets = ak.to_pandas(ak.count(jets[jet_filter].pt, axis=1))

    jet_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Mass": "mass"}
    jet_columns = ["PT", "Eta", "Phi", "Mass"]

    jet_feature_list = []
    for col in jet_columns:
        jet_feature = df[parameters["jet_branch"]][col]
        val = ak.to_pandas(jet_feature[jet_filter])
        jet_feature_list.append(val)
    jets = pd.concat(jet_feature_list, axis=1)
    jets.columns = jet_columns
    jets.rename(columns=jet_map, inplace=True)

    jets = jets.sort_values(["entry", "pt"], ascending=[True, False])
    jets.index = pd.MultiIndex.from_arrays(
        [jets.index.get_level_values(0), jets.groupby(level=0).cumcount()],
        names=["entry", "subentry"],
    )
    jet1 = jets.loc[pd.IndexSlice[:, 0], :]
    jet2 = jets.loc[pd.IndexSlice[:, 1], :]
    jet1.index = jet1.index.droplevel("subentry")
    jet2.index = jet2.index.droplevel("subentry")

    fill_jets(output, jet1, jet2)
    fill_gen_jets(df, output)

    # Event selection: two opposite-sign muons and no electrons
    output["nmuons"] = nmuons
    output["nelectrons"] = nelectrons
    output["njets"] = njets
    output[["nmuons", "nelectrons", "njets"]] = output[[
        "nmuons", "nelectrons", "njets"
    ]].fillna(0)

    output["event_selection"] = ((output.nmuons == 2)
                                 & (output.mm_charge == -1)
                                 & (output.nelectrons == 0)
                                 & pass_leading_pt)

    mass = output.dimuon_mass
    output["region"] = None
    output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
    output.loc[((mass > 110) & (mass < 115.03)) |
               ((mass > 135.03) & (mass < 150)), "region"] = "h-sidebands"
    output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"

    output = output.loc[output.event_selection, :]
    output = output.reindex(sorted(output.columns), axis=1)
    output = output[output.region.isin(regions)]

    """
    input_evts = numevents
    output_evts = output.shape[0]
    out_yield = output.lumi_wgt.sum()
    out_vbf = output[
        (output.jj_mass>400) & (output.jj_dEta>2.5)
        & (output.jet1_pt>35) & (output.njets>=2)
    ].lumi_wgt.sum()
    out_ggh = out_yield - out_vbf
    print(f"\n{dataset}: {input_evts} -> {output_evts}; yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
    """

    to_return = None
    if self.apply_to_output is None:
        to_return = output
    else:
        self.apply_to_output(output)
        to_return = self.accumulator.identity()

    return to_return
def process(self, df): # Initialize timer if self.timer: self.timer.update() # Dataset name (see definitions in config/datasets.py) dataset = df.metadata["dataset"] is_mc = "data" not in dataset numevents = len(df) # ------------------------------------------------------------# # Apply HLT, lumimask, genweights, PU weights # and L1 prefiring weights # ------------------------------------------------------------# # All variables that we want to save # will be collected into the 'output' dataframe output = pd.DataFrame({"run": df.run, "event": df.event}) output.index.name = "entry" output["npv"] = df.PV.npvs output["met"] = df.MET.pt # Separate dataframe to keep track on weights # and their systematic variations weights = Weights(output) if is_mc: # For MC: Apply gen.weights, pileup weights, lumi weights, # L1 prefiring weights mask = np.ones(numevents, dtype=bool) genweight = df.genWeight weights.add_weight("genwgt", genweight) weights.add_weight("lumi", self.lumi_weights[dataset]) pu_wgts = pu_evaluator( self.pu_lookups, self.parameters, numevents, np.array(df.Pileup.nTrueInt), self.auto_pu, ) weights.add_weight("pu_wgt", pu_wgts, how="all") if self.parameters["do_l1prefiring_wgts"]: if "L1PreFiringWeight" in df.fields: l1pfw = l1pf_weights(df) weights.add_weight("l1prefiring_wgt", l1pfw, how="all") else: weights.add_weight("l1prefiring_wgt", how="dummy_vars") else: # For Data: apply Lumi mask lumi_info = LumiMask(self.parameters["lumimask"]) mask = lumi_info(df.run, df.luminosityBlock) # Apply HLT to both Data and MC hlt_columns = [c for c in self.parameters["hlt"] if c in df.HLT.fields] hlt = ak.to_pandas(df.HLT[hlt_columns]) if len(hlt_columns) == 0: hlt = False else: hlt = hlt[hlt_columns].sum(axis=1) if self.timer: self.timer.add_checkpoint("HLT, lumimask, PU weights") # ------------------------------------------------------------# # Update muon kinematics with Rochester correction, # FSR recovery and GeoFit correction # Raw pT and eta are stored to be used in event selection # ------------------------------------------------------------# # Save raw variables before computing any corrections df["Muon", "pt_raw"] = df.Muon.pt df["Muon", "eta_raw"] = df.Muon.eta df["Muon", "phi_raw"] = df.Muon.phi df["Muon", "pfRelIso04_all_raw"] = df.Muon.pfRelIso04_all # Rochester correction if self.do_roccor: apply_roccor(df, self.roccor_lookup, is_mc) df["Muon", "pt"] = df.Muon.pt_roch # variations will be in branches pt_roch_up and pt_roch_down # muons_pts = { # 'nominal': df.Muon.pt, # 'roch_up':df.Muon.pt_roch_up, # 'roch_down':df.Muon.pt_roch_down # } # for ... 
if True: # indent reserved for loop over muon pT variations # According to HIG-19-006, these variations have negligible # effect on significance, but it's better to have them # implemented in the future # FSR recovery if self.do_fsr: has_fsr = fsr_recovery(df) df["Muon", "pt"] = df.Muon.pt_fsr df["Muon", "eta"] = df.Muon.eta_fsr df["Muon", "phi"] = df.Muon.phi_fsr df["Muon", "pfRelIso04_all"] = df.Muon.iso_fsr # if FSR was applied, 'pt_fsr' will be corrected pt # if FSR wasn't applied, just copy 'pt' to 'pt_fsr' df["Muon", "pt_fsr"] = df.Muon.pt # GeoFit correction if self.do_geofit and ("dxybs" in df.Muon.fields): apply_geofit(df, self.year, ~has_fsr) df["Muon", "pt"] = df.Muon.pt_fsr if self.timer: self.timer.add_checkpoint("Muon corrections") # --- conversion from awkward to pandas --- # muon_columns = [ "pt", "pt_fsr", "eta", "phi", "charge", "ptErr", "mass", "pt_raw", "eta_raw", "pfRelIso04_all", ] + [self.parameters["muon_id"]] muons = ak.to_pandas(df.Muon[muon_columns]) # --------------------------------------------------------# # Select muons that pass pT, eta, isolation cuts, # muon ID and quality flags # Select events with 2 OS muons, no electrons, # passing quality cuts and at least one good PV # --------------------------------------------------------# # Apply event quality flags flags = ak.to_pandas(df.Flag[self.parameters["event_flags"]]) flags = flags[self.parameters["event_flags"]].product(axis=1) muons["pass_flags"] = True if self.parameters["muon_flags"]: muons["pass_flags"] = muons[ self.parameters["muon_flags"]].product(axis=1) # Define baseline muon selection (applied to pandas DF!) muons["selection"] = ( (muons.pt_raw > self.parameters["muon_pt_cut"]) & (abs(muons.eta_raw) < self.parameters["muon_eta_cut"]) & (muons.pfRelIso04_all < self.parameters["muon_iso_cut"]) & muons[self.parameters["muon_id"]] & muons.pass_flags) # Count muons nmuons = (muons[muons.selection].reset_index().groupby("entry") ["subentry"].nunique()) # Find opposite-sign muons mm_charge = muons.loc[muons.selection, "charge"].groupby("entry").prod() # Veto events with good quality electrons electrons = df.Electron[ (df.Electron.pt > self.parameters["electron_pt_cut"]) & (abs(df.Electron.eta) < self.parameters["electron_eta_cut"]) & (df.Electron[self.parameters["electron_id"]] == 1)] electron_veto = ak.to_numpy(ak.count(electrons.pt, axis=1) == 0) # Find events with at least one good primary vertex good_pv = ak.to_pandas(df.PV).npvsGood > 0 # Define baseline event selection output["two_muons"] = nmuons == 2 output["event_selection"] = (mask & (hlt > 0) & (flags > 0) & (nmuons == 2) & (mm_charge == -1) & electron_veto & good_pv) # --------------------------------------------------------# # Select two leading-pT muons # --------------------------------------------------------# # Find pT-leading and subleading muons # This is slow for large chunk size. 
# Consider reimplementing using sort_values().groupby().nth() # or sort_values().drop_duplicates() # or using Numba # https://stackoverflow.com/questions/50381064/select-the-max-row-per-group-pandas-performance-issue muons = muons[muons.selection & (nmuons == 2)] mu1 = muons.loc[muons.pt.groupby("entry").idxmax()] mu2 = muons.loc[muons.pt.groupby("entry").idxmin()] mu1.index = mu1.index.droplevel("subentry") mu2.index = mu2.index.droplevel("subentry") # --------------------------------------------------------# # Select events with muons passing leading pT cut # and trigger matching (trig match not done in final vrsn) # --------------------------------------------------------# # Events where there is at least one muon passing # leading muon pT cut pass_leading_pt = mu1.pt_raw > self.parameters["muon_leading_pt"] # update event selection with leading muon pT cut output["pass_leading_pt"] = pass_leading_pt output[ "event_selection"] = output.event_selection & output.pass_leading_pt # --------------------------------------------------------# # Fill dimuon and muon variables # --------------------------------------------------------# fill_muons(self, output, mu1, mu2, is_mc) if self.timer: self.timer.add_checkpoint("Event & muon selection") # ------------------------------------------------------------# # Prepare jets # ------------------------------------------------------------# prepare_jets(df, is_mc) # ------------------------------------------------------------# # Apply JEC, get JEC and JER variations # ------------------------------------------------------------# jets = df.Jet self.do_jec = False # We only need to reapply JEC for 2018 data # (unless new versions of JEC are released) if ("data" in dataset) and ("2018" in self.year): self.do_jec = True jets = apply_jec( df, jets, dataset, is_mc, self.year, self.do_jec, self.do_jecunc, self.do_jerunc, self.jec_factories, self.jec_factories_data, ) # ------------------------------------------------------------# # Calculate other event weights # ------------------------------------------------------------# if is_mc: do_nnlops = self.do_nnlops and ("ggh" in dataset) if do_nnlops: nnlopsw = nnlops_weights(df, numevents, self.parameters, dataset) weights.add_weight("nnlops", nnlopsw) else: weights.add_weight("nnlops", how="dummy") # --- --- --- --- --- --- --- --- --- --- --- --- --- --- # # do_zpt = ('dy' in dataset) # # if do_zpt: # zpt_weight = np.ones(numevents, dtype=float) # zpt_weight[two_muons] =\ # self.evaluator[self.zpt_path]( # output['dimuon_pt'][two_muons] # ).flatten() # weights.add_weight('zpt_wgt', zpt_weight) # --- --- --- --- --- --- --- --- --- --- --- --- --- --- # do_musf = True if do_musf: muID, muIso, muTrig = musf_evaluator(self.musf_lookup, self.year, numevents, mu1, mu2) weights.add_weight("muID", muID, how="all") weights.add_weight("muIso", muIso, how="all") weights.add_weight("muTrig", muTrig, how="all") else: weights.add_weight("muID", how="dummy_all") weights.add_weight("muIso", how="dummy_all") weights.add_weight("muTrig", how="dummy_all") # --- --- --- --- --- --- --- --- --- --- --- --- --- --- # do_lhe = (("LHEScaleWeight" in df.fields) and ("LHEPdfWeight" in df.fields) and ("nominal" in self.pt_variations)) if do_lhe: lhe_ren, lhe_fac = lhe_weights(df, output, dataset, self.year) weights.add_weight("LHERen", lhe_ren, how="only_vars") weights.add_weight("LHEFac", lhe_fac, how="only_vars") else: weights.add_weight("LHERen", how="dummy_vars") weights.add_weight("LHEFac", how="dummy_vars") # --- --- --- --- --- --- 
--- --- --- --- --- --- --- --- # do_thu = (("vbf" in dataset) and ("dy" not in dataset) and ("nominal" in self.pt_variations) and ("stage1_1_fine_cat_pTjet30GeV" in df.HTXS.fields)) if do_thu: for i, name in enumerate(self.sths_names): wgt_up = stxs_uncert( i, ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV), 1.0, self.stxs_acc_lookups, self.powheg_xsec_lookup, ) wgt_down = stxs_uncert( i, ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV), -1.0, self.stxs_acc_lookups, self.powheg_xsec_lookup, ) thu_wgts = {"up": wgt_up, "down": wgt_down} weights.add_weight("THU_VBF_" + name, thu_wgts, how="only_vars") else: for i, name in enumerate(self.sths_names): weights.add_weight("THU_VBF_" + name, how="dummy_vars") # --- --- --- --- --- --- --- --- --- --- --- --- --- --- # do_pdf = (self.do_pdf and ("nominal" in self.pt_variations) and ("dy" in dataset or "ewk" in dataset or "ggh" in dataset or "vbf" in dataset) and ("mg" not in dataset)) if "2016" in self.year: max_replicas = 0 if "dy" in dataset: max_replicas = 100 elif "ewk" in dataset: max_replicas = 33 else: max_replicas = 100 if do_pdf: pdf_wgts = df.LHEPdfWeight[:, 0:self. parameters["n_pdf_variations"]] for i in range(100): if (i < max_replicas) and do_pdf: output[f"pdf_mcreplica{i}"] = pdf_wgts[:, i] else: output[f"pdf_mcreplica{i}"] = np.nan else: if do_pdf: pdf_wgts = df.LHEPdfWeight[:, 0:self. parameters["n_pdf_variations"]][ 0] pdf_wgts = np.array(pdf_wgts) pdf_vars = { "up": (1 + 2 * pdf_wgts.std()), "down": (1 - 2 * pdf_wgts.std()), } weights.add_weight("pdf_2rms", pdf_vars, how="only_vars") else: weights.add_weight("pdf_2rms", how="dummy_vars") # --- --- --- --- --- --- --- --- --- --- --- --- --- --- # if is_mc: output = fill_gen_jets(df, output) # ------------------------------------------------------------# # Loop over JEC variations and fill jet variables # ------------------------------------------------------------# output.columns = pd.MultiIndex.from_product( [output.columns, [""]], names=["Variable", "Variation"]) if self.timer: self.timer.add_checkpoint("Jet preparation & event weights") for v_name in self.pt_variations: output_updated = self.jet_loop( v_name, is_mc, df, dataset, mask, muons, mu1, mu2, jets, weights, numevents, output, ) if output_updated is not None: output = output_updated if self.timer: self.timer.add_checkpoint("Jet loop") # ------------------------------------------------------------# # Fill outputs # ------------------------------------------------------------# mass = output.dimuon_mass output["region"] = None output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak" output.loc[((mass > 110) & (mass < 115.03)) | ((mass > 135.03) & (mass < 150)), "region", ] = "h-sidebands" output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak" output["dataset"] = dataset output["year"] = int(self.year) for wgt in weights.df.columns: skip_saving = (("nominal" not in wgt) and ("up" not in wgt) and ("down" not in wgt)) if skip_saving: continue output[f"wgt_{wgt}"] = weights.get_weight(wgt) columns_to_save = [ c for c in output.columns if (c[0] in self.vars_to_save) or ("wgt_" in c[0]) or ( "mcreplica" in c[0]) or (c[0] in ["region", "dataset", "year"]) or ("gjet" in c[0]) or ("gjj" in c[0]) ] output = output.loc[output.event_selection, columns_to_save] output = output.reindex(sorted(output.columns), axis=1) output.columns = [ " ".join(col).strip() for col in output.columns.values ] output = output[output.region.isin(self.regions)] """ input_evts = numevents output_evts = output.shape[0] out_yield = 
output.wgt_nominal.sum() out_vbf = output[ (output["jj_mass nominal"]>400) & (output["jj_dEta nominal"]>2.5) & (output["jet1_pt nominal"]>35) ].wgt_nominal.sum() out_ggh = out_yield - out_vbf print(f"\n{dataset}: {input_evts} -> {output_evts}; yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}") """ to_return = None if self.apply_to_output is None: to_return = output else: self.apply_to_output(output) to_return = self.accumulator.identity() if self.timer: self.timer.add_checkpoint("Saving outputs") self.timer.summary() return to_return
def l1pf_weights(df):
    l1pfw = ak.to_pandas(df.L1PreFiringWeight)
    ret = {"nom": l1pfw.Nom, "up": l1pfw.Up, "down": l1pfw.Dn}
    return ret
def test(): def key(n): if n in ("values", "x", "y"): return n else: return tuple(eval(n.replace("nan", "None").replace("null", "None"))) def regularize(data): if isinstance(data, dict): return dict((key(n), regularize(x)) for n, x in data.items()) else: return data array = ak.Array([[0.0, 1.1, 2.2], [], [3.3, 4.4], [5.5], [6.6, None, 8.8, 9.9]]) assert regularize(json.loads(ak.to_pandas(array).to_json())) == { "values": { (0, 0): 0.0, (0, 1): 1.1, (0, 2): 2.2, (2, 0): 3.3, (2, 1): 4.4, (3, 0): 5.5, (4, 0): 6.6, (4, 1): None, (4, 2): 8.8, (4, 3): 9.9, } } array = ak.Array( [[[0.0, 1.1, 2.2], [], [3.3, 4.4]], [[5.5]], [[6.6, None, 8.8, 9.9]]] ) assert regularize(json.loads(ak.to_pandas(array).to_json())) == { "values": { (0, 0, 0): 0.0, (0, 0, 1): 1.1, (0, 0, 2): 2.2, (0, 2, 0): 3.3, (0, 2, 1): 4.4, (1, 0, 0): 5.5, (2, 0, 0): 6.6, (2, 0, 1): None, (2, 0, 2): 8.8, (2, 0, 3): 9.9, } } array = ak.Array( [ [[0.0, 1.1, 2.2], [], [3.3, 4.4]], [], [[5.5]], None, [[], None, [6.6, None, 8.8, 9.9]], ] ) assert regularize(json.loads(ak.to_pandas(array).to_json())) == { "values": { (0, 0, 0): 0.0, (0, 0, 1): 1.1, (0, 0, 2): 2.2, (0, 2, 0): 3.3, (0, 2, 1): 4.4, (2, 0, 0): 5.5, (4, 2, 0): 6.6, (4, 2, 1): None, (4, 2, 2): 8.8, (4, 2, 3): 9.9, } } array = ak.Array( [ [ [{"x": 0.0, "y": []}, {"x": 1.1, "y": [1]}, {"x": 2.2, "y": [2, 2]}], [], [{"x": 3.3, "y": [3, 3, 3]}, {"x": 4.4, "y": [4, 4, 4, 4]}], ], [], [[{"x": 5.5, "y": [5, 5, 5, 5, 5]}]], ] ) assert regularize(json.loads(ak.to_pandas(array).to_json())) == { "x": { (0, 0, 1, 0): 1.1, (0, 0, 2, 0): 2.2, (0, 0, 2, 1): 2.2, (0, 2, 0, 0): 3.3, (0, 2, 0, 1): 3.3, (0, 2, 0, 2): 3.3, (0, 2, 1, 0): 4.4, (0, 2, 1, 1): 4.4, (0, 2, 1, 2): 4.4, (0, 2, 1, 3): 4.4, (2, 0, 0, 0): 5.5, (2, 0, 0, 1): 5.5, (2, 0, 0, 2): 5.5, (2, 0, 0, 3): 5.5, (2, 0, 0, 4): 5.5, }, "y": { (0, 0, 1, 0): 1, (0, 0, 2, 0): 2, (0, 0, 2, 1): 2, (0, 2, 0, 0): 3, (0, 2, 0, 1): 3, (0, 2, 0, 2): 3, (0, 2, 1, 0): 4, (0, 2, 1, 1): 4, (0, 2, 1, 2): 4, (0, 2, 1, 3): 4, (2, 0, 0, 0): 5, (2, 0, 0, 1): 5, (2, 0, 0, 2): 5, (2, 0, 0, 3): 5, (2, 0, 0, 4): 5, }, } assert regularize(json.loads(ak.to_pandas(array, how="outer").to_json())) == { "x": { (0, 0, 0, None): 0.0, (0, 0, 1, 0.0): 1.1, (0, 0, 2, 0.0): 2.2, (0, 0, 2, 1.0): 2.2, (0, 2, 0, 0.0): 3.3, (0, 2, 0, 1.0): 3.3, (0, 2, 0, 2.0): 3.3, (0, 2, 1, 0.0): 4.4, (0, 2, 1, 1.0): 4.4, (0, 2, 1, 2.0): 4.4, (0, 2, 1, 3.0): 4.4, (2, 0, 0, 0.0): 5.5, (2, 0, 0, 1.0): 5.5, (2, 0, 0, 2.0): 5.5, (2, 0, 0, 3.0): 5.5, (2, 0, 0, 4.0): 5.5, }, "y": { (0, 0, 0, None): None, (0, 0, 1, 0.0): 1.0, (0, 0, 2, 0.0): 2.0, (0, 0, 2, 1.0): 2.0, (0, 2, 0, 0.0): 3.0, (0, 2, 0, 1.0): 3.0, (0, 2, 0, 2.0): 3.0, (0, 2, 1, 0.0): 4.0, (0, 2, 1, 1.0): 4.0, (0, 2, 1, 2.0): 4.0, (0, 2, 1, 3.0): 4.0, (2, 0, 0, 0.0): 5.0, (2, 0, 0, 1.0): 5.0, (2, 0, 0, 2.0): 5.0, (2, 0, 0, 3.0): 5.0, (2, 0, 0, 4.0): 5.0, }, } array = ak.Array( [ [ [{"x": 0.0, "y": 0}, {"x": 1.1, "y": 1}, {"x": 2.2, "y": 2}], [], [{"x": 3.3, "y": 3}, {"x": 4.4, "y": 4}], ], [], [[{"x": 5.5, "y": 5}]], ] ) assert regularize(json.loads(ak.to_pandas(array).to_json())) == { "x": { (0, 0, 0): 0.0, (0, 0, 1): 1.1, (0, 0, 2): 2.2, (0, 2, 0): 3.3, (0, 2, 1): 4.4, (2, 0, 0): 5.5, }, "y": { (0, 0, 0): 0, (0, 0, 1): 1, (0, 0, 2): 2, (0, 2, 0): 3, (0, 2, 1): 4, (2, 0, 0): 5, }, } array = ak.Array( [ [ [ {"x": 0.0, "y": {"z": 0}}, {"x": 1.1, "y": {"z": 1}}, {"x": 2.2, "y": {"z": 2}}, ], [], [{"x": 3.3, "y": {"z": 3}}, {"x": 4.4, "y": {"z": 4}}], ], [], [[{"x": 5.5, "y": {"z": 5}}]], ] ) assert 
regularize(json.loads(ak.to_pandas(array).to_json())) == { ("x", ""): { (0, 0, 0): 0.0, (0, 0, 1): 1.1, (0, 0, 2): 2.2, (0, 2, 0): 3.3, (0, 2, 1): 4.4, (2, 0, 0): 5.5, }, ("y", "z"): { (0, 0, 0): 0, (0, 0, 1): 1, (0, 0, 2): 2, (0, 2, 0): 3, (0, 2, 1): 4, (2, 0, 0): 5, }, } one = ak.Array([[1.1, 2.2, 3.3], [], [4.4, 5.5]]) two = ak.Array([[100, 200], [300], [400, 500]]) assert [ regularize(json.loads(x.to_json())) for x in ak.to_pandas({"x": one, "y": two}, how=None) ] == [ {"x": {(0, 0): 1.1, (0, 1): 2.2, (0, 2): 3.3, (2, 0): 4.4, (2, 1): 5.5}}, {"y": {(0, 0): 100, (0, 1): 200, (1, 0): 300, (2, 0): 400, (2, 1): 500}}, ]
print(wzg_dat)
columns = wzg_dat.fields
print(columns)
print(len(wzg_dat))

y = np.ones(len(wzg_dat)) * 1
# xsec = np.ones(len(wzg_dat)) *
# gen =
data = {'y': y}
df = pd.DataFrame(data)
for column in columns:
    print(column, len(wzg_dat[column]))
    df[column] = ak.to_pandas(wzg_dat[column])
display(df)

corr = df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, cmap=cmap, vmax=.3, center=0,
def get_phasespace_df(timestamp, layer):
    root_file = f"../results/tracker_{timestamp}_{layer}.root:PhaseSpace"
    df = pd.DataFrame()
    with rt.open(root_file) as tree:
        df = ak.to_pandas(tree.arrays())
    return df
def jet_loop( self, variation, is_mc, df, dataset, mask, muons, mu1, mu2, jets, weights, numevents, output, ): # weights = copy.deepcopy(weights) if not is_mc and variation != "nominal": return variables = pd.DataFrame(index=output.index) jet_columns = [ "pt", "eta", "phi", "jetId", "qgl", "puId", "mass", "btagDeepB" ] if "puId17" in df.Jet.fields: jet_columns += ["puId17"] if is_mc: jet_columns += ["partonFlavour", "hadronFlavour"] if variation == "nominal": if self.do_jec: jet_columns += ["pt_jec", "mass_jec"] if is_mc and self.do_jerunc: jet_columns += ["pt_orig", "mass_orig"] # Find jets that have selected muons within dR<0.4 from them matched_mu_pt = jets.matched_muons.pt_fsr matched_mu_iso = jets.matched_muons.pfRelIso04_all matched_mu_id = jets.matched_muons[self.parameters["muon_id"]] matched_mu_pass = ((matched_mu_pt > self.parameters["muon_pt_cut"]) & (matched_mu_iso < self.parameters["muon_iso_cut"]) & matched_mu_id) clean = ~(ak.to_pandas(matched_mu_pass).astype(float).fillna( 0.0).groupby(level=[0, 1]).sum().astype(bool)) # if self.timer: # self.timer.add_checkpoint("Clean jets from matched muons") # Select particular JEC variation if "_up" in variation: unc_name = "JES_" + variation.replace("_up", "") if unc_name not in jets.fields: return jets = jets[unc_name]["up"][jet_columns] elif "_down" in variation: unc_name = "JES_" + variation.replace("_down", "") if unc_name not in jets.fields: return jets = jets[unc_name]["down"][jet_columns] else: jets = jets[jet_columns] # --- conversion from awkward to pandas --- # jets = ak.to_pandas(jets) if jets.index.nlevels == 3: # sometimes there are duplicates? jets = jets.loc[pd.IndexSlice[:, :, 0], :] jets.index = jets.index.droplevel("subsubentry") if variation == "nominal": # Update pt and mass if JEC was applied if self.do_jec: jets["pt"] = jets["pt_jec"] jets["mass"] = jets["mass_jec"] # We use JER corrections only for systematics, so we shouldn't # update the kinematics. Use original values, # unless JEC were applied. 
if is_mc and self.do_jerunc and not self.do_jec: jets["pt"] = jets["pt_orig"] jets["mass"] = jets["mass_orig"] # ------------------------------------------------------------# # Apply jetID and PUID # ------------------------------------------------------------# pass_jet_id = jet_id(jets, self.parameters, self.year) pass_jet_puid = jet_puid(jets, self.parameters, self.year) # Jet PUID scale factors # if is_mc and False: # disable for now # puid_weight = puid_weights( # self.evaluator, self.year, jets, pt_name, # jet_puid_opt, jet_puid, numevents # ) # weights.add_weight('puid_wgt', puid_weight) # ------------------------------------------------------------# # Select jets # ------------------------------------------------------------# jets["clean"] = clean jet_selection = (pass_jet_id & pass_jet_puid & (jets.qgl > -2) & jets.clean & (jets.pt > self.parameters["jet_pt_cut"]) & (abs(jets.eta) < self.parameters["jet_eta_cut"])) jets = jets[jet_selection] # if self.timer: # self.timer.add_checkpoint("Selected jets") # ------------------------------------------------------------# # Fill jet-related variables # ------------------------------------------------------------# njets = jets.reset_index().groupby("entry")["subentry"].nunique() variables["njets"] = njets # one_jet = (njets > 0) two_jets = njets > 1 # Sort jets by pT and reset their numbering in an event jets = jets.sort_values(["entry", "pt"], ascending=[True, False]) jets.index = pd.MultiIndex.from_arrays( [jets.index.get_level_values(0), jets.groupby(level=0).cumcount()], names=["entry", "subentry"], ) # Select two jets with highest pT try: jet1 = jets.loc[pd.IndexSlice[:, 0], :] jet2 = jets.loc[pd.IndexSlice[:, 1], :] jet1.index = jet1.index.droplevel("subentry") jet2.index = jet2.index.droplevel("subentry") except Exception: return fill_jets(output, variables, jet1, jet2) # if self.timer: # self.timer.add_checkpoint("Filled jet variables") # ------------------------------------------------------------# # Fill soft activity jet variables # ------------------------------------------------------------# # Effect of changes in jet acceptance should be negligible, # no need to calcluate this for each jet pT variation if variation == "nominal": fill_softjets(df, output, variables, 2) fill_softjets(df, output, variables, 5) # if self.timer: # self.timer.add_checkpoint("Calculated SA variables") # ------------------------------------------------------------# # Apply remaining cuts # ------------------------------------------------------------# # Cut has to be defined here because we will use it in # b-tag weights calculation vbf_cut = (variables.jj_mass > 400) & (variables.jj_dEta > 2.5) & (jet1.pt > 35) # ------------------------------------------------------------# # Calculate QGL weights, btag SF and apply btag veto # ------------------------------------------------------------# if is_mc and variation == "nominal": # --- QGL weights --- # isHerwig = "herwig" in dataset qgl_wgts = qgl_weights(jet1, jet2, isHerwig, output, variables, njets) weights.add_weight("qgl_wgt", qgl_wgts, how="all") # --- Btag weights --- # bjet_sel_mask = output.event_selection & two_jets & vbf_cut btag_wgt, btag_syst = btag_weights(self, self.btag_lookup, self.btag_systs, jets, weights, bjet_sel_mask) weights.add_weight("btag_wgt", btag_wgt) # --- Btag weights variations --- # for name, bs in btag_syst.items(): weights.add_weight(f"btag_wgt_{name}", bs, how="only_vars") # if self.timer: # self.timer.add_checkpoint( # "Applied QGL and B-tag weights" # ) # Separate 
from ttH and VH phase space variables["nBtagLoose"] = ( jets[(jets.btagDeepB > self.parameters["btag_loose_wp"]) & (abs(jets.eta) < 2.5)].reset_index().groupby( "entry")["subentry"].nunique()) variables["nBtagMedium"] = ( jets[(jets.btagDeepB > self.parameters["btag_medium_wp"]) & (abs(jets.eta) < 2.5)].reset_index().groupby( "entry")["subentry"].nunique()) variables.nBtagLoose = variables.nBtagLoose.fillna(0.0) variables.nBtagMedium = variables.nBtagMedium.fillna(0.0) variables.selection = (output.event_selection & (variables.nBtagLoose < 2) & (variables.nBtagMedium < 1)) # --------------------------------------------------------------# # Fill outputs # --------------------------------------------------------------# variables.update({"wgt_nominal": weights.get_weight("nominal")}) # All variables are affected by jet pT because of jet selections: # a jet may or may not be selected depending on pT variation. for key, val in variables.items(): output.loc[:, pd.IndexSlice[key, variation]] = val return output
def main(args): # Read nano, micro, EB or EE cuts nanoaod_arr = ak.from_parquet(args.nano_input_dir) print("Read nanoaod: {}".format(nanoaod_arr.type)) microaod_arr = uproot.concatenate( ["{}/*.root:diphotonDumper/trees/ggH_125_13TeV_All_$SYST".format(args.micro_input_dir)] ) print("Read microaod: {}".format(microaod_arr.type)) # Stupid typo in flashgg if "lead_ch_iso_worst__uncorr" in microaod_arr.fields: microaod_arr["lead_ch_iso_worst_uncorr"] = microaod_arr["lead_ch_iso_worst__uncorr"] if args.sd == "EB": nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) < 1.5] nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) < 1.5] microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) < 1.5] microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) < 1.5] if args.sd == "EE": nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.lead_eta) > 1.5] nanoaod_arr = nanoaod_arr[np.abs(nanoaod_arr.sublead_eta) > 1.5] microaod_arr = microaod_arr[np.abs(microaod_arr.lead_eta) > 1.5] microaod_arr = microaod_arr[np.abs(microaod_arr.sublead_eta) > 1.5] # Read catalogue of variables to be plotted with open("plots_specs.json", "r") as f: columns = json.load(f) # Create dict where keys are names of variables in nano and values are names of variables in micro nano_micro_names = {var["nano_col"]: var["micro_col"] for var in columns} nano_micro_names["event"] = "event" nano_micro_names["lumi"] = "lumi" # Event by event nano_dict = {k: nanoaod_arr[k] for k in nano_micro_names.keys()} nano_dict["lead_fixedGridRhoAll"] = nanoaod_arr["lead_fixedGridRhoAll"] # needed for XGBoost vs TMVA test_nano = ak.Array(nano_dict) test_micro = microaod_arr[nano_micro_names.values()] pd_nano = ak.to_pandas(test_nano) pd_micro = ak.to_pandas(test_micro) pd_nano = pd_nano.set_index(["event", "lumi"]) pd_micro = pd_micro.set_index(["event", "lumi"]) pd_joined = pd_nano.join(pd_micro, lsuffix="_nano", rsuffix="_micro") print("Joined dataframe:\n{}".format(pd_joined)) #Remove NaN values for nano_name, micro_name in nano_micro_names.items(): if nano_name in ["event", "lumi"]: break if nano_name == micro_name: nano_name += "_nano" micro_name += "_micro" pd_joined = pd_joined[pd_joined[nano_name].notna()] pd_joined = pd_joined[pd_joined[micro_name].notna()] # Cut over delta R # Here https://github.com/CoffeaTeam/coffea/blob/3db3fab23064c70d0ca63b185d51c7fa3b7849dc/coffea/nanoevents/methods/vector.py#L74 # useful info deltaR_threshold = 0.1 four_lead_nano = vector.obj( pt=pd_joined["lead_pt"], phi=pd_joined["lead_phi_nano"], eta=pd_joined["lead_eta_nano"], E=pd_joined["lead_energyRaw"] ) four_sublead_nano = vector.obj( pt=pd_joined["sublead_pt"], phi=pd_joined["sublead_phi_nano"], eta=pd_joined["sublead_eta_nano"], E=pd_joined["sublead_energyRaw"] ) pd_joined["deltaR_nano"] = four_lead_nano.deltaR(four_sublead_nano) four_lead_micro = vector.obj( pt=pd_joined["leadPt"], phi=pd_joined["lead_phi_micro"], eta=pd_joined["lead_eta_micro"], E=pd_joined["lead_SCRawE"] ) four_sublead_micro = vector.obj( pt=pd_joined["subleadPt"], phi=pd_joined["sublead_phi_micro"], eta=pd_joined["sublead_eta_micro"], E=pd_joined["sublead_SCRawE"] ) pd_joined["lead_deltaR"] = four_lead_nano.deltaR(four_lead_micro) pd_joined["sublead_deltaR"] = four_sublead_nano.deltaR(four_sublead_micro) pd_joined = pd_joined[pd_joined["lead_deltaR"] < deltaR_threshold] pd_joined = pd_joined[pd_joined["sublead_deltaR"] < deltaR_threshold] print("Final joined dataframe:\n{}".format(pd_joined)) # Plot print("Start plotting") for column in columns: fig, (up, middle, down) = 
plt.subplots( nrows=3, ncols=1, gridspec_kw={"height_ratios": (2, 1, 1)} ) nano_name = column["nano_col"] micro_name = column["micro_col"] if nano_name == micro_name: nano_name += "_nano" micro_name += "_micro" range = column["range"] # Up n, n_, n__ = up.hist(pd_joined[nano_name], bins=column["bins"], range=range, histtype="step", label="NanoAOD", linewidth=2) m, m_, m__ = up.hist(pd_joined[micro_name], bins=column["bins"], range=range, histtype="step", label="MicroAOD", linewidth=2) up.legend(fontsize=18, loc="upper right") up.set_xlim(range) up.set_xlabel(column["var"]) up.set_ylabel("Events") if "log" in column: up.set_yscale("log") # Middle ylim = [0, 2] middle.set_ylim(ylim) #middle.axhline(1, xmin=range[0], xmax=range[1], color="black", alpha=0.6) centers = (n_[:-1] + n_[1:]) / 2 middle.plot(centers, n / m, "k.") middle.set_xlim(range) middle.set_xlabel(column["var"]) middle.set_ylabel("$n/\mu$") middle.grid(which="both") # Down perc_range = (-300, 300) perc_bins = 500 down.hist(100 * (pd_joined[nano_name] - pd_joined[micro_name]) / pd_joined[micro_name], bins=perc_bins, range=perc_range, histtype="step", density=True, color="black", linewidth=2) #down.set_yscale("log") down.set_xlabel("$(n_{ev} - \mu_{ev})/\mu_{ev}$ [%]") down.set_ylabel("Events / {}%".format((perc_range[1] - perc_range[0]) / perc_bins)) print(column["nano_col"]) print("nano: {}".format(np.sum(n))) print("micro: {}".format(np.sum(m))) print("diff = {}".format(abs(np.sum(n) - np.sum(m)))) print("rel diff = {}%\n".format(100 * abs(np.sum(n) - np.sum(m)) / max(np.sum(n), np.sum(m)))) fig.tight_layout() fig.savefig("{}/{}_{}.png".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight') fig.savefig("{}/{}_{}.pdf".format(args.output_dir, column["nano_col"], args.sd), bbox_inches='tight') plt.close(fig) # Dump pandas dataframe to parquet file pd_joined.to_parquet("nano_micro_{}.parquet".format(args.sd), engine="fastparquet") print("Dumped dataframe to parquet file") # Redundant: dump separate dataframes for nano and micro with PhotonID inputs nano_vars = { "r9": "lead_r9_nano", "s4": "lead_s4_nano", "sieie": "lead_sieie_nano", "etaWidth": "lead_etaWidth", "phiWidth": "lead_phiWidth", "sieip": "lead_sieip_nano", "pfPhoIso03": "lead_pfPhoIso03", "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV", "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx", "mva_ID": "lead_mvaID_recomputed" } micro_vars = { "r9": "lead_r9_micro", "s4": "lead_s4_micro", "sieie": "lead_sieie_micro", "etaWidth": "lead_eta_width", "phiWidth": "lead_phi_width", "sieip": "lead_sieip_micro", "pfPhoIso03": "lead_pho_iso", "pfChargedIsoPFPV": "lead_ch_iso", "pfChargedIsoWorstVtx": "lead_ch_iso_worst", "mva_ID": "lead_mva" } nano_isos = { "pfPhoIso03": "lead_pfPhoIso03", "pfChargedIsoPFPV": "lead_pfChargedIsoPFPV", "pfChargedIsoWorstVtx": "lead_pfChargedIsoWorstVtx", "pfPhoIso03_uncorr": "lead_uncorr_pfPhoIso03", "pfChargedIsoPFPV_uncorr": "lead_uncorr_pfChargedIsoPFPV", "pfChargedIsoWorstVtx_uncorr": "lead_uncorr_pfChargedIsoWorstVtx", } micro_isos = { "pfPhoIso03": "lead_pho_iso", "pfChargedIsoPFPV": "lead_ch_iso", "pfChargedIsoWorstVtx": "lead_ch_iso_worst", "pfPhoIso03_uncorr": "lead_pho_iso_uncorr", "pfChargedIsoPFPV_uncorr": "lead_ch_iso_uncorr", "pfChargedIsoWorstVtx_uncorr": "lead_ch_iso_worst_uncorr", } nano_df = pd_joined[list(nano_vars.values())] nano_df.rename(columns=dict((v, k) for k, v in nano_vars.items()), inplace=True) nano_df.to_parquet("nano_{}.parquet".format(args.sd), engine="fastparquet") print("Dumped nano dataframe to 
parquet file") micro_df = pd_joined[list(micro_vars.values())] micro_df.rename(columns=dict((v, k) for k, v in micro_vars.items()), inplace=True) micro_df.to_parquet("micro_{}.parquet".format(args.sd), engine="fastparquet") print("Dumped micro dataframe to parquet file") nano_df = pd_joined[list(nano_isos.values())] nano_df.rename(columns=dict((v, k) for k, v in nano_isos.items()), inplace=True) nano_df.to_parquet("nano_{}_isos.parquet".format(args.sd), engine="fastparquet") print("Dumped nano dataframe for isos to parquet file") micro_df = pd_joined[list(micro_isos.values())] micro_df.rename(columns=dict((v, k) for k, v in micro_isos.items()), inplace=True) micro_df.to_parquet("micro_{}_isos.parquet".format(args.sd), engine="fastparquet") print("Dumped micro dataframe for isos to parquet file")
def setupPionData(root_file_dict,branches=[], layers=[], cluster_tree='ClusterTree', balance_data=True, n_max=-1, cut_distributions=[], cut_values=[], cut_types=[], match_distribution='', match_binning=(), match_log=False, verbose=False, load=False, save=False, filename='', return_indices=False): pdata = {} pcells = {} keys = list(root_file_dict.keys()) rng = np.random.default_rng() pdata_filename = filename + '_frame.h5' pcell_filename = filename + '_images.h5' selec_filename = filename + '_selections.h5' if(load and pathlib.Path(pdata_filename).exists() and pathlib.Path(pcell_filename).exists()): if(verbose): print('Loading pandas DataFrame and calo images from {} and {}.'.format(pdata_filename,pcell_filename)) # Load the DataFrame and images from disk. pdata = { key: pd.read_hdf(pdata_filename,key=key) for key in keys } hf = h5.File(pcell_filename,'r') for key in keys: pcells[key] = {} for layer in layers: pcells[key][layer] = hf['{}:{}'.format(key,layer)][:] hf.close() if(return_indices): # TODO: Rework this a little! hf = h5.File(selec_filename,'r') indices = {key: hf[key][:] for key in keys} hf.close() else: # root_file_dict entries might be glob-style strings, or lists of files. We should consider both possibilities. arrays = {} for key,root_files in root_file_dict.items(): if(type(root_files) == list): arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in branches) else: arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in branches) indices = ApplyCuts(arrays, cut_distributions, cut_values, cut_types, verbose) # Filter out clusters so that our data series match in their distribution of a user-supplied variable. if(match_distribution != ''): if(match_distribution in branches and len(match_binning) == 3): if(verbose): print('Matching data series on distribution: {}.'.format(match_distribution)) binning = np.linspace(match_binning[1],match_binning[2],match_binning[0]+1) n_bins = len(binning) - 1 distributions = { key: np.histogram(arrays[key][match_distribution][indices[key]].to_numpy(), bins=binning)[0] # only keep bin counts for key in keys } # Now determine how many clusters we keep in each bin, for each key. n_keep = np.zeros(n_bins,dtype=np.dtype('i8')) for i in range(n_bins): n_keep[i] = int(np.min([x[i] for x in distributions.values()])) # Now we need to throw out some clusters -- in other words, only keep some. # We will randomly choose which ones we keep, for each match_distribution bin, # for each data series (key). for key in keys: sorted_indices = indices[key][np.argsort(arrays[key][match_distribution][indices[key]])] keep_indices = [] bin_idx_edges = np.insert(np.cumsum(distributions[key]),0,0) for i in range(n_bins): index_block = sorted_indices[bin_idx_edges[i]:bin_idx_edges[i+1]] # all indices corresponding to the i'th bin of match_distribution, for this key keep_indices.append(rng.choice(index_block, n_keep[i], replace=False)) n_before = len(indices[key]) indices[key] = np.hstack(keep_indices) n_after = len(indices[key]) #if(verbose): print('\t{}, number of events: {} -> {}'.format(key, n_before, n_after)) else: print('Warning: Requested matching of distribution \"{}\" but this variable is not among the branches you selected from the data. Skipping this step.'.format(match_distribution)) # Balance data so we have equal amounts of each category. # Note that if we did the matching above, we can potentially skip this as # balancing was implicitly done. 
However, we might want to take the opportunity # to further slim down our dataset. if(balance_data): n_max_tmp = np.min([len(x) for x in indices.values()]) if(n_max > 0): n_max = np.minimum(n_max_tmp, n_max) else: n_max = n_max_tmp if(verbose): print('Balancing data: {} events per category.'.format(n_max)) indices = {key:rng.choice(val, n_max, replace=False) for key,val in indices.items()} # Make a boolean mask from the indices. This speeds things up below, as opposed to passing (unsorted) lists of indices. for key in indices.keys(): msk = np.zeros(len(arrays[key]),dtype=bool) msk[indices[key]] = True indices[key] = msk # Now, apply our selection indices to the arrays. arrays = { key:arrays[key][indices[key]] for key in keys } # Make the dataframes from the arrays. if(verbose): print('Preparing pandas DataFrame.') pdata = { key: ak.to_pandas(arrays[key][branches]) for key in keys } # Re-make the arrays with just our layer info (using our selection indices again). arrays = {} for key,root_files in root_file_dict.items(): if(type(root_files) == list): arrays[key] = ur.lazy([':'.join((x,cluster_tree)) for x in root_files], filter_branch=lambda x: x.name in layers)[indices[key]] else: arrays[key] = ur.lazy(':'.join((root_files, cluster_tree)), filter_branch=lambda x: x.name in layers)[indices[key]] # Make our calorimeter images. nentries = len(keys) * len(layers) i = 0 if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40) pcells = {} for key in keys: pcells[key] = {} for layer in layers: pcells[key][layer] = setupCells_new(arrays[key],layer) i+=1 if(verbose): qu.printProgressBarColor (i, nentries, prefix='Preparing calorimeter images.', suffix='% Complete', length=40) # Save the dataframes and calorimeter images in HDF5 format for easy access next time. if(filename != '' and save): if(verbose): print('Saving DataFrames to {}.'.format(pdata_filename)) for key,frame in pdata.items(): frame.to_hdf(pdata_filename, key=key, mode='a',complevel=6) if(verbose): print('Saving calorimeter images to {}.'.format(pcell_filename)) hf = h5.File(pcell_filename, 'w') for key in pcells.keys(): for layer in layers: dset = hf.create_dataset('{}:{}'.format(key,layer), data=pcells[key][layer], chunks=True, compression='gzip', compression_opts=7) hf.close() # One may optionally also save the selected event indices. This can be useful if referring back to the original data source. if(return_indices): # Save the indices to a file. hf = h5.File(selec_filename, 'w') for key in indices.keys(): dset = hf.create_dataset(key, data=indices[key], chunks=True, compression='gzip', compression_opts=7) hf.close() return pdata, pcells, indices # return indices return pdata, pcells # don't return indices
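# A hedged sketch of the distribution-matching step in setupPionData above:
# histogram one variable per category, keep only as many entries per bin as
# the smallest category provides, chosen at random. The category names,
# binning and toy data below are illustrative assumptions, not the real ones.
import numpy as np

rng = np.random.default_rng(0)
samples = {"pi0": rng.exponential(2.0, size=5000),
           "piplus": rng.exponential(3.0, size=5000)}
binning = np.linspace(0.0, 10.0, 21)
n_bins = len(binning) - 1

counts = {k: np.histogram(v, bins=binning)[0] for k, v in samples.items()}
n_keep = np.min(np.vstack(list(counts.values())), axis=0)  # per-bin minimum

matched = {}
for key, vals in samples.items():
    bin_idx = np.digitize(vals, binning) - 1  # bin index of every entry
    keep = []
    for i in range(n_bins):
        in_bin = np.where(bin_idx == i)[0]
        keep.append(rng.choice(in_bin, min(n_keep[i], len(in_bin)), replace=False))
    matched[key] = vals[np.hstack(keep)]
# After this loop the matched samples have (nearly) identical histograms in
# the matched variable, so the subsequent balancing step has little left to do.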
def build_dataframe( self, data_path: str, TTree_name: str, tree_dict: Dict[str, Set[str]], is_truth: bool, is_reco: bool, chunksize: int = 1024, validate_missing_events: bool = True, validate_duplicated_events: bool = True, validate_sumofweights: bool = True, ) -> pd.DataFrame: """ Builds a dataframe :param data_path: path to ROOT datafile(s) :param TTree_name: TTree in datapath to set as default tree :param tree_dict: dictionary of tree: variables to extract from Datapath :param is_truth: whether dataset contains truth data :param is_reco: whether dataset contains reco data :param chunksize: chunksize for uproot concat method :param validate_missing_events: whether to check for missing events :param validate_duplicated_events: whether to check for duplicated events :param validate_sumofweights: whether to check sum of weights against weight_mc :return: output dataframe containing columns corresponding to necessary variables """ self.logger.info(f"Building DataFrame from {data_path} ({file_utils.n_files(data_path)} file(s))...") # is the default tree a truth tree? default_tree_truth = 'truth' in TTree_name t1 = time.time() self.logger.debug(f"Extracting {tree_dict[TTree_name]} from {TTree_name} tree...") df = to_pandas(uproot.concatenate(data_path + ':' + TTree_name, tree_dict[TTree_name], num_workers=config.n_threads, begin_chunk_size=chunksize)) self.logger.debug(f"Extracted {len(df)} events.") self.logger.debug(f"Extracting ['total_EventsWeighted', 'dsid'] from 'sumWeights' tree...") sumw = to_pandas(uproot.concatenate(data_path + ':sumWeights', ['totalEventsWeighted', 'dsid'], num_workers=config.n_threads, begin_chunk_size=chunksize)) self.logger.debug(f"Calculating sum of weights and merging...") sumw = sumw.groupby('dsid').sum() df = pd.merge(df, sumw, left_on='mcChannelNumber', right_on='dsid', sort=False, copy=False) df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True) df.index.names = ['DSID', 'eventNumber'] self.logger.debug("Set DSID/eventNumber as index") # merge TTrees if validate_duplicated_events: validation = '1:1' self.logger.info(f"Validating duplicated events in tree {TTree_name}...") self.__drop_duplicates(df) self.__drop_duplicate_event_numbers(df) else: validation = 'm:m' self.logger.info("Skipping duplicted events validation") # iterate over TTrees and merge for tree in tree_dict: if tree == TTree_name: continue self.logger.debug(f"Extracting {tree_dict[tree]} from {tree} tree...") alt_df = to_pandas(uproot.concatenate(data_path + ":" + tree, tree_dict[tree], num_workers=config.n_threads, begin_chunk_size=chunksize)) self.logger.debug(f"Extracted {len(alt_df)} events.") alt_df.set_index(['mcChannelNumber', 'eventNumber'], inplace=True) alt_df.index.names = ['DSID', 'eventNumber'] self.logger.debug("Set DSID/eventNumber as index") if validate_missing_events: self.logger.info(f"Checking for missing events in tree '{tree}'..") tree_is_truth = 'truth' in tree if tree_is_truth and not default_tree_truth: if n_missing := len(df.index.difference(alt_df.index)): raise Exception( f"Found {n_missing} events in '{TTree_name}' tree not found in '{tree}' tree") else: self.logger.debug(f"All events in {TTree_name} tree found in {tree} tree") elif default_tree_truth and not tree_is_truth: if n_missing := len(alt_df.index.difference(df.index)): raise Exception( f"Found {n_missing} events in '{tree}' tree not found in '{TTree_name}' tree") else: self.logger.debug(f"All events in {tree} tree found in {TTree_name} tree") else: self.logger.info(f"Skipping missing events 
check: not a truth/reco tree combination")
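# A small illustration (toy frames, invented branch names) of the
# missing-event validation performed in build_dataframe above: both frames
# share a (DSID, eventNumber) MultiIndex, and events present in one tree but
# not the other are counted with Index.difference before the trees are merged.
import pandas as pd

reco = pd.DataFrame({"DSID": [1, 1, 2], "eventNumber": [10, 11, 20],
                     "met": [30.0, 12.5, 88.1]}).set_index(["DSID", "eventNumber"])
truth = pd.DataFrame({"DSID": [1, 2], "eventNumber": [10, 20],
                      "mc_pt": [25.0, 91.2]}).set_index(["DSID", "eventNumber"])

if n_missing := len(truth.index.difference(reco.index)):
    raise Exception(f"Found {n_missing} truth events not present in the reco tree")
# With no events missing, the frames can be joined on the shared index,
# mirroring the pd.merge / set_index pattern used above.
combined = reco.join(truth)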
# "pgf.texsystem": "lualatex", # "pgf.rcfonts": False, # "font.family": "serif", # "font.serif": [], # "font.sans-serif": [], # "font.monospace": [], # # "figure.figsize": [default_width, default_width * default_ratsio], # "pgf.preamble": "\\usepackage{mymplsetup}" # }) # plt.rcParams = plt.rcParamsDefault plt.rcParams = plt.rcParamsDefault mpl.rcParams.update({"font.size": 16}) # %% rf = uproot.open("output/DetIdLUT.root") arr = rf["analyzer/tree"].arrays() keydf = ak.to_pandas(arr[0]) keydf = keydf.set_index("globalid") keydf.head() # %% # Debug code to see the if the arrays are filled correctly index = [ "globalid", "detectorid", "subdetid", "layerid", "waferortileid.first", "waferortileid.second", "cellid.first", "cellid.second", "x", "y",
        return output

    def postprocess(self, accumulator):
        return accumulator


import uproot
import awkward as ak  # used below for ak.zip / ak.to_pandas
from coffea import processor  # used below for processor.run_uproot_job
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import pandas as pd


class HackSchema(NanoAODSchema):
    def __init__(self, base_form):
        # Drop cross-reference branches that break the default NanoAOD schema.
        base_form["contents"].pop("Muon_fsrPhotonIdx", None)
        base_form["contents"].pop("Electron_photonIdx", None)
        super().__init__(base_form)


print(args.file)
f = args.file
files = {"TTBAR": [args.file]}
result = processor.run_uproot_job(
    files,
    treename="Events",
    processor_instance=MyProcessor(),
    executor=processor.iterative_executor,
    executor_args={'schema': HackSchema},
    chunksize=10000,
)
l = args.loc
keys = result.keys()
finaldict = {}
for key in keys:
    finaldict[key] = result[key].value
df = ak.to_pandas(ak.zip(finaldict))
df.to_csv("%s/%sDatasetWithTruths.csv" % (l, f[0:-5]))
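# A self-contained sketch of the final conversion step above: several
# equal-length accumulator arrays are zipped into records and flattened to a
# pandas DataFrame before being written out. Field names and values are toy
# placeholders; the awkward v1 API (ak.zip / ak.to_pandas) is assumed, as
# elsewhere in this file.
import awkward as ak

result_value = {
    "jet_pt": ak.Array([45.2, 33.1, 71.8]),
    "jet_eta": ak.Array([0.3, -1.2, 2.1]),
}
df = ak.to_pandas(ak.zip(result_value))  # one row per event, one column per field
df.to_csv("toy_dataset.csv")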