def evaluate(self, ast_node):
    """Evaluate ``ast_node`` by generating and running a throw-away script.

    The node is visited with ``ast_visitor`` first; the generated script then
    uses the ``rep`` attributes of the node and of its innermost ``source``
    (presumably populated by the visitor -- TODO confirm). The script loads
    the dataset, builds ``awkward.fromiter(<ast_node.rep>)`` and saves it to
    ``output.awkd``, which is loaded and returned.

    Scratch files (``temp.py``, ``output.awkd`` and, for in-memory datasets,
    ``temp.awkd``) are created in the current working directory and are
    removed again whether or not the evaluation succeeds.

    Args:
        ast_node: Node exposing ``source`` and, after the visit, ``rep``.

    Returns:
        Whatever ``awkward.load('output.awkd')`` yields.

    Raises:
        NotImplementedError: If the dataset path ends in neither ``.awkd``
            nor ``.root``.
        subprocess.CalledProcessError: If the generated script exits with a
            non-zero status.
    """
    import contextlib
    import subprocess
    import sys

    qv = ast_visitor()
    qv.visit(ast_node)

    # Follow the chain of ``source`` attributes down to the innermost node.
    source = ast_node.source
    while hasattr(source, 'source'):
        source = source.source

    # A non-string dataset is an in-memory object that has to be written to
    # disk so that the child process can read it.
    in_memory = not isinstance(self.dataset_source, str)
    data_pathname = 'temp.awkd' if in_memory else self.dataset_source

    # Build the whole script before touching the file system, so that an
    # unsupported file type leaves nothing behind. ``repr`` produces a valid
    # Python string literal even when the path contains quotes/backslashes.
    script_lines = ['import awkward\n']
    if data_pathname.endswith('.awkd'):
        script_lines.append(
            source.rep + ' = awkward.load(' + repr(data_pathname) + ')\n')
    elif data_pathname.endswith('.root'):
        script_lines.append('import uproot\n')
        script_lines.append(
            'input_file = uproot.open(' + repr(data_pathname) + ')\n')
        script_lines.append(
            source.rep
            + " = input_file[input_file.keys()[0]].lazyarrays(namedecode='utf-8')\n")
    else:
        raise NotImplementedError('unimplemented file type: ' + data_pathname)
    script_lines.append('output_array = awkward.fromiter(' + ast_node.rep + ')\n')
    script_lines.append("awkward.save('output.awkd', output_array)\n")

    scratch_files = ['temp.py', 'output.awkd']
    if in_memory:
        scratch_files.append(data_pathname)

    try:
        if in_memory:
            awkward.save(data_pathname, self.dataset_source)
        with open('temp.py', 'w') as script:
            script.writelines(script_lines)
        # Run with the current interpreter and fail loudly: otherwise a
        # crashed script could go unnoticed and a stale ``output.awkd`` from
        # an earlier run would be returned.
        subprocess.run([sys.executable, 'temp.py'], check=True)
        return awkward.load('output.awkd')
    finally:
        for leftover in scratch_files:
            with contextlib.suppress(FileNotFoundError):
                os.remove(leftover)
def _read_awkd(filepath, branches, partial_load=None):
    """Read the requested branches from a file readable by ``awkward.load``.

    Args:
        filepath: Passed unchanged to ``awkward.load``.
        branches: Sequence of keys to read from the loaded file.
        partial_load: Optional ``(start, stop)`` pair of fractions. Both are
            multiplied by the length of the first requested branch and
            truncated to integers to obtain the slice applied to every
            branch. ``None`` or ``(0, 1)`` keeps every entry.

    Returns:
        dict mapping each branch name to its (optionally sliced) array.
    """
    import awkward

    with awkward.load(filepath) as f:
        outputs = {k: f[k] for k in branches}

    # Nothing to slice: no range requested, full range requested, or no
    # branches were read (so there is no length to scale the fractions by).
    if partial_load is None or not outputs:
        return outputs
    # Compare as a tuple so that lists / arrays holding (0, 1) are recognised.
    if tuple(partial_load) == (0, 1):
        return outputs

    n_entries = len(outputs[branches[0]])
    # Slice bounds must be integers; ``int`` truncates toward zero exactly as
    # the previous ``np.trunc`` did, but yields a valid slice index.
    start, stop = (int(fraction * n_entries) for fraction in partial_load)
    return {k: v[start:stop] for k, v in outputs.items()}
def __init__(self, array, *args, **kwargs):
    """Build the dataset from a nine-element array or a path to one.

    Args:
        array: Either an indexable holding at least nine entries, or a string
            that is passed to ``ak.load`` to obtain such an object. Entry 0
            becomes ``self.y``; entries 1-4 and 5-8 are handed to
            ``FourVectorArray`` for the constituents and the jets.
        *args: Forwarded to the parent constructor after ``'whatever'``.
        **kwargs: Forwarded to the parent constructor.
    """
    if isinstance(array, str):
        array = ak.load(array)
    super(DRNDataset, self).__init__('whatever', *args, **kwargs)

    self.y = array[0]
    self.constituents = FourVectorArray(*(array[i] for i in range(1, 5)))
    self.jets = FourVectorArray(*(array[i] for i in range(5, 9)))

    # Express the constituent angles relative to the jet they belong to.
    parts, jets = self.constituents, self.jets
    parts.phi = calc_dphi(parts.phi, jets.phi)
    parts.eta = parts.eta - jets.eta
def _chunk_ranges(fname, nentries, chunksize):
    """Split ``nentries`` entries of ``fname`` into (fname, start, stop) triplets."""
    # Stepping through the start offsets yields no chunk at all for an empty
    # file and no zero-length trailing chunk when ``nentries`` is an exact
    # multiple of ``chunksize``.
    return [(fname, start, min(start + chunksize, nentries))
            for start in range(0, nentries, chunksize)]


def get_chunking(filelist, chunksize, treename="Events", workers=12, skip_bad_files=False):
    """
    Return 2-tuple of
    - chunks: triplets of (filename,entrystart,entrystop) calculated with input `chunksize` and `filelist`
    - total_nevents: total event count over `filelist`

    Every chunk is non-empty (entrystart < entrystop); files without entries
    contribute no chunks. An empty `filelist` gives `([], 0)`.

    Parameters
    - filelist: sequence of file names. If `skip_bad_files` is false and the
      first name ends with ".awkd", all files are opened with `awkward.load`
      and the length of their "run" entry is used as the entry count;
      otherwise entry counts come from `uproot.numentries`.
    - chunksize: maximum number of entries per chunk (converted with `int`)
    - treename: tree name passed to `uproot.numentries`
    - workers: upper bound on threads used when there are 5 or more files
    - skip_bad_files: query files one at a time and skip (with a printed
      message) those for which `uproot.numentries` raises IndexError or
      ValueError

    Raises ValueError if `chunksize` is not positive.
    """
    chunksize = int(chunksize)
    if chunksize <= 0:
        raise ValueError("chunksize must be a positive integer, got %d" % chunksize)
    # Checked before the heavy imports: there is nothing to open, and the
    # ".awkd" test below would otherwise fail on filelist[0].
    if len(filelist) == 0:
        return [], 0

    import uproot
    import awkward
    from tqdm.auto import tqdm
    import concurrent.futures

    chunks = []
    nevents = 0
    if skip_bad_files:
        # slightly slower (serial loop), but can skip bad files
        for fname in tqdm(filelist):
            try:
                items = uproot.numentries(fname, treename, total=False).items()
            except (IndexError, ValueError):
                print("Skipping bad file", fname)
                continue
            for fn, nentries in items:
                nevents += nentries
                chunks.extend(_chunk_ranges(fn, nentries, chunksize))
    elif filelist[0].endswith(".awkd"):
        whitelist = awkward.persist.whitelist + [['blosc', 'decompress']]
        for fname in tqdm(filelist):
            f = awkward.load(fname, whitelist=whitelist)
            nentries = len(f["run"])
            nevents += nentries
            chunks.extend(_chunk_ranges(fname, nentries, chunksize))
    else:
        # Only bother with a thread pool when there are enough files.
        executor = None
        if len(filelist) >= 5:
            executor = concurrent.futures.ThreadPoolExecutor(
                min(workers, len(filelist)))
        try:
            counts = uproot.numentries(filelist, treename, total=False,
                                       executor=executor)
        finally:
            # Release the worker threads once the counts are available.
            if executor is not None:
                executor.shutdown()
        for fn, nentries in counts.items():
            nevents += nentries
            chunks.extend(_chunk_ranges(fn, nentries, chunksize))
    return chunks, nevents
def _read_awkd(filepath, branches, load_range=None):
    """Read the requested branches from a file readable by ``awkward.load``.

    Args:
        filepath: Passed unchanged to ``awkward.load``.
        branches: Sequence of keys to read from the loaded file.
        load_range: Optional pair of fractions. Each is multiplied by the
            length of the first requested branch and truncated to give the
            slice bounds; the upper bound is forced to be at least one past
            the lower bound. ``None`` keeps every entry.

    Returns:
        dict mapping each branch name to its (optionally sliced) array.
    """
    import awkward

    with awkward.load(filepath) as f:
        outputs = {k: f[k] for k in branches}

    if load_range is None:
        return outputs

    reference = branches[0]
    first = math.trunc(load_range[0] * len(outputs[reference]))
    # Never let the window collapse to nothing: keep at least one entry.
    last = max(first + 1, math.trunc(load_range[1] * len(outputs[reference])))
    for name in list(outputs):
        outputs[name] = outputs[name][first:last]
    return outputs
def _load(self):
    """Load the label and the padded, stacked feature groups from ``self.filepath``.

    The file is opened with ``awkward.load``. ``self._label`` is set from the
    ``self.label`` entry. For every key of ``self.feature_dict`` the listed
    column(s) are padded with ``pad_array(..., self.pad_len)``, stacked along
    ``self.stack_axis`` and stored in ``self._values`` under that key.

    Every column read must have the same ``counts`` as the first column read;
    this is enforced with an ``assert``.
    """
    logging.info('Start loading file %s' % self.filepath)
    counts = None
    with awkward.load(self.filepath) as a:
        self._label = a[self.label]
        for group, columns in self.feature_dict.items():
            # A single column name is treated as a one-element list.
            if not isinstance(columns, (list, tuple)):
                columns = [columns]
            padded = []
            for name in columns:
                column = a[name]
                if counts is None:
                    # The first column seen defines the reference counts.
                    counts = column.counts
                else:
                    assert np.array_equal(counts, column.counts)
                padded.append(pad_array(column, self.pad_len))
            self._values[group] = np.stack(padded, axis=self.stack_axis)
    logging.info('Finished loading file %s' % self.filepath)
def __init__(self, array, n_constituents=200):
    """Build the dataset from a nine-element array or a path to one.

    Args:
        array: Either an indexable holding at least nine entries, or a string
            that is passed to ``ak.load`` to obtain such an object. Entry 0
            becomes ``self.y``; entries 1-4 and 5-8 are handed to
            ``FourVectorArray`` for the constituents and the jets.
        n_constituents: Stored unchanged as ``self.n_constituents``.
    """
    if isinstance(array, str):
        array = ak.load(array)
    self.n_constituents = n_constituents

    self.y = array[0]
    self.constituents = FourVectorArray(*(array[i] for i in range(1, 5)))
    self.jets = FourVectorArray(*(array[i] for i in range(5, 9)))
    parts, jets = self.constituents, self.jets

    # Express the constituent angles relative to the jet they belong to.
    parts.phi = calc_dphi(parts.phi, jets.phi)
    parts.eta = parts.eta - jets.eta

    # Derived features: logarithms of the absolute and jet-relative pt and
    # energy, plus the distance in the (eta, phi) plane computed from the
    # jet-relative angles set above.
    parts.logpt = np.log(parts.pt)
    parts.loge = np.log(parts.energy)
    parts.logpt_ptjet = np.log(parts.pt / jets.pt)
    parts.loge_ejet = np.log(parts.energy / jets.energy)
    parts.dr = np.sqrt(parts.eta**2 + parts.phi**2)
top_4 = np.argsort(pts)[-4:] num_found = len(top_4) for var_num, var_name in enumerate(kinematics): wanted = j_name + "_Total_" + var_name vals = getattr(ew, wanted)[top_4] all_kinematics[j_class][order, var_num, j, event_n, :num_found] = vals energy, px, py, pz = all_kinematics[j_class][order, 1:, j, event_n, :num_found] if num_found > 1: shape_vals = ShapeVariables.shape(energy, px, py, pz)[1] for var_num, var_name in enumerate(shapes): all_shapes[j_class][order, var_num, j, event_n] = shape_vals[var_name] content = {"shape_names": shapes, "kinematic_names": kinematics, "orders": ["nlo", "lo"], "jet_names": [spectral_names, traditional_names, iterative_names], "kinematics" : awkward.fromiter(all_kinematics), "shapes": awkward.fromiter(all_shapes)} awkward.save("../megaIgnore/IRC_shapes.awkd", content) else: data = awkward.load("../megaIgnore/IRC_shapes.awkd") shapes = data["shape_names"] kinematics = data["kinematic_names"] spectral_names, traditional_names, iterative_names = data["jet_names"] all_kinematics = data["kinematics"] all_shapes = data["shapes"] def plot_jet_name(name, variable, bounds=None, ax=None): colours = ['blue', 'purple', 'orange'] line_styles = ['--', '-', '-.'] if variable in kinematics: table = all_kinematics v_index = kinematics.index(variable) elif variable in shapes: table = all_shapes
else: import awkward major, minor, _ = awkward.version.version_info major = int(major) minor = int(minor) if major == 1: raise ImportError("Need awkward 0.12.X, you have %s" % awkward.__version__) elif minor > 14: raise ImportError("Need awkward 0.12 / 0.13 / 0.14, you have %s" % awkward.__version__) elif minor < 12: raise ImportError("Need awkward 0.12 / 0.13 / 0.14, you have %s" % awkward.__version__) tree_data1 = awkward.load(args.filename) print(len(tree_data1.columns), "hists in main file") is_hdf5_2 = False if args.compareTo: is_hdf5_2 = "hdf5" in os.path.splitext(args.compareTo)[1] if not is_hdf5_2 and "awkd" not in os.path.splitext(args.compareTo)[1]: raise IOError("--compareTo input must be .hdf5 or .awkd") if is_hdf5_2: if not is_hdf5_1: import h5py tree_data2 = h5py.File(args.compareTo) print(len(tree_data2.keys()), "hists in compareTo file") else: if is_hdf5_1:
ac[0]["pair"] = (fcomp, ("lz4.frame", "decompress")) if label.startswith("blosc"): ac[0]["pair"] = (fcomp, ("blosc", "decompress")) if label.startswith("lzma"): ac[0]["pair"] = (fcomp, ("backports.lzma", "decompress")) fname = "tables/table_{}.awkd".format(label) t0 = time.time() awkward.save(fname, table, compression=ac, mode="w") t1 = time.time() info["t_compress_ms"] = 1e3 * (t1 - t0) t0 = time.time() tmp = awkward.load(fname, whitelist=awkward.persist.whitelist + [ ['lz4.frame', 'decompress'], ['lz4.block', 'decompress'], ['blosc', 'decompress'], ['backports.lzma', 'decompress'], ]) t1 = time.time() info["t_decompress_ms"] = 1e3 * (t1 - t0) info["uncompressed_bytes"] = table.nbytes info["compressed_bytes"] = int(os.stat(fname).st_size) data.append(info) pd.DataFrame(data).to_json("jsons/data_{}.json".format(i))
# Time the construction of the tagger from the model / preprocess arguments.
start = time.time()
nn = ParticleNetJetTagsProducer(args.model, args.preprocess)
diff = time.time() - start
print('--- Setup model: %f s total' % (diff,))

# Time the prediction. The per-jet figure divides by the summed counts of
# the 'probQCDbb' output (presumably the total number of jets -- TODO confirm).
start = time.time()
outputs = nn.predict(taginfo, eval_flags)
diff = time.time() - start
print('--- Run prediction: %f s total, %f s per jet ---' % (diff, diff / outputs['probQCDbb'].counts.sum()))

# print(outputs)
# for k in outputs:
#     print(k, outputs[k].content.mean())

# If the input table already carries a stored score, print the first few
# stored and computed values and percentiles of their absolute difference.
if 'FatJet_ParticleNetMD_probXbb' in table:
    print('Compare w/ stored values')
    print('Stored values:\n ...', table['FatJet_ParticleNetMD_probXbb'][:5])
    print('Computed values:\n ...', outputs['probXbb'][:5])
    print('Diff (50%, 95%, 99%, 100%) = ', np.percentile(
        np.abs(outputs['probXbb'] - table['FatJet_ParticleNetMD_probXbb']).content, [50, 95, 99, 100]))

# assert(np.array_equal(jetmass.counts, outputs['probQCDbb'].counts))
alloutputs = awkward.JaggedArray.zip(outputs)

# Either record the zipped outputs as the new baseline, or compare them with
# an existing baseline file (nothing happens if no baseline exists).
if args.make_baseline:
    with open('baseline.awkd', 'wb') as fout:
        awkward.save(fout, alloutputs)
else:
    if os.path.exists('baseline.awkd'):
        with open('baseline.awkd', 'rb') as fin:
            baseline = awkward.load(fin)
            # NOTE(review): original nesting was lost; the comparison is kept
            # inside the `with` so the file is still open while it is read.
            print("Comparison to baseline:", (alloutputs == baseline).all().all())