def coffea_pyapp(dataset, fn, treename, chunksize, index, procstr, timeout=None, flatten=True):
    # imports live inside the function so the body can be shipped whole to remote workers
    import uproot
    import cloudpickle as cpkl
    import pickle as pkl
    import lz4.frame as lz4f
    from collections.abc import Sequence
    from coffea import hist, processor
    from coffea.processor.accumulator import value_accumulator

    uproot.XRootDSource.defaults["parallel"] = False

    lz4_clevel = 1

    # instrument xrootd source so we can count bytes read per chunk
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):
        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    # the processor instance arrives as an lz4-compressed cloudpickle blob
    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    afile = uproot.open(fn)

    tree = None
    if isinstance(treename, str):
        tree = afile[treename]
    elif isinstance(treename, Sequence):
        for name in reversed(treename):
            if name in afile:
                tree = afile[name]
    else:
        raise Exception('treename must be a str or Sequence but is a %s!' % repr(type(treename)))

    if tree is None:
        raise Exception('No tree found, out of possible tree names: %s' % repr(treename))

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=flatten)
    df['dataset'] = dataset
    vals = processor_instance.process(df)
    if isinstance(afile.source, uproot.source.xrootd.XRootDSource):
        vals['_bytesread'] = value_accumulator(int) + afile.source.bytesread
    valsblob = lz4f.compress(pkl.dumps(vals), compression_level=lz4_clevel)

    return valsblob, df.size, dataset
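# Usage sketch (illustrative, not part of the original source): the processor
# instance travels in as an lz4-compressed cloudpickle blob and the result comes
# back as an lz4-compressed pickle blob.  `MyProcessor` and `rootfile` are
# hypothetical placeholders for a coffea ProcessorABC subclass and an accessible
# input file.
def _coffea_pyapp_usage_sketch():
    import pickle as pkl
    import cloudpickle as cpkl
    import lz4.frame as lz4f

    procstr = lz4f.compress(cpkl.dumps(MyProcessor()))
    valsblob, nevents, dataset = coffea_pyapp('ttbar', rootfile, 'Events',
                                              chunksize=500000, index=0, procstr=procstr)
    vals = pkl.loads(lz4f.decompress(valsblob))
    return vals, nevents, dataset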
# module-level imports needed by process_file
import time

import uproot
from coffea import processor


def process_file(dataset, file, processor_instance, stats_accumulator, preload_items=None, stride=500000):
    fin = uproot.open(file)

    skim_sumw = None
    if 'otree' in fin:
        tree = fin['otree']
        if 'SumWeights' in fin:
            skim_sumw = fin['SumWeights'].values[0]
    else:
        tree = fin['Events']

    tic = time.time()

    output = processor_instance.accumulator.identity()
    # would be cool to use columns_accessed and work time to dynamically optimize this
    for index in range(tree.numentries // stride + 1):
        df = processor.LazyDataFrame(tree, stride, index, preload_items=preload_items)
        df['dataset'] = dataset
        # hacky way to only accumulate file-level information once
        if 'otree' in fin:
            df['skim_sumw'] = skim_sumw if index == 0 else None
        output += processor_instance.process(df)

    toc = time.time()

    stats = stats_accumulator.identity()
    stats['nentries'] += tree.numentries
    stats['bytesread'] += fin.source.bytesread if isinstance(fin.source, uproot.source.xrootd.XRootDSource) else 0
    stats['sumworktime'] += toc - tic
    stats['columns_accessed'] += df.materialized
    return output, stats
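# Driver sketch (illustrative, not part of the original source): loops
# process_file over a {dataset: [files]} mapping and sums both the physics
# output and the bookkeeping stats it returns.  `filesets`, `my_processor`, and
# `stats_template` are hypothetical placeholders for the file map, a coffea
# ProcessorABC instance, and a stats accumulator with the keys used above.
def _process_file_driver_sketch(filesets, my_processor, stats_template):
    final_output = my_processor.accumulator.identity()
    final_stats = stats_template.identity()
    for dataset, files in filesets.items():
        for file in files:
            output, stats = process_file(dataset, file, my_processor, stats_template)
            final_output += output
            final_stats += stats
    return final_output, final_stats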
def coffea_pyapp(dataset, fn, treename, chunksize, index, procstr, timeout=None, flatten=True):
    # imports live inside the function so the body can be shipped whole to remote workers
    import uproot
    import cloudpickle as cpkl
    import pickle as pkl
    import lz4.frame as lz4f
    from collections.abc import Sequence
    from coffea import hist, processor
    from coffea.processor.accumulator import accumulator
    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    uproot.XRootDSource.defaults["parallel"] = False

    lz4_clevel = 1

    # instrument xrootd source so we can count bytes read per chunk
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):
        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    # fn may be a sequence of candidate paths; try each one, retrying up to
    # 5 times with a 5-second open timeout, and stop at the first that opens
    afile = None
    for f in fn:
        for i in range(5):
            with ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(uproot.open, f)
                try:
                    afile = future.result(timeout=5)
                except TimeoutError:
                    afile = None
                else:
                    break
        if afile is not None:
            break

    if afile is None:
        raise Exception('unable to open: %s' % fn)

    tree = None
    if isinstance(treename, str):
        tree = afile[treename]
    elif isinstance(treename, Sequence):
        for name in reversed(treename):
            if name in afile:
                tree = afile[name]
    else:
        raise Exception('treename must be a str or Sequence but is a %s!' % repr(type(treename)))

    if tree is None:
        raise Exception('No tree found, out of possible tree names: %s' % repr(treename))

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=flatten)
    df['dataset'] = dataset
    vals = processor_instance.process(df)
    vals['_bytesread'] = accumulator(afile.source.bytesread if isinstance(afile.source, uproot.source.xrootd.XRootDSource) else 0)
    valsblob = lz4f.compress(pkl.dumps(vals), compression_level=lz4_clevel)

    istart = chunksize * index
    istop = min(tree.numentries, (index + 1) * chunksize)

    return valsblob, (istop - istart), dataset