def coffea_pyapp(dataset, fn, treename, chunksize, index, procstr, timeout=None, flatten=True):
    # imports live inside the function so the body can be shipped whole to remote workers
    import uproot
    import cloudpickle as cpkl
    import pickle as pkl
    import lz4.frame as lz4f
    from collections.abc import Sequence
    from coffea import hist, processor
    from coffea.processor.accumulator import value_accumulator

    uproot.XRootDSource.defaults["parallel"] = False

    lz4_clevel = 1

    # instrument xrootd source so we can count bytes read per chunk
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):
        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    # the processor instance arrives as an lz4-compressed cloudpickle blob
    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    afile = uproot.open(fn)

    tree = None
    if isinstance(treename, str):
        tree = afile[treename]
    elif isinstance(treename, Sequence):
        for name in reversed(treename):
            if name in afile:
                tree = afile[name]
    else:
        raise Exception('treename must be a str or Sequence but is a %s!' % repr(type(treename)))

    if tree is None:
        raise Exception('No tree found, out of possible tree names: %s' % repr(treename))

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=flatten)
    df['dataset'] = dataset
    vals = processor_instance.process(df)
    if isinstance(afile.source, uproot.source.xrootd.XRootDSource):
        vals['_bytesread'] = value_accumulator(int) + afile.source.bytesread
    valsblob = lz4f.compress(pkl.dumps(vals), compression_level=lz4_clevel)

    return valsblob, df.size, dataset
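# Usage sketch (illustrative, not part of the original source): the processor
# instance travels in as an lz4-compressed cloudpickle blob and the result comes
# back as an lz4-compressed pickle blob.  `MyProcessor` and `rootfile` are
# hypothetical placeholders for a coffea ProcessorABC subclass and an accessible
# input file.
def _coffea_pyapp_usage_sketch():
    import pickle as pkl
    import cloudpickle as cpkl
    import lz4.frame as lz4f

    procstr = lz4f.compress(cpkl.dumps(MyProcessor()))
    valsblob, nevents, dataset = coffea_pyapp('ttbar', rootfile, 'Events',
                                              chunksize=500000, index=0, procstr=procstr)
    vals = pkl.loads(lz4f.decompress(valsblob))
    return vals, nevents, dataset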
# module-level imports needed by process_file
import time

import uproot
from coffea import processor


def process_file(dataset, file, processor_instance, stats_accumulator, preload_items=None, stride=500000):
    fin = uproot.open(file)

    skim_sumw = None
    if 'otree' in fin:
        tree = fin['otree']
        if 'SumWeights' in fin:
            skim_sumw = fin['SumWeights'].values[0]
    else:
        tree = fin['Events']

    tic = time.time()

    output = processor_instance.accumulator.identity()
    # would be cool to use columns_accessed and work time to dynamically optimize this
    for index in range(tree.numentries // stride + 1):
        df = processor.LazyDataFrame(tree, stride, index, preload_items=preload_items)
        df['dataset'] = dataset
        # hacky way to only accumulate file-level information once
        if 'otree' in fin:
            df['skim_sumw'] = skim_sumw if index == 0 else None
        output += processor_instance.process(df)

    toc = time.time()

    stats = stats_accumulator.identity()
    stats['nentries'] += tree.numentries
    stats['bytesread'] += fin.source.bytesread if isinstance(fin.source, uproot.source.xrootd.XRootDSource) else 0
    stats['sumworktime'] += toc - tic
    stats['columns_accessed'] += df.materialized
    return output, stats
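# Driver sketch (illustrative, not part of the original source): loops
# process_file over a {dataset: [files]} mapping and sums both the physics
# output and the bookkeeping stats it returns.  `filesets`, `my_processor`, and
# `stats_template` are hypothetical placeholders for the file map, a coffea
# ProcessorABC instance, and a stats accumulator with the keys used above.
def _process_file_driver_sketch(filesets, my_processor, stats_template):
    final_output = my_processor.accumulator.identity()
    final_stats = stats_template.identity()
    for dataset, files in filesets.items():
        for file in files:
            output, stats = process_file(dataset, file, my_processor, stats_template)
            final_output += output
            final_stats += stats
    return final_output, final_stats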
def coffea_pyapp(dataset, fn, treename, chunksize, index, procstr, timeout=None, flatten=True):
    # imports live inside the function so the body can be shipped whole to remote workers
    import uproot
    import cloudpickle as cpkl
    import pickle as pkl
    import lz4.frame as lz4f
    from collections.abc import Sequence
    from coffea import hist, processor
    from coffea.processor.accumulator import accumulator
    from concurrent.futures import ThreadPoolExecutor, TimeoutError

    uproot.XRootDSource.defaults["parallel"] = False

    lz4_clevel = 1

    # instrument xrootd source so we can count bytes read per chunk
    if not hasattr(uproot.source.xrootd.XRootDSource, '_read_real'):
        def _read(self, chunkindex):
            self.bytesread = getattr(self, 'bytesread', 0) + self._chunkbytes
            return self._read_real(chunkindex)

        uproot.source.xrootd.XRootDSource._read_real = uproot.source.xrootd.XRootDSource._read
        uproot.source.xrootd.XRootDSource._read = _read

    processor_instance = cpkl.loads(lz4f.decompress(procstr))

    # fn may be a sequence of candidate paths; try each one, retrying up to
    # 5 times with a 5-second open timeout, and stop at the first that opens
    afile = None
    for f in fn:
        for i in range(5):
            with ThreadPoolExecutor(max_workers=1) as executor:
                future = executor.submit(uproot.open, f)
                try:
                    afile = future.result(timeout=5)
                except TimeoutError:
                    afile = None
                else:
                    break
        if afile is not None:
            break

    if afile is None:
        raise Exception('unable to open: %s' % fn)

    tree = None
    if isinstance(treename, str):
        tree = afile[treename]
    elif isinstance(treename, Sequence):
        for name in reversed(treename):
            if name in afile:
                tree = afile[name]
    else:
        raise Exception('treename must be a str or Sequence but is a %s!' % repr(type(treename)))

    if tree is None:
        raise Exception('No tree found, out of possible tree names: %s' % repr(treename))

    df = processor.LazyDataFrame(tree, chunksize, index, flatten=flatten)
    df['dataset'] = dataset
    vals = processor_instance.process(df)
    vals['_bytesread'] = accumulator(afile.source.bytesread if isinstance(afile.source, uproot.source.xrootd.XRootDSource) else 0)
    valsblob = lz4f.compress(pkl.dumps(vals), compression_level=lz4_clevel)

    istart = chunksize * index
    istop = min(tree.numentries, (index + 1) * chunksize)

    return valsblob, (istop - istart), dataset