Example #1
0
def prepare_data(tmpdir, nfiles=100, nentries=100, ppservers=(), silent=True):
    """Create ``nfiles`` temporary ROOT files with ``nentries`` entries each
    (in parallel) and collect the successfully created files into a Data object.
    - tmpdir    : directory for the temporary files
    - nfiles    : number of files to create
    - nentries  : number of entries per file
    - ppservers : (remote) pp-servers for the work manager
    - silent    : silent processing?
    """
    ## Use generic Task from Kisa
    from ostap.parallel.parallel import GenericTask as Task
    task = Task(processor=create_tree)

    wmgr = Parallel.WorkManager(ppservers=ppservers, silent=silent)

    from ostap.utils.cleanup import CleanUp
    ## NOTE(review): ``tmpfile`` is never used below; kept because
    ## CleanUp.tempfile may register the name for cleanup as a side effect —
    ## confirm before removing
    tmpfile = CleanUp.tempfile(prefix='test_kisa_', suffix='.root', dir=tmpdir)

    ## FIX: removed the dead local ``fname`` ('%s/test_kisa_%d.root'),
    ## it was assigned but never used

    files = [
        CleanUp.tempfile(prefix='test_kisa_', suffix='.root', dir=tmpdir)
        for i in range(nfiles)
    ]

    wmgr.process(task, [(f, nentries) for f in files])

    ## keep only the files that were actually produced
    the_files = set()
    for f in task.results():
        if os.path.exists(f):
            the_files.add(f)

    from ostap.trees.data import Data
    ## sorted() replaces the list()/sort() pair; the redundant extra list()
    ## in the return statement is dropped
    the_files = sorted(the_files)
    return Data('S', the_files)
Example #2
0
def cproject(chain,
             histo,
             what,
             cuts,
             nentries=-1,
             first=0,
             chunk_size=1000000,
             silent=False):
    """Project a (very long) chain into a histogram, in parallel over chunks.
    >>> chain = ... ## large chain
    >>> histo = ... ## histogram template
    >>> cproject        ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.ppropject ( histo , 'mass' , 'pt>0' ) ## ditto
    >>> chain.cpropject ( histo , 'mass' , 'pt>0' ) ## ditto
    For 12-core machine, clear speedup factor of about 8 is achieved
    """
    from ostap.trees.trees import Chain

    ## wrap the chain with the requested event range
    the_chain = Chain(chain, first=first, nevents=nentries)

    ## one projection task, processed chunk-by-chunk
    the_task = ProjectTask(histo, what, cuts)
    manager = Parallel.WorkManager(silent=silent)
    manager.process(the_task, the_chain.split(chunk_size=chunk_size))

    result = the_task.output
    processed = result[0]
    histo += result[1]

    return processed, histo
Example #3
0
def _pprocess_(chain,
               selector,
               nevents=-1,
               first=0,
               shortcut=True,
               chunk_size=100000,
               ppservers=(),
               max_files=10,
               silent=False):
    """ Parallel processing of loooong chain/tree 
    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )
    - nevents    : number of events to process (<0: all)
    - first      : the first event to process
    - chunk_size : chunk size for parallel processing
    - max_files  : maximal number of files per chunk
    """

    from ostap.trees.trees import Chain

    ch = Chain(chain)

    selection = selector.selection
    variables = selector.variables

    ## ``trivial'' configuration: only trivial variables and no extra cuts
    trivial = selector.trivial_vars and not selector.morecuts

    ## process the whole chain?
    ## FIX: renamed from ``all'' — it shadowed the builtin all()
    whole_range = 0 == first and (0 > nevents or len(chain) <= nevents)

    if whole_range and trivial and 1 < len(ch.files):
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(variables, selection, trivial)
    wmgr = Parallel.WorkManager(ppservers=ppservers, silent=silent)
    trees = ch.split(chunk_size=chunk_size, max_files=max_files)
    wmgr.process(task, trees)
    del trees

    dataset, stat = task.output

    selector.data = dataset
    selector.stat = stat

    ## report the processing statistics
    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat[2]
    skipped = '/' + attention(skipped) if stat[2] else ''
    logger.info(
        'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' %
        (selector.name, stat[1], stat[0], skipped, selector.cuts(), dataset))

    return 1
Example #4
0
def cproject(chain, histo, what, cuts):
    """Make a projection of the loooong chain into histogram,
    parallelised over the files of the chain.
    >>> chain = ... ## large chain
    >>> histo = ... ## histogram template 
    >>> cproject        ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.ppropject ( histo , 'mass' , 'pt>0' ) ## ditto 
    >>> chain.cpropject ( histo , 'mass' , 'pt>0' ) ## ditto     
    For 12-core machine, clear speedup factor of about 8 is achieved     
    """
    ## trivial/degenerate cases
    if not chain:
        return 0, histo
    if not histo:
        logger.error('cproject: invalid histogram')
        return 0, histo

    import ROOT
    histo.Reset()

    if not isinstance(chain, ROOT.TChain):
        logger.warning(
            'cproject method is TChain-specific, skip parallelization')
        from ostap.trees.trees import _tt_project_
        return _tt_project_(chain, histo, what, cuts)

    if isinstance(cuts, ROOT.TCut): cuts = str(cuts)
    ## a comma-separated string becomes a list of expressions
    ## FIX: the original follow-up checks (split on ';' / wrap in list)
    ## were unreachable — ``what`` is no longer a str after the first split
    if isinstance(what, str): what = what.split(',')

    import ostap.trees.trees
    files = chain.files()

    cname = chain.GetName()

    ## one job per (file, expression) pair
    params = [(f, cname, str(w), cuts) for f in files for w in what]

    task = ProjectTask(histo)
    wmgr = Parallel.WorkManager()
    wmgr.process(task, params)

    filtered = task.output[0]
    histo += task.output[1]

    return filtered, histo
Example #5
0
def fillDataSet(chain, variables, selection, ppservers=()):
    """Fill a dataset from a (very long) TChain, one parallel job per file.
    >>> chain =
    >>> vars  = ...
    >>> dset  = fillDataSet ( chain , vars , 'pt>10' )
    - for 12-core machine, clear speed-up factor of about 8 is achieved
    """
    the_task = FillTask(variables, selection)
    manager = Parallel.WorkManager(ppservers=ppservers)

    ## one (tree-name, file-name) job per file in the chain
    tree_name = chain.GetName()
    jobs = [(tree_name, fname) for fname in chain.files()]

    manager.process(the_task, jobs)

    return the_task.output
Example #6
0
def _pStatVar_(chain,
               what,
               cuts='',
               nevents=-1,
               first=0,
               chunk_size=100000,
               max_files=10,
               ppservers=(),
               silent=True):
    """ Parallel processing of loooong chain/tree 
    >>> chain    = ...
    >>> chain.pstatVar( 'mass' , 'pt>1') 
    """
    ## last event of the requested range (capped by n_large)
    last = min(n_large, first + nevents if 0 < nevents else n_large)

    ## few special/trivial cases: short ranges are processed sequentially
    sequential = False
    if 0 <= first and 0 < nevents < chunk_size:
        sequential = True
    elif isinstance(chain, ROOT.TChain):
        sequential = chain.nFiles() < 5 and len(chain) < chunk_size
    elif isinstance(chain, ROOT.TTree):
        sequential = len(chain) < chunk_size

    if sequential:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    the_chain = Chain(chain, first=first, nevents=nevents)

    the_task = StatVarTask(what, cuts)
    manager = Parallel.WorkManager(ppservers=ppservers, silent=silent)

    chunks = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(the_task, chunks)

    ## release the (possibly large) intermediate structures
    del chunks
    del the_chain

    return the_task.output
Example #7
0
def tproject(
        tree,  ## the tree 
        histo,  ## histogram 
        what,  ## variable/expression/list to be projected 
        cuts='',  ## selection/weighting criteria 
        nentries=-1,  ## number of entries 
        first=0,  ## the first entry 
        chunk_size=1000000,  ## chunk size 
        silent=False):  ## silent processing
    """Make a projection of the loooong tree into histogram
    >>> tree  = ... ## large chain
    >>> histo = ... ## histogram template 
    >>> tproject ( tree , histo , 'mass' , 'pt>10' )    
    >>> tree.pproject ( histo , 'mass' , 'pt>10' )    ## ditto 
    - significant gain can be achieved for very large TTrees with complicated expressions and cuts
    Arguments:
    - tree       the tree
    - histo      the histogram
    - what       variable/expression/varlist to be projected
    - cuts       selection/weighting criteria 
    - nentries   number of entries to process  (<0: all entries in the tree)
    - first      the first entry to process
    - chunk_size chunk size for parallel processing 
    """

    from ostap.trees.trees import Tree
    ## FIX: the original passed the undefined name ``nevents'' here
    ## (the parameter is ``nentries''), causing a NameError at runtime
    ch = Tree(tree, first=first, nevents=nentries)

    task = ProjectTask(histo, what, cuts)
    wmgr = Parallel.WorkManager(silent=silent)
    wmgr.process(task, ch.split(chunk_size=chunk_size))

    filtered = task.output[0]
    histo += task.output[1]

    return filtered, histo
Example #8
0
def tproject(
        tree,  ## the tree 
        histo,  ## histogram 
        what,  ## variable/expression/list to be projected 
        cuts='',  ## selection/weighting criteria 
        nentries=-1,  ## number of entries 
        first=0,  ## the first entry 
        maxentries=1000000):  ## chunk size
    """Make a projection of the loooong tree into histogram
    >>> tree  = ... ## large chain
    >>> histo = ... ## histogram template 
    >>> tproject ( tree , histo , 'mass' , 'pt>10' )    
    >>> tree.pproject ( histo , 'mass' , 'pt>10' )    ## ditto 
    - significant gain can be achieved for very large ttrees with complicated expressions and cuts
    - maxentries parameter should be rather large
    Arguments:
    - tree       the tree
    - histo      the histogram
    - what       variable/expression/varlist to be projected
    - cuts       selection/weighting criteria 
    - nentries   number of entries to process  (<0: all entries in the tree)
    - first      the first entry to process
    - maxentries chunk size for parallel processing 
    """
    ## trivial/degenerate cases
    if not tree:
        return 0, histo
    if not histo:
        logger.error('tproject: invalid histogram')
        return 0, histo

    import ROOT
    histo.Reset()

    num = len(tree)
    if num <= first:
        return 0, histo

    if 0 > nentries: nentries = n_large

    ## FIX: int() instead of the Python-2-only long(); on Python 2 int()
    ## auto-promotes, so the behavior is unchanged
    maxentries = int(maxentries)
    if 0 >= maxentries: maxentries = n_large

    if 0 > first: first = 0

    ## total number of events to process
    ## FIX: computed *before* its first use — the original referenced the
    ## undefined name ``total'' in several early-return branches below
    total = min(num - first, nentries)

    ## use the regular projection
    from ostap.trees.trees import _tt_project_

    fname = None
    tname = None

    if isinstance(tree, ROOT.TChain):

        if 1 == len(tree.files()):

            fname = tree.files()[0]
            tname = tree.GetName()

        else:

            logger.warning('``tproject'
                           ' method is TTree-specific, skip parallelization')
            return _tt_project_(tree, histo, what, cuts, '', nentries, first)

    else:

        ## for a plain TTree we need the hosting file & the in-file path
        tdir = tree.GetDirectory()
        ftree = tdir.GetFile()
        if not ftree:
            logger.debug('TTree is not file resident, skip parallelization')
            return _tt_project_(tree, histo, what, cuts, '', total, first)
        fname = ftree.GetName()
        tpath = tdir.GetPath()
        pr, d, path = tpath.rpartition(':')
        tname = path + '/' + tree.GetName()

    if not fname:
        logger.info("Can't determine fname, skip parallelization")
        return _tt_project_(tree, histo, what, cuts, '', total, first)

    if not tname:
        logger.info("Can't determine tname, skip parallelization")
        return _tt_project_(tree, histo, what, cuts, '', total, first)

    #
    if isinstance(cuts, ROOT.TCut): cuts = str(cuts)
    if isinstance(what, ROOT.TCut): what = str(what)
    ## a comma-separated string becomes a list of expressions
    ## FIX: the original chain of isinstance checks was dead after the first
    ## split (and one split(',') line was duplicated)
    if isinstance(what, str): what = what.split(',')

    ## nothing to project
    if not what:
        return 0, histo

    ## the event range is rather short, no real need in parallel processing
    if total * len(what) < maxentries and len(what) < 4:
        return _tt_project_(tree, histo, what, cuts, '', total, first)

    ## number of (almost) equal-size chunks
    nchunks = total // maxentries
    ## FIX: guard against nchunks == 0 (reachable when len(what) >= 4
    ## and total < maxentries) — the original raised ZeroDivisionError
    csize = total // nchunks if 0 < nchunks else total

    ## final list of parameters [ (file_name, tree_name, what, cuts, first_event, num_events), ... ]
    params = []

    for i in range(nchunks):
        for w in what:
            params.append(
                (fname, tname, str(w), cuts, first + i * csize, csize))

    ## FIX: the remainder chunk must start right after the last full chunk;
    ## the original incremented nchunks *before* computing the offset,
    ## leaving a gap of events that were never processed
    covered = nchunks * csize
    if covered < total:
        for w in what:
            params.append(
                (fname, tname, str(w), cuts, first + covered, total - covered))

    task = ProjectTask(histo)
    wmgr = Parallel.WorkManager()
    wmgr.process(task, params)

    filtered = task.output[0]
    histo += task.output[1]

    return filtered, histo