def add_new_branch(chain, branch_name, function, verbose=True):
    """Add new branch for a long chain in parallel - see ROOT.TTree.add_new_branch
    >>> chain = ....
    >>> chain.padd_new_branch ( 'new_branch' , 'px*py' )

    - chain       : input TTree/TChain to be updated (files updated in place)
    - branch_name : name of the new branch
    - function    : expression/function used to fill the branch
    - verbose     : verbosity flag
    Returns a fresh TChain rebuilt from the (updated) files.
    """
    from ostap.trees.trees import Chain
    from ostap.trees.trees import add_new_branch as _add_branch_

    ## multi-file chain -> parallel processing below;
    ## plain tree (or single-file chain, which is also a TTree) -> serial helper
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    elif isinstance(chain, ROOT.TTree):
        ## FIX: propagate the caller's ``verbose`` flag (was hard-coded to False)
        return _add_branch_(chain, branch_name, function, verbose=verbose)

    ch    = Chain(chain)
    task  = AddBranch(branch_name, function)
    wmgr  = WorkManager(silent=not verbose)
    trees = ch.split(max_files=1)   ## one file per job
    wmgr.process(task, trees)

    ## the files were updated in place: rebuild the chain
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)
    return nc
def cproject(chain, histo, what, cuts,
             nentries=-1, first=0,
             chunk_size=-1, max_files=5,
             silent=False, **kwargs):
    """Make a projection of the long chain into histogram
    >>> chain = ... ## large chain
    >>> histo = ... ## histogram template
    >>> cproject ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.ppropject ( histo , 'mass' , 'pt>0' ) ## ditto
    >>> chain.cpropject ( histo , 'mass' , 'pt>0' ) ## ditto
    For 12-core machine, clear speedup factor of about 8 is achieved
    """
    from ostap.trees.trees import Chain

    ## wrap the chain and split it into per-job chunks
    the_chain = Chain(chain, first=first, nevents=nentries)
    task      = ProjectTask(histo, what, cuts)
    manager   = WorkManager(silent=silent, **kwargs)
    manager.process(task,
                    the_chain.split(chunk_size=chunk_size, max_files=max_files))

    ## unpack results
    nfiltered, hsum = task.results()
    histo += hsum
    del hsum
    return nfiltered, histo
def add_new_branch(chain, branch_name, function, verbose=True):
    """Add new branch for a long chain in parallel - see ROOT.TTree.add_new_branch
    >>> chain = ....
    >>> chain.padd_new_branch ( 'new_branch' , 'px*py' )
    """
    from ostap.trees.trees import Chain
    from ostap.trees.trees import add_new_branch as _add_branch_

    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass   ## multi-file chain: use the parallel machinery below
    elif isinstance(chain, ROOT.TTree):
        ## plain tree (or single-file chain): serial helper is enough
        return _add_branch_(chain, branch_name, function, verbose=False)

    ch       = Chain(chain)
    branches = set(chain.branches())   ## snapshot of branches before the update

    task    = AddBranch(branch_name, function)
    manager = WorkManager(silent=not verbose)
    manager.process(task, ch.split(max_files=1))   ## one file per job

    ## the files were updated in place: build a fresh chain
    nc = ROOT.TChain(chain.name)
    for fname in ch.files:
        nc.Add(fname)

    ## report the newly appeared branches
    added = list(set(nc.branches()) - branches)
    if added:
        logger.info('Added branches:\n%s' % nc.table(variables=added, prefix='# '))
    return nc
def addChoppingResponse(chain,                     ## input dataset to be updated
                        chopper,                   ## chopping category/formula
                        N,                         ## number of categories
                        inputs,                    ## input variables
                        weights_files,             ## files with TMVA weights (tar/gz or xml)
                        category_name='chopping',  ## category name
                        prefix='tmva_',            ## prefix for TMVA-variable
                        suffix='_response',        ## suffix for TMVA-variable
                        options='',                ## TMVA-reader options
                        verbose=True,              ## verbosity flag
                        aux=0.9):
    """ Helper function to add TMVA/chopping response into dataset
    >>> tar_file = trainer.tar_file
    >>> chain = ...
    >>> inputs = [ 'var1' , 'var2' , 'var2' ] ## input varibales to TMVA
    >>> addChoppingResponse ( chain , chopper , inputs , tar_file , prefix = 'tmva_' )
    """
    from ostap.tools.chopping import addChoppingResponse as _add_response_

    ## serial treatment for anything but a multi-file chain
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    else:
        return _add_response_(dataset=chain,
                              chopper=chopper,
                              N=N,
                              inputs=inputs,
                              weights_files=weights_files,
                              prefix=prefix,
                              suffix=suffix,
                              options=options,
                              verbose=verbose,
                              aux=aux)

    ## NOTE(review): ``category_name`` is accepted but never forwarded to the
    ## serial helper or to the AddChopping task - confirm against their signatures

    from ostap.trees.trees import Chain
    ch = Chain(chain)

    task = AddChopping(chopper=chopper,
                       N=N,
                       inputs=inputs,
                       weights_files=weights_files,
                       prefix=prefix,
                       suffix=suffix,
                       options=options,
                       verbose=verbose,
                       aux=aux)

    ## FIX: respect the caller's ``verbose`` flag (silent was hard-coded to False)
    wmgr  = WorkManager(silent=not verbose)
    trees = ch.split(max_files=1)   ## one file per job
    wmgr.process(task, trees)

    ## the files were updated in place: rebuild the chain
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)
    return nc
def addTMVAResponse(chain,               ## input chain
                    inputs,              ## input variables
                    weights_files,       ## files with TMVA weights (tar/gz or xml)
                    prefix='tmva_',      ## prefix for TMVA-variable
                    suffix='_response',  ## suffix for TMVA-variable
                    options='',          ## TMVA-reader options
                    verbose=True,        ## verbosity flag
                    aux=0.9, **kwargs):  ## for Cuts method : efficiency cut-off
    """ Helper function to add TMVA response into loong TChain
    >>> tar_file = trainer.tar_file
    >>> dataset = ...
    >>> inputs = [ 'var1' , 'var2' , 'var2' ]
    >>> dataset.addTMVAResponse ( inputs , tar_file , prefix = 'tmva_' )
    """
    from ostap.tools.tmva import addTMVAResponse as _add_response_

    ## serial treatment for anything but a multi-file chain
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    else:
        ## FIX: forward the ``options`` argument (it was silently dropped)
        return _add_response_(dataset=chain,
                              inputs=inputs,
                              weights_files=weights_files,
                              prefix=prefix,
                              suffix=suffix,
                              options=options,
                              verbose=verbose,
                              aux=aux)

    from ostap.trees.trees import Chain
    ch       = Chain(chain)
    branches = set(chain.branches())   ## snapshot of branches before the update

    ## create the task
    ## FIX: forward ``options`` to the task as well - TODO confirm AddTMVA accepts it
    task = AddTMVA(inputs=inputs,
                   weights_files=weights_files,
                   options=options,
                   prefix=prefix,
                   suffix=suffix,
                   verbose=verbose,
                   aux=aux)

    ## FIX: respect the caller's ``verbose`` flag (silent was hard-coded to False)
    wmgr  = WorkManager(silent=not verbose, **kwargs)
    trees = ch.split(max_files=1)   ## one file per job
    wmgr.process(task, trees)

    ## the files were updated in place: rebuild the chain
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)

    ## report the newly appeared branches
    nb = list(set(nc.branches()) - branches)
    if nb:
        logger.info('Added branches:\n%s' % nc.table(variables=nb, prefix='# '))
    return nc
def pprocess(chain, selector,
             nevents=-1, first=0,
             shortcut=True,        ## important
             chunk_size=100000,    ## important
             max_files=5,
             ppservers=(),
             use_frame=20000,      ## important
             silent=False):
    """ Parallel processing of loooong chain/tree
    >>>chain = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )
    """
    from ostap.trees.trees import Chain

    the_chain = Chain(chain)

    selection = selector.selection
    variables = selector.variables

    ## trivial = selector.trivial_vars and not selector.morecuts
    trivial = selector.really_trivial and not selector.morecuts
    whole   = 0 == first and (0 > nevents or len(chain) <= nevents)

    ## trivial configuration over the full chain: one job per file
    if whole and trivial and 1 < len(the_chain.files):
        logger.info("Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task    = FillTask(variables, selection, trivial, use_frame)
    manager = WorkManager(ppservers=ppservers, silent=silent)
    jobs    = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(task, jobs)
    del jobs

    ## harvest the results and attach them back to the selector
    dataset, stat = task.results()
    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat.skipped
    if stat.skipped:
        skipped = '/' + attention(skipped)
    else:
        skipped = ''
    logger.info(
        'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' %
        (selector.name, stat.processed, stat.total, skipped,
         selector.cuts(), dataset))

    return 1
def pStatVar(chain, what, cuts='',
             nevents=-1, first=0,
             chunk_size=250000, max_files=1,
             silent=True, **kwargs):
    """ Parallel processing of loooong chain/tree
    >>> chain = ...
    >>> chain.pstatVar( 'mass' , 'pt>1')
    """
    ## FIX: removed leftover debugging ``print`` statements

    ## few special/trivial cases: serial processing is cheaper
    last = min(n_large, first + nevents if 0 < nevents else n_large)
    if 0 <= first and 0 < nevents < chunk_size:
        return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TChain):
        if 1 == chain.nFiles() and len(chain) < chunk_size:
            return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TTree) and len(chain) < chunk_size:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    ch = Chain(chain, first=first, nevents=nevents)

    task  = StatVarTask(what, cuts)
    wmgr  = WorkManager(silent=silent, **kwargs)
    trees = ch.split(chunk_size=chunk_size, max_files=max_files)
    wmgr.process(task, trees)

    del trees
    del ch
    results = task.results()
    return results
def pStatVar(chain, what, cuts='',
             nevents=-1, first=0,
             chunk_size=100000, max_files=10,
             ppservers=(), silent=True):
    """ Parallel processing of loooong chain/tree
    >>> chain = ...
    >>> chain.pstatVar( 'mass' , 'pt>1')
    """
    ## few special/trivial cases: serial processing is cheaper
    last = min(n_large, first + nevents if 0 < nevents else n_large)

    if 0 <= first and 0 < nevents < chunk_size:
        return chain.statVar(what, cuts, first, last)
    if isinstance(chain, ROOT.TChain):
        if chain.nFiles() < 5 and len(chain) < chunk_size:
            return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TTree) and len(chain) < chunk_size:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    the_chain = Chain(chain, first=first, nevents=nevents)

    task    = StatVarTask(what, cuts)
    manager = WorkManager(ppservers=ppservers, silent=silent)
    jobs    = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(task, jobs)

    del jobs
    del the_chain
    return task.results()
def tproject(tree,                ## the tree
             histo,               ## histogram
             what,                ## variable/expression/list to be projected
             cuts='',             ## selection/weighting criteria
             nentries=-1,         ## number of entries
             first=0,             ## the first entry
             chunk_size=1000000,  ## chunk size
             max_files=50,        ## not-used ....
             silent=False):       ## silent processing
    """Make a projection of the loooong tree into histogram
    >>> tree = ... ## large chain
    >>> histo = ... ## histogram template
    >>> tproject ( tree , histo , 'mass' , 'pt>10' )
    >>> tree.pproject ( histo , 'mass' , 'pt>10' ) ## ditto
    - significant gain can be achieved for very large TTrees with
    complicated expressions and cuts
    - maxentries parameter should be rather large
    Arguments:
    - tree      the tree
    - histo     the histogram
    - what      variable/expression/varlist to be projected
    - cuts      selection/weighting criteria
    - nentries  number of entries to process (>0: all entries in th tree)
    - first     the first entry to process
    - maxentries chunk size for parallel processing
    """
    from ostap.trees.trees import Tree

    the_tree = Tree(tree, first=first, nevents=nentries)
    task     = ProjectTask(histo, what, cuts)
    manager  = WorkManager(silent=silent)
    manager.process(task, the_tree.split(chunk_size=chunk_size))

    ## unpack results
    nfiltered, hsum = task.results()
    histo += hsum
    del hsum
    return nfiltered, histo
def copy_files(file_pairs, progress=True, maxfiles=5, copier=None, **kwargs):
    """Copy files in parallel
    """
    ## default copier: the plain file-copy helper
    if not copier:
        from ostap.utils.utils import copy_file
        copier = copy_file

    task    = CopyTask(copier=copier)
    manager = WorkManager(silent=not progress, **kwargs)

    ## at least one file per job
    if maxfiles < 1:
        maxfiles = 1

    from ostap.utils.utils import chunked
    batches = chunked(file_pairs, maxfiles)

    manager.process(task, batches)
    return task.results()
def chopping_training(chopper, **kwargs):
    """Perform parallel traning of TMVA/Chopping
    - internal function for ostap.tools.chopping.Trainer
    - see ostap.tools.chopping.Trainer
    """
    import sys

    task    = ChopperTraining()
    manager = WorkManager(silent=False, **kwargs)

    ## one job per chopping category
    jobs = [(i, chopper) for i in range(chopper.N)]

    ## flush the streams around processing to keep the output readable
    sys.stdout.flush()
    sys.stderr.flush()
    manager.process(task, jobs)
    sys.stdout.flush()
    sys.stderr.flush()

    return task.results()
def parallel_fill(chain, selector,
                  nevents=-1, first=0,
                  shortcut=True,       ## important
                  chunk_size=1000000,  ## important
                  max_files=5,
                  use_frame=20000,     ## important
                  silent=False,
                  job_chunk=-1, **kwargs):
    """ Parallel processing of loooong chain/tree
    >>>chain = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )
    """
    import ostap.fitting.roofit
    from ostap.fitting.pyselectors import SelectorWithVars
    from ostap.trees.trees import Chain

    assert isinstance(selector, SelectorWithVars), \
        "Invalid type of ``selector'': %s" % type(selector)

    the_chain = Chain(chain)

    selection = selector.selection
    variables = selector.variables
    roo_cuts  = selector.roo_cuts

    ## trivial = selector.trivial_vars and not selector.morecuts
    trivial = selector.really_trivial and not selector.morecuts
    whole   = 0 == first and (0 > nevents or len(chain) <= nevents)

    ## trivial configuration over the full chain: one job per file
    if whole and trivial and 1 < len(the_chain.files):
        logger.info("Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    task = FillTask(variables=variables,
                    selection=selection,
                    roo_cuts=roo_cuts,
                    trivial=trivial,
                    use_frame=use_frame)
    manager = WorkManager(silent=silent, **kwargs)
    jobs    = the_chain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(task, jobs, chunk_size=job_chunk)
    del jobs

    ## harvest the results and attach them back to the selector
    dataset, stat = task.results()
    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat.skipped
    if stat.skipped:
        skipped = '/' + attention(skipped)
    else:
        skipped = ''
    logger.info('Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n%s' %
                (selector.name, stat.processed, stat.total, skipped,
                 selector.cuts(), dataset.table(prefix='# ')))

    return dataset, stat
def makePlots(the_func, particle, stripping, polarity, trackcuts,
              runMin=0, runMax=-1, verbose=True, maxFiles=-1,
              parallel=False):
    """Loop over all calibration subsamples for the given particle/stripping/
    polarity, apply ``the_func`` to each dataset and accumulate the PID plots.
    Returns the list of accumulated plots (parallel or sequential mode).
    """
    #**********************************************************************
    from PIDPerfScripts.DataFuncs import CheckStripVer, CheckMagPol, CheckPartType
    CheckStripVer(stripping)
    CheckMagPol(polarity)
    CheckPartType(particle)

    #======================================================================
    # Create dictionary holding:
    #  - Reconstruction version    ['RecoVer']
    #  - np.array of:
    #    - MagUp run limits        ['UpRuns']
    #    - MagDown run limits      ['DownRuns']
    #======================================================================
    from PIDPerfScripts.DataFuncs import GetRunDictionary
    DataDict = GetRunDictionary(stripping, particle, verbose=verbose)

    ## append runNumber limits to the track cuts
    if trackcuts and 0 < runMin:
        trackcuts += ' && runNumber>=%d ' % runMin
    if trackcuts and 0 < runMax:
        trackcuts += ' && runNumber<=%d ' % runMax

    #======================================================================
    # Determine min and max file indicies
    #======================================================================
    if runMax < runMin:
        runMax = None
    from PIDPerfScripts.DataFuncs import GetMinMaxFileDictionary
    IndexDict = GetMinMaxFileDictionary(DataDict, polarity,
                                        runMin, runMax, maxFiles, verbose)

    logger.debug('Track Cuts: %s ' % trackcuts)

    ## default (empty) list of PID plots
    plots = []
    minEntries = 1000   ## NOTE(review): unused - kept for reference

    ## loop limits over all calibration subsamples
    mn = IndexDict['minIndex']
    mx = IndexDict['maxIndex']

    from ostap.utils.memory import memory
    from ostap.utils.utils import NoContext

    if parallel:
        logger.info('Parallel processing %d datafiles %s %s %s ' %
                    (mx - mn + 1, particle, stripping, polarity))
        task = PidCalibTask(the_func,
                            getconfig={'particle': particle,
                                       'stripping': stripping,
                                       'polarity': polarity,
                                       'trackcuts': trackcuts},
                            verbose=False)
        from ostap.parallel.parallel import WorkManager
        wmgr = WorkManager(silent=False)
        wmgr.process(task, range(mn, mx + 1))
        return task.results()

    logger.info('Start the loop over %d datafiles %s %s %s ' %
                (mx - mn + 1, particle, stripping, polarity))
    from ostap.utils.progress_bar import progress_bar
    ## FIX: use ``range`` (``xrange`` is Python-2 only and is a NameError on Python 3)
    for index in progress_bar(range(mn, mx + 1)):
        manager = memory() if verbose else NoContext()
        with manager:
            dataset = getDataSet(particle, stripping, polarity,
                                 trackcuts, index, verbose=verbose)
            if not dataset:
                continue
            ## FIX: was ``new_plots = plots = the_func(...)`` which aliased both
            ## names to the same list, so the Add-loop below doubled every plot
            new_plots = the_func(particle, dataset, plots, verbose)
            if not plots:
                plots = new_plots
            else:
                for oh, nh in zip(plots, new_plots):
                    oh.Add(nh)
            dataset.reset()
            if dataset:
                del dataset

    return plots
def reduce(chain,
           selection=None,    ## FIX: was a mutable default argument ({})
           save_vars=(),
           new_vars=None,     ## FIX: was a mutable default argument ({})
           no_vars=(),
           output='',
           name='',
           addselvars=False,
           silent=False, **kwargs):
    """ Parallel processing of loooong chain/tree
    >>>chain = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )
    """
    from ostap.trees.trees import Chain
    from ostap.frames.tree_reduce import ReduceTree

    ## normalise the (previously mutable) defaults
    selection = {} if selection is None else selection
    new_vars  = {} if new_vars  is None else new_vars

    ## single-file chain: serial processing is enough
    if isinstance(chain, ROOT.TChain) and 1 >= len(chain.files()):
        return chain.reduce(selection=selection,
                            save_vars=save_vars,
                            new_vars=new_vars,
                            no_vars=no_vars,
                            output=output,
                            name=name,
                            addselvars=addselvars,
                            silent=silent)

    ## NOTE(review): ``no_vars`` is not forwarded to ReduceTask on the
    ## parallel path - confirm against the ReduceTask signature

    nb0 = len(chain.branches())
    ne0 = len(chain)

    ch   = Chain(chain)
    task = ReduceTask(selection=selection,
                      save_vars=save_vars,
                      new_vars=new_vars,
                      addselvars=addselvars,
                      name=name)
    wmgr  = WorkManager(silent=silent, **kwargs)
    trees = ch.split(max_files=1)   ## one file per job
    wmgr.process(task, trees)

    result, table = task.results()
    ## intermediate files are temporary: schedule them for removal
    for i in result.files:
        result.trash.add(i)

    if output:   ## merge results into single output file
        reduced = ReduceTree(result.chain,
                             selection='',
                             save_vars=(),
                             addselvars=False,
                             silent=True,
                             output=output,
                             name=name)
        result = Chain(reduced.chain)

    if not silent:
        from ostap.frames.frames import report_print_table
        title = 'Tree -> Frame -> Tree filter/transformation'
        logger.info('Reduce tree:\n%s' % report_print_table(table, title, '# '))
        nb = len(result.chain.branches())
        ne = len(result.chain)
        ## FIX: guard against an empty reduced chain (zero branches/entries)
        f  = float(nb0 * ne0) / (nb * ne) if nb and ne else 0.0
        logger.info('reduce: (%dx%d) -> (%dx%d) %.1f (branches x entries) ' %
                    (nb0, ne0, nb, ne, f))

    return result
def parallel_toys2(gen_pdf,       ## PDF to generate toys
                   fit_pdf,       ## PDF to fit toys
                   nToys,         ## total number of toys
                   nSplit,        ## split into <code>nSplit</code> subjobs
                   data,          ## template for dataset/variables
                   gen_config,    ## parameters for <code>pdf.generate</code>
                   fit_config={}, ## parameters for <code>pdf.fitTo</code>
                   gen_pars={},
                   fit_pars={},
                   more_vars={},
                   gen_fun=None,     ## generator function ( pdf , varset , **gen_config )
                   fit_fun=None,     ## fit function ( pdf , dataset , **fit_config )
                   accept_fun=None,  ## accept function ( fit-result, pdf, dataset )
                   silent=True,
                   progress=False, **kwargs):
    """Make `nToys` pseudoexperiments, splitting them into `nSplit` subjobs
    to be executed in parallel
    - Schematically:
    >>> for toy in range ( nToys ) :
    >>> ... dataset = gen_fun ( gen_pdf , ... , **gen_config )
    >>> ... result  = fit_fun ( fit_pdf , dataset , **fit_config )
    >>> ... if not accept_fun ( result , fit_pdf , dataset ) : continue
    >>> .... < collect statistics here >

    - gen_pdf    PDF to be used for generation
    - fit_pdf    PDF to be used for fitting
    - nToys      total number of pseudoexperiments to generate
    - nSplit     split total number of pseudoexperiments into `nSplit` subjobs
    - data       variable list of variables to be used for dataset generation
    - gen_config configuration of <code>pdf.generate</code>
    - fit_config configuration of <code>pdf.fitTo</code>
    - gen_pars   redefine these parameters for generation of each pseudoexperiment
    - fit_pars   redefine these parameters for fitting of each pseudoexperiment
    - more_vars  dictionary of functions to define the additional results
    - silent     silent toys?
    - progress   show progress bar?

    It returns a dictionary with fit results for the toys and a
    dictionary of statistics

    Derived parameters can be also retrived via <code>more_vars</code> argument:
    >>> ratio     = lambda res,pdf : res.ratio('x','y')
    >>> more_vars = { 'Ratio' : ratio }
    >>> r, s = parallel_toys2 ( .... , more_vars = more_vars , ... )

    Parallelization is controlled by two arguments
    - `ncpus`    : number of local cpus to use, default is `'autodetect'`,
    that means all local processors
    - `ppservers`: list of serevers to be used (for parallel python)
    """
    from ostap.core.ostap_types import integer_types

    assert gen_config and 'nEvents' in gen_config,\
        'Number of events per toy must be specified via "gen_config" %s' % gen_config
    ## FIX: error messages referenced an undefined name ``jobid`` (NameError on failure)
    assert isinstance(nToys, integer_types) and 0 < nToys,\
        'Invalid "nToys" argument %s/%s' % (nToys, type(nToys))
    assert isinstance(nSplit, integer_types) and 0 < nSplit,\
        'Invalid "nSplit" argument %s/%s' % (nSplit, type(nSplit))

    import ostap.fitting.toys as Toys

    ## no splitting requested: run the serial implementation
    if 1 == nSplit:
        return Toys.make_toys2(gen_pdf=gen_pdf,
                               fit_pdf=fit_pdf,
                               nToys=nToys,
                               data=data,
                               gen_config=gen_config,
                               fit_config=fit_config,
                               gen_pars=gen_pars,
                               fit_pars=fit_pars,
                               more_vars=more_vars,
                               gen_fun=gen_fun,
                               fit_fun=fit_fun,
                               accept_fun=accept_fun,
                               silent=silent,
                               progress=progress)

    import ostap.fitting.roofit
    import ostap.fitting.dataset
    import ostap.fitting.variables
    import ostap.fitting.roofitresult

    params = gen_pdf.params()

    ## collect the variable names to be generated
    toy_data = []
    if isinstance(data, ROOT.RooAbsData):
        varset = data.varset()
        for v in varset:
            toy_data.append(v.GetName())
    else:
        for v in data:
            if isinstance(v, ROOT.RooAbsArg):
                toy_data.append(v.GetName())
            elif isinstance(v, string_types) and v in params:
                toy_data.append(v)
            else:
                raise TypeError("Invalid type of variable %s/%s" % (v, type(v)))

    gen_init_pars = Toys.vars_transform(gen_pars)
    fit_init_pars = Toys.vars_transform(fit_pars)

    # ========================================================================
    ## distribute the toys over the subjobs
    if nToys <= nSplit:
        nToy = 1
        nSplit = nToys
        nRest = 0
    else:
        nToy, nRest = divmod(nToys, nSplit)

    task = ToysTask2(gen_pdf=gen_pdf,
                     fit_pdf=fit_pdf,
                     data=toy_data,
                     gen_config=gen_config,
                     fit_config=fit_config,
                     gen_pars=gen_init_pars,
                     fit_pars=fit_init_pars,
                     more_vars=more_vars,
                     gen_fun=gen_fun,
                     fit_fun=fit_fun,
                     accept_fun=accept_fun,
                     silent=silent,
                     progress=progress)

    wmgr = WorkManager(silent=False, **kwargs)
    data = nSplit * [nToy]
    if nRest:
        data.append(nRest)
    wmgr.process(task, data)

    results, stats = task.results()
    Toys.print_stats(stats, nToys)
    return results, stats