def add_new_branch(chain, branch_name, function, verbose=True):
    """Add new branch for loong chain in parallel - see ROOT.TTree.add_new_branch
    >>> chain = ....
    >>> chain.padd_new_branch ( 'new_branch' , 'px*py' )

    Parameters:
    - chain       : input TTree/TChain to be updated
    - branch_name : name of the new branch
    - function    : function/expression used to fill the branch
    - verbose     : verbosity flag (controls WorkManager silence and the
                    sequential fallback)
    Returns a fresh ROOT.TChain rebuilt from the updated files.
    """
    from ostap.trees.trees import Chain
    from ostap.trees.trees import add_new_branch as _add_branch_
    ## parallel processing only pays off for a multi-file TChain
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    elif isinstance(chain, ROOT.TTree):
        ## FIX: forward the caller's ``verbose`` flag; it was hard-coded to
        ## False, so the parameter was silently ignored on this path
        return _add_branch_(chain, branch_name, function, verbose=verbose)
    ch = Chain(chain)
    task = AddBranch(branch_name, function)
    wmgr = WorkManager(silent=not verbose)
    trees = ch.split(max_files=1)  ## one file per job: branch is added file-by-file
    wmgr.process(task, trees)
    ## rebuild the chain from the (updated) files
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)
    return nc
def cproject(chain, histo, what, cuts, nentries=-1, first=0, chunk_size=1000000, silent=False):
    """Make a projection of the loooong chain into histogram
    >>> chain = ... ## large chain
    >>> histo = ... ## histogram template
    >>> cproject ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.ppropject ( histo , 'mass' , 'pt>0' ) ## ditto
    >>> chain.cpropject ( histo , 'mass' , 'pt>0' ) ## ditto
    For 12-core machine, clear speedup factor of about 8 is achieved
    """
    #
    from ostap.trees.trees import Chain
    ## wrap the chain into a light-weight (picklable) descriptor
    lchain = Chain(chain, first=first, nevents=nentries)
    ## prepare the projection task and process the chunks in parallel
    ptask = ProjectTask(histo, what, cuts)
    manager = Parallel.WorkManager(silent=silent)
    manager.process(ptask, lchain.split(chunk_size=chunk_size))
    ## merge the results: (#accepted entries, accumulated histogram)
    filtered = ptask.output[0]
    histo += ptask.output[1]
    return filtered, histo
def cproject ( chain , histo , what , cuts ,
               nentries   = -1    ,
               first      = 0     ,
               chunk_size = -1    ,
               max_files  = 5     ,
               silent     = False , **kwargs ) :
    """Make a projection of the loooong chain into histogram
    >>> chain = ... ## large chain
    >>> histo = ... ## histogram template
    >>> cproject ( chain , histo , 'mass' , 'pt>10' )
    >>> chain.ppropject ( histo , 'mass' , 'pt>0' ) ## ditto
    >>> chain.cpropject ( histo , 'mass' , 'pt>0' ) ## ditto
    For 12-core machine, clear speedup factor of about 8 is achieved
    """
    #
    from ostap.trees.trees import Chain
    ## light-weight picklable view of the chain
    lchain  = Chain ( chain , first = first , nevents = nentries )
    ## build the task and farm out the chunks
    ptask   = ProjectTask ( histo , what , cuts )
    manager = WorkManager ( silent = silent , **kwargs )
    manager.process ( ptask ,
                      lchain.split ( chunk_size = chunk_size ,
                                     max_files  = max_files  ) )
    ## unpack results
    filtered , hsum = ptask.results ()
    histo += hsum
    del hsum
    return filtered , histo
def add_new_branch ( chain , branch_name , function , verbose = True ) :
    """Add new branch for loong chain in parallel - see ROOT.TTree.add_new_branch
    >>> chain = ....
    >>> chain.padd_new_branch ( 'new_branch' , 'px*py' )

    Parameters:
    - chain       : input TTree/TChain to be updated
    - branch_name : name of the new branch
    - function    : function/expression used to fill the branch
    - verbose     : verbosity flag
    Returns a fresh ROOT.TChain rebuilt from the updated files;
    the newly added branches are reported via the logger.
    """
    from ostap.trees.trees import Chain
    from ostap.trees.trees import add_new_branch as _add_branch_
    ## parallel treatment pays off only for a multi-file TChain
    if isinstance ( chain , ROOT.TChain ) and 1 < len ( chain.files () ) :
        pass
    elif isinstance ( chain , ROOT.TTree ) :
        ## FIX: forward the caller's ``verbose`` flag; it was hard-coded to
        ## False, so the parameter was silently ignored on this path
        return _add_branch_ ( chain , branch_name , function , verbose = verbose )

    ch       = Chain ( chain )
    branches = set ( chain.branches() )  ## remember, to report the new ones
    task     = AddBranch ( branch_name , function )
    wmgr     = WorkManager ( silent = not verbose )
    trees    = ch.split ( max_files = 1 )  ## one file per job
    wmgr.process ( task , trees )

    ## rebuild the chain from the (updated) files
    nc = ROOT.TChain ( chain.name )
    for f in ch.files :
        nc.Add ( f )

    ## report the branches that appeared
    nb = list ( set ( nc.branches () ) - branches )
    if nb :
        logger.info ( 'Added branches:\n%s' % nc.table ( variables = nb , prefix = '# ' ) )
    return nc
def addChoppingResponse( chain,                      ## input dataset to be updated
                         chopper,                    ## chopping category/formula
                         N,                          ## number of categrories
                         inputs,                     ## input variables
                         weights_files,              ## files with TMVA weigths (tar/gz or xml)
                         category_name='chopping',   ## category name
                         prefix='tmva_',             ## prefix for TMVA-variable
                         suffix='_response',         ## suffix for TMVA-variable
                         options='',                 ## TMVA-reader options
                         verbose=True,               ## verbosity flag
                         aux=0.9):
    """ Helper function to add TMVA/chopping response into dataset
    >>> tar_file = trainer.tar_file
    >>> chain    = ...
    >>> inputs = [ 'var1' , 'var2' , 'var2' ] ## input varibales to TMVA
    >>> addChoppingResponse ( chain , chopper , inputs , tar_file , prefix = 'tmva_' )
    """
    from ostap.tools.chopping import addChoppingResponse as _add_response_
    ## sequential treatment unless we have a genuine multi-file TChain
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    else:
        ## NOTE(review): ``category_name`` is accepted but forwarded nowhere
        ## (neither here nor to AddChopping) — TODO confirm against
        ## ostap.tools.chopping.addChoppingResponse and forward it if supported
        return _add_response_(dataset=chain, chopper=chopper, N=N,
                              inputs=inputs, weights_files=weights_files,
                              prefix=prefix, suffix=suffix,
                              options=options, verbose=verbose, aux=aux)

    from ostap.trees.trees import Chain
    ch = Chain(chain)

    task = AddChopping(chopper=chopper, N=N, inputs=inputs,
                       weights_files=weights_files,
                       prefix=prefix, suffix=suffix,
                       options=options, verbose=verbose, aux=aux)
    ## FIX: respect the ``verbose`` flag (it was hard-coded ``silent=False``)
    wmgr = WorkManager(silent=not verbose)
    trees = ch.split(max_files=1)  ## one file per job
    wmgr.process(task, trees)

    ## rebuild the chain from the updated files
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)
    return nc
def addTMVAResponse( chain,              ## input chain
                     inputs,             ## input variables
                     weights_files,      ## files with TMVA weigths (tar/gz or xml)
                     prefix='tmva_',     ## prefix for TMVA-variable
                     suffix='_response', ## suffix for TMVA-variable
                     options='',         ## TMVA-reader options
                     verbose=True,       ## verbosity flag
                     aux=0.9, **kwargs): ## for Cuts method : efficiency cut-off
    """ Helper function to add TMVA response into loong TChain
    >>> tar_file = trainer.tar_file
    >>> dataset  = ...
    >>> inputs = [ 'var1' , 'var2' , 'var2' ]
    >>> dataset.addTMVAResponse (  inputs , tar_file , prefix = 'tmva_' )
    """
    from ostap.tools.tmva import addTMVAResponse as _add_response_
    ## sequential treatment unless we have a genuine multi-file TChain
    if isinstance(chain, ROOT.TChain) and 1 < len(chain.files()):
        pass
    else:
        ## FIX: forward ``options`` — the sequential fallback used to drop it,
        ## while the parallel path passed it to the task
        return _add_response_(dataset=chain, inputs=inputs,
                              weights_files=weights_files,
                              prefix=prefix, suffix=suffix,
                              options=options, verbose=verbose, aux=aux)

    from ostap.trees.trees import Chain
    ch = Chain(chain)
    branches = set(chain.branches())  ## remember, to report the new ones

    ## create the task
    task = AddTMVA(inputs=inputs, weights_files=weights_files,
                   prefix=prefix, suffix=suffix,
                   options=options, verbose=verbose, aux=aux)
    ## FIX: respect the ``verbose`` flag (it was hard-coded ``silent=False``)
    wmgr = WorkManager(silent=not verbose, **kwargs)
    trees = ch.split(max_files=1)  ## one file per job
    wmgr.process(task, trees)

    ## rebuild the chain from the updated files
    nc = ROOT.TChain(chain.name)
    for f in ch.files:
        nc.Add(f)

    ## report the branches that appeared
    nb = list(set(nc.branches()) - branches)
    if nb:
        logger.info('Added branches:\n%s' % nc.table(variables=nb, prefix='# '))
    return nc
def pprocess( chain, selector, nevents=-1, first=0,
              shortcut=True,      ## important
              chunk_size=100000,  ## important
              max_files=5,
              ppservers=(),
              use_frame=20000,    ## important
              silent=False):
    """ Parallel processing of loooong chain/tree
    >>>chain    = ...
    >>> selector =  ...
    >>> chain.pprocess ( selector )
    """
    from ostap.trees.trees import Chain

    ## picklable view of the input chain
    lchain = Chain(chain)

    the_selection = selector.selection
    the_variables = selector.variables
    ## trivial = selector.trivial_vars and not selector.morecuts
    is_trivial = selector.really_trivial and not selector.morecuts

    ## does the request cover the whole chain?
    whole_range = (0 == first) and (0 > nevents or len(chain) <= nevents)
    if whole_range and is_trivial and 1 < len(lchain.files):
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    ## farm out the chunks
    the_task = FillTask(the_variables, the_selection, is_trivial, use_frame)
    manager = WorkManager(ppservers=ppservers, silent=silent)
    chunks = lchain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(the_task, chunks)
    del chunks

    ## collect the merged result and attach it to the selector
    dataset, stat = the_task.results()
    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat.skipped
    skipped = '/' + attention(skipped) if stat.skipped else ''
    logger.info(
        'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' %
        (selector.name, stat.processed, stat.total, skipped,
         selector.cuts(), dataset))

    return 1
def reduce(tree, selection, save_vars=(), new_vars={}, no_vars=(), output='', name='', addselvars=False, silent=False):
    """ Powerful method to reduce/tranform the tree/chain.
    It relies on Ostap.DataFrame ( alias for ROOT.ROOT.DataFrame) and allows
    - filter entries from TTree/TChain
    - add new colums
    - remove unnesessary columns
    >>> tree = ....
    >>> reduced1 = tree.reduce ( 'pt>1' )
    >>> reduced2 = tree.reduce ( 'pt>1' , save_vars = [ 'p', 'pt' ,'q' ] )
    >>> reduced3 = tree.reduce ( 'pt>1' , no_vars   = [ 'Q', 'z' ,'x' ] )
    >>> reduced4 = tree.reduce ( 'pt>1' , new_vars  = { 'pt2' : 'pt*pt' } )
    >>> reduced5 = tree.reduce ( 'pt>1' , new_vars  = { 'pt2' : 'pt*pt' } , output = 'OUTPUT.root' )
    """
    ## remember the initial (branches x entries) size for the gain report
    nb0 = len(tree.branches())
    ne0 = len(tree)

    reduced = ReduceTree(tree, selection=selection, save_vars=save_vars,
                         new_vars=new_vars, no_vars=no_vars,
                         output=output, name=name,
                         addselvars=addselvars, tmp_keep=True, silent=silent)

    from ostap.trees.trees import Chain
    result = Chain(reduced.chain)
    ## no explicit output requested: the temporary file must die with the result
    if not output:
        result.trash.add(reduced.output)

    if not silent:
        logger.info('Reduce: %s' % str(reduced))
        ## FIX: the summary below used to live in the ``else`` branch, i.e. it
        ## was produced only when silent=True — clearly inverted logic
        nb = len(result.chain.branches())
        ne = len(result.chain)
        ## guard against an empty result (avoid ZeroDivisionError)
        f = float(nb0 * ne0) / (nb * ne) if nb and ne else float('inf')
        logger.info('reduce: (%dx%d) -> (%dx%d) %.1f (branches x entries) '
                    % (nb0, ne0, nb, ne, f))

    return result
def process(self, jobid, item):
    """The actual processing
    ``params'' is assumed to be a tuple-like entity:
    - the file name
    - the tree name in the file
    - the variable/expression/expression list of quantities to project
    - the selection/weighting criteria
    - the first entry in tree to process
    - number of entries to process
    """
    import ROOT
    from ostap.logger.utils import logWarning
    with logWarning():
        import ostap.core.pyrouts
        import ostap.trees.trees
        import ostap.histos.histos
        import ostap.frames.frames
        from ostap.trees.trees import Chain, Tree

    ## reconstruct the chain from its light-weight description
    ## (renamed from ``input``: do not shadow the builtin)
    item_chain = Chain(name=item.name, files=item.files,
                       first=item.first, nevents=item.nevents)
    chain = item_chain.chain
    first = item_chain.first
    nevents = item_chain.nevents

    ## use the regular projection
    ## FIX: single import — ``_tt_project_`` was imported twice
    from ostap.trees.trees import _tt_project_

    ## Create the output histogram NB! (why here???)
    from ostap.core.core import ROOTCWD
    with ROOTCWD():
        ROOT.gROOT.cd()
        histo = self.histo.Clone()

    ## default output, overwritten by the actual projection below
    self.__output = 0, histo
    self.__output = _tt_project_(tree=chain, histo=histo,
                                 what=self.what, cuts=self.cuts,
                                 options='', nentries=nevents,
                                 firstentry=first)
    del item
    return self.__output
def pStatVar(chain, what, cuts='', nevents=-1, first=0, chunk_size=250000, max_files=1, silent=True, **kwargs):
    """ Parallel processing of loooong chain/tree
    >>> chain = ...
    >>> chain.pstatVar( 'mass' , 'pt>1')
    """
    ## few special/trivial cases: process sequentially
    ## FIX: removed leftover debug ``print`` statements
    last = min(n_large, first + nevents if 0 < nevents else n_large)
    if 0 <= first and 0 < nevents < chunk_size:
        return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TChain):
        if 1 == chain.nFiles() and len(chain) < chunk_size:
            return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TTree) and len(chain) < chunk_size:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    ch = Chain(chain, first=first, nevents=nevents)

    task = StatVarTask(what, cuts)
    wmgr = WorkManager(silent=silent, **kwargs)
    trees = ch.split(chunk_size=chunk_size, max_files=max_files)
    wmgr.process(task, trees)

    del trees
    del ch
    return task.results()
def _pprocess_(chain, selector, nevents=-1, first=0, shortcut=True, chunk_size=100000, ppservers=(), silent=False):
    """ Parallel processing of loooong chain/tree
    >>>chain    = ...
    >>> selector =  ...
    >>> chain.pprocess ( selector )
    """
    from ostap.trees.trees import Chain

    ## picklable view of the chain
    lchain = Chain(chain)

    the_selection = selector.selection
    the_variables = selector.variables
    is_trivial = selector.trivial

    ## whole chain requested?
    whole_range = (0 == first) and (0 > nevents or len(chain) <= nevents)
    if whole_range and is_trivial and 1 < len(lchain.files):
        logger.info(
            "Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    ## farm out the chunks
    the_task = FillTask(the_variables, the_selection, is_trivial)
    manager = Parallel.WorkManager(ppservers=ppservers, silent=silent)
    manager.process(the_task, lchain.split(chunk_size=chunk_size))

    ## attach the merged results to the selector
    dataset, stat = the_task.output
    selector.data = dataset
    selector.stat = stat

    ## stat is index-based here: (total, processed, skipped)
    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat[2]
    skipped = '/' + attention(skipped) if stat[2] else ''
    logger.info(
        'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n# %s' %
        (selector.name, stat[1], stat[0], skipped,
         selector.cuts(), dataset))

    return 1
def pStatVar(chain, what, cuts='', nevents=-1, first=0, chunk_size=100000, max_files=10, ppservers=(), silent=True):
    """ Parallel processing of loooong chain/tree
    >>> chain = ...
    >>> chain.pstatVar( 'mass' , 'pt>1')
    """
    ## few special/trivial cases: cheap enough to run sequentially
    last = min(n_large, first + nevents if 0 < nevents else n_large)

    if 0 <= first and 0 < nevents < chunk_size:
        return chain.statVar(what, cuts, first, last)
    if isinstance(chain, ROOT.TChain):
        if chain.nFiles() < 5 and len(chain) < chunk_size:
            return chain.statVar(what, cuts, first, last)
    elif isinstance(chain, ROOT.TTree) and len(chain) < chunk_size:
        return chain.statVar(what, cuts, first, last)

    from ostap.trees.trees import Chain
    lchain = Chain(chain, first=first, nevents=nevents)

    ## farm out the chunks
    stask = StatVarTask(what, cuts)
    manager = WorkManager(ppservers=ppservers, silent=silent)
    chunks = lchain.split(chunk_size=chunk_size, max_files=max_files)
    manager.process(stask, chunks)

    del chunks
    del lchain
    answer = stask.results()
    return answer
def process(self, jobid, item):
    """Process one chunk: reduce the chunk's chain and return
    a light-weight ``Chain`` for the produced file together with
    the processing report table.
    """
    import ROOT
    import ostap.core.pyrouts
    from ostap.trees.trees import Chain
    from ostap.frames.tree_reduce import ReduceTree

    ## unpack the input data and run the reduction
    reducer = ReduceTree(item.chain,
                         selection=self.selection,
                         save_vars=self.save_vars,
                         new_vars=self.new_vars,
                         addselvars=self.addselvars,
                         name=self.name,
                         tmp_keep=True,  ## attention! True is here!
                         silent=True)

    ## wrap the result into a picklable Chain object
    result = Chain(name=reducer.chain.GetName(), files=[reducer.output])
    return result, reducer.table
def parallel_fill ( chain                 ,
                    selector              ,
                    nevents    = -1       ,
                    first      = 0        ,
                    shortcut   = True     ,  ## important
                    chunk_size = 1000000  ,  ## important
                    max_files  = 5        ,
                    use_frame  = 20000    ,  ## important
                    silent     = False    ,
                    job_chunk  = -1       , **kwargs ) :
    """ Parallel processing of loooong chain/tree:
    split the chain into chunks, fill a dataset from each chunk in
    parallel and merge the results into the selector.
    >>> chain    = ...
    >>> selector = ...
    >>> chain.pprocess ( selector )

    Parameters:
    - chain      : input TTree/TChain
    - selector   : SelectorWithVars instance describing variables/selection
    - nevents    : number of events to process (negative: all)
    - first      : first event to process
    - chunk_size : number of events per parallel chunk
    - max_files  : maximal number of files per chunk
    - use_frame  : passed to FillTask (frame-related threshold)
    - job_chunk  : chunk size for WorkManager job submission
    Returns ( dataset , stat ) ; also stored as selector.data/selector.stat
    """
    import ostap.fitting.roofit
    from ostap.fitting.pyselectors import SelectorWithVars
    from ostap.trees.trees import Chain

    ## NOTE(review): assert-based validation disappears under ``python -O``
    assert isinstance ( selector , SelectorWithVars ) , \
           "Invalid type of ``selector'': %s" % type ( selector )

    ## picklable view of the chain
    ch = Chain ( chain )

    selection = selector.selection
    variables = selector.variables
    roo_cuts  = selector.roo_cuts

    ## trivial = selector.trivial_vars and not selector.morecuts
    trivial   = selector.really_trivial and not selector.morecuts

    ## does the request cover the whole chain?
    all = 0 == first and ( 0 > nevents or len ( chain ) <= nevents )

    if all and trivial and 1 < len( ch.files ) :
        logger.info ("Configuration is ``trivial'': redefine ``chunk-size'' to -1")
        chunk_size = -1

    ## build the task and farm out the chunks
    task  = FillTask ( variables = variables ,
                       selection = selection ,
                       roo_cuts  = roo_cuts  ,
                       trivial   = trivial   ,
                       use_frame = use_frame )
    wmgr  = WorkManager ( silent = silent , **kwargs )
    trees = ch.split ( chunk_size = chunk_size , max_files = max_files )
    wmgr.process( task , trees , chunk_size = job_chunk )
    del trees

    ## collect the merged result and attach it to the selector
    dataset , stat = task.results ()
    selector.data = dataset
    selector.stat = stat

    from ostap.logger.logger import attention
    skipped = 'Skipped:%d' % stat.skipped
    skipped = '/' + attention ( skipped ) if stat.skipped else ''
    logger.info ( 'Selector(%s): Events Processed:%d/Total:%d%s CUTS: "%s"\n%s' % (
        selector.name    ,
        stat.processed   ,
        stat.total       ,
        skipped          ,
        selector.cuts()  ,
        dataset.table ( prefix = '# ' ) ) )

    return dataset, stat
def __init__( self,
              category,            ## accessor to category
              N,                   ## number of categories
              methods,             ## list of TMVA methods
              variables,           ## list of variables
              signal,              ## signal tree
              background,          ## background tree
              signal_cuts='',      ## signal cuts
              background_cuts='',  ## background cuts
              spectators=[],       ## NOTE(review): mutable default argument (never mutated here)
              bookingoptions="Transformations=I;D;P;G,D",
              configuration="nTrain_Signal=0:nTrain_Background=0:SplitMode=Random:NormMode=NumEvents:!V",
              signal_weight=None,
              background_weight=None,
              name='TMVAChopper',     ## the name
              verbose=False,          ## verbose ?
              chop_signal=False,      ## chop the signal ?
              chop_background=True):  ## chop the background ?
    """Create TMVA ``chopping'' trainer
    >>> N = 11
    >>> trainer = Trainer (
    ... category = '137*evt+813*run' ,
    ... N        = N                 ,
    ... methods =  [ # type                   name   configuration
    ...  ( ROOT.TMVA.Types.kMLP        , 'MLP'        , 'H:!V:EstimatorType=CE:VarTransform=N:NCycles=200:HiddenLayers=N+3:TestRate=5:!UseRegulator' ) ,
    ...  ( ROOT.TMVA.Types.kBDT        , 'BDTG'       , 'H:!V:NTrees=100:MinNodeSize=2.5%:BoostType=Grad:Shrinkage=0.10:UseBaggedBoost:BaggedSampleFraction=0.5:nCuts=20:MaxDepth=2' ) ,
    ...  ( ROOT.TMVA.Types.kCuts       , 'Cuts'       , 'H:!V:FitMethod=MC:EffSel:SampleSize=200000:VarProp=FSmart' ) ,
    ...  ( ROOT.TMVA.Types.kFisher     , 'Fisher'     , 'H:!V:Fisher:VarTransform=None:CreateMVAPdfs:PDFInterpolMVAPdf=Spline2:NbinsMVAPdf=50:NsmoothMVAPdf=10' ),
    ...  ( ROOT.TMVA.Types.kLikelihood , 'Likelihood' , 'H:!V:TransformOutput:PDFInterpol=Spline2:NSmoothSig[0]=20:NSmoothBkg[0]=20:NSmoothBkg[1]=10:NSmooth=1:NAvEvtPerBin=50' ) ] ,
    ... variables  = [ 'var1' , 'var2' , 'var3' ] ,  ## Variables to use in the training
    ... signal     = signal_tree      ,  ## TTree/TChain with ``signal'' sample
    ... background = background_tree  ,  ## TTree/TChain with ``background'' sample
    ... name       = 'TMVAChopper'    ,
    ... verbose    = False )
    """
    ## NOTE(review): ``long`` is Python-2 only — presumably provided by an
    ## ostap compatibility shim; confirm before running under plain Python 3
    assert isinstance( N, (int, long)) and 1 < N, "Invalid number of categories"
    self.__chop_signal = True if chop_signal else False
    self.__chop_background = True if chop_background else False
    ## at least one of the two samples must be chopped
    assert self.__chop_signal or self.__chop_background, "Neither signal nor background chopping"

    ## store the configuration
    self.__category = category
    self.__N = N
    self.__signal = signal
    self.__background = background
    self.__methods = tuple(methods)
    self.__signal_weight = signal_weight
    self.__signal_cuts = ROOT.TCut(signal_cuts)
    self.__background_weight = background_weight
    self.__background_cuts = ROOT.TCut(background_cuts)
    self.__variables = tuple(variables)
    self.__spectators = tuple(spectators)
    self.__bookingoptions = bookingoptions
    self.__configuration = configuration
    self.__name = name
    self.__verbose = True if verbose else False
    self.__sig_histos = ()
    self.__bkg_histos = ()

    ## category index formula: (category) modulo N
    cat = '(%s)%%%d' % (self.category, self.N)

    ## inspect the category population of the signal sample
    if self.chop_signal:
        hs1 = ROOT.TH1F(hID(), 'Signal categories', self.N * 5, -0.5, self.N - 1)
        hs2 = h1_axis([-0.5 + i for i in range(self.N + 1)], title=hs1.GetTitle())
        self.signal.project(hs1, cat, self.signal_cuts)
        self.signal.project(hs2, cat, self.signal_cuts)
        self.__sig_histos = hs1, hs2
        st = hs2.stat()
        if 0 >= st.min():
            logger.warning("Some signal categories are empty!")
        logger.info('Signal category population mean/rms: %s/%s' % (st.mean(), st.rms()))

    ## inspect the category population of the background sample
    if self.chop_background:
        hb1 = ROOT.TH1F(hID(), 'Background categories', self.N * 5, -0.5, self.N - 1)
        hb2 = h1_axis([-0.5 + i for i in range(self.N + 1)], title=hb1.GetTitle())
        self.background.project(hb1, cat, self.background_cuts)
        self.background.project(hb2, cat, self.background_cuts)
        self.__bkg_histos = hb1, hb2
        ##
        st = hb2.stat()
        if 0 >= st.min():
            logger.warning("Some background categories are empty!")
        logger.info('Background category population mean/rms: %s/%s' % (st.mean(), st.rms()))

    ## trick to please Kisa
    from ostap.trees.trees import Chain
    self.__signal = Chain(signal)
    self.__background = Chain(background)

    ## book the trainers
    self.__trainers = ()
    self.__weights_files = []
    self.__class_files = []
    self.__output_files = []
    self.__tar_file = None
    self.__log_file = None
def reduce(chain, selection={}, save_vars=(), new_vars={}, no_vars=(), output='', name='', addselvars=False, silent=False, **kwargs):
    """ Parallel reduction/transformation of a loooong chain:
    the chain is split file-by-file, each piece is filtered/transformed
    via ``ReduceTree`` and the results are merged back.
    >>> chain   = ...
    >>> reduced = chain.reduce ( 'pt>1' , new_vars = { 'pt2' : 'pt*pt' } )
    (FIX: the previous docstring was copy-pasted from ``pprocess`` and
    described the wrong function)
    """
    from ostap.trees.trees import Chain
    from ostap.frames.tree_reduce import ReduceTree

    ## single-file chain: nothing to parallelise, use the sequential version
    if isinstance(chain, ROOT.TChain) and 1 >= len(chain.files()):
        return chain.reduce(selection=selection, save_vars=save_vars,
                            new_vars=new_vars, no_vars=no_vars,
                            output=output, name=name,
                            addselvars=addselvars, silent=silent)

    ## remember the initial (branches x entries) size for the gain report
    nb0 = len(chain.branches())
    ne0 = len(chain)

    ch = Chain(chain)
    task = ReduceTask(selection=selection, save_vars=save_vars,
                      new_vars=new_vars, addselvars=addselvars, name=name)
    wmgr = WorkManager(silent=silent, **kwargs)
    trees = ch.split(max_files=1)  ## one file per job
    wmgr.process(task, trees)

    result, table = task.results()
    ## intermediate files are temporary: schedule them for removal
    for i in result.files:
        result.trash.add(i)

    if output:
        ## merge results into single output file
        reduced = ReduceTree(result.chain, selection='', save_vars=(),
                             addselvars=False, silent=True,
                             output=output, name=name)
        result = Chain(reduced.chain)

    if not silent:
        from ostap.frames.frames import report_print_table
        title = 'Tree -> Frame -> Tree filter/transformation'
        logger.info('Reduce tree:\n%s' % report_print_table(table, title, '# '))
        nb = len(result.chain.branches())
        ne = len(result.chain)
        ## guard against an empty result (avoid ZeroDivisionError)
        f = float(nb0 * ne0) / (nb * ne) if nb and ne else float('inf')
        logger.info('reduce: (%dx%d) -> (%dx%d) %.1f (branches x entries) '
                    % (nb0, ne0, nb, ne, f))

    return result