def customInit(self, initVars): self.n_excluded = 0 self.n_kept = 0 self.n_skipped = 0 self.sample = initVars['sample'] self.config = initVars['config'] if self.sample.identifier in self.applyToSamples: self.excludedEvents = {} excludedSampleTree = SampleTree([self.excludeTreeFileName], config=self.config) excludedSampleTree.enableBranches(['run','event']) print "INFO: loading list of events to filter" n_events = 0 for ev in excludedSampleTree: if ev.run not in self.excludedEvents: self.excludedEvents[ev.run] = {} if ev.event not in self.excludedEvents[ev.run]: self.excludedEvents[ev.run][ev.event] = 0 self.excludedEvents[ev.run][ev.event] += 1 if self.excludedEvents[ev.run][ev.event]==1: n_events += 1 intrinsicDuplicates = sum([[[event,run,count] for event,count in self.excludedEvents[run].items() if count > 1] for run in self.excludedEvents.keys()], []) print "INFO: done => ", n_events, "distinct events will be filtered out of", self.applyToSamples if len(intrinsicDuplicates) > 0: print "INFO: the event list provided contains",len(intrinsicDuplicates),"duplicates itself!" else: print "INFO: event number filter disdable for this sample"
def customInit(self, initVars): self.n_excluded = 0 self.n_kept = 0 self.n_skipped = 0 self.sample = initVars['sample'] self.config = initVars['config'] if self.sample.identifier in self.applyToSamples: self.excludedEvents = {} excludedSampleTree = SampleTree([self.excludeTreeFileName], config=self.config) excludedSampleTree.enableBranches(['run', 'event']) print "INFO: loading list of events to filter" n_events = 0 for ev in excludedSampleTree: if ev.run not in self.excludedEvents: self.excludedEvents[ev.run] = {} if ev.event not in self.excludedEvents[ev.run]: self.excludedEvents[ev.run][ev.event] = 0 self.excludedEvents[ev.run][ev.event] += 1 if self.excludedEvents[ev.run][ev.event] == 1: n_events += 1 intrinsicDuplicates = sum( [[[event, run, count] for event, count in self.excludedEvents[run].items() if count > 1] for run in self.excludedEvents.keys()], []) print "INFO: done => ", n_events, "distinct events will be filtered out of", self.applyToSamples if len(intrinsicDuplicates) > 0: print "INFO: the event list provided contains", len( intrinsicDuplicates), "duplicates itself!" else: print "INFO: event number filter disdable for this sample"
def run(self):
    """Write all events of the input files to one skim file and copy it to self.pathOUT.

    The output is first written to self.tmpDir and afterwards copied to the final
    destination with self.fileLocator. File names have the form
    skim_<channel>_<region>_<yymmdd>[_tmp].root. If the config has the option
    'controlSample' in section 'Plot_general', an integer branch 'controlSample'
    is added whose value is looked up by region (-1 if the region is not listed).

    Copy problems are reported on stdout only, no exception is raised for them.
    """
    if self.config.has_option('Configuration', 'channel'):
        name = self.config.get('Configuration', 'channel')
    else:
        name = '_'
    timestamp = datetime.datetime.now().strftime("%y%m%d")

    # FIX: the region is an attribute of this object (it is read as self.region
    # below); the bare name `region` is not defined inside this method.
    baseName = '/skim_' + name + '_' + self.region + '_' + timestamp
    tmpName = self.tmpDir + baseName + '_tmp.root'
    destName = self.pathOUT + baseName + '.root'

    sampleTree = SampleTree(self.fileNames, config=self.config)

    if self.config.has_option('Plot_general', 'controlSample'):
        # NOTE(review): eval() of a config value, only safe for trusted config files
        controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
        controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
        sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
        print("INFO: setting controlSample to", controlSample)

    sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
    sampleTree.process()

    # copy to final destination
    if sampleTree.getNumberOfOutputTrees() > 0:
        try:
            self.fileLocator.cp(tmpName, destName, force=True)
            print('copy ', tmpName, destName)

            if not self.fileLocator.isValidRootFile(destName):
                print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
            else:
                # the temporary file is only removed if the copy is valid
                try:
                    self.fileLocator.rm(tmpName)
                except Exception as e:
                    print(e)
        except Exception as e:
            print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def run(self):
    """Merge the input files of one sample into a single file and move it to its final place.

    Depending on self.useChain the merge is done either with a SampleTree (which
    also allows dropping the branches listed in config option
    General/remove_branches) or by running the command built from
    self.commandTemplate. The merged file is written to a temporary location,
    copied to self.getOutputFileName() and the temporary file is removed.

    Raises:
        Exception("HaddError"): the merge step did not return 0.
        Exception("FileCopyError"): copying to the final destination failed.
    """
    inputFileNames = []
    for fileName in self.fileNames:
        inputFileNames.append("{path}/{sample}/{fileName}".format(
            path=self.config.get('Directories', 'HADDin'),
            sample=self.sampleIdentifier,
            fileName=self.fileLocator.getFilenameAfterPrep(fileName)))

    outputFileName = self.getTemporaryFileName()
    # everything in front of the last '/' is the folder of the file
    self.fileLocator.makedirs(outputFileName.rpartition('/')[0])

    forceFlag = "-f" if self.force else ""
    command = self.commandTemplate.format(output=outputFileName, inputs=' '.join(inputFileNames), f=forceFlag)
    if self.debug:
        print("DEBUG: run \x1b[34m", command, "\x1b[0m")

    if not self.useChain:
        # standard hadd
        result = self.fileLocator.runCommand(command)
        print("INFO: hadd returned ", result)
    else:
        # use sampleTree class (can e.g. drop branches at the same time)
        sampleTree = SampleTree(inputFileNames, config=self.config)
        try:
            for removeBranch in eval(self.config.get('General', 'remove_branches')):
                sampleTree.addBranchToBlacklist(removeBranch)
                print("DEBUG: disable branch ", removeBranch)
        except Exception as e:
            print("DEBUG: could not disable branch:", e)
        sampleTree.addOutputTree(outputFileName, cut='1', branches='*')
        sampleTree.process()
        result = 0

    if result != 0:
        raise Exception("HaddError")

    finalOutputFileName = self.getOutputFileName()
    print("move file to final destination: \x1b[34m", finalOutputFileName, "\x1b[0m")
    self.fileLocator.makedirs(finalOutputFileName.rpartition('/')[0])

    copied = self.fileLocator.cp(outputFileName, finalOutputFileName, self.force)
    if not copied:
        print("\x1b[31mERROR: copy failed\n from:", outputFileName, "\n to:", finalOutputFileName, "\n force:", self.force, "\x1b[0m")
        raise Exception("FileCopyError")

    # try to delete temporary file
    try:
        self.fileLocator.rm(outputFileName)
    except Exception as e:
        print("ERROR: could not delete temporary file:", outputFileName, " => ", e)
    print("INFO: done.")
def test_MultiOutput(self):
    """Skim the test tree with many cuts at once and cross-check the number of entries.

    For every cut one output tree is written in a single call of process(). The
    number of entries of each written tree has to agree with the number of
    entries TTree::Draw selects with the same cut on the input tree.
    """
    sampleTree = self.getTree()
    scratch = TestSampleTreeMethods.scratchDirectory

    # define some random cuts
    cuts = [
        "nJet==5&&Sum$(Jet)>500",
        "nJet==6&&Sum$(Jet)>600",
        "nJet==7&&Sum$(Jet)>700",
        "nJet==8&&Sum$(Jet)>800",
        "nJet==9 && Sum$(Jet)>800 && a<0 && (b>30 || b > 50)",
        "nJet==9 && Sum$(Jet)>800 && (a<0 && (b>30 || b > 50)) || (a>0 && (b>10 || b > 90)) || (a>0.8 && (b>5 || b > 50))",
    ]

    # add some more random cuts: 5 cuts, each the OR of 50 random terms
    for j in range(5):
        orTerms = []
        for i in range(50):
            orTerms.append("(a<%f && (b>%f || c > %f))"%(random.gauss(0,0.5), random.uniform(0,50), random.uniform(0,2)))
        cuts.append('||'.join(orTerms))

    skimFileNames = [scratch + '/tree_skimmed_%d.root'%i for i in range(len(cuts))]

    # write skimmed subtrees to file
    for skimFileName, cut in zip(skimFileNames, cuts):
        sampleTree.addOutputTree(skimFileName, cut, '')
    sampleTree.process()

    # load subtrees and count events
    skimmedTrees = [SampleTree([skimFileName]) for skimFileName in skimFileNames]
    resultsMethodA = [skimmedTree.tree.GetEntries() for skimmedTree in skimmedTrees]

    # count directly
    resultsMethodB = [sampleTree.tree.Draw("a", cut, "goff") for cut in cuts]

    print(resultsMethodA)
    print(resultsMethodB)
    self.assertTrue(all(nA == nB for nA, nB in zip(resultsMethodA, resultsMethodB)))
def test_SampleTree_Callback_1(self):
    """Write several skims with callbacks attached and compare with the event callback count.

    The number of events counted in self.nEventsFound has to be equal to the
    number of entries of the first output tree and has to be larger than 0.
    """
    sampleTree = self.getTree()
    scratch = TestSampleTreeCallbacksMethods.scratchDirectory

    # define some random cuts
    cuts = [
        "b>444.4&&b<444.5",
        "nJet==5&&Sum$(Jet)>500",
        "nJet==6&&Sum$(Jet)>600",
        "nJet==7&&Sum$(Jet)>700",
        "nJet==8&&Sum$(Jet)>800",
    ]

    # write skimmed subtrees to file
    for i, cut in enumerate(cuts):
        # a new dict per output tree, as each tree gets its own callbacks argument
        treeCallbacks = {
            'beforeLoop': self.callback_before_loop,
            'afterWrite': self.callback_after_write,
        }
        sampleTree.addOutputTree(
            outputFileName=scratch + '/tree_test_%d.root'%i,
            cut=cut,
            callbacks=treeCallbacks,
            branches='*',
        )
    sampleTree.setCallback('event', self.event_callback)
    sampleTree.process()

    # check output
    firstSkim = SampleTree([scratch + '/tree_test_0.root'])
    nEntriesFirstSkim = firstSkim.tree.GetEntries()

    print(firstSkim.tree)
    print("events which triggered callback:", self.nEventsFound)
    print("events in tree 0:", nEntriesFirstSkim)

    self.assertEqual(self.nEventsFound, nEntriesFirstSkim)
    self.assertTrue(self.nEventsFound > 0)
def getEventCount(config, sampleIdentifier, cut="1"): sampleTree = SampleTree( { 'name': sampleIdentifier, 'folder': config.get('Directories', 'PREPout').strip() }, config=config) nEvents = sampleTree.tree.Draw("1", cut, "goff") print sampleIdentifier, " =>", nEvents return nEvents
def getEventCount(config, sampleIdentifier, cut="1", sampleTree=None, sample=None):
    """Return the (unweighted) number of entries which pass the cut.

    If no sampleTree is passed, it is created from the folder given by the config
    option Directories/<args.fromFolder>; `args` is expected to exist at module
    level. The return value is what TTree::Draw returns for the cut.
    """
    if not sampleTree:
        folder = config.get('Directories', args.fromFolder).strip()
        sampleTree = SampleTree({'name': sampleIdentifier, 'folder': folder}, config=config)

    # one-bin histogram which receives the selected entries ("1>>h1")
    countHistogram = ROOT.TH1D("h1", "h1", 1, 0, 2)

    # the scale and the bin content are not used for the returned number, the
    # calls are kept so that the sequence of calls stays the same
    sampleTree.getScale(sample)
    nEvents = sampleTree.tree.Draw("1>>h1", cut, "goff")
    countHistogram.GetBinContent(1)

    countHistogram.Delete()
    return nEvents
def getEventCount(config, sampleIdentifier, cut="1"): sysOut = config.get('Directories','SYSout').strip() t3proto = 'root://t3dcachedb.psi.ch:1094' sysOutMountedPath = sysOut.replace(t3proto,'').replace('root://t3dcachedb03.psi.ch:1094','') fileMask = "{path}/{sample}/{tree}.root".format(path=sysOutMountedPath, sample=sampleIdentifier, tree='*') sampleFiles = [t3proto + x for x in glob.glob(fileMask)] sampleTree = SampleTree(sampleFiles, config=config) nEvents = sampleTree.tree.Draw("1", cut, "goff") print sampleIdentifier,"(",len(sampleFiles),"files) =>",nEvents return nEvents
def test_TreeCutDict(self):
    """Skim with a nested AND/OR cut dictionary and compare with the flattened cut string.

    The number of entries written with cutSequenceMode='TREE' has to be equal to
    the number of entries TTree::Draw selects with the equivalent cut string.
    """
    def flattenDict(cutDict):
        # strings are leaves of the tree and are returned unchanged
        if type(cutDict) is str:
            return cutDict
        if type(cutDict) is dict:
            # 'OR' is checked before 'AND'
            for key, operator in (('OR', '||'), ('AND', '&&')):
                if key in cutDict:
                    return operator.join(['(%s)'%flattenDict(x) for x in cutDict[key]])
            raise Exception('BadTreeTypeCutDict')

    sampleTree = self.getTree()
    outputFileName = TestSampleTreeMethods.scratchDirectory + '/tree_dummy.root'

    highMultiplicityCut = {'AND': ['nJet>8', 'a>0', 'a>c']}
    tightCut = {'AND': ['a>0.9', 'b>80', 'c>0.9']}
    cutDict = {
        'OR': [
            highMultiplicityCut,
            'nJet==6&&Sum$(Jet)>600',
            'c>1',
            {'OR': ['Sum$(Jet)>1200', 'a>0.8', 'b>80', tightCut]},
        ]
    }
    cutFlat = flattenDict(cutDict)
    print("flat:", cutFlat)

    sampleTree.addOutputTree(outputFileName, cutDict, cutSequenceMode='TREE')
    sampleTree.process()

    # count number of entries written to output file
    resultsMethodA = SampleTree([outputFileName]).tree.GetEntries()
    resultsMethodB = sampleTree.tree.Draw("a", cutFlat, "goff")
    self.assertEqual(resultsMethodA, resultsMethodB)
def run(self):
    """Skim all input files into one tree and copy the result to the output folder.

    The tree is written to a temporary file in self.tmpDir first and then copied
    to self.pathOUT. Both file names contain the channel (config option
    Configuration/channel, '_' if not set), the region and the date (yymmdd).
    An integer branch 'controlSample' is added if the config option
    Plot_general/controlSample exists; its value is taken from that dictionary
    with the region as key, or -1 if the region has no entry.

    Failures of the copy are only printed, they do not raise.
    """
    name = '_'
    if self.config.has_option('Configuration', 'channel'):
        name = self.config.get('Configuration', 'channel')
    timestamp = datetime.datetime.now().strftime("%y%m%d")

    # FIX: use self.region, the bare name `region` is not defined in this method
    # (the same attribute is used for the controlSample lookup below).
    skimName = 'skim_' + name + '_' + self.region + '_' + timestamp
    tmpName = self.tmpDir + '/' + skimName + '_tmp.root'
    destName = self.pathOUT + '/' + skimName + '.root'

    sampleTree = SampleTree(self.fileNames, config=self.config)

    if self.config.has_option('Plot_general', 'controlSample'):
        # NOTE(review): eval() of a config value, only safe for trusted config files
        controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
        if self.region in controlSampleDict:
            controlSample = controlSampleDict[self.region]
        else:
            controlSample = -1
        sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
        print("INFO: setting controlSample to", controlSample)

    sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
    sampleTree.process()

    # nothing to copy if no output tree has been produced
    if sampleTree.getNumberOfOutputTrees() <= 0:
        return

    # copy to final destination
    try:
        self.fileLocator.cp(tmpName, destName, force=True)
        print('copy ', tmpName, destName)

        if not self.fileLocator.isValidRootFile(destName):
            print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
        else:
            # keep the temporary file unless the copy has been verified
            try:
                self.fileLocator.rm(tmpName)
            except Exception as e:
                print(e)
    except Exception as e:
        print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def run(self):
    """Merge all input files of the sample and move the merged file to the output location.

    With self.useChain set, the files are merged by processing them with a
    SampleTree; the branches listed in config option General/remove_branches are
    dropped if that option can be read. Otherwise the command built from
    self.commandTemplate is executed.

    Raises:
        Exception("HaddError"): merging did not return 0.
        Exception("FileCopyError"): the copy to the final destination failed.
    """
    def folderOf(fileName):
        # part of the file name in front of the last '/'
        return fileName.rpartition('/')[0]

    inputFileNames = []
    for fileName in self.fileNames:
        inputFileNames.append("{path}/{sample}/{fileName}".format(
            path=self.config.get('Directories','HADDin'),
            sample=self.sampleIdentifier,
            fileName=self.fileLocator.getFilenameAfterPrep(fileName)))

    outputFileName = self.getTemporaryFileName()
    self.fileLocator.makedirs(folderOf(outputFileName))

    command = self.commandTemplate.format(
        output=outputFileName,
        inputs=' '.join(inputFileNames),
        f="-f" if self.force else "")
    if self.debug:
        print("DEBUG: run \x1b[34m", command, "\x1b[0m")

    if self.useChain:
        # use sampleTree class (can e.g. drop branches at the same time)
        chain = SampleTree(inputFileNames, config=self.config)
        try:
            branchesToDrop = eval(self.config.get('General', 'remove_branches'))
            for branchName in branchesToDrop:
                chain.addBranchToBlacklist(branchName)
                print("DEBUG: disable branch ", branchName)
        except Exception as e:
            print("DEBUG: could not disable branch:", e)
        chain.addOutputTree(outputFileName, cut='1', branches='*')
        chain.process()
        result = 0
    else:
        # standard hadd
        result = self.fileLocator.runCommand(command)
        print("INFO: hadd returned ", result)

    if result != 0:
        raise Exception("HaddError")

    finalOutputFileName = self.getOutputFileName()
    print("move file to final destination: \x1b[34m", finalOutputFileName, "\x1b[0m")
    self.fileLocator.makedirs(folderOf(finalOutputFileName))

    if not self.fileLocator.cp(outputFileName, finalOutputFileName, self.force):
        print("\x1b[31mERROR: copy failed\n from:", outputFileName, "\n to:", finalOutputFileName, "\n force:", self.force, "\x1b[0m")
        raise Exception("FileCopyError")

    # try to delete temporary file
    try:
        self.fileLocator.rm(outputFileName)
    except Exception as e:
        print("ERROR: could not delete temporary file:", outputFileName, " => ", e)
    print("INFO: done.")
def run(self): nFilesProcessed = 0 nFilesFailed = 0 for subJob in self.subJobs: # only process if output is non-existing/broken or --force was used if self.opts.force or not self.fileLocator.isValidRootFile(subJob['outputFileName']): # create directories outputFolder = '/'.join(subJob['outputFileName'].split('/')[:-1]) tmpFolder = '/'.join(subJob['tmpFileName'].split('/')[:-1]) self.fileLocator.makedirs(outputFolder) self.fileLocator.makedirs(tmpFolder) # load sample tree sampleTree = SampleTree(subJob['localInputFileNames'], config=self.config) if not sampleTree.tree: print "trying fallback...", len(subJob['inputFileNames']) if len(subJob['inputFileNames']) == 1: # try original naming scheme if reading directly from Heppy/Nano ntuples (without prep) fileNameOriginal = self.pathIN + '/' + subJob['inputFileNames'][0] print "FO:", fileNameOriginal xrootdRedirector = self.fileLocator.getRedirector(fileNameOriginal) sampleTree = SampleTree([fileNameOriginal], config=self.config, xrootdRedirector=xrootdRedirector) if not sampleTree.tree: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m" nFilesFailed += 1 continue else: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED! (old naming scheme not supported for joining multipel files)\x1b[0m" nFilesFailed += 1 continue # to use this syntax, use "--addCollections Sys.Vtype" for a config file entry like this: # [Sys] # Vtype = VtypeCorrector.VtypeCorrector(channel='Zll') # (instead of passing the tree in the constructor, the setTree method can be used) pyModules = [] for collection in self.collections: if '.' 
in collection: section = collection.split('.')[0] key = collection.split('.')[1] pyCode = self.config.get(section, key) # import module from myutils moduleName = pyCode.split('(')[0].split('.')[0].strip() if self.debug: print "DEBUG: import module:", moduleName print("\x1b[33mDEBUG: " + collection + ": run PYTHON code:\n"+pyCode+"\x1b[0m") globals()[moduleName] = importlib.import_module(".{module}".format(module=moduleName), package="myutils") # get object wObject = eval(pyCode) # pass the tree and other variables if needed to finalize initialization if hasattr(wObject, "customInit") and callable(getattr(wObject, "customInit")): wObject.customInit({'config': self.config, 'sampleTree': sampleTree, 'tree': sampleTree.tree, 'sample': self.sample, 'channel': self.channel, 'pathIN': self.pathIN, 'pathOUT': self.pathOUT, }) # add callbacks if the objects provides any if hasattr(wObject, "processEvent") and callable(getattr(wObject, "processEvent")): sampleTree.addCallback('event', wObject.processEvent) # add branches if hasattr(wObject, "getBranches") and callable(getattr(wObject, "getBranches")): sampleTree.addOutputBranches(wObject.getBranches()) pyModules.append(wObject) # DEPRECATED, do not use anymore ---> use BranchTools.TreeFormulas() if 'addbranches' in self.collections: writeNewVariables = eval(self.config.get("Regression", "writeNewVariablesDict")) sampleTree.addOutputBranches(writeNewVariables) # DEPRECATED, do not use anymore ---> use BranchTools.Drop() if 'removebranches' in self.collections: bl_branch = eval(config.get('Branches', 'useless_branch')) for br in bl_branch: sampleTree.addBranchToBlacklist(br) bl_branch = eval(config.get('Branches', 'useless_after_sys')) for br in bl_branch: sampleTree.addBranchToBlacklist(br) # define output file sampleTree.addOutputTree(subJob['tmpFileName'], cut='1', branches='*', friend=self.opts.friend) # run processing for pyModule in pyModules: if hasattr(pyModule, "beforeProcessing"): getattr(pyModule, 
"beforeProcessing")() sampleTree.process() for pyModule in pyModules: if hasattr(pyModule, "afterProcessing"): getattr(pyModule, "afterProcessing")() # if output trees have been produced: copy temporary file to output folder if sampleTree.getNumberOfOutputTrees() > 0: try: self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True) print 'copy ', subJob['tmpFileName'], subJob['outputFileName'] if self.verifyCopy: if not self.fileLocator.isValidRootFile(subJob['outputFileName']): print 'INFO: output at final destination broken, try to copy again from scratch disk to final destination...' self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True) print 'INFO: second attempt copy done!' if not self.fileLocator.isValidRootFile(subJob['outputFileName']): print '\x1b[31mERROR: output still broken!\x1b[0m' nFilesFailed += 1 raise Exception("FileCopyError") else: print 'INFO: file is good after second attempt!' except Exception as e: print e print "\x1b[31mERROR: copy from scratch to final destination failed!!\x1b[0m" # delete temporary file try: self.fileLocator.rm(subJob['tmpFileName']) except Exception as e: print e print "WARNING: could not delete file on scratch!" # clean up if hasattr(wObject, "cleanUp") and callable(getattr(wObject, "cleanUp")): getattr(wObject, "cleanUp")() else: print 'SKIP:', subJob['inputFileNames'] if nFilesFailed > 0: raise Exception("ProcessingIncomplete")
class CachePlot(object):
    """Create the cached (skimmed) trees of one sample for a list of plotting regions.

    For every sub-sample which comes from the files of `sampleIdentifier` and for
    every region, a TreeCache is defined with the cuts of the sub-sample and the
    region. All caches which do not exist yet (or all, if forceRedo is set) are
    attached to one SampleTree, which is then processed once.
    """

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read the sample definitions and region cuts from the config.

        Args:
            config: config parser object; sections Directories, Plot_general, Cuts
                and VHbbNameSpace are read here.
            sampleIdentifier: identifier of the sample to cache.
            regions: names of the plotting regions (duplicates are removed); each
                has to be an option in section 'Cuts'.
            splitFilesChunks, chunkNumber, splitFilesChunkSize: passed on to
                TreeCache / SampleTree.
            forceRedo: if True, existing cached files of this chunk are deleted
                and produced again.
            fileList: compressed file list (decoded with FileList.decompress) or
                None.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): eval() of config values, only safe for trusted config files
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)

        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        # a failure to load the library is reported but is not fatal here
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

    def printInfo(self):
        """Print the name and the cut of every region."""
        print("REGION:".ljust(24), "CUT:")
        for region, regionInfo in self.regionsDict.iteritems():
            print(" > ", region.ljust(20), regionInfo['cut'])

    def run(self):
        """Define the caches of all sub-samples and regions and produce the missing ones.

        Raises:
            Exception("CreationOfSampleTreeFailed"): the input tree could not be loaded.
            Exception("SampleFilesHaveChanged"): the files of this chunk differ from
                the file list passed to the constructor.
        """
        # keep additional branches for plotting
        # (missing or unreadable options are skipped on purpose)
        try:
            keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            keepBranchesPlot = []

        try:
            keepBranchesPlot += eval(self.config.get('Branches', 'keep_branches'))
        except Exception:
            pass

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                    keepBranchesPlot.append(self.config.get(section, 'relPath'))
        except Exception as e:
            print(e)

        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            pass

        # plotting region cut
        for region, regionInfo in self.regionsDict.iteritems():
            keepBranchesPlot.append(regionInfo['cut'])

        keepBranchesPlotFinal = BranchList(keepBranchesPlot).getListOfBranches()

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            for sample in subsamples:

                # add cuts for all training regions
                for region, regionInfo in self.regionsDict.iteritems():

                    configSection = 'Plot:%s' % region

                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general', 'addBlindingCut'):
                        # FIX: append the cut itself; has_option() returns a boolean,
                        # which was added to the list of cuts before
                        sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {'name': sample.identifier, 'folder': self.samplesPath},
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                                raise Exception("CreationOfSampleTreeFailed")

                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                                print("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print("INFO: already cached!", tc, "(", tc.hash, ")")

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
for sampleGroup in sampleGroups: count = 0 for sampleIdentifier in sampleGroup: print "\x1b[32m", sampleIdentifier, "\x1b[0m" countDict[sampleIdentifier] = {} samples_matching = [ x for x in mcSamples if x.identifier == sampleIdentifier ] if len(samples_matching) > 0: sample = samples_matching[0] sampleTree = SampleTree( { 'sample': sample, 'folder': config.get('Directories', args.fromFolder).strip() }, config=config) print "CUT=", sampleCuts, ":" for sampleCut in sampleCuts: sampleCount = getEventCount(config, sampleIdentifier, sampleCut, sampleTree=sampleTree, sample=sample) print sampleIdentifier, sampleCut, "\x1b[34m=>", sampleCount, "\x1b[0m" if sampleCut in countDict[sampleIdentifier]: print "duplicate!!", sampleIdentifier, sampleCut, countDict[ sampleIdentifier][sampleCut] raise Exception("duplicate")
print 'collections to add:', collections for fileName in filelist: localFileName = fileLocator.getFilenameAfterPrep(fileName) inputFileName = "{path}/{subfolder}/{filename}".format(path=pathIN, subfolder=sample.identifier, filename=localFileName) outputFileName = "{path}/{subfolder}/{filename}".format(path=pathOUT, subfolder=sample.identifier, filename=localFileName) tmpFileName = "{path}/{subfolder}/{filename}".format(path=tmpDir, subfolder=sample.identifier, filename=localFileName) outputFolder = '/'.join(outputFileName.split('/')[:-1]) tmpFolder = '/'.join(tmpFileName.split('/')[:-1]) fileLocator.makedirs(tmpFolder) fileLocator.makedirs(outputFolder) if opts.force or not fileLocator.isValidRootFile(outputFileName): # load sample tree and initialize vtype corrector sampleTree = SampleTree([inputFileName], config=config) if not sampleTree.tree: # try original naming scheme if reading directly from Heppy/Nano ntuples (without prep) fileNameOriginal = pathIN + '/' + fileName print "FO:", fileNameOriginal xrootdRedirector = fileLocator.getRedirector(fileNameOriginal) sampleTree = SampleTree([fileNameOriginal], config=config, xrootdRedirector=xrootdRedirector) if not sampleTree.tree: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m" continue # lists of single modules can be given instead of a module, "--addCollections Sys.all" # [Sys] # all = ['Sys.Vtype', 'Sys.Leptons', ...] collectionsListsReplaced = [] for collection in collections:
def getTree(self):
    """Return a SampleTree of the 10 files tree_0.root ... tree_9.root in the scratch directory."""
    fileNames = []
    for i in range(10):
        fileNames.append(self.scratchDirectory + '/tree_%d.root' % i)
    return SampleTree(fileNames)
def prepare(self):
    """Define the tree caches for all datacard regions and sub-samples of the sample to cache.

    For each combination a TreeCache is created. Caches which do not exist yet
    (or all, if self.forceRedo is set) are connected to self.sampleTree and
    collected in self.treeCaches. Nothing is processed here.

    Returns:
        self, to allow chaining of calls.

    Raises:
        Exception("CreationOfSampleTreeFailed"): the input tree could not be loaded.
        Exception("SampleFilesHaveChanged"): the files of this chunk differ from
            self.fileList.
    """
    if len(self.dcMakers) < 1:
        print("WARNING: no datacard regions added, nothing to do.")
        return self

    self.treeCaches = []
    self.sampleTree = None

    # cuts
    allSamples = self.getAllSamples()
    subsamples = [x for x in allSamples if x.identifier == self.sampleToCache]

    # loop over all datacard regions
    for dcMaker in self.dcMakers:

        # loop over all subsamples (which come from the same root tree files)
        for sample in subsamples:

            # combine subcut and systematics cut with logical AND
            # systematics cuts are combined with logical OR, such that 1 cache file can be used for all the systematics
            isData = (sample.type == 'DATA')
            systematicsCuts = sorted({x['cachecut'] for x in dcMaker.getSystematicsList(isData=isData)})
            sampleCuts = {'AND': [sample.subcut, {'OR': systematicsCuts}]}
            if self.verbose:
                print(json.dumps(sampleCuts, sort_keys=True, indent=8, default=str))

            # make list of branches to keep in root file
            branchList = BranchList(sample.subcut)
            for systematicsKey in ('cachecut', 'cut', 'var', 'weight'):
                branchList.addCut([x[systematicsKey] for x in dcMaker.getSystematicsList()])
            branchList.addCut(self.config.get('Weights', 'weightF'))
            branchList.addCut(eval(self.config.get('Branches', 'keep_branches')))
            branchesToKeep = branchList.getListOfBranches()

            # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
            cacheName = 'dc:{region}_{sample}'.format(region=dcMaker.getRegion(), sample=sample.name)

            # add cache object
            tc = TreeCache.TreeCache(
                name=cacheName,
                sample=sample.name,
                cutList=sampleCuts,
                cutSequenceMode='TREE',
                branches=branchesToKeep,
                inputFolder=dcMaker.path,
                splitFilesChunks=self.splitFilesChunks,
                chunkNumber=self.chunkNumber,
                splitFilesChunkSize=self.splitFilesChunkSize,
                fileList=self.fileList,
                config=self.config,
                debug=self.verbose)

            # check if this part of the sample is already cached
            isCached = tc.partIsCached()
            print("check if sample \x1b[34m{sample}\x1b[0m part {part} is cached:".format(sample=sample.name, part=self.chunkNumber), isCached)
            if isCached and not self.forceRedo:
                # up to date, nothing to produce for this cache
                continue

            if isCached:
                tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

            # for the first sample which comes from this files, load the tree
            if not self.sampleTree:
                self.sampleTree = SampleTree(
                    {'name': sample.identifier, 'folder': dcMaker.path},
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    chunkNumber=self.chunkNumber,
                    config=self.config,
                    saveMemory=True)
                if not self.sampleTree or not self.sampleTree.tree:
                    print("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                    raise Exception("CreationOfSampleTreeFailed")

                # consistency check on the file list at submission time and now
                fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                    print("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                    raise Exception("SampleFilesHaveChanged")

            # connect the TreeCache object to the input sampleTree and add it to the list of cached trees
            self.treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

    return self
def getTree(self, path):
    """Return a SampleTree of one fixed remote file.

    The argument `path` is not used, the file is always read via the xrootd URL
    below.
    """
    remoteFileName = 'root://xrootd-cms.infn.it//store/group/phys_higgs/hbb/ntuples/V25/TT_TuneCUETP8M2T4_13TeV-powheg-pythia8/VHBB_HEPPY_V25_TT_TuneCUETP8M2T4_13TeV-powheg-Py8__RunIISummer16MAv2-PUMoriond17_80r2as_2016_TrancheIV_v6-v1/170202_212737/0000/tree_100.root'
    return SampleTree([remoteFileName])
import sys
import os
from myutils.sampleTree import SampleTree

# input: file with one tree filename per line, e.g.
# /path/to/tree_1.root
# /path/to/tree_2.root

# output: txt file with json compatible list of [run, ls]
# [[304292, 29], [304663, 510], [302163, 561], ... ]

# FIX: the program name is now filled into the usage message (the %s was printed literally before)
print("usage: %s outputfile.txt inputfile.txt [redirector]" % sys.argv[0])

# FIX: check the number of arguments before they are accessed (IndexError before)
if len(sys.argv) < 3:
    sys.exit("ERROR: output file and input file are required!")

outputFileName = sys.argv[1]
inputFileName = sys.argv[2]
if not os.path.isfile(inputFileName):
    raise Exception("Input file not found!", inputFileName)

# the redirector is optional, an empty string is passed if it is not given
xrootdRedirector = sys.argv[3] if len(sys.argv) > 3 else ''
sampleTree = SampleTree(inputFileName, 'Events', xrootdRedirector=xrootdRedirector)

# only the two branches needed are read
sampleTree.tree.SetBranchStatus("*", 0)
sampleTree.tree.SetBranchStatus("run", 1)
sampleTree.tree.SetBranchStatus("luminosityBlock", 1)

# the dict keys are the distinct (run, luminosityBlock) pairs
runLumi = {}
for i in sampleTree:
    runLumi[(i.run, i.luminosityBlock)] = True

with open(outputFileName, 'w') as f:
    f.write("%r"%[list(x) for x in runLumi.keys()])
print("LENGTH NOT 1") #print("\x1b[41m\x1b[32m") # in case the distinction between subsamples is needed, one could access the cut definitions for the subsamples # with: subsample.subcut for subsample in subsamples #sample = sampleInfo.getFullSample(sampleIdentifier) #subsamples = sampleInfo.getSubsamples(sampleIdentifier) #print('subsample_pirmin', subsamples) #print('sample_pirmin', sample) #print("\x1b[41m\x1b[0m") sampleTree = SampleTree({ 'sample': sample, 'folder': directory }, config=config) #raw_input() # since we load all trees, we can compute the factor to scale cross section to luminosity directly (otherwise write it to ntuples # first and then use it as branch, or compute it with full set of trees before) scaleXStoLumi = sampleTree.getScale(sample) # enable only used branches! # this will speed up processing a lot sampleTree.enableBranches( BranchList([ signalRegionSelection, weightExpression_DeepCSV, weightExpression_DeepJet, taggerExpression_DeepCSV, taggerExpression_DeepJet
def run(self):
    """Create the plot caches for all subsamples of self.sampleIdentifier.

    The branches to keep are collected from the 'Branches' config section,
    the 'relPath' option of all 'plotDef:*' sections, the weight expression
    and the region cuts. For every (subsample, region) pair a TreeCache is
    set up; if at least one of them is not cached yet (or self.forceRedo is
    set) the shared SampleTree is processed once.

    Raises:
        Exception: "CreationOfSampleTreeFailed" if the SampleTree has no tree,
            "SampleFilesHaveChanged" if the files of this chunk differ from
            self.fileList.
    """
    # keep additional branches for plotting
    # NOTE(review): the option values are passed to eval(); only use trusted config files
    try:
        keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
    except Exception:
        keepBranchesPlot = []

    try:
        keepBranchesPlot += eval(self.config.get('Branches', 'keep_branches'))
    except Exception:
        # option is optional
        pass

    # also keep some branches which might be used later in variables definition and weights
    try:
        for section in self.config.sections():
            if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                keepBranchesPlot.append(self.config.get(section, 'relPath'))
    except Exception as e:
        print("\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m")
        print(e)

    try:
        keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
    except Exception:
        # weight expression is optional
        pass

    # plotting region cut
    for region,regionInfo in self.regionsDict.iteritems():
        keepBranchesPlot.append(regionInfo['cut'])

    keepBranchesPlotFinal = BranchList(keepBranchesPlot).getListOfBranches()
    print("KEEP:", keepBranchesPlotFinal)

    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print ('*'*80)
        print (' ',sampleToCache)
        print ('*'*80)
        # prepare caches for training and evaluation samples
        treeCaches = []

        # for all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [x for x in self.samples if x.identifier == sampleToCache]
        for sample in subsamples:

            # add cuts for all training regions
            for region,regionInfo in self.regionsDict.iteritems():

                configSection = 'Plot:%s'%region

                # cuts
                sampleCuts = [sample.subcut]
                if regionInfo['cut']:
                    sampleCuts.append(regionInfo['cut'])
                if self.config.has_option(configSection, 'Datacut'):
                    sampleCuts.append(self.config.get(configSection, 'Datacut'))
                if self.config.has_option('Plot_general','addBlindingCut'):
                    # append the cut expression itself; has_option() would only add a bool
                    sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))

                # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    branches=keepBranchesPlotFinal,
                    config=self.config,
                    debug=True
                )

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                if not isCached or self.forceRedo:
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from this files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                            print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                            raise Exception("SampleFilesHaveChanged")

                    treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                else:
                    print ("INFO: already cached!",tc, "(",tc.hash,")")

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print ("nothing to do!")
def getTree(self):
    """Return a SampleTree built from tree_0.root and tree_1.root in the scratch directory."""
    fileNames = []
    for index in range(2):
        fileNames.append(TestSampleTreeCallbacksMethods.scratchDirectory + '/tree_%d.root'%index)
    return SampleTree(fileNames)
config.set(mvaName, "checkpoint", checkpoint) config.set(mvaName, "branchName", branchName) config.set(mvaName, "nClasses", "%d"%nClasses) config.set(mvaName, "treeVarSet", "dnnVars") config.add_section("systematics") config.set("systematics", "systematics", " ".join(systematics)) config.add_section("dnnVars") for syst in systematics: config.set("dnnVars", syst, treeVarSet[syst]) # helper for fs operations fileLocator = FileLocator(config=config, xrootdRedirector=xrootdRedirector) fileLocator.mkdir(outputFolder) # load input files sampleTree = SampleTree([inputFile], treeName=inputTreeName, xrootdRedirector=xrootdRedirector) # load tensorflow evaluator tfe = tensorflowEvaluator.tensorflowEvaluator(mvaName) tfe.customInit({'config': config, 'sample': sample, 'sampleTree': sampleTree}) # register callbacks for processing sampleTree.addCallback('event', tfe.processEvent) # define new branches to add sampleTree.addOutputBranches(tfe.getBranches()) try: os.makedirs(outputFolder) except: pass
config.set(mvaName, "scalerDump", scalerDump) config.set(mvaName, "checkpoint", checkpoint) config.set(mvaName, "branchName", branchName) config.set(mvaName, "treeVarSet", "dnnVars") config.add_section("systematics") config.set("systematics", "systematics", " ".join(systematics)) config.add_section("dnnVars") for syst in systematics: config.set("dnnVars", syst, treeVarSet[syst]) # helper for fs operations fileLocator = FileLocator(config=config, xrootdRedirector=xrootdRedirector) fileLocator.mkdir(outputFolder) # load input files sampleTree = SampleTree([inputFile], treeName="tree", xrootdRedirector=xrootdRedirector) # load tensorflow evaluator tfe = tensorflowEvaluator.tensorflowEvaluator(mvaName) tfe.customInit({'config': config, 'sample': sample, 'sampleTree': sampleTree}) # register callbacks for processing sampleTree.addCallback('event', tfe.processEvent) # define new branches to add sampleTree.addOutputBranches(tfe.getBranches()) # define output file tmpFileName = scratch + '/' + inputFile.split('/')[-1] outputFileName = outputFolder + '/' + inputFile.split('/')[-1] sampleTree.addOutputTree(tmpFileName, cut='1', branches='*')
#parser.add_argument('-v', action='store', dest='variable', default='Max$(Jet_pt)', help='variable to compute the efficiency of (differentially)') #parser.add_argument('-r', action='store', dest='range', default='0,1000,100', help='min,max,nbins of variable to create histogram') parser.add_argument('-o', action='store', dest='output', default='trigeff.root', help='output .root file') #parser.add_argument('-l', action='store', dest='loose', default='((nJet>0)&&HLT_BIT_HLT_PFJet80_v)*HLT_BIT_HLT_PFJet80_v_Prescale', help='loose cut') #parser.add_argument('-t', action='store', dest='tight', default='((nJet>0)&&HLT_BIT_HLT_PFJet140_v)*HLT_BIT_HLT_PFJet140_v_Prescale', help='tight cut') args = parser.parse_args() limitTrees = int(args.limit) print('ARGS:', args) # read samples sampleTree = SampleTree(args.sample, limitFiles=limitTrees) if not sampleTree: print('creating sample tree failed!') exit(0) # trigger efficiency histograms triggerEfficiencyHistograms = [ { 'name': 'HLT_PFJet140', 'range': [0, 1000, 100], 'loose': '((nJet>0)&&HLT_BIT_HLT_PFJet80_v)*HLT_BIT_HLT_PFJet80_v_Prescale', 'tight': '((nJet>0)&&HLT_BIT_HLT_PFJet140_v)*HLT_BIT_HLT_PFJet140_v_Prescale', 'variable': 'Max$(Jet_pt)' },
print ("need exactly 1 sample identifier as input with -S !!", matchingSamples) exit(1) sample = matchingSamples[0] for fileName in filelist: localFileName = fileLocator.getFilenameAfterPrep(fileName) inputFileName = "{path}/{subfolder}/{filename}".format(path=INpath, subfolder=sample.identifier, filename=localFileName) outputFileName = "{path}/{subfolder}/{filename}".format(path=OUTpath, subfolder=sample.identifier, filename=localFileName) tmpFileName = "{path}/{subfolder}/{filename}".format(path=tmpDir, subfolder=sample.identifier, filename=localFileName) outputFolder = '/'.join(outputFileName.split('/')[:-1]) tmpFolder = '/'.join(tmpFileName.split('/')[:-1]) fileLocator.makedirs(tmpFolder) fileLocator.makedirs(outputFolder) if not fileLocator.isValidRootFile(outputFileName) or opts.force: # load sample tree sampleTree = SampleTree([inputFileName], config=config) if not sampleTree.tree: print ("\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m") continue # Set branch adress for all vars for i in range(0, len(theMVAs)): theMVAs[i].setVariables(sampleTree.tree, sample) mvaBranches = [] for i in range(0, len(theMVAs)): mvaBranches.append({ 'name': MVAinfos[i].MVAname, 'length': len(systematics.split()), 'formula': theMVAs[i].evaluate, 'leaflist': ':'.join(systematics.split())+'/F', # force 'srray-style' filling = passing the pointer to the array to the function instead of using the return value, even when the branch is a scalar, e.g. when only nominal systematic is selected 'arrayStyle': True,
class CachePlot(object):
    """Creates the cached trees needed for plotting one sample identifier in a set of regions."""

    def __init__(self, config, sampleIdentifier, regions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, forceRedo=False, fileList=None):
        """Read sample definitions and region cuts from the config and load the VHbb namespace library.

        Args:
            config: config object providing get/has_option/sections.
            sampleIdentifier: identifier of the sample whose subsamples are cached.
            regions: names of the regions; looked up as options in section 'Cuts'.
            splitFilesChunks, chunkNumber, splitFilesChunkSize: passed on to TreeCache/SampleTree.
            forceRedo: re-create caches even if they already exist.
            fileList: compressed file list (decoded with FileList.decompress) or None.
        """
        self.config = config
        self.sampleIdentifier = sampleIdentifier
        # remove duplicate region names
        self.regions = list(set(regions))
        self.forceRedo = forceRedo

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = self.config.get('Directories','samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): the option values are passed to eval(); only use trusted config files
        self.sampleNames = eval(self.config.get('Plot_general', 'samples'))
        self.dataNames = eval(self.config.get('Plot_general', 'Data'))
        self.samples = self.samplesInfo.get_samples(self.sampleNames + self.dataNames)

        # region name -> {'cut': cut expression}
        self.regionsDict = {}
        for region in self.regions:
            treeCut = config.get('Cuts', region)
            self.regionsDict[region] = {'cut': treeCut}
        self.splitFilesChunkSize = splitFilesChunkSize
        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.fileList = FileList.decompress(fileList) if fileList else None

        VHbbNameSpace=config.get('VHbbNameSpace','library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

    def printInfo(self):
        """Print a table of all regions and their cuts."""
        print ("REGION:".ljust(24),"CUT:")
        for region,regionInfo in self.regionsDict.iteritems():
            print (" > ",region.ljust(20), regionInfo['cut'])

    def run(self):
        """Create the plot caches for all subsamples of self.sampleIdentifier.

        For every (subsample, region) pair a TreeCache is set up; if at least
        one of them is not cached yet (or self.forceRedo is set) the shared
        SampleTree is processed once.

        Raises:
            Exception: "CreationOfSampleTreeFailed" if the SampleTree has no tree,
                "SampleFilesHaveChanged" if the files of this chunk differ from
                self.fileList.
        """
        # keep additional branches for plotting
        try:
            keepBranchesPlot = eval(self.config.get('Branches', 'keep_branches_plot'))
        except Exception:
            keepBranchesPlot = []

        # also keep some branches which might be used later in variables definition and weights
        try:
            for section in self.config.sections():
                if section.startswith('plotDef:') and self.config.has_option(section, 'relPath'):
                    keepBranchesPlot.append(self.config.get(section, 'relPath'))
        except Exception as e:
            print(e)
        try:
            keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
        except Exception:
            # weight expression is optional
            pass

        keepBranchesPlotFinal = BranchList(keepBranchesPlot).getListOfBranches()

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*'*80)
            print (' ',sampleToCache)
            print ('*'*80)
            # prepare caches for training and evaluation samples
            treeCaches = []

            # for all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]
            for sample in subsamples:

                # add cuts for all training regions
                for region,regionInfo in self.regionsDict.iteritems():

                    configSection = 'Plot:%s'%region

                    # cuts
                    sampleCuts = [sample.subcut]
                    if regionInfo['cut']:
                        sampleCuts.append(regionInfo['cut'])
                    if self.config.has_option(configSection, 'Datacut'):
                        sampleCuts.append(self.config.get(configSection, 'Datacut'))
                    if self.config.has_option('Plot_general','addBlindingCut'):
                        # append the cut expression itself; has_option() would only add a bool
                        sampleCuts.append(self.config.get('Plot_general', 'addBlindingCut'))

                    # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                    cacheName = 'plot:{region}_{sample}'.format(region=region, sample=sample.name)

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name=cacheName,
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        fileList=self.fileList,
                        branches=keepBranchesPlotFinal,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.forceRedo:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            if not self.sampleTree or not self.sampleTree.tree:
                                print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                                raise Exception("CreationOfSampleTreeFailed")
                            # consistency check on the file list at submission time and now
                            fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                            if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                                print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                                raise Exception("SampleFilesHaveChanged")

                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
                    else:
                        print ("INFO: already cached!",tc, "(",tc.hash,")")

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print ("nothing to do!")
#! /usr/bin/env python
from __future__ import print_function
import ROOT
ROOT.gROOT.SetBatch(True)
from myutils.XbbConfig import XbbConfigReader, XbbConfigTools
from myutils.sampleTree import SampleTree as SampleTree
from myutils.BranchList import BranchList

# Example: evaluate a few expressions on the first events of the 'MET' sample.
config = XbbConfigTools(XbbConfigReader.read("Zvv2017"))
sampleDefinition = {
    'name': 'MET',
    'folder': config.get('Directories', 'dcSamples'),
}
sampleTree = SampleTree(sampleDefinition, config=config)

variables = ["H_pt", "MET_Pt", "H_pt/MET_Pt"]

# switch on only the branches which the expressions need
usedBranches = BranchList(variables).getListOfBranches()
sampleTree.enableBranches(usedBranches)

# register one formula per expression
for variable in variables:
    sampleTree.addFormula(variable)

# event loop: print entry number and the value of every expression
for event in sampleTree:
    formatted = []
    for x in variables:
        formatted.append(x + "=%1.4f" % sampleTree.evaluate(x))
    print(sampleTree.tree.GetReadEntry(), ", ".join(formatted))
    # stop once the read entry exceeds 98
    if sampleTree.tree.GetReadEntry() > 98:
        break
#!/usr/bin/env python
import ROOT
from myutils.sampleTree import SampleTree

# Writes the list of distinct [run, luminosityBlock] pairs found in the sample
# to a text file, in order of first appearance.

#pp
#sampleTree = SampleTree('VHbbPostNano2017_V2_DoubleEG.txt', 'Events', xrootdRedirector='root://t3dcachedb03.psi.ch:1094/')
sampleTree = SampleTree('VHbbPostNano2017_V2_DoubleMuon.txt', 'Events', xrootdRedirector='root://t3dcachedb03.psi.ch:1094/')
outputFileName = 'existing_lumis_pp_DoubleMuon.txt'

#nano
#sampleTree = SampleTree('2017-94X-Nano01-DoubleEG.txt', 'Events', xrootdRedirector='root://xrootd-cms.infn.it/')
#outputFileName = 'existing_lumis_nano.txt'

# only the two branches needed to identify a lumi section are read
sampleTree.tree.SetBranchStatus("*", 0)
sampleTree.tree.SetBranchStatus("run", 1)
sampleTree.tree.SetBranchStatus("luminosityBlock", 1)

runLumi = []
# set of already seen pairs: constant-time membership test instead of
# scanning the whole list for every event
seenRunLumi = set()
for i in sampleTree:
    #if [i.run, i.luminosityBlock] not in runLumi and i.run>=302030 and i.run <= 303434:
    #    runLumi.append([i.run, i.luminosityBlock])
    key = (i.run, i.luminosityBlock)
    if key not in seenRunLumi:
        seenRunLumi.add(key)
        runLumi.append([i.run, i.luminosityBlock])

# single-argument form prints the same with the print statement and the print function
print(runLumi)

with open(outputFileName, 'w') as f:
    f.write("%r" % runLumi)
def run(self):
    """Create the plot caches for all subsamples of self.sampleIdentifier.

    The branches to keep are collected from the 'Branches' config section,
    the 'relPath' option of all 'plotDef:*' sections, the weight expression
    and the region cuts. For every (subsample, region) pair a TreeCache is
    set up; if at least one of them is not cached yet (or self.forceRedo is
    set) the shared SampleTree is processed once.

    Raises:
        Exception: "CreationOfSampleTreeFailed" if the SampleTree has no tree,
            "SampleFilesHaveChanged" if the files of this chunk differ from
            self.fileList.
    """
    # keep additional branches for plotting
    # NOTE(review): the option values are passed to eval(); only use trusted config files
    try:
        keepBranchesPlot = eval(
            self.config.get('Branches', 'keep_branches_plot'))
    except Exception:
        keepBranchesPlot = []

    try:
        keepBranchesPlot += eval(
            self.config.get('Branches', 'keep_branches'))
    except Exception:
        # option is optional
        pass

    # also keep some branches which might be used later in variables definition and weights
    try:
        for section in self.config.sections():
            # a broken section only produces a warning, the other sections are still used
            try:
                if section.startswith(
                        'plotDef:') and self.config.has_option(
                            section, 'relPath'):
                    keepBranchesPlot.append(
                        self.config.get(section, 'relPath'))
            except Exception as e:
                print("\x1b[31mWARNING: config error in:", section, "=>",
                      e, "\x1b[0m")
    except Exception as e2:
        print(
            "\x1b[31mERROR: config file contains an error! automatic selection of branches to keep will not work!\x1b[0m"
        )
        print(e2)

    try:
        keepBranchesPlot.append(self.config.get('Weights', 'weightF'))
    except Exception:
        # weight expression is optional
        pass

    # plotting region cut
    for region, regionInfo in self.regionsDict.iteritems():
        keepBranchesPlot.append(regionInfo['cut'])

    keepBranchesPlotFinal = BranchList(
        keepBranchesPlot).getListOfBranches()
    print("KEEP:", keepBranchesPlotFinal)

    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []

        # for all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [
            x for x in self.samples if x.identifier == sampleToCache
        ]
        for sample in subsamples:

            # add cuts for all training regions
            for region, regionInfo in self.regionsDict.iteritems():

                configSection = 'Plot:%s' % region

                # cuts
                sampleCuts = [sample.subcut]
                if regionInfo['cut']:
                    sampleCuts.append(regionInfo['cut'])
                if self.config.has_option(configSection, 'Datacut'):
                    sampleCuts.append(
                        self.config.get(configSection, 'Datacut'))
                if self.config.has_option('Plot_general',
                                          'addBlindingCut'):
                    # append the cut expression itself; has_option() would only add a bool
                    sampleCuts.append(
                        self.config.get('Plot_general',
                                        'addBlindingCut'))

                # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                cacheName = 'plot:{region}_{sample}'.format(
                    region=region, sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    inputFolder=self.samplesPath,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    branches=keepBranchesPlotFinal,
                    config=self.config,
                    debug=True)

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                if not isCached or self.forceRedo:
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from this files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree(
                            {
                                'name': sample.identifier,
                                'folder': self.samplesPath
                            },
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            chunkNumber=self.chunkNumber,
                            config=self.config,
                            saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print(
                                "\x1b[31mERROR: creation of sample tree failed!!\x1b[0m"
                            )
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(
                            self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) !=
                                              sorted(fileListNow)):
                            print(
                                "\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m"
                            )
                            raise Exception("SampleFilesHaveChanged")

                    treeCaches.append(
                        tc.setSampleTree(self.sampleTree).cache())
                else:
                    print("INFO: already cached!", tc, "(", tc.hash, ")")

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")
def run(self):
    """Create the training/evaluation caches for all subsamples of self.sampleIdentifier.

    For every (subsample, training region, TRAIN/EVAL cut) combination a
    TreeCache is set up; if at least one of them is not cached yet (or
    self.force is set) the shared SampleTree is processed once.

    Raises:
        Exception: "CreationOfSampleTreeFailed" if the SampleTree has no tree.
    """
    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print ('*'*80)
        print (' ',sampleToCache)
        print ('*'*80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [x for x in self.samples if x.identifier == sampleToCache]

        # list of branches to keep for use as MVA input variables
        branchListOfMVAVars = BranchList()
        for sample in subsamples:
            for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
                for weightVar in trainingRegionInfo['weightVars']:
                    branchListOfMVAVars.addCut(weightVar)
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # loop over all samples
        for sample in subsamples:

            # add cuts for all training regions
            for trainingRegion,trainingRegionInfo in self.trainingRegionsDict.iteritems():

                # add cuts for training and evaluation
                for additionalCut in [self.TrainCut, self.EvalCut]:

                    # cuts
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(region=trainingRegion, sample=sample.name, tr='TRAIN' if additionalCut==self.TrainCut else 'EVAL'),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True
                    )

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.force:
                        if isCached:
                            tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            # fail with an explicit error instead of passing a
                            # broken tree on to the cache
                            if not self.sampleTree or not self.sampleTree.tree:
                                print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                                raise Exception("CreationOfSampleTreeFailed")
                        treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print ("nothing to do!")
for fileName in filelist: localFileName = fileLocator.getFilenameAfterPrep(fileName) inputFileName = "{path}/{subfolder}/{filename}".format(path=pathIN, subfolder=sample.identifier, filename=localFileName) outputFileName = "{path}/{subfolder}/{filename}".format(path=pathOUT, subfolder=sample.identifier, filename=localFileName) tmpFileName = "{path}/{subfolder}/{filename}".format(path=tmpDir, subfolder=sample.identifier, filename=localFileName) outputFolder = '/'.join(outputFileName.split('/')[:-1]) tmpFolder = '/'.join(tmpFileName.split('/')[:-1]) fileLocator.makedirs(tmpFolder) fileLocator.makedirs(outputFolder) if not fileLocator.exists(outputFileName) or opts.force: # load sample tree and initialize vtype corrector sampleTree = SampleTree([inputFileName], config=config) if not sampleTree.tree: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m" continue # lists of single modules can be given instead of a module, "--addCollections Sys.all" # [Sys] # all = ['Sys.Vtype', 'Sys.Leptons', ...] collectionsListsReplaced = [] for collection in collections: if '.' in collection: section = collection.split('.')[0] key = collection.split('.')[1] listExpression = config.get(section, key).strip() if listExpression.startswith('[') and listExpression.endswith(']'): listParsed = eval(listExpression)
def test_AddBranches(self):
    """Add output branches in several ways, write the tree and compare it with the input tree."""
    inputTree = self.getTree()

    # branches defined by a string expression
    inputTree.addOutputBranch('jetSum', 'Sum$(Jet)')
    inputTree.addOutputBranch('abcSum', 'a+b+c')

    # branch defined by a callable (here a lambda)
    inputTree.addOutputBranch('abcSum2', lambda tree: tree.a + tree.b + tree.c)

    # several branches at once, given as list of dictionaries
    vectorLength = 4
    branchDefinitions = [
        {'name': 'abcSum3', 'formula': BlablaCorrector.applyCorrection},
        {'name': 'stuff', 'formula': BlablaCorrector.applyOtherCorrection},
        {'name': 'vectorstuff', 'formula': BlablaCorrector.addVector, 'length': vectorLength},
    ]
    inputTree.addOutputBranches(branchDefinitions)

    # write the output tree, with a cut applied
    inputTree.addOutputTree(
        TestSampleTreeAddBranchesMethods.scratchDirectory + '/tree_withaddedbranches.root',
        cut='1',
    )
    inputTree.process()

    # read back the written tree and compare the new branch with the expected expression
    writtenSampleTree = SampleTree([TestSampleTreeAddBranchesMethods.scratchDirectory + '/tree_withaddedbranches.root'])
    writtenTree = writtenSampleTree.tree

    histogramFile = ROOT.TFile.Open(TestSampleTreeAddBranchesMethods.scratchDirectory + '/histograms.root', 'recreate')
    h1 = ROOT.TH1F('h1', 'h1', 200, 0, 200)
    h2 = ROOT.TH1F('h2', 'h2', 200, 0, 200)

    # h1: new branch, h2: expected result
    writtenTree.Draw('vectorstuff[2]>>h1')
    inputTree.tree.Draw('a+b+c>>h2')

    meanNew = h1.GetMean()
    meanExpected = h2.GetMean()
    ratio = abs(meanNew/meanExpected)
    self.assertTrue(abs(meanNew-meanExpected) < 0.00001)
    self.assertTrue(ratio < 1.00001)
    self.assertTrue(ratio > 0.99999)
    print("histogram means:", meanNew, meanExpected, " check histograms h1 and h2 in histograms.root")

    histogramFile.Write()
    histogramFile.Close()

    # the vector branch must contribute vectorLength entries per event
    h3 = ROOT.TH1F('h3', 'h3', 2000, -1000, 10000)
    h4 = ROOT.TH1F('h4', 'h4', 2000, -1000, 10000)
    writtenTree.Draw('vectorstuff>>h3')
    inputTree.tree.Draw('a>>h4')
    self.assertTrue(h3.GetEntries() > 0)
    self.assertEqual(h3.GetEntries(), h4.GetEntries()*vectorLength)
config = XbbConfigTools(XbbConfigReader.read(opts.tag)) inputFolder = config.get('Directories', 'dcSamples') logFolder = config.get('Directories', 'tagDir') config.loadNamespaces() regions = config.getDatacardRegions() if len( opts.regions) < 1 else config.parseCommaSeparatedList(opts.regions) for region in regions: dataSamples = eval(config.get('dc:' + region, 'data')) for dataSample in dataSamples: sampleTree = SampleTree({ 'name': dataSample, 'folder': inputFolder }, config=config) outputFileName = logFolder + '/' + region + '_' + dataSample + '_' + opts.run + '_' + opts.event + '.txt' print("save event list to:", outputFileName) treePlayer = sampleTree.tree.GetPlayer() treePlayer.SetScanRedirect(True) treePlayer.SetScanFileName(outputFileName) branchList = BranchList(["run", "event"]) regionCut = config.get( 'Cuts', config.get('dc:' + region, 'cut') if config.has_option( 'dc:' + region, 'cut') else region) branchList.addCut(regionCut)
def prepare(self):
    """Set up the TreeCache objects for all datacard regions of self.sampleToCache.

    For every (dcMaker, subsample) pair the cuts and the list of branches to
    keep are assembled and a TreeCache is created. Caches which are missing
    (or all of them if self.forceRedo is set) are connected to the shared
    SampleTree and collected in self.treeCaches.

    Returns:
        self, to allow chaining.

    Raises:
        Exception: "CreationOfSampleTreeFailed" if the SampleTree has no tree,
            "SampleFilesHaveChanged" if the files of this chunk differ from
            self.fileList.
    """
    if len(self.dcMakers) > 0:
        self.treeCaches = []
        self.sampleTree = None

        # cuts
        allSamples = self.getAllSamples()
        subsamples = [x for x in allSamples if x.identifier == self.sampleToCache]

        # loop over all datacard regions
        for dcMaker in self.dcMakers:

            # loop over all subsamples (which come from the same root tree files)
            for sample in subsamples:

                # combine subcut and systematics cut with logical AND
                # systematics cuts are combined with logical OR, such that 1 cache file can be used for all the systematics
                isData = (sample.type == 'DATA')
                systematicsCuts = sorted(set(x['cachecut'] for x in dcMaker.getSystematicsList(isData=isData)))
                sampleCuts = {'AND': [sample.subcut, {'OR': systematicsCuts}]}
                # NOTE(review): 'True or' makes this print unconditional regardless of self.verbose
                if True or self.verbose:
                    print (json.dumps(sampleCuts, sort_keys=True, indent=8, default=str))

                # make list of branches to keep in root file
                # the full systematics list is fetched once and reused for all expression types
                allSystematics = list(dcMaker.getSystematicsList())
                branchList = BranchList(sample.subcut)
                for key in ['cachecut', 'cut', 'var', 'weight']:
                    branchList.addCut([x[key] for x in allSystematics])
                branchList.addCut(self.config.get('Weights', 'weightF'))
                # NOTE(review): the option value is passed to eval(); only use trusted config files
                branchList.addCut(eval(self.config.get('Branches', 'keep_branches')))
                branchesToKeep = branchList.getListOfBranches()

                # arbitrary (optional) name for the output tree, used for print-out (the TreeCache object has no idea what it is doing, e.g. dc, plot etc.)
                cacheName = 'dc:{region}_{sample}'.format(region=dcMaker.getRegion(), sample=sample.name)

                # add cache object
                tc = TreeCache.TreeCache(
                    name=cacheName,
                    sample=sample.name,
                    cutList=sampleCuts,
                    cutSequenceMode='TREE',
                    branches=branchesToKeep,
                    inputFolder=dcMaker.path,
                    splitFilesChunks=self.splitFilesChunks,
                    chunkNumber=self.chunkNumber,
                    splitFilesChunkSize=self.splitFilesChunkSize,
                    fileList=self.fileList,
                    config=self.config,
                    debug=self.verbose
                )

                # check if this part of the sample is already cached
                isCached = tc.partIsCached()
                print ("check if sample \x1b[34m{sample}\x1b[0m part {part} is cached:".format(sample=sample.name, part=self.chunkNumber), isCached)
                if not isCached or self.forceRedo:
                    if isCached:
                        tc.deleteCachedFiles(chunkNumber=self.chunkNumber)

                    # for the first sample which comes from this files, load the tree
                    if not self.sampleTree:
                        self.sampleTree = SampleTree({'name': sample.identifier, 'folder': dcMaker.path}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                        if not self.sampleTree or not self.sampleTree.tree:
                            print ("\x1b[31mERROR: creation of sample tree failed!!\x1b[0m")
                            raise Exception("CreationOfSampleTreeFailed")
                        # consistency check on the file list at submission time and now
                        fileListNow = self.sampleTree.getSampleFileNameChunk(self.chunkNumber)
                        if self.fileList and (sorted(self.fileList) != sorted(fileListNow)):
                            print ("\x1b[31mERROR: sample files have changed between submission and run of the job!\x1b[0m")
                            raise Exception("SampleFilesHaveChanged")

                    # connect the TreeCache object to the input sampleTree and add it to the list of cached trees
                    self.treeCaches.append(tc.setSampleTree(self.sampleTree).cache())
    else:
        print("WARNING: no datacard regions added, nothing to do.")

    return self
#!/usr/bin/env python
import ROOT
from myutils.sampleTree import SampleTree

# Writes the list of distinct [run, luminosityBlock] pairs found in the sample
# to a text file, in order of first appearance.

#pp
#sampleTree = SampleTree('VHbbPostNano2017_V2_DoubleEG.txt', 'Events', xrootdRedirector='root://t3dcachedb03.psi.ch:1094/')
#sampleTree = SampleTree('VHbbPostNano2017_V2_DoubleMuon.txt', 'Events', xrootdRedirector='root://t3dcachedb03.psi.ch:1094/')
#outputFileName = 'existing_lumis_pp_DoubleMuon.txt'

#nano
sampleTree = SampleTree('DoubleEG_RunII2017ReReco17Nov17-94X-Nano01_300122to300237.txt', 'Events', xrootdRedirector='root://xrootd-cms.infn.it/')
outputFileName = 'existing_lumis_nano_realDoubleEG_300122to300237.txt'

# only the two branches needed to identify a lumi section are read
sampleTree.tree.SetBranchStatus("*", 0)
sampleTree.tree.SetBranchStatus("run", 1)
sampleTree.tree.SetBranchStatus("luminosityBlock", 1)

runLumi = []
# set of already seen pairs: constant-time membership test instead of
# scanning the whole list for every event
seenRunLumi = set()
for i in sampleTree:
    #if [i.run, i.luminosityBlock] not in runLumi and i.run>=302030 and i.run <= 303434:
    #    runLumi.append([i.run, i.luminosityBlock])
    key = (i.run, i.luminosityBlock)
    if key not in seenRunLumi:
        seenRunLumi.add(key)
        runLumi.append([i.run, i.luminosityBlock])

# single-argument form prints the same with the print statement and the print function
print(runLumi)

with open(outputFileName, 'w') as f:
    f.write("%r"%runLumi)
def getTree(self, path):
    """Return a SampleTree reading the tree named 'Events' from the single file `path`."""
    return SampleTree([path], treeName='Events')
class CacheTraining(object):
    """Create the TRAIN/EVAL tree caches of one sample for a set of MVA training regions.

    For every training region the tree cut, the nominal MVA input variables and
    the up/down weight expressions of the region's systematics are read from
    the config. run() then creates one TreeCache per (subsample, region,
    TRAIN/EVAL) combination and processes the sample tree once.
    """

    def __init__(self, config, sampleIdentifier, trainingRegions, splitFilesChunks=1, chunkNumber=1, splitFilesChunkSize=-1, force=False):
        """Read sample definitions and per-region settings from the config.

        Args:
            config: config object providing get()/has_option().
            sampleIdentifier: identifier of the sample whose subsamples are cached.
            trainingRegions: names of the config sections describing the training regions.
            splitFilesChunks: forwarded to TreeCache.
            chunkNumber: forwarded to TreeCache and SampleTree.
            splitFilesChunkSize: forwarded to TreeCache and SampleTree.
            force: if True, existing caches are deleted and created again.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): the sample lists (and 'systematics' below) are config values
        # passed to eval(); this is only acceptable for trusted config files.
        self.backgroundSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'backgrounds')) for trainingRegion in self.trainingRegions], [])))
        self.signalSampleNames = list(set(sum([eval(self.config.get(trainingRegion, 'signals')) for trainingRegion in self.trainingRegions], [])))
        self.samples = self.samplesInfo.get_samples(list(set(self.backgroundSampleNames + self.signalSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            # systematics are optional per region
            systematics = eval(config.get(trainingRegion, 'systematics')) if config.has_option(trainingRegion, 'systematics') else []
            mvaVars = config.get(treeVarSet, 'Nominal').split(' ')
            weightVars = []
            for syst in systematics:
                # both spellings of the suffix are accepted: _UP/_DOWN is preferred, _Up/_Down is the fallback
                systNameUp = syst + '_UP' if self.config.has_option('Weights', syst + '_UP') else syst + '_Up'
                systNameDown = syst + '_DOWN' if self.config.has_option('Weights', syst + '_DOWN') else syst + '_Down'
                weightVars += [self.config.get('Weights', systNameUp), self.config.get('Weights', systNameDown)]

            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
                'weightVars': weightVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def printInfo(self):
        """Print a two-column table with each training region and its cut."""
        print ("REGION:".ljust(24), "CUT:")
        # .items() works with both Python 2 and Python 3 dictionaries
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
            print (" > ", trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create all missing (or, with force, all) caches of the sample and process the tree once."""
        # The cache label is attached to each cut explicitly. Deriving it from
        # 'additionalCut == self.TrainCut' would label both caches 'TRAIN'
        # whenever the train and eval cut strings are identical.
        labelledCuts = [('TRAIN', self.TrainCut), ('EVAL', self.EvalCut)]

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print ('*' * 80)
            print (' ', sampleToCache)
            print ('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            self.sampleTree = None

            # use all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [x for x in self.samples if x.identifier == sampleToCache]

            # list of branches to keep for use as MVA input variables
            # NOTE(review): neither 'sample' nor 'additionalCut' is used inside these loops,
            # so identical cuts are added several times; presumably BranchList de-duplicates
            # them - confirm before simplifying.
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
                    for weightVar in trainingRegionInfo['weightVars']:
                        branchListOfMVAVars.addCut(weightVar)
            branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
            mvaBranches = branchListOfMVAVars.getListOfBranches()

            # loop over all samples
            for sample in subsamples:

                # add cuts for all training regions
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():

                    # add cuts for training and evaluation
                    for cutLabel, additionalCut in labelledCuts:

                        # cuts: empty cut strings are left out
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])

                        # add cache object
                        tc = TreeCache.TreeCache(
                            name='{region}_{sample}_{tr}'.format(region=trainingRegion, sample=sample.name, tr=cutLabel),
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True
                        )

                        # check if this part of the sample is already cached
                        isCached = tc.partIsCached()
                        if not isCached or self.force:
                            if isCached:
                                tc.deleteCachedFiles(chunkNumber=self.chunkNumber)
                            # for the first sample which comes from this files, load the tree
                            if not self.sampleTree:
                                self.sampleTree = SampleTree({'name': sample.identifier, 'folder': self.samplesPath}, splitFilesChunkSize=self.splitFilesChunkSize, chunkNumber=self.chunkNumber, config=self.config, saveMemory=True)
                            treeCaches.append(tc.setSampleTree(self.sampleTree).cache())

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print ("nothing to do!")
class CacheTraining(object):
    """Create the TRAIN/EVAL tree caches of one sample for a set of MVA training regions.

    For every training region the tree cut and the MVA input variables of all
    configured systematics are read from the config. run() then creates one
    TreeCache per (subsample, region, TRAIN/EVAL) combination and processes
    the sample tree once.
    """

    def __init__(self,
                 config,
                 sampleIdentifier,
                 trainingRegions,
                 splitFilesChunks=1,
                 chunkNumber=1,
                 splitFilesChunkSize=-1,
                 force=False):
        """Read sample definitions and per-region settings from the config.

        Args:
            config: config object providing get().
            sampleIdentifier: identifier of the sample whose subsamples are cached.
            trainingRegions: names of the config sections describing the training regions.
            splitFilesChunks: forwarded to TreeCache.
            chunkNumber: forwarded to TreeCache and SampleTree.
            splitFilesChunkSize: forwarded to TreeCache and SampleTree.
            force: if True, existing caches are deleted and created again.
        """
        self.config = config
        self.force = force
        self.sampleIdentifier = sampleIdentifier
        self.trainingRegions = trainingRegions

        self.sampleTree = None
        self.samplesPath = self.config.get('Directories', 'MVAin')
        self.samplesDefinitions = self.config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')

        # NOTE(review): the sample lists are config values passed to eval();
        # this is only acceptable for trusted config files.
        self.backgroundSampleNames = list(
            set(
                sum([
                    eval(self.config.get(trainingRegion, 'backgrounds'))
                    for trainingRegion in self.trainingRegions
                ], [])))
        self.signalSampleNames = list(
            set(
                sum([
                    eval(self.config.get(trainingRegion, 'signals'))
                    for trainingRegion in self.trainingRegions
                ], [])))
        self.samples = self.samplesInfo.get_samples(
            list(set(self.backgroundSampleNames + self.signalSampleNames)))

        self.trainingRegionsDict = {}
        for trainingRegion in self.trainingRegions:
            treeCutName = config.get(trainingRegion, 'treeCut')
            treeVarSet = config.get(trainingRegion, 'treeVarSet').strip()
            # space separated list of systematics, empty entries are dropped
            systematics = [
                x for x in config.get('systematics', 'systematics').split(' ')
                if len(x.strip()) > 0
            ]
            # the variable set section holds one option per systematic
            mvaVars = []
            for systematic in systematics:
                mvaVars += config.get(treeVarSet, systematic).strip().split(' ')
            self.trainingRegionsDict[trainingRegion] = {
                'cut': config.get('Cuts', treeCutName),
                'vars': mvaVars,
            }

        self.TrainCut = config.get('Cuts', 'TrainCut')
        self.EvalCut = config.get('Cuts', 'EvalCut')

        self.splitFilesChunks = splitFilesChunks
        self.chunkNumber = chunkNumber
        self.splitFilesChunkSize = splitFilesChunkSize

        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

    def printInfo(self):
        """Print a two-column table with each training region and its cut."""
        print("REGION:".ljust(24), "CUT:")
        # .items() works with both Python 2 and Python 3 dictionaries
        for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
            print(" > ", trainingRegion.ljust(20), trainingRegionInfo['cut'])

    def run(self):
        """Create all missing (or, with force, all) caches of the sample and process the tree once."""
        # The cache label is attached to each cut explicitly. Deriving it from
        # 'additionalCut == self.TrainCut' would label both caches 'TRAIN'
        # whenever the train and eval cut strings are identical.
        labelledCuts = [('TRAIN', self.TrainCut), ('EVAL', self.EvalCut)]

        # ----------------------------------------------------------------------------------------------------------------------
        # cache samples
        # ----------------------------------------------------------------------------------------------------------------------
        for sampleToCache in [self.sampleIdentifier]:
            print('*' * 80)
            print(' ', sampleToCache)
            print('*' * 80)
            # prepare caches for training and evaluation samples
            treeCaches = []
            self.sampleTree = None

            # use all (sub)samples which come from the same files (sampleIdentifier)
            subsamples = [
                x for x in self.samples if x.identifier == sampleToCache
            ]

            # list of branches to keep for use as MVA input variables
            # NOTE(review): neither 'sample' nor 'additionalCut' is used inside these loops,
            # so identical cuts are added several times; presumably BranchList de-duplicates
            # them - confirm before simplifying.
            branchListOfMVAVars = BranchList()
            for sample in subsamples:
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
                    for additionalCut in [self.TrainCut, self.EvalCut]:
                        branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
            branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
            mvaBranches = branchListOfMVAVars.getListOfBranches()

            # loop over all samples
            for sample in subsamples:

                # add cuts for all training regions
                for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():

                    # add cuts for training and evaluation
                    for cutLabel, additionalCut in labelledCuts:

                        # cuts: empty cut strings are left out
                        sampleCuts = [sample.subcut]
                        if additionalCut:
                            sampleCuts.append(additionalCut)
                        if trainingRegionInfo['cut']:
                            sampleCuts.append(trainingRegionInfo['cut'])

                        # add cache object
                        tc = TreeCache.TreeCache(
                            name='{region}_{sample}_{tr}'.format(
                                region=trainingRegion,
                                sample=sample.name,
                                tr=cutLabel),
                            sample=sample.name,
                            cutList=sampleCuts,
                            inputFolder=self.samplesPath,
                            splitFilesChunks=self.splitFilesChunks,
                            chunkNumber=self.chunkNumber,
                            splitFilesChunkSize=self.splitFilesChunkSize,
                            branches=mvaBranches,
                            config=self.config,
                            debug=True)

                        # check if this part of the sample is already cached
                        isCached = tc.partIsCached()
                        if not isCached or self.force:
                            if isCached:
                                tc.deleteCachedFiles(
                                    chunkNumber=self.chunkNumber)
                            # for the first sample which comes from this files, load the tree
                            if not self.sampleTree:
                                self.sampleTree = SampleTree(
                                    {
                                        'name': sample.identifier,
                                        'folder': self.samplesPath
                                    },
                                    splitFilesChunkSize=self.splitFilesChunkSize,
                                    chunkNumber=self.chunkNumber,
                                    config=self.config,
                                    saveMemory=True)
                            treeCaches.append(
                                tc.setSampleTree(self.sampleTree).cache())

            if len(treeCaches) > 0:
                # run on the tree
                self.sampleTree.process()
            else:
                print("nothing to do!")
def run(self): nFilesProcessed = 0 nFilesFailed = 0 for subJob in self.subJobs: # only process if output is non-existing/broken or --force was used if self.opts.force or not self.fileLocator.isValidRootFile(subJob['outputFileName']): # create directories outputFolder = '/'.join(subJob['outputFileName'].split('/')[:-1]) tmpFolder = '/'.join(subJob['tmpFileName'].split('/')[:-1]) self.fileLocator.makedirs(outputFolder) self.fileLocator.makedirs(tmpFolder) # load sample tree sampleTree = SampleTree(subJob['localInputFileNames'], config=self.config) if not sampleTree.tree: print "trying fallback...", len(subJob['inputFileNames']) if len(subJob['inputFileNames']) == 1: # try original naming scheme if reading directly from Heppy/Nano ntuples (without prep) fileNameOriginal = self.pathIN + '/' + subJob['inputFileNames'][0] print "FO:", fileNameOriginal xrootdRedirector = self.fileLocator.getRedirector(fileNameOriginal) sampleTree = SampleTree([fileNameOriginal], config=self.config, xrootdRedirector=xrootdRedirector) if not sampleTree.tree: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m" nFilesFailed += 1 continue else: print "\x1b[31mERROR: file does not exist or is broken, will be SKIPPED! (old naming scheme not supported for joining multipel files)\x1b[0m" nFilesFailed += 1 continue # to use this syntax, use "--addCollections Sys.Vtype" for a config file entry like this: # [Sys] # Vtype = VtypeCorrector.VtypeCorrector(channel='Zll') # (instead of passing the tree in the constructor, the setTree method can be used) pyModules = [] versionTable = [] for collection in self.collections: if '.' in collection: section = collection.split('.')[0] key = collection.split('.')[1] if self.config.has_section(section) and self.config.has_option(section, key): pyCode = self.config.get(section, key) elif '(' in collection and collection.endswith(')'): print "WARNING: config option", collection, " not found, interpreting it as Python code!" 
pyCode = collection else: print "\x1b[31mERROR: config option not found:", collection, ". To specify Python code directly, pass a complete constructor, e.g. --addCollections 'Module.Class()'. Module has to be placed in python/myutils/ folder.\x1b[0m" raise Exception("ConfigError") # import module from myutils moduleName = pyCode.split('(')[0].split('.')[0].strip() if self.debug: print "DEBUG: import module:", moduleName print("\x1b[33mDEBUG: " + collection + ": run PYTHON code:\n"+pyCode+"\x1b[0m") globals()[moduleName] = importlib.import_module(".{module}".format(module=moduleName), package="myutils") # get object wObject = eval(pyCode) # pass the tree and other variables if needed to finalize initialization if hasattr(wObject, "customInit") and callable(getattr(wObject, "customInit")): wObject.customInit({'config': self.config, 'sampleTree': sampleTree, 'tree': sampleTree.tree, 'sample': self.sample, 'channel': self.channel, 'pathIN': self.pathIN, 'pathOUT': self.pathOUT, }) # add callbacks if the objects provides any if hasattr(wObject, "processEvent") and callable(getattr(wObject, "processEvent")): sampleTree.addCallback('event', wObject.processEvent) for cb in ["finish", "prepareOutput"]: if hasattr(wObject, cb) and callable(getattr(wObject, cb)): sampleTree.addCallback(cb, getattr(wObject, cb)) # add branches if hasattr(wObject, "getBranches") and callable(getattr(wObject, "getBranches")): sampleTree.addOutputBranches(wObject.getBranches()) pyModules.append(wObject) versionTable.append([moduleName, wObject.getVersion() if hasattr(wObject, "getVersion") else 0]) else: print "\x1b[31mERROR: config option not found:", collection, " the format should be: [Section].[Option]\x1b[0m" raise Exception("ConfigError") for moduleName, moduleVersion in versionTable: print " > {m}:{v}".format(m=moduleName, v=moduleVersion) # DEPRECATED, do not use anymore ---> use BranchTools.TreeFormulas() if 'addbranches' in self.collections: writeNewVariables = 
eval(self.config.get("Regression", "writeNewVariablesDict")) sampleTree.addOutputBranches(writeNewVariables) # DEPRECATED, do not use anymore ---> use BranchTools.Drop() if 'removebranches' in self.collections: bl_branch = eval(config.get('Branches', 'useless_branch')) for br in bl_branch: sampleTree.addBranchToBlacklist(br) bl_branch = eval(config.get('Branches', 'useless_after_sys')) for br in bl_branch: sampleTree.addBranchToBlacklist(br) # define output file sampleTree.addOutputTree(subJob['tmpFileName'], cut='1', branches='*', friend=self.opts.friend) # run processing for pyModule in pyModules: if hasattr(pyModule, "beforeProcessing"): getattr(pyModule, "beforeProcessing")() sampleTree.process() for pyModule in pyModules: if hasattr(pyModule, "afterProcessing"): getattr(pyModule, "afterProcessing")() # if output trees have been produced: copy temporary file to output folder if sampleTree.getNumberOfOutputTrees() > 0: try: self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True) print 'copy ', subJob['tmpFileName'], subJob['outputFileName'] if self.verifyCopy: if not self.fileLocator.isValidRootFile(subJob['outputFileName']): print 'INFO: output at final destination broken, try to copy again from scratch disk to final destination...' self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True) print 'INFO: second attempt copy done!' if not self.fileLocator.isValidRootFile(subJob['outputFileName']): print '\x1b[31mERROR: output still broken!\x1b[0m' nFilesFailed += 1 raise Exception("FileCopyError") else: print 'INFO: file is good after second attempt!' except Exception as e: print e print "\x1b[31mERROR: copy from scratch to final destination failed!!\x1b[0m" # delete temporary file try: self.fileLocator.rm(subJob['tmpFileName']) except Exception as e: print e print "WARNING: could not delete file on scratch!" 
# clean up if hasattr(wObject, "cleanUp") and callable(getattr(wObject, "cleanUp")): getattr(wObject, "cleanUp")() else: print 'SKIP:', subJob['inputFileNames'] if nFilesFailed > 0: raise Exception("ProcessingIncomplete")
def run(self):
    """Create all missing (or, with force, all) caches of the sample and process the tree once.

    One TreeCache is created per (subsample, training region, TRAIN/EVAL)
    combination. The sample tree is only loaded when at least one cache has
    to be written, and is processed a single time for all of them.
    """
    # The cache label is attached to each cut explicitly. Deriving it from
    # 'additionalCut == self.TrainCut' would label both caches 'TRAIN'
    # whenever the train and eval cut strings are identical.
    labelledCuts = [('TRAIN', self.TrainCut), ('EVAL', self.EvalCut)]

    # ----------------------------------------------------------------------------------------------------------------------
    # cache samples
    # ----------------------------------------------------------------------------------------------------------------------
    for sampleToCache in [self.sampleIdentifier]:
        print('*' * 80)
        print(' ', sampleToCache)
        print('*' * 80)
        # prepare caches for training and evaluation samples
        treeCaches = []
        self.sampleTree = None

        # use all (sub)samples which come from the same files (sampleIdentifier)
        subsamples = [
            x for x in self.samples if x.identifier == sampleToCache
        ]

        # list of branches to keep for use as MVA input variables
        # NOTE(review): neither 'sample' nor 'additionalCut' is used inside these loops,
        # so identical cuts are added several times; presumably BranchList de-duplicates
        # them - confirm before simplifying.
        branchListOfMVAVars = BranchList()
        for sample in subsamples:
            # .items() works with both Python 2 and Python 3 dictionaries
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():
                for additionalCut in [self.TrainCut, self.EvalCut]:
                    branchListOfMVAVars.addCut(trainingRegionInfo['vars'])
        branchListOfMVAVars.addCut(self.config.get('Weights', 'weightF'))
        mvaBranches = branchListOfMVAVars.getListOfBranches()

        # loop over all samples
        for sample in subsamples:

            # add cuts for all training regions
            for trainingRegion, trainingRegionInfo in self.trainingRegionsDict.items():

                # add cuts for training and evaluation
                for cutLabel, additionalCut in labelledCuts:

                    # cuts: empty cut strings are left out
                    sampleCuts = [sample.subcut]
                    if additionalCut:
                        sampleCuts.append(additionalCut)
                    if trainingRegionInfo['cut']:
                        sampleCuts.append(trainingRegionInfo['cut'])

                    # add cache object
                    tc = TreeCache.TreeCache(
                        name='{region}_{sample}_{tr}'.format(
                            region=trainingRegion,
                            sample=sample.name,
                            tr=cutLabel),
                        sample=sample.name,
                        cutList=sampleCuts,
                        inputFolder=self.samplesPath,
                        splitFilesChunks=self.splitFilesChunks,
                        chunkNumber=self.chunkNumber,
                        splitFilesChunkSize=self.splitFilesChunkSize,
                        branches=mvaBranches,
                        config=self.config,
                        debug=True)

                    # check if this part of the sample is already cached
                    isCached = tc.partIsCached()
                    if not isCached or self.force:
                        if isCached:
                            tc.deleteCachedFiles(
                                chunkNumber=self.chunkNumber)
                        # for the first sample which comes from this files, load the tree
                        if not self.sampleTree:
                            self.sampleTree = SampleTree(
                                {
                                    'name': sample.identifier,
                                    'folder': self.samplesPath
                                },
                                splitFilesChunkSize=self.splitFilesChunkSize,
                                chunkNumber=self.chunkNumber,
                                config=self.config,
                                saveMemory=True)
                        treeCaches.append(
                            tc.setSampleTree(self.sampleTree).cache())

        if len(treeCaches) > 0:
            # run on the tree
            self.sampleTree.process()
        else:
            print("nothing to do!")