def __init__(self, fileNames, chunkNumber, submitTime='000000_000000', force=False, config=None, sampleIdentifier=None):
    """Set up the merge of one chunk of input files.

    Args:
        fileNames: list of input file names. The path of the first entry,
            with its last four components removed, is the base folder of
            the merged file.
        chunkNumber: integer, used in the merged file name tree_<n>.root.
        submitTime: string, used as a sub-folder name of the merged file.
        force: stored as self.force.
        config: configuration object, handed to FileLocator.
        sampleIdentifier: stored as self.sampleIdentifier.
    """
    # plain job description
    self.fileNames = fileNames
    self.chunkNumber = chunkNumber
    self.submitTime = submitTime
    self.sampleIdentifier = sampleIdentifier
    self.force = force
    self.config = config
    self.debug = 'XBBDEBUG' in os.environ
    self.fileLocator = FileLocator(config=self.config)

    # -O option (reoptimizing baskets) leads to crashes...
    self.commandTemplate = "hadd -k -ff {output} {inputs}"

    # use sampleTree class as replacement for hadd
    self.useChain = True

    # the folder of the merged file is named after a digest built from the
    # sorted digests of all input file names
    fileDigests = sorted(hashlib.sha224(name).hexdigest() for name in self.fileNames)
    totalHash = hashlib.sha224('-'.join(fileDigests)).hexdigest()
    baseFolder = '/'.join(self.fileNames[0].split('/')[:-4])
    self.mergedFileName = '/'.join([
        baseFolder,
        totalHash,
        self.submitTime,
        '0000',
        'tree_%d.root' % chunkNumber,
    ])
def __init__(self, fileNames, chunkNumber, submitTime='000000_000000', force=False, config=None, sampleIdentifier=None, inputDir=None, outputDir=None):
    """Set up the merge of one chunk of input files.

    Args:
        fileNames: list of input file names. The path of the first entry,
            with its last four components removed, is the base folder of
            the merged file.
        chunkNumber: integer, used in the merged file name tree_<n>.root.
        submitTime: string, used as a sub-folder name of the merged file.
        force: stored as self.force.
        config: configuration object. Its get() method is called
            unconditionally, so the default None does not work.
        sampleIdentifier: stored as self.sampleIdentifier.
        inputDir: option name in section 'Directories' for the input
            folder, 'HADDin' is used if empty.
        outputDir: option name in section 'Directories' for the output
            folder, 'HADDout' is used if empty.
    """
    # plain job description
    self.fileNames = fileNames
    self.chunkNumber = chunkNumber
    self.submitTime = submitTime
    self.sampleIdentifier = sampleIdentifier
    self.force = force
    self.config = config
    self.debug = 'XBBDEBUG' in os.environ
    self.fileLocator = FileLocator(config=self.config)

    # -O option (reoptimizing baskets) leads to crashes...
    self.commandTemplate = "hadd -k -ff {output} {inputs}"

    # use sampleTree class as replacement for hadd
    # this way baskets will be also optimized and unused branches can be stripped off
    self.useChain = True

    # folders are looked up in the 'Directories' section of the config
    inputOption = inputDir if inputDir else 'HADDin'
    outputOption = outputDir if outputDir else 'HADDout'
    self.inputDir = self.config.get('Directories', inputOption)
    self.outputDir = self.config.get('Directories', outputOption)
    self.scratchDir = self.config.get('Directories', 'scratch')

    # the folder of the merged file is named after a digest built from the
    # sorted digests of all input file names
    fileDigests = sorted(hashlib.sha224(name).hexdigest() for name in self.fileNames)
    totalHash = hashlib.sha224('-'.join(fileDigests)).hexdigest()
    baseFolder = '/'.join(self.fileNames[0].split('/')[:-4])
    self.mergedFileName = '/'.join([
        baseFolder,
        totalHash,
        self.submitTime,
        '0000',
        'tree_%d.root' % chunkNumber,
    ])
def prepare(self):
    """Create one histogram stack per variable and fill it from the cached trees.

    For every data and MC sample the cut list is assembled (sample subcut,
    region cut, optional 'Datacut' and optional blinding cut), the cached
    tree is requested from TreeCache and added to the stack of every
    variable.

    The region and the configuration are taken from the instance
    (self.region, self.config) and not from names of the calling script.

    Returns:
        self

    Raises:
        Exception: "CachedTreeMissing" if TreeCache returns no tree for a
            sample.
    """
    print(
        "INFO: starting plot for region \x1b[34m{region}\x1b[0m, variables:"
        .format(region=self.region))
    for var in self.vars:
        print("  > {var}".format(var=var))

    # one stack per plotted variable
    self.histogramStacks = {}
    for var in self.vars:
        self.histogramStacks[var] = StackMaker(self.config,
                                               var,
                                               self.region,
                                               self.signalRegion,
                                               None,
                                               '_' + self.subcutPlotName,
                                               title=self.title)

    fileLocator = FileLocator(config=self.config, useDirectoryListingCache=True)

    # add DATA + MC samples
    for sample in self.dataSamples + self.mcSamples:

        # cuts
        sampleCuts = [sample.subcut]
        if self.config.has_option('Cuts', self.region):
            sampleCuts.append(self.config.get('Cuts', self.region))
        if self.config.has_option(self.configSection, 'Datacut'):
            sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
        if self.addBlindingCut:
            sampleCuts.append(self.addBlindingCut)

        # get sample tree from cache
        tc = TreeCache.TreeCache(sample=sample,
                                 cutList=sampleCuts,
                                 inputFolder=self.samplesPath,
                                 config=self.config,
                                 fileLocator=fileLocator)
        sampleTree = tc.getTree()

        if not sampleTree:
            print("\x1b[31mERROR: sampleTree not available for ", sample,
                  ", run caching again!!\x1b[0m")
            raise Exception("CachedTreeMissing")

        groupName = self.getSampleGroup(sample)
        print(" > found the tree, #entries = ", sampleTree.tree.GetEntries())
        print("   > group =", groupName)
        print(" > now adding the tree for vars=", self.vars)

        # add the sample tree for all the variables
        for var in self.vars:
            self.histogramStacks[var].addSampleTree(
                sample=sample,
                sampleTree=sampleTree,
                groupName=groupName,
                cut=self.subcut if self.subcut else '1')
    return self
def __init__(self, config, region):
    """Keep config and region and create the Datacard object for the region.

    The FileLocator is created with the directory listing cache enabled and
    the same instance is handed to the Datacard.
    """
    self.config = config
    self.region = region

    locator = FileLocator(config=config, useDirectoryListingCache=True)
    self.fileLocator = locator
    self.dcMaker = Datacard(config=config, region=region, fileLocator=locator)
def test_xrootd(self):
    """Read a tree via xrootd and check the redirector handling of FileLocator.

    The test body only runs if the environment variable X509_USER_PROXY is
    set to a non-blank value, otherwise an info message is printed and
    nothing is checked.

    The expected number of entries is written as a plain integer (no 'L'
    suffix), which compares equal to a long on Python 2 and is also valid
    syntax on Python 3.
    """
    proxy = os.environ.get('X509_USER_PROXY', '')
    if len(proxy.strip()) == 0:
        print("INFO: this test is skipped because no X509 proxy certificate is found which is needed to access the files!")
        return

    path1 = 'root://xrootd-cms.infn.it//store/group/phys_higgs/hbb/ntuples/VHbbPostNano/2017/V11/TTToSemiLeptonic_TuneCP5_PSweights_13TeV-powheg-pythia8/RunIIFall17NanoAODv4-PU2017_1282/190510_115113/0000/tree_1.root'
    tree1 = self.getTree(path1)
    print ("ENTRIES:", tree1.GetEntries())
    self.assertEqual(tree1.GetEntries(), 552904)

    # removing and adding the redirector again has to give back the original path
    fileLocator = FileLocator()
    path2 = fileLocator.removeRedirector(path1)
    print ("PATH2:", path2)
    self.assertTrue(path2.startswith('/store/group/phys_higgs/'))
    self.assertTrue(path2.endswith('/tree_1.root'))
    path3 = fileLocator.addRedirector(redirector='root://xrootd-cms.infn.it', fileName=path2)
    self.assertEqual(path1, path3)
def test_xrootd(self):
    """Read a tree via xrootd and check the redirector handling of FileLocator.

    The test body only runs if the environment variable X509_USER_PROXY is
    set to a non-blank value, otherwise an info message is printed and
    nothing is checked.
    """
    proxy = os.environ.get('X509_USER_PROXY', '')
    if len(proxy.strip()) == 0:
        print("INFO: this test is skipped because no X509 proxy certificate is found which is needed to access the files!")
        return

    path1 = 'root://xrootd-cms.infn.it//store/group/phys_higgs/hbb/ntuples/V25/TT_TuneCUETP8M2T4_13TeV-powheg-pythia8/VHBB_HEPPY_V25_TT_TuneCUETP8M2T4_13TeV-powheg-Py8__RunIISummer16MAv2-PUMoriond17_80r2as_2016_TrancheIV_v6-v1/170202_212737/0000/tree_100.root'
    tree1 = self.getTree(path1)
    print ("ENTRIES:", tree1.GetEntries())
    self.assertEqual(tree1.GetEntries(), 48442)

    # removing and adding the redirector again has to give back the original path
    locator = FileLocator()
    path2 = locator.removeRedirector(path1)
    print ("PATH2:", path2)
    self.assertTrue(path2.startswith('/store/group/phys_higgs/'))
    self.assertTrue(path2.endswith('/tree_100.root'))
    path3 = locator.addRedirector(redirector='root://xrootd-cms.infn.it', fileName=path2)
    self.assertEqual(path1, path3)
def __init__(self, config, region, sampleIdentifier=None, opts=None):
    """Read everything needed for one plot region from the configuration.

    Args:
        config: configuration object (get/has_option are used).
        region: region name, the config section 'Plot:<region>' is read.
        sampleIdentifier: optional comma separated list of sample
            identifiers, if given only these samples are kept.
        opts: options object. Its attributes inputDir and outputDir are
            read unconditionally, so the default None does not work.
    """
    self.config = config
    self.region = region

    # list of selected sample identifiers, None: no selection
    if sampleIdentifier and len(sampleIdentifier) > 0:
        self.sampleIdentifiers = sampleIdentifier.split(',')
    else:
        self.sampleIdentifiers = None

    # VHbb namespace
    libraryName = config.get('VHbbNameSpace', 'library')
    loadStatus = ROOT.gSystem.Load(libraryName)
    if loadStatus == 0:
        print ("INFO: loaded VHbbNameSpace: %s"%libraryName)
    else:
        print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%loadStatus)

    # input/output paths
    self.fileLocator = FileLocator(config=self.config)
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesDefinitions = config.get('Directories', 'samplesinfo')
    self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')

    # plot regions
    self.configSection = 'Plot:%s'%region

    # additional cut to only plot a subset of the region
    if self.config.has_option(self.configSection, 'subcut'):
        self.subcut = self.config.get(self.configSection, 'subcut')
        print("INFO: use cut:", self.subcut)
    else:
        self.subcut = None

    # additional global blinding cut:
    if self.config.has_option('Plot_general', 'addBlindingCut'):
        # contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
        print ('adding add. blinding cut:', self.addBlindingCut)
    else:
        self.addBlindingCut = None

    # load samples, the config values are evaluated as Python expressions
    self.data = eval(self.config.get(self.configSection, 'Datas'))  # data of this region (section)
    self.mc = eval(self.config.get('Plot_general', 'samples'))  # list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))

    # a region with a 'Signal' option is a signal region, the signal is added to the MC list
    hasSignal = self.config.has_option(self.configSection, 'Signal')
    if hasSignal:
        self.mc.append(self.config.get(self.configSection, 'Signal'))
    self.signalRegion = True if hasSignal else False

    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)

    # filter samples used in the plot
    selected = self.sampleIdentifiers
    if selected:
        self.dataSamples = [s for s in self.dataSamples if s.identifier in selected]
        self.mcSamples = [s for s in self.mcSamples if s.identifier in selected]
def test_xrootd(self):
    """Read a tree via xrootd and check the redirector handling of FileLocator.

    The test body only runs if the environment variable X509_USER_PROXY is
    set to a non-blank value, otherwise an info message is printed and
    nothing is checked.
    """
    proxy = os.environ.get('X509_USER_PROXY', '')
    if len(proxy.strip()) == 0:
        print(
            "INFO: this test is skipped because no X509 proxy certificate is found which is needed to access the files!"
        )
        return

    path1 = 'root://xrootd-cms.infn.it//store/group/phys_higgs/hbb/ntuples/V25/TT_TuneCUETP8M2T4_13TeV-powheg-pythia8/VHBB_HEPPY_V25_TT_TuneCUETP8M2T4_13TeV-powheg-Py8__RunIISummer16MAv2-PUMoriond17_80r2as_2016_TrancheIV_v6-v1/170202_212737/0000/tree_100.root'
    tree1 = self.getTree(path1)
    print("ENTRIES:", tree1.GetEntries())
    self.assertEqual(tree1.GetEntries(), 48442)

    # removing and adding the redirector again has to give back the original path
    locator = FileLocator()
    path2 = locator.removeRedirector(path1)
    print("PATH2:", path2)
    self.assertTrue(path2.startswith('/store/group/phys_higgs/'))
    self.assertTrue(path2.endswith('/tree_100.root'))
    path3 = locator.addRedirector(redirector='root://xrootd-cms.infn.it',
                                  fileName=path2)
    self.assertEqual(path1, path3)
def __init__(self,
             fileNames,
             chunkNumber,
             submitTime='000000_000000',
             force=False,
             config=None,
             sampleIdentifier=None,
             inputDir=None,
             outputDir=None):
    """Set up the merge of one chunk of input files.

    Args:
        fileNames: list of input file names. The path of the first entry,
            with its last four components removed, is the base folder of
            the merged file.
        chunkNumber: integer, used in the merged file name tree_<n>.root.
        submitTime: string, used as a sub-folder name of the merged file.
        force: stored as self.force.
        config: configuration object. Its get() method is called
            unconditionally, so the default None does not work.
        sampleIdentifier: stored as self.sampleIdentifier.
        inputDir: option name in section 'Directories' for the input
            folder, 'HADDin' is used if empty.
        outputDir: option name in section 'Directories' for the output
            folder, 'HADDout' is used if empty.
    """
    # plain job description
    self.fileNames = fileNames
    self.chunkNumber = chunkNumber
    self.submitTime = submitTime
    self.sampleIdentifier = sampleIdentifier
    self.force = force
    self.config = config
    self.debug = 'XBBDEBUG' in os.environ
    self.fileLocator = FileLocator(config=self.config)

    # -O option (reoptimizing baskets) leads to crashes...
    self.commandTemplate = "hadd -k -ff {output} {inputs}"

    # use sampleTree class as replacement for hadd
    # this way baskets will be also optimized and unused branches can be stripped off
    self.useChain = True

    # folders are looked up in the 'Directories' section of the config
    inputOption = inputDir if inputDir else 'HADDin'
    outputOption = outputDir if outputDir else 'HADDout'
    self.inputDir = self.config.get('Directories', inputOption)
    self.outputDir = self.config.get('Directories', outputOption)
    self.scratchDir = self.config.get('Directories', 'scratch')

    # the folder of the merged file is named after a digest built from the
    # sorted digests of all input file names
    fileDigests = sorted(
        hashlib.sha224(name).hexdigest() for name in self.fileNames)
    totalHash = hashlib.sha224('-'.join(fileDigests)).hexdigest()
    baseFolder = '/'.join(self.fileNames[0].split('/')[:-4])
    self.mergedFileName = '/'.join([
        baseFolder,
        totalHash,
        self.submitTime,
        '0000',
        'tree_%d.root' % chunkNumber,
    ])
def __init__(self, fileNames, chunkNumber, submitTime='000000_000000', force=False, config=None, sampleIdentifier=None):
    """Set up the merge of one chunk of input files.

    Args:
        fileNames: list of input file names. The path of the first entry,
            with its last four components removed, is the base folder of
            the merged file.
        chunkNumber: integer, used in the merged file name tree_<n>.root.
        submitTime: string, used as a sub-folder name of the merged file.
        force: stored as self.force.
        config: configuration object, handed to FileLocator.
        sampleIdentifier: stored as self.sampleIdentifier.
    """
    # plain job description
    self.fileNames = fileNames
    self.chunkNumber = chunkNumber
    self.submitTime = submitTime
    self.sampleIdentifier = sampleIdentifier
    self.force = force
    self.config = config
    self.debug = 'XBBDEBUG' in os.environ
    self.fileLocator = FileLocator(config=self.config)

    # -O option (reoptimizing baskets) leads to crashes...
    self.commandTemplate = "hadd -k -ff {output} {inputs}"

    # use sampleTree class as replacement for hadd
    self.useChain = True

    # the folder of the merged file is named after a digest built from the
    # sorted digests of all input file names
    fileDigests = sorted(hashlib.sha224(name).hexdigest() for name in self.fileNames)
    totalHash = hashlib.sha224('-'.join(fileDigests)).hexdigest()
    pathParts = self.fileNames[0].split('/')
    baseFolder = '/'.join(pathParts[:-4])
    self.mergedFileName = '/'.join([baseFolder, totalHash, self.submitTime, '0000', 'tree_%d.root'%chunkNumber])
config = BetterConfigParser() config.read(opts.config) anaTag = config.get("Analysis","tag") TrainFlag = eval(config.get('Analysis','TrainFlag')) btagLibrary = config.get('BTagReshaping','library') samplesinfo=config.get('Directories','samplesinfo') channel=config.get('Configuration','channel') VHbbNameSpace=config.get('VHbbNameSpace','library') ROOT.gSystem.Load(VHbbNameSpace) pathIN = config.get('Directories','SYSin') pathOUT = config.get('Directories','SYSout') tmpDir = config.get('Directories','scratch') print 'INput samples:\t%s'%pathIN print 'OUTput samples:\t%s'%pathOUT fileLocator = FileLocator(config=config) # samples info = ParseInfo(samplesinfo, pathIN) matchingSamples = [x for x in info if x.identifier==opts.sampleIdentifier and not x.subsample] if len(matchingSamples) != 1: print "need exactly 1 sample identifier as input with -S !!" print matchingSamples exit(1) sample = matchingSamples[0] # TODO: collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0] if len(opts.addCollections.strip())>0 else [] if len(collections) < 1: print "\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m" print 'collections to add:', collections
import sys
import os
from myutils.XbbConfig import XbbConfigReader, XbbConfigTools
from myutils import ParseInfo
from myutils.FileLocator import FileLocator
from myutils.XbbTools import XbbTools

# Prints the local path of the first file of the first sample matching the
# -S option, for the directory given with -D.

argv = sys.argv
parser = OptionParser()
parser.add_option("-T","--tag", dest="tag", default='', help="config tag")
parser.add_option("-D","--directory", dest="directory", default='MVAout', help="directory name, e.g. MVAout")
parser.add_option("-S","--sample", dest="sample", default='TT*', help="sample")
(opts, args) = parser.parse_args(argv)

config = XbbConfigTools(config=XbbConfigReader.read(opts.tag))
path = config.get("Directories", opts.directory)
sampleInfoDirectory = config.get('Directories', 'samplefiles')
info = ParseInfo(samples_path=path, config=config)

# only take first sample which matches
matchingIdentifiers = XbbTools.filterSampleList(info.getSampleIdentifiers(), XbbTools.parseSamplesList(opts.sample))
if len(matchingIdentifiers) < 1:
    # stop with a readable message instead of an IndexError
    sys.exit("ERROR: no sample matches '%s'" % opts.sample)
sampleIdentifier = matchingIdentifiers[0]

# get list of ORIGINAL file names for this sample: /store/...
sampleTreeFileNames = XbbTools.getSampleTreeFileNames(sampleInfoDirectory, sampleIdentifier)
if len(sampleTreeFileNames) < 1:
    sys.exit("ERROR: no files found for sample '%s'" % sampleIdentifier)

fileLocator = FileLocator(config=config)

# get local name of first file
localFilename = fileLocator.getFilePath(path, sampleIdentifier, sampleTreeFileNames[0])
print(localFilename)
# use the default configuration name if none was given
if opts.config == "":
    opts.config = "config"

# a non-empty weight option switches on the evaluation of the optimisation
weight = opts.weight
evaluate_optimisation = (weight != '')

#Import after configure to get help message
from myutils import BetterConfigParser, ParseInfo, MvaEvaluator

config = BetterConfigParser()
config.read(opts.config)
anaTag = config.get("Analysis", "tag")
fileLocator = FileLocator(config=config)

print ("OPTS", opts)
if len(opts.fileList) < 1:
    # no list given: take the first chunk when the sample is not split into chunks
    filelist = SampleTree(
        {
            'name': opts.sampleIdentifier,
            'folder': config.get('Directories', 'MVAin')
        },
        countOnly=True,
        splitFilesChunkSize=-1,
        config=config).getSampleFileNameChunks()[0]
    print ("INFO: no file list given, use all files!")
    print (len(filelist), filelist)
else:
    # file list was passed in compressed form
    filelist = FileList.decompress(opts.fileList)
    print ("len(filelist)", len(filelist))
    if len(filelist) > 0:
        print ("filelist[0]:", filelist[0])

#get locations:
Wdir = config.get('Directories', 'Wdir')
samplesinfo = config.get('Directories', 'samplesinfo')
def __init__(self, config, region, sampleIdentifier=None, opts=None):
    """Read everything needed for one plot region from the configuration.

    Args:
        config: configuration object (get/has_option are used).
        region: region name, the config section 'Plot:<region>' is read.
        sampleIdentifier: optional comma separated list of sample
            identifiers, if given only these samples are kept.
        opts: options object. Its attributes inputDir and outputDir are
            read unconditionally, so the default None does not work.
    """
    self.config = config
    self.region = region

    # list of selected sample identifiers, None: no selection
    if sampleIdentifier and len(sampleIdentifier) > 0:
        self.sampleIdentifiers = sampleIdentifier.split(',')
    else:
        self.sampleIdentifiers = None

    # VHbb namespace
    libraryName = config.get('VHbbNameSpace', 'library')
    loadStatus = ROOT.gSystem.Load(libraryName)
    if loadStatus == 0:
        print("INFO: loaded VHbbNameSpace: %s" % libraryName)
    else:
        print(
            "\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"
            % loadStatus)

    # input/output paths
    self.fileLocator = FileLocator(config=self.config)
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    self.samplesPath = config.get('Directories', 'plottingSamples')
    self.samplesInfo = ParseInfo(samples_path=self.samplesPath,
                                 config=self.config)
    self.sampleFilesFolder = config.get('Directories', 'samplefiles')
    self.plotPath = config.get('Directories', 'plotpath')

    # plot regions
    self.configSection = 'Plot:%s' % region

    # additional cut to only plot a subset of the region
    if self.config.has_option(self.configSection, 'subcut'):
        self.subcut = self.config.get(self.configSection, 'subcut')
        print("INFO: use cut:", self.subcut)
    else:
        self.subcut = None

    # additional global blinding cut:
    if self.config.has_option('Plot_general', 'addBlindingCut'):
        # contained in plots, cut on the event number
        self.addBlindingCut = self.config.get('Plot_general',
                                              'addBlindingCut')
        print('adding add. blinding cut:', self.addBlindingCut)
    else:
        self.addBlindingCut = None

    # load samples, the config values are evaluated as Python expressions
    dataExpression = self.config.get(self.configSection, 'Datas')
    self.data = eval(dataExpression)  # data of this region (section)
    mcExpression = self.config.get('Plot_general', 'samples')
    self.mc = eval(mcExpression)  # list of mc samples
    self.total_lumi = eval(self.config.get('General', 'lumi'))

    # a region with a 'Signal' option is a signal region, the signal is added to the MC list
    hasSignal = self.config.has_option(self.configSection, 'Signal')
    if hasSignal:
        self.mc.append(self.config.get(self.configSection, 'Signal'))
    self.signalRegion = True if hasSignal else False

    self.dataSamples = self.samplesInfo.get_samples(self.data)
    self.mcSamples = self.samplesInfo.get_samples(self.mc)

    # filter samples used in the plot
    selected = self.sampleIdentifiers
    if selected:
        self.dataSamples = [
            s for s in self.dataSamples if s.identifier in selected
        ]
        self.mcSamples = [
            s for s in self.mcSamples if s.identifier in selected
        ]
# Script fragment: reads the configuration file given by opts.config, loads the
# library configured as VHbbNameSpace/library with ROOT, reads the SYSin/SYSout/
# scratch folders and requires exactly one sample (not a subsample) to match
# opts.sampleIdentifier, otherwise it exits with code 1.
# NOTE(review): the fragment is cut off inside the final `collections = [`
# statement and its line breaks are lost, so the code is kept unchanged.
config = BetterConfigParser() config.read(opts.config) anaTag = config.get("Analysis", "tag") TrainFlag = eval(config.get('Analysis', 'TrainFlag')) btagLibrary = config.get('BTagReshaping', 'library') samplesinfo = config.get('Directories', 'samplesinfo') channel = config.get('Configuration', 'channel') VHbbNameSpace = config.get('VHbbNameSpace', 'library') ROOT.gSystem.Load(VHbbNameSpace) pathIN = config.get('Directories', 'SYSin') pathOUT = config.get('Directories', 'SYSout') tmpDir = config.get('Directories', 'scratch') print 'INput samples:\t%s' % pathIN print 'OUTput samples:\t%s' % pathOUT fileLocator = FileLocator(config=config) # samples info = ParseInfo(samplesinfo, pathIN) matchingSamples = [ x for x in info if x.identifier == opts.sampleIdentifier and not x.subsample ] if len(matchingSamples) != 1: print "need exactly 1 sample identifier as input with -S !!" print matchingSamples exit(1) sample = matchingSamples[0] # TODO: collections = [
class XbbRun:
    """Run a list of processing modules ("collections") on the files of one sample.

    For every sub job the input tree is opened, the modules given with
    --addCollections are created and registered at the SampleTree, the tree
    is processed into a temporary file on the scratch folder and the result
    is copied to the output folder.

    All messages are written with single-argument print() calls, so that the
    class can be parsed by Python 2 and Python 3.
    """

    def __init__(self, opts):
        """Read the configuration and prepare the list of sub jobs.

        Args:
            opts: options object, the attributes fileList, config, inputDir,
                outputDir, sampleIdentifier, addCollections and join are read
                here; force and friend are read in run().

        Exits with code 1 if opts.sampleIdentifier does not match exactly
        one sample.
        """
        # get file list, an empty fileList option gives an empty list (and no sub jobs)
        self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else []
        if len(self.filelist) > 0:
            print("len(filelist) %d filelist[0]: %s" % (len(self.filelist), self.filelist[0]))
        else:
            print("len(filelist) %d" % len(self.filelist))

        # config
        self.debug = 'XBBDEBUG' in os.environ
        self.verifyCopy = True
        self.opts = opts
        self.config = BetterConfigParser()
        self.config.read(opts.config)
        self.channel = self.config.get('Configuration', 'channel')

        # load namespace, TODO
        VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # directories
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        print('INput samples:\t%s' % self.pathIN)
        print('OUTput samples:\t%s' % self.pathOUT)

        self.fileLocator = FileLocator(config=self.config)

        # check if given sample identifier uniquely matches a samples from config
        matchingSamples = ParseInfo(samples_path=self.pathIN, config=self.config).find(identifier=opts.sampleIdentifier)
        if len(matchingSamples) != 1:
            print("ERROR: need exactly 1 sample identifier as input with -S !!")
            print(matchingSamples)
            exit(1)
        self.sample = matchingSamples[0]

        # collections: comma separated list, blank entries are dropped
        self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0]
        if len(self.collections) < 1:
            print("\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m")
        print('collections to add: %s' % (self.collections,))
        self.collections = self.parseCollectionList(self.collections)
        print('after parsing: %s' % (self.collections,))

        # temporary folder to save the files of this job on the scratch
        temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

        # input files
        self.subJobs = []
        if opts.join and len(self.filelist) > 0:
            # one sub job for all the files, named after the first file
            print("INFO: join input files! This is an experimental feature!")
            self.subJobs.append(self._createSubJob(self.filelist, temporaryName))
        else:
            # create separate subjob for all files (default!)
            for inputFileName in self.filelist:
                self.subJobs.append(self._createSubJob([inputFileName], temporaryName))

    def _createSubJob(self, inputFileNames, temporaryName):
        """Return the sub job dictionary for the given (non-empty) list of input files."""
        # translate naming convention of .txt file to imported files after the prep step
        namesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in inputFileNames]
        pathTemplate = "{path}/{subfolder}/{filename}"
        return {
            'inputFileNames': inputFileNames,
            'localInputFileNames': [pathTemplate.format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in namesAfterPrep],
            'outputFileName': pathTemplate.format(path=self.pathOUT, subfolder=self.sample.identifier, filename=namesAfterPrep[0]),
            'tmpFileName': pathTemplate.format(path=self.tmpDir, subfolder=temporaryName, filename=namesAfterPrep[0]),
        }

    # lists of single modules can be given instead of a module, "--addCollections Sys.all"
    # [Sys]
    # all = ['Sys.Vtype', 'Sys.Leptons', ...]
    # TODO: make it fully recursive
    def parseCollectionList(self, collections):
        """Replace collections which refer to a list in the config by the list entries.

        A collection 'Section.key' whose config value starts with '[' and
        ends with ']' is replaced by the elements of that list. Everything
        else is kept unchanged, including entries which are not found in the
        config (e.g. Python code like 'Module.Class()', which is handled in
        run()).

        Args:
            collections: list of strings.

        Returns:
            new list of strings.
        """
        collectionsListsReplaced = []
        for collection in collections:
            if '.' in collection:
                nameParts = collection.split('.')
                section = nameParts[0]
                key = nameParts[1]
                # only look up options which exist, get() would raise otherwise
                if self.config.has_section(section) and self.config.has_option(section, key):
                    listExpression = self.config.get(section, key).strip()
                    if listExpression.startswith('[') and listExpression.endswith(']'):
                        # NOTE(review): eval() of a config value, the config has to be trusted
                        collectionsListsReplaced.extend(eval(listExpression))
                        continue
            collectionsListsReplaced.append(collection)
        return collectionsListsReplaced

    def _openSampleTree(self, subJob):
        """Open the input tree of a sub job, return None if it can not be read."""
        sampleTree = SampleTree(subJob['localInputFileNames'], config=self.config)
        if sampleTree.tree:
            return sampleTree

        print("trying fallback... %d" % len(subJob['inputFileNames']))
        if len(subJob['inputFileNames']) != 1:
            print("\x1b[31mERROR: file does not exist or is broken, will be SKIPPED! (old naming scheme not supported for joining multipel files)\x1b[0m")
            return None

        # try original naming scheme if reading directly from Heppy/Nano ntuples (without prep)
        fileNameOriginal = self.pathIN + '/' + subJob['inputFileNames'][0]
        print("FO: %s" % fileNameOriginal)
        xrootdRedirector = self.fileLocator.getRedirector(fileNameOriginal)
        sampleTree = SampleTree([fileNameOriginal], config=self.config, xrootdRedirector=xrootdRedirector)
        if not sampleTree.tree:
            print("\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m")
            return None
        return sampleTree

    # to use this syntax, use "--addCollections Sys.Vtype" for a config file entry like this:
    # [Sys]
    # Vtype = VtypeCorrector.VtypeCorrector(channel='Zll')
    # (instead of passing the tree in the constructor, the setTree method can be used)
    def _createModules(self, sampleTree):
        """Create the module objects of all collections and register them at the tree.

        Returns:
            (pyModules, versionTable): list of module objects and list of
            [moduleName, version] entries.

        Raises:
            Exception: "ConfigError" if a collection is neither a config
                option nor a complete constructor call.
        """
        pyModules = []
        versionTable = []
        for collection in self.collections:
            if '.' not in collection:
                print("\x1b[31mERROR: config option not found: %s  the format should be: [Section].[Option]\x1b[0m" % collection)
                raise Exception("ConfigError")

            section = collection.split('.')[0]
            key = collection.split('.')[1]
            if self.config.has_section(section) and self.config.has_option(section, key):
                pyCode = self.config.get(section, key)
            elif '(' in collection and collection.endswith(')'):
                print("WARNING: config option %s  not found, interpreting it as Python code!" % collection)
                pyCode = collection
            else:
                print("\x1b[31mERROR: config option not found: %s . To specify Python code directly, pass a complete constructor, e.g. --addCollections 'Module.Class()'. Module has to be placed in python/myutils/ folder.\x1b[0m" % collection)
                raise Exception("ConfigError")

            # import module from myutils
            moduleName = pyCode.split('(')[0].split('.')[0].strip()
            if self.debug:
                print("DEBUG: import module: %s" % moduleName)
                print("\x1b[33mDEBUG: " + collection + ": run PYTHON code:\n"+pyCode+"\x1b[0m")
            # the module has to be a global name, because the code is evaluated below
            globals()[moduleName] = importlib.import_module(".{module}".format(module=moduleName), package="myutils")

            # get object
            # NOTE(review): eval() of config/command line input, both have to be trusted
            wObject = eval(pyCode)

            # pass the tree and other variables if needed to finalize initialization
            if hasattr(wObject, "customInit") and callable(getattr(wObject, "customInit")):
                wObject.customInit({'config': self.config,
                                    'sampleTree': sampleTree,
                                    'tree': sampleTree.tree,
                                    'sample': self.sample,
                                    'channel': self.channel,
                                    'pathIN': self.pathIN,
                                    'pathOUT': self.pathOUT,
                                    })

            # add callbacks if the objects provides any
            if hasattr(wObject, "processEvent") and callable(getattr(wObject, "processEvent")):
                sampleTree.addCallback('event', wObject.processEvent)
            for cb in ["finish", "prepareOutput"]:
                if hasattr(wObject, cb) and callable(getattr(wObject, cb)):
                    sampleTree.addCallback(cb, getattr(wObject, cb))

            # add branches
            if hasattr(wObject, "getBranches") and callable(getattr(wObject, "getBranches")):
                sampleTree.addOutputBranches(wObject.getBranches())

            pyModules.append(wObject)
            versionTable.append([moduleName, wObject.getVersion() if hasattr(wObject, "getVersion") else 0])
        return pyModules, versionTable

    def _copyOutput(self, subJob):
        """Copy the temporary file to the output folder, return True on success.

        If self.verifyCopy is set and the copied file is not a valid ROOT
        file, the copy is tried a second time.
        """
        try:
            self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True)
            print('copy  %s %s' % (subJob['tmpFileName'], subJob['outputFileName']))

            if self.verifyCopy and not self.fileLocator.isValidRootFile(subJob['outputFileName']):
                print('INFO: output at final destination broken, try to copy again from scratch disk to final destination...')
                self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True)
                print('INFO: second attempt copy done!')
                if not self.fileLocator.isValidRootFile(subJob['outputFileName']):
                    print('\x1b[31mERROR: output still broken!\x1b[0m')
                    raise Exception("FileCopyError")
                print('INFO: file is good after second attempt!')
        except Exception as e:
            print(e)
            print("\x1b[31mERROR: copy from scratch to final destination failed!!\x1b[0m")
            return False
        return True

    # run all subjobs
    def run(self):
        """Process all sub jobs.

        Sub jobs with an existing valid output file are skipped unless the
        force option is set. Files which can not be read or whose output can
        not be copied to the output folder are counted as failed.

        Raises:
            Exception: "ProcessingIncomplete" if at least one file failed,
                "ConfigError" if a collection can not be resolved.
        """
        nFilesFailed = 0
        for subJob in self.subJobs:

            # only process if output is non-existing/broken or --force was used
            if not self.opts.force and self.fileLocator.isValidRootFile(subJob['outputFileName']):
                print('SKIP: %s' % (subJob['inputFileNames'],))
                continue

            # create directories
            outputFolder = '/'.join(subJob['outputFileName'].split('/')[:-1])
            tmpFolder = '/'.join(subJob['tmpFileName'].split('/')[:-1])
            self.fileLocator.makedirs(outputFolder)
            self.fileLocator.makedirs(tmpFolder)

            # load sample tree
            sampleTree = self._openSampleTree(subJob)
            if sampleTree is None:
                nFilesFailed += 1
                continue

            pyModules, versionTable = self._createModules(sampleTree)
            for moduleName, moduleVersion in versionTable:
                print(" > {m}:{v}".format(m=moduleName, v=moduleVersion))

            # DEPRECATED, do not use anymore ---> use BranchTools.TreeFormulas()
            if 'addbranches' in self.collections:
                writeNewVariables = eval(self.config.get("Regression", "writeNewVariablesDict"))
                sampleTree.addOutputBranches(writeNewVariables)

            # DEPRECATED, do not use anymore ---> use BranchTools.Drop()
            if 'removebranches' in self.collections:
                bl_branch = eval(self.config.get('Branches', 'useless_branch'))
                for br in bl_branch:
                    sampleTree.addBranchToBlacklist(br)
                bl_branch = eval(self.config.get('Branches', 'useless_after_sys'))
                for br in bl_branch:
                    sampleTree.addBranchToBlacklist(br)

            # define output file
            sampleTree.addOutputTree(subJob['tmpFileName'], cut='1', branches='*', friend=self.opts.friend)

            # run processing
            for pyModule in pyModules:
                if hasattr(pyModule, "beforeProcessing"):
                    getattr(pyModule, "beforeProcessing")()

            sampleTree.process()

            for pyModule in pyModules:
                if hasattr(pyModule, "afterProcessing"):
                    getattr(pyModule, "afterProcessing")()

            # if output trees have been produced: copy temporary file to output folder
            if sampleTree.getNumberOfOutputTrees() > 0:
                if not self._copyOutput(subJob):
                    nFilesFailed += 1

            # delete temporary file
            try:
                self.fileLocator.rm(subJob['tmpFileName'])
            except Exception as e:
                print(e)
                print("WARNING: could not delete file on scratch!")

            # clean up all the modules (also works if there are none)
            for pyModule in pyModules:
                if hasattr(pyModule, "cleanUp") and callable(getattr(pyModule, "cleanUp")):
                    getattr(pyModule, "cleanUp")()

        if nFilesFailed > 0:
            raise Exception("ProcessingIncomplete")
class SkimsHelper(object):
    """Write the cached trees of all samples of one plot region into a single skim file."""

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """Read everything needed for one plot region from the configuration.

        Args:
            config: configuration object (get/has_option are used).
            region: region name, the config section 'Plot:<region>' is read.
            sampleIdentifier: optional comma separated list of sample
                identifiers, if given only these samples are kept.
            opts: options object. Its attributes inputDir and outputDir are
                read unconditionally, so the default None does not work.
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace = config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print ("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m"%returnCode)
        else:
            print ("INFO: loaded VHbbNameSpace: %s"%VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        self.samplesPath = config.get('Directories', 'plottingSamples')
        self.samplesDefinitions = config.get('Directories', 'samplesinfo')
        self.samplesInfo = ParseInfo(self.samplesDefinitions, self.samplesPath)
        self.sampleFilesFolder = config.get('Directories', 'samplefiles')
        self.plotPath = config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s'%region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):  # contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print ('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): the config values are passed to eval(), the config has to be trusted
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

    def prepare(self):
        """Collect the names of the cached files of all data and MC samples.

        The cut list of every sample is built in the same way as for the
        plots (sample subcut, region cut, optional 'Datacut', optional
        blinding cut). The configuration of this instance is passed to
        TreeCache. An error message is printed if no file is found.

        Returns:
            self
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache
            self.fileNames += TreeCache.TreeCache(
                sample=sample,
                cutList=sampleCuts,
                inputFolder=self.samplesPath,
                config=self.config
            ).findCachedFileNames()

        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write all collected files into one skim file and copy it to the output folder.

        The file is named skim_<channel>_<region>_<yymmdd>.root, where the
        region of this instance is used and '_' replaces the channel if it
        is not configured. The file is first written to the scratch folder;
        the temporary file is removed only if the copied file is a valid
        ROOT file.
        """
        name = self.config.get('Configuration', 'channel') if self.config.has_option('Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        skimName = 'skim_' + name + '_' + self.region + '_' + timestamp
        tmpName = self.tmpDir + '/' + skimName + '_tmp.root'
        destName = self.pathOUT + '/' + skimName + '.root'

        sampleTree = SampleTree(self.fileNames, config=self.config)

        # optional branch with a region dependent constant, -1 if the region is not listed
        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
            controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                if not self.fileLocator.isValidRootFile(destName):
                    print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def __init__(self, opts):
    """Read the configuration and prepare the list of sub-jobs.

    Args:
        opts: parsed command line options. The attributes read here are
            fileList, config, inputDir, outputDir, sampleIdentifier,
            addCollections and join.

    Exits with status 1 if the sample identifier does not match exactly
    one sample.
    """
    # get file list; an empty --fileList gives an empty list (not None), so
    # that len() and the iteration below cannot fail with a TypeError
    self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else []
    fileListInfo = "len(filelist) %d" % len(self.filelist)
    if len(self.filelist) > 0:
        fileListInfo += " filelist[0]: %s" % (self.filelist[0],)
    print(fileListInfo)

    # config
    self.debug = 'XBBDEBUG' in os.environ
    self.verifyCopy = True
    self.opts = opts
    self.config = BetterConfigParser()
    self.config.read(opts.config)
    samplesinfo = self.config.get('Directories', 'samplesinfo')
    self.channel = self.config.get('Configuration', 'channel')

    # load namespace, TODO
    VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)

    # directories
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    print('INput samples:\t%s' % self.pathIN)
    print('OUTput samples:\t%s' % self.pathOUT)

    self.fileLocator = FileLocator(config=self.config)

    # check if given sample identifier uniquely matches a samples from config
    matchingSamples = ParseInfo(samplesinfo, self.pathIN).find(identifier=opts.sampleIdentifier)
    if len(matchingSamples) != 1:
        print("ERROR: need exactly 1 sample identifier as input with -S !!")
        print(matchingSamples)
        exit(1)
    self.sample = matchingSamples[0]

    # collections: comma separated list, empty entries are dropped
    self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0]
    if len(self.collections) < 1:
        print("\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m")
    print('collections to add: %s' % (self.collections,))
    self.collections = self.parseCollectionList(self.collections)
    print('after parsing: %s' % (self.collections,))

    # temorary folder to save the files of this job on the scratch
    temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

    def makeSubJob(inputFileNames):
        # translate naming convention of .txt file to imported files after the prep step
        fileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in inputFileNames]
        pathTemplate = "{path}/{subfolder}/{filename}"
        return {
            'inputFileNames': inputFileNames,
            'localInputFileNames': [pathTemplate.format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in fileNamesAfterPrep],
            # output and temporary file are named after the first input file
            'outputFileName': pathTemplate.format(path=self.pathOUT, subfolder=self.sample.identifier, filename=fileNamesAfterPrep[0]),
            'tmpFileName': pathTemplate.format(path=self.tmpDir, subfolder=temporaryName, filename=fileNamesAfterPrep[0]),
        }

    # input files
    self.subJobs = []
    if opts.join:
        print("INFO: join input files! This is an experimental feature!")
        # one sub-job reading all files; nothing to join for an empty file list
        if len(self.filelist) > 0:
            self.subJobs.append(makeSubJob(self.filelist))
    else:
        # create separate subjob for all files (default!)
        for inputFileName in self.filelist:
            self.subJobs.append(makeSubJob([inputFileName]))
class XbbRun:
    """Runs the configured python modules over the files of one sample.

    For every sub-job a SampleTree is created from the input file(s), the
    modules given with --addCollections are attached to it, the tree is
    written to the scratch directory and then copied to the output folder.
    """

    def __init__(self, opts):
        """Read the configuration and prepare the list of sub-jobs.

        Args:
            opts: parsed command line options. The attributes read here are
                fileList, config, inputDir, outputDir, sampleIdentifier,
                addCollections and join (run() additionally reads force and
                friend).

        Exits with status 1 if the sample identifier does not match exactly
        one sample.
        """
        # get file list; an empty --fileList gives an empty list (not None), so
        # that len() and the iteration below cannot fail with a TypeError
        self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else []
        fileListInfo = "len(filelist) %d" % len(self.filelist)
        if len(self.filelist) > 0:
            fileListInfo += " filelist[0]: %s" % (self.filelist[0],)
        print(fileListInfo)

        # config
        self.debug = 'XBBDEBUG' in os.environ
        self.verifyCopy = True
        self.opts = opts
        self.config = BetterConfigParser()
        self.config.read(opts.config)
        samplesinfo = self.config.get('Directories', 'samplesinfo')
        self.channel = self.config.get('Configuration', 'channel')

        # load namespace, TODO
        VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
        ROOT.gSystem.Load(VHbbNameSpace)

        # directories
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        print('INput samples:\t%s' % self.pathIN)
        print('OUTput samples:\t%s' % self.pathOUT)

        self.fileLocator = FileLocator(config=self.config)

        # check if given sample identifier uniquely matches a samples from config
        matchingSamples = ParseInfo(samplesinfo, self.pathIN).find(identifier=opts.sampleIdentifier)
        if len(matchingSamples) != 1:
            print("ERROR: need exactly 1 sample identifier as input with -S !!")
            print(matchingSamples)
            exit(1)
        self.sample = matchingSamples[0]

        # collections: comma separated list, empty entries are dropped
        self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0]
        if len(self.collections) < 1:
            print("\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m")
        print('collections to add: %s' % (self.collections,))
        self.collections = self.parseCollectionList(self.collections)
        print('after parsing: %s' % (self.collections,))

        # temorary folder to save the files of this job on the scratch
        temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

        def makeSubJob(inputFileNames):
            # translate naming convention of .txt file to imported files after the prep step
            fileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in inputFileNames]
            pathTemplate = "{path}/{subfolder}/{filename}"
            return {
                'inputFileNames': inputFileNames,
                'localInputFileNames': [pathTemplate.format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in fileNamesAfterPrep],
                # output and temporary file are named after the first input file
                'outputFileName': pathTemplate.format(path=self.pathOUT, subfolder=self.sample.identifier, filename=fileNamesAfterPrep[0]),
                'tmpFileName': pathTemplate.format(path=self.tmpDir, subfolder=temporaryName, filename=fileNamesAfterPrep[0]),
            }

        # input files
        self.subJobs = []
        if opts.join:
            print("INFO: join input files! This is an experimental feature!")
            # one sub-job reading all files; nothing to join for an empty file list
            if len(self.filelist) > 0:
                self.subJobs.append(makeSubJob(self.filelist))
        else:
            # create separate subjob for all files (default!)
            for inputFileName in self.filelist:
                self.subJobs.append(makeSubJob([inputFileName]))

    # lists of single modules can be given instead of a module, "--addCollections Sys.all"
    # [Sys]
    # all = ['Sys.Vtype', 'Sys.Leptons', ...]
    # TODO: make it fully recursive
    def parseCollectionList(self, collections):
        """Expand collections which refer to a list in the config (one level only).

        A collection of the form "section.key" whose config value starts with
        '[' and ends with ']' is replaced by the elements of that list, all
        other collections are kept unchanged.

        Args:
            collections: list of collection names.

        Returns:
            New list of collection names with the lists expanded.
        """
        collectionsListsReplaced = []
        for collection in collections:
            if '.' in collection:
                section = collection.split('.')[0]
                key = collection.split('.')[1]
                listExpression = self.config.get(section, key).strip()
                if listExpression.startswith('[') and listExpression.endswith(']'):
                    # NOTE(review): eval() of a config value, the config file has to be trusted
                    collectionsListsReplaced.extend(eval(listExpression))
                    continue
            collectionsListsReplaced.append(collection)
        return collectionsListsReplaced

    def _loadSampleTree(self, subJob):
        """Return the SampleTree of a sub-job or None if it could not be loaded."""
        sampleTree = SampleTree(subJob['localInputFileNames'], config=self.config)
        if sampleTree.tree:
            return sampleTree

        print("trying fallback... %d" % len(subJob['inputFileNames']))
        if len(subJob['inputFileNames']) != 1:
            print("\x1b[31mERROR: file does not exist or is broken, will be SKIPPED! (old naming scheme not supported for joining multipel files)\x1b[0m")
            return None

        # try original naming scheme if reading directly from Heppy/Nano ntuples (without prep)
        fileNameOriginal = self.pathIN + '/' + subJob['inputFileNames'][0]
        print("FO: %s" % fileNameOriginal)
        xrootdRedirector = self.fileLocator.getRedirector(fileNameOriginal)
        sampleTree = SampleTree([fileNameOriginal], config=self.config, xrootdRedirector=xrootdRedirector)
        if not sampleTree.tree:
            print("\x1b[31mERROR: file does not exist or is broken, will be SKIPPED!\x1b[0m")
            return None
        return sampleTree

    def _copyOutput(self, subJob):
        """Copy the temporary file to the output folder.

        Returns True if the output was still broken after the second copy
        attempt, False otherwise (other copy errors are only printed).
        """
        stillBroken = False
        try:
            self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True)
            print('copy  %s %s' % (subJob['tmpFileName'], subJob['outputFileName']))
            if self.verifyCopy and not self.fileLocator.isValidRootFile(subJob['outputFileName']):
                print('INFO: output at final destination broken, try to copy again from scratch disk to final destination...')
                self.fileLocator.cp(subJob['tmpFileName'], subJob['outputFileName'], force=True)
                print('INFO: second attempt copy done!')
                if not self.fileLocator.isValidRootFile(subJob['outputFileName']):
                    print('\x1b[31mERROR: output still broken!\x1b[0m')
                    stillBroken = True
                    raise Exception("FileCopyError")
                else:
                    print('INFO: file is good after second attempt!')
        except Exception as e:
            print(e)
            print("\x1b[31mERROR: copy from scratch to final destination failed!!\x1b[0m")
        return stillBroken

    # run all subjobs
    def run(self):
        """Process all sub-jobs.

        Raises:
            Exception: "ProcessingIncomplete" if at least one sub-job was
                counted as failed (input not readable or output still broken
                after the second copy attempt).
        """
        nFilesFailed = 0
        for subJob in self.subJobs:
            # only process if output is non-existing/broken or --force was used
            if not self.opts.force and self.fileLocator.isValidRootFile(subJob['outputFileName']):
                print('SKIP: %s' % (subJob['inputFileNames'],))
                continue

            # create directories
            outputFolder = '/'.join(subJob['outputFileName'].split('/')[:-1])
            tmpFolder = '/'.join(subJob['tmpFileName'].split('/')[:-1])
            self.fileLocator.makedirs(outputFolder)
            self.fileLocator.makedirs(tmpFolder)

            # load sample tree
            sampleTree = self._loadSampleTree(subJob)
            if sampleTree is None:
                nFilesFailed += 1
                continue

            # to use this syntax, use "--addCollections Sys.Vtype" for a config file entry like this:
            # [Sys]
            # Vtype = VtypeCorrector.VtypeCorrector(channel='Zll')
            # (instead of passing the tree in the constructor, the setTree method can be used)
            pyModules = []
            for collection in self.collections:
                if '.' not in collection:
                    continue
                section = collection.split('.')[0]
                key = collection.split('.')[1]
                pyCode = self.config.get(section, key)

                # import module from myutils
                moduleName = pyCode.split('(')[0].split('.')[0].strip()
                if self.debug:
                    print("DEBUG: import module: %s" % moduleName)
                    print("\x1b[33mDEBUG: " + collection + ": run PYTHON code:\n"+pyCode+"\x1b[0m")
                globals()[moduleName] = importlib.import_module(".{module}".format(module=moduleName), package="myutils")

                # get object (evaluated here, so the code from the config still sees
                # the local names of run() like self, sampleTree and subJob)
                # NOTE(review): eval() of a config value, the config file has to be trusted
                wObject = eval(pyCode)

                # pass the tree and other variables if needed to finalize initialization
                if hasattr(wObject, "customInit") and callable(getattr(wObject, "customInit")):
                    wObject.customInit({
                        'config': self.config,
                        'sampleTree': sampleTree,
                        'tree': sampleTree.tree,
                        'sample': self.sample,
                        'channel': self.channel,
                        'pathIN': self.pathIN,
                        'pathOUT': self.pathOUT,
                    })

                # add callbacks if the objects provides any
                if hasattr(wObject, "processEvent") and callable(getattr(wObject, "processEvent")):
                    sampleTree.addCallback('event', wObject.processEvent)

                # add branches
                if hasattr(wObject, "getBranches") and callable(getattr(wObject, "getBranches")):
                    sampleTree.addOutputBranches(wObject.getBranches())

                pyModules.append(wObject)

            # DEPRECATED, do not use anymore ---> use BranchTools.TreeFormulas()
            if 'addbranches' in self.collections:
                writeNewVariables = eval(self.config.get("Regression", "writeNewVariablesDict"))
                sampleTree.addOutputBranches(writeNewVariables)

            # DEPRECATED, do not use anymore ---> use BranchTools.Drop()
            if 'removebranches' in self.collections:
                # read from self.config, a bare 'config' is not defined in this method
                for option in ['useless_branch', 'useless_after_sys']:
                    for br in eval(self.config.get('Branches', option)):
                        sampleTree.addBranchToBlacklist(br)

            # define output file
            sampleTree.addOutputTree(subJob['tmpFileName'], cut='1', branches='*', friend=self.opts.friend)

            # run processing
            for pyModule in pyModules:
                if hasattr(pyModule, "beforeProcessing"):
                    getattr(pyModule, "beforeProcessing")()
            sampleTree.process()
            for pyModule in pyModules:
                if hasattr(pyModule, "afterProcessing"):
                    getattr(pyModule, "afterProcessing")()

            # if output trees have been produced: copy temporary file to output folder
            if sampleTree.getNumberOfOutputTrees() > 0:
                if self._copyOutput(subJob):
                    nFilesFailed += 1

                # delete temporary file
                try:
                    self.fileLocator.rm(subJob['tmpFileName'])
                except Exception as e:
                    print(e)
                    print("WARNING: could not delete file on scratch!")

            # clean up all modules of this sub-job (not only the one created last,
            # and without failing if no module was created at all)
            for pyModule in pyModules:
                if hasattr(pyModule, "cleanUp") and callable(getattr(pyModule, "cleanUp")):
                    getattr(pyModule, "cleanUp")()

        if nFilesFailed > 0:
            raise Exception("ProcessingIncomplete")
# minimal config with the options of the MVA, its systematics and input variables
config = BetterConfigParser()
config.add_section(mvaName)
for mvaOption, mvaOptionValue in [
        ("tensorflowConfig", tensorflowConfig),
        ("scalerDump", scalerDump),
        ("checkpoint", checkpoint),
        ("branchName", branchName),
        ("nClasses", "%d"%nClasses),
        ("treeVarSet", "dnnVars"),
    ]:
    config.set(mvaName, mvaOption, mvaOptionValue)

config.add_section("systematics")
config.set("systematics", "systematics", " ".join(systematics))

# one option per systematic in the section referenced by treeVarSet
config.add_section("dnnVars")
for syst in systematics:
    config.set("dnnVars", syst, treeVarSet[syst])

# helper for fs operations
fileLocator = FileLocator(config=config, xrootdRedirector=xrootdRedirector)
fileLocator.mkdir(outputFolder)

# load input files
sampleTree = SampleTree([inputFile], treeName=inputTreeName, xrootdRedirector=xrootdRedirector)

# load tensorflow evaluator
tfe = tensorflowEvaluator.tensorflowEvaluator(mvaName)
tfe.customInit({
    'config': config,
    'sample': sample,
    'sampleTree': sampleTree,
})

# register callbacks for processing
sampleTree.addCallback('event', tfe.processEvent)

# define new branches to add
sampleTree.addOutputBranches(tfe.getBranches())
class PartialFileMerger(object):
    """Merges a chunk of input files of one sample into a single file.

    The merged file is written to the scratch directory first and then
    copied to the output directory.
    """

    def __init__(self, fileNames, chunkNumber, submitTime='000000_000000', force=False, config=None, sampleIdentifier=None, inputDir=None, outputDir=None):
        """
        Args:
            fileNames: names of the files to merge as written in the sample list.
            chunkNumber: integer used in the name of the merged file.
            submitTime: string used as a folder name of the merged file.
            force: passed to the copy to the final destination.
            config: config object, its 'Directories' section is read.
            sampleIdentifier: sample name used as sub-folder of all paths.
            inputDir: option in the 'Directories' section with the input
                folder, 'HADDin' if not given.
            outputDir: option in the 'Directories' section with the output
                folder, 'HADDout' if not given.
        """
        self.fileNames = fileNames
        self.debug = 'XBBDEBUG' in os.environ
        self.submitTime = submitTime
        self.chunkNumber = chunkNumber
        self.config = config
        self.fileLocator = FileLocator(config=self.config)

        # -O option (reoptimizing baskets) leads to crashes...
        self.commandTemplate = "hadd -k -ff {output} {inputs}"
        self.sampleIdentifier = sampleIdentifier
        self.force = force

        # use sampleTree class as replacement for hadd
        self.useChain = True

        # names of the config options, resolved only when the paths are needed
        self.inputDirKey = inputDir if inputDir else 'HADDin'
        self.outputDirKey = outputDir if outputDir else 'HADDout'

        self.mergedFileName = self._buildMergedFileName(self.fileNames, self.submitTime, chunkNumber)

    @staticmethod
    def _sha224Hex(text):
        """Return the SHA-224 hex digest of a string (encoded as UTF-8 if it is not bytes)."""
        data = text if isinstance(text, bytes) else text.encode('utf-8')
        return hashlib.sha224(data).hexdigest()

    @staticmethod
    def _buildMergedFileName(fileNames, submitTime, chunkNumber):
        """Build the fake name of the merged file from the names of its inputs.

        The last four path components of the first file name are replaced by
        <hash>/<submitTime>/0000/tree_<chunkNumber>.root, where the hash is
        computed from the sorted hashes of all file names and therefore does
        not depend on their order.
        """
        treeHashes = sorted(PartialFileMerger._sha224Hex(fileName) for fileName in fileNames)
        totalHash = PartialFileMerger._sha224Hex('-'.join(treeHashes))
        baseFolder = '/'.join(fileNames[0].split('/')[:-4])
        return baseFolder + '/' + totalHash + '/' + submitTime + '/0000/tree_%d.root' % chunkNumber

    # return a fake name which is written to sample list .txt files in order to keep compatibility to the method of converting file names in .txt
    # files to file names after prep step. This conversion applied to the fake name will give the real file name.
    def getMergedFakeFileName(self):
        return self.mergedFileName

    # real output file name where the file is stored
    def getOutputFileName(self):
        fakeFileName = self.getMergedFakeFileName()
        outputFileName = self.fileLocator.getFilenameAfterPrep(fakeFileName)
        return "{path}/{sample}/{fileName}".format(
            path=self.config.get('Directories', self.outputDirKey),
            sample=self.sampleIdentifier,
            fileName=outputFileName)

    # file name on the scratch directory, where the merged file is written first
    def getTemporaryFileName(self):
        fakeFileName = self.getMergedFakeFileName()
        outputFileName = self.fileLocator.getFilenameAfterPrep(fakeFileName)
        return "{path}/hadd/{sample}/{fileName}".format(
            path=self.config.get('Directories', 'scratch'),
            sample=self.sampleIdentifier,
            fileName=outputFileName)

    def run(self):
        """Merge the input files and copy the result to the output directory.

        Raises:
            Exception: "FileCopyError" if the copy to the final destination
                reports a failure, "HaddError" if hadd returns non-zero.
        """
        inputFolder = self.config.get('Directories', self.inputDirKey)
        inputFileNames = [
            "{path}/{sample}/{fileName}".format(
                path=inputFolder,
                sample=self.sampleIdentifier,
                fileName=self.fileLocator.getFilenameAfterPrep(fileName))
            for fileName in self.fileNames
        ]
        outputFileName = self.getTemporaryFileName()
        self.fileLocator.makedirs('/'.join(outputFileName.split('/')[:-1]))

        # the 'f' argument is not used by the current command template
        command = self.commandTemplate.format(output=outputFileName, inputs=' '.join(inputFileNames), f="-f" if self.force else "")
        if self.debug:
            print("DEBUG: run \x1b[34m", command, "\x1b[0m")

        if self.useChain:
            # use sampleTree class (can e.g. drop branches at the same time)
            sampleTree = SampleTree(inputFileNames, config=self.config)
            try:
                # NOTE(review): eval() of a config value, the config file has to be trusted
                removeBranches = eval(self.config.get('General', 'remove_branches'))
                for removeBranch in removeBranches:
                    sampleTree.addBranchToBlacklist(removeBranch)
                    print("DEBUG: disable branch ", removeBranch)
            except Exception as e:
                # also reached if the option is not present in the config
                print("DEBUG: could not disable branch:", e)
            sampleTree.addOutputTree(outputFileName, cut='1', branches='*')
            sampleTree.process()
            result = 0
        else:
            # standard hadd
            result = self.fileLocator.runCommand(command)
            print("INFO: hadd returned ", result)

        if result != 0:
            raise Exception("HaddError")

        finalOutputFileName = self.getOutputFileName()
        print("move file to final destination: \x1b[34m", finalOutputFileName, "\x1b[0m")
        self.fileLocator.makedirs('/'.join(finalOutputFileName.split('/')[:-1]))
        resultCopy = self.fileLocator.cp(outputFileName, finalOutputFileName, self.force)
        if not resultCopy:
            print("\x1b[31mERROR: copy failed\n from:", outputFileName, "\n to:", finalOutputFileName, "\n force:", self.force, "\x1b[0m")
            raise Exception("FileCopyError")

        # try to delete temporary file
        try:
            self.fileLocator.rm(outputFileName)
        except Exception as e:
            print("ERROR: could not delete temporary file:", outputFileName, " => ", e)
        print("INFO: done.")
class PartialFileMerger(object):
    """Merges a chunk of input files of one sample into a single file.

    The merged file is written to the scratch directory first and then
    copied to the output directory.
    """

    def __init__(self, fileNames, chunkNumber, submitTime='000000_000000', force=False, config=None, sampleIdentifier=None, inputDir=None, outputDir=None):
        """
        Args:
            fileNames: names of the files to merge as written in the sample list.
            chunkNumber: integer used in the name of the merged file.
            submitTime: string used as a folder name of the merged file.
            force: passed to the copy to the final destination.
            config: config object, its 'Directories' section is read.
            sampleIdentifier: sample name used as sub-folder of all paths.
            inputDir: option in the 'Directories' section with the input
                folder, 'HADDin' if not given.
            outputDir: option in the 'Directories' section with the output
                folder, 'HADDout' if not given.
        """
        self.fileNames = fileNames
        self.debug = 'XBBDEBUG' in os.environ
        self.submitTime = submitTime
        self.chunkNumber = chunkNumber
        self.config = config
        self.fileLocator = FileLocator(config=self.config)

        # -O option (reoptimizing baskets) leads to crashes...
        self.commandTemplate = "hadd -k -ff {output} {inputs}"
        self.sampleIdentifier = sampleIdentifier
        self.force = force

        # use sampleTree class as replacement for hadd
        self.useChain = True

        # names of the config options, resolved only when the paths are needed
        self.inputDirKey = inputDir if inputDir else 'HADDin'
        self.outputDirKey = outputDir if outputDir else 'HADDout'

        self.mergedFileName = self._buildMergedFileName(self.fileNames, self.submitTime, chunkNumber)

    @staticmethod
    def _sha224Hex(text):
        """Return the SHA-224 hex digest of a string (encoded as UTF-8 if it is not bytes)."""
        data = text if isinstance(text, bytes) else text.encode('utf-8')
        return hashlib.sha224(data).hexdigest()

    @staticmethod
    def _buildMergedFileName(fileNames, submitTime, chunkNumber):
        """Build the fake name of the merged file from the names of its inputs.

        The last four path components of the first file name are replaced by
        <hash>/<submitTime>/0000/tree_<chunkNumber>.root, where the hash is
        computed from the sorted hashes of all file names and therefore does
        not depend on their order.
        """
        treeHashes = sorted(PartialFileMerger._sha224Hex(fileName) for fileName in fileNames)
        totalHash = PartialFileMerger._sha224Hex('-'.join(treeHashes))
        baseFolder = '/'.join(fileNames[0].split('/')[:-4])
        return baseFolder + '/' + totalHash + '/' + submitTime + '/0000/tree_%d.root' % chunkNumber

    # return a fake name which is written to sample list .txt files in order to keep compatibility to the method of converting file names in .txt
    # files to file names after prep step. This conversion applied to the fake name will give the real file name.
    def getMergedFakeFileName(self):
        return self.mergedFileName

    # real output file name where the file is stored
    def getOutputFileName(self):
        fakeFileName = self.getMergedFakeFileName()
        outputFileName = self.fileLocator.getFilenameAfterPrep(fakeFileName)
        return "{path}/{sample}/{fileName}".format(path=self.config.get('Directories', self.outputDirKey), sample=self.sampleIdentifier, fileName=outputFileName)

    # file name on the scratch directory, where the merged file is written first
    def getTemporaryFileName(self):
        fakeFileName = self.getMergedFakeFileName()
        outputFileName = self.fileLocator.getFilenameAfterPrep(fakeFileName)
        return "{path}/hadd/{sample}/{fileName}".format(path=self.config.get('Directories', 'scratch'), sample=self.sampleIdentifier, fileName=outputFileName)

    def run(self):
        """Merge the input files and copy the result to the output directory.

        Raises:
            Exception: "FileCopyError" if the copy to the final destination
                reports a failure, "HaddError" if hadd returns non-zero.
        """
        inputFolder = self.config.get('Directories', self.inputDirKey)
        inputFileNames = ["{path}/{sample}/{fileName}".format(path=inputFolder, sample=self.sampleIdentifier, fileName=self.fileLocator.getFilenameAfterPrep(fileName)) for fileName in self.fileNames]
        outputFileName = self.getTemporaryFileName()
        self.fileLocator.makedirs('/'.join(outputFileName.split('/')[:-1]))

        # the 'f' argument is not used by the current command template
        command = self.commandTemplate.format(output=outputFileName, inputs=' '.join(inputFileNames), f="-f" if self.force else "")
        if self.debug:
            print("DEBUG: run \x1b[34m", command, "\x1b[0m")

        if self.useChain:
            # use sampleTree class (can e.g. drop branches at the same time)
            sampleTree = SampleTree(inputFileNames, config=self.config)
            try:
                # NOTE(review): eval() of a config value, the config file has to be trusted
                removeBranches = eval(self.config.get('General', 'remove_branches'))
                for removeBranch in removeBranches:
                    sampleTree.addBranchToBlacklist(removeBranch)
                    print("DEBUG: disable branch ", removeBranch)
            except Exception as e:
                # also reached if the option is not present in the config
                print("DEBUG: could not disable branch:", e)
            sampleTree.addOutputTree(outputFileName, cut='1', branches='*')
            sampleTree.process()
            result = 0
        else:
            # standard hadd
            result = self.fileLocator.runCommand(command)
            print("INFO: hadd returned ", result)

        if result != 0:
            raise Exception("HaddError")

        finalOutputFileName = self.getOutputFileName()
        print("move file to final destination: \x1b[34m", finalOutputFileName, "\x1b[0m")
        self.fileLocator.makedirs('/'.join(finalOutputFileName.split('/')[:-1]))
        resultCopy = self.fileLocator.cp(outputFileName, finalOutputFileName, self.force)
        if not resultCopy:
            print("\x1b[31mERROR: copy failed\n from:", outputFileName, "\n to:", finalOutputFileName, "\n force:", self.force, "\x1b[0m")
            raise Exception("FileCopyError")

        # try to delete temporary file
        try:
            self.fileLocator.rm(outputFileName)
        except Exception as e:
            print("ERROR: could not delete temporary file:", outputFileName, " => ", e)
        print("INFO: done.")
# create minimal Xbb config
config = BetterConfigParser()

# section of the MVA itself
config.add_section(mvaName)
mvaOptions = (
    ("tensorflowConfig", tensorflowConfig),
    ("scalerDump", scalerDump),
    ("checkpoint", checkpoint),
    ("branchName", branchName),
    ("treeVarSet", "dnnVars"),
)
for mvaOption, mvaOptionValue in mvaOptions:
    config.set(mvaName, mvaOption, mvaOptionValue)

# list of systematics and the input variables for each of them
config.add_section("systematics")
config.set("systematics", "systematics", " ".join(systematics))
config.add_section("dnnVars")
for syst in systematics:
    config.set("dnnVars", syst, treeVarSet[syst])

# helper for fs operations
fileLocator = FileLocator(config=config, xrootdRedirector=xrootdRedirector)
fileLocator.mkdir(outputFolder)

# load input files
sampleTree = SampleTree([inputFile], treeName="tree", xrootdRedirector=xrootdRedirector)

# load tensorflow evaluator
tfe = tensorflowEvaluator.tensorflowEvaluator(mvaName)
tfe.customInit({
    'config': config,
    'sample': sample,
    'sampleTree': sampleTree,
})

# register callbacks for processing
sampleTree.addCallback('event', tfe.processEvent)

# define new branches to add
sampleTree.addOutputBranches(tfe.getBranches())
class SkimsHelper(object):
    """Writes the cached trees of all samples of a plot region into one skim file."""

    def __init__(self, config, region, sampleIdentifier=None, opts=None):
        """
        Args:
            config: config object.
            region: name of the region, the section 'Plot:<region>' is read.
            sampleIdentifier: optional comma separated list of sample
                identifiers, only these samples are used if given.
            opts: options object, its inputDir and outputDir attributes are
                read (so it has to be given despite the default of None).
        """
        self.config = config
        self.region = region
        self.sampleIdentifiers = sampleIdentifier.split(',') if sampleIdentifier and len(sampleIdentifier) > 0 else None

        # VHbb namespace
        VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
        returnCode = ROOT.gSystem.Load(VHbbNameSpace)
        if returnCode != 0:
            print("\x1b[31mERROR: loading VHbbNameSpace failed with code %d\x1b[0m" % returnCode)
        else:
            print("INFO: loaded VHbbNameSpace: %s" % VHbbNameSpace)

        # input/output paths
        self.fileLocator = FileLocator(config=self.config)
        self.pathIN = self.config.get('Directories', opts.inputDir)
        self.pathOUT = self.config.get('Directories', opts.outputDir)
        self.tmpDir = self.config.get('Directories', 'scratch')
        self.samplesPath = self.config.get('Directories', 'plottingSamples')
        self.samplesInfo = ParseInfo(samples_path=self.samplesPath, config=self.config)
        self.sampleFilesFolder = self.config.get('Directories', 'samplefiles')
        self.plotPath = self.config.get('Directories', 'plotpath')

        # plot regions
        self.configSection = 'Plot:%s' % region

        # additional cut to only plot a subset of the region
        self.subcut = None
        if self.config.has_option(self.configSection, 'subcut'):
            self.subcut = self.config.get(self.configSection, 'subcut')
            print("INFO: use cut:", self.subcut)

        # additional global blinding cut:
        self.addBlindingCut = None
        if self.config.has_option('Plot_general', 'addBlindingCut'):  # contained in plots, cut on the event number
            self.addBlindingCut = self.config.get('Plot_general', 'addBlindingCut')
            print('adding add. blinding cut:', self.addBlindingCut)

        # load samples
        # NOTE(review): eval() of config values, the config file has to be trusted
        self.data = eval(self.config.get(self.configSection, 'Datas'))  # read the data corresponding to each CR (section)
        self.mc = eval(self.config.get('Plot_general', 'samples'))  # read the list of mc samples
        self.total_lumi = eval(self.config.get('General', 'lumi'))
        self.signalRegion = False
        if self.config.has_option(self.configSection, 'Signal'):
            self.mc.append(self.config.get(self.configSection, 'Signal'))
            self.signalRegion = True
        self.dataSamples = self.samplesInfo.get_samples(self.data)
        self.mcSamples = self.samplesInfo.get_samples(self.mc)

        # filter samples used in the plot
        if self.sampleIdentifiers:
            self.dataSamples = [x for x in self.dataSamples if x.identifier in self.sampleIdentifiers]
            self.mcSamples = [x for x in self.mcSamples if x.identifier in self.sampleIdentifiers]

    def prepare(self):
        """Collect the names of the cached files of all samples.

        Returns:
            self

        Raises:
            Exception: "NotCached" if a sample is not cached.
        """
        # add DATA + MC samples
        self.fileNames = []
        for sample in self.dataSamples + self.mcSamples:
            print(sample.identifier)

            # cuts
            sampleCuts = [sample.subcut]
            if self.config.has_option('Cuts', self.region):
                sampleCuts.append(self.config.get('Cuts', self.region))
            if self.config.has_option(self.configSection, 'Datacut'):
                sampleCuts.append(self.config.get(self.configSection, 'Datacut'))
            if self.addBlindingCut:
                sampleCuts.append(self.addBlindingCut)

            # get sample tree from cache (use the config of this object and
            # not a module level variable which happens to be called 'config')
            tc = TreeCache.TreeCache(sample=sample, cutList=sampleCuts, inputFolder=self.samplesPath, config=self.config)
            if not tc.isCached():
                print("ERROR: not cached, run cacheplot again")
                raise Exception("NotCached")
            self.fileNames += tc.findCachedFileNames()

        if len(self.fileNames) < 1:
            print("\x1b[31mERROR: no files found, run cacheplot!\x1b[0m")
        return self

    def run(self):
        """Write all collected files into one tree and copy it to the output path."""
        name = self.config.get('Configuration', 'channel') if self.config.has_option('Configuration', 'channel') else '_'
        timestamp = datetime.datetime.now().strftime("%y%m%d")
        # self.region instead of a module level 'region' variable
        skimName = 'skim_' + name + '_' + self.region + '_' + timestamp
        tmpName = self.tmpDir + '/' + skimName + '_tmp.root'
        destName = self.pathOUT + '/' + skimName + '.root'

        sampleTree = SampleTree(self.fileNames, config=self.config)

        if self.config.has_option('Plot_general', 'controlSample'):
            controlSampleDict = eval(self.config.get('Plot_general', 'controlSample'))
            # -1 for regions which are not listed
            controlSample = controlSampleDict[self.region] if self.region in controlSampleDict else -1
            sampleTree.addOutputBranch("controlSample", lambda x: controlSample, branchType="i")
            print("INFO: setting controlSample to", controlSample)

        sampleTree.addOutputTree(tmpName, cut='1', branches='*', friend=False)
        sampleTree.process()

        # copy to final destination
        if sampleTree.getNumberOfOutputTrees() > 0:
            try:
                self.fileLocator.cp(tmpName, destName, force=True)
                print('copy ', tmpName, destName)

                if not self.fileLocator.isValidRootFile(destName):
                    print("\x1b[31mERROR: copy failed, output is broken!\x1b[0m")
                else:
                    # the temporary file is only removed if the copy is valid
                    try:
                        self.fileLocator.rm(tmpName)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print("\x1b[31mERROR: copy failed!", e, "\x1b[0m")
def __init__(self, opts):
    """Read the configuration and prepare the list of sub-jobs.

    Args:
        opts: parsed command line options. The attributes read here are
            fileList, config, inputDir, outputDir, sampleIdentifier,
            addCollections and join.

    Exits with status 1 if the sample identifier does not match exactly
    one sample.
    """
    # get file list; an empty --fileList gives an empty list (not None), so
    # that len() and the iteration below cannot fail with a TypeError
    self.filelist = FileList.decompress(opts.fileList) if len(opts.fileList) > 0 else []
    fileListInfo = "len(filelist) %d" % len(self.filelist)
    if len(self.filelist) > 0:
        fileListInfo += " filelist[0]: %s" % (self.filelist[0],)
    print(fileListInfo)

    # config
    self.debug = 'XBBDEBUG' in os.environ
    self.verifyCopy = True
    self.opts = opts
    self.config = BetterConfigParser()
    self.config.read(opts.config)
    self.channel = self.config.get('Configuration', 'channel')

    # load namespace, TODO
    VHbbNameSpace = self.config.get('VHbbNameSpace', 'library')
    ROOT.gSystem.Load(VHbbNameSpace)

    # directories
    self.pathIN = self.config.get('Directories', opts.inputDir)
    self.pathOUT = self.config.get('Directories', opts.outputDir)
    self.tmpDir = self.config.get('Directories', 'scratch')
    print('INput samples:\t%s' % self.pathIN)
    print('OUTput samples:\t%s' % self.pathOUT)

    self.fileLocator = FileLocator(config=self.config)

    # check if given sample identifier uniquely matches a samples from config
    matchingSamples = ParseInfo(samples_path=self.pathIN, config=self.config).find(identifier=opts.sampleIdentifier)
    if len(matchingSamples) != 1:
        print("ERROR: need exactly 1 sample identifier as input with -S !!")
        print(matchingSamples)
        exit(1)
    self.sample = matchingSamples[0]

    # collections: comma separated list, empty entries are dropped
    self.collections = [x.strip() for x in opts.addCollections.split(',') if len(x.strip()) > 0]
    if len(self.collections) < 1:
        print("\x1b[31mWARNING: no collections added! Specify the collections to add with the --addCollections option!\x1b[0m")
    print('collections to add: %s' % (self.collections,))
    self.collections = self.parseCollectionList(self.collections)
    print('after parsing: %s' % (self.collections,))

    # temorary folder to save the files of this job on the scratch
    temporaryName = self.sample.identifier + '/' + uuid.uuid4().hex

    def makeSubJob(inputFileNames):
        # translate naming convention of .txt file to imported files after the prep step
        fileNamesAfterPrep = [self.fileLocator.getFilenameAfterPrep(x) for x in inputFileNames]
        pathTemplate = "{path}/{subfolder}/{filename}"
        return {
            'inputFileNames': inputFileNames,
            'localInputFileNames': [pathTemplate.format(path=self.pathIN, subfolder=self.sample.identifier, filename=localFileName) for localFileName in fileNamesAfterPrep],
            # output and temporary file are named after the first input file
            'outputFileName': pathTemplate.format(path=self.pathOUT, subfolder=self.sample.identifier, filename=fileNamesAfterPrep[0]),
            'tmpFileName': pathTemplate.format(path=self.tmpDir, subfolder=temporaryName, filename=fileNamesAfterPrep[0]),
        }

    # input files
    self.subJobs = []
    if opts.join:
        print("INFO: join input files! This is an experimental feature!")
        # one sub-job reading all files; nothing to join for an empty file list
        if len(self.filelist) > 0:
            self.subJobs.append(makeSubJob(self.filelist))
    else:
        # create separate subjob for all files (default!)
        for inputFileName in self.filelist:
            self.subJobs.append(makeSubJob([inputFileName]))
action="store_true", dest="force", default=False, help="force overwriting of already cached files") (opts, args) = parser.parse_args(sys.argv) if opts.config == "": opts.config = "config" #Import after configure to get help message from myutils import BetterConfigParser, ParseInfo, MvaEvaluator config = BetterConfigParser() config.read(opts.config) anaTag = config.get("Analysis", "tag") # get list of files to process fileLocator = FileLocator(config=config) if len(opts.fileList) > 0: filelist = FileList.decompress( opts.fileList) if len(opts.fileList) > 0 else None print("len(filelist)", len(filelist)) if len(filelist) > 0: print("filelist[0]:", filelist[0]) else: filelist = SampleTree( { 'name': opts.sampleIdentifier, 'folder': config.get('Directories', 'MVAin') }, countOnly=True, splitFilesChunkSize=-1, config=config).getSampleFileNameChunks()[0]