def customInit(self, initVars):
    """Initialize the weight-branch module from the shared initVars dict.

    Registers the output branches and, for MC samples, prepares the weight
    formula and the cross-section scale factor.

    Args:
        initVars: dict with keys 'sample', 'sampleTree', 'config', 'pathIN'.
    """
    self.sample = initVars['sample']
    self.sampleTree = initVars['sampleTree']
    self.config = initVars['config']
    self.addBranch(self.branchName)
    self.addBranch("weightF")
    self.addBranch("weightXS")
    # weight formulas only apply to simulation; data still gets the branches
    if not self.sample.isData():
        self.weightString = self.config.get('Weights', 'weightF')
        # per sample special weight
        # NOTE(review): eval() on a config value is unsafe if the config file
        # can come from an untrusted source -- consider a boolean parse instead.
        if self.config.has_option('Weights', 'useSpecialWeight') and eval(self.config.get('Weights', 'useSpecialWeight')):
            specialweight = self.sample.specialweight
            self.weightString = "(({weight})*({specialweight}))".format(weight=self.weightString, specialweight=specialweight)
            print("INFO: use specialweight: {specialweight}".format(specialweight=specialweight))
        self.evalCut = self.config.get('Cuts', 'EvalCut')
        self.sampleTree.addFormula(self.weightString)
        self.sampleTree.addFormula(self.evalCut)
        self.excludeTrainingSet = False
        # to compute the correct scale to cross-section, all trees of the sample have to be used!
        sampleTreeForCount = SampleTree({'sample': self.sample, 'folder': initVars['pathIN']}, config=self.config)
        self.weightScaleToXS = sampleTreeForCount.getScale(self.sample) * (2.0 if self.excludeTrainingSet else 1.0)
        # FIX: was a Python-2-only print statement; the rest of the block uses print()
        print("scale:", self.weightScaleToXS, self.sample)
def customInit(self, initVars):
    """Initialize the weight-branch module from the shared initVars dict.

    Registers the output branches and, for MC samples, prepares the weight
    formula and the cross-section scale factor.

    Args:
        initVars: dict with keys 'sample', 'sampleTree', 'config', 'pathIN'.
    """
    self.sample = initVars['sample']
    self.sampleTree = initVars['sampleTree']
    self.config = initVars['config']
    self.addBranch(self.branchName)
    self.addBranch("weightF")
    self.addBranch("weightXS")
    # weight formulas only apply to simulation; data still gets the branches
    if not self.sample.isData():
        self.weightString = self.config.get('Weights', 'weightF')
        # per sample special weight
        # NOTE(review): eval() on a config value is unsafe if the config file
        # can come from an untrusted source -- consider a boolean parse instead.
        if self.config.has_option('Weights', 'useSpecialWeight') and eval(
                self.config.get('Weights', 'useSpecialWeight')):
            specialweight = self.sample.specialweight
            self.weightString = "(({weight})*({specialweight}))".format(
                weight=self.weightString, specialweight=specialweight)
            print("INFO: use specialweight: {specialweight}".format(
                specialweight=specialweight))
        self.evalCut = self.config.get('Cuts', 'EvalCut')
        self.sampleTree.addFormula(self.weightString)
        self.sampleTree.addFormula(self.evalCut)
        self.excludeTrainingSet = False
        # to compute the correct scale to cross-section, all trees of the sample have to be used!
        sampleTreeForCount = SampleTree(
            {
                'sample': self.sample,
                'folder': initVars['pathIN']
            }, config=self.config)
        self.weightScaleToXS = sampleTreeForCount.getScale(
            self.sample) * (2.0 if self.excludeTrainingSet else 1.0)
        # FIX: was a Python-2-only print statement; the rest of the block uses print()
        print("scale:", self.weightScaleToXS, self.sample)
def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None, fileLocator=None):
    """Set up the tree cache for one sample.

    Args:
        sample: either a Sample object (may be a subsample with its own
            name/identifier) or a plain identifier string.
        cutList: cut expression(s) used for skimming.
        branches: branches to keep; None keeps everything.
        inputFolder/tmpFolder/outputFolder: I/O locations; tmp and output
            default to the config 'Directories' section when a config is given.
        chunkNumber: 1-based part number this instance processes (-1 = all).
        splitFilesChunks / splitFilesChunkSize: how the cached output is split.
        debug: verbose printing; also enabled by the XBBDEBUG env variable.
        fileList: unused here -- kept for interface compatibility.
        cutSequenceMode: how multiple cuts are combined (e.g. 'AND').
        name: optional name passed through to the output tree.
        config: framework configuration object (ConfigParser-like).
        fileLocator: optional shared FileLocator; one is created if absent.
    """
    self.config = config
    # reuse an externally provided FileLocator to share its state/caching
    self.fileLocator = fileLocator if fileLocator is not None else FileLocator(config=self.config)
    self.debug = debug or ('XBBDEBUG' in os.environ)

    # SAMPLE
    if isinstance(sample, Sample):
        # sample passed as Sample object
        # count number of chunks the cached data is split into
        defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if config.has_option('General', 'mergeCachingSize') else 100
        splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
        # countOnly=True: the SampleTree is only used to count the parts,
        # no trees are actually loaded here
        splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug, fileLocator=self.fileLocator).getNumberOfParts()
        # if sample passed as object, it can be a 'subsample' and have different name and identifier
        self.sample = sample.name
        self.sampleIdentifier = sample.identifier
        if self.debug:
            print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
    else:
        # sample identifier passed as string
        self.sample = sample
        self.sampleIdentifier = sample
    self.name = name

    # CUTS
    self.cutList = cutList
    self.cutSequenceMode = cutSequenceMode
    self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

    # PATHS
    self.inputFolder = inputFolder
    self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
    self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
    self.cachedFileNames = []
    self.tmpFiles = []
    self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

    # BRANCHES and chunk information
    self.branches = branches
    self.branchesForHash = None  # for now make hash independent of selected branches
    self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
    self.chunkNumber = chunkNumber
    self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
    self.splitFilesChunkSize = splitFilesChunkSize

    # identifier is just used as an arbitrary name for print-out
    cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
    self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
    self.sampleTree = None
    self.isCachedChecked = False
    self.createFolders()
def getTree(self):
    """Build and return a SampleTree from the cached files.

    Returns the (possibly None) self.sampleTree unchanged when the cache
    check fails.
    """
    # Reuse an earlier positive cache check to avoid re-scanning the files.
    cached = self.isCachedChecked or self.isCached()
    if cached:
        tree = SampleTree(self.cachedFileNames, config=self.config)
        tree.sampleIdentifier = self.sampleIdentifier
        self.sampleTree = tree
    return self.sampleTree
def getTree(self, chunkSize=-1, chunkNumber=-1):
    """Build and return a SampleTree from the cached files.

    Args:
        chunkSize, chunkNumber: when both positive, only the 1-based
            chunkNumber-th slice of chunkSize files is processed; when both
            negative (the defaults), all cached files are processed. Any
            other combination raises Exception("InvalidParameters").

    Raises:
        Exception("IncompleteTree") when the files exist but cannot all be
        read back into a complete tree.
    """
    # An earlier positive cache check can be reused directly.
    cached = self.isCachedChecked or self.isCached()
    if cached:
        if chunkSize > 0 and chunkNumber > 0:
            # pick the requested slice of the cached file list (1-based)
            begin = (chunkNumber - 1) * chunkSize
            fileNames = self.cachedFileNames[begin:begin + chunkSize]
        elif chunkSize < 0 and chunkNumber < 0:
            fileNames = self.cachedFileNames
        else:
            raise Exception("InvalidParameters")
        self.sampleTree = SampleTree(self.cachedFileNames, config=self.config, fileNamesToProcess=fileNames)
        self.sampleTree.sampleIdentifier = self.sampleIdentifier
        # even if all files exist, some might be unreadable and the tree
        # would then be incomplete
        if not self.sampleTree.isCompleteTree():
            raise Exception("IncompleteTree")
    return self.sampleTree
def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None):
    """Set up the tree cache for one sample.

    Args:
        sample: either a Sample object (may be a subsample with its own
            name/identifier) or a plain identifier string.
        cutList: cut expression(s) used for skimming.
        branches: branches to keep; None keeps everything.
        inputFolder/tmpFolder/outputFolder: I/O locations; tmp and output
            default to the config 'Directories' section when a config is given.
        chunkNumber: 1-based part number this instance processes (-1 = all).
        splitFilesChunks / splitFilesChunkSize: how the cached output is split.
        debug: verbose printing; also enabled by the XBBDEBUG env variable.
        fileList: unused here -- kept for interface compatibility.
        cutSequenceMode: how multiple cuts are combined (e.g. 'AND').
        name: optional name passed through to the output tree.
        config: framework configuration object (ConfigParser-like).
    """
    self.config = config
    self.fileLocator = FileLocator(config=self.config)
    self.debug = debug or ('XBBDEBUG' in os.environ)

    # SAMPLE
    if isinstance(sample, Sample):
        # sample passed as Sample object
        # count number of chunks the cached data is split into
        defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if config.has_option('General', 'mergeCachingSize') else 100
        splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
        # countOnly=True: only counts the parts, no trees are loaded here
        splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug).getNumberOfParts()
        # if sample passed as object, it can be a 'subsample' and have different name and identifier
        self.sample = sample.name
        self.sampleIdentifier = sample.identifier
        if self.debug:
            print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
    else:
        # sample identifier passed as string
        self.sample = sample
        self.sampleIdentifier = sample
    self.name = name

    # CUTS
    self.cutList = cutList
    self.cutSequenceMode = cutSequenceMode
    self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

    # PATHS
    self.inputFolder = inputFolder
    self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
    self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
    self.cachedFileNames = []
    self.tmpFiles = []
    self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

    # BRANCHES and chunk information
    self.branches = branches
    self.branchesForHash = None  # for now make hash independent of selected branches
    self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
    self.chunkNumber = chunkNumber
    self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
    self.splitFilesChunkSize = splitFilesChunkSize

    # identifier is just used as an arbitrary name for print-out
    cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
    self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
    self.sampleTree = None
    self.isCachedChecked = False
    self.createFolders()
def setStartingState(self, garbageSize, numLinear, numCircular):
    """Initialize the contig pool so the input parameters are met exactly.

    Creates one dead circular "garbage" contig of garbageSize bases (if any),
    then numLinear linear and numCircular circular live contigs whose sizes
    are all equal modulo rounding, so the total weight is exactly self.N.

    Args:
        garbageSize: number of bases in the single dead contig (0 = none).
        numLinear: number of live linear contigs to create.
        numCircular: number of live circular contigs to create.
    """
    assert self.N > garbageSize + numLinear + numCircular
    self.pool = SampleTree()
    numGarbage = 0
    if garbageSize > 0:
        garbage = CircularContig(garbageSize)
        garbage.setDead()
        self.pool.insert(garbage, garbage.numBases())
        numGarbage = 1
    # split the non-garbage bases proportionally between linear and circular
    lrat = float(numLinear) / (numLinear + numCircular)
    crat = float(numCircular) / (numLinear + numCircular)
    # floor one share and ceil the other so the two sum to N - garbageSize
    linearBases = math.floor((self.N - garbageSize) * lrat)
    circularBases = math.ceil((self.N - garbageSize) * crat)
    assert linearBases + circularBases + garbageSize == self.N
    if numLinear > 0:
        linSize = math.floor(linearBases / numLinear)
        # distribute the remainder: the first `extra` contigs get one more base
        extra = linearBases % numLinear
        added = 0
        for i in range(numLinear):
            size = linSize
            if i < extra:
                size += 1
            # plus 1 since number of adjacencies is 1 + number of bases
            contig = LinearContig(size + 1)
            self.pool.insert(contig, contig.numBases())
            added += contig.size
        assert added == linearBases + numLinear
        assert self.pool.size() == numLinear + numGarbage
        assert self.pool.weight() == linearBases + garbageSize
    if numCircular > 0:
        circSize = math.floor(circularBases / numCircular)
        extra = circularBases % numCircular
        added = 0
        for i in range(numCircular):
            size = circSize
            if i < extra:
                size += 1
            contig = CircularContig(size)
            self.pool.insert(contig, contig.numBases())
            added += contig.size
        assert added == circularBases
        assert self.pool.size() == numLinear + numCircular + numGarbage
        assert self.pool.weight() == circularBases + linearBases + \
            garbageSize
class TreeCache:
    """Caches skimmed trees for one sample.

    A cache entry is identified by a hash of (sample, minimal cut, chunking);
    skimmed trees are first written to a temporary folder and then moved to
    the final output folder.
    """

    def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None, fileLocator=None):
        """Set up the tree cache; see class docstring.

        `sample` may be a Sample object or an identifier string; `fileLocator`
        may be shared externally, otherwise a new FileLocator is created.
        """
        self.config = config
        self.fileLocator = fileLocator if fileLocator is not None else FileLocator(config=self.config)
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if isinstance(sample, Sample):
            # sample passed as Sample object
            # count number of chunks the cached data is split into
            defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if config.has_option('General', 'mergeCachingSize') else 100
            splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
            splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug, fileLocator=self.fileLocator).getNumberOfParts()
            # if sample passed as object, it can be a 'subsample' and have different name and identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
        else:
            # sample identifier passed as string
            self.sample = sample
            self.sampleIdentifier = sample
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS
        self.inputFolder = inputFolder
        self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
        self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        self.branchesForHash = None  # for now make hash independent of selected branches
        self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # identifier is just used as an arbitrary name for print-out
        cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False
        self.createFolders()

    # free memory
    def deleteSampleTree(self):
        self.sampleTree = None

    # file, where skimmed tree is written to
    def getTmpFileName(self):
        return self.outputFileNameFormat.format(
            outputFolder=self.tmpFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d'%self.splitFilesChunks
        )

    # file, where skimmed tree is moved to after it has been written completely
    def getOutputFileName(self):
        return self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d'%self.splitFilesChunks
        )

    # check existence of files with skimmed trees
    def findCachedFileNames(self, chunkNumber=-1):
        """Find existing cached files (all parts, or only `chunkNumber`).

        Fills and returns self.cachedFileNames, sorted by part number.
        """
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part='*' if chunkNumber < 1 else '%d'%chunkNumber,
            parts=self.splitFilesChunks
        )
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        # this does not work reliably on T3 worker nodes anymore
        #self.cachedFileNames = glob.glob(cachedFilesMask)
        # workaround: use xrootd for directory listing
        #self.cachedFileNames = self.fileLocator.glob_with_fallback(cachedFilesMaskRaw)
        # this solution uses a loop over all possible files and uses xrdfs stat instead of xrdfs ls
        self.cachedFileNames = self.fileLocator.get_numbered_file_list(cachedFilesMaskRaw, 1, self.splitFilesChunks)
        if self.debug:
            print ('DEBUG: search files:', cachedFilesMask)
            print ('\x1b[32mDEBUG: files:')
            for fileName in self.cachedFileNames:
                print (' > ', fileName)
            if len(self.cachedFileNames) < 1:
                print ('none!')
            print ('\x1b[0m(%d files found)'%len(self.cachedFileNames))
        # sort by the part number encoded in the file name ('..._<part>of<parts>...')
        self.cachedFileNames = sorted(self.cachedFileNames, key=lambda x: int(x.split('_')[-1].split('of')[0]) if 'of' in x and '_' in x else -1)
        return self.cachedFileNames

    def getTotalNumberOfOutputFiles(self):
        return self.splitFilesChunks

    # check if a single part is cached, (only checks existence of the file, not validity!)
    def partIsCached(self):
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber,
            parts=self.splitFilesChunks
        )
        # this does not work reliably on T3 worker nodes anymore
        #cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        #return len(glob.glob(cachedFilesMask)) > 0
        return self.fileLocator.remoteFileExists(cachedFilesMaskRaw)

    # isCached == all files containing the skimmed tree found!
    def isCached(self):
        self.findCachedFileNames()
        # either some parts are missing, or nothing was found at all
        if (len(self.cachedFileNames) != self.splitFilesChunks and self.splitFilesChunks > 1) or len(self.cachedFileNames) == 0:
            if self.debug:
                print ('\x1b[32mDEBUG: not cached:', self.identifier, '\x1b[0m')
            return False
        self.isCachedChecked = True
        return True

    # check if an existing file can be opened without errors by ROOT
    def checkFileValidity(self, rawFileName):
        """Open the file with ROOT and report False (and delete it) if broken."""
        xrootdFileName = self.fileLocator.getXrootdFileName(rawFileName)
        f = ROOT.TFile.Open(xrootdFileName, 'read')
        if not f or f.GetNkeys() == 0 or f.TestBit(ROOT.TFile.kRecovered) or f.IsZombie():
            print ('\x1b[31mWARNING: broken file:', rawFileName, ' => redo caching!\x1b[0m')
            if f:
                f.Close()
            self.deleteFile(rawFileName)
            return False
        if f:
            f.Close()
        return True

    # check if all cached files are valid
    def isCachedAndValid(self):
        valid = True
        if self.isCached():
            # check file integrity
            for fileName in self.cachedFileNames:
                valid = valid and self.checkFileValidity(fileName)
        else:
            valid = False
        return valid

    # set input sampleTree object
    def setSampleTree(self, sampleTree):
        self.sampleTree = sampleTree
        return self

    # this prepares the caching by telling the sampleTree object what to write during processing of the file
    # note: does not run the caching by itself! needs an additional sampleTree.process()
    def cache(self):
        if self.sampleTree:
            outputFileName = self.getTmpFileName()
            # after writing, the file is moved from tmp to the final location
            callbacks = {'afterWrite': self.moveFilesToFinalLocation}
            self.sampleTree.addOutputTree(outputFileName=outputFileName, cut=self.cutList, hash=self.hash, branches=self.branches, callbacks=callbacks, cutSequenceMode=self.cutSequenceMode, name=self.name)
            self.tmpFiles.append(outputFileName)
            if self.debug:
                print ('\x1b[32mDEBUG: output file for ', self.identifier, ' is ', outputFileName, '\x1b[0m')
        else:
            print ('\x1b[31mERROR: no sample tree connected!:', self.identifier, ' set the sampleTree first with "setSampleTree(sampleTree)" \x1b[0m')
        return self

    # return sample tree class of cached samples if all files found
    def getTree(self, chunkSize=-1, chunkNumber=-1):
        """Return a SampleTree built from the cached files, or the current
        self.sampleTree (possibly None) when not cached.

        Raises Exception("InvalidParameters") on an inconsistent chunk
        request and Exception("IncompleteTree") when files are unreadable.
        """
        # if it has already been checked if tree is cached, then use this result directly
        isCached = self.isCachedChecked
        if not isCached:
            isCached = self.isCached()
        if isCached:
            if chunkSize > 0 and chunkNumber > 0:
                fileNames = self.cachedFileNames[(chunkNumber-1)*chunkSize:chunkNumber*chunkSize]
            elif chunkSize < 0 and chunkNumber < 0:
                fileNames = self.cachedFileNames
            else:
                raise Exception("InvalidParameters")
            self.sampleTree = SampleTree(self.cachedFileNames, config=self.config, fileNamesToProcess=fileNames)
            self.sampleTree.sampleIdentifier = self.sampleIdentifier
            # check if even though all files exist, they couldn't be accessed for some reason
            # and therefore the tree would be incomplete
            if not self.sampleTree.isCompleteTree():
                raise Exception("IncompleteTree")
        return self.sampleTree

    # delete file
    def deleteFile(self, rawFileName):
        if self.debug:
            print ('DELETE:', rawFileName)
        self.fileLocator.rm(rawFileName)

    # delete cached files
    def deleteCachedFiles(self, chunkNumber=-1):
        cachedFileNames = self.findCachedFileNames(chunkNumber=chunkNumber)
        for fileName in cachedFileNames:
            if self.fileLocator.fileExists(fileName):
                self.deleteFile(fileName)

    # create folders
    def createFolders(self):
        """Create tmp and output folders if missing (locally or via gfal-mkdir)."""
        tmpfolderLocal = self.fileLocator.getLocalFileName(self.tmpFolder)
        if not os.path.isdir(tmpfolderLocal):
            print("DOES NOT EXIST:", tmpfolderLocal)
            # NOTE(review): bare except silently swallows all errors here,
            # including a failed makedirs -- deliberate best-effort, but worth
            # narrowing to OSError.
            try:
                xrootdFileName = self.fileLocator.getXrootdFileName(self.tmpFolder)
                if '://' not in xrootdFileName:
                    os.makedirs(self.tmpFolder)
                else:
                    command = 'gfal-mkdir %s' % (xrootdFileName)
                    returnCode = subprocess.call([command], shell=True)
                    if self.debug:
                        print(command, ' => ', returnCode)
                        print ()
            except:
                pass
        if not self.fileLocator.exists(self.outputFolder):
            print("INFO: output folder does not exist and will be created:", self.outputFolder)
            self.fileLocator.makedirs(self.outputFolder)

    # move files from temporary to final location
    def moveFilesToFinalLocation(self, raiseOnFailure=True):
        """Copy all tmp files to the output folder; retry each copy once.

        Returns True on full success. Raises CopyToFinalDestinationFailed
        when a copy fails twice and raiseOnFailure is set.
        """
        success = True

        # free some memory for file copy command
        if self.debug:
            print('DEBUG: max mem used A:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        self.deleteSampleTree()
        if self.debug:
            print('DEBUG: max mem used B:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        for tmpFileName in self.tmpFiles:
            # keep the path relative to the tmp folder when building the output path
            outputFileName = self.outputFolder + '/' + self.tmpFolder.join(tmpFileName.split(self.tmpFolder)[1:])
            print ('copy ', tmpFileName, ' to ', outputFileName)
            if self.fileLocator.fileExists(outputFileName):
                self.deleteFile(outputFileName)
            copySuccessful = self.fileLocator.cp(tmpFileName, outputFileName)
            if not copySuccessful:
                print("WARNING: first copy attempt failed! retry once!")
                self.fileLocator.debug = True
                copySuccessful = self.fileLocator.cp(tmpFileName, outputFileName)
                if not copySuccessful:
                    success = False
                    print('\x1b[31mERROR: copy failed for {tmpfile}->{outputfile} !\x1b[0m'.format(tmpfile=tmpFileName, outputfile=outputFileName))
                    if raiseOnFailure:
                        raise Exception("CopyToFinalDestinationFailed")
            if success:
                # delete temporary file if copy was successful
                self.deleteFile(tmpFileName)
        return success
#print(config.get('Weights','weightF'))
# FIX: `config` was only assigned in a commented-out line, so every use of it
# below raised a NameError -- restore the initialization.
config = XbbConfigReader.read('Zvv2017')

inputFile = 'root://t3dcachedb03.psi.ch:1094//pnfs/psi.ch/cms/trivcat/store/user/berger_p2/VHbb/VHbbPostNano2017/V5/Zvv/rerun/v4j/eval/ggZH_HToBB_ZToNuNu_M125_13TeV_powheg_pythia8/tree_aa5e971734ef4e885512748d534e6937ff03dc61feed21b6772ba943_000000_000000_0000_9_a6c5a52b56e5e0c7ad5aec31429c8926bf32cf39adbe087f05cfb323.root'
path = 'root://t3dcachedb03.psi.ch:1094//pnfs/psi.ch/cms/trivcat/store/user/berger_p2/VHbb/VHbbPostNano2017/V5/Zvv/rerun/v4j/eval/'
samplefiles = '../samples/VHbbPostNano2017_V5/merged_Zvv2017/'
samplesinfo = 'Zvv2017config/samples_nosplit.ini'

# look up the sample object by its identifier
info = ParseInfo(samples_path=path, config=config)
sample = [x for x in info if x.identifier == 'ggZH_HToBB_ZToNuNu_M125_13TeV_powheg_pythia8'][0]

# read sample
sampleTree = SampleTree([inputFile], config=config)

# initialize module
w = WeightAsBranch()
w.customInit({
    'sampleTree': sampleTree,
    'config': config,
    'sample': sample,
    'pathIN': path
})

#addAsBranch = True
addAsBranch = False

# FIX: was a Python-2-only print statement
print('w.getBranches()', w.getBranches())
def __init__(self):
    """Create an empty model: an empty contig pool and a fresh event queue."""
    # pool of contigs, sampled proportionally to their weight (number of bases)
    self.pool = SampleTree()
    self.eventQueue = EventQueue()
    # zero all event counters
    self.__resetCounts()
class Model(object):
    """Stochastic model of genome rearrangement via DCJ operations on a pool
    of linear/circular contigs, with an optional single dead "garbage" contig.
    """

    def __init__(self):
        # pool of contigs, sampled proportionally to their weight (bases)
        self.pool = SampleTree()
        self.eventQueue = EventQueue()
        # zero all event counters
        self.__resetCounts()

    ##################################################################
    # there are five kinds of rates:
    # N: (fixed) number of bases in the model
    # rll: rate for dcj on the bases in the contig pool
    # rld: rate for dcj where one break is in the pool
    #      and the other rate is in the garbage
    # rdd: both in garbage
    # fl: telomere loss modifier
    # fg: telomere gain modifier
    # pgain: dead gain probability
    ##################################################################
    def setParameters(self, N, rll, rld = 0, rdd = 0, fl = 0, fg = 0, pgain = 0):
        """Reset the event queue and register event types scaled by N."""
        self.eventQueue.reset()
        self.N = N
        self.fl = fl
        self.fg = fg
        self.pgain = pgain
        # only register event types with a positive rate
        if rll > 0:
            self.eventQueue.addEventType(N * rll, self.__llEvent)
        if rld > 0:
            self.eventQueue.addEventType(N * rld, self.__ldEvent)
        if rdd > 0:
            self.eventQueue.addEventType(N * rdd, self.__ddEvent)

    ##################################################################
    # initialize the starting state
    # the contigs will all have the same sizes (modulo rounding)
    # in order to satisfy the input parameters exactly
    ##################################################################
    def setStartingState(self, garbageSize, numLinear, numCircular):
        assert self.N > garbageSize + numLinear + numCircular
        self.pool = SampleTree()
        numGarbage = 0
        if garbageSize > 0:
            garbage = CircularContig(garbageSize)
            garbage.setDead()
            self.pool.insert(garbage, garbage.numBases())
            numGarbage = 1
        # split the non-garbage bases proportionally; floor one share and
        # ceil the other so they sum exactly to N - garbageSize
        lrat = float(numLinear) / (numLinear + numCircular)
        crat = float(numCircular) / (numLinear + numCircular)
        linearBases = math.floor((self.N - garbageSize) * lrat)
        circularBases = math.ceil((self.N - garbageSize) * crat)
        assert linearBases + circularBases + garbageSize == self.N
        if numLinear > 0:
            linSize = math.floor(linearBases / numLinear)
            # the first `extra` contigs get one more base to use the remainder
            extra = linearBases % numLinear
            added = 0
            for i in range(numLinear):
                size = linSize
                if i < extra:
                    size += 1
                # plus 1 since number of adjacencies is 1 + number of bases
                contig = LinearContig(size + 1)
                self.pool.insert(contig, contig.numBases())
                added += contig.size
            assert added == linearBases + numLinear
            assert self.pool.size() == numLinear + numGarbage
            assert self.pool.weight() == linearBases + garbageSize
        if numCircular > 0:
            circSize = math.floor(circularBases / numCircular)
            extra = circularBases % numCircular
            added = 0
            for i in range(numCircular):
                size = circSize
                if i < extra:
                    size += 1
                contig = CircularContig(size)
                self.pool.insert(contig, contig.numBases())
                added += contig.size
            assert added == circularBases
            assert self.pool.size() == numLinear + numCircular + numGarbage
            assert self.pool.weight() == circularBases + linearBases + \
                garbageSize

    ##################################################################
    # run the simulation for the specified time
    ##################################################################
    def simulate(self, time):
        self.eventQueue.begin()
        self.__resetCounts()
        # drain events until none occur before `time`
        while True:
            nextEvent = self.eventQueue.next(time)
            if nextEvent is not None:
                nextEvent()
            else:
                break

    ##################################################################
    # draw (and remove) two random adjacencies and their
    # contigs from the pool (only if they are not dead)
    ##################################################################
    def __drawSamples(self):
        sampleNode1, offset1 = self.pool.uniformSample()
        sampleNode2, offset2 = self.pool.uniformSample()
        # the offset is weighted based on the number of bases
        # we want to translate this into number of edges (splitting)
        # the probability between linear and telomere edges.
        # so for linear contigs with zero offset, we flip a coin to
        # move it to the other side.
        if sampleNode1.data.isLinear() and offset1 == 0:
            if random.random() < 0.5:
                offset1 = sampleNode1.data.numBases()
        if sampleNode2 is not sampleNode1 and sampleNode2.data.isLinear() and\
           offset2 == 0:
            if random.random() < 0.5:
                offset2 = sampleNode2.data.numBases()
        assert offset1 < sampleNode1.data.size
        assert offset2 < sampleNode2.data.size
        return (sampleNode1, offset1, sampleNode2, offset2)

    ##################################################################
    # LIVE-LIVE event. Is normal DCJ operation between two live contigs
    # unless the two breakpoints are identical or on telomeres, in which
    # case fl and fg parameters are used to use fission operations to
    # modify the number of telomeres
    ##################################################################
    def __llEvent(self):
        # nothing to do on an (effectively) empty pool
        if self.pool.size() == 0 or self.pool.weight() == 1:
            return
        # draw (and remove) two random adjacencies and their
        # contigs from the pool (only if they are not dead)
        sampleNode1, offset1, sampleNode2, offset2 = self.__drawSamples()
        c1 = sampleNode1.data
        c2 = sampleNode2.data
        # don't deal with dead contigs in this event
        if c1.isDead() == True or c2.isDead() == True:
            return
        self.pool.remove(sampleNode1)
        if c1 is not c2:
            self.pool.remove(sampleNode2)
        # case 1) gain of telomere
        if sampleNode1 is sampleNode2 and offset1 == offset2:
            return self.__llGain(c1, c2, offset1, offset2)
        # case 2) loss of telomere
        elif c1.isLinear() and c2.isLinear() and \
             (offset1 == 0 or offset1 == c1.size - 1) and \
             (offset2 == 0 or offset2 == c2.size - 1):
            return self.__llLoss(c1, c2, offset1, offset2)
        # case 3) no gain or loss
        self.llCount += 1
        forward = random.randint(0, 1) == 1
        # do the dcj
        dcjResult = dcj(c1, offset1, c2, offset2, forward)
        # add the resulting contigs back to the pool
        for res in dcjResult:
            self.pool.insert(res, res.numBases())

    ##################################################################
    # Do the fission telomere gain operation (if fg check passes)
    ##################################################################
    def __llGain(self, c1, c2, offset1, offset2):
        # correct "not composite check below"
        if c1.isCircular() or (offset1 != 0 and offset1 != c1.size - 1):
            forward = self.fg > random.random()
            if forward:
                self.fgCount += 1
                dcjResult = dcj(c1, offset1, c2, offset2, forward)
                if c1.isCircular():
                    assert len(dcjResult) == 1 and dcjResult[0].isLinear()
                else:
                    assert len(dcjResult) == 2 and dcjResult[0].isLinear() \
                        and dcjResult[1].isLinear()
                # add the resulting contigs back to the pool
                for res in dcjResult:
                    self.pool.insert(res, res.numBases())
                return
        # fg check failed (or not applicable): put the contigs back unchanged
        self.pool.insert(c1, c1.numBases())
        if c2 is not c1:
            self.pool.insert(c2, c2.numBases())

    ##################################################################
    # Do the fission telomere loss operation (if fl check passes)
    ##################################################################
    def __llLoss(self, c1, c2, offset1, offset2):
        # same contig on both ends halves the probability once more
        if c1 is c2:
            forward = self.fl / 4.0 > random.random()
        else:
            forward = self.fl / 2.0 > random.random()
        if forward:
            c1 = c1.circularize()
            if c1 is not c2:
                c2 = c2.circularize()
            dcjResult = dcj(c1, offset1, c2, offset2, forward)
            self.flCount += 1
            assert len(dcjResult) == 1
            if c1 is not c2:
                assert dcjResult[0].isLinear()
            else:
                assert dcjResult[0].isCircular()
            # add the resulting contigs back to the pool
            for res in dcjResult:
                self.pool.insert(res, res.numBases())
        else:
            # fl check failed: put the contigs back unchanged
            self.pool.insert(c1, c1.numBases())
            if c2 is not c1:
                self.pool.insert(c2, c2.numBases())

    ##################################################################
    # LIVE-DEAD (or DEAD-LIVE) event. One contig is alive and the
    # other is the unique dead contig. This can result in a loss of
    # live contigs and/or change in number of live bases
    ##################################################################
    def __ldEvent(self):
        if self.pool.size() == 0 or self.pool.weight() == 1:
            return
        # draw (and remove) two random adjacencies and their
        # contigs from the pool (only if they are not dead)
        sampleNode1, offset1, sampleNode2, offset2 = self.__drawSamples()
        c1 = sampleNode1.data
        c2 = sampleNode2.data
        # only deal with live / dead contigs in this event
        if (c1.isDead() == c2.isDead()):
            return
        self.pool.remove(sampleNode1)
        if c1 is not c2:
            self.pool.remove(sampleNode2)
        # make sure c1 is alive and c2 is dead
        if c1.isDead():
            c1, c2 = c2, c1
            offset1, offset2 = offset2, offset1
        # do the dcj
        dcjResult = dcj(c1, offset1, c2, offset2, random.randint(0, 1) == 1)
        # pick which result stays dead, weighted by size
        deadIdx = 0;
        if len(dcjResult) == 2 and \
           random.randint(0, dcjResult[0].size + dcjResult[1].size) >= \
           dcjResult[0].size:
            deadIdx = 1
        dcjResult[deadIdx].setDead(True)
        if len(dcjResult) == 1:
            self.ldLossCount += 1
        else:
            self.ldSwapCount += 1
        # add the resulting contigs back to the pool
        deadCount = 0
        for res in dcjResult:
            if res.isDead():
                deadCount += 1
            self.pool.insert(res, res.numBases())
        # invariant: exactly one dead contig survives the event
        assert deadCount == 1

    ##################################################################
    # DEAD-DEAD event. The dead contig rearranges with itself. pgain
    # is used to decide how often this operation breaks off a new circular
    # live chromosome
    ##################################################################
    def __ddEvent(self):
        if self.pool.size() == 0 or self.pool.weight() == 1:
            return
        sampleNode1, offset1, sampleNode2, offset2 = self.__drawSamples()
        c1 = sampleNode1.data
        c2 = sampleNode2.data
        # only deal with dead / dead contigs in this event
        if (c1.isDead() == False or c2.isDead() == False):
            return
        # only support single dead contig
        assert c1 is c2
        # don't know what to do here
        if (offset1 == offset2):
            return
        self.pool.remove(sampleNode1)
        if c1 is not c2:
            self.pool.remove(sampleNode2)
        #forward means do not cut
        forward = random.random() > self.pgain
        # do the dcj
        dcjResult = dcj(c1, offset1, c2, offset2, forward)
        # pick which result stays dead, weighted by size
        deadIdx = 0;
        if len(dcjResult) == 2 and \
           random.randint(0, dcjResult[0].size + dcjResult[1].size) \
           >= dcjResult[0].size:
            deadIdx = 1
        dcjResult[deadIdx].setDead(True)
        if forward:
            self.ddSwapCount += 1
            assert len(dcjResult) == 1
        else:
            self.ddGainCount += 1
            assert len(dcjResult) == 2
            assert not dcjResult[0].isDead() or not dcjResult[1].isDead()
        # add the resulting contigs back to the pool
        for res in dcjResult:
            self.pool.insert(res, res.numBases())

    ##################################################################
    # all counters set to zero.
    ##################################################################
    def __resetCounts(self):
        self.llCount = 0
        self.fgCount = 0
        self.flCount = 0
        self.ldLossCount = 0
        self.ldSwapCount = 0
        self.ddGainCount = 0
        self.ddSwapCount = 0
if __name__ == '__main__':
    # minimal smoke-test driver: load one WplusH MC file and initialize the
    # jet smearer module on it
    config = XbbConfigReader.read('Wlv2017')
    info = ParseInfo(config=config)
    # pick the sample object matching this identifier
    sample = [x for x in info if x.identifier == 'WplusH_HToBB_WToLNu_M125_13TeV_powheg_pythia8'][0]

    # read sample
    sampleTree = SampleTree([
        '/store/group/phys_higgs/hbb/ntuples/VHbbPostNano/2017/V11/WplusH_HToBB_WToLNu_M125_13TeV_powheg_pythia8/adewit-crab_nano2017_WplusH_HT81/190606_065851/0000/tree_1.root'
    ], treeName='Events', xrootdRedirector="root://eoscms.cern.ch/")

    # initialize module
    w = JetSmearer("2017")
    w.customInit({
        'sampleTree': sampleTree,
        'sample': sample,
        'config': config
    })

    n = 0
    # event loop kept disabled; enable to process the first 3 events
    #for event in sampleTree:
    #    w.processEvent(event)
    #    n=n+1
    #    if n==3: break
class TreeCache:
    """Cache for skimmed ROOT trees of one sample.

    Applies a cut to a sample's trees and writes the skimmed result to a
    temporary folder, then moves it to a final output folder. Cached files
    are addressed by a hash of (sample, cut, chunking), so a later run with
    the same parameters can find and reuse them via isCached()/getTree().
    """

    def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None):
        """Set up paths, cut and hash for one (sample, cut, chunk) combination.

        sample may be a Sample object (then name/identifier and chunking are
        taken from it) or a plain sample identifier string.
        chunkNumber selects which part of the split file list this instance
        handles (-1 = unsplit).
        """
        self.config = config
        self.fileLocator = FileLocator(config=self.config)
        # debug output can also be forced through the XBBDEBUG env variable
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if isinstance(sample, Sample):
            # sample passed as Sample object
            # count number of chunks the cached data is split into
            defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if config.has_option('General', 'mergeCachingSize') else 100
            splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
            splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug).getNumberOfParts()
            # if sample passed as object, it can be a 'subsample' and have different name and identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
        else:
            # sample identifier passed as string
            self.sample = sample
            self.sampleIdentifier = sample
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        # minimal cut used for hashing and as print-out identifier
        self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS
        self.inputFolder = inputFolder
        self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
        self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        self.branchesForHash = None     # for now make hash independent of selected branches
        self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # identifier is just used as an arbitrary name for print-out
        cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False
        self.createFolders()

    # free memory
    def deleteSampleTree(self):
        """Drop the reference to the input sample tree to free memory."""
        self.sampleTree = None

    # file, where skimmed tree is written to
    def getTmpFileName(self):
        """Return the temporary-folder path the skimmed tree is written to."""
        return self.outputFileNameFormat.format(
                outputFolder=self.tmpFolder,
                hash=self.hash,
                part=self.chunkNumber if self.chunkNumber > 0 else 1,
                parts='%d'%self.splitFilesChunks
            )

    # file, where skimmed tree is moved to after it has been written completely
    def getOutputFileName(self):
        """Return the final output path of the skimmed tree for this chunk."""
        return self.outputFileNameFormat.format(
                outputFolder=self.outputFolder,
                hash=self.hash,
                part=self.chunkNumber if self.chunkNumber > 0 else 1,
                parts='%d'%self.splitFilesChunks
            )

    # check existence of files with skimmed trees
    def findCachedFileNames(self, chunkNumber=-1):
        """Glob for cached output files (all chunks, or one specific chunk).

        Fills and returns self.cachedFileNames, sorted by part number.
        """
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
                outputFolder=self.outputFolder,
                hash=self.hash,
                part='*' if chunkNumber < 1 else '%d'%chunkNumber,
                parts=self.splitFilesChunks
            )
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        self.cachedFileNames = glob.glob(cachedFilesMask)
        if self.debug:
            print ('DEBUG: search files:', cachedFilesMask)
            print ('\x1b[32mDEBUG: files:')
            for fileName in self.cachedFileNames:
                print (' > ', fileName)
            if len(self.cachedFileNames) < 1:
                print ('none!')
            print ('\x1b[0m(%d files found)'%len(self.cachedFileNames))
        # sort by the part number encoded in '..._<part>of<parts>.root'
        self.cachedFileNames = sorted(self.cachedFileNames, key=lambda x: int(x.split('_')[-1].split('of')[0]) if 'of' in x and '_' in x else -1)
        return self.cachedFileNames

    # check if a single part is cached, (only checks existence of the file, not validity!)
    def partIsCached(self):
        """Return True if the output file of this instance's chunk exists."""
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
                outputFolder=self.outputFolder,
                hash=self.hash,
                part=self.chunkNumber,
                parts=self.splitFilesChunks
            )
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        return len(glob.glob(cachedFilesMask)) > 0

    # isCached == all files containing the skimmed tree found!
    def isCached(self):
        """Return True if ALL expected cached files exist (existence only)."""
        self.findCachedFileNames()
        if (len(self.cachedFileNames) != self.splitFilesChunks and self.splitFilesChunks > 1) or len(self.cachedFileNames) == 0:
            if self.debug:
                print ('\x1b[32mDEBUG: not cached:', self.identifier, '\x1b[0m')
            return False
        self.isCachedChecked = True
        return True

    # check if an existing file can be opened without errors by ROOT
    def checkFileValidity(self, rawFileName):
        """Open a cached file with ROOT; delete it and return False if broken."""
        xrootdFileName = self.fileLocator.getXrootdFileName(rawFileName)
        f = ROOT.TFile.Open(xrootdFileName, 'read')
        # broken = unreadable, empty, recovered after a crash, or zombie
        if not f or f.GetNkeys() == 0 or f.TestBit(ROOT.TFile.kRecovered) or f.IsZombie():
            print ('\x1b[31mWARNING: broken file:', rawFileName, ' => redo caching!\x1b[0m')
            if f:
                f.Close()
            self.deleteFile(rawFileName)
            return False
        if f:
            f.Close()
        return True

    # check if all cached files are valid
    def isCachedAndValid(self):
        """Return True if all cached files exist AND pass the ROOT validity check."""
        valid = True
        if self.isCached():
            # check file integrity
            for fileName in self.cachedFileNames:
                valid = valid and self.checkFileValidity(fileName)
        else:
            valid = False
        return valid

    # set input sampleTree object
    def setSampleTree(self, sampleTree):
        """Attach the input SampleTree to skim from; returns self (fluent)."""
        self.sampleTree = sampleTree
        return self

    # this prepares the caching by telling the sampleTree object what to write during processing of the file
    # note: does not run the caching by itself! needs an additional sampleTree.process()
    def cache(self):
        """Register the skim output tree on the attached sampleTree.

        The actual skimming happens later in sampleTree.process(); after
        writing, the 'afterWrite' callback moves files to the final location.
        Returns self (fluent).
        """
        if self.sampleTree:
            outputFileName = self.getTmpFileName()
            callbacks = {'afterWrite': self.moveFilesToFinalLocation}
            self.sampleTree.addOutputTree(outputFileName=outputFileName, cut=self.cutList, hash=self.hash, branches=self.branches, callbacks=callbacks, cutSequenceMode=self.cutSequenceMode, name=self.name)
            self.tmpFiles.append(outputFileName)
            if self.debug:
                print ('\x1b[32mDEBUG: output file for ', self.identifier, ' is ', outputFileName, '\x1b[0m')
        else:
            print ('\x1b[31mERROR: no sample tree connected!:', self.identifier, ' set the sampleTree first with "setSampleTree(sampleTree)" \x1b[0m')
        return self

    # return sample tree class of cached samples if all files found
    def getTree(self, chunkSize=-1, chunkNumber=-1):
        """Return a SampleTree over the cached files (optionally one chunk).

        Raises Exception("InvalidParameters") on inconsistent chunk arguments
        and Exception("IncompleteTree") if some cached files are inaccessible.
        """
        # if it has already been checked if tree is cached, then use this result directly
        isCached = self.isCachedChecked
        if not isCached:
            isCached = self.isCached()
        if isCached:
            if chunkSize > 0 and chunkNumber > 0:
                fileNames = self.cachedFileNames[(chunkNumber-1)*chunkSize:chunkNumber*chunkSize]
            elif chunkSize < 0 and chunkNumber < 0:
                fileNames = self.cachedFileNames
            else:
                raise Exception("InvalidParameters")
            self.sampleTree = SampleTree(self.cachedFileNames, config=self.config, fileNamesToProcess=fileNames)
            self.sampleTree.sampleIdentifier = self.sampleIdentifier
            # check if even though all files exist, they couldn't be accessed for some reason
            # and therefore the tree would be incomplete
            if not self.sampleTree.isCompleteTree():
                raise Exception("IncompleteTree")
        return self.sampleTree

    # delete file
    def deleteFile(self, rawFileName):
        """Remove a single file through the file locator."""
        if self.debug:
            print ('DELETE:', rawFileName)
        self.fileLocator.rm(rawFileName)

    # delete cached files
    def deleteCachedFiles(self, chunkNumber=-1):
        """Delete all (or one chunk's) existing cached output files."""
        cachedFileNames = self.findCachedFileNames(chunkNumber=chunkNumber)
        for fileName in cachedFileNames:
            if self.fileLocator.fileExists(fileName):
                self.deleteFile(fileName)

    # create folders
    def createFolders(self):
        """Ensure the temporary and output folders exist, creating them if not.

        Local paths are created with os.makedirs, remote (xrootd) paths via
        gfal-mkdir; creation failures of the tmp folder are ignored here.
        """
        tmpfolderLocal = self.fileLocator.getLocalFileName(self.tmpFolder)
        if not os.path.isdir(tmpfolderLocal):
            print("DOES NOT EXIST:", tmpfolderLocal)
            try:
                xrootdFileName = self.fileLocator.getXrootdFileName(self.tmpFolder)
                if '://' not in xrootdFileName:
                    os.makedirs(self.tmpFolder)
                else:
                    command = 'gfal-mkdir %s' % (xrootdFileName)
                    returnCode = subprocess.call([command], shell=True)
                    if self.debug:
                        print(command, ' => ', returnCode)
                        print ()
            # NOTE(review): bare except deliberately swallows any mkdir
            # failure (folder may already exist or be created concurrently)
            except:
                pass
        if not self.fileLocator.exists(self.outputFolder):
            print("INFO: output folder does not exist and will be created:", self.outputFolder)
            self.fileLocator.makedirs(self.outputFolder)

    # move files from temporary to final location
    def moveFilesToFinalLocation(self):
        """Copy all written tmp files to the output folder.

        Frees the sample tree first to reduce memory before the copy.
        Temporary files are deleted only after a successful copy.
        Returns True only if every copy succeeded.
        """
        success = True
        # free some memory for file copy command
        if self.debug:
            print('DEBUG: max mem used A:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        self.deleteSampleTree()
        if self.debug:
            print('DEBUG: max mem used B:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        for tmpFileName in self.tmpFiles:
            # rebuild the destination path by swapping tmpFolder -> outputFolder
            outputFileName = self.outputFolder + '/' + self.tmpFolder.join(tmpFileName.split(self.tmpFolder)[1:])
            print ('copy ', tmpFileName, ' to ', outputFileName)
            if self.fileLocator.fileExists(outputFileName):
                self.deleteFile(outputFileName)
            copySuccessful = self.fileLocator.cp(tmpFileName, outputFileName)
            if not copySuccessful:
                success = False
                print('\x1b[31mERROR: copy failed for {tmpfile}->{outputfile} !\x1b[0m'.format(tmpfile=tmpFileName, outputfile=outputFileName))
            else:
                # delete temporary file if copy was successful
                self.deleteFile(tmpFileName)
        return success
# print("Processed {0} events in {1:.2f} seconds, {2:.2f} ev/s".format(self.nEvent, tot_time, self.nEvent/tot_time)) if __name__ == '__main__': config = XbbConfigReader.read('Zvv2018') info = ParseInfo(config=config) sample = [ x for x in info if x.identifier == 'ZH_HToBB_ZToNuNu_M125_13TeV_powheg_pythia8' ][0] #sampleTree = SampleTree(['/store/group/phys_higgs/hbb/ntuples/VHbbPostNano/2018/V12/ZH_HToBB_ZToNuNu_M125_13TeV_powheg_pythia8/RunIIAutumn18NanoAODv6-Nano25O133/200221_205457/0000/tree_1.root'], treeName='Events', xrootdRedirector="root://eoscms.cern.ch/") sampleTree = SampleTree([ '/store/group/phys_higgs/hbb/ntuples/VHbbPostNano/2018/V13/ZH_HToBB_ZToNuNu_M125_13TeV_powheg_pythia8/RunIIAutumn18NanoAODv7-Nano02A85/200519_095652/0000/tree_1.root' ], treeName='Events', xrootdRedirector="root://eoscms.cern.ch/") w = JECcorrelator("2018") w.customInit({ 'sampleTree': sampleTree, 'sample': sample, 'config': config }) sampleTree.addOutputBranches(w.getBranches()) histograms = {} for jec in w.JEC_reduced: histograms[jec] = {} for var in [ "Jet_pt", "Jet_mass", "MET_pt", "MET_phi", "FatJet_pt",