Exemplo n.º 1
0
    def create(self, projectRoot):
        """Create the code tree for the project rooted at ``projectRoot``.

        Stores ``projectRoot``, runs the root/name/tree-root setup helpers,
        asks ``FileLocator`` for every file matching the C/C++/Java extension
        pattern and reports each of them to ``_onFileDetection``.

        projectRoot -- root directory of the project to scan
        """
        self.projectRoot = projectRoot
        self._normalizeProjectRoot()
        self._setProjectNameFromRoot()
        self._setCodeTreeRoot()

        # raw string: in a plain literal "\." is an invalid escape sequence
        f = FileLocator(self.projectRoot, r"(.*)\.(c|cpp|cc|h|hpp|java)$")

        projectFiles = f.findFiles()
        for absoluteFilename in projectFiles:
            filename = self._getRelativeFilename(absoluteFilename)
            # location of this source file inside the code tree
            dirForSourceFile = self.codeTreeRoot + filename
            self._onFileDetection(filename, dirForSourceFile, absoluteFilename)

        print("Tree saved at " + self.projectName)
Exemplo n.º 2
0
 def create(self, projectRoot):
     """Create the code tree for the project rooted at ``projectRoot``.

     Stores ``projectRoot``, runs the root/name/tree-root setup helpers,
     asks ``FileLocator`` for every file matching the C/C++/Java extension
     pattern and reports each of them to ``_onFileDetection``.

     projectRoot -- root directory of the project to scan
     """
     self.projectRoot = projectRoot
     self._normalizeProjectRoot()
     self._setProjectNameFromRoot()
     self._setCodeTreeRoot()

     # raw string: in a plain literal '\.' is an invalid escape sequence
     f = FileLocator(self.projectRoot, r'(.*)\.(c|cpp|cc|h|hpp|java)$')

     projectFiles = f.findFiles()
     for absoluteFilename in projectFiles:
         filename = self._getRelativeFilename(absoluteFilename)
         # location of this source file inside the code tree
         dirForSourceFile = self.codeTreeRoot + filename
         self._onFileDetection(filename, dirForSourceFile, absoluteFilename)

     print('Tree saved at ' + self.projectName)
Exemplo n.º 3
0
def file(source, line=-1, column=-1):
    """Return a locator object for ``source``.

    When both ``line`` and ``column`` keep their default of -1, a
    ``SimpleFileLocator`` is built from ``source`` alone; otherwise a
    ``FileLocator`` is built from ``source``, ``line`` and ``column``.
    The locator module is imported only in the branch that needs it.
    """
    positionGiven = not (line == -1 and column == -1)
    if positionGiven:
        from FileLocator import FileLocator
        return FileLocator(source, line, column)

    from SimpleFileLocator import SimpleFileLocator
    return SimpleFileLocator(source)
Exemplo n.º 4
0
    def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None):
        """Initialise sample, cut, folder and chunk bookkeeping, then call createFolders().

        sample -- Sample object or sample identifier string
        cutList -- cut(s) handed to SampleTree.findMinimumCut
        branches -- stored in self.branches (not part of the hash)
        inputFolder -- folder of the input sample
        tmpFolder -- defaults to config [Directories] scratch, or 'tmp/' without config
        outputFolder -- defaults to config [Directories] tmpSamples, or 'cache/' without config
        chunkNumber -- stored in self.chunkNumber
        splitFilesChunks -- number of chunks; recomputed when a Sample object is passed
        splitFilesChunkSize -- files per chunk; recomputed when a Sample object is passed
        debug -- debug output; also enabled by the XBBDEBUG environment variable
        fileList -- accepted but not used by this constructor
        cutSequenceMode -- handed to SampleTree.findMinimumCut
        name -- stored in self.name
        config -- configuration object, may be None
        """
        self.config = config
        self.fileLocator = FileLocator(config=self.config)
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if isinstance(sample, Sample):
            # sample passed as Sample object
            # count number of chunks the cached data is split into
            # config is optional (see the folder defaults below), so it is only queried when given
            hasChunkSizeOption = config is not None and config.has_option('General', 'mergeCachingSize')
            defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if hasChunkSizeOption else 100
            splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
            splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug).getNumberOfParts()
            # if sample passed as object, it can be a 'subsample' and have different name and identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
        else:
            # sample identifier passed as string
            self.sample = sample
            self.sampleIdentifier = sample
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS
        self.inputFolder = inputFolder
        self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
        self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        self.branchesForHash = None     # for now make hash independent of selected branches
        self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # identifier is just used as an arbitrary name for print-out
        cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False

        self.createFolders()
Exemplo n.º 5
0
    def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None, fileLocator=None):
        """Initialise sample, cut, folder and chunk bookkeeping, then call createFolders().

        sample -- Sample object or sample identifier string
        cutList -- cut(s) handed to SampleTree.findMinimumCut
        branches -- stored in self.branches (not part of the hash)
        inputFolder -- folder of the input sample
        tmpFolder -- defaults to config [Directories] scratch, or 'tmp/' without config
        outputFolder -- defaults to config [Directories] tmpSamples, or 'cache/' without config
        chunkNumber -- stored in self.chunkNumber
        splitFilesChunks -- number of chunks; recomputed when a Sample object is passed
        splitFilesChunkSize -- files per chunk; recomputed when a Sample object is passed
        debug -- debug output; also enabled by the XBBDEBUG environment variable
        fileList -- accepted but not used by this constructor
        cutSequenceMode -- handed to SampleTree.findMinimumCut
        name -- stored in self.name
        config -- configuration object, may be None
        fileLocator -- locator to reuse; a new FileLocator is created when None
        """
        self.config = config
        self.fileLocator = fileLocator if fileLocator is not None else FileLocator(config=self.config)
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if isinstance(sample, Sample):
            # sample passed as Sample object
            # count number of chunks the cached data is split into
            # config is optional (see the folder defaults below), so it is only queried when given
            hasChunkSizeOption = config is not None and config.has_option('General', 'mergeCachingSize')
            defaultChunkSize = int(config.get('General', 'mergeCachingSize')) if hasChunkSizeOption else 100
            splitFilesChunkSize = sample.mergeCachingSize if sample.mergeCachingSize > 0 else defaultChunkSize
            splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug, fileLocator=self.fileLocator).getNumberOfParts()
            # if sample passed as object, it can be a 'subsample' and have different name and identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
        else:
            # sample identifier passed as string
            self.sample = sample
            self.sampleIdentifier = sample
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS
        self.inputFolder = inputFolder
        self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
        self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        self.branchesForHash = None     # for now make hash independent of selected branches
        self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # identifier is just used as an arbitrary name for print-out
        cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False

        self.createFolders()
Exemplo n.º 6
0
class SampleTree(object):

    def __init__(self, samples, treeName=None, limitFiles=-1, splitFilesChunkSize=-1, chunkNumber=1, countOnly=False, verbose=True, config=None, saveMemory=False, xrootdRedirector=None):
        """Collect the .root files of a sample and, unless countOnly, chain them.

        samples -- list of file names, dict with 'folder' and 'name' (or 'sample'),
                   or name of a text file listing the files (see getAllSampleFileNames)
        treeName -- name of the tree; falls back to config [Configuration] treeName, then 'tree'
        limitFiles -- if > 0, stop chaining once this many files were added
        splitFilesChunkSize -- if > 0, number of files per chunk
        chunkNumber -- 1-based number of the chunk to read
        countOnly -- only determine the file list / number of parts, do not open files
        verbose -- print INFO messages
        config -- configuration object, may be None
        saveMemory -- stored in self.saveMemory
        xrootdRedirector -- handed to FileLocator

        Raises Exception("InvalidChunkNumber") / Exception("InvalidNumberOfSplitParts")
        for an inconsistent chunk selection, Exception("CountHistogramMissing") when an
        already registered count histogram is empty, and
        Exception("TChain method Add failure") when a file cannot be chained.
        """
        self.verbose = verbose
        self.debug = 'XBBDEBUG' in os.environ
        self.debugProfiling = 'XBBPROFILING' in os.environ
        self.config = config
        self.saveMemory = saveMemory
        self.outputTreeBasketSize = None
        if self.config and self.config.has_option('Configuration', 'outputTreeBasketSize'):
            # NOTE(review): eval() on a configuration value - only acceptable if the config is trusted
            self.outputTreeBasketSize = eval(self.config.get('Configuration', 'outputTreeBasketSize'))
        self.monitorPerformance = True
        self.disableBranchesInOutput = True
        self.samples = samples
        self.tree = None
        self.fileLocator = FileLocator(config=self.config, xrootdRedirector=xrootdRedirector)
        self.sampleIdentifier = None

        # process only partial sample root file list
        self.splitFilesChunkSize = splitFilesChunkSize
        self.chunkNumber = chunkNumber

        # get list of sample root files to process
        # (getSampleFileNameChunks also sets self.numParts)
        sampleFileNamesParts = self.getSampleFileNameChunks()
        if self.chunkNumber > 0 and self.chunkNumber <= self.numParts:
            if len(sampleFileNamesParts) == self.numParts:
                # chunk numbers are 1-based
                chunkIndex = self.chunkNumber - 1
                self.sampleFileNames = sampleFileNamesParts[chunkIndex]
            else:
                raise Exception("InvalidNumberOfSplitParts")
        else:
            print("\x1b[31mERROR: wrong chunk number ", self.chunkNumber, "\x1b[0m")
            raise Exception("InvalidChunkNumber")
        if self.verbose:
            print ("INFO: reading part ", self.chunkNumber, " of ", self.numParts)

        self.status = 0
        if not treeName:
            if self.config and self.config.has_option('Configuration', 'treeName'):
                self.treeName = self.config.get('Configuration', 'treeName')
            else:
                # HEPPY default
                self.treeName = 'tree'
        else:
            self.treeName = treeName
        self.formulas = {}
        self.formulaDefinitions = []
        self.oldTreeNum = -1
        self.limitFiles = int(limitFiles)
        self.timeStart = time.time()
        self.timeETA = 0
        self.eventsRead = 0
        self.outputTrees = []
        self.callbacks = {}
        self.removeBranches = []

        # e.g. for additional branches to be added
        self.newBranches = []

        # check existence of sample .txt file which contains list of .root files
        self.sampleTextFileName = ''

        # add all .root files to chain and add count histograms
        self.chainedFiles = []
        self.brokenFiles = []
        self.histograms = {}
        # per-branch list of values read from the 'Runs' trees, and their sums
        self.nanoTreeCounts = {}
        self.totalNanoTreeCounts = {}

        if not countOnly:
            self.tree = ROOT.TChain(self.treeName)

            # loop over all given .root files
            for rootFileName in self.sampleFileNames:
                if self.debug:
                    print('DEBUG: next file is:', rootFileName, ", check existence")

                # check root file existence
                if self.fileLocator.exists(rootFileName, attempts=5):
                    remoteRootFileName = self.fileLocator.getRemoteFileName(rootFileName)
                    input = ROOT.TFile.Open(remoteRootFileName, 'read')

                    # check file validity
                    if input and not input.IsZombie() and input.GetNkeys() > 0 and not input.TestBit(ROOT.TFile.kRecovered):
                        if self.debug:
                            print('DEBUG: file exists and is good!')

                        # add count histograms, since they are not in the TChain
                        for key in input.GetListOfKeys():
                            obj = key.ReadObj()
                            # the event tree itself is chained below, not stored as a histogram
                            if obj.GetName() == self.treeName:
                                continue
                            histogramName = obj.GetName()

                            # nanoAOD: use branch of a tree instead of histogram for counting
                            if histogramName == 'Runs':
                                branchList = [x.GetName() for x in obj.GetListOfBranches()]
                                if self.debug:
                                    print ("DEBUG: nano counting tree has the following BRANCHES:", branchList)
                                for branch in branchList:
                                    if branch not in self.nanoTreeCounts:
                                        self.nanoTreeCounts[branch] = []
                                nEntries = obj.GetEntries()
                                for i in range(nEntries):
                                    obj.GetEntry(i)
                                    for branch in branchList:
                                        self.nanoTreeCounts[branch].append(getattr(obj, branch))

                            if histogramName in self.histograms:
                                if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                                    if self.debug:
                                        print("DEBUG: object is a tree and will be skipped:", obj.GetName())
                                else:
                                    if self.histograms[histogramName]:
                                        self.histograms[histogramName].Add(obj)
                                    else:
                                        print ("ERROR: histogram object was None!!!")
                                        raise Exception("CountHistogramMissing")
                            else:
                                # add all TH*'s in one single histogram
                                if obj.IsA().InheritsFrom(ROOT.TH1.Class()):
                                    self.histograms[histogramName] = obj.Clone(obj.GetName())
                                    # SetDirectory(0) is called before the file is closed below
                                    self.histograms[histogramName].SetDirectory(0)
                                else:
                                    if self.debug:
                                        print("DEBUG: omitting object ", obj, type(obj), " since it is neither TH1 or TTree!")

                        input.Close()

                        # add file to chain
                        chainTree = '%s/%s'%(remoteRootFileName.strip(), self.treeName.strip())
                        if self.debug:
                            print ('\x1b[42mDEBUG: chaining '+chainTree,'\x1b[0m')
                        statusCode = self.tree.Add(chainTree)
                        if self.debug:
                            print ('\x1b[42mDEBUG: ---> %r'%statusCode,'\x1b[0m')

                        # check for errors in chaining the file
                        if statusCode != 1:
                            print ('ERROR: failed to chain ' + chainTree + ', returned: ' + str(statusCode), 'tree:', self.tree)
                            raise Exception("TChain method Add failure")
                        elif not self.tree:
                            print ('\x1b[31mERROR: tree died after adding %s.\x1b[0m'%rootFileName)
                        else:
                            # NOTE(review): self.treeEmpty is only ever assigned here, i.e. after a successful Add
                            self.treeEmpty = False
                            self.chainedFiles.append(rootFileName)
                            if self.limitFiles > 0 and len(self.chainedFiles) >= self.limitFiles:
                                print ('\x1b[35mDEBUG: limit reached! no more files will be chained!!!\x1b[0m')
                                break
                    else:
                        print ('\x1b[31mERROR: file is damaged: %s\x1b[0m'%rootFileName)
                        if input:
                            print ('DEBUG: Zombie:', input.IsZombie(), '#keys:', input.GetNkeys(), 'recovered:', input.TestBit(ROOT.TFile.kRecovered))
                        self.brokenFiles.append(rootFileName)
                else:
                    print ('\x1b[31mERROR: file is missing: %s\x1b[0m'%rootFileName)

            if self.verbose or self.debug:
                print ('INFO: # files chained: %d'%len(self.chainedFiles))
                if len(self.brokenFiles) > 0:
                    print ('INFO: # files broken : %d'%len(self.brokenFiles))

            # without any chained file there is no usable tree
            if len(self.chainedFiles) < 1:
                self.tree = None

            if self.tree:
                self.tree.SetCacheSize(50*1024*1024)

            # merge nano counting trees
            # NOTE(review): `iteritems` and `long` below exist only in Python 2
            if self.nanoTreeCounts:
                # TODO: per run if possible, sum LHE weights if present

                # sum the contributions from the subtrees
                self.totalNanoTreeCounts = {key: sum(values) for key,values in self.nanoTreeCounts.iteritems() if len(values) > 0 and type(values[0]) in [int, float, long]}

                # print summary table
                countBranches = self.totalNanoTreeCounts.keys()
                # number of rows of the table: length of the first numeric branch with more than one value
                depth = None
                for key,values in self.nanoTreeCounts.iteritems():
                    if values and len(values)>1 and type(values[0]) in [int, float, long]:
                        depth = len(values)
                        break
                print("-"*160)
                print("tree".ljust(25), ''.join([countBranch.ljust(25) for countBranch in countBranches]))
                if depth:
                    for treeNum in range(depth):
                        print(("%d"%(treeNum+1)).ljust(25),''.join([('%r'%self.nanoTreeCounts[countBranch][treeNum]).ljust(25) for countBranch in countBranches]))
                print("\x1b[34m","sum".ljust(24), ''.join([('%r'%self.totalNanoTreeCounts[countBranch]).ljust(25) for countBranch in countBranches]),"\x1b[0m")
                print("-"*160)

                # fill summed tree (create new tree)
                self.histograms['Runs'] = ROOT.TTree('Runs', 'count histograms for nano')
                # the buffers are kept in this dict until Fill() below has been called
                nanoTreeCountBuffers = {}
                for key, value in self.totalNanoTreeCounts.iteritems():
                    if type(value) == int:
                        # 64 bit signed int
                        typeCode = 'L'
                    elif type(value) == long:
                        typeCode = 'L'
                    elif type(value) == float:
                        typeCode = 'f'
                    nanoTreeCountBuffers[key] = array.array(typeCode, [value])
                    self.histograms['Runs'].Branch(key, nanoTreeCountBuffers[key], '{name}/{typeCode}'.format(name=key, typeCode=typeCode))
                self.histograms['Runs'].Fill()

    def __del__(self):
        # finalizer: delegate all clean-up to delete()
        self.delete()

    def delete(self):
        self.callbacks = None
        # close possible left open files referencing the TChain and delete output trees
        try:
            if self.tree:
                self.tree.Reset()
        except:
            pass
        self.fileLocator = None
        self.config = None
        for outputTree in self.outputTrees:
            del outputTree['file']
        try:
            for formulaName, formula in self.formulas.iteritems():
                if formula:
                    del formula
                    formula = None
        except e:
            print("EXCEPTION:", e)
        try:
            for outputTree in self.outputTrees:
                if outputTree['tree']:
                    del outputTree['tree']
                    outputTree['tree'] = None
        except e:
            print("EXCEPTION:", e)
        try:
            if self.tree:
                del self.tree
                self.tree = None
        except e:
            print("EXCEPTION:", e)

    # ------------------------------------------------------------------------------
    # return full list of sample root files 
    # ------------------------------------------------------------------------------
    def getAllSampleFileNames(self): 
        # given argument is list -> this is already the list of root files
        if type(self.samples) == list:
            sampleFileNames = self.samples
        # given argument is name and folder -> glob
        elif type(self.samples) == dict:
            if 'sample' in self.samples:
                sampleName = self.samples['sample'].identifier
            else:
                sampleName = self.samples['name']
            self.sampleIdentifier = sampleName
            sampleFolder = self.samples['folder']
            samplesMask = self.fileLocator.getLocalFileName(sampleFolder) + '/' + sampleName + '/*.root'
            redirector = self.fileLocator.getRedirector(sampleFolder)
            if self.verbose:
                print ("INFO: use ", samplesMask)
            sampleFileNames = glob.glob(samplesMask)
            sampleFileNames = [self.fileLocator.addRedirector(redirector, x) for x in sampleFileNames]
            if self.verbose:
                print ("INFO: found ", len(sampleFileNames), " files.")
        # given argument is a single file name -> read this .txt file 
        else:
            sampleTextFileName = self.samples
            if os.path.isfile(sampleTextFileName):
                self.sampleTextFileName = sampleTextFileName
                if self.verbose:
                    print('open samples .txt file: %s' % self.sampleTextFileName)
            else:
                print("\x1b[31mERROR: file not found: %s \x1b[0m" % sampleTextFileName)
                return

            with open(self.sampleTextFileName, 'r') as sampleTextFile:
                sampleFileNames = sampleTextFile.readlines()
        return sampleFileNames

    # ------------------------------------------------------------------------------
    # return lists of sample root files, split into chunks with certain size  
    # ------------------------------------------------------------------------------
    def getSampleFileNameChunks(self):
        sampleFileNames = self.getAllSampleFileNames()
        if self.splitFilesChunkSize > 0 and len(sampleFileNames) > self.splitFilesChunkSize:
            sampleFileNamesParts = [sampleFileNames[i:i + self.splitFilesChunkSize] for i in xrange(0, len(sampleFileNames), self.splitFilesChunkSize)]
        else:
            sampleFileNamesParts = [sampleFileNames]
        self.numParts = len(sampleFileNamesParts)
        return sampleFileNamesParts

    # ------------------------------------------------------------------------------
    # return lists of sample root files for a single chunk  
    # ------------------------------------------------------------------------------
    def getSampleFileNameChunk(self, chunkNumber):
        chunks = self.getSampleFileNameChunks()
        if chunkNumber > 0 and chunkNumber <= len(chunks):
            return chunks[chunkNumber-1]
        else:
            print("\x1b[31mERROR: invalid chunk number {n} \x1b[0m".format(n=chunkNumber))

    def getNumberOfParts(self):
        # number of chunks, as last computed by getSampleFileNameChunks()
        return self.numParts

    # ------------------------------------------------------------------------------
    # add a TTreeFormula connected to the TChain
    # ------------------------------------------------------------------------------
    def addFormula(self, formulaName, formula=None):
        """Create a TTreeFormula on self.tree and register it under ``formulaName``.

        formulaName -- key in self.formulas; also used as the expression when
                       ``formula`` is None
        formula -- formula expression string

        Raises Exception("SampleTreeAddTTreeFormulaFailed") if the created
        formula reports GetNdim() == 0.
        """
        expression = formulaName if formula is None else formula

        # there might be an undocumented limit on the length of cutstrings in ROOT...
        if len(expression) > 1023:
            print("\x1b[41m\x1b[97m------------------------------------------------------------------------------")
            print(" WARNING !!! ROOT.TTreeFormula of length %d, this might cause problems !!"%len(expression))
            print(" reduce length of formulas if problems occur, e.g. by passing lists of cut formulas!")
            print("------------------------------------------------------------------------------\x1b[0m")

        definition = {'name': formulaName, 'formula': expression}
        self.formulaDefinitions.append(definition)
        self.formulas[formulaName] = ROOT.TTreeFormula(formulaName, expression, self.tree)
        if self.formulas[formulaName].GetNdim() != 0:
            return
        print("DEBUG: formula is:", expression)
        print("\x1b[31mERROR: adding the tree formula failed! Check branches of input tree and loaded namespaces.\x1b[0m")
        raise Exception("SampleTreeAddTTreeFormulaFailed")

    # ------------------------------------------------------------------------------
    # return list of formulas
    # ------------------------------------------------------------------------------
    def getFormulas(self):
        # returns the internal dict itself (formula name -> formula object), not a copy
        return self.formulas

    # ------------------------------------------------------------------------------
    # add a new branch
    # ------------------------------------------------------------------------------
    def addOutputBranch(self, branchName, formula, branchType='f', length=1, arguments=None, leaflist=None, arrayStyle=False):
        # this is needed to overwrite the branch if it already exists!
        self.addBranchToBlacklist(branchName)

        # function
        if callable(formula):
            newBranch = {'name': branchName, 'function': formula, 'type': branchType, 'length': length}
            if arguments:
                newBranch['arguments'] = arguments
        # string which contains a TTreeFormula expression
        else:
            formulaName = 'alias:' + branchName
            self.addFormula(formulaName, formula)
            newBranch = {'name': branchName, 'formula': formulaName, 'type': branchType, 'length': length}
        if leaflist:
            newBranch['leaflist'] = leaflist
        if arrayStyle:
            newBranch['arrayStyle'] = True
        self.newBranches.append(newBranch)

    # ------------------------------------------------------------------------------
    # pass a list of dictionaries of branches to add
    # TODO: avoid detour via addOutputBranch and set dictionary directly
    # ------------------------------------------------------------------------------
    def addOutputBranches(self, branchDictList):
        for branchDict in branchDictList:
            self.addOutputBranch(
                branchName=branchDict['name'],
                formula=branchDict['formula'],
                branchType=branchDict['type'] if 'type' in branchDict else 'f',
                length=branchDict['length'] if 'length' in branchDict else 1,
                arguments=branchDict['arguments'] if 'arguments' in branchDict else None,
                leaflist=branchDict['leaflist'] if 'leaflist' in branchDict else None,
                arrayStyle=branchDict['arrayStyle'] if 'arrayStyle' in branchDict else False,
            )

    # ------------------------------------------------------------------------------
    # implement iterator for TChain, with updating TTreeFormula objects on tree
    # switching and show performance statistics during loop
    # ------------------------------------------------------------------------------
    def next(self):
        self.treeIterator.next()
        self.eventsRead += 1
        if self.debug and self.eventsRead % 1000 == 0:
            print('DEBUG: %d events read'%self.eventsRead)
        treeNum = self.tree.GetTreeNumber()
        # TTreeFormulas have to be updated when the tree number changes in a TChain
        if treeNum != self.oldTreeNum:

            # update ETA estimates
            if treeNum == 0:
                self.timeStart = time.time()
                perfStats = '?'
            else:
                fraction = 1.0*treeNum/len(self.chainedFiles)
                passedTime = time.time() - self.timeStart
                self.timeETA = (1.0-fraction)/fraction * passedTime if fraction > 0 else 0
                perfStats = 'INPUT: {erps}/s, OUTPUT: {ewps}/s '.format(erps=self.eventsRead / passedTime if passedTime>0 else 0, ewps=sum([x['passed'] for x in self.outputTrees]) / passedTime if passedTime>0 else 0)

            # output status
            if self.verbose or self.debug:
                percentage = 100.0*treeNum/len(self.chainedFiles)
                if treeNum == 0:
                    print ('INFO: time ', time.ctime())
                if self.debug:
                    perfStats = perfStats + ' max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                print ('INFO: switching trees --> %d (=%1.1f %%, ETA: %s min, %s)'%(treeNum, percentage, self.getETA(), perfStats))
                if self.debugProfiling:
                    self.tree.PrintCacheStats()
                sys.stdout.flush()
            self.oldTreeNum = treeNum
            # update TTreeFormula's
            for formulaName, treeFormula in self.formulas.iteritems():
                treeFormula.UpdateFormulaLeaves()
        return self.tree

    def __iter__(self):
        # (re)start iteration over the chain; next() advances this iterator
        self.treeIterator = self.tree.__iter__()
        return self

    # wrapper to evaluate a formula, which has been added to the formula dictionary
    # vector valued formulas are not supported
    def evaluate(self, formulaName):
        if formulaName in self.formulas:
            if self.formulas[formulaName].GetNdata() > 0:
                return self.formulas[formulaName].EvalInstance()
            else:
                return 0
        else:
            existingFormulas = [x for x,y in self.formulas.iteritems()]
            print ("existing formulas are: ", existingFormulas)
            raise Exception("SampleTree::evaluate: formula '%s' not found!"%formulaName)

    # evaluates a vector formula and fills an array, returns the number of dimensions of the formula
    def evaluateArray(self, formulaName, destinationArray):
        if formulaName in self.formulas:
            nData = self.formulas[formulaName].GetNdata()
            for i in range(nData):
                destinationArray[i] = self.formulas[formulaName].EvalInstance(i)
            return nData
        else:
            existingFormulas = [x for x, y in self.formulas.iteritems()]
            print("existing formulas are: ", existingFormulas)
            raise Exception("SampleTree::evaluate: formula '%s' not found!" % formulaName)

    # return string of ETA in minutes
    def getETA(self):
        return '%1.1f'%(self.timeETA/60.0) if self.timeETA > 0 else '?'

    def GetListOfBranches(self):
        # forwards to the chain; fails if self.tree is None (e.g. countOnly or no file chained)
        return self.tree.GetListOfBranches()

    # ------------------------------------------------------------------------------
    # handle 'tree-typed' cuts, passed as dictionary:
    # e.g. cut = {'OR': [{'AND': ["pt>20","eta<3"]}, "data==1"]}
    # short-circuit evaluation is handled by builtins any() and all()
    # ------------------------------------------------------------------------------
    def addCutDictRecursive(self, cutDict):
        if type(cutDict) == str:
            if cutDict not in self.formulas:
                self.addFormula(cutDict, cutDict)
        elif 'OR' in cutDict and 'AND' in cutDict:
            raise Exception("BadTreeTypeCutDict")
        elif 'OR' in cutDict:
            for subDict in cutDict['OR']:
                self.addCutDictRecursive(subDict)
        elif 'AND' in cutDict:
            for subDict in cutDict['AND']:
                self.addCutDictRecursive(subDict)
        else:
            raise Exception("BadTreeTypeCutDict")

    def evaluateCutDictRecursive(self, cutDict):
        if type(cutDict) == str:
            if self.formulaResults[cutDict] is None:
                print ("FORMULA:", cutDict)
                raise Exception("UnevaluatedFormula!!")
            return self.formulaResults[cutDict]
        elif 'OR' in cutDict and 'AND' in cutDict:
            raise Exception("BadTreeTypeCutDict")
        elif 'OR' in cutDict:
            return any([self.evaluateCutDictRecursive(subDict) for subDict in cutDict['OR']])
        elif 'AND' in cutDict:
            return all([self.evaluateCutDictRecursive(subDict) for subDict in cutDict['AND']])
        else:
            raise Exception("BadTreeTypeCutDict")
    
    # set callback function, which MUST return a boolean. To continue processing this event, the function must return True. False means skip this event!
    def setCallback(self, category, fcn):
        """Replace all callbacks of *category* by the single function *fcn*.

        Only the category ``'event'`` is accepted. A warning is printed if
        callbacks were already registered for the category.

        Raises:
            Exception("CallbackEventDoesNotExist"): for any other category.
        """
        supportedCategories = ['event']
        if category not in supportedCategories:
            raise Exception("CallbackEventDoesNotExist")

        alreadyRegistered = category in self.callbacks
        if alreadyRegistered:
            print("WARNING: callback function for ", category, " is overwritten!")

        # always a one-element list, previous entries are dropped
        self.callbacks[category] = [fcn]

    # add callback function, which MUST return a boolean. To continue processing this event, the function must return True. False means skip this event!
    def addCallback(self, category, fcn):
        """Append *fcn* to the callbacks registered for *category*.

        Only the category ``'event'`` is accepted; the list for the category is
        created on first use.

        Raises:
            Exception("CallbackEventDoesNotExist"): for any other category.
        """
        supportedCategories = ['event']
        if category not in supportedCategories:
            raise Exception("CallbackEventDoesNotExist")

        if category in self.callbacks:
            registered = self.callbacks[category]
        else:
            registered = []
            self.callbacks[category] = registered
        registered.append(fcn)

    # ------------------------------------------------------------------------------
    # add output tree to be written during the process() function
    # ------------------------------------------------------------------------------
    def addOutputTree(self, outputFileName, cut, hash='', branches=None, callbacks=None, cutSequenceMode='AND', name=''):
        """Register an output tree description to be written by ``process()``.

        Nothing is opened or created here; a dictionary describing the output
        is appended to ``self.outputTrees``.

        Args:
            outputFileName: name of the output file; also used to detect duplicates.
            cut: cut specification, stored unchanged under ``'cut'``.
            hash: stored unchanged under ``'hash'``.
            branches: stored unchanged under ``'branches'``.
            callbacks: stored unchanged under ``'callbacks'``.
            cutSequenceMode: one of ``'AND'``, ``'OR'``, ``'TREE'``.
            name: stored unchanged under ``'name'``.

        Returns:
            ``True`` if the output tree was registered, ``False`` if an output
            with the same file name already exists (the call is then ignored).

        Raises:
            Exception("InvalidCutSequenceMode"): for an unknown cutSequenceMode.
        """
        # write events which satisfy either ONE of the conditions given in the list or ALL
        if cutSequenceMode not in ['AND', 'OR', 'TREE']:
            raise Exception("InvalidCutSequenceMode")

        if any(x['fileName'] == outputFileName for x in self.outputTrees):
            print("WARNING: skipping duplicate file ", outputFileName, "!")
            return False

        outputTree = {
            'tree': None,  # will create this tree later, after it is known which branches will be enabled
            'name': name,
            'fileName': outputFileName,
            'file': None,
            'cut': cut,
            'cutSequence': [],
            'cutSequenceMode': cutSequenceMode,
            'hash': hash,
            'branches': branches,
            'callbacks': callbacks,
            'passed': 0,
        }

        self.outputTrees.append(outputTree)
        # explicit success value, so that it can be told apart from the duplicate case
        return True
    
    # these branches are ALWAYS removed (e.g. because they will be recomputed), even when they are in the 'keep_branches' list
    def addBranchToBlacklist(self, branchName):
        """Append *branchName* to ``self.removeBranches``.

        The wildcard ``'*'`` is refused: a warning is printed and nothing is added.
        """
        if branchName == '*':
            print("WARNING: can't add branch '*' to blacklist => igonre it!")
            return
        self.removeBranches.append(branchName)

    # wrapper to enable/disable branches in the TChain
    def SetBranchStatus(self, branchName, branchStatus):
        """Forward to ``self.tree.SetBranchStatus`` for known branches or wildcard patterns.

        The call is silently skipped when *branchName* is neither found in
        ``self.GetListOfBranches()`` nor contains a ``'*'``.
        """
        knownBranches = self.GetListOfBranches()
        if not knownBranches.FindObject(branchName) and '*' not in branchName:
            return
        self.tree.SetBranchStatus(branchName, branchStatus)

    # enables ONLY the given branches (* wildcards supported) and checks existence before enabling them to avoid warning messages during tree iteration
    def enableBranches(self, listOfBranchesToKeep):
        """Disable all branches of ``self.tree`` and re-enable only the requested ones.

        A requested name is enabled if it is found in ``self.GetListOfBranches()``
        or contains a ``'*'`` wildcard; other names are skipped. A summary is
        printed, and the enabled names are printed as well when ``self.verbose``
        is set.
        """
        existingBranches = self.GetListOfBranches()

        # switch everything off first, then turn the selected branches back on
        self.tree.SetBranchStatus("*", 0)
        enabledBranches = [
            name for name in listOfBranchesToKeep
            if existingBranches.FindObject(name) or '*' in name
        ]
        for name in enabledBranches:
            self.tree.SetBranchStatus(name, 1)

        print("INFO: reduced number of enabled branches from", len(existingBranches), " to", len(enabledBranches), " (branches with wildcards may not be correctly counted)")
        if self.verbose:
            print ("INFO: branches:", BranchList(enabledBranches).getShortRepresentation())

    # ------------------------------------------------------------------------------
    # loop over all entries in the TChain and copy events to output trees, if the
    # cuts are fulfilled.
    # ------------------------------------------------------------------------------
    def process(self):
        """Loop over all events and fill every registered output tree whose cut is passed.

        Steps, in order:
          1. optional debug/verbose printout (resource limits, output trees, formulas)
          2. collect the branches needed by the output trees, cuts, formula
             definitions and the config, and enable only those (unless '*' is requested)
          3. disable the blacklisted branches in ``self.removeBranches``
          4. open the output files, clone the count histograms and the (empty) tree structure
          5. register the cut formulas of all output trees
          6. allocate arrays and create the branches listed in ``self.newBranches``
          7. run the 'beforeLoop' callbacks of the output trees
          8. event loop: event callbacks, new-branch filling, formula evaluation,
             cut evaluation and ``Fill()`` of the passing output trees
          9. write and close the output files, optionally drop the trees from memory
         10. run the 'afterWrite' callbacks and print a summary

        Raises:
            Exception("OutputFileBroken"): an output file could not be opened.
            Exception("InvalidCutSequenceMode"): a dict cut was given without
                cutSequenceMode='TREE', or an unknown mode is found in the event loop.
        """
        if self.debug:
            # print the current process limits (data segment, address space, stack) and the peak memory usage
            rsrc = resource.RLIMIT_DATA
            # restrict memory
            # resource.setrlimit(rsrc, (2.0*1024*1024*1024, 6*1024*1024*1024))
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: mem limits soft/hard:', soft, hard)
            rsrc = resource.RLIMIT_AS
            # restrict memory
            # resource.setrlimit(rsrc, (2.0*1024*1024*1024, 6*1024*1024*1024))
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: AS limits soft/hard:', soft, hard)
            rsrc = resource.RLIMIT_STACK
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: stack limits soft/hard:', soft, hard)
            print('DEBUG: max mem used:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        if self.verbose:
            print ('OUTPUT TREES:')
            for outputTree in self.outputTrees:
                # cuts can be very long: show at most the first 50 characters of their repr
                cutString = "%r"%outputTree['cut']
                if len(cutString) > 50:
                    cutString = cutString[0:50] + '...(%s more chars)'%(len(cutString)-50)
                print (' > ', outputTree['fileName'], ' <== ', outputTree['name'] if 'name' in outputTree else outputTree['hash'], ' cut: ', cutString)
            print ('FORMULAS:')
            # NOTE(review): dict.iteritems() exists on Python 2 only
            for formulaName, formula in self.formulas.iteritems():
                print (' > \x1b[35m', formulaName, '\x1b[0m ==> ', formula)

        # find common set of branches which needs to be enabled for cuts and desired variables in all of the output trees
        listOfBranchesToKeep = []
        for outputTree in self.outputTrees:
            if 'branches' in outputTree and outputTree['branches']:
                listOfBranchesToKeep += outputTree['branches']
            if 'cut' in outputTree and outputTree['cut']:
                listOfBranchesToKeep += BranchList(outputTree['cut']).getListOfBranches()
            # NOTE(review): this loop does not depend on outputTree but is nested inside the
            # output tree loop, so it is repeated per output tree (duplicates are removed by
            # the set() below) and is skipped entirely when there are no output trees
            for formula in self.formulaDefinitions:
                listOfBranchesToKeep += BranchList(formula['formula']).getListOfBranches()

        # keep the branches stated in config, (unless they will be recomputed)
        if self.config:
            # NOTE(review): eval() executes the config value as Python code; the config must be trusted
            listOfBranchesToKeep += eval(self.config.get('Branches', 'keep_branches'))
        # remove duplicates (the order of the branches is not preserved)
        listOfBranchesToKeep = list(set(listOfBranchesToKeep))

        # disable the branches in the input if there is no output tree which wants to have all branches
        if '*' not in listOfBranchesToKeep and len(listOfBranchesToKeep) > 0:
            self.enableBranches(listOfBranchesToKeep)
        else:
            if len(self.removeBranches) < 1:
                print("INFO: keep all branches")
            else:
                print("INFO: keep all branches but the following:")
                print("INFO:", ", ".join(self.removeBranches))

        # now disable all branches, which will be e.g. recomputed
        for branchName in self.removeBranches:
            self.SetBranchStatus(branchName, 0)

        # initialize the output trees, this has to be called after the calls to SetBranchStatus
        for outputTree in self.outputTrees:
            outputTree['file'] = ROOT.TFile.Open(outputTree['fileName'], 'recreate')
            if not outputTree['file'] or outputTree['file'].IsZombie():
                print ("\x1b[31mERROR: output file broken\x1b[0m")
                raise Exception("OutputFileBroken")
        
            # copy count histograms to output files
            outputTree['histograms'] = {}
            for histogramName, histogram in self.histograms.iteritems():
                    outputTree['histograms'][histogramName] = histogram.Clone(histogram.GetName())
                    outputTree['histograms'][histogramName].SetDirectory(outputTree['file'])

            # clone tree structure, but don't copy any entries
            outputTree['file'].cd()
            outputTree['tree'] = self.tree.CloneTree(0)
            # can be used to reduce memory consumption
            # NOTE(review): SetBasketSize is called on the cloned tree before the "tree broken"
            # check below, so a failed clone would already fail here -- confirm the intended order
            if self.outputTreeBasketSize:
                outputTree['tree'].SetBasketSize("*", self.outputTreeBasketSize)
            if not outputTree['tree']:
                print ("\x1b[31mWARNING: output tree broken. try to recover!\x1b[0m")
                # if input tree has 0 entries, don't copy 0 entries to the output tree, but ALL of them instead! (sic!)
                # (this is done by omitting the argument to CloneTree)
                outputTree['tree'] = self.tree.CloneTree()
                if not outputTree['tree']:
                    print ("\x1b[31mERROR: output tree broken, input tree: ", self.tree, " \x1b[0m")
                else:
                    print ("\x1b[32mINFO: recovered\x1b[0m")
            # NOTE(review): if the recovery above failed, the tree is still falsy when SetDirectory is called
            outputTree['tree'].SetDirectory(outputTree['file'])

        # add CUT formulas, this has to be called after the calls to SetBranchStatus
        for outputTree in self.outputTrees:
            if outputTree['cutSequenceMode'] == 'TREE' and type(outputTree['cut']) == dict:
                # tree-typed cut: the dictionary itself is used as the cut sequence
                outputTree['cutSequence'] = outputTree['cut']
                # now recursively parse the cut-tree and add all contained cut formulas
                self.addCutDictRecursive(outputTree['cut'])
            elif type(outputTree['cut']) == dict:
                print ("HINT: use cutSequenceMode='TREE' to pass dictionaries!")
                raise Exception("InvalidCutSequenceMode")
            else:
                # cut passed as string or list of strings
                cutList = outputTree['cut'] if type(outputTree['cut']) == list else [outputTree['cut']]
                for i, cutString in enumerate(cutList):
                    # the cut string without spaces serves as the formula name
                    formulaName = cutString.replace(' ', '')
                    if formulaName not in self.formulas:
                        self.addFormula(formulaName, cutString)
                    outputTree['cutSequence'].append(formulaName)

        # prepare memory for new branches to be written
        # branch types listed here are stored in a python array of the mapped type code
        pyTypes = {'O': 'i'}
        for outputTree in self.outputTrees:
            outputTree['newBranchArrays'] = {}
            outputTree['newBranches'] = {}
            for branch in self.newBranches:
                # convert ROOT type convention to python array type convention if necessary
                pyType = pyTypes[branch['type']] if branch['type'] in pyTypes else branch['type'] 
                outputTree['newBranchArrays'][branch['name']] = array.array(pyType, [0] * branch['length'])
                if 'leaflist' in branch:
                    leafList = branch['leaflist']
                else:
                    # default leaf list: "name[length]/TYPE", the "[length]" part is omitted for length 1
                    leafList = '{name}{length}/{type}'.format(name=branch['name'], length='[%d]'%branch['length'] if branch['length'] > 1 else '', type=branch['type'].upper())
                outputTree['newBranches'][branch['name']] = outputTree['tree'].Branch(branch['name'], outputTree['newBranchArrays'][branch['name']], leafList)
        if len(self.newBranches) > 0:
            print("ADD NEW BRANCHES:")
            for branch in self.newBranches:
                print(" > \x1b[32m{name}\x1b[0m {formula}".format(
                    name=(branch['name']+('[%d]'%branch['length'] if branch['length'] > 1 else '')).ljust(30),
                    formula=branch['formula'] if 'formula' in branch else 'function:\x1b[33m{fct}\x1b[0m'.format(fct=branch['function'].__name__)
                ))

        # callbacks before loop
        for outputTree in self.outputTrees:
            if outputTree['callbacks'] and 'beforeLoop' in outputTree['callbacks']:
                outputTree['callbacks']['beforeLoop']()

        print ("------------------")
        print (" start processing ")
        print ("------------------")
        # loop over all events and write to output branches
        for event in self:

            # new event callback
            if self.callbacks and 'event' in self.callbacks:
                # if callbacks return false, skip event!
                # (all callbacks are called, even if an earlier one already returned False)
                callbackResults = [fcn(event) for fcn in self.callbacks['event']]
                if not all(callbackResults):
                    continue

            # fill branches
            for branch in self.newBranches:
                # evaluate result either as function applied on the tree entry or as TTreeFormula
                if branch['length'] == 1 and not 'arrayStyle' in branch:
                    if 'function' in branch:
                        if 'arguments' in branch:
                            branchResult = branch['function'](event, arguments=branch['arguments'])
                        else:
                            branchResult = branch['function'](event)
                    else:
                        branchResult = self.evaluate(branch['formula'])
                    if 'type' in branch and branch['type'] == 'i':
                        branchResult = int(branchResult)
                    # fill it for all the output trees
                    for outputTree in self.outputTrees:
                        outputTree['newBranchArrays'][branch['name']][0] = branchResult
                # for arrays pass the pointer to the array to the evaluation function to save the list->array conversion
                else:
                    if 'function' in branch:
                        # todo: make it more efficient by using a shared memory block for all of the output trees'
                        # todo: branches, this would help in case one adds new branches and writes to several trees at once
                        for outputTree in self.outputTrees:
                            if 'arguments' in branch:
                                branch['function'](event, destinationArray=outputTree['newBranchArrays'][branch['name']], arguments=branch['arguments'])
                            else:
                                branch['function'](event, destinationArray=outputTree['newBranchArrays'][branch['name']])
                    else:
                        for outputTree in self.outputTrees:
                            self.evaluateArray(branch['formula'], destinationArray=outputTree['newBranchArrays'][branch['name']])

            # evaluate all formulas
            # (the results are recomputed from scratch for every event)
            self.formulaResults = {}
            for formulaName, formula in self.formulas.iteritems():
                self.formulaResults[formulaName] = self.evaluate(formulaName)

            # evaluate cuts for all output trees
            for outputTree in self.outputTrees:

                # evaluate all cuts of the sequence and abort early if one is not satisfied
                if outputTree['cutSequenceMode'] == 'AND':
                    passedCut = True
                    for cutFormulaName in outputTree['cutSequence']:
                        passedCut = passedCut and self.formulaResults[cutFormulaName]
                        if not passedCut:
                            break
                elif outputTree['cutSequenceMode'] == 'OR':
                    # stop at the first cut which is satisfied
                    passedCut = False
                    for cutFormulaName in outputTree['cutSequence']:
                        passedCut = passedCut or self.formulaResults[cutFormulaName]
                        if passedCut:
                            break
                elif outputTree['cutSequenceMode'] == 'TREE':
                    passedCut = self.evaluateCutDictRecursive(outputTree['cutSequence'])
                else:
                    raise Exception("InvalidCutSequenceMode")

                # fill event if it passed the selection
                if passedCut:
                    outputTree['tree'].Fill()
                    outputTree['passed'] += 1

        print('INFO: end of processing. time ', time.ctime())
        sys.stdout.flush()

        # write files
        for outputTree in self.outputTrees:
            outputTree['file'].Write()
            outputTree['file'].Close()
        print('INFO: files written')
        print('INFO: saveMemory is ', self.saveMemory)
        sys.stdout.flush()

        if self.saveMemory:
            # drop the input tree and the references to the output trees;
            # self.tree is None afterwards
            self.tree.Reset()
            self.tree = None
            for outputTree in self.outputTrees:
                outputTree['tree'] = None
            print('INFO: trees in memory destroyed!')

        # callbacks after having written file
        for outputTree in self.outputTrees:
            if outputTree['callbacks'] and 'afterWrite' in outputTree['callbacks']:
                # exceptions raised by these callbacks are only reported, they do not abort the processing
                try:
                    outputTree['callbacks']['afterWrite']()
                except Exception as e:
                    print("\x1b[31mWARNING: exception during callback:", e, "\x1b[0m")

        print('INFO: done. time ', time.ctime(), ' events read:', self.eventsRead)
        sys.stdout.flush()

        # summary: number and percentage of events written per output tree ('?' if no events were read)
        for outputTree in self.outputTrees:
            passedSelectionFraction = 100.0*outputTree['passed']/self.eventsRead if self.eventsRead>0 else '?'
            print (' > \x1b[34m{name}\x1b[0m {passed} ({fraction}%) => {outputFile}'.format(name=outputTree['name'], passed=outputTree['passed'], fraction=passedSelectionFraction, outputFile=outputTree['fileName']))
        sys.stdout.flush()

    @staticmethod
    def countSampleFiles(samples):
        """Return the number of sample files.

        Args:
            samples: either a ``list`` (its length is returned) or the name of
                a text file (the number of lines in the file is returned).

        Returns:
            The count, or ``-1`` (after printing an error) if *samples* is not a
            list and no file of that name exists.
        """
        if type(samples) is list:
            return len(samples)

        # otherwise it is the name of a text file listing the sample files
        if not os.path.isfile(samples):
            print ('ERROR: sample list text file does not exist:', samples)
            return -1

        with open(samples, 'r') as sampleTextFile:
            return len(sampleTextFile.readlines())

    def getNumSampleFiles(self):
        """Return the number of entries in ``self.sampleFileNames``."""
        numSampleFiles = len(self.sampleFileNames)
        return numSampleFiles

    # return the total scale for the sample, calculated from all count histograms from the TChain
    def getScale(self, sample, countHistogram=None):
        """Return the total scale ``lumi * xsec * sf / count`` for *sample*.

        The event count is taken from, in this order:
          * ``self.totalNanoTreeCounts`` if it is non-empty: either the entries of
            the ``'autoPU'`` histogram (config option ``countsFromAutoPU``) or the
            entry named by *countHistogram* / config ``countTreeName`` /
            ``'genEventSumw'``;
          * otherwise the histograms in ``self.histograms``: the one named by
            *countHistogram*, or ``CountPosWeight - CountNegWeight``, or ``Count``
            as last fallback (bin 1 in all cases).
        A value in the config section ``EventCounts`` under the key
        ``self.sampleIdentifier`` overrides the count.

        Args:
            sample: object with attributes ``xsec``, ``lumi`` and ``sf``. If
                ``xsec`` is subscriptable it is REPLACED on the object by its
                first element.
            countHistogram: optional name of the count histogram/entry to use.

        Returns:
            The scale as a float.

        Raises:
            SystemExit: with exit status 1 if no count histogram is available.
        """
        # xsec may be given as a sequence, in that case only the first element is used
        try:
            sample.xsec = sample.xsec[0]
        except Exception:
            # scalar (not subscriptable) or empty: keep the value as it is
            pass

        if self.totalNanoTreeCounts:
            # NOTE(review): eval() executes the config value as Python code; the config must be trusted
            if self.config.has_option('Configuration', 'countsFromAutoPU') and eval(self.config.get('Configuration', 'countsFromAutoPU')):
                count = self.histograms['autoPU'].GetEntries()
                countHistogram = "autoPU.GetEntries()"
            else:
                if not countHistogram:
                    countHistogram = self.config.get('Configuration', 'countTreeName') if self.config.has_option('Configuration', 'countTreeName') else 'genEventSumw'
                count = self.totalNanoTreeCounts[countHistogram]
        else:
            if not countHistogram:
                try:
                    posWeight = self.histograms['CountPosWeight'].GetBinContent(1)
                    negWeight = self.histograms['CountNegWeight'].GetBinContent(1)
                    count = posWeight - negWeight
                    countHistogram = 'CountPosWeight - CountNegWeight'
                except Exception:
                    if self.verbose:
                        print("sampleTree: no CountPosWeight/CountNegWeight: using Count instead!!!!!!!!!!!")
                    try:
                        count = self.histograms['Count'].GetBinContent(1)
                        # only used for the verbose printout below
                        countHistogram = 'Count'
                    except Exception as e:
                        print ("EXCEPTION:", e)
                        print ("ERROR: no weight histograms found in sampleTree => terminate")
                        print ("HISTOGRAMS:", self.histograms)
                        # this is a fatal error: terminate with a non-zero exit status
                        sys.exit(1)
            else:
                count = self.histograms[countHistogram].GetBinContent(1)

        # override event counts: config needs a section 'EventCounts' with sample identifier strings as keys and the new count as value
        # [EventCounts]
        # SampleIdentifier = 12345
        try:
            if self.sampleIdentifier and self.config.has_section('EventCounts') and self.config.has_option('EventCounts', self.sampleIdentifier):
                # NOTE(review): eval() on a config value, see above
                countNew = eval(self.config.get('EventCounts', self.sampleIdentifier))
                print("\x1b[97m\x1b[41mINFO: overwrite event counts with values from config!!!\n value from file:", count, "\n value from config:", countNew," <--- will be used!\x1b[0m")
                count = countNew
        except Exception as e:
            print("\x1b[31mException:",e," -> overwriting of event counts has been disabled\x1b[0m")

        lumi = float(sample.lumi)
        theScale = lumi * sample.xsec * sample.sf / float(count)

        if self.verbose:
            print("sampleTree.getScale(): sample: ", sample, "lumi: ", lumi, "xsec: ", sample.xsec, "sample.sf: ", sample.sf, "count (", countHistogram, "):", count, " ---> using scale: ", theScale)
        return theScale

    # create a unique string representation of the total cut, e.g. used to calculate the hash for cached samples 
    # this is not required to be a 'real' cut string, used by TTreeFormula etc.
    @staticmethod
    def findMinimumCut(cutList, cutSequenceMode='AND'):
        """Build a unique string representation of the total cut.

        Args:
            cutList: a cut string, a list of cut strings or a cut dictionary.
            cutSequenceMode: ``'AND'``, ``'OR'`` or ``'TREE'``.

        Returns:
            * the ``repr`` of the cuts for mode ``'TREE'``, for a dictionary and
              for any unknown mode;
            * for ``'AND'``: the sorted cuts, spaces removed, each in
              parentheses, joined by ``'&&'`` (duplicates are kept);
            * for ``'OR'``: the same with duplicates removed, joined by ``'||'``.
        """
        # anything which is not a list or dict is treated as a single cut
        cuts = cutList if type(cutList) in (list, dict) else [cutList]

        if cutSequenceMode == 'TREE' or type(cutList) is dict:
            return repr(cuts)
        if cutSequenceMode == 'AND':
            return '&&'.join('(%s)' % singleCut.replace(' ', '') for singleCut in sorted(cuts))
        if cutSequenceMode == 'OR':
            return '||'.join('(%s)' % singleCut.replace(' ', '') for singleCut in sorted(set(cuts)))
        return repr(cuts)

    def GetEntries(self):
        """Return whatever ``self.tree.GetEntries()`` returns (thin delegate)."""
        numEntries = self.tree.GetEntries()
        return numEntries

    def Print(self):
        """Print a Python snippet which can be used to load this sample again.

        The snippet lists all entries of ``self.sampleFileNames`` and the
        redirector returned by ``self.fileLocator.getXrootdRedirector()``.
        """
        headerLines = (
            "\x1b[34m\x1b[1m---- SampleTree ----",
            "# this snippet below can be used to load this sample:",
            "import ROOT",
            "from myutils.sampleTree import SampleTree",
            "sampleTree = SampleTree([",
        )
        for headerLine in headerLines:
            print(headerLine)

        # one quoted file name per line
        for sampleFileName in self.sampleFileNames:
            print("    '" + sampleFileName + "',")

        redirector = self.fileLocator.getXrootdRedirector()
        print("], treeName='Events', xrootdRedirector='" + redirector + "')")
        print("---- end ----\x1b[0m")
Exemplo n.º 7
0
class CopyTreePSI(object):
    """Copy ROOT files to an output folder while applying a skimming cut to the event tree.

    Settings are read from the ``config`` object passed to the constructor
    (the sections ``Directories``, ``Configuration`` and ``General`` are
    accessed); file operations on the final location go through a
    ``FileLocator``. Debug output is enabled by the environment variable
    ``XBBDEBUG``.
    """

    def __init__(self, config):
        """Store *config* and create the ``FileLocator`` used for all file operations."""
        self.config = config
        self.debug = 'XBBDEBUG' in os.environ
        self.fileLocator = FileLocator(config=self.config)

    def copySingleFile(self, whereToLaunch, inputFile, outputFile, skimmingCut, remove_branches):
        """Skim one input file into a scratch file and copy the result to *outputFile*.

        The event tree (``tree``, or ``Events`` if there is no ``tree``) is copied
        with *skimmingCut* applied; every other object in the input file is
        written to the output unchanged.

        Args:
            whereToLaunch: not used by this method (kept for the call signature).
            inputFile: name/URL of the input ROOT file.
            outputFile: final location of the skimmed file.
            skimmingCut: cut string passed to ``CopyTree``.
            remove_branches: iterable of branch names/patterns to disable in the
                input tree; empty or whitespace-only entries are ignored.

        Returns:
            None. If the input file cannot be opened, a message is printed and
            the method returns without doing anything.

        Raises:
            Exception: if the copied file does not exist or is not a valid ROOT file.
        """
        if self.debug:
            print("INPUT: %s" % (inputFile,))
        inputRootFile = ROOT.TFile.Open(inputFile, 'read')
        if not inputRootFile:
            print('input file NOT EXISTING: %s' % (inputFile,))
            return

        # scratch directory: $TMPDIR if set, otherwise [Directories] scratch from the config
        try:
            tmpPath = os.environ["TMPDIR"]
        except KeyError:
            tmpPath = self.config.get('Directories', 'scratch')
        try:
            if not os.path.isdir(tmpPath):
                os.makedirs(tmpPath)
        except OSError:
            # best effort only: if the directory is unusable, opening the output file below fails
            pass
        outputFileName = outputFile.split('/')[-1]
        tmpFile = tmpPath + '/' + outputFileName
        print('outputFileName %s' % (tmpFile,))
        output = ROOT.TFile.Open(tmpFile, 'recreate')

        inputTree = inputRootFile.Get("tree")
        if not inputTree:
            inputTree = inputRootFile.Get("Events")
        nEntries = inputTree.GetEntries()
        for branch in remove_branches:
            if branch and not branch.isspace():
                inputTree.SetBranchStatus(str(branch), ROOT.kFALSE)

        # the skimmed tree is created in the current directory, which has to be the output file
        output.cd()
        print('\n\t copy file: %s with cut: %s' % (inputFile, skimmingCut))
        outputTree = inputTree.CopyTree(skimmingCut)
        kEntries = outputTree.GetEntries()
        printc('blue', '', "\t before cuts\t %s" % nEntries)
        printc('green', '', "\t survived\t %s" % kEntries)
        outputTree.AutoSave()

        # copy all the other objects of the input file
        inputRootFile.cd()
        for key in ROOT.gDirectory.GetListOfKeys():
            inputRootFile.cd()
            obj = key.ReadObj()
            # this contains the event tree, which will be copied skimmed only
            if obj.GetName() in ['tree', 'Events']:
                continue
            if self.debug:
                print("DEBUG: clone object  %s" % (obj.GetName(),))
            # all other objects are just cloned
            output.cd()
            if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                objClone = obj.CloneTree(-1)
            else:
                objClone = obj
            objClone.Write(key.GetName())
        output.Write()
        output.Close()
        inputRootFile.Close()

        print('copy to final location:\x1b[34m %s \x1b[0m' % (outputFile,))
        self.fileLocator.cp(source=tmpFile, target=outputFile)
        print('checking if the copy worked')
        # check root file existence
        if self.fileLocator.exists(outputFile, attempts=2):
            if self.fileLocator.isValidRootFile(outputFile):
                if self.debug:
                    print('DEBUG: file exists and is good!')
            else:
                raise Exception('ERROR: file DOES NOT exist OR is corrupted!')
        else:
            raise Exception("self.fileLocator.exists(outputFile, attempts=2) failed: file DOES NOT exist")

        # the scratch copy is only removed after the final copy has been validated
        self.fileLocator.rm(tmpFile)

    def copySingleFileOneInput(self, inputs):
        """Call ``copySingleFile`` with the argument tuple *inputs* unpacked."""
        return self.copySingleFile(*inputs)

    def getRedirector(self):
        """Return the xrootd prefix to prepend to the input file names.

        The redirector is taken from the environment variable ``XBBXRD``, else
        from the config option ``[Configuration] xrootdRedirectorGlobal``, else
        a built-in default is used. If ``[Configuration] inputStoragePath`` is
        set, it is appended followed by ``'/'``.
        """
        # default redirector
        redirector = 'root://xrootd-cms.infn.it/'
        try:
            if 'XBBXRD' in os.environ:
                redirector = os.environ['XBBXRD']
            elif self.config.has_option('Configuration', 'xrootdRedirectorGlobal'):
                redirector = self.config.get('Configuration', 'xrootdRedirectorGlobal')
        except Exception:
            print("could not get xrootd redirector, using default one: %s" % (redirector,))
            print("specify redirector in config [Configuration] xrootdRedirectorGlobal=..")
        # add base path where storage is located on fs (if sample txt files don't contain absolute path)
        if self.config.has_option('Configuration', 'inputStoragePath'):
            redirector += self.config.get('Configuration', 'inputStoragePath') + '/'
        return redirector

    def copytreePSI(self, pathIN, pathOUT, folderName, skimmingCut, fileList=None):
        """Skim all ``.root`` files of one sample into ``pathOUT/folderName/``.

        Args:
            pathIN: folder containing the sample list ``<folderName>.txt``.
            pathOUT: base output folder.
            folderName: sample name; names the list file and the output sub-folder.
            skimmingCut: cut string applied to the event tree of every file.
            fileList: optional list of file names; if given (and not empty) it is
                used instead of reading ``<pathIN>/<folderName>.txt``.

        Output files which already exist and are valid ROOT files are skipped;
        invalid ones are removed and produced again.
        """
        config = self.config
        fileLocator = self.fileLocator

        print('start copytreePSI.py')
        if fileList:
            fileNames = fileList
        else:
            # read the sample list and close the file right away
            with open(pathIN + '/' + folderName + '.txt') as sampleListFile:
                fileNames = sampleListFile.readlines()
        firstFileName = fileNames[0] if fileNames else None
        print('len(filenames) %s %s %s' % (len(fileNames), firstFileName, skimmingCut))

        ## search the folder containing the input files
        inputFiles = []
        print("##### COPY TREE - BEGIN ######")
        whereToLaunch = config.get('Configuration', 'whereToLaunch')
        # the option is written like a Python list of strings, it is split by hand here
        remove_branches = config.get('General', 'remove_branches').replace("[", "").replace("]", "").replace("'", "").split(',')
        print('remove_branches: %s len(remove_branches): %s' % (remove_branches, len(remove_branches)))

        redirector = self.getRedirector()
        for fileName in fileNames:
            fileName = fileName.strip()
            if fileName.lower().endswith('.root'):
                inputFiles.append(redirector + fileName)

        if len(inputFiles) == 0:
            print("No .root files found in  %s" % (pathIN + '/' + folderName,))
            return

        ## prepare output folder
        outputFolder = "%s/%s/" % (pathOUT, folderName)
        fileLocator.makedirs(outputFolder)

        ## prepare a list of input(inputFile,outputFile,skimmingCut) for the files to be processed
        inputs = []
        for inputFile in inputFiles:
            fileName = fileLocator.getFilenameAfterPrep(inputFile)
            outputFile = "%s/%s/%s" % (pathOUT, folderName, fileName)

            if fileLocator.exists(outputFile):
                if not fileLocator.isValidRootFile(outputFile):
                    # broken output from an earlier attempt: remove it and produce it again
                    fileLocator.rm(outputFile)
                    inputs.append((whereToLaunch, inputFile, outputFile, skimmingCut, remove_branches))
                else:
                    if self.debug:
                        print("SKIP INPUT: %s" % (inputFile,))
            else:
                inputs.append((whereToLaunch, inputFile, outputFile, skimmingCut, remove_branches))

        outputs = []
        multiprocess = int(config.get('Configuration', 'nprocesses'))
        if multiprocess > 1:
            ## process the input list (using multiprocess)
            from multiprocessing import Pool
            p = Pool(multiprocess)
            try:
                # NOTE(review): 'copySingleFileOneInput' is looked up as a module-level name here,
                # not as the method of this class. Unless a function of that name is defined
                # elsewhere in this module, this branch raises NameError -- confirm and fix.
                outputs = p.map(copySingleFileOneInput, inputs)
            finally:
                # always release the worker processes
                p.close()
                p.join()
        else:
            for input_ in inputs:
                output = self.copySingleFileOneInput(input_)
                outputs.append(output)

        print("##### COPY TREE - END ######")
Exemplo n.º 8
0
 def __init__(self, config):
     """Store *config*, read the XBBDEBUG debug switch from the environment and create the FileLocator."""
     # debug output is switched on by the mere presence of the environment variable
     self.debug = 'XBBDEBUG' in os.environ
     self.config = config
     self.fileLocator = FileLocator(config=config)
Exemplo n.º 9
0
class TreeCache:
    def __init__(self, sample, cutList='1', branches=None, inputFolder=None,
                 tmpFolder=None, outputFolder=None, chunkNumber=-1,
                 splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False,
                 fileList=None, cutSequenceMode='AND', name='', config=None):
        """Describe one cached (skimmed) tree and make sure its folders exist.

        Args:
            sample: a Sample object or a plain sample identifier string.
            cutList: cut(s) handed to SampleTree.findMinimumCut and later to
                the output tree.
            branches: branches handed to the output tree by cache().
            inputFolder: folder of the input sample; also enters the hash.
            tmpFolder: folder the skimmed tree is written to first. Defaults
                to the 'scratch' option of the [Directories] config section,
                or 'tmp/' when no config is given.
            outputFolder: final folder. Defaults to the 'tmpSamples' option
                of [Directories], or 'cache/' when no config is given.
            chunkNumber: part handled by this instance.
            splitFilesChunks: number of parts; overwritten when `sample` is a
                Sample object, and never stored as less than 1.
            splitFilesChunkSize: files per part; overwritten by
                sample.mergeCachingSize when `sample` is a Sample object.
            debug: enable debug print-outs (also forced by XBBDEBUG).
            fileList: accepted but not used by this constructor.
            cutSequenceMode: forwarded to findMinimumCut and the output tree.
            name: name forwarded to the output tree.
            config: configuration object, may be None.
        """
        self.config = config
        self.fileLocator = FileLocator(config=self.config)
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if not isinstance(sample, Sample):
            # plain identifier string: name and identifier coincide
            self.sample = sample
            self.sampleIdentifier = sample
        else:
            # Sample object: take the chunk layout from the sample itself and
            # count the number of parts its files are split into
            splitFilesChunkSize = sample.mergeCachingSize
            sampleDescription = {'name': sample.identifier, 'folder': inputFolder}
            splitFilesChunks = SampleTree(sampleDescription,
                                          countOnly=True,
                                          splitFilesChunkSize=splitFilesChunkSize,
                                          config=config,
                                          verbose=self.debug).getNumberOfParts()
            # a 'subsample' can have a name that differs from its identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print("INFO: use sample=", sample.name, " #parts = ",
                      splitFilesChunks)
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        self.minCut = SampleTree.findMinimumCut(
            self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS: explicit argument first, then the config, then a fixed default
        self.inputFolder = inputFolder
        if outputFolder is None:
            if config:
                outputFolder = config.get('Directories', 'tmpSamples')
            else:
                outputFolder = 'cache/'
        self.outputFolder = outputFolder
        if tmpFolder is None:
            if config:
                tmpFolder = config.get('Directories', 'scratch')
            else:
                tmpFolder = 'tmp/'
        self.tmpFolder = tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        # for now the hash is kept independent of the selected branches
        self.branchesForHash = None
        self.hash = Hash(sample=sample,
                         minCut=self.minCut,
                         branches=self.branchesForHash,
                         splitFilesChunkSize=splitFilesChunkSize,
                         debug=False,
                         inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        if splitFilesChunks > 1:
            self.splitFilesChunks = splitFilesChunks
        else:
            self.splitFilesChunks = 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # the identifier is only an arbitrary name for print-out, so long cuts
        # are abbreviated and blanks removed
        if len(self.minCut) < 60:
            cutUsedForIdentifier = self.minCut
        else:
            cutUsedForIdentifier = self.minCut[0:50] + '...'
        cutUsedForIdentifier = cutUsedForIdentifier.replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(
            sample=self.sample,
            cut=cutUsedForIdentifier,
            parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False

        self.createFolders()

    # free memory
    def deleteSampleTree(self):
        self.sampleTree = None

    # file, where skimmed tree is written to
    def getTmpFileName(self):
        return self.outputFileNameFormat.format(
            outputFolder=self.tmpFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d' % self.splitFilesChunks)

    # file, where skimmed tree is moved to after it has been written completely
    def getOutputFileName(self):
        return self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d' % self.splitFilesChunks)

    # check existence of files with skimmed trees
    def findCachedFileNames(self, chunkNumber=-1):
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part='*' if chunkNumber < 1 else '%d' % chunkNumber,
            parts=self.splitFilesChunks)
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        self.cachedFileNames = glob.glob(cachedFilesMask)
        if self.debug:
            print('DEBUG: search files:', cachedFilesMask)
            print('\x1b[32mDEBUG: files:')
            for fileName in self.cachedFileNames:
                print(' > ', fileName)
            if len(self.cachedFileNames) < 1:
                print('none!')
            print('\x1b[0m(%d files found)' % len(self.cachedFileNames))
        return self.cachedFileNames

    # check if a single part is cached, (only checks existence of the file, not validity!)
    def partIsCached(self):
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber,
            parts=self.splitFilesChunks)
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        return len(glob.glob(cachedFilesMask)) > 0

    # isCached == all files containing the skimmed tree found!
    def isCached(self):
        self.findCachedFileNames()
        if (len(self.cachedFileNames) != self.splitFilesChunks
                and self.splitFilesChunks > 1) or len(
                    self.cachedFileNames) == 0:
            if self.debug:
                print('\x1b[32mDEBUG: not cached:', self.identifier, '\x1b[0m')
            return False
        self.isCachedChecked = True
        return True

    # check if an existing file can be opened without errors by ROOT
    def checkFileValidity(self, rawFileName):
        """Open the file with ROOT and report whether it is usable.

        A file counts as broken if it cannot be opened, has no keys, carries
        the kRecovered bit or is a zombie. Broken files are deleted via
        deleteFile() so that the caching is redone; in that case False is
        returned, otherwise True.
        """
        xrootdFileName = self.fileLocator.getXrootdFileName(rawFileName)
        rootFile = ROOT.TFile.Open(xrootdFileName, 'read')
        fileIsGood = (bool(rootFile)
                      and rootFile.GetNkeys() != 0
                      and not rootFile.TestBit(ROOT.TFile.kRecovered)
                      and not rootFile.IsZombie())
        if fileIsGood:
            rootFile.Close()
            return True

        print('\x1b[31mWARNING: broken file:', rawFileName,
              ' => redo caching!\x1b[0m')
        # the file has to be closed before it is removed
        if rootFile:
            rootFile.Close()
        self.deleteFile(rawFileName)
        return False

    # check if all cached files are valid
    def isCachedAndValid(self):
        valid = True
        if self.isCached():
            # check file integrity
            for fileName in self.cachedFileNames:
                valid = valid and self.checkFileValidity(fileName)
        else:
            valid = False
        return valid

    # set input sampleTree object
    def setSampleTree(self, sampleTree):
        self.sampleTree = sampleTree
        return self

    # this prepares the caching by telling the sampleTree object what to write during processing of the file
    # note: does not run the caching by itself! needs an additional sampleTree.process()
    def cache(self):
        if self.sampleTree:
            outputFileName = self.getTmpFileName()
            callbacks = {'afterWrite': self.moveFilesToFinalLocation}
            self.sampleTree.addOutputTree(outputFileName=outputFileName,
                                          cut=self.cutList,
                                          hash=self.hash,
                                          branches=self.branches,
                                          callbacks=callbacks,
                                          cutSequenceMode=self.cutSequenceMode,
                                          name=self.name)
            self.tmpFiles.append(outputFileName)
            if self.debug:
                print('\x1b[32mDEBUG: output file for ', self.identifier,
                      ' is ', outputFileName, '\x1b[0m')
        else:
            print(
                '\x1b[31mERROR: no sample tree connected!:', self.identifier,
                ' set the sampleTree first with "setSampleTree(sampleTree)" \x1b[0m'
            )
        return self

    # return sample tree class of cached samples if all files found
    def getTree(self):
        # if it has already been checked if tree is cached, then use this result dierctly
        isCached = self.isCachedChecked
        if not isCached:
            isCached = self.isCached()
        if isCached:
            self.sampleTree = SampleTree(self.cachedFileNames,
                                         config=self.config)
            self.sampleTree.sampleIdentifier = self.sampleIdentifier
        return self.sampleTree

    # delete file
    def deleteFile(self, rawFileName):
        if self.debug:
            print('DELETE:', rawFileName)
        self.fileLocator.rm(rawFileName)

    # delete cached files
    def deleteCachedFiles(self, chunkNumber=-1):
        cachedFileNames = self.findCachedFileNames(chunkNumber=chunkNumber)
        for fileName in cachedFileNames:
            if self.fileLocator.fileExists(fileName):
                self.deleteFile(fileName)

    # create folders
    def createFolders(self):
        tmpfolderLocal = self.fileLocator.getLocalFileName(self.tmpFolder)
        if not os.path.isdir(tmpfolderLocal):
            print("DOES NOT EXIST:", tmpfolderLocal)
            try:
                xrootdFileName = self.fileLocator.getXrootdFileName(
                    self.tmpFolder)
                if '://' not in xrootdFileName:
                    os.makedirs(self.tmpFolder)
                else:
                    command = 'gfal-mkdir %s' % (xrootdFileName)
                    returnCode = subprocess.call([command], shell=True)
                    if self.debug:
                        print(command, ' => ', returnCode)
                        print()
            except:
                pass

        if not self.fileLocator.exists(self.outputFolder):
            print("INFO: output folder does not exist and will be created:",
                  self.outputFolder)
            self.fileLocator.makedirs(self.outputFolder)

    # move files from temporary to final location
    def moveFilesToFinalLocation(self):
        success = True
        # free some memory for file copy command
        if self.debug:
            print('DEBUG: max mem used A:',
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        self.deleteSampleTree()
        if self.debug:
            print('DEBUG: max mem used B:',
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        for tmpFileName in self.tmpFiles:
            outputFileName = self.outputFolder + '/' + self.tmpFolder.join(
                tmpFileName.split(self.tmpFolder)[1:])
            print('copy ', tmpFileName, ' to ', outputFileName)
            if self.fileLocator.fileExists(outputFileName):
                self.deleteFile(outputFileName)
            #command = 'xrdcp -d 1 ' + self.fileLocator.getXrootdFileName(tmpFileName) + ' ' + self.fileLocator.getXrootdFileName(outputFileName)
            #print('the command is', command)
            #sys.stdout.flush()
            #returnCode = subprocess.call([command], shell=True)
            copySuccessful = self.fileLocator.cp(tmpFileName, outputFileName)
            if not copySuccessful:
                success = False
                print(
                    '\x1b[31mERROR: copy failed for {tmpfile}->{outputfile} !\x1b[0m'
                    .format(tmpfile=tmpFileName, outputfile=outputFileName))
            else:
                # delete temporary file if copy was successful
                self.deleteFile(tmpFileName)
        return success
Exemplo n.º 10
0
class SampleTree(object):

    def __init__(self, samples, treeName=None, limitFiles=-1, splitFilesChunkSize=-1, chunkNumber=1, countOnly=False, verbose=True, config=None, saveMemory=False, xrootdRedirector=None):
        """Collect the .root files of a sample and, unless countOnly, chain them.

        Args:
            samples: a list of .root file names, a dict with 'folder' and
                either 'name' or 'sample', or the name of a .txt file listing
                the .root files (see getAllSampleFileNames).
            treeName: name of the tree inside the files; defaults to the
                'treeName' option of the [Configuration] config section, or
                'tree' if that is not available.
            limitFiles: if > 0, stop chaining after this many good files.
            splitFilesChunkSize: maximum number of files per part.
            chunkNumber: 1-based number of the part to read.
            countOnly: only build the file list / number of parts; no file
                is opened and self.tree stays None.
            verbose: print progress information.
            config: configuration object, may be None.
            saveMemory: stored on the instance; not used by this constructor.
            xrootdRedirector: forwarded to FileLocator.

        Raises:
            Exception: "InvalidChunkNumber" or "InvalidNumberOfSplitParts" for
                an inconsistent chunk selection; "CountHistogramMissing" or
                "TChain method Add failure" while reading the files.
        """
        self.verbose = verbose
        self.debug = 'XBBDEBUG' in os.environ
        self.debugProfiling = 'XBBPROFILING' in os.environ
        self.config = config
        self.saveMemory = saveMemory
        self.outputTreeBasketSize = None
        if self.config and self.config.has_option('Configuration', 'outputTreeBasketSize'):
            # NOTE(review): eval() executes whatever the config option contains,
            # this is only acceptable for trusted configuration files
            self.outputTreeBasketSize = eval(self.config.get('Configuration', 'outputTreeBasketSize'))
        self.monitorPerformance = True
        self.disableBranchesInOutput = True
        self.samples = samples
        self.tree = None
        self.fileLocator = FileLocator(config=self.config, xrootdRedirector=xrootdRedirector)
        self.sampleIdentifier = None

        # process only partial sample root file list
        self.splitFilesChunkSize = splitFilesChunkSize
        self.chunkNumber = chunkNumber

        # get list of sample root files to process
        sampleFileNamesParts = self.getSampleFileNameChunks()
        if self.chunkNumber > 0 and self.chunkNumber <= self.numParts:
            if len(sampleFileNamesParts) == self.numParts:
                chunkIndex = self.chunkNumber - 1
                self.sampleFileNames = sampleFileNamesParts[chunkIndex]
            else:
                raise Exception("InvalidNumberOfSplitParts")
        else:
            print("\x1b[31mERROR: wrong chunk number ", self.chunkNumber, "\x1b[0m")
            raise Exception("InvalidChunkNumber")
        if self.verbose:
            print ("INFO: reading part ", self.chunkNumber, " of ", self.numParts)

        self.status = 0
        if not treeName:
            if self.config and self.config.has_option('Configuration', 'treeName'):
                self.treeName = self.config.get('Configuration', 'treeName')
            else:
                # HEPPY default
                self.treeName = 'tree'
        else:
            self.treeName = treeName
        self.formulas = {}
        self.formulaDefinitions = []
        self.oldTreeNum = -1
        self.limitFiles = int(limitFiles)
        self.timeStart = time.time()
        self.timeETA = 0
        self.eventsRead = 0
        self.outputTrees = []
        self.callbacks = {}
        self.removeBranches = []

        # e.g. for additional branches to be added
        self.newBranches = []

        # name of the sample .txt file which contains the list of .root files
        # (filled by getAllSampleFileNames when such a file is used)
        self.sampleTextFileName = ''

        # add all .root files to chain and add count histograms
        self.chainedFiles = []
        self.brokenFiles = []
        self.histograms = {}
        self.nanoTreeCounts = {}
        self.totalNanoTreeCounts = {}

        if not countOnly:
            self.tree = ROOT.TChain(self.treeName)

            # loop over all given .root files
            for rootFileName in self.sampleFileNames:
                if self.debug:
                    print('DEBUG: next file is:', rootFileName, ", check existence")

                # check root file existence, TODO: simplify
                if self.fileLocator.exists(rootFileName):
                    remoteRootFileName = self.fileLocator.getRemoteFileName(rootFileName)
                    rootFile = ROOT.TFile.Open(remoteRootFileName, 'read')

                    # check file validity
                    if rootFile and not rootFile.IsZombie() and rootFile.GetNkeys() > 0 and not rootFile.TestBit(ROOT.TFile.kRecovered):
                        if self.debug:
                            print('DEBUG: file exists and is good!')

                        # add count histograms, since they are not in the TChain
                        for key in rootFile.GetListOfKeys():
                            obj = key.ReadObj()
                            if obj.GetName() == self.treeName:
                                continue
                            histogramName = obj.GetName()

                            # nanoAOD: use branch of a tree instead of histogram for counting
                            if histogramName == 'Runs':
                                branchList = [x.GetName() for x in obj.GetListOfBranches()]
                                if self.debug:
                                    print ("DEBUG: nano counting tree has the following BRANCHES:", branchList)
                                for branch in branchList:
                                    if branch not in self.nanoTreeCounts:
                                        self.nanoTreeCounts[branch] = []
                                nEntries = obj.GetEntries()
                                for i in range(nEntries):
                                    obj.GetEntry(i)
                                    for branch in branchList:
                                        self.nanoTreeCounts[branch].append(getattr(obj, branch))

                            if histogramName in self.histograms:
                                if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                                    if self.debug:
                                        print("DEBUG: object is a tree and will be skipped:", obj.GetName())
                                else:
                                    if self.histograms[histogramName]:
                                        self.histograms[histogramName].Add(obj)
                                    else:
                                        print ("ERROR: histogram object was None!!!")
                                        raise Exception("CountHistogramMissing")
                            else:
                                # add all TH*'s in one single histogram
                                if obj.IsA().InheritsFrom(ROOT.TH1.Class()):
                                    self.histograms[histogramName] = obj.Clone(obj.GetName())
                                    # detach the clone from the file so that it survives rootFile.Close()
                                    self.histograms[histogramName].SetDirectory(0)
                                else:
                                    if self.debug:
                                        print("DEBUG: omitting object ", obj, type(obj), " since it is neither TH1 or TTree!")

                        rootFile.Close()

                        # add file to chain
                        chainTree = '%s/%s'%(remoteRootFileName.strip(), self.treeName.strip())
                        if self.debug:
                            print ('\x1b[42mDEBUG: chaining '+chainTree,'\x1b[0m')
                        statusCode = self.tree.Add(chainTree)
                        if self.debug:
                            print ('\x1b[42mDEBUG: ---> %r'%statusCode,'\x1b[0m')

                        # check for errors in chaining the file
                        if statusCode != 1:
                            print ('ERROR: failed to chain ' + chainTree + ', returned: ' + str(statusCode), 'tree:', self.tree)
                            raise Exception("TChain method Add failure")
                        elif not self.tree:
                            print ('\x1b[31mERROR: tree died after adding %s.\x1b[0m'%rootFileName)
                        else:
                            self.treeEmpty = False
                            self.chainedFiles.append(rootFileName)
                            if self.limitFiles > 0 and len(self.chainedFiles) >= self.limitFiles:
                                print ('\x1b[35mDEBUG: limit reached! no more files will be chained!!!\x1b[0m')
                                break
                    else:
                        print ('\x1b[31mERROR: file is damaged: %s\x1b[0m'%rootFileName)
                        if rootFile:
                            print ('DEBUG: Zombie:', rootFile.IsZombie(), '#keys:', rootFile.GetNkeys(), 'recovered:', rootFile.TestBit(ROOT.TFile.kRecovered))
                        self.brokenFiles.append(rootFileName)
                else:
                    print ('\x1b[31mERROR: file is missing: %s\x1b[0m'%rootFileName)

            if self.verbose or self.debug:
                print ('INFO: # files chained: %d'%len(self.chainedFiles))
                if len(self.brokenFiles) > 0:
                    print ('INFO: # files broken : %d'%len(self.brokenFiles))

            if len(self.chainedFiles) < 1:
                self.tree = None

            if self.tree:
                self.tree.SetCacheSize(50*1024*1024)

            # merge nano counting trees
            if self.nanoTreeCounts:
                # TODO: per run if possible, sum LHE weights if present

                # scalar types which can be summed; 'long' only exists on Python 2
                try:
                    numericTypes = (int, float, long)
                except NameError:
                    numericTypes = (int, float)

                # sum the contributions from the subtrees
                self.totalNanoTreeCounts = {key: sum(values) for key, values in self.nanoTreeCounts.items() if len(values) > 0 and type(values[0]) in numericTypes}

                # print summary table
                countBranches = list(self.totalNanoTreeCounts)
                print (countBranches)
                depth = None
                for key, values in self.nanoTreeCounts.items():
                    if values and len(values)>1 and type(values[0]) in numericTypes:
                        depth = len(values)
                        break
                print("-"*160)
                print("tree".ljust(25), ''.join([countBranch.ljust(25) for countBranch in countBranches]))
                if depth:
                    for treeNum in range(depth):
                        print(("%d"%(treeNum+1)).ljust(25),''.join([('%r'%self.nanoTreeCounts[countBranch][treeNum]).ljust(25) for countBranch in countBranches]))
                print("\x1b[34m","sum".ljust(24), ''.join([('%r'%self.totalNanoTreeCounts[countBranch]).ljust(25) for countBranch in countBranches]),"\x1b[0m")
                print("-"*160)

                # fill summed tree (create new tree)
                self.histograms['Runs'] = ROOT.TTree('Runs', 'count histograms for nano')
                # the buffers have to stay alive until Fill() below has been called
                nanoTreeCountBuffers = {}
                for key, value in self.totalNanoTreeCounts.items():
                    print (key,"   ", value, "   here print key and value   ")
                    if (key=='run' and len(countBranches)==1): value=1
                    # the sums are int/long or float (see filter above):
                    # integers are stored with type code 'L', floats with 'f'
                    if type(value) == float:
                        typeCode = 'f'
                    else:
                        typeCode = 'L'
                    nanoTreeCountBuffers[key] = array.array(typeCode, [value])
                    self.histograms['Runs'].Branch(key, nanoTreeCountBuffers[key], '{name}/{typeCode}'.format(name=key, typeCode=typeCode))
                self.histograms['Runs'].Fill()

    def __del__(self):
        self.delete()

    def delete(self):
        """Release callbacks, helpers and the ROOT objects held by this instance.

        The method is also invoked from __del__, so it may run more than once
        and on an instance whose constructor raised before all attributes were
        set; both cases are handled without raising.
        """
        self.callbacks = None
        # close possible left open files referencing the TChain
        try:
            if getattr(self, 'tree', None):
                self.tree.Reset()
        except Exception:
            # best effort: a failing Reset() must not prevent the clean-up below
            pass
        self.fileLocator = None
        self.config = None

        # attributes below may be missing if __init__ did not run to completion
        outputTrees = getattr(self, 'outputTrees', None) or []

        # drop the output files first; pop() keeps a repeated call from failing
        for outputTree in outputTrees:
            outputTree.pop('file', None)

        # NOTE: the former loop over self.formulas only unbound its own loop
        # variable and released nothing; the formulas dict is left untouched.

        # then drop the output trees, keeping the key with a None value
        for outputTree in outputTrees:
            if outputTree.get('tree'):
                outputTree['tree'] = None

        # finally drop the TChain itself
        if getattr(self, 'tree', None):
            self.tree = None

    # ------------------------------------------------------------------------------
    # return full list of sample root files 
    # ------------------------------------------------------------------------------
    def getAllSampleFileNames(self): 
        """Return the full list of .root files described by self.samples.

        Three kinds of self.samples are understood:
          * list: taken as the list of .root files itself;
          * dict: 'folder' plus either 'sample' (object with .identifier) or
            'name'; the files <folder>/<name>/*.root are globbed and the
            redirector of the folder is added to each of them;
          * anything else: name of a .txt file whose lines are returned as
            read (not stripped). If it does not exist, an error is printed
            and None is returned.
        """
        samples = self.samples

        # given argument is list -> this is already the list of root files
        if type(samples) == list:
            return samples

        # given argument is name and folder -> glob
        if type(samples) == dict:
            if 'sample' in samples:
                sampleName = samples['sample'].identifier
            else:
                sampleName = samples['name']
            self.sampleIdentifier = sampleName
            sampleFolder = samples['folder']
            samplesMask = self.fileLocator.getLocalFileName(sampleFolder) + '/' + sampleName + '/*.root'
            redirector = self.fileLocator.getRedirector(sampleFolder)
            if self.verbose:
                print ("INFO: use ", samplesMask)
            localFileNames = glob.glob(samplesMask)
            sampleFileNames = [self.fileLocator.addRedirector(redirector, localFileName) for localFileName in localFileNames]
            if self.verbose:
                print ("INFO: found ", len(sampleFileNames), " files.")
            return sampleFileNames

        # given argument is a single file name -> read this .txt file 
        if not os.path.isfile(samples):
            print("\x1b[31mERROR: file not found: %s \x1b[0m" % samples)
            return
        self.sampleTextFileName = samples
        if self.verbose:
            print('open samples .txt file: %s' % self.sampleTextFileName)
        with open(self.sampleTextFileName, 'r') as sampleTextFile:
            return sampleTextFile.readlines()

    # ------------------------------------------------------------------------------
    # return lists of sample root files, split into chunks with certain size  
    # ------------------------------------------------------------------------------
    def getSampleFileNameChunks(self):
        """Split the sample file list into parts of at most splitFilesChunkSize files.

        With a non-positive chunk size, or when there are no more files than
        one chunk holds, a single part containing everything is returned. The
        number of parts is stored in self.numParts.
        """
        sampleFileNames = self.getAllSampleFileNames()
        chunkSize = self.splitFilesChunkSize
        if chunkSize > 0 and len(sampleFileNames) > chunkSize:
            # range() instead of xrange(): works on Python 2 and Python 3
            sampleFileNamesParts = [sampleFileNames[i:i + chunkSize] for i in range(0, len(sampleFileNames), chunkSize)]
        else:
            sampleFileNamesParts = [sampleFileNames]
        self.numParts = len(sampleFileNamesParts)
        return sampleFileNamesParts

    # ------------------------------------------------------------------------------
    # return lists of sample root files for a single chunk  
    # ------------------------------------------------------------------------------
    def getSampleFileNameChunk(self, chunkNumber):
        """Return the file list of one part (chunkNumber counts from 1).

        For a chunk number outside the valid range an error is printed and
        None is returned.
        """
        allChunks = self.getSampleFileNameChunks()
        if not (0 < chunkNumber <= len(allChunks)):
            print("\x1b[31mERROR: invalid chunk number {n} \x1b[0m".format(n=chunkNumber))
            return None
        return allChunks[chunkNumber-1]

    def getNumberOfParts(self):
        """Return the number of parts as last set by getSampleFileNameChunks()."""
        return self.numParts

    # ------------------------------------------------------------------------------
    # add a TTreeFormula connected to the TChain
    # ------------------------------------------------------------------------------
    def addFormula(self, formulaName, formula=None):
        """Create a TTreeFormula on the chain and store it under formulaName.

        If no formula is given, the name itself is used as the expression.
        The definition is also recorded in self.formulaDefinitions.

        Raises:
            Exception: "SampleTreeAddTTreeFormulaFailed" if the created
                formula reports zero dimensions.
        """
        expression = formulaName if formula is None else formula

        # there might be an undocumented limit on the length of cutstrings in ROOT...
        if len(expression) > 1023:
            print("\x1b[41m\x1b[97m------------------------------------------------------------------------------")
            print(" WARNING !!! ROOT.TTreeFormula of length %d, this might cause problems !!"%len(expression))
            print(" reduce length of formulas if problems occur, e.g. by passing lists of cut formulas!")
            print("------------------------------------------------------------------------------\x1b[0m")

        self.formulaDefinitions.append({'name': formulaName, 'formula': expression})
        treeFormula = ROOT.TTreeFormula(formulaName, expression, self.tree)
        self.formulas[formulaName] = treeFormula
        if treeFormula.GetNdim() == 0:
            print("DEBUG: formula is:", expression)
            print("\x1b[31mERROR: adding the tree formula failed! Check branches of input tree and loaded namespaces.\x1b[0m")
            raise Exception("SampleTreeAddTTreeFormulaFailed")

    # ------------------------------------------------------------------------------
    # return list of formulas
    # ------------------------------------------------------------------------------
    def getFormulas(self):
        """Return the dictionary of formulas (the stored object, not a copy)."""
        return self.formulas

    # ------------------------------------------------------------------------------
    # add a new branch
    # ------------------------------------------------------------------------------
    def addOutputBranch(self, branchName, formula, branchType='f', length=1, arguments=None, leaflist=None, arrayStyle=False):
        """Register a new branch to be added to the output.

        Args:
            branchName: name of the new branch.
            formula: either a callable, stored under the key 'function', or a
                TTreeFormula expression string, registered through addFormula
                under the name 'alias:<branchName>'.
            branchType: stored under the key 'type'.
            length: stored under the key 'length'.
            arguments: stored only for callables and only if truthy.
            leaflist: stored if truthy.
            arrayStyle: if truthy, 'arrayStyle' is set to True.
        """
        # this is needed to overwrite the branch if it already exists!
        self.addBranchToBlacklist(branchName)

        isFunction = callable(formula)
        if not isFunction:
            # string which contains a TTreeFormula expression
            formulaName = 'alias:' + branchName
            self.addFormula(formulaName, formula)
            newBranch = {'name': branchName, 'formula': formulaName, 'type': branchType, 'length': length}
        else:
            newBranch = {'name': branchName, 'function': formula, 'type': branchType, 'length': length}
            if arguments:
                newBranch['arguments'] = arguments

        # optional settings common to both kinds of branches
        if leaflist:
            newBranch['leaflist'] = leaflist
        if arrayStyle:
            newBranch['arrayStyle'] = True
        self.newBranches.append(newBranch)

    # ------------------------------------------------------------------------------
    # pass a list of dictionaries of branches to add
    # TODO: avoid detour via addOutputBranch and set dictionary directly
    # ------------------------------------------------------------------------------
    def addOutputBranches(self, branchDictList):
        """Register several output branches given as a list of dictionaries.

        Each dictionary needs 'name' and 'formula'; 'type', 'length',
        'arguments', 'leaflist' and 'arrayStyle' are optional and fall back
        to the defaults of addOutputBranch.
        """
        for branchDict in branchDictList:
            self.addOutputBranch(
                branchName=branchDict['name'],
                formula=branchDict['formula'],
                branchType=branchDict.get('type', 'f'),
                length=branchDict.get('length', 1),
                arguments=branchDict.get('arguments', None),
                leaflist=branchDict.get('leaflist', None),
                arrayStyle=branchDict.get('arrayStyle', False),
            )

    # ------------------------------------------------------------------------------
    # implement iterator for TChain, with updating TTreeFormula objects on tree
    # switching and show performance statistics during loop
    # ------------------------------------------------------------------------------
    def next(self):
        """Advance the chain by one event and return the TChain.

        Whenever the tree number of the chain changes, the ETA estimate is
        updated, a status line is printed (verbose/debug mode) and all
        registered TTreeFormula objects are told to update their leaves.
        StopIteration from the underlying iterator propagates and ends the
        loop.
        """
        # use whichever iterator protocol the underlying object offers:
        # __next__ (Python 3) or next (Python 2)
        treeIterator = self.treeIterator
        advance = getattr(treeIterator, '__next__', None) or treeIterator.next
        advance()
        self.eventsRead += 1
        treeNum = self.tree.GetTreeNumber()
        # TTreeFormulas have to be updated when the tree number changes in a TChain
        if treeNum != self.oldTreeNum:

            # update ETA estimates
            if treeNum == 0:
                self.timeStart = time.time()
                perfStats = '?'
            else:
                fraction = 1.0*treeNum/len(self.chainedFiles)
                passedTime = time.time() - self.timeStart
                self.timeETA = (1.0-fraction)/fraction * passedTime if fraction > 0 else 0
                perfStats = 'INPUT: {erps}/s, OUTPUT: {ewps}/s '.format(erps=self.eventsRead / passedTime if passedTime>0 else 0, ewps=sum([x['passed'] for x in self.outputTrees]) / passedTime if passedTime>0 else 0)

            # output status
            if self.verbose or self.debug:
                percentage = 100.0*treeNum/len(self.chainedFiles)
                if treeNum == 0:
                    print ('INFO: time ', time.ctime())
                if self.debug:
                    perfStats = perfStats + ' max mem used = %d'%(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                print ('INFO: switching trees --> %d (=%1.1f %%, ETA: %s min, %s)'%(treeNum, percentage, self.getETA(), perfStats))
                if self.debugProfiling:
                    self.tree.PrintCacheStats()
                sys.stdout.flush()
            self.oldTreeNum = treeNum
            # update TTreeFormula's
            for treeFormula in self.formulas.values():
                treeFormula.UpdateFormulaLeaves()
        return self.tree

    # Python 3 looks up __next__ for the iterator protocol
    __next__ = next

    def __iter__(self):
        """Start a new loop: store a fresh iterator of self.tree in
        self.treeIterator and return this object itself as the iterator."""
        freshIterator = self.tree.__iter__()
        self.treeIterator = freshIterator
        return self

    # wrapper to evaluate a formula, which has been added to the formula dictionary
    # vector valued formulas are not supported
    def evaluate(self, formulaName):
        """Evaluate a scalar formula from self.formulas for the current entry.

        Returns the result of EvalInstance() if the formula's GetNdata() is
        positive and 0 otherwise. Vector valued formulas are not supported
        here, see evaluateArray().

        Raises:
            Exception: if formulaName is not a key of self.formulas; the
                known formula names are printed first.
        """
        if formulaName not in self.formulas:
            # list(dict) yields the keys on Python 2 and 3 (iteritems() is Python 2 only)
            print ("existing formulas are: ", list(self.formulas))
            raise Exception("SampleTree::evaluate: formula '%s' not found!"%formulaName)

        formula = self.formulas[formulaName]
        if formula.GetNdata() > 0:
            return formula.EvalInstance()
        return 0

    # evaluates a vector formula and fills an array, returns the number of dimensions of the formula
    def evaluateArray(self, formulaName, destinationArray):
        """Evaluate a vector formula and write its values into destinationArray.

        Element i of destinationArray receives EvalInstance(i) for every i
        below the formula's GetNdata(); destinationArray must already be large
        enough. Returns the number of values written (GetNdata()).

        Raises:
            Exception: if formulaName is not a key of self.formulas; the
                known formula names are printed first.
        """
        if formulaName not in self.formulas:
            # list(dict) yields the keys on Python 2 and 3 (iteritems() is Python 2 only)
            print("existing formulas are: ", list(self.formulas))
            raise Exception("SampleTree::evaluateArray: formula '%s' not found!" % formulaName)

        # look the formula up once instead of once per element
        formula = self.formulas[formulaName]
        nData = formula.GetNdata()
        for i in range(nData):
            destinationArray[i] = formula.EvalInstance(i)
        return nData

    # return string of ETA in minutes
    def getETA(self):
        """Return self.timeETA converted from seconds to minutes as a string
        with one decimal, or '?' while self.timeETA is not positive."""
        if self.timeETA > 0:
            return '%1.1f'%(self.timeETA/60.0)
        return '?'

    def GetListOfBranches(self):
        """Return whatever self.tree.GetListOfBranches() returns (thin wrapper)."""
        branches = self.tree.GetListOfBranches()
        return branches

    # ------------------------------------------------------------------------------
    # handle 'tree-typed' cuts, passed as dictionary:
    # e.g. cut = {'OR': [{'AND': ["pt>20","eta<3"]}, "data==1"]}
    # short-circuit evaluation is handled by builtins any() and all()
    # ------------------------------------------------------------------------------
    def addCutDictRecursive(self, cutDict):
        """Walk a tree-typed cut and register every cut string as a formula.

        A string leaf is added via addFormula(cut, cut) unless a formula with
        that name already exists. A dictionary node must contain exactly one
        of the keys 'OR' and 'AND', whose value is a list of sub-cuts that are
        processed recursively.

        Raises:
            Exception: "BadTreeTypeCutDict" if a node contains both or
                neither of the keys 'OR' and 'AND'.
        """
        # leaf: plain cut string
        if type(cutDict) is str:
            if cutDict not in self.formulas:
                self.addFormula(cutDict, cutDict)
            return

        hasOr = 'OR' in cutDict
        hasAnd = 'AND' in cutDict
        if hasOr == hasAnd:
            # both operators or none of them
            raise Exception("BadTreeTypeCutDict")

        operator = 'OR' if hasOr else 'AND'
        for subCut in cutDict[operator]:
            self.addCutDictRecursive(subCut)

    def evaluateCutDictRecursive(self, cutDict):
        """Evaluate a tree-typed cut from the values in self.formulaResults.

        A string leaf returns the stored result of the formula with that
        name. A dictionary node must contain exactly one of the keys 'OR'
        and 'AND' and returns any() or all() of its recursively evaluated
        sub-cuts. Sub-cuts are evaluated lazily, so evaluation of a node
        stops at the first sub-cut that decides the result.

        Raises:
            Exception: "UnevaluatedFormula!!" if the stored result of a
                visited leaf is None, "BadTreeTypeCutDict" if a node contains
                both or neither of the keys 'OR' and 'AND'.
        """
        # same leaf test as in addCutDictRecursive, so both walk the tree identically
        if type(cutDict) == str:
            if self.formulaResults[cutDict] is None:
                print ("FORMULA:", cutDict)
                raise Exception("UnevaluatedFormula!!")
            return self.formulaResults[cutDict]
        elif 'OR' in cutDict and 'AND' in cutDict:
            raise Exception("BadTreeTypeCutDict")
        elif 'OR' in cutDict:
            # generator expression: any() stops at the first true sub-cut
            return any(self.evaluateCutDictRecursive(subDict) for subDict in cutDict['OR'])
        elif 'AND' in cutDict:
            # generator expression: all() stops at the first false sub-cut
            return all(self.evaluateCutDictRecursive(subDict) for subDict in cutDict['AND'])
        else:
            raise Exception("BadTreeTypeCutDict")
    
    # set callback function, which MUST return a boolean. To continue processing this event, the function must return True. False means skip this event!
    def setCallback(self, category, fcn):
        """Make fcn the only callback of the given category.

        Only the category 'event' is accepted. Callbacks registered before
        for this category are dropped and a warning is printed.

        Raises:
            Exception: "CallbackEventDoesNotExist" for any other category.
        """
        supportedCategories = ['event']
        if category not in supportedCategories:
            raise Exception("CallbackEventDoesNotExist")
        replacesExisting = category in self.callbacks
        if replacesExisting:
            print("WARNING: callback function for ", category, " is overwritten!")
        self.callbacks[category] = [fcn]

    # add callback function, which MUST return a boolean. To continue processing this event, the function must return True. False means skip this event!
    def addCallback(self, category, fcn):
        """Append fcn to the callbacks of the given category.

        Only the category 'event' is accepted; callbacks registered before
        are kept.

        Raises:
            Exception: "CallbackEventDoesNotExist" for any other category.
        """
        supportedCategories = ['event']
        if category not in supportedCategories:
            raise Exception("CallbackEventDoesNotExist")
        # create the list on first use, then append
        self.callbacks.setdefault(category, []).append(fcn)

    # ------------------------------------------------------------------------------
    # add output tree to be written during the process() function
    # ------------------------------------------------------------------------------
    def addOutputTree(self, outputFileName, cut, hash='', branches=None, callbacks=None, cutSequenceMode='AND', name=''):
        """Register an output tree that is filled and written by process().

        Args:
            outputFileName: file name of the output; used as unique key.
            cut: cut as string, list of strings or (for mode 'TREE') dictionary.
            hash: stored together with the tree under the key 'hash'.
            branches: stored under the key 'branches'.
            callbacks: stored under the key 'callbacks'.
            cutSequenceMode: 'AND', 'OR' or 'TREE'.
            name: stored under the key 'name'.

        Returns:
            True if the tree has been registered, False if an output tree
            with the same file name exists already (nothing is added then).

        Raises:
            Exception: "InvalidCutSequenceMode" for an unknown cutSequenceMode.
        """

        # write events which satisfy either ONE of the conditions given in the list or ALL
        if cutSequenceMode not in ['AND', 'OR', 'TREE']:
            raise Exception("InvalidCutSequenceMode")

        if any(x['fileName'] == outputFileName for x in self.outputTrees):
            print("WARNING: skipping duplicate file ", outputFileName, "!")
            return False

        outputTree = {
            'tree': None, # will create this tree later, after it is known which branches will be enabled
            'name': name,
            'fileName': outputFileName,
            'file': None,
            'cut': cut,
            'cutSequence': [],
            'cutSequenceMode': cutSequenceMode,
            'hash': hash,
            'branches': branches,
            'callbacks': callbacks,
            'passed': 0,
        }

        self.outputTrees.append(outputTree)
        # report success explicitly, mirroring the False returned for duplicates
        return True
    
    # these branches are ALWAYS removed (e.g. because they will be recomputed), even when they are in the 'keep_branches' list
    def addBranchToBlacklist(self, branchName):
        """Append branchName to self.removeBranches, the list of branches
        which process() disables after the wanted branches have been enabled."""
        blacklist = self.removeBranches
        blacklist.append(branchName)

    # wrapper to enable/disable branches in the TChain
    def SetBranchStatus(self, branchName, branchStatus):
        """Forward SetBranchStatus to self.tree, but only if the branch is
        found in GetListOfBranches() or the name contains a '*' wildcard;
        other names are ignored silently."""
        existingBranches = self.GetListOfBranches()
        canBeSet = existingBranches.FindObject(branchName) or '*' in branchName
        if canBeSet:
            self.tree.SetBranchStatus(branchName, branchStatus)

    # enables ONLY the given branches (* wildcards supported) and checks existence before enabling them to avoid warning messages during tree iteration
    def enableBranches(self, listOfBranchesToKeep):
        """Disable all branches of self.tree and re-enable only the given ones.

        A name is enabled if it is found in GetListOfBranches() or contains
        a '*' wildcard; all other names are skipped. A summary line is
        printed, and with self.verbose also a short representation of the
        enabled branches.
        """
        allBranches = self.GetListOfBranches()
        self.tree.SetBranchStatus("*", 0)
        switchedOn = []
        for name in listOfBranchesToKeep:
            if not (allBranches.FindObject(name) or '*' in name):
                # unknown branch without wildcard: nothing to enable
                continue
            self.tree.SetBranchStatus(name, 1)
            switchedOn.append(name)
        print("INFO: reduced number of enabled branches from", len(allBranches), " to", len(switchedOn), " (branches with wildcards may not be correctly counted)")
        if self.verbose:
            print ("INFO: branches:", BranchList(switchedOn).getShortRepresentation())

    # ------------------------------------------------------------------------------
    # loop over all entries in the TChain and copy events to output trees, if the
    # cuts are fulfilled.
    # ------------------------------------------------------------------------------
    def process(self):
        """Loop over all entries and copy the selected ones to the output trees.

        Steps, in this order:
          1. print resource limits (self.debug) and the list of output trees
             and formulas (self.verbose)
          2. collect the branches needed by the output trees, their cuts, the
             formula definitions and the config option [Branches] keep_branches,
             enable only those, then disable everything in self.removeBranches
          3. open one output file per output tree, clone the histograms from
             self.histograms into it and clone the (empty) tree structure
          4. register the cut formulas of all output trees
          5. allocate arrays for the branches in self.newBranches and attach
             them to every output tree
          6. call the 'beforeLoop' callbacks of the output trees
          7. event loop: run the 'event' callbacks, fill the new branches,
             evaluate all formulas and fill each output tree whose cut passed
          8. write and close the output files, optionally drop the trees
             (self.saveMemory), call the 'afterWrite' callbacks and print a
             summary per output tree

        Raises:
            Exception: "OutputFileBroken" if an output file cannot be opened,
                "InvalidCutSequenceMode" if a dictionary cut is passed without
                cutSequenceMode 'TREE' or the mode is unknown.
        """
        if self.debug:
            rsrc = resource.RLIMIT_DATA
            # restrict memory
            # resource.setrlimit(rsrc, (2.0*1024*1024*1024, 6*1024*1024*1024))
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: mem limits soft/hard:', soft, hard)
            rsrc = resource.RLIMIT_AS
            # restrict memory
            # resource.setrlimit(rsrc, (2.0*1024*1024*1024, 6*1024*1024*1024))
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: AS limits soft/hard:', soft, hard)
            rsrc = resource.RLIMIT_STACK
            soft, hard = resource.getrlimit(rsrc)
            print('DEBUG: stack limits soft/hard:', soft, hard)
            print('DEBUG: max mem used:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        if self.verbose:
            print ('OUTPUT TREES:')
            for outputTree in self.outputTrees:
                # cuts are shortened to 50 characters for the printout only
                cutString = "%r"%outputTree['cut']
                if len(cutString) > 50:
                    cutString = cutString[0:50] + '...(%s more chars)'%(len(cutString)-50)
                print (' > ', outputTree['fileName'], ' <== ', outputTree['name'] if 'name' in outputTree else outputTree['hash'], ' cut: ', cutString)
            print ('FORMULAS:')
            for formulaName, formula in self.formulas.iteritems():
                print (' > \x1b[35m', formulaName, '\x1b[0m ==> ', formula)

        # find common set of branches which needs to be enabled for cuts and desired variables in all of the output trees
        listOfBranchesToKeep = []
        for outputTree in self.outputTrees:
            if 'branches' in outputTree and outputTree['branches']:
                listOfBranchesToKeep += outputTree['branches']
            if 'cut' in outputTree and outputTree['cut']:
                listOfBranchesToKeep += BranchList(outputTree['cut']).getListOfBranches()
            # NOTE(review): this inner loop does not depend on outputTree, so it is repeated for every
            # output tree (and skipped if there is none); duplicates are removed by set() below
            for formula in self.formulaDefinitions:
                listOfBranchesToKeep += BranchList(formula['formula']).getListOfBranches()

        # keep the branches stated in config, (unless they will be recomputed)
        if self.config:
            # NOTE(review): eval() executes the config value as Python code, only use trusted config files
            listOfBranchesToKeep += eval(self.config.get('Branches', 'keep_branches'))
        listOfBranchesToKeep = list(set(listOfBranchesToKeep))

        # disable the branches in the input if there is no output tree which wants to have all branches
        if '*' not in listOfBranchesToKeep and len(listOfBranchesToKeep) > 0:
            self.enableBranches(listOfBranchesToKeep)
        else:
            print("INFO: keep all branches")

        # now disable all branches, which will be e.g. recomputed
        for branchName in self.removeBranches:
            print ("INFO: but remove", branchName)
            self.SetBranchStatus(branchName, 0)

        # initialize the output trees, this has to be called after the calls to SetBranchStatus
        for outputTree in self.outputTrees:
            outputTree['file'] = ROOT.TFile.Open(outputTree['fileName'], 'recreate')
            if not outputTree['file'] or outputTree['file'].IsZombie():
                print ("\x1b[31mERROR: output file broken\x1b[0m")
                raise Exception("OutputFileBroken")

            # copy count histograms to output files
            outputTree['histograms'] = {}
            for histogramName, histogram in self.histograms.iteritems():
                    outputTree['histograms'][histogramName] = histogram.Clone(histogram.GetName())
                    outputTree['histograms'][histogramName].SetDirectory(outputTree['file'])

            # clone tree structure, but don't copy any entries
            outputTree['file'].cd()
            outputTree['tree'] = self.tree.CloneTree(0)
            # can be used to reduce memory consumption
            # NOTE(review): SetBasketSize is called before the validity check of the cloned tree below
            if self.outputTreeBasketSize:
                outputTree['tree'].SetBasketSize("*", self.outputTreeBasketSize)
            if not outputTree['tree']:
                print ("\x1b[31mWARNING: output tree broken. try to recover!\x1b[0m")
                # if input tree has 0 entries, don't copy 0 entries to the output tree, but ALL of them instead! (sic!)
                # (this is done by omitting the argument to CloneTree)
                outputTree['tree'] = self.tree.CloneTree()
                if not outputTree['tree']:
                    print ("\x1b[31mERROR: output tree broken, input tree: ", self.tree, " \x1b[0m")
                else:
                    print ("\x1b[32mINFO: recovered\x1b[0m")
            # NOTE(review): also reached when the recovery above failed and only an error was printed
            outputTree['tree'].SetDirectory(outputTree['file'])

        # add CUT formulas, this has to be called after the calls to SetBranchStatus
        for outputTree in self.outputTrees:
            if outputTree['cutSequenceMode'] == 'TREE' and type(outputTree['cut']) == dict:
                outputTree['cutSequence'] = outputTree['cut']
                # now recursively parse the cut-tree and add all contained cut formulas
                self.addCutDictRecursive(outputTree['cut'])
            elif type(outputTree['cut']) == dict:
                print ("HINT: use cutSequenceMode='TREE' to pass dictionaries!")
                raise Exception("InvalidCutSequenceMode")
            else:
                # cut passed as string or list of strings
                cutList = outputTree['cut'] if type(outputTree['cut']) == list else [outputTree['cut']]
                for i, cutString in enumerate(cutList):
                    # the cut string without spaces serves as name of the formula
                    formulaName = cutString.replace(' ', '')
                    if formulaName not in self.formulas:
                        self.addFormula(formulaName, cutString)
                    outputTree['cutSequence'].append(formulaName)

        # prepare memory for new branches to be written
        for outputTree in self.outputTrees:
            outputTree['newBranchArrays'] = {}
            outputTree['newBranches'] = {}
            for branch in self.newBranches:
                # every output tree gets its own buffer per new branch
                outputTree['newBranchArrays'][branch['name']] = array.array(branch['type'], [0] * branch['length'])
                if 'leaflist' in branch:
                    leafList = branch['leaflist']
                else:
                    # default leaflist: name, optional [length] and the upper-cased type code
                    leafList = '{name}{length}/{type}'.format(name=branch['name'], length='[%d]'%branch['length'] if branch['length'] > 1 else '', type=branch['type'].upper())
                outputTree['newBranches'][branch['name']] = outputTree['tree'].Branch(branch['name'], outputTree['newBranchArrays'][branch['name']], leafList)
        if len(self.newBranches) > 0:
            print("ADD NEW BRANCHES:")
            for branch in self.newBranches:
                print(" > \x1b[32m{name}\x1b[0m {formula}".format(
                    name=(branch['name']+('[%d]'%branch['length'] if branch['length'] > 1 else '')).ljust(30),
                    formula=branch['formula'] if 'formula' in branch else 'function:\x1b[33m{fct}\x1b[0m'.format(fct=branch['function'].__name__)
                ))

        # callbacks before loop
        for outputTree in self.outputTrees:
            if outputTree['callbacks'] and 'beforeLoop' in outputTree['callbacks']:
                outputTree['callbacks']['beforeLoop']()

        print ("------------------")
        print (" start processing ")
        print ("------------------")
        # loop over all events and write to output branches
        for event in self:

            # new event callback
            if self.callbacks and 'event' in self.callbacks:
                # if callbacks return false, skip event!
                # (all callbacks are called, even if an earlier one returned False)
                callbackResults = [fcn(event) for fcn in self.callbacks['event']]
                if not all(callbackResults):
                    continue

            # fill branches
            for branch in self.newBranches:
                # evaluate result either as function applied on the tree entry or as TTreeFormula
                # (the scalar path is taken only if the key 'arrayStyle' is absent, its value is not checked)
                if branch['length'] == 1 and not 'arrayStyle' in branch:
                    if 'function' in branch:
                        if 'arguments' in branch:
                            branchResult = branch['function'](event, arguments=branch['arguments'])
                        else:
                            branchResult = branch['function'](event)
                    else:
                        branchResult = self.evaluate(branch['formula'])
                    if 'type' in branch and branch['type'] == 'i':
                        branchResult = int(branchResult)
                    # fill it for all the output trees
                    for outputTree in self.outputTrees:
                        outputTree['newBranchArrays'][branch['name']][0] = branchResult
                # for arrays pass the pointer to the array to the evaluation function to save the list->array conversion
                else:
                    if 'function' in branch:
                        # todo: make it more efficient by using a shared memory block for all of the output trees'
                        # todo: branches, this would help in case one adds new branches and writes to several trees at once
                        for outputTree in self.outputTrees:
                            if 'arguments' in branch:
                                branch['function'](event, destinationArray=outputTree['newBranchArrays'][branch['name']], arguments=branch['arguments'])
                            else:
                                branch['function'](event, destinationArray=outputTree['newBranchArrays'][branch['name']])
                    else:
                        for outputTree in self.outputTrees:
                            self.evaluateArray(branch['formula'], destinationArray=outputTree['newBranchArrays'][branch['name']])

            # evaluate all formulas
            self.formulaResults = {}
            for formulaName, formula in self.formulas.iteritems():
                self.formulaResults[formulaName] = self.evaluate(formulaName)

            # evaluate cuts for all output trees
            for outputTree in self.outputTrees:

                # evaluate all cuts of the sequence and abort early if one is not satisfied
                if outputTree['cutSequenceMode'] == 'AND':
                    passedCut = True
                    for cutFormulaName in outputTree['cutSequence']:
                        passedCut = passedCut and self.formulaResults[cutFormulaName]
                        if not passedCut:
                            break
                elif outputTree['cutSequenceMode'] == 'OR':
                    # an empty cut sequence never passes in OR mode
                    passedCut = False
                    for cutFormulaName in outputTree['cutSequence']:
                        passedCut = passedCut or self.formulaResults[cutFormulaName]
                        if passedCut:
                            break
                elif outputTree['cutSequenceMode'] == 'TREE':
                    passedCut = self.evaluateCutDictRecursive(outputTree['cutSequence'])
                else:
                    raise Exception("InvalidCutSequenceMode")

                # fill event if it passed the selection
                if passedCut:
                    outputTree['tree'].Fill()
                    outputTree['passed'] += 1

        print('INFO: end of processing. time ', time.ctime())
        sys.stdout.flush()

        # write files
        for outputTree in self.outputTrees:
            outputTree['file'].Write()
            outputTree['file'].Close()
        print('INFO: files written')
        print('INFO: saveMemory is ', self.saveMemory)
        sys.stdout.flush()

        if self.saveMemory:
            # drop the references to input and output trees; self.tree is None afterwards
            self.tree.Reset()
            self.tree = None
            for outputTree in self.outputTrees:
                outputTree['tree'] = None
            print('INFO: trees in memory destroyed!')

        # callbacks after having written file
        for outputTree in self.outputTrees:
            if outputTree['callbacks'] and 'afterWrite' in outputTree['callbacks']:
                # an exception in a callback is reported, but does not stop the remaining callbacks
                try:
                    outputTree['callbacks']['afterWrite']()
                except Exception as e:
                    print("\x1b[31mWARNING: exception during callback:", e, "\x1b[0m")

        print('INFO: done. time ', time.ctime(), ' events read:', self.eventsRead)
        sys.stdout.flush()

        # summary: number and percentage of read events which passed the selection of each output tree
        for outputTree in self.outputTrees:
            passedSelectionFraction = 100.0*outputTree['passed']/self.eventsRead if self.eventsRead>0 else '?'
            print (' > \x1b[34m{name}\x1b[0m {passed} ({fraction}%) => {outputFile}'.format(name=outputTree['name'], passed=outputTree['passed'], fraction=passedSelectionFraction, outputFile=outputTree['fileName']))
        sys.stdout.flush()

    @staticmethod
    def countSampleFiles(samples):
        """Return the number of sample files.

        samples is either a list (its length is returned) or the name of a
        text file (the number of its lines is returned). If the text file
        does not exist, an error is printed and -1 is returned.
        """
        if type(samples) is list:
            return len(samples)

        listFileName = samples
        if not os.path.isfile(listFileName):
            print ('ERROR: sample list text file does not exist:', listFileName)
            return -1

        with open(listFileName, 'r') as listFile:
            lines = listFile.readlines()
        return len(lines)

    def getNumSampleFiles(self):
        """Return the number of entries in self.sampleFileNames."""
        numberOfFiles = len(self.sampleFileNames)
        return numberOfFiles

    # return the total scale for the sample, calculated from all count histograms from the TChain
    def getScale(self, sample, countHistogram=None):
        """Return the total scale of the sample: lumi * xsec * sf / count.

        The event count is taken from, in this order of precedence:
          * self.totalNanoTreeCounts if it is set: either the entries of the
            'autoPU' histogram (config option [Configuration] countsFromAutoPU)
            or the entry named countHistogram, which defaults to the config
            option [Configuration] countTreeName or 'genEventSumw'
          * the first bin of the histogram countHistogram, if given
          * CountPosWeight - CountNegWeight, or 'Count' if those do not exist
        and can finally be overridden by the config section [EventCounts].

        Side effect: if sample.xsec can be indexed, it is replaced by its
        first element.

        Args:
            sample: object with the attributes lumi, xsec and sf.
            countHistogram: optional name of the count to use.

        Returns:
            The scale as float.
        """
        # xsec may be given as a sequence, only the first element is used then
        try:
            sample.xsec = sample.xsec[0]
        except Exception:
            # plain number (or otherwise not indexable): keep it as it is;
            # KeyboardInterrupt/SystemExit are deliberately not caught
            pass

        if self.totalNanoTreeCounts:
            # NOTE(review): eval() executes the config value as Python code, only use trusted config files
            if self.config.has_option('Configuration', 'countsFromAutoPU') and eval(self.config.get('Configuration', 'countsFromAutoPU')):
                count = self.histograms['autoPU'].GetEntries()
                countHistogram = "autoPU.GetEntries()"
            else:
                if not countHistogram:
                    countHistogram = self.config.get('Configuration', 'countTreeName') if self.config.has_option('Configuration', 'countTreeName') else 'genEventSumw'
                count = self.totalNanoTreeCounts[countHistogram]
        else:
            if not countHistogram:
                try:
                    posWeight = self.histograms['CountPosWeight'].GetBinContent(1)
                    negWeight = self.histograms['CountNegWeight'].GetBinContent(1)
                    count = posWeight - negWeight
                    countHistogram = 'CountPosWeight - CountNegWeight'
                except Exception:
                    # fall back to the unweighted count; interrupts are not swallowed
                    if self.verbose:
                        print("sampleTree: no CountPosWeight/CountNegWeight: using Count instead!!!!!!!!!!!")
                    try:
                        count = self.histograms['Count'].GetBinContent(1)
                    except Exception as e:
                        print ("EXCEPTION:", e)
                        print ("ERROR: no weight histograms found in sampleTree => terminate")
                        print ("HISTOGRAMS:", self.histograms)
                        # NOTE(review): terminates with exit status 0 although this is an error
                        exit(0)
            else:
                count = self.histograms[countHistogram].GetBinContent(1)

        # override event counts: config needs a section 'EventCounts' with sample identifier strings as keys and the new count as value
        # [EventCounts]
        # SampleIdentifier = 12345
        try:
            if self.sampleIdentifier and self.config.has_section('EventCounts') and self.config.has_option('EventCounts', self.sampleIdentifier):
                countNew = eval(self.config.get('EventCounts', self.sampleIdentifier))
                print("\x1b[97m\x1b[41mINFO: overwrite event counts with values from config!!!\n value from file:", count, "\n value from config:", countNew," <--- will be used!\x1b[0m")
                count = countNew
        except Exception as e:
            print("\x1b[31mException:",e," -> overwriting of event counts has been disabled\x1b[0m")

        lumi = float(sample.lumi)
        theScale = lumi * sample.xsec * sample.sf / float(count)

        if self.verbose:
            print("sampleTree.getScale(): sample: ", sample, "lumi: ", lumi, "xsec: ", sample.xsec, "sample.sf: ", sample.sf, "count (", countHistogram, "):", count, " ---> using scale: ", theScale)
        return theScale

    # create a unique string representation of the total cut, e.g. used to calculate the hash for cached samples 
    # this is not required to be a 'real' cut string, used by TTreeFormula etc.
    @staticmethod
    def findMinimumCut(cutList, cutSequenceMode='AND'):
        """Build a canonical string for a cut or a list/dictionary of cuts.

        A single cut is treated like a list with one element. For mode 'TREE',
        for dictionaries and for unknown modes the "%r" representation is
        returned. For 'AND' the sorted cuts, and for 'OR' the sorted unique
        cuts, are wrapped in parentheses with spaces removed and joined by
        '&&' or '||'.
        """
        isDict = type(cutList) == dict
        cuts = cutList if (type(cutList) == list or isDict) else [cutList]

        if cutSequenceMode == 'TREE' or isDict:
            return "%r"%cuts

        def wrap(cut):
            # parenthesised cut without spaces
            return '(%s)'%cut.replace(' ', '')

        if cutSequenceMode == 'AND':
            return '&&'.join([wrap(x) for x in sorted(cuts)])
        if cutSequenceMode == 'OR':
            return '||'.join([wrap(x) for x in sorted(set(cuts))])
        return "%r"%cuts

    def GetEntries(self):
        """Return whatever self.tree.GetEntries() returns (thin wrapper)."""
        numberOfEntries = self.tree.GetEntries()
        return numberOfEntries
Exemplo n.º 11
0
class XbbConfigTools(object):
    def __init__(self, config):
        """Wrap the given configuration object.

        The file locator and the samples info start out as None and are
        created on first use by initFS() and getSamplesInfo().
        """
        self.config = config
        self.fileLocator = self.samplesInfo = None

    def initFS(self, force=False):
        """Create self.fileLocator from the configuration if it does not
        exist yet; with force=True it is re-created in any case."""
        if self.fileLocator is not None and not force:
            # already initialized and no re-creation requested
            return
        self.fileLocator = FileLocator(config=self.config)

    def fs(self):
        """Return self.fileLocator, calling initFS() first if it is still None."""
        locatorMissing = self.fileLocator is None
        if locatorMissing:
            self.initFS()
        return self.fileLocator

    def loadNamespaces(self):
        """Load the library named by the config option [VHbbNameSpace]
        library via ROOT.gSystem.Load().

        Best effort: any exception is printed and otherwise ignored.
        """
        # default
        try:
            libraryName = self.get('VHbbNameSpace', 'library')
            ROOT.gSystem.Load(libraryName)
        except Exception as err:
            print(err)

    # list of DATA sample names
    def getData(self):
        """Return the evaluated config option [Plot_general] Data (the list
        of DATA sample names)."""
        # NOTE(review): eval() executes the config value as Python code, only use trusted config files
        return eval(
            self.config.get('Plot_general', 'Data')
        )

    # list of MC sample names
    def getMC(self):
        """Return the evaluated config option [Plot_general] samples (the
        list of MC sample names)."""
        # NOTE(review): eval() executes the config value as Python code, only use trusted config files
        return eval(
            self.config.get('Plot_general', 'samples')
        )

    def getSamplesInfo(self):
        """Return the samples info object; it is created from the
        configuration on the first call and reused afterwards."""
        if self.samplesInfo is not None:
            return self.samplesInfo
        self.samplesInfo = ParseInfo(config=self.config)
        return self.samplesInfo

    # processed sample identifiers (may not be actually USED)
    def getSampleIdentifiers(self, filterList=None):
        """Return the sorted sample identifiers known to the samples info.

        If filterList is given, the identifiers are passed through
        XbbTools.filterSampleList() first. The list is sorted in place.
        """
        identifiers = self.getSamplesInfo().getSampleIdentifiers()
        applyFilter = filterList is not None
        if applyFilter:
            identifiers = XbbTools.filterSampleList(identifiers, filterList)
        identifiers.sort()
        return identifiers

    # list of all sample names (data + mc)
    def getUsedSamples(self):
        """Return the MC sample names followed by the DATA sample names."""
        mcSamples = self.getMC()
        dataSamples = self.getData()
        return mcSamples + dataSamples

    # get list of original file names: /store/...
    def getOriginalFileNames(self, sampleIdentifier):
        """Return the result of filelist() for the folder given by the
        config option [Directories] samplefiles and the sample identifier."""
        return filelist(
            self.config.get('Directories', 'samplefiles'),
            sampleIdentifier,
        )

    # get list of file names (e.g. in SYSout folder)
    def getFileNames(self, sampleIdentifier, folder='SYSout'):
        """Return the file names of a sample in one of the configured folders.

        Every original file name of the sample is converted with the file
        locator's getFilenameAfterPrep() and placed below
        <[Directories] folder>/<sampleIdentifier>/.

        Args:
            sampleIdentifier: identifier of the sample, used as subfolder.
            folder: name of the option in the [Directories] config section.

        Returns:
            List of file names; empty if the original file names could not
            be obtained.
        """
        self.initFS()
        try:
            originalFileNames = self.getOriginalFileNames(sampleIdentifier)
        except Exception:
            # best effort: a sample without file list has no files;
            # KeyboardInterrupt/SystemExit are deliberately not caught
            originalFileNames = []
        samplePath = self.config.get('Directories', folder)
        return [
            "{path}/{subfolder}/{filename}".format(
                path=samplePath,
                subfolder=sampleIdentifier,
                filename=self.fileLocator.getFilenameAfterPrep(x))
            for x in originalFileNames
        ]

    def parseCommaSeparatedList(self, listAsString):
        """Split the string at commas and return the whitespace-stripped,
        non-empty items."""
        strippedItems = (item.strip() for item in listAsString.split(','))
        return [item for item in strippedItems if item]

    def parseSpaceSeparatedList(self, listAsString):
        """Split the string at single space characters and return the
        whitespace-stripped, non-empty items (other whitespace inside an
        item does not split it)."""
        strippedItems = (item.strip() for item in listAsString.split(' '))
        return [item for item in strippedItems if item]

    def getPlotRegions(self):
        """Return the comma separated config option [Plot_general] List as list."""
        regionList = self.get('Plot_general', 'List')
        return self.parseCommaSeparatedList(regionList)

    def getDatacardRegions(self):
        """Return the comma separated config option [LimitGeneral] List as list."""
        regionList = self.get('LimitGeneral', 'List')
        return self.parseCommaSeparatedList(regionList)

    def getTrainingRegions(self):
        """Return the comma separated config option [MVALists]
        List_for_submitscript as list."""
        regionList = self.get('MVALists', 'List_for_submitscript')
        return self.parseCommaSeparatedList(regionList)

    def getPlotRegionCutName(self, plotRegion):
        """Return the option Cut of the config section 'Plot:<plotRegion>',
        or the plot region name itself if that option does not exist."""
        section = 'Plot:' + plotRegion
        hasOwnCut = self.has_option(section, 'Cut')
        return self.get(section, 'Cut') if hasOwnCut else plotRegion

    def getDatacardCutName(self, datacardRegion):
        """Return the option Cut of the config section 'dc:<datacardRegion>',
        or the datacard region name itself if that option does not exist."""
        section = 'dc:' + datacardRegion
        hasOwnCut = self.has_option(section, 'Cut')
        return self.get(section, 'Cut') if hasOwnCut else datacardRegion

    def getTrainingRegionCutName(self, trainingRegion):
        """Return the cut name of a training region: the option Cut of the
        config section named like the region, else its option treeCut, else
        the training region name itself."""
        for option in ('Cut', 'treeCut'):
            if self.has_option(trainingRegion, option):
                return self.get(trainingRegion, option)
        return trainingRegion

    def getTrainingRegionVarSet(self, trainingRegion):
        """Return the option treeVarSet of the config section named like the
        training region."""
        varSetName = self.get(trainingRegion, 'treeVarSet')
        return varSetName

    def getTrainingRegionVariables(self, trainingRegion):
        """Return the variables of a training region: the space separated
        option Nominal of the config section named by the region's variable
        set (see getTrainingRegionVarSet)."""
        varSetSection = self.getTrainingRegionVarSet(trainingRegion)
        nominalVariables = self.get(varSetSection, 'Nominal')
        return self.parseSpaceSeparatedList(nominalVariables)

    def getDatacardRegionType(self, datacardRegion):
        """Return the option type of the config section 'dc:<datacardRegion>'.

        The value is returned as written in the config; it is only checked
        case-insensitively against the known types bdt, cr, dnn and mjj.

        Raises:
            Exception: "DatacardTypeUndefined" if the option does not exist,
                "DatacardTypeUnknown" if its value is not a known type.
        """
        configSection = 'dc:' + datacardRegion
        if not self.has_option(configSection, 'type'):
            raise Exception("DatacardTypeUndefined")

        datacardType = self.get(configSection, 'type')
        if datacardType.lower() not in ['bdt', 'cr', 'dnn', 'mjj']:
            print("ERROR: unknown datacard type:", datacardType)
            raise Exception("DatacardTypeUnknown")
        return datacardType

    def getDatacardRegionSignals(self, datacardRegion):
        """Return the evaluated option signal of the config section
        'dc:<datacardRegion>'.

        Raises:
            Exception: "DatacardSignalUndefined" if the option does not exist.
        """
        configSection = 'dc:' + datacardRegion
        if not self.has_option(configSection, 'signal'):
            # the message names the option which is actually missing
            raise Exception("DatacardSignalUndefined")
        # NOTE(review): eval() executes the config value as Python code, only use trusted config files
        return eval(self.get(configSection, 'signal'))

    def getDatacardRegionBackgrounds(self, datacardRegion):
        """Return the evaluated option background of the config section
        'dc:<datacardRegion>'.

        Raises:
            Exception: "DatacardBackgroundUndefined" if the option does not
                exist.
        """
        configSection = 'dc:' + datacardRegion
        if not self.has_option(configSection, 'background'):
            # the message names the option which is actually missing
            raise Exception("DatacardBackgroundUndefined")
        # NOTE(review): eval() executes the config value as Python code, only use trusted config files
        return eval(self.get(configSection, 'background'))

    def getPlotVariables(self):
        """Return the comma separated config option [Plot_general] var as list."""
        variableList = self.get('Plot_general', 'var')
        return self.parseCommaSeparatedList(variableList)

    def getPlotVariableDefinition(self, plotVariableName):
        """Return the option relPath of the config section
        'plotDef:<plotVariableName>', or None if the section or the option
        does not exist."""
        section = 'plotDef:' + plotVariableName
        if not self.has_section(section):
            return None
        if not self.has_option(section, 'relPath'):
            return None
        return self.get(section, 'relPath')

    def getCutString(self, cutName):
        """Return the option cutName of the config section [Cuts]."""
        cutString = self.get('Cuts', cutName)
        return cutString

    def sections(self):
        """Return the sections of the wrapped configuration object."""
        allSections = self.config.sections()
        return allSections

    def has_section(self, section):
        """Return the result of has_section() of the wrapped configuration
        object for the given section."""
        sectionExists = self.config.has_section(section)
        return sectionExists

    def has_option(self, section, option):
        """Return whether the wrapped configuration has the given section
        and, inside it, the given option. The option is only looked up if
        the section exists."""
        sectionExists = self.config.has_section(section)
        if not sectionExists:
            return sectionExists
        return self.config.has_option(section, option)

    def get(self, section, option, default=None):
        """Return the value of a config option.

        If default is not None and the option does not exist, default is
        returned. Without a default, the lookup is passed to the wrapped
        configuration object directly and its errors are not caught.
        """
        useDefault = default is not None and not self.has_option(section, option)
        if useDefault:
            return default
        return self.config.get(section, option)

    def getPlotVariableSections(self):
        """Return the names of all config sections starting with 'plotDef:'."""
        plotDefSections = []
        for sectionName in self.config.sections():
            if sectionName.startswith('plotDef:'):
                plotDefSections.append(sectionName)
        return plotDefSections

    def getJECuncertainties(self, step=None):
        """Return the sorted, unique list of JEC systematics from the config.

        The list is evaluated from the option JEC_<step> of the section
        [systematics] if a step is given and that option exists, otherwise
        from the option JEC of the same section.

        Raises:
            Exception: if neither option exists; there is no built-in
                default list.
        """
        configOption = 'JEC' if step is None else 'JEC_' + step
        if self.config.has_option('systematics', configOption):
            sourceOption = configOption
        elif self.config.has_option('systematics', 'JEC'):
            sourceOption = 'JEC'
        else:
            raise Exception(
                "ConfigError: Specify the JEC list in [systematics]")
        # NOTE(review): eval() executes the config value as Python code, only use trusted config files
        systematics = eval(self.config.get('systematics', sourceOption))
        return sorted(set(systematics))

    def setList(self, setOptions):
        """Apply a semicolon separated list of option overrides to the config.

        Every entry has the form '<section>.<option><sep><value>' where <sep>
        is ':=', '=' or ':' (checked in this order) and the entry is split at
        the first occurrence of the separator. An entry with an empty section
        ('.<option>=<value>') reuses the section of the previous entry.
        Semicolons inside a value have to be escaped with a backslash. Empty
        entries (e.g. caused by a trailing semicolon) are ignored. Sections
        which do not exist yet are created.

        Args:
            setOptions: string with the list of assignments.

        Raises:
            Exception: "ConfigSetError" if an entry has no section and there
                is no previous entry to take it from.
            IndexError: if an entry contains no separator or its key contains
                no '.'.
        """
        # escaping of semicolon
        semicolonEscapeSequence = '##SEMICOLON##'
        setOptions = setOptions.replace('\\;', semicolonEscapeSequence)
        prevSection = None
        for optValue in setOptions.split(';'):
            optValue = optValue.replace(semicolonEscapeSequence, ';').strip()
            if not optValue:
                # nothing to assign; without this check the previous entry
                # would be applied again (or no entry would be defined at all)
                continue
            try:
                if ':=' in optValue:
                    opt, value = optValue.split(':=', 1)
                elif '=' in optValue:
                    if optValue.count('=') > 1:
                        print(
                            "\x1b[31mWARNING: more than one equal sign found in expression, split at the first one! use ':=' to force split at another position!\x1b[0m"
                        )
                    opt, value = optValue.split('=', 1)
                else:
                    # an entry without any separator raises IndexError here
                    splitParts = optValue.split(':', 1)
                    opt = splitParts[0]
                    value = splitParts[1]
            except Exception as e:
                print("ERROR:", e)
                print("ERROR: syntax error in:", optValue)
                print(
                    "ERROR: use ; to separate options and use \\; to escape semicolons in case they are inside the value. Use := for assignment."
                )
                raise

            configSection = opt.split('.')[0]
            configOption = opt.split('.')[1]

            # empty section name: continue in the section of the previous entry
            if len(configSection.strip()) < 1:
                if prevSection is None:
                    raise Exception("ConfigSetError")
                else:
                    configSection = prevSection

            prevSection = configSection
            if not self.config.has_section(configSection):
                self.config.add_section(configSection)
            # the section exists at this point, so only the option decides
            # between overwriting (SET) and creating (ADD)
            if self.config.has_option(configSection, configOption):
                print("\x1b[31mCONFIG: SET",
                      "{s}.{o}".format(s=configSection, o=configOption),
                      "=", value, "\x1b[0m")
            else:
                print("\x1b[31mCONFIG: ADD",
                      "{s}.{o}".format(s=configSection, o=configOption),
                      "=", value, "\x1b[0m")
            self.config.set(configSection, configOption, value)

    def formatSampleName(self, sampleIdentifier, maxlen=80, padding=False):
        """Shorten a sample identifier for print-out.

        Identifiers longer than `maxlen` are replaced by their first
        (maxlen - 7) characters, '...' and their last 4 characters. With
        padding=True the result is left-justified to `maxlen` characters.
        """
        tooLong = len(sampleIdentifier) > maxlen
        if tooLong:
            head = sampleIdentifier[:maxlen - 7]
            tail = sampleIdentifier[-4:]
            s = head + '...' + tail
        else:
            s = sampleIdentifier
        return s.ljust(maxlen) if padding else s
Exemplo n.º 12
0
 def initFS(self, force=False):
     """Create the FileLocator for this object if it is missing, or always if `force` is set."""
     if self.fileLocator is not None and not force:
         # already initialized, keep the existing locator
         return
     self.fileLocator = FileLocator(config=self.config)
Exemplo n.º 13
0
class TreeCache:
    """Book-keeping for skimmed ("cached") trees of one sample and one cut.

    The cached files are named '<folder>/tmp_<hash>_<part>of<parts>.root'.
    They are first written to the temporary folder and copied to the output
    folder afterwards (see moveFilesToFinalLocation). All file system access
    goes through a FileLocator instance.
    """

    def __init__(self, sample, cutList='1', branches=None, inputFolder=None, tmpFolder=None, outputFolder=None, chunkNumber=-1, splitFilesChunks=-1, splitFilesChunkSize=-1, debug=False, fileList=None, cutSequenceMode='AND', name='', config=None):
        """Set up names, cut, hash and folders of the cache.

        Args:
            sample: Sample object or sample identifier string. For a Sample
                object, splitFilesChunkSize and splitFilesChunks are replaced
                by values derived from the sample.
            cutList: cut(s) passed to SampleTree.findMinimumCut and later to
                addOutputTree.
            branches: branches passed to addOutputTree (not part of the hash).
            inputFolder: folder of the input sample, used for counting the
                parts and for the hash.
            tmpFolder: folder the skimmed tree is written to first; defaults
                to config option [Directories] scratch, or 'tmp/' without
                config.
            outputFolder: final folder of the cached files; defaults to config
                option [Directories] tmpSamples, or 'cache/' without config.
            chunkNumber: number of the part handled by this object; values
                below 1 are treated as part 1 for file names.
            splitFilesChunks: total number of parts (values below 1 become 1).
            splitFilesChunkSize: chunk size, entering the hash.
            debug: enables debug print-out (also enabled by the XBBDEBUG
                environment variable).
            fileList: not used by this constructor.
            cutSequenceMode: passed to findMinimumCut and addOutputTree.
            name: passed to addOutputTree.
            config: config object.
        """
        self.config = config
        self.fileLocator = FileLocator(config=self.config)
        self.debug = debug or ('XBBDEBUG' in os.environ)

        # SAMPLE
        if isinstance(sample, Sample):
            # sample passed as Sample object
            # count number of chunks the cached data is split into
            splitFilesChunkSize = sample.mergeCachingSize
            splitFilesChunks = SampleTree({'name': sample.identifier, 'folder': inputFolder}, countOnly=True, splitFilesChunkSize=splitFilesChunkSize, config=config, verbose=self.debug).getNumberOfParts()
            # if sample passed as object, it can be a 'subsample' and have different name and identifier
            self.sample = sample.name
            self.sampleIdentifier = sample.identifier
            if self.debug:
                print ("INFO: use sample=", sample.name, " #parts = ", splitFilesChunks)
        else:
            # sample identifier passed as string
            self.sample = sample
            self.sampleIdentifier = sample
        self.name = name

        # CUTS
        self.cutList = cutList
        self.cutSequenceMode = cutSequenceMode
        self.minCut = SampleTree.findMinimumCut(self.cutList, cutSequenceMode=self.cutSequenceMode)

        # PATHS
        self.inputFolder = inputFolder
        self.outputFolder = (config.get('Directories', 'tmpSamples') if config else 'cache/') if outputFolder is None else outputFolder
        self.tmpFolder = (config.get('Directories', 'scratch') if config else 'tmp/') if tmpFolder is None else tmpFolder
        self.cachedFileNames = []
        self.tmpFiles = []
        self.outputFileNameFormat = '{outputFolder}/tmp_{hash}_{part}of{parts}.root'

        # BRANCHES and chunk information
        self.branches = branches
        self.branchesForHash = None     # for now make hash independent of selected branches
        self.hash = Hash(sample=sample, minCut=self.minCut, branches=self.branchesForHash, splitFilesChunkSize=splitFilesChunkSize, debug=False, inputPath=self.inputFolder).get()
        self.chunkNumber = chunkNumber
        self.splitFilesChunks = splitFilesChunks if splitFilesChunks > 1 else 1
        self.splitFilesChunkSize = splitFilesChunkSize

        # identifier is just used as an arbitrary name for print-out
        cutUsedForIdentifier = (self.minCut if len(self.minCut) < 60 else self.minCut[0:50] + '...').replace(' ', '')
        self.identifier = '{sample}[{cut}]of{parts}'.format(sample=self.sample, cut=cutUsedForIdentifier, parts=self.splitFilesChunks)
        self.sampleTree = None
        self.isCachedChecked = False

        self.createFolders()

    # free memory
    def deleteSampleTree(self):
        """Drop the reference to the connected sample tree."""
        self.sampleTree = None

    # file, where skimmed tree is written to
    def getTmpFileName(self):
        """Return the file name of this part inside the temporary folder."""
        return self.outputFileNameFormat.format(
            outputFolder=self.tmpFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d'%self.splitFilesChunks
        )

    # file, where skimmed tree is moved to after it has been written completely
    def getOutputFileName(self):
        """Return the file name of this part inside the output folder."""
        return self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts='%d'%self.splitFilesChunks
        )

    # check existence of files with skimmed trees
    def findCachedFileNames(self, chunkNumber=-1):
        """Search the output folder for cached files and remember them.

        Args:
            chunkNumber: part to look for; for values below 1 all parts are
                matched with a wildcard.

        Returns:
            List of matching local file names (also stored in
            self.cachedFileNames).
        """
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part='*' if chunkNumber < 1 else '%d'%chunkNumber,
            parts=self.splitFilesChunks
        )
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        self.cachedFileNames = glob.glob(cachedFilesMask)
        if self.debug:
            print ('DEBUG: search files:', cachedFilesMask)
            print ('\x1b[32mDEBUG: files:')
            for fileName in self.cachedFileNames:
                print (' > ', fileName)
            if len(self.cachedFileNames) < 1:
                print ('none!')
            print ('\x1b[0m(%d files found)'%len(self.cachedFileNames))
        return self.cachedFileNames

    # check if a single part is cached, (only checks existence of the file, not validity!)
    def partIsCached(self):
        """Tell whether the output file of this part exists.

        The part number is derived in the same way as in getOutputFileName(),
        so a chunk number below 1 refers to part 1.
        """
        cachedFilesMaskRaw = self.outputFileNameFormat.format(
            outputFolder=self.outputFolder,
            hash=self.hash,
            part=self.chunkNumber if self.chunkNumber > 0 else 1,
            parts=self.splitFilesChunks
        )
        cachedFilesMask = self.fileLocator.getLocalFileName(cachedFilesMaskRaw)
        return len(glob.glob(cachedFilesMask)) > 0

    # isCached == all files containing the skimmed tree found!
    def isCached(self):
        """Tell whether the files of all parts were found in the output folder.

        For a cache split into more than one part the number of found files
        has to match the number of parts, otherwise at least one file has to
        exist. A positive result is remembered in self.isCachedChecked.
        """
        self.findCachedFileNames()
        if (len(self.cachedFileNames) != self.splitFilesChunks and self.splitFilesChunks > 1) or len(self.cachedFileNames) == 0:
            if self.debug:
                print ('\x1b[32mDEBUG: not cached:', self.identifier, '\x1b[0m')
            return False
        self.isCachedChecked = True
        return True

    # check if an existing file can be opened without errors by ROOT
    def checkFileValidity(self, rawFileName):
        """Open the file with ROOT and delete it if it is found to be broken.

        A file counts as broken if it cannot be opened, has no keys, was
        recovered by ROOT or is a zombie.

        Returns:
            True for a good file, False if the file was broken (and deleted).
        """
        xrootdFileName = self.fileLocator.getXrootdFileName(rawFileName)
        f = ROOT.TFile.Open(xrootdFileName, 'read')
        if not f or f.GetNkeys() == 0 or f.TestBit(ROOT.TFile.kRecovered) or f.IsZombie():
            print ('\x1b[31mWARNING: broken file:', rawFileName, ' => redo caching!\x1b[0m')
            if f:
                f.Close()
            self.deleteFile(rawFileName)
            return False
        if f:
            f.Close()
        return True

    # check if all cached files are valid
    def isCachedAndValid(self):
        """Tell whether all parts are cached and pass checkFileValidity().

        The check stops at the first broken file, the remaining files are
        not opened anymore.
        """
        valid = True
        if self.isCached():
            # check file integrity
            for fileName in self.cachedFileNames:
                valid = valid and self.checkFileValidity(fileName)
        else:
            valid = False
        return valid

    # set input sampleTree object
    def setSampleTree(self, sampleTree):
        """Connect the input sample tree; returns self to allow chaining of calls."""
        self.sampleTree = sampleTree
        return self

    # this prepares the caching by telling the sampleTree object what to write during processing of the file
    # note: does not run the caching by itself! needs an additional sampleTree.process()
    def cache(self):
        """Register the output tree of this cache at the connected sample tree.

        The file is written to the temporary folder; moveFilesToFinalLocation
        is registered as 'afterWrite' callback. Prints an error if no sample
        tree has been set. Returns self.
        """
        if self.sampleTree:
            outputFileName = self.getTmpFileName()
            callbacks = {'afterWrite': self.moveFilesToFinalLocation}
            self.sampleTree.addOutputTree(outputFileName=outputFileName, cut=self.cutList, hash=self.hash, branches=self.branches, callbacks=callbacks, cutSequenceMode=self.cutSequenceMode, name=self.name)
            self.tmpFiles.append(outputFileName)
            if self.debug:
                print ('\x1b[32mDEBUG: output file for ', self.identifier, ' is ', outputFileName, '\x1b[0m')
        else:
            print ('\x1b[31mERROR: no sample tree connected!:', self.identifier, ' set the sampleTree first with "setSampleTree(sampleTree)" \x1b[0m')
        return self

    # return sample tree class of cached samples if all files found
    def getTree(self):
        """Return a SampleTree built from the cached files.

        If the cache is incomplete, the currently stored sample tree (possibly
        None) is returned unchanged.
        """
        # if it has already been checked if tree is cached, then use this result directly
        isCached = self.isCachedChecked
        if not isCached:
            isCached = self.isCached()
        if isCached:
            self.sampleTree = SampleTree(self.cachedFileNames, config=self.config)
            self.sampleTree.sampleIdentifier = self.sampleIdentifier
        return self.sampleTree

    # delete file
    def deleteFile(self, rawFileName):
        """Remove a file via the file locator."""
        if self.debug:
            print ('DELETE:', rawFileName)
        self.fileLocator.rm(rawFileName)

    # delete cached files
    def deleteCachedFiles(self, chunkNumber=-1):
        """Delete the cached files of one part, or of all parts for chunkNumber < 1."""
        cachedFileNames = self.findCachedFileNames(chunkNumber=chunkNumber)
        for fileName in cachedFileNames:
            if self.fileLocator.fileExists(fileName):
                self.deleteFile(fileName)

    # create folders
    def createFolders(self):
        """Create temporary and output folder if they do not exist.

        Creating the temporary folder is best effort: a failure is reported
        as warning but does not raise.
        """
        tmpfolderLocal = self.fileLocator.getLocalFileName(self.tmpFolder)
        if not os.path.isdir(tmpfolderLocal):
            print("DOES NOT EXIST:", tmpfolderLocal)
            try:
                xrootdFileName = self.fileLocator.getXrootdFileName(self.tmpFolder)
                if '://' not in xrootdFileName:
                    os.makedirs(self.tmpFolder)
                else:
                    command = 'gfal-mkdir %s' % (xrootdFileName)
                    returnCode = subprocess.call([command], shell=True)
                    if self.debug:
                        print(command, ' => ', returnCode)
                        print ()
            except Exception as e:
                # keep going (e.g. the folder may have been created by another
                # job in the meantime), but do not hide the reason and do not
                # catch KeyboardInterrupt/SystemExit
                print("\x1b[31mWARNING: could not create temporary folder:", self.tmpFolder, "reason:", e, "\x1b[0m")

        if not self.fileLocator.exists(self.outputFolder):
            print("INFO: output folder does not exist and will be created:", self.outputFolder)
            self.fileLocator.makedirs(self.outputFolder)

    # move files from temporary to final location
    def moveFilesToFinalLocation(self):
        """Copy all temporary files to the output folder.

        An already existing output file is deleted before the copy. The
        temporary file is only deleted after a successful copy.

        Returns:
            True if all files were copied successfully, False otherwise.
        """
        success = True
        # free some memory for file copy command
        if self.debug:
            print('DEBUG: max mem used A:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        self.deleteSampleTree()
        if self.debug:
            print('DEBUG: max mem used B:', resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

        for tmpFileName in self.tmpFiles:
            # replace the leading temporary folder by the output folder
            outputFileName = self.outputFolder + '/' + self.tmpFolder.join(tmpFileName.split(self.tmpFolder)[1:])
            print ('copy ', tmpFileName, ' to ', outputFileName)
            if self.fileLocator.fileExists(outputFileName):
                self.deleteFile(outputFileName)
            copySuccessful = self.fileLocator.cp(tmpFileName, outputFileName)
            if not copySuccessful:
                success = False
                print('\x1b[31mERROR: copy failed for {tmpfile}->{outputfile} !\x1b[0m'.format(tmpfile=tmpFileName,
                                                                                                outputfile=outputFileName))
            else:
                # delete temporary file if copy was successful
                self.deleteFile(tmpFileName)
        return success
Exemplo n.º 14
0
    def __init__(self, samples, treeName=None, limitFiles=-1, splitFilesChunkSize=-1, chunkNumber=1, countOnly=False, verbose=True, config=None, saveMemory=False, xrootdRedirector=None):
        """Chain the .root files of one chunk of a sample and merge their count objects.

        Args:
            samples: stored as self.samples; presumably evaluated by
                getSampleFileNameChunks() to obtain the file list (not visible
                here, TODO confirm).
            treeName: name of the tree to chain. If empty, config option
                [Configuration] treeName is used, with 'tree' as fallback.
            limitFiles: if > 0, chaining stops after this number of files.
            splitFilesChunkSize: stored as self.splitFilesChunkSize.
            chunkNumber: 1-based number of the chunk (from
                getSampleFileNameChunks()) to process.
            countOnly: if True, no file is opened and no chain is built.
            verbose: enables INFO print-out.
            config: config object, may be None.
            saveMemory: stored as self.saveMemory.
            xrootdRedirector: passed on to the FileLocator.

        Raises:
            Exception: "InvalidNumberOfSplitParts" if the number of chunks
                differs from self.numParts, "InvalidChunkNumber" if
                chunkNumber is outside 1..self.numParts, "TChain method Add
                failure" if a file cannot be added to the chain and
                "CountHistogramMissing" if a stored count histogram is None.
        """
        self.verbose = verbose
        # debug and profiling output are switched on by environment variables
        self.debug = 'XBBDEBUG' in os.environ
        self.debugProfiling = 'XBBPROFILING' in os.environ
        self.config = config
        self.saveMemory = saveMemory
        self.outputTreeBasketSize = None
        if self.config and self.config.has_option('Configuration', 'outputTreeBasketSize'):
            # NOTE(review): eval() runs the expression found in the config file
            self.outputTreeBasketSize = eval(self.config.get('Configuration', 'outputTreeBasketSize'))
        self.monitorPerformance = True
        self.disableBranchesInOutput = True
        self.samples = samples
        self.tree = None
        self.fileLocator = FileLocator(config=self.config, xrootdRedirector=xrootdRedirector)
        self.sampleIdentifier = None

        # process only partial sample root file list
        self.splitFilesChunkSize = splitFilesChunkSize
        self.chunkNumber = chunkNumber
       
        # get list of sample root files to process
        # NOTE(review): self.numParts is read below but not assigned in this
        # method, presumably it is set by getSampleFileNameChunks() - confirm
        sampleFileNamesParts = self.getSampleFileNameChunks()
        if self.chunkNumber > 0 and self.chunkNumber <= self.numParts:
            if len(sampleFileNamesParts) == self.numParts:
                # chunk numbers start at 1, list indices at 0
                chunkIndex = self.chunkNumber - 1
                self.sampleFileNames = sampleFileNamesParts[chunkIndex]
            else:
                raise Exception("InvalidNumberOfSplitParts")
        else:
            print("\x1b[31mERROR: wrong chunk number ", self.chunkNumber, "\x1b[0m")
            raise Exception("InvalidChunkNumber")
        if self.verbose:
            print ("INFO: reading part ", self.chunkNumber, " of ", self.numParts)

        self.status = 0
        # tree name: explicit argument > config option > default
        if not treeName:
            if self.config and self.config.has_option('Configuration', 'treeName'):
                self.treeName = self.config.get('Configuration', 'treeName')
            else:
                # HEPPY default
                self.treeName = 'tree'
        else:
            self.treeName = treeName
        self.formulas = {}
        self.formulaDefinitions = []
        self.oldTreeNum = -1
        self.limitFiles = int(limitFiles) 
        self.timeStart = time.time()
        self.timeETA = 0
        self.eventsRead = 0
        self.outputTrees = []
        self.callbacks = {}
        self.removeBranches = []

        # e.g. for additional branches to be added
        self.newBranches = []

        # name of the sample .txt file which contains the list of .root files (left empty here)
        self.sampleTextFileName = ''

        # add all .root files to chain and add count histograms
        self.chainedFiles = []
        self.brokenFiles = []
        self.histograms = {}
        # per branch of the 'Runs' tree: list of the values of all entries of all files
        self.nanoTreeCounts = {}
        # per branch of the 'Runs' tree: sum of the values above
        self.totalNanoTreeCounts = {}

        if not countOnly:
            self.tree = ROOT.TChain(self.treeName)

            # loop over all given .root files 
            for rootFileName in self.sampleFileNames:
                if self.debug:
                    print('DEBUG: next file is:', rootFileName, ", check existence")

                # check root file existence
                if self.fileLocator.exists(rootFileName, attempts=5):
                    remoteRootFileName = self.fileLocator.getRemoteFileName(rootFileName)
                    input = ROOT.TFile.Open(remoteRootFileName, 'read')

                    # check file validity
                    if input and not input.IsZombie() and input.GetNkeys() > 0 and not input.TestBit(ROOT.TFile.kRecovered):
                        if self.debug:
                            print('DEBUG: file exists and is good!')

                        # add count histograms, since they are not in the TChain
                        for key in input.GetListOfKeys():
                            obj = key.ReadObj()
                            # the event tree itself is handled by the TChain
                            if obj.GetName() == self.treeName:
                                continue
                            histogramName = obj.GetName()

                            # nanoAOD: use branch of a tree instead of histogram for counting
                            if histogramName == 'Runs':
                                branchList = [x.GetName() for x in obj.GetListOfBranches()]
                                if self.debug:
                                    print ("DEBUG: nano counting tree has the following BRANCHES:", branchList)
                                for branch in branchList:
                                    if branch not in self.nanoTreeCounts:
                                        self.nanoTreeCounts[branch] = []
                                # collect the value of every branch for every entry
                                nEntries = obj.GetEntries()
                                for i in range(nEntries):
                                    obj.GetEntry(i)
                                    for branch in branchList:
                                        self.nanoTreeCounts[branch].append(getattr(obj, branch))

                            if histogramName in self.histograms:
                                # an object with this name has been stored for a previous file
                                if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                                    if self.debug:
                                        print("DEBUG: object is a tree and will be skipped:", obj.GetName())
                                else:
                                    if self.histograms[histogramName]:
                                        self.histograms[histogramName].Add(obj)
                                    else:
                                        print ("ERROR: histogram object was None!!!")
                                        raise Exception("CountHistogramMissing")
                            else:
                                # add all TH*'s in one single histogram
                                if obj.IsA().InheritsFrom(ROOT.TH1.Class()):
                                    self.histograms[histogramName] = obj.Clone(obj.GetName())
                                    # detach the clone from the file before the file is closed
                                    self.histograms[histogramName].SetDirectory(0)
                                else:
                                    if self.debug:
                                        print("DEBUG: omitting object ", obj, type(obj), " since it is neither TH1 or TTree!")

                        input.Close()

                        # add file to chain
                        chainTree = '%s/%s'%(remoteRootFileName.strip(), self.treeName.strip())
                        if self.debug:
                            print ('\x1b[42mDEBUG: chaining '+chainTree,'\x1b[0m')
                        statusCode = self.tree.Add(chainTree)
                        if self.debug:
                            print ('\x1b[42mDEBUG: ---> %r'%statusCode,'\x1b[0m')
 
                        # check for errors in chaining the file
                        if statusCode != 1:
                            print ('ERROR: failed to chain ' + chainTree + ', returned: ' + str(statusCode), 'tree:', self.tree)
                            raise Exception("TChain method Add failure")
                        elif not self.tree:
                            print ('\x1b[31mERROR: tree died after adding %s.\x1b[0m'%rootFileName)
                        else:
                            self.treeEmpty = False
                            self.chainedFiles.append(rootFileName)
                            if self.limitFiles > 0 and len(self.chainedFiles) >= self.limitFiles:
                                print ('\x1b[35mDEBUG: limit reached! no more files will be chained!!!\x1b[0m')
                                break
                    else:
                        print ('\x1b[31mERROR: file is damaged: %s\x1b[0m'%rootFileName)
                        if input:
                            print ('DEBUG: Zombie:', input.IsZombie(), '#keys:', input.GetNkeys(), 'recovered:', input.TestBit(ROOT.TFile.kRecovered))
                        self.brokenFiles.append(rootFileName)
                else:
                    print ('\x1b[31mERROR: file is missing: %s\x1b[0m'%rootFileName)

            if self.verbose or self.debug:
                print ('INFO: # files chained: %d'%len(self.chainedFiles))
                if len(self.brokenFiles) > 0:
                    print ('INFO: # files broken : %d'%len(self.brokenFiles))
            
            # without any chained file there is no usable tree
            if len(self.chainedFiles) < 1:
                self.tree = None

            if self.tree:
                # 50 MB cache
                self.tree.SetCacheSize(50*1024*1024)

            # merge nano counting trees
            if self.nanoTreeCounts:
                # TODO: per run if possible, sum LHE weights if present

                # NOTE(review): dict.iteritems() and the type 'long' used below
                # exist in Python 2 only

                # sum the contributions from the subtrees
                self.totalNanoTreeCounts = {key: sum(values) for key,values in self.nanoTreeCounts.iteritems() if len(values) > 0 and type(values[0]) in [int, float, long]}

                # print summary table
                countBranches = self.totalNanoTreeCounts.keys()
                # number of rows of the table, taken from the first numeric branch with more than one value
                depth = None
                for key,values in self.nanoTreeCounts.iteritems():
                    if values and len(values)>1 and type(values[0]) in [int, float, long]:
                        depth = len(values)
                        break
                print("-"*160)
                print("tree".ljust(25), ''.join([countBranch.ljust(25) for countBranch in countBranches]))
                if depth:
                    for treeNum in range(depth):
                        print(("%d"%(treeNum+1)).ljust(25),''.join([('%r'%self.nanoTreeCounts[countBranch][treeNum]).ljust(25) for countBranch in countBranches]))
                print("\x1b[34m","sum".ljust(24), ''.join([('%r'%self.totalNanoTreeCounts[countBranch]).ljust(25) for countBranch in countBranches]),"\x1b[0m")
                print("-"*160)

                # fill summed tree (create new tree)
                self.histograms['Runs'] = ROOT.TTree('Runs', 'count histograms for nano')
                # the buffers are kept in this dict while the branches are filled
                nanoTreeCountBuffers = {}
                for key, value in self.totalNanoTreeCounts.iteritems():
                    if type(value) == int:
                        # 64 bit signed int 
                        typeCode = 'L'
                    elif type(value) == long:
                        typeCode = 'L'
                    elif type(value) == float:
                        typeCode = 'f'
                    # the same type code is used for the array and for the branch leaf list
                    nanoTreeCountBuffers[key] = array.array(typeCode, [value])
                    self.histograms['Runs'].Branch(key, nanoTreeCountBuffers[key], '{name}/{typeCode}'.format(name=key, typeCode=typeCode))
                self.histograms['Runs'].Fill()
Exemplo n.º 15
0
 def __init__(self, config):
     """Store the config, read the debug switch and create the file locator."""
     # debugging is enabled by the presence of the XBBDEBUG environment variable
     self.debug = 'XBBDEBUG' in os.environ
     self.config = config
     self.fileLocator = FileLocator(config=config)
Exemplo n.º 16
0
class CopyTreePSI(object):
    """Skim the ROOT files of a sample and copy the result to an output folder.

    Every input file is opened via the xrootd redirector, the event tree is
    copied with a skimming cut into a file in a temporary folder, and this
    file is then copied to its final location with the FileLocator.
    """

    def __init__(self, config):
        """Store the config, read the debug switch and create the file locator."""
        self.config = config
        self.debug = 'XBBDEBUG' in os.environ
        self.fileLocator = FileLocator(config=self.config)

    def copySingleFile(self, whereToLaunch, inputFile, outputFile, skimmingCut,
                       remove_branches):
        """Skim one input file and copy the result to `outputFile`.

        Args:
            whereToLaunch: not used by this method.
            inputFile: name/URL of the input file, passed to ROOT.TFile.Open.
            outputFile: final location of the skimmed file.
            skimmingCut: selection passed to CopyTree.
            remove_branches: branch names (patterns) to disable before copying;
                empty and whitespace-only entries are ignored.

        Returns:
            None. If the input file cannot be opened, a message is printed and
            nothing is written.
        """
        if self.debug:
            print("INPUT: %s" % (inputFile,))
        inputRootFile = ROOT.TFile.Open(inputFile, 'read')
        if not inputRootFile:
            print('input file NOT EXISTING: %s' % (inputFile,))
            return

        # temporary folder: $TMPDIR if set, otherwise the configured scratch folder
        if 'TMPDIR' in os.environ:
            tmpPath = os.environ["TMPDIR"]
        else:
            tmpPath = self.config.get('Directories', 'scratch')
        try:
            if not os.path.isdir(tmpPath):
                os.makedirs(tmpPath)
        except OSError as e:
            # best effort, e.g. the folder may have been created concurrently
            print('WARNING: could not create temporary folder %s: %s' % (tmpPath, e))

        outputFileName = outputFile.split('/')[-1]
        tmpFile = tmpPath + '/' + outputFileName
        print('outputFileName %s' % (tmpFile,))
        output = ROOT.TFile.Open(tmpFile, 'recreate')

        # the event tree is called "tree" or "Events"
        inputTree = inputRootFile.Get("tree")
        if not inputTree:
            inputTree = inputRootFile.Get("Events")
        nEntries = inputTree.GetEntries()
        for branch in remove_branches:
            if branch and not branch.isspace():
                inputTree.SetBranchStatus(str(branch), ROOT.kFALSE)

        output.cd()
        print('\n\t copy file: %s with cut: %s' % (inputFile, skimmingCut))
        outputTree = inputTree.CopyTree(skimmingCut)
        kEntries = outputTree.GetEntries()
        printc('blue', '', "\t before cuts\t %s" % nEntries)
        printc('green', '', "\t survived\t %s" % kEntries)
        outputTree.AutoSave()

        # copy all other objects of the input file
        inputRootFile.cd()
        for key in ROOT.gDirectory.GetListOfKeys():
            inputRootFile.cd()
            obj = key.ReadObj()
            # this contains the event tree, which will be copied skimmed only
            if obj.GetName() in ['tree', 'Events']:
                continue
            if self.debug:
                print("DEBUG: clone object  %s" % (obj.GetName(),))
            # all other objects are just cloned
            output.cd()
            if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                objClone = obj.CloneTree(-1)
            else:
                objClone = obj
            objClone.Write(key.GetName())
        output.Write()
        output.Close()
        inputRootFile.Close()

        # move the finished file from the temporary folder to the final location
        self.fileLocator.cp(source=tmpFile, target=outputFile)
        print('copy to final location:\x1b[34m %s \x1b[0m' % (outputFile,))
        self.fileLocator.rm(tmpFile)

    def copySingleFileOneInput(self, inputs):
        """Call copySingleFile() with the elements of `inputs` as arguments."""
        return self.copySingleFile(*inputs)

    def getRedirector(self):
        """Return the prefix which is put in front of the input file names.

        The redirector is taken from the environment variable XBBXRD, else
        from config option [Configuration] xrootdRedirectorGlobal, else a
        default is used. If config option [Configuration] inputStoragePath
        exists, it is appended together with a trailing '/'.
        """
        # default redirector
        redirector = 'root://eoscms.cern.ch//eos/cms/'
        try:
            if 'XBBXRD' in os.environ:
                redirector = os.environ['XBBXRD']
            elif self.config.has_option('Configuration',
                                        'xrootdRedirectorGlobal'):
                redirector = self.config.get('Configuration',
                                             'xrootdRedirectorGlobal')
        except Exception:
            print("could not get xrootd redirector, using default one: %s" % (redirector,))
            print("specify redirector in config [Configuration] xrootdRedirectorGlobal=..")
        # add base path where storage is located on fs (if sample txt files don't contain absolute path)
        if self.config.has_option('Configuration', 'inputStoragePath'):
            redirector += self.config.get('Configuration',
                                          'inputStoragePath') + '/'
        return redirector

    def copytreePSI(self,
                    pathIN,
                    pathOUT,
                    folderName,
                    skimmingCut,
                    fileList=None):
        """Skim all .root files of a sample into '<pathOUT>/<folderName>/'.

        Args:
            pathIN: folder containing the sample list '<folderName>.txt'.
            pathOUT: base folder of the output.
            folderName: name of the sample (list file and output sub-folder).
            skimmingCut: selection applied to the event tree.
            fileList: list of input file names; if empty or None, the names
                are read from '<pathIN>/<folderName>.txt'.

        Output files which already exist and are valid ROOT files are skipped,
        invalid ones are deleted and produced again. Returns None.
        """
        config = self.config
        fileLocator = self.fileLocator

        print('start copytreePSI.py')
        if fileList:
            fileNames = fileList
        else:
            with open(pathIN + '/' + folderName + '.txt') as sampleListFile:
                fileNames = sampleListFile.readlines()
        # an empty list must not fail here, it is handled below
        firstFileName = fileNames[0].strip() if fileNames else '(none)'
        print('len(filenames) %d %s %s' % (len(fileNames), firstFileName, skimmingCut))

        ## search the folder containing the input files
        inputFiles = []
        print("##### COPY TREE - BEGIN ######")
        whereToLaunch = config.get('Configuration', 'whereToLaunch')
        # the option is written like a Python list of strings, e.g. ['a','b']
        remove_branches = config.get('General', 'remove_branches').replace(
            "[", "").replace("]", "").replace("'", "").split(',')
        print('remove_branches: %s len(remove_branches): %d' % (
            remove_branches, len(remove_branches)))

        redirector = self.getRedirector()
        for fileName in fileNames:
            fileName = fileName.strip()
            if fileName.lower().endswith('.root'):
                inputFiles.append(redirector + fileName)

        if len(inputFiles) == 0:
            print("No .root files found in  %s" % (pathIN + '/' + folderName,))
            return

        ## prepare output folder
        outputFolder = "%s/%s/" % (pathOUT, folderName)
        fileLocator.makedirs(outputFolder)

        ## prepare a list of input(inputFile,outputFile,skimmingCut) for the files to be processed
        inputs = []
        for inputFile in inputFiles:
            fileName = fileLocator.getFilenameAfterPrep(inputFile)
            outputFile = "%s/%s/%s" % (pathOUT, folderName, fileName)

            if fileLocator.exists(outputFile):
                if not fileLocator.isValidRootFile(outputFile):
                    # broken output from a previous attempt: redo it
                    fileLocator.rm(outputFile)
                    inputs.append((whereToLaunch, inputFile, outputFile,
                                   skimmingCut, remove_branches))
                else:
                    if self.debug:
                        print("SKIP INPUT: %s" % (inputFile,))
            else:
                inputs.append((whereToLaunch, inputFile, outputFile,
                               skimmingCut, remove_branches))

        outputs = []
        multiprocess = int(config.get('Configuration', 'nprocesses'))
        if multiprocess > 1:
            ## process the input list (using multiprocess)
            from multiprocessing import Pool
            p = Pool(multiprocess)
            # NOTE(review): copySingleFileOneInput is looked up as a module
            # level name here, not as the method of this class - confirm that
            # such a function exists in this module
            outputs = p.map(copySingleFileOneInput, inputs)
        else:
            for input_ in inputs:
                output = self.copySingleFileOneInput(input_)
                outputs.append(output)

        print("##### COPY TREE - END ######")
Exemplo n.º 17
0
    def __init__(self, samples, treeName=None, limitFiles=-1, splitFilesChunkSize=-1, chunkNumber=1, countOnly=False, verbose=True, config=None, saveMemory=False, xrootdRedirector=None):
        """Select one chunk of the sample's .root files and chain their trees.

        The file names of the requested chunk are taken from
        self.getSampleFileNameChunks(). Unless countOnly is set, every file of
        the chunk is opened, the TH1 objects stored next to the tree are summed
        into self.histograms, the entries of a 'Runs' tree are collected into
        self.nanoTreeCounts and the file is added to a ROOT.TChain (self.tree).
        Finally the collected 'Runs' values are summed per branch and written
        into a new one-entry TTree stored as self.histograms['Runs'].

        Args:
            samples: stored as self.samples.
            treeName: name of the tree to chain. If empty, the option
                'treeName' of config section 'Configuration' is used when
                present, otherwise 'tree'.
            limitFiles: converted with int(); when > 0, chaining stops as soon
                as this many files have been chained.
            splitFilesChunkSize: stored as self.splitFilesChunkSize
                (presumably read by getSampleFileNameChunks() -- confirm).
            chunkNumber: 1-based number of the chunk to read; has to be within
                1..self.numParts.
            countOnly: if true, no file is opened and self.tree stays None.
            verbose: print INFO messages.
            config: object providing has_option()/get(); may be None.
            saveMemory: stored as self.saveMemory.
            xrootdRedirector: passed on to FileLocator.

        Raises:
            Exception: "InvalidChunkNumber" if chunkNumber is out of range,
                "InvalidNumberOfSplitParts" if the number of chunks returned
                differs from self.numParts, "CountHistogramMissing" if an
                already registered histogram evaluates to false,
                "TChain method Add failure" if adding a file to the chain does
                not return 1.
        """
        self.verbose = verbose
        # debug/profiling output is switched on through environment variables
        self.debug = 'XBBDEBUG' in os.environ
        self.debugProfiling = 'XBBPROFILING' in os.environ
        self.config = config
        self.saveMemory = saveMemory
        self.outputTreeBasketSize = None
        if self.config and self.config.has_option('Configuration', 'outputTreeBasketSize'):
            # NOTE(review): eval() executes whatever expression the config file
            # contains; acceptable only as long as the config is trusted.
            self.outputTreeBasketSize = eval(self.config.get('Configuration', 'outputTreeBasketSize'))
        self.monitorPerformance = True
        self.disableBranchesInOutput = True
        self.samples = samples
        self.tree = None
        self.fileLocator = FileLocator(config=self.config, xrootdRedirector=xrootdRedirector)
        self.sampleIdentifier = None

        # process only partial sample root file list
        self.splitFilesChunkSize = splitFilesChunkSize
        self.chunkNumber = chunkNumber
       
        # get list of sample root files to process
        # NOTE(review): self.numParts is not assigned in this method; presumably
        # getSampleFileNameChunks() sets it -- confirm.
        sampleFileNamesParts = self.getSampleFileNameChunks()
        if self.chunkNumber > 0 and self.chunkNumber <= self.numParts:
            if len(sampleFileNamesParts) == self.numParts:
                # chunkNumber counts from 1, the list index from 0
                chunkIndex = self.chunkNumber - 1
                self.sampleFileNames = sampleFileNamesParts[chunkIndex]
            else:
                raise Exception("InvalidNumberOfSplitParts")
        else:
            print("\x1b[31mERROR: wrong chunk number ", self.chunkNumber, "\x1b[0m")
            raise Exception("InvalidChunkNumber")
        if self.verbose:
            print ("INFO: reading part ", self.chunkNumber, " of ", self.numParts)

        self.status = 0
        # tree name priority: explicit argument, then config, then 'tree'
        if not treeName:
            if self.config and self.config.has_option('Configuration', 'treeName'):
                self.treeName = self.config.get('Configuration', 'treeName')
            else:
                # HEPPY default
                self.treeName = 'tree'
        else:
            self.treeName = treeName
        self.formulas = {}
        self.formulaDefinitions = []
        self.oldTreeNum = -1
        self.limitFiles = int(limitFiles) 
        self.timeStart = time.time()
        self.timeETA = 0
        self.eventsRead = 0
        self.outputTrees = []
        self.callbacks = {}
        self.removeBranches = []

        # e.g. for additional branches to be added
        self.newBranches = []

        # check existence of sample .txt file which contains list of .root files
        self.sampleTextFileName = ''

        # add all .root files to chain and add count histograms
        self.chainedFiles = []
        self.brokenFiles = []
        # object name -> summed TH1 clone (and, at the end, 'Runs' -> summed TTree)
        self.histograms = {}
        # 'Runs' branch name -> list with one value per entry read from the 'Runs' trees
        self.nanoTreeCounts = {}
        # 'Runs' branch name -> sum over all collected values
        self.totalNanoTreeCounts = {}

        if not countOnly:
            self.tree = ROOT.TChain(self.treeName)

            # loop over all given .root files 
            for rootFileName in self.sampleFileNames:
                if self.debug:
                    print('DEBUG: next file is:', rootFileName, ", check existence")

                # check root file existence, TODO: simplify
                if self.fileLocator.exists(rootFileName):
                    remoteRootFileName = self.fileLocator.getRemoteFileName(rootFileName)
                    # NOTE(review): 'input' shadows the builtin of the same name
                    input = ROOT.TFile.Open(remoteRootFileName, 'read')

                    # check file validity
                    # (opened, not a zombie, contains keys, not flagged as recovered)
                    if input and not input.IsZombie() and input.GetNkeys() > 0 and not input.TestBit(ROOT.TFile.kRecovered):
                        if self.debug:
                            print('DEBUG: file exists and is good!')

                        # add count histograms, since they are not in the TChain
                        for key in input.GetListOfKeys():
                            obj = key.ReadObj()
                            # the main tree itself is handled by the TChain below
                            if obj.GetName() == self.treeName:
                                continue
                            histogramName = obj.GetName()

                            # nanoAOD: use branch of a tree instead of histogram for counting
                            if histogramName == 'Runs':
                                branchList = [x.GetName() for x in obj.GetListOfBranches()]
                                if self.debug:
                                    print ("DEBUG: nano counting tree has the following BRANCHES:", branchList)
                                for branch in branchList:
                                    if branch not in self.nanoTreeCounts:
                                        self.nanoTreeCounts[branch] = []
                                # append the value of every branch for every entry of this file's 'Runs' tree
                                nEntries = obj.GetEntries()
                                for i in range(nEntries):
                                    obj.GetEntry(i)
                                    for branch in branchList:
                                        self.nanoTreeCounts[branch].append(getattr(obj, branch))

                            # object name already registered by an earlier file: add to the sum
                            if histogramName in self.histograms:
                                if obj.IsA().InheritsFrom(ROOT.TTree.Class()):
                                    if self.debug:
                                        print("DEBUG: object is a tree and will be skipped:", obj.GetName())
                                else:
                                    if self.histograms[histogramName]:
                                        self.histograms[histogramName].Add(obj)
                                    else:
                                        print ("ERROR: histogram object was None!!!")
                                        raise Exception("CountHistogramMissing")
                            else:
                                # add all TH*'s in one single histogram
                                if obj.IsA().InheritsFrom(ROOT.TH1.Class()):
                                    # presumably detached with SetDirectory(0) so the clone
                                    # outlives input.Close() below -- confirm
                                    self.histograms[histogramName] = obj.Clone(obj.GetName())
                                    self.histograms[histogramName].SetDirectory(0)
                                else:
                                    if self.debug:
                                        print("DEBUG: omitting object ", obj, type(obj), " since it is neither TH1 or TTree!")

                        input.Close()

                        # add file to chain
                        chainTree = '%s/%s'%(remoteRootFileName.strip(), self.treeName.strip())
                        if self.debug:
                            print ('\x1b[42mDEBUG: chaining '+chainTree,'\x1b[0m')
                        statusCode = self.tree.Add(chainTree)
                        if self.debug:
                            print ('\x1b[42mDEBUG: ---> %r'%statusCode,'\x1b[0m')
 
                        # check for errors in chaining the file
                        # (any return value other than 1 is treated as failure)
                        if statusCode != 1:
                            print ('ERROR: failed to chain ' + chainTree + ', returned: ' + str(statusCode), 'tree:', self.tree)
                            raise Exception("TChain method Add failure")
                        elif not self.tree:
                            # only reported; the file is not counted as chained
                            print ('\x1b[31mERROR: tree died after adding %s.\x1b[0m'%rootFileName)
                        else:
                            # NOTE: self.treeEmpty is only ever assigned here (within this method)
                            self.treeEmpty = False
                            self.chainedFiles.append(rootFileName)
                            if self.limitFiles > 0 and len(self.chainedFiles) >= self.limitFiles:
                                print ('\x1b[35mDEBUG: limit reached! no more files will be chained!!!\x1b[0m')
                                break
                    else:
                        print ('\x1b[31mERROR: file is damaged: %s\x1b[0m'%rootFileName)
                        if input:
                            print ('DEBUG: Zombie:', input.IsZombie(), '#keys:', input.GetNkeys(), 'recovered:', input.TestBit(ROOT.TFile.kRecovered))
                        self.brokenFiles.append(rootFileName)
                else:
                    # missing files are only reported, they are not added to self.brokenFiles
                    print ('\x1b[31mERROR: file is missing: %s\x1b[0m'%rootFileName)

            if self.verbose or self.debug:
                print ('INFO: # files chained: %d'%len(self.chainedFiles))
                if len(self.brokenFiles) > 0:
                    print ('INFO: # files broken : %d'%len(self.brokenFiles))
            
            # no usable file at all: drop the empty chain
            if len(self.chainedFiles) < 1:
                self.tree = None

            if self.tree:
                # cache size in bytes: 50 * 1024 * 1024
                self.tree.SetCacheSize(50*1024*1024)

            # merge nano counting trees
            if self.nanoTreeCounts:
                # TODO: per run if possible, sum LHE weights if present

                # sum the contributions from the subtrees
                # (only branches whose first collected value is exactly int, float or long)
                self.totalNanoTreeCounts = {key: sum(values) for key,values in self.nanoTreeCounts.iteritems() if len(values) > 0 and type(values[0]) in [int, float, long]}

                # print summary table
                countBranches = self.totalNanoTreeCounts.keys()
                # NOTE(review): the next line is indented with tabs; it only lines up
                # with the surrounding code when a tab counts as 8 columns (Python 2).
		print (countBranches)
                # depth: number of collected values of the first numeric branch holding
                # more than one value; stays None otherwise and no per-tree rows are printed
                depth = None
                for key,values in self.nanoTreeCounts.iteritems():
                    if values and len(values)>1 and type(values[0]) in [int, float, long]:
                        depth = len(values)
                        break
                print("-"*160)
                print("tree".ljust(25), ''.join([countBranch.ljust(25) for countBranch in countBranches]))
                if depth:
                    for treeNum in range(depth):
                        print(("%d"%(treeNum+1)).ljust(25),''.join([('%r'%self.nanoTreeCounts[countBranch][treeNum]).ljust(25) for countBranch in countBranches]))
                print("\x1b[34m","sum".ljust(24), ''.join([('%r'%self.totalNanoTreeCounts[countBranch]).ljust(25) for countBranch in countBranches]),"\x1b[0m")
                print("-"*160)

                # fill summed tree (create new tree)
                self.histograms['Runs'] = ROOT.TTree('Runs', 'count histograms for nano')
                # keeps a reference to every buffer handed to Branch() until Fill() is called
                nanoTreeCountBuffers = {}
                for key, value in self.totalNanoTreeCounts.iteritems():
                    print (key,"   ", value, "   here print key and value   ")
                    # if 'run' is the only summed branch, its value is forced to 1
                    # (tab-indented line, see note above)
		    if (key=='run' and len(countBranches)==1): value=1
                    # typeCode is used twice: as array.array type code and as ROOT leaf type
                    # NOTE(review): in Python's array module 'L' is an unsigned long and 'f' a
                    # C float; confirm both match the ROOT leaf types 'L' and 'f' as intended.
                    if type(value) == int:
                        # 64 bit signed int 
                        typeCode = 'L'
                    elif type(value) == long:
                        typeCode = 'L'
                    elif type(value) == float:
                        typeCode = 'f'
                    nanoTreeCountBuffers[key] = array.array(typeCode, [value])
                    self.histograms['Runs'].Branch(key, nanoTreeCountBuffers[key], '{name}/{typeCode}'.format(name=key, typeCode=typeCode))
                self.histograms['Runs'].Fill()