class PostProcessor:
    """Drive the per-file skim / friend-tree production.

    For every input file ``run`` opens the "Events" (or, failing that,
    "Friends") tree, applies the JSON/cut pre-selection via ``preSkim``,
    optionally runs the configured modules through ``eventLoop`` and writes
    one output file per input file into ``outputDir``.
    """

    def __init__(self, outputDir, inputFiles, cut=None, branchsel=None,
                 modules=None, compression="LZMA:9", friend=False,
                 postfix=None, jsonInput=None, noOut=False, justcount=False,
                 provenance=False, haddFileName=None, fwkJobReport=False,
                 histFileName=None, histDirName=None, outputbranchsel=None,
                 maxEntries=None, firstEntry=0, prefetch=False,
                 longTermCache=False):
        """Store the job configuration.

        ``modules`` defaults to a fresh empty list for each instance (a
        literal ``[]`` default would be shared between instances).
        ``compression`` is either ``"none"`` or ``"<ALGO>:<level>"``.
        A false ``maxEntries`` means "no limit".
        """
        self.outputDir = outputDir
        self.inputFiles = inputFiles
        self.cut = cut
        # Keep the caller's list object when one is given.
        self.modules = [] if modules is None else modules
        self.compression = compression
        self.postfix = postfix
        self.json = jsonInput
        self.noOut = noOut
        self.friend = friend
        self.justcount = justcount
        self.provenance = provenance
        self.jobReport = JobReport() if fwkJobReport else None
        self.haddFileName = haddFileName
        self.histFile = None
        self.histDirName = None
        if self.jobReport and not self.haddFileName:
            print("Because you requested a FJR we assume you want the final "
                  "hadd. No name specified for the output file, will use tree.root")
            self.haddFileName = "tree.root"
        self.branchsel = BranchSelection(branchsel) if branchsel else None
        if outputbranchsel is not None:
            self.outputbranchsel = BranchSelection(outputbranchsel)
        elif branchsel is not None:
            # Use the same branches in the output as in input
            self.outputbranchsel = BranchSelection(branchsel)
        else:
            self.outputbranchsel = None
        self.histFileName = histFileName
        self.histDirName = histDirName
        # 2^63 - 1, largest int64
        self.maxEntries = maxEntries if maxEntries else 9223372036854775807
        self.firstEntry = firstEntry
        self.prefetch = prefetch  # prefetch files to TMPDIR using xrdcp
        # keep cached files across runs (it's then up to you to clean up the temp)
        self.longTermCache = longTermCache

    def prefetchFile(self, fname, verbose=True):
        """Copy a ``root://`` file to ``$TMPDIR`` (default ``/tmp``) with xrdcp.

        Returns ``(path_to_read, delete_after_use)``.  Non-remote names are
        returned untouched.  If the copy fails the remote name is returned so
        the job can still read the file remotely.
        """
        tmpdir = os.environ.get('TMPDIR', "/tmp")
        if not fname.startswith("root://"):
            return fname, False
        if self.longTermCache:
            # hashlib needs bytes; a text name must be encoded first.
            raw = fname if isinstance(fname, bytes) else fname.encode("utf-8")
            rndchars = "long_cache-id%d-%s" % (
                os.getuid(), hashlib.sha1(raw).hexdigest())
        else:
            # bytearray yields ints on both Python 2 and Python 3.
            rndchars = "".join("%x" % b for b in bytearray(os.urandom(8)))
        localfile = "%s/%s-%s.root" % (
            tmpdir, os.path.basename(fname).replace(".root", ""), rndchars)
        if self.longTermCache and os.path.exists(localfile):
            if verbose:
                print("Filename %s is already available in local path %s "
                      % (fname, localfile))
            return localfile, False
        try:
            if verbose:
                print("Filename %s is remote, will do a copy to local path %s"
                      % (fname, localfile))
            start = time.time()
            subprocess.check_output(["xrdcp", "-f", "-N", fname, localfile])
            if verbose:
                print("Time used for transferring the file locally: %.2f s"
                      % (time.time() - start))
            return localfile, (not self.longTermCache)
        except Exception:
            # Best effort: any copy failure falls back to remote reading, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            if verbose:
                print("Error: could not save file locally, will run from remote")
            if os.path.exists(localfile):
                if verbose:
                    print("Deleting partially transferred file %s" % localfile)
                try:
                    os.unlink(localfile)
                except OSError:
                    pass
            return fname, False

    def run(self):
        """Process every input file and, if requested, hadd the outputs.

        Raises RuntimeError for an unsupported compression algorithm, for
        ``noOut`` without modules, and when only one of ``histFileName`` /
        ``histDirName`` is given.
        """
        if self.postfix is not None:
            outpostfix = self.postfix
        else:
            outpostfix = "_Friend" if self.friend else "_Skim"

        if not self.noOut:
            if self.compression != "none":
                ROOT.gInterpreter.ProcessLine("#include <Compression.h>")
                (algo, level) = self.compression.split(":")
                compressionLevel = int(level)
                if algo == "LZMA":
                    compressionAlgo = ROOT.ROOT.kLZMA
                elif algo == "ZLIB":
                    compressionAlgo = ROOT.ROOT.kZLIB
                elif algo == "LZ4":
                    compressionAlgo = ROOT.ROOT.kLZ4
                else:
                    raise RuntimeError("Unsupported compression %s" % algo)
            else:
                compressionLevel = 0
            print("Will write selected trees to " + self.outputDir)
            if not self.justcount:
                if not os.path.exists(self.outputDir):
                    os.system("mkdir -p " + self.outputDir)
        else:
            compressionLevel = 0

        if self.noOut and len(self.modules) == 0:
            raise RuntimeError(
                "Running with --noout and no modules does nothing!")

        # Open histogram file, if desired
        haveHistFile = self.histFileName is not None
        haveHistDir = self.histDirName is not None
        if haveHistFile != haveHistDir:
            raise RuntimeError(
                "Must specify both histogram file and histogram directory!")
        elif haveHistFile and haveHistDir:
            self.histFile = ROOT.TFile.Open(self.histFileName, "RECREATE")
        else:
            self.histFile = None

        for m in self.modules:
            if hasattr(m, 'writeHistFile') and m.writeHistFile:
                m.beginJob(histFile=self.histFile,
                           histDirName=self.histDirName)
            else:
                m.beginJob()

        # Without modules there is no event loop: the tree is cloned as is.
        fullClone = (len(self.modules) == 0)
        outFileNames = []
        t0 = time.time()
        totEntriesRead = 0
        for fname in self.inputFiles:
            # "main.root,friend1.root,..." attaches the extra files as friends.
            ffnames = []
            if "," in fname:
                fnames = fname.split(',')
                fname, ffnames = fnames[0], fnames[1:]

            # open input file
            if self.prefetch:
                ftoread, toBeDeleted = self.prefetchFile(fname)
                inFile = ROOT.TFile.Open(ftoread)
            else:
                inFile = ROOT.TFile.Open(fname)

            # get input tree
            inTree = inFile.Get("Events")
            # `==` is deliberate: a missing key is expected to come back as a
            # PyROOT null proxy, which compares equal to None but is not None.
            if inTree == None:  # noqa: E711
                inTree = inFile.Get("Friends")
            nEntries = min(inTree.GetEntries() - self.firstEntry,
                           self.maxEntries)
            totEntriesRead += nEntries

            # pre-skimming
            elist, jsonFilter = preSkim(inTree, self.json, self.cut,
                                        maxEntries=self.maxEntries,
                                        firstEntry=self.firstEntry)
            nPreselected = elist.GetN() if elist else nEntries
            percent = nPreselected / (0.01 * nEntries) if nEntries else 0
            if self.justcount:
                print('Would select %d / %d entries from %s (%.2f%%)'
                      % (nPreselected, nEntries, fname, percent))
                if self.prefetch:
                    if toBeDeleted:
                        os.unlink(ftoread)
                continue
            print('Pre-select %d entries out of %s (%.2f%%)'
                  % (nPreselected, nEntries, percent))

            # The lists keep the friend files and trees referenced.
            inAddFiles = []
            inAddTrees = []
            for ffname in ffnames:
                inAddFiles.append(ROOT.TFile.Open(ffname))
                inAddTree = inAddFiles[-1].Get("Events")
                if inAddTree == None:  # noqa: E711 (null proxy, see above)
                    inAddTree = inAddFiles[-1].Get("Friends")
                inAddTrees.append(inAddTree)
                inTree.AddFriend(inAddTree)

            if fullClone:
                # no need of a reader (no event loop), but set up the elist if available
                if elist:
                    inTree.SetEntryList(elist)
            else:
                # initialize reader
                inTree = InputTree(inTree, elist)

            # prepare output file
            if not self.noOut:
                outFileName = os.path.join(
                    self.outputDir,
                    os.path.basename(fname).replace(".root",
                                                    outpostfix + ".root"))
                outFile = ROOT.TFile.Open(outFileName, "RECREATE", "",
                                          compressionLevel)
                outFileNames.append(outFileName)
                if compressionLevel:
                    outFile.SetCompressionAlgorithm(compressionAlgo)
                # prepare output tree
                if self.friend:
                    outTree = FriendOutput(inFile, inTree, outFile)
                else:
                    outTree = FullOutput(
                        inFile,
                        inTree,
                        outFile,
                        branchSelection=self.branchsel,
                        outputbranchSelection=self.outputbranchsel,
                        fullClone=fullClone,
                        maxEntries=self.maxEntries,
                        firstEntry=self.firstEntry,
                        jsonFilter=jsonFilter,
                        provenance=self.provenance)
            else:
                outFile = None
                outTree = None
                if self.branchsel:
                    self.branchsel.selectBranches(inTree)

            # process events, if needed
            if not fullClone:
                if nEntries > 0 and not elist:
                    eventRange = range(self.firstEntry,
                                       self.firstEntry + nEntries)
                else:
                    eventRange = None
                (nall, npass, timeLoop) = eventLoop(
                    self.modules, inFile, outFile, inTree, outTree,
                    eventRange=eventRange, maxEvents=self.maxEntries)
                print('Processed %d preselected entries from %s (%s entries). Finally selected %d entries'
                      % (nall, fname, nEntries, npass))
            else:
                nall = nEntries
                nOut = outTree.tree().GetEntries()
                print('Selected %d / %d entries from %s (%.2f%%)'
                      % (nOut, nall, fname,
                         nOut / (0.01 * nall) if nall else 0))

            # now write the output
            if not self.noOut:
                outTree.write()
                outFile.Close()
                print("Done %s" % outFileName)
            if self.jobReport:
                self.jobReport.addInputFile(fname, nall)
            if self.prefetch:
                if toBeDeleted:
                    os.unlink(ftoread)

        for m in self.modules:
            m.endJob()

        elapsed = time.time() - t0
        print("Total time %.1f sec. to process %i events. Rate = %.1f Hz."
              % (elapsed, totEntriesRead, totEntriesRead / (time.time() - t0)))

        if self.haddFileName:
            # Prefer a haddnano.py in the working directory, else rely on PATH.
            haddnano = "./haddnano.py" if os.path.isfile(
                "./haddnano.py") else "haddnano.py"
            os.system("%s %s %s" % (haddnano, self.haddFileName,
                                    " ".join(outFileNames)))
        if self.jobReport:
            self.jobReport.addOutputFile(self.haddFileName)
            self.jobReport.save()
def HarvestNanoAOD(inFileList, outFilePath, sample):
    """Flatten NanoAOD "Events" trees into a single ``TreeFatJet`` tree.

    inFileList  -- paths added to a ``TChain("Events")``.
    outFilePath -- ROOT file that is (re)created to hold the output tree.
    sample      -- sample name; if it contains "QCD" the GenPart branches are
                   neither booked nor filled and ``branchSel_QCD.txt`` is used
                   instead of ``branchSel.txt``.

    NOTE(review): the original indentation was lost, so only the GenPart
    sections are treated as guarded by the QCD check here -- confirm.
    NOTE(review): nothing below prevents a collection longer than its
    ``*SizeMax`` from indexing past the booked buffer -- confirm how the
    ``book*`` helpers size their arrays.
    """
    try:
        lazy_range = xrange  # Python 2: avoid building a huge list
    except NameError:
        lazy_range = range  # Python 3: range is already lazy

    #
    # Create the output file
    #
    print("Create Output File: %s" % (outFilePath))
    f = ROOT.TFile(outFilePath, "RECREATE")
    f.cd()

    #
    # Initialize the tree
    #
    treeName = "TreeFatJet"
    isMC_QCD = "QCD" in sample
    print("Create Output Tree: %s" % (treeName))
    TreeFatJet = ROOT.TTree(treeName, treeName)

    def floatArray(name, counter, size):
        # Book a float array branch on the output tree.
        return bookFloatArrayBranch(TreeFatJet, name, counter, size)

    def intArray(name, counter, size):
        # Book an int array branch on the output tree.
        return bookIntArrayBranch(TreeFatJet, name, counter, size)

    #
    # FatJet branch
    #
    nFatJetSizeMax = 25
    nFatJetString = 'nFatJet'
    nFatJet = bookIntBranch(TreeFatJet, nFatJetString)
    FatJetPt = floatArray('FatJet_pt', nFatJetString, nFatJetSizeMax)
    FatJetEta = floatArray('FatJet_eta', nFatJetString, nFatJetSizeMax)
    FatJetPhi = floatArray('FatJet_phi', nFatJetString, nFatJetSizeMax)
    FatJetM = floatArray('FatJet_mass', nFatJetString, nFatJetSizeMax)
    FatJetTau21 = floatArray('FatJet_tau21', nFatJetString, nFatJetSizeMax)
    FatJetTau31 = floatArray('FatJet_tau31', nFatJetString, nFatJetSizeMax)
    FatJetTau32 = floatArray('FatJet_tau32', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagTvsQCD = floatArray(
        'FatJet_deepTag_TvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagWvsQCD = floatArray(
        'FatJet_deepTag_WvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagZvsQCD = floatArray(
        'FatJet_deepTag_ZvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagQCD = floatArray(
        'FatJet_deepTag_QCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagQCDOthers = floatArray(
        'FatJet_deepTag_QCDothers', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagMDTvsQCD = floatArray(
        'FatJet_deepTagMD_TvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagMDWvsQCD = floatArray(
        'FatJet_deepTagMD_WvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetDeepTagMDZvsQCD = floatArray(
        'FatJet_deepTagMD_ZvsQCD', nFatJetString, nFatJetSizeMax)
    FatJetMSoftDrop = floatArray(
        'FatJet_msoftdrop', nFatJetString, nFatJetSizeMax)
    FatJetRawFactor = floatArray(
        'FatJet_rawFactor', nFatJetString, nFatJetSizeMax)
    FatJetJetId = intArray('FatJet_jetId', nFatJetString, nFatJetSizeMax)
    FatJetSubJetIdx1 = intArray(
        'FatJet_subJetIdx1', nFatJetString, nFatJetSizeMax)
    FatJetSubJetIdx2 = intArray(
        'FatJet_subJetIdx2', nFatJetString, nFatJetSizeMax)
    FatJetGenJetAK8Idx = intArray(
        'FatJet_genJetAK8Idx', nFatJetString, nFatJetSizeMax)

    #
    # GenPart branch
    #
    if not isMC_QCD:
        nGenPartSizeMax = 1000
        nGenPartString = 'nGenPart'
        nGenPart = bookIntBranch(TreeFatJet, nGenPartString)
        GenPartPt = floatArray('GenPart_pt', nGenPartString, nGenPartSizeMax)
        GenPartEta = floatArray('GenPart_eta', nGenPartString, nGenPartSizeMax)
        GenPartPhi = floatArray('GenPart_phi', nGenPartString, nGenPartSizeMax)
        GenPartM = floatArray('GenPart_mass', nGenPartString, nGenPartSizeMax)
        GenPartPdgId = intArray(
            'GenPart_pdgId', nGenPartString, nGenPartSizeMax)
        GenPartStatus = intArray(
            'GenPart_status', nGenPartString, nGenPartSizeMax)
        GenPartStatusFlags = intArray(
            'GenPart_statusFlags', nGenPartString, nGenPartSizeMax)
        GenPartGenPartIdxMother = intArray(
            'GenPart_genPartIdxMother', nGenPartString, nGenPartSizeMax)

    #
    # GenJetAK8 branch
    #
    nGenJetAK8SizeMax = 25
    nGenJetAK8String = 'nGenJetAK8'
    nGenJetAK8 = bookIntBranch(TreeFatJet, nGenJetAK8String)
    GenJetAK8Pt = floatArray(
        'GenJetAK8_pt', nGenJetAK8String, nGenJetAK8SizeMax)
    GenJetAK8Eta = floatArray(
        'GenJetAK8_eta', nGenJetAK8String, nGenJetAK8SizeMax)
    GenJetAK8Phi = floatArray(
        'GenJetAK8_phi', nGenJetAK8String, nGenJetAK8SizeMax)
    GenJetAK8M = floatArray(
        'GenJetAK8_mass', nGenJetAK8String, nGenJetAK8SizeMax)
    GenJetAK8HadronFlavour = intArray(
        'GenJetAK8_hadronFlavour', nGenJetAK8String, nGenJetAK8SizeMax)
    GenJetAK8PartonFlavour = intArray(
        'GenJetAK8_partonFlavour', nGenJetAK8String, nGenJetAK8SizeMax)

    #
    # Subjet branch
    #
    nSubJetSizeMax = 50
    nSubJetString = 'nSubJet'
    nSubJet = bookIntBranch(TreeFatJet, nSubJetString)
    SubJetPt = floatArray('SubJet_pt', nSubJetString, nSubJetSizeMax)
    SubJetEta = floatArray('SubJet_eta', nSubJetString, nSubJetSizeMax)
    SubJetPhi = floatArray('SubJet_phi', nSubJetString, nSubJetSizeMax)
    SubJetM = floatArray('SubJet_mass', nSubJetString, nSubJetSizeMax)
    SubJetRawFactor = floatArray(
        'SubJet_rawFactor', nSubJetString, nSubJetSizeMax)
    SubJetNBHadrons = intArray(
        'SubJet_nBHadrons', nSubJetString, nSubJetSizeMax)
    SubJetNCHadrons = intArray(
        'SubJet_nCHadrons', nSubJetString, nSubJetSizeMax)

    #
    # SubGenJetAK8 branch
    #
    nSubGenJetAK8SizeMax = 50
    nSubGenJetAK8String = 'nSubGenJetAK8'
    nSubGenJetAK8 = bookIntBranch(TreeFatJet, nSubGenJetAK8String)
    SubGenJetAK8Pt = floatArray(
        'SubGenJetAK8_pt', nSubGenJetAK8String, nSubGenJetAK8SizeMax)
    SubGenJetAK8Eta = floatArray(
        'SubGenJetAK8_eta', nSubGenJetAK8String, nSubGenJetAK8SizeMax)
    SubGenJetAK8Phi = floatArray(
        'SubGenJetAK8_phi', nSubGenJetAK8String, nSubGenJetAK8SizeMax)
    SubGenJetAK8M = floatArray(
        'SubGenJetAK8_mass', nSubGenJetAK8String, nSubGenJetAK8SizeMax)

    #
    # PV branch
    #
    PVnpvs = bookIntBranch(TreeFatJet, 'nPVnpvs')
    PVnpvsGood = bookIntBranch(TreeFatJet, 'nPVnpvsGood')
    PileUpNTrueInt = bookFloatBranch(TreeFatJet, 'nPileUpNTrueInt')
    PileUpNPU = bookIntBranch(TreeFatJet, 'nPileUpNPU')

    #
    # SetupTChain
    #
    tree = ROOT.TChain("Events")
    for inFilePath in inFileList:
        print('Adding files: %s' % (inFilePath))
        tree.Add(inFilePath)
    tree.ls()

    #
    # Use TChain and Setup TTreeReader.
    #
    inTree = InputTree(tree)

    #
    # Restrict the input branches that are read
    #
    if isMC_QCD:
        branchSel = BranchSelection("branchSel_QCD.txt")
    else:
        branchSel = BranchSelection("branchSel.txt")
    branchSel.selectBranches(inTree)
    numEvents = inTree.GetEntries()

    #
    # Set max number of events to process
    # Set to -1 if you want to run over all events
    #
    maxevents = -1
    # maxevents = 1000

    #
    # Loop over events
    #
    print(numEvents)
    for iev in lazy_range(0, numEvents):
        # `>=` so that exactly `maxevents` events (indices 0..maxevents-1)
        # are processed; `>` let one extra event through.
        if maxevents > 0 and iev >= maxevents:
            break
        if iev % 1000 == 0:
            print("Processing event %d out of %d" % (iev, numEvents))

        #
        # Load Event
        #
        evt = Event(inTree, iev)

        #
        # Loop over fatjets
        #
        fatjets = Collection(evt, "FatJet")
        nFatJet[0] = 0
        for i, fj in enumerate(fatjets):
            fj_p4 = fj.p4()
            FatJetPt[i] = fj_p4.Pt()
            FatJetEta[i] = fj_p4.Eta()
            FatJetPhi[i] = fj_p4.Phi()
            FatJetM[i] = fj_p4.M()
            # N-subjettiness ratios; -1 flags an undefined ratio.
            if fj.tau1 > 0:
                FatJetTau21[i] = fj.tau2 / fj.tau1
                FatJetTau31[i] = fj.tau3 / fj.tau1
            else:
                FatJetTau21[i] = -1
                FatJetTau31[i] = -1
            if fj.tau2 > 0:
                FatJetTau32[i] = fj.tau3 / fj.tau2
            else:
                FatJetTau32[i] = -1
            FatJetDeepTagMDTvsQCD[i] = fj.deepTagMD_TvsQCD
            FatJetDeepTagMDWvsQCD[i] = fj.deepTagMD_WvsQCD
            FatJetDeepTagMDZvsQCD[i] = fj.deepTagMD_ZvsQCD
            FatJetDeepTagTvsQCD[i] = fj.deepTag_TvsQCD
            FatJetDeepTagWvsQCD[i] = fj.deepTag_WvsQCD
            FatJetDeepTagZvsQCD[i] = fj.deepTag_ZvsQCD
            FatJetDeepTagQCD[i] = fj.deepTag_QCD
            FatJetDeepTagQCDOthers[i] = fj.deepTag_QCDothers
            FatJetMSoftDrop[i] = fj.msoftdrop
            FatJetRawFactor[i] = fj.rawFactor
            FatJetJetId[i] = fj.jetId
            FatJetSubJetIdx1[i] = fj.subJetIdx1
            FatJetSubJetIdx2[i] = fj.subJetIdx2
            FatJetGenJetAK8Idx[i] = fj.genJetAK8Idx
            nFatJet[0] += 1

        #
        # Loop over genparts
        #
        if not isMC_QCD:
            particles = Collection(evt, "GenPart")
            nGenPart[0] = 0
            for i, gp in enumerate(particles):
                GenPartPt[i] = gp.pt
                GenPartEta[i] = gp.eta
                GenPartPhi[i] = gp.phi
                GenPartM[i] = gp.mass
                GenPartPdgId[i] = gp.pdgId
                GenPartStatus[i] = gp.status
                GenPartStatusFlags[i] = gp.statusFlags
                GenPartGenPartIdxMother[i] = gp.genPartIdxMother
                nGenPart[0] += 1

        #
        # Loop over GenJetAK8
        #
        jets = Collection(evt, "GenJetAK8")
        nGenJetAK8[0] = 0
        for i, gj in enumerate(jets):
            GenJetAK8Pt[i] = gj.pt
            GenJetAK8Eta[i] = gj.eta
            GenJetAK8Phi[i] = gj.phi
            GenJetAK8M[i] = gj.mass
            GenJetAK8HadronFlavour[i] = gj.hadronFlavour
            GenJetAK8PartonFlavour[i] = gj.partonFlavour
            nGenJetAK8[0] += 1

        #
        # Loop over SubJet
        #
        subjets = Collection(evt, "SubJet")
        nSubJet[0] = 0
        for i, sj in enumerate(subjets):
            SubJetPt[i] = sj.pt
            SubJetEta[i] = sj.eta
            SubJetPhi[i] = sj.phi
            SubJetM[i] = sj.mass
            SubJetRawFactor[i] = sj.rawFactor
            SubJetNBHadrons[i] = sj.nBHadrons
            SubJetNCHadrons[i] = sj.nCHadrons
            nSubJet[0] += 1

        #
        # Loop over SubGenJetAK8
        #
        subjets = Collection(evt, "SubGenJetAK8")
        nSubGenJetAK8[0] = 0
        for i, sj in enumerate(subjets):
            SubGenJetAK8Pt[i] = sj.pt
            SubGenJetAK8Eta[i] = sj.eta
            SubGenJetAK8Phi[i] = sj.phi
            SubGenJetAK8M[i] = sj.mass
            nSubGenJetAK8[0] += 1

        #
        # Per-event PV and pileup quantities
        #
        PVnpvs[0] = evt.PV_npvs
        PVnpvsGood[0] = evt.PV_npvsGood
        PileUpNTrueInt[0] = evt.Pileup_nTrueInt
        PileUpNPU[0] = evt.Pileup_nPU

        #
        # Fill the tree for this event
        #
        TreeFatJet.Fill()

    #
    # Save the output ttree in the output file
    #
    print("Write tree to file")
    f.Write()

    #
    # Gracefully close the output file
    #
    print("Closing output")
    f.Close()
#now we will demonstrate running outside the Draw command which is more flexable #however this is slooow so we do things to speed it up like #pre skim things as it is slower with an entry list #and dropping branches we dont need so we dont waste time reading them #note I highly suggest you look into RDataFrames as that should be much faster Events.Draw( ">>elist", "Sum$(Electron_pt>25 && abs(Electron_eta+Electron_deltaEtaSC)<1.4442)>=1", "entrylist goff", max_events) elist = ROOT.gDirectory.Get('elist') elist.SetDirectory(0) #removing it from the file not to write it out Events.SetEntryList(elist) branchsel = BranchSelection( "EgammaUser/EgammaDAS2020/data/nano_electron_branches.txt") branchsel.selectBranches(Events) nr_events = elist.GetN() #2nd way, bare event loop for event_nr in range(0, nr_events): entry_nr = Events.GetEntryNumber(event_nr) Events.GetEntry(entry_nr) event = Events if event_nr % args.report == 0: print("processing event {} / {} {}".format(event_nr, nr_events, time.ctime())) for ele_nr in range(0, event.nElectron): if event.Electron_pt[ele_nr] > 25 and abs( event.Electron_eta[ele_nr] + event.Electron_deltaEtaSC[ele_nr]) < 1.4442: if event.Electron_genPartIdx[ele_nr] >= 0:
class PostProcessor:
    """Drive the per-file skim / friend-tree production.

    For every input file ``run`` opens the "Events" (or, failing that,
    "Friends") tree, optionally builds the ``hGenWeights`` / ``hNumWeights``
    histograms, applies the JSON/cut pre-selection via ``preSkim``,
    optionally runs the configured modules through ``eventLoop`` and writes
    one output file per input file into ``outputDir``.
    """

    def __init__(self, outputDir, inputFiles, cut=None, branchsel=None,
                 modules=None, compression="LZMA:9", friend=False,
                 postfix=None, jsonInput=None, noOut=False, justcount=False,
                 provenance=False, haddFileName=None, fwkJobReport=False,
                 histFileName=None, histDirName=None, outputbranchsel=None,
                 maxEntries=None, firstEntry=0, prefetch=False,
                 longTermCache=False, saveHistoGenWeights=False,
                 allowNoPostfix=False):
        """Store the job configuration.

        ``modules`` defaults to a fresh empty list for each instance (a
        literal ``[]`` default would be shared between instances).
        ``compression`` is either ``"none"`` or ``"<ALGO>:<level>"``.
        A false ``maxEntries`` means "no limit".  With ``allowNoPostfix`` and
        no explicit ``postfix`` the output keeps the input base name.
        """
        self.outputDir = outputDir
        self.inputFiles = inputFiles
        self.cut = cut
        # Keep the caller's list object when one is given.
        self.modules = [] if modules is None else modules
        self.compression = compression
        self.postfix = postfix
        self.allowNoPostfix = allowNoPostfix
        self.json = jsonInput
        self.noOut = noOut
        self.friend = friend
        self.justcount = justcount
        self.provenance = provenance
        self.jobReport = JobReport() if fwkJobReport else None
        self.haddFileName = haddFileName
        self.saveHistoGenWeights = saveHistoGenWeights
        self.histFile = None
        self.histDirName = None
        if self.jobReport and not self.haddFileName:
            print("Because you requested a FJR we assume you want the final "
                  "hadd. No name specified for the output file, will use tree.root")
            self.haddFileName = "tree.root"
        self.branchsel = BranchSelection(branchsel) if branchsel else None
        if outputbranchsel is not None:
            self.outputbranchsel = BranchSelection(outputbranchsel)
        elif branchsel is not None:
            # Use the same branches in the output as in input
            self.outputbranchsel = BranchSelection(branchsel)
        else:
            self.outputbranchsel = None
        self.histFileName = histFileName
        self.histDirName = histDirName
        # 2^63 - 1, largest int64
        self.maxEntries = maxEntries if maxEntries else 9223372036854775807
        self.firstEntry = firstEntry
        self.prefetch = prefetch  # prefetch files to TMPDIR using xrdcp
        # keep cached files across runs (it's then up to you to clean up the temp)
        self.longTermCache = longTermCache

    def prefetchFile(self, fname, verbose=True):
        """Copy a ``root://`` file to ``$TMPDIR`` (default ``/tmp``) with xrdcp.

        Returns ``(path_to_read, delete_after_use)``.  Non-remote names are
        returned untouched.  If the copy fails the remote name is returned so
        the job can still read the file remotely.
        """
        tmpdir = os.environ.get('TMPDIR', "/tmp")
        if not fname.startswith("root://"):
            return fname, False
        if self.longTermCache:
            # hashlib needs bytes; a text name must be encoded first.
            raw = fname if isinstance(fname, bytes) else fname.encode("utf-8")
            rndchars = "long_cache-id%d-%s" % (
                os.getuid(), hashlib.sha1(raw).hexdigest())
        else:
            # bytearray yields ints on both Python 2 and Python 3.
            rndchars = "".join("%x" % b for b in bytearray(os.urandom(8)))
        localfile = "%s/%s-%s.root" % (
            tmpdir, os.path.basename(fname).replace(".root", ""), rndchars)
        if self.longTermCache and os.path.exists(localfile):
            if verbose:
                print("Filename %s is already available in local path %s "
                      % (fname, localfile))
            return localfile, False
        try:
            if verbose:
                print("Filename %s is remote, will do a copy to local path %s"
                      % (fname, localfile))
            start = time.time()
            subprocess.check_output(["xrdcp", "-f", "-N", fname, localfile])
            if verbose:
                print("Time used for transferring the file locally: %.2f s"
                      % (time.time() - start))
            return localfile, (not self.longTermCache)
        except Exception:
            # Best effort: any copy failure falls back to remote reading, but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            if verbose:
                print("Error: could not save file locally, will run from remote")
            if os.path.exists(localfile):
                if verbose:
                    print("Deleting partially transferred file %s" % localfile)
                try:
                    os.unlink(localfile)
                except OSError:
                    pass
            return fname, False

    def run(self):
        """Process every input file and, if requested, hadd the outputs.

        Raises RuntimeError for an unsupported compression algorithm, for
        ``noOut`` without modules, when only one of ``histFileName`` /
        ``histDirName`` is given, and when the gen-weight histograms are
        requested but "Events" does not hold ``Runs.genEventCount`` entries.
        """
        if self.postfix is not None:
            outpostfix = self.postfix
        elif self.allowNoPostfix:
            outpostfix = ""
        else:
            outpostfix = "_Friend" if self.friend else "_Skim"

        if not self.noOut:
            if self.compression != "none":
                ROOT.gInterpreter.ProcessLine("#include <Compression.h>")
                (algo, level) = self.compression.split(":")
                compressionLevel = int(level)
                if algo == "LZMA":
                    compressionAlgo = ROOT.ROOT.kLZMA
                elif algo == "ZLIB":
                    compressionAlgo = ROOT.ROOT.kZLIB
                elif algo == "LZ4":
                    compressionAlgo = ROOT.ROOT.kLZ4
                else:
                    raise RuntimeError("Unsupported compression %s" % algo)
            else:
                compressionLevel = 0
            print("Will write selected trees to " + self.outputDir)
            if not self.justcount:
                if not os.path.exists(self.outputDir):
                    os.system("mkdir -p " + self.outputDir)
        else:
            compressionLevel = 0

        if self.noOut and len(self.modules) == 0:
            raise RuntimeError(
                "Running with --noout and no modules does nothing!")

        # Open histogram file, if desired
        haveHistFile = self.histFileName is not None
        haveHistDir = self.histDirName is not None
        if haveHistFile != haveHistDir:
            raise RuntimeError(
                "Must specify both histogram file and histogram directory!")
        elif haveHistFile and haveHistDir:
            # Both given: this is the only case in which the file is opened.
            self.histFile = ROOT.TFile.Open(self.histFileName, "RECREATE")
        else:
            self.histFile = None

        for m in self.modules:
            if hasattr(m, 'writeHistFile') and m.writeHistFile:
                m.beginJob(histFile=self.histFile,
                           histDirName=self.histDirName)
            else:
                m.beginJob()

        # Without modules there is no event loop: the tree is cloned as is.
        fullClone = (len(self.modules) == 0)
        outFileNames = []
        t0 = time.time()
        totEntriesRead = 0
        for fname in self.inputFiles:
            # "main.root,friend1.root,..." attaches the extra files as friends.
            ffnames = []
            if "," in fname:
                fnames = fname.split(',')
                fname, ffnames = fnames[0], fnames[1:]

            # open input file
            if self.prefetch:
                ftoread, toBeDeleted = self.prefetchFile(fname)
                inFile = ROOT.TFile.Open(ftoread)
            else:
                inFile = ROOT.TFile.Open(fname)
            if not inFile:
                print('ERROR: file does not exist, check!')
                print(' filename: %s' % fname)
                # NOTE(review): this terminates with exit status 0 although it
                # is an error path -- confirm that callers rely on it.
                exit(0)

            # get input tree
            inTree = inFile.Get("Events")
            # `==` is deliberate: a missing key is expected to come back as a
            # PyROOT null proxy, which compares equal to None but is not
            # None, so an `is None` test would never trigger the fallback.
            if inTree == None:  # noqa: E711
                inTree = inFile.Get("Friends")
            nEntries = min(inTree.GetEntries() - self.firstEntry,
                           self.maxEntries)

            # first check that the histogram with weights is not already in the file
            hasWeightHistograms = False
            keys = inFile.GetListOfKeys()
            if keys.Contains("hGenWeights") and keys.Contains("hNumWeights"):
                hasWeightHistograms = True
                print("Histogram hGenWeights already exists, I will just copy it without recreating it")

            # Reset per file so nothing undefined or left over from a previous
            # file can be written further down.
            hGenWeights = None
            hNumWeights = None
            if (self.saveHistoGenWeights and inTree.GetName() == "Events"
                    and not hasWeightHistograms):
                print("Histogram hGenWeights does not exist yet, I will create it")
                # check that the tree contains all the original events,
                # otherwise the sum of gen weights will miss some
                tmpTreeRuns = inFile.Get("Runs")
                for ievt, event in enumerate(tmpTreeRuns):
                    if ievt:
                        break  # only need first event (but there should be only 1 here)
                    nGenEvents = event.genEventCount
                    if nGenEvents != inTree.GetEntries():
                        raise RuntimeError(
                            "I am creating the histogram with genWeight, but tree Events has less entries than genEventCount in tree Runs (%s instead of %s). The sum of weights will thus be wrong, please check"
                            % (str(inTree.GetEntries()), str(nGenEvents)))
                # saving distribution of genWeight for offline usage
                # idea is to fill the distribution of Log10(genWeight) with the
                # sign, so to have a histogram from about -10 to 10 with about
                # 10k bins (genWeights can take values spanning several orders
                # of magnitude, especially for fancy weights); then one can
                # compute the sum of genWeight in a given range using its
                # integral (using Log10(threshold)).
                # This somehow relies on having always |genWeight|>1, should it
                # be < 1 the Log would change the sign.  So for the purpose of
                # choosing the bin to be filled, we use |value| or 1.001,
                # whatever is larger (this will not affect the integral).
                # Then, need a second histogram to keep the integer number of
                # events in each bin, so to allow for clipping of large weights.
                hGenWeights = ROOT.TH1D(
                    "hGenWeights", "distribution of Log10(genWeight)",
                    4800, -12.0, 12.0)
                hNumWeights = ROOT.TH1D(
                    "hNumWeights",
                    "distribution of Log10(genWeight) (unweighted)",
                    4800, -12.0, 12.0)
                inTree.Draw(
                    "TMath::Sign(1.0,genWeight)*TMath::Log10(max(1.001,abs(genWeight)))>>hGenWeights",
                    "genWeight", "goff", nEntries, self.firstEntry)
                inTree.Draw(
                    "TMath::Sign(1.0,genWeight)*TMath::Log10(max(1.001,abs(genWeight)))>>hNumWeights",
                    "1", "goff", nEntries, self.firstEntry)

            totEntriesRead += nEntries

            # pre-skimming
            elist, jsonFilter = preSkim(inTree, self.json, self.cut,
                                        maxEntries=self.maxEntries,
                                        firstEntry=self.firstEntry)
            nPreselected = elist.GetN() if elist else nEntries
            percent = nPreselected / (0.01 * nEntries) if nEntries else 0
            if self.justcount:
                print('Would select %d / %d entries from %s (%.2f%%)'
                      % (nPreselected, nEntries, fname, percent))
                if self.prefetch:
                    if toBeDeleted:
                        os.unlink(ftoread)
                continue
            print('Pre-select %d entries out of %s (%.2f%%)'
                  % (nPreselected, nEntries, percent))

            # The lists keep the friend files and trees referenced.
            inAddFiles = []
            inAddTrees = []
            for ffname in ffnames:
                inAddFiles.append(ROOT.TFile.Open(ffname))
                inAddTree = inAddFiles[-1].Get("Events")
                if inAddTree == None:  # noqa: E711 (null proxy, see above)
                    inAddTree = inAddFiles[-1].Get("Friends")
                inAddTrees.append(inAddTree)
                inTree.AddFriend(inAddTree)

            if fullClone:
                # no need of a reader (no event loop), but set up the elist if available
                if elist:
                    inTree.SetEntryList(elist)
            else:
                # initialize reader
                inTree = InputTree(inTree, elist)

            # prepare output file
            if not self.noOut:
                outFileName = os.path.join(
                    self.outputDir,
                    os.path.basename(fname).replace(".root",
                                                    outpostfix + ".root"))
                outFile = ROOT.TFile.Open(outFileName, "RECREATE", "",
                                          compressionLevel)
                outFileNames.append(outFileName)
                if compressionLevel:
                    outFile.SetCompressionAlgorithm(compressionAlgo)
                # prepare output tree
                if self.friend:
                    outTree = FriendOutput(inFile, inTree, outFile)
                else:
                    outTree = FullOutput(
                        inFile,
                        inTree,
                        outFile,
                        branchSelection=self.branchsel,
                        outputbranchSelection=self.outputbranchsel,
                        fullClone=fullClone,
                        maxEntries=self.maxEntries,
                        firstEntry=self.firstEntry,
                        jsonFilter=jsonFilter,
                        provenance=self.provenance)
            else:
                outFile = None
                outTree = None
                if self.branchsel:
                    self.branchsel.selectBranches(inTree)

            # process events, if needed
            if not fullClone:
                if nEntries > 0 and not elist:
                    eventRange = range(self.firstEntry,
                                       self.firstEntry + nEntries)
                else:
                    eventRange = None
                (nall, npass, timeLoop) = eventLoop(
                    self.modules, inFile, outFile, inTree, outTree,
                    eventRange=eventRange, maxEvents=self.maxEntries)
                print('Processed %d preselected entries from %s (%s entries). Finally selected %d entries'
                      % (nall, fname, nEntries, npass))
            else:
                nall = nEntries
                nOut = outTree.tree().GetEntries()
                print('Selected %d / %d entries from %s (%.2f%%)'
                      % (nOut, nall, fname,
                         nOut / (0.01 * nall) if nall else 0))

            # now write the output
            if not self.noOut:
                outTree.write()
                # Only histograms built for this very file are written.
                if hGenWeights is not None:
                    hGenWeights.Write(hGenWeights.GetName())
                    hNumWeights.Write(hNumWeights.GetName())
                outFile.Close()
                print("Done %s" % outFileName)
            if self.jobReport:
                self.jobReport.addInputFile(fname, nall)
            if self.prefetch:
                if toBeDeleted:
                    os.unlink(ftoread)

        for m in self.modules:
            m.endJob()

        elapsed = time.time() - t0
        print("Total time %.1f sec. to process %i events. Rate = %.1f Hz."
              % (elapsed, totEntriesRead, totEntriesRead / (time.time() - t0)))

        if self.haddFileName:
            # Prefer a haddnano.py in the working directory, else rely on PATH.
            haddnano = "./haddnano.py" if os.path.isfile(
                "./haddnano.py") else "haddnano.py"
            os.system("%s %s %s" % (haddnano, self.haddFileName,
                                    " ".join(outFileNames)))
        if self.jobReport:
            self.jobReport.addOutputFile(self.haddFileName)
            self.jobReport.save()
class PostProcessor:
    """Drive the per-file skim / friend-tree production.

    For every input file ``run`` opens the "Events" tree, applies the
    JSON/cut pre-selection via ``preSkim``, optionally runs the configured
    modules through ``eventLoop`` and writes one output file per input file
    into ``outputDir``.
    """

    def __init__(self, outputDir, inputFiles, cut=None, branchsel=None,
                 modules=None, compression="LZMA:9", friend=False,
                 postfix=None, jsonInput=None, noOut=False, justcount=False,
                 provenance=False, haddFileName=None, fwkJobReport=False,
                 histFileName=None, histDirName=None, outputbranchsel=None):
        """Store the job configuration.

        ``modules`` defaults to a fresh empty list for each instance (a
        literal ``[]`` default would be shared between instances).
        ``compression`` is either ``"none"`` or ``"<ALGO>:<level>"``.
        """
        self.outputDir = outputDir
        self.inputFiles = inputFiles
        self.cut = cut
        # Keep the caller's list object when one is given.
        self.modules = [] if modules is None else modules
        self.compression = compression
        self.postfix = postfix
        self.json = jsonInput
        self.noOut = noOut
        self.friend = friend
        self.justcount = justcount
        self.provenance = provenance
        self.jobReport = JobReport() if fwkJobReport else None
        self.haddFileName = haddFileName
        self.histFile = None
        self.histDirName = None
        if self.jobReport and not self.haddFileName:
            print("Because you requested a FJR we assume you want the final hadd. No name specified for the output file, will use tree.root")
            self.haddFileName = "tree.root"
        self.branchsel = BranchSelection(branchsel) if branchsel else None
        self.outputbranchsel = BranchSelection(
            outputbranchsel) if outputbranchsel else None
        self.histFileName = histFileName
        self.histDirName = histDirName

    def run(self):
        """Process every input file and, if requested, hadd the outputs.

        Raises RuntimeError for an unsupported compression algorithm, for
        ``noOut`` without modules, and when only one of ``histFileName`` /
        ``histDirName`` is given.
        """
        if self.postfix is not None:
            outpostfix = self.postfix
        else:
            outpostfix = "_Friend" if self.friend else "_Skim"

        if not self.noOut:
            if self.compression != "none":
                ROOT.gInterpreter.ProcessLine("#include <Compression.h>")
                (algo, level) = self.compression.split(":")
                compressionLevel = int(level)
                if algo == "LZMA":
                    compressionAlgo = ROOT.ROOT.kLZMA
                elif algo == "ZLIB":
                    compressionAlgo = ROOT.ROOT.kZLIB
                else:
                    raise RuntimeError("Unsupported compression %s" % algo)
            else:
                compressionLevel = 0
            print("Will write selected trees to " + self.outputDir)
            if not self.justcount:
                if not os.path.exists(self.outputDir):
                    os.system("mkdir -p " + self.outputDir)
        else:
            compressionLevel = 0

        if self.noOut and len(self.modules) == 0:
            raise RuntimeError(
                "Running with --noout and no modules does nothing!")

        # Open histogram file, if desired
        haveHistFile = self.histFileName is not None
        haveHistDir = self.histDirName is not None
        if haveHistFile != haveHistDir:
            raise RuntimeError(
                "Must specify both histogram file and histogram directory!")
        elif haveHistFile and haveHistDir:
            self.histFile = ROOT.TFile.Open(self.histFileName, "RECREATE")
        else:
            self.histFile = None

        for m in self.modules:
            if hasattr(m, 'writeHistFile') and m.writeHistFile:
                m.beginJob(histFile=self.histFile,
                           histDirName=self.histDirName)
            else:
                m.beginJob()

        # Without modules there is no event loop: the tree is cloned as is.
        fullClone = (len(self.modules) == 0)
        outFileNames = []
        # time.clock() is gone from Python 3.8 on; use wall-clock time.
        t0 = time.time()
        totEntriesRead = 0
        for fname in self.inputFiles:
            # open input file
            inFile = ROOT.TFile.Open(fname)

            # get input tree
            inTree = inFile.Get("Events")
            totEntriesRead += inTree.GetEntries()

            # pre-skimming
            elist, jsonFilter = preSkim(inTree, self.json, self.cut)
            nPreselected = elist.GetN() if elist else inTree.GetEntries()
            if self.justcount:
                print('Would select %d entries from %s'
                      % (nPreselected, fname))
                continue
            print('Pre-select %d entries out of %s '
                  % (nPreselected, inTree.GetEntries()))

            if fullClone:
                # no need of a reader (no event loop), but set up the elist if available
                if elist:
                    inTree.SetEntryList(elist)
            else:
                # initialize reader
                inTree = InputTree(inTree, elist)

            # prepare output file
            if not self.noOut:
                outFileName = os.path.join(
                    self.outputDir,
                    os.path.basename(fname).replace(".root",
                                                    outpostfix + ".root"))
                outFile = ROOT.TFile.Open(outFileName, "RECREATE", "",
                                          compressionLevel)
                outFileNames.append(outFileName)
                if compressionLevel:
                    outFile.SetCompressionAlgorithm(compressionAlgo)
                # prepare output tree
                if self.friend:
                    outTree = FriendOutput(inFile, inTree, outFile)
                else:
                    outTree = FullOutput(inFile, inTree, outFile,
                                         branchSelection=self.branchsel,
                                         fullClone=fullClone,
                                         jsonFilter=jsonFilter,
                                         provenance=self.provenance)
            else:
                outFile = None
                outTree = None

            # process events, if needed
            if not fullClone:
                (nall, npass, timeLoop) = eventLoop(
                    self.modules, inFile, outFile, inTree, outTree)
                print('Processed %d preselected entries from %s (%s entries). Finally selected %d entries'
                      % (nall, fname, inTree.GetEntries(), npass))
            else:
                # The job report below needs an entry count in this branch
                # too; without it `nall` is undefined or left over from the
                # previous file.
                nall = inTree.GetEntries()
                print('Selected %d entries from %s'
                      % (outTree.tree().GetEntries(), fname))

            # now write the output
            # With noOut there is no output tree to apply the selection to.
            if self.outputbranchsel and outTree is not None:
                self.outputbranchsel.selectBranches(outTree._tree)
            if not self.noOut:
                outTree.write()
                outFile.Close()
                print("Done %s" % outFileName)
            if self.jobReport:
                self.jobReport.addInputFile(fname, nall)

        for m in self.modules:
            m.endJob()

        print("%s Hz" % (totEntriesRead / (time.time() - t0)))

        if self.haddFileName:
            # FIXME: remove "./" once haddnano.py is distributed with cms releases
            os.system("./haddnano.py %s %s"
                      % (self.haddFileName, " ".join(outFileNames)))
        if self.jobReport:
            self.jobReport.addOutputFile(self.haddFileName)
            self.jobReport.save()