def copy_directory(newdir, olddir, condition=None):
    """Reads all objects from olddir and writes them to newdir.

    newdir, olddir: Directories (inheriting from TDirectory).
    condition: Optional callable. It is called with each key object of
        olddir (not just its name) and must return a true value for the
        key to be copied.

    Keys whose name starts with 'ProcessID' are never copied, whether or
    not a condition is given.

    The condition is only applied to the keys directly inside olddir;
    sub-directories that pass it are copied in full.
    """
    for key in olddir.GetListOfKeys():
        name = key.GetName()

        # Skip the ProcessID bookkeeping entries unconditionally. This test
        # used to be nested inside the "condition is not None" guard, so it
        # silently did nothing when no condition was supplied.
        if name.startswith('ProcessID'):
            continue
        if condition is not None and not condition(key):
            continue

        cl = gROOT.GetClass(key.GetClassName())
        if not cl:
            # class unknown to ROOT: nothing we can read back, skip the key
            continue

        if cl.InheritsFrom(TDirectory.Class()):
            # recurse into the sub-directory (condition intentionally not
            # forwarded, see docstring)
            newsub = newdir.mkdir(name)
            oldsub = olddir.GetDirectory(name)
            copy_directory(newsub, oldsub)
        elif cl.InheritsFrom(TTree.Class()):
            oldtree = olddir.Get(name)
            # make newdir current before cloning so the clone is created there
            newdir.cd()
            newtree = oldtree.CloneTree(-1, 'fast')
            newtree.Write()
        else:
            olddir.cd()
            obj = key.ReadObj()
            newdir.cd()
            obj.Write(name)
            del obj
def merge_root_file(target, source_list):
    """
    Merge the content of all source files into the target directory.

    The keys of the first file in ``source_list`` (at the directory level
    that corresponds to ``target``) drive the merge:

    * histograms (TH1 descendants) get the same-named histogram of every
      other source file added to them and are then written to ``target``;
    * trees are collected into a TChain over all source files and merged
      into the target file;
    * sub-directories are created in ``target`` and merged by calling this
      function recursively;
    * anything else is written to ``target`` unchanged.

    Every key returned by the key list is processed; no de-duplication of
    key cycles is performed.

    :param TFile target: the result ROOT file
    :param TList source_list: list of input files to merge
    """
    logger = get_logger()

    # keep only what follows the first ':' of the target path, i.e. the
    # directory part used to cd() into the matching level of each source
    raw_path = target.GetPath()
    path = raw_path[raw_path.find(":") + 1:]

    first_source = source_list.First()
    first_source.cd(path)
    current_source_dir = gDirectory

    # gain time, do not add the objects in the list in memory
    status = TH1.AddDirectoryStatus()
    TH1.AddDirectory(False)
    try:
        # loop over all keys in this directory
        next_key = TIter(current_source_dir.GetListOfKeys())
        key = next_key()
        while key:
            # read object from first source file
            first_source.cd(path)
            obj = key.ReadObj()
            obj_class = obj.IsA()
            is_tree = obj_class.InheritsFrom(TTree.Class())
            global_chain = None

            if obj_class.InheritsFrom(TH1.Class()):
                # descendant of TH1 -> merge it
                logger.info("Merging histogram %s", obj.GetName())

                # Accumulate straight into obj: it is obj that gets written
                # below, so summing into a separate TH1(...) object would
                # never reach the target file.
                next_source = source_list.After(first_source)
                while next_source:
                    # make sure we are at the correct directory level
                    next_source.cd(path)
                    key2 = gDirectory.GetListOfKeys().FindObject(obj.GetName())
                    if key2:
                        obj.Add(key2.ReadObj())
                    next_source = source_list.After(next_source)

            elif is_tree:
                logger.info("Merging tree %s", obj.GetName())

                # build a chain of this tree over all source files
                global_chain = TChain(obj.GetName())
                global_chain.Add(first_source.GetName())
                next_source = source_list.After(first_source)
                while next_source:
                    global_chain.Add(next_source.GetName())
                    next_source = source_list.After(next_source)

            elif obj_class.InheritsFrom(TDirectory.Class()):
                logger.info("Found subdirectory %s", obj.GetName())

                # create a new subdir of same name and title in the target file
                target.cd()
                new_dir = target.mkdir(obj.GetName(), obj.GetTitle())

                # new_dir is the starting point of another round of merging;
                # it still knows its depth within the target file via
                # GetPath(), which is how the recursion finds its level
                merge_root_file(new_dir, source_list)

            else:
                logger.info("Unknown object type, name: %s, title: %s",
                            obj.GetName(), obj.GetTitle())

            # Store the (merged) object at the current level of the target.
            # This is not persistent until the directory itself is written
            # by target.Write() below.
            target.cd()
            if is_tree:
                # trees live in the chain, not in obj
                global_chain.Merge(target.GetFile(), 0, "keep")
            else:
                obj.Write(key.GetName())

            # move to the next element
            key = next_key()

        # save modifications to target file
        target.SaveSelf(True)
    finally:
        # always restore the global histogram/directory association flag,
        # even if merging failed half-way
        TH1.AddDirectory(status)
    target.Write()
def skim_tree(fname_patts, branches_to_keep, treename="t", fname_out="skim.root", cut_str=""):
    """Skim a tree spread over several input files into one output file.

    fname_patts: iterable of file names/patterns, each handed to TChain.Add.
    branches_to_keep: names of the branches to copy. Empty strings are
        ignored and names missing from the tree of the first input file are
        dropped with a warning. If nothing is left and a cut string is
        given, all branches are kept; if nothing is left and there is no
        cut either, the function prints a message and returns without
        producing any output.
    treename: name of the tree inside the input files.
    fname_out: path of the output file (recreated).
    cut_str: selection string passed to TChain.CopyTree.

    Non-tree objects found in the first input file are copied to the
    output as well. Returns None; progress is reported on stdout.
    """
    import time

    # FWLite has to be loaded and enabled up front (required according to
    # the original author of this function).
    from ROOT import TChain, TFile, gSystem, gROOT, TTree
    gSystem.Load("libFWCoreFWLite.so")
    gSystem.Load("libDataFormatsFWLite.so")
    gROOT.ProcessLine("FWLiteEnabler::enable()")

    ch = TChain(treename)
    for patt in fname_patts:
        ch.Add(patt)
    nevents = ch.GetEntries()
    branches_to_keep = [b for b in branches_to_keep if b]  # remove empty strings

    if len(cut_str) > 0:
        print(">>> [!] You specified a cut string of: %s" % cut_str)
        print(">>> [!] Make sure that you are opting to keep all branches used in that cut string.")

    # the branch list is taken from the tree of the first file of the chain
    filenames = [f.GetTitle() for f in ch.GetListOfFiles()]
    f1 = TFile(filenames[0])
    tree = f1.Get(treename)
    tree.SetMakeClass(1)
    branches = [b.GetName() for b in tree.GetListOfBranches()]

    # drop requested branches that are not in the tree; per the original
    # author, asking ROOT for them leads to a segfault
    branches_not_in_chain = set(branches_to_keep) - set(branches)
    if branches_not_in_chain:
        print(">>> [!] You dummy! I am going to neglect these branches which are not even in the TTree: %s" % ",".join(
            list(branches_not_in_chain)))
        branches_to_keep = list(set(branches_to_keep) - branches_not_in_chain)

    if len(branches_to_keep) == 0:
        if len(cut_str) == 0:
            print(">>> [!] You dummy! You want me to skim 0 branches without any cut? That's pointless.")
            f1.Close()
            return
        print(">>> [!] You specified 0 branches to keep, but you gave me a cut string, so keeping ALL branches.")
        branches_to_keep = branches[:]
    else:
        # whitelist the ones to copy
        ch.SetBranchStatus("*", 0)
        for bname in branches_to_keep:
            ch.SetBranchStatus(bname, 1)
        # needed to actually copy over any 4-vectors, see
        # https://root.cern.ch/phpBB3/viewtopic.php?t=10725
        ch.SetBranchStatus("fCoordinates*", 1)

    # actually do the skim and save the file
    t0 = time.time()
    # the reference keeps the output file object alive while it is filled
    new_file = TFile(fname_out, "RECREATE")

    # copy over all the non-tree objects (histograms, ...) too. Only the
    # first input file is considered; adding them up over all inputs is
    # still a TODO.
    for key in f1.GetListOfKeys():
        if key.ReadObj().InheritsFrom(TTree.Class()):
            continue
        name = key.GetName()
        print(name)
        f1.Get(name).Write()

    print(">>> Started skimming tree %s with %i events: %i --> %i branches" % (
        treename, nevents, len(branches), len(branches_to_keep)))

    ch_new = ch.CopyTree(cut_str)

    print(">>> Finished skim in %.2f seconds" % (time.time() - t0))

    ch_new.GetCurrentFile().Write()
    ch_new.GetCurrentFile().Close()
    f1.Close()

    # report the size reduction; get_filesizes / readable_size are helpers
    # defined outside this function (presumably returning a byte count and
    # a human-readable string - confirm against their definitions)
    size_before = get_filesizes(filenames)
    size_after = get_filesizes([fname_out])
    # float() keeps the factor from being truncated by Python 2 integer division
    print(">>> Size reduction: %s --> %s (factor of %.1f)" % (readable_size(
        size_before), readable_size(size_after), float(size_before) / size_after))
    print(">>> Your output file is %s." % fname_out)
def skim_tree(fname_patts, branches_to_keep=None, treename="t", fname_out="skim.root", cut_str="", flip_branches=False):
    """Skim a tree spread over several input files into one output file.

    fname_patts: iterable of file names/patterns, each handed to TChain.Add.
    branches_to_keep: branch names to copy (or, with flip_branches, to
        drop). None or an empty list means "no names given". Empty strings
        are ignored. Without flip_branches, names missing from the tree of
        the first input file are dropped with a warning.
    treename: name of the tree inside the input files.
    fname_out: path of the output file (recreated).
    cut_str: selection string passed to TChain.CopyTree.
    flip_branches: if True, branches_to_keep is a blacklist: every branch
        is copied except the listed ones.

    Without flip_branches, an empty branch list together with an empty cut
    string makes the function print a message and return without output;
    an empty branch list with a cut string keeps all branches.

    Non-tree objects found in the first input file are copied to the
    output as well. Returns None; progress is reported on stdout.
    """
    import time

    import ROOT as r
    # Stop ROOT from parsing the script's command line (a '$' in a cut
    # string passed as an argument made ROOT crash, see
    # https://root-forum.cern.ch/t/pyroot-crashes-when-in-arguments/25379/3).
    # Set before anything else is pulled out of the ROOT module so that the
    # flag is already in place when ROOT finishes its setup.
    r.PyConfig.IgnoreCommandLineOptions = True
    from ROOT import gSystem, gROOT, TTree
    r.v5.TFormula.SetMaxima(100000000)

    # FWLite has to be loaded and enabled up front (required according to
    # the original author of this function).
    gSystem.Load("libFWCoreFWLite.so")
    gSystem.Load("libDataFormatsFWLite.so")
    gROOT.ProcessLine("FWLiteEnabler::enable()")

    ch = r.TChain(treename)
    for patt in fname_patts:
        ch.Add(patt)
    nevents = ch.GetEntries()
    branches_to_keep = [b for b in (branches_to_keep or []) if b]  # remove empty strings

    if len(cut_str) > 0:
        print(">>> [!] You specified a cut string of: %s" % cut_str)
        print(">>> [!] Make sure that you are opting to keep all branches used in that cut string.")

    # the branch list is taken from the tree of the first file of the chain
    filenames = [f.GetTitle() for f in ch.GetListOfFiles()]
    f1 = r.TFile(filenames[0])
    tree = f1.Get(treename)
    tree.SetMakeClass(1)
    branches = [b.GetName() for b in tree.GetListOfBranches()]

    # drop requested branches that are not in the tree; per the original
    # author, asking ROOT for them leads to a segfault
    if not flip_branches:
        branches_not_in_chain = set(branches_to_keep) - set(branches)
        if branches_not_in_chain:
            print(">>> [!] You dummy! I am going to neglect these branches which are not even in the TTree: %s" % ",".join(
                list(branches_not_in_chain)))
            branches_to_keep = list(set(branches_to_keep) - branches_not_in_chain)

    if len(branches_to_keep) == 0 and not flip_branches:
        if len(cut_str) == 0:
            print(">>> [!] You dummy! You want me to skim 0 branches without any cut? That's pointless.")
            return
        print(">>> [!] You specified 0 branches to keep, but you gave me a cut string, so keeping ALL branches.")
        branches_to_keep = branches[:]
    else:
        if not flip_branches:
            # whitelist the ones to copy
            ch.SetBranchStatus("*", 0)
            for bname in branches_to_keep:
                ch.SetBranchStatus(bname, 1)
        else:
            # blacklist: start from everything and switch the listed ones off
            ch.SetBranchStatus("*", 1)
            for bname in branches_to_keep:
                ch.SetBranchStatus(bname, 0)
        # needed to actually copy over any 4-vectors, see
        # https://root.cern.ch/phpBB3/viewtopic.php?t=10725
        ch.SetBranchStatus("fCoordinates*", 1)

    # Number of branches that end up in the output, for the progress line.
    # In blacklist mode branches_to_keep holds the names being dropped, so
    # its length is not the kept count. (Only exact names are subtracted;
    # wildcard patterns in the blacklist are not expanded here.)
    if flip_branches:
        dropped = set(branches_to_keep)
        n_kept = len([b for b in branches if b not in dropped])
    else:
        n_kept = len(branches_to_keep)

    # actually do the skim and save the file
    t0 = time.time()
    # the reference keeps the output file object alive while it is filled
    new_file = r.TFile(fname_out, "RECREATE")

    # copy over all the non-tree objects (histograms, ...) too. Only the
    # first input file is considered; adding them up over all inputs is
    # still a TODO.
    for key in f1.GetListOfKeys():
        if key.ReadObj().InheritsFrom(TTree.Class()):
            continue
        name = key.GetName()
        f1.Get(name).Write()

    print(">>> Started skimming tree %s with %i events: %i --> %i branches" % (
        treename, nevents, len(branches), n_kept))

    ch_new = ch.CopyTree(cut_str)

    print(">>> Finished skim in %.2f seconds" % (time.time() - t0))

    ch_new.GetCurrentFile().Write()
    ch_new.GetCurrentFile().Close()

    # report the size reduction; get_filesizes / readable_size are helpers
    # defined outside this function (presumably returning a byte count and
    # a human-readable string - confirm against their definitions)
    size_before = get_filesizes(filenames)
    size_after = get_filesizes([fname_out])
    print(">>> Size reduction: %s --> %s (factor of %.1f)" % (
        readable_size(size_before), readable_size(size_after), 1.0 * size_before / size_after))
    print(">>> Your output file is %s" % fname_out)