# Line written when a branch has no statistics entry for an input.
_BRANCH_STATS_MISSING = "{branch} Stats not computed due to type or name\n"


def _format_branch_summary(branch, entry):
    """Return the report line with the plain statistics of one branch.

    ``entry`` is a ``[name, min, mean, max, M2, RMS]`` list as built by
    ``branchStats_RDF``.
    """
    return "{branch} Min: {cmin} Mean: {cmean} Max: {cmax} Moment2: {cmom} RMS: {crms}\n".format(
        branch=branch, cmin=entry[1], cmean=entry[2], cmax=entry[3],
        cmom=entry[4], crms=entry[5])


def _format_branch_difference(branch, entryA, entryB):
    """Return the report line comparing one branch between inputs A and B.

    Every quantity is written as ``(A - B)/A`` with numerator and denominator
    left unevaluated, so a zero reference value cannot raise.
    """
    return "{branch} Min: {cminN}/{cminD} Mean: {cmeanN}/{cmeanD} Max: {cmaxN}/{cmaxD} Moment2: {cmomN}/{cmomD} RMS: {crmsN}/{crmsD}\n".format(
        branch=branch,
        cminN=(entryA[1] - entryB[1]), cminD=entryA[1],
        cmeanN=(entryA[2] - entryB[2]), cmeanD=entryA[2],
        cmaxN=(entryA[3] - entryB[3]), cmaxD=entryA[3],
        cmomN=(entryA[4] - entryB[4]), cmomD=entryA[4],
        crmsN=(entryA[5] - entryB[5]), crmsD=entryA[5])


def _write_exclusive_branches(report, label, branches, stats_by_branch):
    """Write the "Branches only in <label>" section of the report."""
    report.write("\n======== Branches only in {iA} ========\n".format(iA = label))
    for b in sorted(branches):
        entry = stats_by_branch.get(b, None)
        if entry is None:
            report.write(_BRANCH_STATS_MISSING.format(branch=b))
            continue
        report.write(_format_branch_summary(b, entry))


def branchStats_RDF(branchDict, reportFile = "branchStats.txt", plotCard="plotCard_{label}.txt", doHLT=False, doL1=False):
    """Book and evaluate per-branch statistics, then write plot cards and a report.

    For every input listed in ``branchDict["indices"]`` a weighted
    ``Stats(branch, "diffWeight")`` action is booked for each supported branch
    type. The event loops are then triggered, one plot card per input is
    written, and a pairwise comparison of all inputs goes to ``reportFile``.
    Start and finish times are stored in the module-level ``procstart`` and
    ``procfinish`` dictionaries under the key ``'branchStats_RDF'``.

    Args:
        branchDict: Mapping read with the keys "indices", "Events",
            "SumEvents", "Branches", "BranchTypesRaw", "Files" and
            "SumWeights"; all but "indices" are indexed by input index.
        reportFile: Path of the text report to (over)write.
        plotCard: File-name template for the plot cards; ``{label}`` is
            replaced with ``nToLabel[index]``.
        doHLT: If False, ``Bool_t`` branches starting with "HLT_" are skipped.
        doL1: If False, ``Bool_t`` branches starting with "L1_" are skipped.

    Returns:
        None. All results are written to disk.
    """
    procstart['branchStats_RDF'] = datetime.now()
    indices = branchDict["indices"]
    uniquePairs = [(a, b) for a in indices for b in indices if a < b]
    stats = collections.OrderedDict()
    triggers = collections.OrderedDict()
    rdf_nodes = branchDict.get("Events")
    event_counts = branchDict.get("SumEvents")
    branches_all = branchDict.get("Branches")
    branch_types_all = branchDict.get("BranchTypesRaw")

    scalar_types = ['Int_t', 'UChar_t', 'UInt_t', 'Float_t', 'double']
    vector_types = ['ROOT::VecOps::RVec<Int_t>', 'ROOT::VecOps::RVec<UChar_t>', 'ROOT::VecOps::RVec<Float_t>']

    # ---- Book all (lazy) actions -------------------------------------------
    for index in branchDict.get("indices"):
        node = rdf_nodes[index]
        stats[index] = collections.OrderedDict()
        # NOTE(review): ROOT.AddProgressBar is presumably a project-declared
        # helper returning a lazy count result; its GetValue() call below is
        # what starts the event loop.
        triggers[index] = ROOT.AddProgressBar(ROOT.RDF.AsRNode(node),
                                              max(100, int(event_counts[index]/50000)),
                                              int(event_counts[index]))
        branch_types = branch_types_all[index]
        for branch in branches_all[index]:
            branch_type = branch_types[branch]
            if branch_type == 'Bool_t':
                # Trigger bits are only processed on request.
                if branch[0:4] == "HLT_" and not doHLT:
                    continue
                if branch[0:3] == "L1_" and not doL1:
                    continue
                stats[index][branch] = (node.Stats(branch, "diffWeight"), branch_type)
            elif branch_type in scalar_types:
                stats[index][branch] = (node.Stats(branch, "diffWeight"), branch_type)
            elif branch_type in vector_types:
                # Potentially unpack these
                stats[index][branch] = (node.Stats(branch, "diffWeight"), branch_type)
            elif branch_type == 'ROOT::VecOps::RVec<Bool_t>':
                # need to unpack these (root 6.22 dev)
                dnode, dcolumns = devectorizeBool(node, branch)
                for dcolumn in dcolumns:
                    filtered = dnode.Filter("{dcol} > -1".format(dcol=dcolumn))
                    stats[index][dcolumn] = (filtered.Stats(dcolumn, "diffWeight"), branch_type)
            elif branch_type == 'ULong64_t':
                # run number
                pass
            else:
                print("Skipping unhandled branch (name={}, type={})".format(branch, branch_type))

    # ---- Run the event loops -----------------------------------------------
    for count in triggers.values():
        print(count.GetValue())

    # ---- Collect results and write one plot card per input -------------------
    final_stats = collections.OrderedDict()
    for index, stats_dict in stats.items():
        label = nToLabel[index]
        final_stats[index] = collections.OrderedDict()
        plotcard_name = plotCard.format(label=label)
        with open(plotcard_name, "w") as pc:
            print("Filling plotcard {}".format(plotcard_name))
            for branch, branch_stats in stats_dict.items():
                temp = branch_stats[0].GetValue()
                type_name = branch_stats[1]
                if "Bool" in type_name:
                    # Booleans get a fixed two-bin [0, 2) axis; only the mean is kept.
                    plot = [str(branch), str(2), str(0), str(2)]
                    final_stats[index][branch] = [branch, 0, temp.GetMean(), 1, 0, 0]
                else:
                    upper = temp.GetMax()
                    if "int" in type_name.lower():
                        # Widen the axis by one for integer-typed branches.
                        upper = upper + 1
                    plot = [str(branch), str(100), str(temp.GetMin()), str(upper)]
                    final_stats[index][branch] = [branch, temp.GetMin(), temp.GetMean(), temp.GetMax(),
                                                  temp.GetM2(), temp.GetRMS()]
                pc.write(" ".join(plot) + "\n")

    # ---- Pairwise comparison report ----------------------------------------
    with open(reportFile, "w") as o:
        for indexA, indexB in uniquePairs:
            labelA = nToLabel[indexA]
            labelB = nToLabel[indexB]
            statsA = final_stats[indexA]
            statsB = final_stats[indexB]
            keysA = set(statsA.keys())
            keysB = set(statsB.keys())
            inCommon = keysA.intersection(keysB)
            onlyInA = keysA - keysB
            onlyInB = keysB - keysA

            o.write("==================================\n")
            o.write("Report on inputs {iA} and {iB}:\n files[{iA}]: {fA}\n files[{iB}]: {fB}\n"
                    "\n SumEvents[{iA}]: {seA}\n SumEvents[{iB}]: {seB}"
                    "\n SumWeights[{iA}]: {swA}\n SumWeights[{iB}]: {swB}\n"
                    .format(iA = labelA, iB = labelB,
                            fA = branchDict["Files"][indexA], fB = branchDict["Files"][indexB],
                            seA = branchDict["SumEvents"][indexA], seB = branchDict["SumEvents"][indexB],
                            swA = branchDict["SumWeights"][indexA], swB = branchDict["SumWeights"][indexB]))

            o.write("\n======== Common Branches % Change ({} - {})/{}========\n".format(labelA, labelB, labelA))
            for b in sorted(inCommon):
                entryA = statsA.get(b, None)
                entryB = statsB.get(b, None)
                if entryA is None or entryB is None:
                    o.write(_BRANCH_STATS_MISSING.format(branch=b))
                    continue
                o.write(_format_branch_difference(b, entryA, entryB))

            _write_exclusive_branches(o, labelA, onlyInA, statsA)
            _write_exclusive_branches(o, labelB, onlyInB, statsB)
    procfinish['branchStats_RDF'] = datetime.now()
if args.write: handles.append(dummyResult()) foNameDict[fn]["source"] = "Done" count_bases[fnumber] = 0 count_finals[fnumber] = 0 continue foNameDict[fn]["source"], requiresDeletion = prefetchFile(fn, longTermCache=args.longTermCache, verbose=True) if (args.prefetch or args.longTermCache) else (fn, False) print("temp output {}: {}".format(fnumber, foNameDict[fn]["temp"])) tchains[fn] = ROOT.TChain("Events") # tcmeta[fn] = ROOT.TChain("Runs") tchains[fn].Add(str(foNameDict[fn]["source"])) # tcmeta[fn].Add(str(foNameDict[fn]["source"])) rdfEntries = tchains[fn].GetEntries() rdf_bases[fn] = ROOT.ROOT.RDataFrame(tchains[fn]) count_bases[fnumber] = rdf_bases[fn].Count() booktriggers[fn] = ROOT.AddProgressBar(ROOT.RDF.AsRNode(rdf_bases[fn]), 2000, int(rdfEntries)) rdfFinal = rdf_bases[fn] #Placeholder if args.defines is not None: for define in args.defines: if len(define.split(">==>")) == 2: #backwards-compatibility! var, defn = define.split(">==>") else: var = define[:define.index("=")] defn = define[define.index("=")+1:] rdfFinal = rdfFinal.Define(var, defn) if args.filters is not None: for ncut, cut in enumerate(args.filters): if len(cut.split(">==>")) == 2: #backwards-compatibility! name, defn = cut.split(">==>")
def makeHists_RDF(branchDict, plotDict, eventsKey = "Events", weightsKey = "Weights"):
    """Book one weighted 1D histogram per plot-card entry and input, then fill them.

    Histograms are booked with ``Histo1D(model, branch, "diffWeight")`` on
    every node in ``branchDict[eventsKey]``. The event loops are then
    triggered and each booked result is replaced by a clone of the filled
    histogram. Timing is recorded in the module-level ``procstart`` and
    ``procfinish`` dictionaries under 'makeHists_RDF(Define)' and
    'makeHists_RDF(Run)'.

    Args:
        branchDict: Mapping read with the keys ``eventsKey``, "SumEvents" and
            "BranchSetsFiltered", each indexed by input index.
        plotDict: Mapping of branch name to a 5-element histogram model
            tuple; "$INPUT" in element 0 is replaced by ``nToLabel[index]``
            for branches found in "BranchSetsFiltered".
        eventsKey: Key of ``branchDict`` holding the dataframe nodes. It is
            used consistently for booking, triggering and cloning.
        weightsKey: Unused; kept for interface compatibility.

    Returns:
        Tuple ``(histDict, uniqueHistDict)`` of ordered dicts keyed by input
        index, then branch. ``histDict`` holds cloned histograms for all
        booked branches; ``uniqueHistDict`` keeps the booked result objects
        of branches missing from "BranchSetsFiltered".
    """
    triggers = collections.OrderedDict()
    rdf_nodes = branchDict[eventsKey]
    event_counts = branchDict.get("SumEvents")

    procstart['makeHists_RDF(Define)'] = datetime.now()
    histDict = collections.OrderedDict()
    uniqueHistDict = collections.OrderedDict()
    for index in rdf_nodes.keys():
        histDict[index] = collections.OrderedDict()
        uniqueHistDict[index] = collections.OrderedDict()

    for branch, plot in plotDict.items():
        # NOTE(review): doHLT and doL1 are not parameters of this function;
        # they are resolved as module-level names - confirm they are defined.
        if branch[0:4] == "HLT_":
            if not doHLT:
                continue
        if branch[0:3] == "L1_":
            if not doL1:
                continue
        # Catch the devectorized boolean branches in the plot cards:
        if branch[0:4] == "DVB_":
            branch_type = 'ROOT::VecOps::RVec<Bool_t>'
        else:
            branch_type = "UNKNOWN"
        print("{} - {}".format(branch, branch_type))
        if branch_type == 'ROOT::VecOps::RVec<Bool_t>':
            print("Skipping branch that must be devectorized from boolean")
            continue

        for index, rdf in rdf_nodes.items():
            if branch in histDict[index]:
                print("Warning, dual definition of branch {} when making Histograms from plotcards. Will keep first branch only.".format(branch))
                continue
            if branch not in branchDict["BranchSetsFiltered"][index]:
                print(branch)
                # The model is used as-is here, without "$INPUT" substitution.
                uniqueHistDict[index][branch] = rdf.Histo1D(plot, str(branch), "diffWeight")
            else:
                # change the plot name in memory by replacing the "$INPUT" tag
                plot_labeled = (plot[0].replace("$INPUT", nToLabel[index]), plot[1], plot[2], plot[3], plot[4])
                histDict[index][branch] = rdf.Histo1D(plot_labeled, str(branch), "diffWeight")
    procfinish['makeHists_RDF(Define)'] = datetime.now()

    procstart['makeHists_RDF(Run)'] = datetime.now()
    for index, rdf in rdf_nodes.items():
        # NOTE(review): ROOT.AddProgressBar is presumably a project-declared
        # helper returning a lazy count result - confirm.
        triggers[index] = ROOT.AddProgressBar(ROOT.RDF.AsRNode(rdf),
                                              max(100, int(event_counts[index]/50000)),
                                              int(event_counts[index]))
    print("Starting event loop...")
    for count in triggers.values():
        print(count.GetValue())

    for index in rdf_nodes:
        # Only values of existing keys are replaced, so iterating items() is safe.
        for branch, hist in histDict[index].items():
            histDict[index][branch] = hist.GetPtr().Clone()
        # NOTE(review): clones of the unique histograms are stored in histDict,
        # so uniqueHistDict keeps the booked result objects - confirm intended.
        for branch, hist in uniqueHistDict[index].items():
            histDict[index][branch] = hist.GetPtr().Clone()
    procfinish['makeHists_RDF(Run)'] = datetime.now()
    return histDict, uniqueHistDict