def prepare(baseDir, particle, probe, resonance, era,
            config, num, denom, variableLabels,
            skipPlots=False, cutAndCount=False):
    """Collect scale factors and efficiencies for one efficiency binning.

    For every bin of the N-D binning spanned by ``variableLabels`` the
    nominal values are obtained from ``getSF`` (or ``getSF_cutAndCount``)
    and the systematic variations from ``getSyst`` (or
    ``getSyst_cutAndCount``).  The results are stored in ROOT histograms and
    in a nested dictionary, which are written to
    ``<baseDir>/efficiencies/<particle>/<probe>/<resonance>/<era>/<effName>/``
    as ``<extEffName>.root`` and ``<extEffName>.json``.  Unless ``skipPlots``
    is set, 2D maps and 1D efficiency projections are printed below
    ``<baseDir>/plots/.../<effName>/efficiency``.

    Args:
        baseDir: root directory of the inputs and outputs.
        particle, probe, resonance, era: path components identifying the
            measurement; ``era`` up to its first ``'_'`` is used as the data
            sub era for the luminosity lookup.
        config: configuration object providing ``binning()``,
            ``variables()`` and ``get('systematics', default)``.
        num, denom: numerator and denominator labels of the efficiency.
        variableLabels: one to three binning variable names.
        skipPlots: if true, no plot is produced and the function returns
            right after the JSON and ROOT files are written.
        cutAndCount: if true, use the cut-and-count helpers on the flat
            histograms instead of the fit results.

    Raises:
        NotImplementedError: if the number of variables is not 1, 2 or 3.
    """
    subEra = era.split('_')[0]  # data subera is beginning of era
    lumi = registry.luminosity(particle, probe, resonance, era, subEra)

    hists = {}

    effName = get_eff_name(num, denom)
    extEffName = get_extended_eff_name(num, denom, variableLabels)

    binning = config.binning()
    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    systList = config.get('systematics', {
        x: {'fitTypes': [], 'shiftTypes': []}
        for x in ['SF', 'dataEff', 'mcEff']
    })

    # names of all systematic variations (fit types first), per kind
    systNames = {
        kind: list(itertools.chain(systList[kind]['fitTypes'],
                                   systList[kind]['shiftTypes']))
        for kind in ['SF', 'dataEff', 'mcEff']
    }
    # key of the uncertainty in the per-systematic result, per kind
    errKeys = {'SF': 'err', 'dataEff': 'dataErr', 'mcEff': 'mcErr'}

    def get_variable_name_pretty(variableLabel):
        variables = config.variables()
        return variables.get(variableLabel, {}).get('pretty', variableLabel)

    # create output histograms
    nVars = len(variableLabels)
    if nVars == 1:
        THX = ROOT.TH1F
    elif nVars == 2:
        THX = ROOT.TH2F
    elif nVars == 3:
        THX = ROOT.TH3F
    else:
        raise NotImplementedError(
            'More than 3 dimensions are not supported for scale factors')

    def set_value_title(h, title):
        # The plotted quantity lives on the Y axis of a 1D histogram and on
        # the Z axis of a 2D histogram.  A 3D histogram uses all three axes
        # for binning variables, so no title is touched there.
        if nVars == 1:
            h.GetYaxis().SetTitle(title)
        elif nVars == 2:
            h.GetZaxis().SetTitle(title)

    def clone_uncertainty(h, name):
        # clone ``h`` and label the value axis as an uncertainty
        clone = h.Clone(name)
        set_value_title(clone, 'Uncertainty')
        return clone

    hargs = [extEffName, extEffName]
    for variableLabel in variableLabels:
        hargs += [len(binning[variableLabel]) - 1,
                  array('d', binning[variableLabel])]
    hist = THX(*hargs)
    axes = [hist.GetXaxis(), hist.GetYaxis(), hist.GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):
        axes[vi].SetTitle(get_variable_name_pretty(variableLabel))
    if nVars == 2:
        hist.SetOption('colz')
    set_value_title(hist, 'Scalefactor')

    hist_stat = hist.Clone(extEffName + '_stat')
    hist_syst = hist.Clone(extEffName + '_syst')
    histList_syst = {
        'combined_syst': clone_uncertainty(
            hist, extEffName + '_combined_syst'),
    }

    hist_dataEff = hist.Clone(extEffName + '_efficiencyData')
    set_value_title(hist_dataEff, 'Efficiency')
    hist_dataEff_stat = hist_dataEff.Clone(
        extEffName + '_efficiencyData_stat')
    hist_dataEff_syst = hist_dataEff.Clone(
        extEffName + '_efficiencyData_syst')
    histList_dataEff_syst = {
        'combined_syst': clone_uncertainty(
            hist_dataEff, extEffName + '_efficiencyData_combined_syst'),
    }

    hist_mcEff = hist_dataEff.Clone(extEffName + '_efficiencyMC')
    hist_mcEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyMC_stat')
    hist_mcEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyMC_syst')
    histList_mcEff_syst = {
        'combined_syst': clone_uncertainty(
            hist_dataEff, extEffName + '_efficiencyMC_combined_syst'),
    }

    # the individual systematics: one histogram with the shifted value and
    # one ('<name>_syst') with the resulting uncertainty
    for kind, template, histList in [
            ('SF', hist, histList_syst),
            ('dataEff', hist_dataEff, histList_dataEff_syst),
            ('mcEff', hist_mcEff, histList_mcEff_syst)]:
        for iSyst in systNames[kind]:
            histList[iSyst] = template.Clone(extEffName + '_' + iSyst)
            histList[iSyst + '_syst'] = clone_uncertainty(
                template, extEffName + '_' + iSyst + '_syst')

    varName = get_variables_name(variableLabels)

    def set_bin(h, index, val, err):
        # a negative error means "leave the bin error untouched"
        coords = list(index)
        h.SetBinContent(*(coords + [val]))
        if err >= 0:
            h.SetBinError(*(coords + [err]))

    # inputs that do not depend on the bin
    fitType = 'Nominal'
    dataFNameCNC = os.path.join(baseDir, 'flat',
                                particle, probe, resonance, era,
                                dataSubEra, 'Nominal', extEffName + '.root')
    mcFNameCNC = os.path.join(baseDir, 'flat',
                              particle, probe, resonance, era,
                              mcSubEra, 'Nominal', extEffName + '.root')
    fitTypes = set(systList['SF']['fitTypes']
                   + systList['dataEff']['fitTypes']
                   + systList['mcEff']['fitTypes'])
    shiftTypes = set(systList['SF']['shiftTypes']
                     + systList['dataEff']['shiftTypes']
                     + systList['mcEff']['shiftTypes'])

    # iterate through the bin indices
    # this does nested for loops of the N-D binning (e.g. pt, eta)
    # binning starts at 1 (0 is underflow), same as ROOT
    indices = [list(range(1, len(binning[variableLabel])))
               for variableLabel in variableLabels]
    output = {effName: {varName: {}}}
    for index in itertools.product(*indices):
        binName = get_full_name(num, denom, variableLabels, index)
        subVarKeys = [
            '{}:[{},{}]'.format(
                variableLabels[i],
                binning[variableLabels[i]][ind - 1],
                binning[variableLabels[i]][ind])
            for i, ind in enumerate(index)
        ]
        _out = output[effName][varName]

        # add binning definitions
        _out['binning'] = [
            {
                'variable': vl,
                'binning': binning[vl].tolist(),
            }
            for vl in variableLabels
        ]

        # descend to (creating if needed) the nested entry of this bin
        for subVarKey in subVarKeys:
            if subVarKey not in _out:
                _out[subVarKey] = {}
            _out = _out[subVarKey]

        # the fitted distributions
        dataFNameFit = os.path.join(baseDir, 'fits_data',
                                    particle, probe, resonance, era,
                                    fitType, effName, binName + '.root')

        if cutAndCount:
            sf, sf_stat, dataEff, dataStat, mcEff, mcStat = \
                getSF_cutAndCount(binName, dataFNameCNC, mcFNameCNC)
            sf_syst = getSyst_cutAndCount(binName,
                                          dataFNameCNC, mcFNameCNC,
                                          fitTypes, shiftTypes)
        else:
            sf, sf_stat, dataEff, dataStat, mcEff, mcStat = getSF(
                binName, dataFNameFit)
            sf_syst = getSyst(binName, dataFNameFit, fitTypes, shiftTypes)

        # individual systematics are added in quadrature
        combined_syst = {}
        for kind in ['SF', 'dataEff', 'mcEff']:
            total = 0
            for t in systNames[kind]:
                total += sf_syst[t][errKeys[kind]]**2
            combined_syst[kind] = total**0.5

        sf_err = (sf_stat**2 + combined_syst['SF']**2)**0.5
        dataErr = (dataStat**2 + combined_syst['dataEff']**2)**0.5
        mcErr = (mcStat**2 + combined_syst['mcEff']**2)**0.5
        _out['value'] = sf
        _out['stat'] = sf_stat
        _out['syst'] = combined_syst['SF']
        for s in systNames['SF']:
            _out[s] = sf_syst[s]['err']

        set_bin(hist, index, sf, sf_err)
        set_bin(hist_stat, index, sf, sf_stat)
        set_bin(hist_syst, index, sf, combined_syst['SF'])
        set_bin(histList_syst['combined_syst'], index,
                combined_syst['SF'], -1)
        set_bin(hist_dataEff, index, dataEff, dataErr)
        set_bin(hist_dataEff_stat, index, dataEff, dataStat)
        set_bin(hist_dataEff_syst, index, dataEff, combined_syst['dataEff'])
        set_bin(histList_dataEff_syst['combined_syst'], index,
                combined_syst['dataEff'], -1)
        set_bin(hist_mcEff, index, mcEff, mcErr)
        set_bin(hist_mcEff_stat, index, mcEff, mcStat)
        set_bin(hist_mcEff_syst, index, mcEff, combined_syst['mcEff'])
        set_bin(histList_mcEff_syst['combined_syst'], index,
                combined_syst['mcEff'], -1)
        for iKey in sf_syst.keys():
            if iKey in histList_syst:
                set_bin(histList_syst[iKey], index,
                        sf_syst[iKey]['sf'], sf_syst[iKey]['err'])
                set_bin(histList_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['err'], -1)
            if iKey in histList_dataEff_syst:
                set_bin(histList_dataEff_syst[iKey], index,
                        sf_syst[iKey]['dataEff'], sf_syst[iKey]['dataErr'])
                set_bin(histList_dataEff_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['dataErr'], -1)
            if iKey in histList_mcEff_syst:
                set_bin(histList_mcEff_syst[iKey], index,
                        sf_syst[iKey]['mcEff'], sf_syst[iKey]['mcErr'])
                set_bin(histList_mcEff_syst[iKey + '_syst'], index,
                        sf_syst[iKey]['mcErr'], -1)

    hists[extEffName] = hist
    hists[extEffName + '_stat'] = hist_stat
    hists[extEffName + '_syst'] = hist_syst
    hists[extEffName + '_efficiencyData'] = hist_dataEff
    hists[extEffName + '_efficiencyData_stat'] = hist_dataEff_stat
    hists[extEffName + '_efficiencyData_syst'] = hist_dataEff_syst
    hists[extEffName + '_efficiencyMC'] = hist_mcEff
    hists[extEffName + '_efficiencyMC_stat'] = hist_mcEff_stat
    hists[extEffName + '_efficiencyMC_syst'] = hist_mcEff_syst
    for prefix, histList in [
            ('_', histList_syst),
            ('_efficiencyData_', histList_dataEff_syst),
            ('_efficiencyMC_', histList_mcEff_syst)]:
        for iKey in histList:
            hists[extEffName + prefix + iKey] = histList[iKey]

    # save the efficiency
    plotDir = os.path.join(baseDir, 'plots',
                           particle, probe, resonance, era,
                           effName, 'efficiency')
    os.makedirs(plotDir, exist_ok=True)

    effDir = os.path.join(baseDir, 'efficiencies',
                          particle, probe, resonance, era,
                          effName)
    os.makedirs(effDir, exist_ok=True)
    effPath = os.path.join(effDir, extEffName)

    # JSON format
    with open('{}.json'.format(effPath), 'w') as f:
        f.write(json.dumps(output, indent=4, sort_keys=True))

    # ROOT histogram format
    tfile = ROOT.TFile.Open('{}.root'.format(effPath), 'recreate')
    try:
        for h in sorted(hists):
            hists[h].Write(h)

            if nVars == 2 and not skipPlots:
                cName = 'c' + h
                canvas = ROOT.TCanvas(cName, cName, 1000, 800)
                ROOT.gStyle.SetPaintTextFormat("5.3f")
                canvas.SetRightMargin(0.24)
                hists[h].Draw('colz text')
                plotPath = os.path.join(plotDir, h)
                canvas.Modified()
                canvas.Update()

                CMS_lumi.cmsText = 'CMS'
                CMS_lumi.writeExtraText = True
                CMS_lumi.extraText = 'Preliminary'
                CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (lumi)
                CMS_lumi.CMS_lumi(canvas, 4, 0)

                canvas.Print('{}.png'.format(plotPath))
                canvas.Print('{}.pdf'.format(plotPath))
                canvas.Print('{}.root'.format(plotPath))
    finally:
        # make sure the output file is closed even if plotting fails
        tfile.Close()

    if skipPlots:
        return

    # gets a graph projection of an ND histogram for a given axis
    # with axis index (ie x,y,z = 0,1,2) and other dimensions ind
    def get_graph(hist, axis, axis_ind, *ind):
        ind = list(ind)
        ni = axis.GetNbins()

        def coords(i):
            # bin indices of all axes, with bin i+1 on the projected axis
            return ind[:axis_ind] + [i + 1] + ind[axis_ind:]

        xvals = [axis.GetBinCenter(i + 1) for i in range(ni)]
        xvals_errLow = [xvals[i] - axis.GetBinLowEdge(i + 1)
                        for i in range(ni)]
        xvals_errHigh = [axis.GetBinUpEdge(i + 1) - xvals[i]
                         for i in range(ni)]
        yvals = [hist.GetBinContent(*coords(i)) for i in range(ni)]
        yvals_err = [hist.GetBinError(*coords(i)) for i in range(ni)]
        graph = ROOT.TGraphAsymmErrors(
            ni,
            array('d', xvals),
            array('d', yvals),
            array('d', xvals_errLow),
            array('d', xvals_errHigh),
            array('d', yvals_err),
            array('d', yvals_err),
        )
        return graph

    # plot the efficiencies
    # some default colors for plots
    colors = [ROOT.kBlack, ROOT.kBlue, ROOT.kRed, ROOT.kGreen + 2,
              ROOT.kMagenta + 1, ROOT.kOrange + 1, ROOT.kTeal - 1,
              ROOT.kRed - 3, ROOT.kCyan + 2]

    def plot_1d_eff(savename, graphs,
                    labels=('Data', 'Simulation'),
                    colors=colors,
                    xlabel='', ylabel='Efficiency',
                    xRange=(), additional_text=()):
        # draw all graphs on one canvas and print it as png, pdf and root
        mg = ROOT.TMultiGraph()
        for gi, graph in enumerate(graphs):
            # cycle through the palette when there are more graphs
            # than colors instead of failing with an IndexError
            color = colors[gi % len(colors)]
            graph.SetLineColor(color)
            graph.SetMarkerColor(color)
            mg.Add(graph)

        canvas = ROOT.TCanvas(savename, savename, 800, 800)
        mg.Draw('AP0')
        mg.GetXaxis().SetTitle(xlabel)
        if xRange:
            mg.GetXaxis().SetRangeUser(*xRange)
        mg.GetYaxis().SetTitle(ylabel)
        mg.GetYaxis().SetRangeUser(0.8, 1.10)
        legend = ROOT.TLegend(0.5, 0.70, 0.92, 0.92)
        legend.SetTextFont(42)
        legend.SetBorderSize(0)
        legend.SetFillColor(0)
        for graph, label in zip(graphs, labels):
            legend.AddEntry(graph, label, 'l')
        legend.SetHeader('{} / {}'.format(num, denom))
        legend.Draw()

        if additional_text:
            nother = len(additional_text)
            dims = [0.18, 0.84 - nother * 0.04 - 0.02, 0.35, 0.84]
            text = ROOT.TPaveText(*dims + ['NB NDC'])
            text.SetTextFont(42)
            text.SetBorderSize(0)
            text.SetFillColor(0)
            text.SetTextAlign(11)
            text.SetTextSize(0.03)
            for rtext in additional_text:
                text.AddText(rtext)
            text.Draw()

        CMS_lumi.cmsText = 'CMS'
        CMS_lumi.writeExtraText = True
        CMS_lumi.extraText = 'Preliminary'
        CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (lumi)
        CMS_lumi.CMS_lumi(canvas, 4, 11)

        canvas.Print('{}.png'.format(savename))
        canvas.Print('{}.pdf'.format(savename))
        canvas.Print('{}.root'.format(savename))

    # enumerate over the axis/variable to plot
    axes = [hists[extEffName].GetXaxis(),
            hists[extEffName].GetYaxis(),
            hists[extEffName].GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):

        # iterate over the other axis indices
        otherVariableLabels = [ovl for ovl in variableLabels
                               if ovl != variableLabel]
        otherVariableIndices = [ovi for ovi, ovl in enumerate(variableLabels)
                                if ovl != variableLabel]
        indices = [list(range(1, len(binning[vl])))
                   for vl in otherVariableLabels]

        xlabel = get_variable_name_pretty(variableLabel)
        ylabel = 'Efficiency'
        xRange = [axes[vi].GetBinLowEdge(1),
                  axes[vi].GetBinUpEdge(axes[vi].GetNbins())]

        if indices:
            for index in itertools.product(*indices):
                graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                       axes[vi], vi, *index)
                graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                     axes[vi], vi, *index)

                # describe the bin of the variables that are not plotted
                additional_text = []
                for novi, (ovi, ovl) in enumerate(
                        zip(otherVariableIndices, otherVariableLabels)):
                    xlow = axes[ovi].GetBinLowEdge(index[novi])
                    xhigh = axes[ovi].GetBinUpEdge(index[novi])
                    additional_text.append('{} < {} < {}'.format(
                        xlow, get_variable_name_pretty(ovl), xhigh))

                otherVariableLabel = get_bin_name(otherVariableLabels, index)
                plotName = '{}_{}_vs_{}'.format(effName,
                                                otherVariableLabel,
                                                variableLabel)
                plotPath = os.path.join(plotDir, plotName)
                plot_1d_eff(plotPath, [graph_data, graph_mc],
                            xlabel=xlabel, ylabel=ylabel,
                            xRange=xRange,
                            additional_text=additional_text)

                # dataEfficiency and mcEfficiency systs: nominal result
                # overlaid with every systematic variation
                for tag, kind in [('efficiencyData', 'dataEff'),
                                  ('efficiencyMC', 'mcEff')]:
                    nominalName = extEffName + '_' + tag
                    graphs = [get_graph(hists[nominalName],
                                        axes[vi], vi, *index)]
                    labels = ['Nominal']
                    for iSyst in systNames[kind]:
                        graphs.append(
                            get_graph(hists[nominalName + '_' + iSyst],
                                      axes[vi], vi, *index))
                        labels.append(iSyst)
                    plotName = '{}_{}_vs_{}_{}_syst'.format(
                        effName, otherVariableLabel, variableLabel, tag)
                    plotPath = os.path.join(plotDir, plotName)
                    plot_1d_eff(plotPath, graphs,
                                labels=labels,
                                xlabel=xlabel, ylabel=ylabel,
                                xRange=xRange,
                                additional_text=additional_text)

        # if no indices, easier, just itself
        else:
            graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                   axes[vi], vi)
            graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                 axes[vi], vi)

            plotName = '{}_vs_{}'.format(effName, variableLabel)
            plotPath = os.path.join(plotDir, plotName)
            plot_1d_eff(plotPath, [graph_data, graph_mc],
                        xlabel=xlabel, ylabel=ylabel,
                        xRange=xRange)
def prepare(baseDir, particle, probe, resonance, era,
            config, num, denom, variableLabels, lumi=41.5):
    """Collect scale factors and efficiencies for one efficiency binning.

    For every bin of the N-D binning spanned by ``variableLabels`` the
    nominal values are obtained from ``getSF`` and the systematic
    uncertainties from ``getSyst``.  The results are stored in ROOT
    histograms and in a nested dictionary, which are written to
    ``<baseDir>/efficiencies/<particle>/<probe>/<resonance>/<era>/<effName>/``
    as ``<extEffName>.root`` and ``<extEffName>.json``.  2D maps and 1D
    data/simulation efficiency projections are printed below
    ``<baseDir>/plots/.../<effName>/efficiency``.

    Args:
        baseDir: root directory of the inputs and outputs.
        particle, probe, resonance, era: path components identifying the
            measurement.
        config: configuration object providing ``binning()``,
            ``variables()`` and ``get('systematics', default)``.
        num, denom: numerator and denominator labels of the efficiency.
        variableLabels: one to three binning variable names.
        lumi: integrated luminosity in fb^-1 shown on the 1D plots
            (defaults to the previously hard-coded 41.5).

    Raises:
        NotImplementedError: if the number of variables is not 1, 2 or 3.
    """
    hists = {}

    effName = get_eff_name(num, denom)
    extEffName = get_extended_eff_name(num, denom, variableLabels)

    binning = config.binning()
    # NOTE(review): the sub eras are not used below; the call is kept in
    # case it validates the resonance/era combination.
    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    # default used when the configuration defines no systematics
    # (the key was previously misspelled, which made every later
    # systList[...]['shiftTypes'] lookup fail with a KeyError)
    systList = config.get('systematics', {
        x: {'fitTypes': [], 'shiftTypes': []}
        for x in ['SF', 'dataEff', 'mcEff']
    })

    def get_variable_name_pretty(variableLabel):
        variables = config.variables()
        return variables.get(variableLabel, {}).get('pretty', variableLabel)

    # create output histograms
    nVars = len(variableLabels)
    if nVars == 1:
        THX = ROOT.TH1F
    elif nVars == 2:
        THX = ROOT.TH2F
    elif nVars == 3:
        THX = ROOT.TH3F
    else:
        raise NotImplementedError(
            'More than 3 dimensions are not supported for scale factors')

    hargs = [extEffName, extEffName]
    for variableLabel in variableLabels:
        hargs += [len(binning[variableLabel]) - 1,
                  array('d', binning[variableLabel])]
    hist = THX(*hargs)
    axes = [hist.GetXaxis(), hist.GetYaxis(), hist.GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):
        axes[vi].SetTitle(get_variable_name_pretty(variableLabel))
    if nVars == 1:
        hist.GetYaxis().SetTitle('Scalefactor')
    if nVars == 2:
        hist.SetOption('colz')
        hist.GetZaxis().SetTitle('Scalefactor')
    hist_stat = hist.Clone(extEffName + '_stat')
    hist_syst = hist.Clone(extEffName + '_syst')
    histList_syst = {'combined': hist.Clone(effName + '_combinedSyst')}

    hist_dataEff = hist.Clone(extEffName + '_efficiencyData')
    if nVars == 1:
        hist_dataEff.GetYaxis().SetTitle('Efficiency')
    if nVars == 2:
        hist_dataEff.GetZaxis().SetTitle('Efficiency')
    hist_dataEff_stat = hist_dataEff.Clone(
        extEffName + '_efficiencyData_stat')
    hist_dataEff_syst = hist_dataEff.Clone(
        extEffName + '_efficiencyData_syst')
    histList_dataEff_syst = {
        'combined': hist_dataEff.Clone(
            effName + '_efficiencyData_combinedSyst')
    }

    hist_mcEff = hist_dataEff.Clone(extEffName + '_efficiencyMC')
    hist_mcEff_stat = hist_dataEff.Clone(extEffName + '_efficiencyMC_stat')
    hist_mcEff_syst = hist_dataEff.Clone(extEffName + '_efficiencyMC_syst')
    histList_mcEff_syst = {
        'combined': hist_dataEff.Clone(
            effName + '_efficiencyMC_combinedSyst')
    }

    # the individual systematics; the efficiency ones are cloned from the
    # efficiency histogram so that they carry the 'Efficiency' axis title
    # like their 'combined' counterpart (they used to be cloned from the
    # scale factor histogram)
    for iSyst in itertools.chain(systList['SF']['fitTypes'],
                                 systList['SF']['shiftTypes']):
        histList_syst[iSyst] = hist.Clone(effName + '_' + iSyst)
    for iSyst in itertools.chain(systList['dataEff']['fitTypes'],
                                 systList['dataEff']['shiftTypes']):
        histList_dataEff_syst[iSyst] = hist_dataEff.Clone(
            effName + '_' + iSyst)
    for iSyst in itertools.chain(systList['mcEff']['fitTypes'],
                                 systList['mcEff']['shiftTypes']):
        histList_mcEff_syst[iSyst] = hist_dataEff.Clone(
            effName + '_' + iSyst)

    varName = get_variables_name(variableLabels)

    def set_bin(h, index, val, err):
        # a negative error means "leave the bin error untouched"
        coords = list(index)
        h.SetBinContent(*(coords + [val]))
        if err >= 0:
            h.SetBinError(*(coords + [err]))

    # iterate through the bin indices
    # this does nested for loops of the N-D binning (e.g. pt, eta)
    # binning starts at 1 (0 is underflow), same as ROOT
    indices = [list(range(1, len(binning[variableLabel])))
               for variableLabel in variableLabels]
    output = {effName: {varName: {}}}
    for index in itertools.product(*indices):
        binName = get_full_name(num, denom, variableLabels, index)
        subVarKeys = [
            '{}:[{},{}]'.format(
                variableLabels[i],
                binning[variableLabels[i]][ind - 1],
                binning[variableLabels[i]][ind])
            for i, ind in enumerate(index)
        ]
        _out = output[effName][varName]

        # add binning definitions
        _out['binning'] = [
            {
                'variable': vl,
                'binning': binning[vl].tolist(),
            }
            for vl in variableLabels
        ]

        # descend to (creating if needed) the nested entry of this bin
        for subVarKey in subVarKeys:
            if subVarKey not in _out:
                _out[subVarKey] = {}
            _out = _out[subVarKey]

        # the fitted distributions
        fitType = 'Nominal'
        dataFNameFit = os.path.join(baseDir, 'fits_data',
                                    particle, probe, resonance, era,
                                    fitType, effName, binName + '.root')
        sf, sf_stat, dataEff, dataStat, mcEff, mcStat = getSF(
            binName, dataFNameFit)
        sf_syst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                          systList['SF']['fitTypes'],
                          systList['SF']['shiftTypes'])
        dataSyst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                           systList['dataEff']['fitTypes'],
                           systList['dataEff']['shiftTypes'])
        mcSyst = getSyst(binName, dataFNameFit, dataEff, mcEff,
                         systList['mcEff']['fitTypes'],
                         systList['mcEff']['shiftTypes'])
        sf_err = (sf_stat**2 + sf_syst['combined']**2)**0.5
        dataErr = (dataStat**2 + dataSyst['combined']**2)**0.5
        mcErr = (mcStat**2 + mcSyst['combined']**2)**0.5
        _out['value'] = sf
        _out['stat'] = sf_stat
        _out['syst'] = sf_syst['combined']
        for s in itertools.chain(systList['SF']['fitTypes'],
                                 systList['SF']['shiftTypes']):
            _out[s] = sf_syst[s]

        set_bin(hist, index, sf, sf_err)
        set_bin(hist_stat, index, sf, sf_stat)
        set_bin(hist_syst, index, sf, sf_syst['combined'])
        for iKey in sf_syst.keys():
            set_bin(histList_syst[iKey], index, sf_syst[iKey], -1)
        set_bin(hist_dataEff, index, dataEff, dataErr)
        set_bin(hist_dataEff_stat, index, dataEff, dataStat)
        set_bin(hist_dataEff_syst, index, dataEff, dataSyst['combined'])
        for iKey in dataSyst.keys():
            set_bin(histList_dataEff_syst[iKey], index, dataSyst[iKey], -1)
        set_bin(hist_mcEff, index, mcEff, mcErr)
        set_bin(hist_mcEff_stat, index, mcEff, mcStat)
        set_bin(hist_mcEff_syst, index, mcEff, mcSyst['combined'])
        for iKey in mcSyst.keys():
            set_bin(histList_mcEff_syst[iKey], index, mcSyst[iKey], -1)

    hists[extEffName] = hist
    hists[extEffName + '_stat'] = hist_stat
    hists[extEffName + '_syst'] = hist_syst
    hists[extEffName + '_efficiencyData'] = hist_dataEff
    hists[extEffName + '_efficiencyData_stat'] = hist_dataEff_stat
    hists[extEffName + '_efficiencyData_syst'] = hist_dataEff_syst
    hists[extEffName + '_efficiencyMC'] = hist_mcEff
    hists[extEffName + '_efficiencyMC_stat'] = hist_mcEff_stat
    hists[extEffName + '_efficiencyMC_syst'] = hist_mcEff_syst
    for iKey in histList_syst:
        hists[effName + '_' + iKey] = histList_syst[iKey]
    for iKey in histList_dataEff_syst:
        hists[effName + '_efficiencyData_' + iKey] = \
            histList_dataEff_syst[iKey]
    for iKey in histList_mcEff_syst:
        hists[effName + '_efficiencyMC_' + iKey] = histList_mcEff_syst[iKey]

    # save the efficiency
    plotDir = os.path.join(baseDir, 'plots',
                           particle, probe, resonance, era,
                           effName, 'efficiency')
    os.makedirs(plotDir, exist_ok=True)

    effDir = os.path.join(baseDir, 'efficiencies',
                          particle, probe, resonance, era,
                          effName)
    os.makedirs(effDir, exist_ok=True)
    effPath = os.path.join(effDir, extEffName)

    # JSON format
    with open('{}.json'.format(effPath), 'w') as f:
        f.write(json.dumps(output, indent=4, sort_keys=True))

    # ROOT histogram format
    tfile = ROOT.TFile.Open('{}.root'.format(effPath), 'recreate')
    try:
        for h in sorted(hists):
            hists[h].Write(h)

            if nVars == 2:
                cName = 'c' + h
                canvas = ROOT.TCanvas(cName, cName, 1000, 800)
                ROOT.gStyle.SetPaintTextFormat("5.3f")
                canvas.SetRightMargin(0.24)
                hists[h].Draw('colz text')
                plotPath = os.path.join(plotDir, h)
                canvas.Modified()
                canvas.Update()
                canvas.Print('{}.png'.format(plotPath))
                canvas.Print('{}.pdf'.format(plotPath))
    finally:
        # make sure the output file is closed even if plotting fails
        tfile.Close()

    # gets a graph projection of an ND histogram for a given axis
    # with axis index (ie x,y,z = 0,1,2) and other dimensions ind
    def get_graph(hist, axis, axis_ind, *ind):
        ind = list(ind)
        ni = axis.GetNbins()

        def coords(i):
            # bin indices of all axes, with bin i+1 on the projected axis
            return ind[:axis_ind] + [i + 1] + ind[axis_ind:]

        xvals = [axis.GetBinCenter(i + 1) for i in range(ni)]
        xvals_errLow = [xvals[i] - axis.GetBinLowEdge(i + 1)
                        for i in range(ni)]
        xvals_errHigh = [axis.GetBinUpEdge(i + 1) - xvals[i]
                         for i in range(ni)]
        yvals = [hist.GetBinContent(*coords(i)) for i in range(ni)]
        yvals_err = [hist.GetBinError(*coords(i)) for i in range(ni)]
        graph = ROOT.TGraphAsymmErrors(
            ni,
            array('d', xvals),
            array('d', yvals),
            array('d', xvals_errLow),
            array('d', xvals_errHigh),
            array('d', yvals_err),
            array('d', yvals_err),
        )
        return graph

    def draw_data_mc(graph_data, graph_mc, cName, cTitle, xlabel, plotName,
                     xRange=None, conditions=None):
        # overlay the data and simulation graphs and print png and pdf;
        # ``conditions`` are text lines describing the fixed other bins
        graph_data.SetLineColor(ROOT.kBlack)
        graph_data.SetMarkerColor(ROOT.kBlack)
        graph_mc.SetLineColor(ROOT.kBlue)
        graph_mc.SetMarkerColor(ROOT.kBlue)
        mg = ROOT.TMultiGraph()
        mg.Add(graph_data)
        mg.Add(graph_mc)

        canvas = ROOT.TCanvas(cName, cTitle, 800, 800)
        mg.Draw('AP0')
        mg.GetXaxis().SetTitle(xlabel)
        if xRange:
            mg.GetXaxis().SetRangeUser(*xRange)
        mg.GetYaxis().SetTitle('Efficiency')
        mg.GetYaxis().SetRangeUser(0.8, 1.10)
        legend = ROOT.TLegend(0.5, 0.70, 0.92, 0.92)
        legend.SetTextFont(42)
        legend.SetBorderSize(0)
        legend.SetFillColor(0)
        legend.AddEntry(graph_data, 'Data', 'l')
        legend.AddEntry(graph_mc, 'Simulation', 'l')
        legend.SetHeader('{} / {}'.format(num, denom))
        legend.Draw()

        if conditions:
            nother = len(conditions)
            dims = [0.18, 0.84 - nother * 0.04 - 0.02, 0.35, 0.84]
            text = ROOT.TPaveText(*dims + ['NB NDC'])
            text.SetTextFont(42)
            text.SetBorderSize(0)
            text.SetFillColor(0)
            text.SetTextAlign(11)
            text.SetTextSize(0.03)
            for rtext in conditions:
                text.AddText(rtext)
            text.Draw()

        CMS_lumi.cmsText = 'CMS'
        CMS_lumi.writeExtraText = True
        CMS_lumi.extraText = 'Preliminary'
        CMS_lumi.lumi_13TeV = "%0.1f fb^{-1}" % (lumi)
        CMS_lumi.CMS_lumi(canvas, 4, 11)

        os.makedirs(plotDir, exist_ok=True)
        plotPath = os.path.join(plotDir, plotName)
        canvas.Print('{}.png'.format(plotPath))
        canvas.Print('{}.pdf'.format(plotPath))

    # plot the efficiencies
    # enumerate over the axis/variable to plot
    axes = [hists[extEffName].GetXaxis(),
            hists[extEffName].GetYaxis(),
            hists[extEffName].GetZaxis()]
    for vi, variableLabel in enumerate(variableLabels):

        # iterate over the other axis indices
        otherVariableLabels = [ovl for ovl in variableLabels
                               if ovl != variableLabel]
        otherVariableIndices = [ovi for ovi, ovl in enumerate(variableLabels)
                                if ovl != variableLabel]
        indices = [list(range(1, len(binning[vl])))
                   for vl in otherVariableLabels]
        xlabel = get_variable_name_pretty(variableLabel)

        if indices:
            for index in itertools.product(*indices):
                graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                       axes[vi], vi, *index)
                graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                     axes[vi], vi, *index)
                cName = 'c' + extEffName + '_'.join([str(i) for i in index])\
                    + variableLabel
                xRange = [axes[vi].GetBinLowEdge(1),
                          axes[vi].GetBinUpEdge(axes[vi].GetNbins())]

                # describe the bin of the variables that are not plotted
                conditions = []
                for novi, (ovi, ovl) in enumerate(
                        zip(otherVariableIndices, otherVariableLabels)):
                    xlow = axes[ovi].GetBinLowEdge(index[novi])
                    xhigh = axes[ovi].GetBinUpEdge(index[novi])
                    conditions.append('{} < {} < {}'.format(
                        xlow, get_variable_name_pretty(ovl), xhigh))

                otherVariableLabel = get_bin_name(otherVariableLabels, index)
                plotName = '{}_{}_vs_{}'.format(effName,
                                                otherVariableLabel,
                                                variableLabel)
                draw_data_mc(graph_data, graph_mc, cName, cName,
                             xlabel, plotName,
                             xRange=xRange, conditions=conditions)

        # if no indices, easier, just itself
        else:
            graph_data = get_graph(hists[extEffName + '_efficiencyData'],
                                   axes[vi], vi)
            graph_mc = get_graph(hists[extEffName + '_efficiencyMC'],
                                 axes[vi], vi)
            plotName = '{}_vs_{}'.format(effName, variableLabel)
            draw_data_mc(graph_data, graph_mc, 'c' + extEffName, 'c',
                         xlabel, plotName)
def run_conversion(spark, particle, probe, resonance, era, subEra,
                   config, shift='Nominal', **kwargs):
    """Flatten the tag-and-probe ntuples into pass/fail histograms.

    The input is loaded into a Spark dataframe, extended with the configured
    definitions, filtered with the configured selection, weighted and
    binned.  For every (numerator, denominator, bin variables) combination
    the weighted yields are grouped per bin and written as ``<bin>_Pass`` /
    ``<bin>_Fail`` histograms of the fit variable to
    ``[<baseDir>/]flat/<particle>/<probe>/<resonance>/<era>/<subEra>/
    [<shift>/]<extended efficiency name>.root``.  For the sub eras
    ``DY_madgraph`` and ``DY_powheg`` the same histograms are also produced
    for the generator-level fit variable with a ``_Gen`` suffix.

    Args:
        spark: Spark session used to read the input.
        particle, probe, resonance, era, subEra: identify the sample.
        config: configuration object providing ``efficiencies()``,
            ``definitions()``, ``selection()``, ``fitVariable()``,
            ``fitVariableGen()``, ``binVariables()``, ``binning()`` and
            ``variables()``.
        shift: name of the shift; if non-empty it is appended to the output
            directory and forwarded to ``get_weighted_dataframe``.
        **kwargs: optional ``numerator`` / ``denominator`` (only process the
            listed labels when non-empty) and ``baseDir`` (output prefix).
    """
    _numerator = kwargs.pop('numerator', [])
    _denominator = kwargs.pop('denominator', [])
    _baseDir = kwargs.pop('baseDir', '')

    testing = False

    print('Running conversion for', resonance, era, subEra, shift)

    if useParquet:
        fnames = list(registry.parquet(
            particle, probe, resonance, era, subEra))
    else:
        fnames = registry.root(particle, probe, resonance, era, subEra)
        fnames = ['root://eoscms.cern.ch/'+f for f in fnames]

    jobPath = os.path.join(particle, probe, resonance, era, subEra)
    if shift:
        jobPath = os.path.join(jobPath, shift)
    if testing:
        jobPath = os.path.join('testing', jobPath)
    else:
        jobPath = os.path.join('flat', jobPath)
    if _baseDir:
        jobPath = os.path.join(_baseDir, jobPath)
    os.makedirs(jobPath, exist_ok=True)

    doGen = subEra in ['DY_madgraph', 'DY_powheg']

    # default numerator/denominator defintions
    efficiencies = config.efficiencies()

    # get the dataframe
    if useParquet:
        print('Loading parquet files:', fnames)
        if isinstance(fnames, list):
            baseDF = spark.read.parquet(*fnames)
        else:
            baseDF = spark.read.parquet(fnames)
    else:
        treename = registry.treename(particle, probe, resonance, era, subEra)
        baseDF = spark.read.format("root")\
                      .option('tree', treename)\
                      .load(fnames)

    # create the miniIsoaltion columns
    miniIsoDF = get_miniIso_dataframe(baseDF)

    # create the definitions columns
    definitions = config.definitions()
    defDF = miniIsoDF
    for d in definitions:
        defDF = defDF.withColumn(d, F.expr(definitions[d]))

    # select tags
    tagsDF = defDF.filter(config.selection())

    # build the weights (pileup for MC)
    weightedDF = get_weighted_dataframe(
        tagsDF, doGen, resonance, era, subEra, shift=shift)

    # create the binning structure
    fitVariable = config.fitVariable()
    binningSet = set([fitVariable])
    if doGen:
        fitVariableGen = config.fitVariableGen()
        binningSet = binningSet.union(set([fitVariableGen]))
    binVariables = config.binVariables()
    for bvs in binVariables:
        binningSet = binningSet.union(set(bvs))

    binning = config.binning()
    variables = config.variables()
    binnedDF = weightedDF
    for bName in binningSet:
        binnedDF = get_binned_dataframe(
            binnedDF, bName+"Bin",
            variables[bName]['variable'], binning[bName])

    # build the unrealized yield dataframes
    # they are binned in the ID, bin variables, and fit variable
    yields = {}
    yields_gen = {}

    for numLabel, denLabel in efficiencies:
        den = binnedDF.filter(denLabel)
        for binVars in binVariables:
            key = (numLabel, denLabel, tuple(binVars))
            yields[key] = den.groupBy(
                numLabel, *[b+'Bin' for b in
                            binVars+[fitVariable]])\
                .agg({'weight2': 'sum', 'weight': 'sum'})
            if doGen:
                yields_gen[key] = den.groupBy(
                    numLabel, *[b+'Bin' for b in
                                binVars+[fitVariableGen]])\
                    .agg({'weight2': 'sum', 'weight': 'sum'})

    def get_values(df, mLabel, edges, binValues):
        # select the rows matching all requested column values
        for k, v in binValues.items():
            df = df[df[k] == v]
        df = df.set_index(mLabel)
        # fill empty bins with 0
        # includes underflow and overflow in the ROOT numbering scheme
        # (0 is underflow, len(edges) is overflow), hence len(edges)+1
        # entries.  The size follows the edges of the variable that is
        # actually histogrammed instead of a hard-coded 'mass' binning.
        nEntries = len(edges) + 1
        values = pd.Series(np.zeros(nEntries))
        values[df.index] = df['sum(weight)']
        values = values.to_numpy()
        sumw2 = pd.Series(np.zeros(nEntries))
        if 'sum(weight2)' in df.columns:
            sumw2[df.index] = df['sum(weight2)']
        else:
            sumw2[df.index] = df['sum(weight)']  # no weights provided
        sumw2 = sumw2.to_numpy()
        return values, sumw2

    def get_hist(values, sumw2, edges, overflow=True):
        if overflow:
            # first and last entries are the underflow and overflow bins
            hist = TH1.from_numpy((values[1:-1], edges))
            hist[0] = values[0]
            hist[-1] = values[-1]
            hist._fSumw2 = sumw2
        else:
            hist = TH1.from_numpy((values, edges))
            hist._fSumw2[1:-1] = sumw2
        return hist

    def fill_pass_fail(hists, realized, num, den, binVars, fitVar, suffix):
        # add the '<bin>_Pass<suffix>' and '<bin>_Fail<suffix>' histograms
        # of ``fitVar`` for every bin of ``binVars`` to ``hists``
        edges = binning[fitVar]
        mLabel = fitVar + 'Bin'
        for bins in itertools.product(
                *[range(1, len(binning[b])) for b in binVars]):
            binname = get_full_name(num, den, binVars, bins)
            binargs = {b+'Bin': v for b, v in zip(binVars, bins)}
            for passed, tag in [(True, '_Pass'), (False, '_Fail')]:
                selection = {num: passed}
                selection.update(binargs)
                values, sumw2 = get_values(realized, mLabel,
                                           edges, selection)
                hists[binname + tag + suffix] = get_hist(
                    values, sumw2, edges)

    # realize each of the yield tables
    # then produce the histograms and saves them
    # this is the first time things are put into memory
    for num_den_binVars in yields:
        num, den, binVars = num_den_binVars
        if _numerator and num not in _numerator:
            continue
        if _denominator and den not in _denominator:
            continue
        extended_eff_name = get_extended_eff_name(num, den, binVars)

        eff_outname = f'{jobPath}/{extended_eff_name}.root'
        hists = {}

        print('Processing', eff_outname)
        realized = yields[num_den_binVars].toPandas()
        fill_pass_fail(hists, realized, num, den, binVars,
                       fitVariable, '')

        if doGen:
            realized = yields_gen[num_den_binVars].toPandas()
            fill_pass_fail(hists, realized, num, den, binVars,
                           fitVariableGen, '_Gen')

        with uproot.recreate(eff_outname) as f:
            for h, hist in sorted(hists.items()):
                f[h] = hist
def build_fit_jobs(particle, probe, resonance, era, config, **kwargs):
    """Build the list of fit jobs for every efficiency bin and fit shift.

    Each job is a tuple ``(outFName, inFName, binName, templateFName,
    plotDir, fitType, sample, shiftType)`` with ``sample`` being ``'data'``
    or ``'mc'``.  MC jobs are only created for the fit types
    ``'NominalOld'`` and ``'AltSigOld'``.

    Recognised keyword arguments (all optional):
        baseDir: prefix of all input/output paths.
        numerator, denominator: restrict to these labels when non-empty.
        fitType, shiftType: restrict to fit shifts contained in either
            collection when at least one of them is non-empty.
        sampleType: restrict to ``'data'`` and/or ``'mc'`` when non-empty.
        efficiencyBin: restrict to these bin names when non-empty.
        recover, recoverMode: with ``recover`` set and mode ``'simple'`` a
            job is only created if ``recover_simple(outFName)`` is true.

    Returns:
        The list of job tuples.
    """
    _baseDir = kwargs.pop('baseDir', '')
    _numerator = kwargs.pop('numerator', [])
    _denominator = kwargs.pop('denominator', [])
    _fitType = kwargs.pop('fitType', [])
    _shiftType = kwargs.pop('shiftType', [])
    _sampleType = kwargs.pop('sampleType', [])
    _efficiencyBin = kwargs.pop('efficiencyBin', [])
    _recover = kwargs.pop('recover', False)
    _recoverMode = kwargs.pop('recoverMode', 'simple')

    # an empty sample selection means "both"
    doData = True if not _sampleType else 'data' in _sampleType
    doMC = True if not _sampleType else 'mc' in _sampleType

    dataSubEra, mcSubEra = get_data_mc_sub_eras(resonance, era)

    def needs_processing(outFName):
        # outside of the simple recovery mode every job is (re)built
        if not (_recover and _recoverMode == 'simple'):
            return True
        return recover_simple(outFName)

    def jobs_for_bin(binName, effName, extEffName,
                     fitType, shiftType, inType, outType):
        # data and (for PDF based fits) MC jobs of one bin and fit shift
        flatFile = extEffName + '.root'
        fitFile = binName + '.root'
        found = []

        templateFName = os.path.join(_baseDir, 'flat',
                                     particle, probe, resonance, era,
                                     mcSubEra, inType, flatFile)

        outFName = os.path.join(_baseDir, 'fits_data',
                                particle, probe, resonance, era,
                                outType, effName, fitFile)
        inFName = os.path.join(_baseDir, 'flat',
                               particle, probe, resonance, era,
                               dataSubEra, inType, flatFile)
        plotDir = os.path.join(_baseDir, 'plots',
                               particle, probe, resonance, era,
                               'fits_data', outType, effName)
        if doData and needs_processing(outFName):
            found.append((outFName, inFName, binName, templateFName,
                          plotDir, fitType, 'data', shiftType))

        outFName = os.path.join(_baseDir, 'fits_mc',
                                particle, probe, resonance, era,
                                outType, effName, fitFile)
        inFName = os.path.join(_baseDir, 'flat',
                               particle, probe, resonance, era,
                               mcSubEra, inType, flatFile)
        plotDir = os.path.join(_baseDir, 'plots',
                               particle, probe, resonance, era,
                               'fits_mc', outType, effName)
        # there is no need to fit MC for templates
        # PDF based fits are:
        #   NominalOld, AltSigOld
        if doMC and needs_processing(outFName) and\
                fitType in ['NominalOld', 'AltSigOld']:
            found.append((outFName, inFName, binName, templateFName,
                          plotDir, fitType, 'mc', shiftType))

        return found

    jobs = []
    binning = config.binning()
    # iterate through the efficiencies
    for num, denom in config.efficiencies():
        if _numerator and num not in _numerator:
            continue
        if _denominator and denom not in _denominator:
            continue

        # iterate through the output binning structure
        for variableLabels in config.binVariables():
            # nested loops over the N-D binning (e.g. pt, eta),
            # bin numbers go from 1 to N
            binRanges = [range(1, len(binning[label]))
                         for label in variableLabels]
            for combination in itertools.product(*binRanges):
                index = list(combination)
                binName = get_full_name(num, denom, variableLabels, index)
                extEffName = get_extended_eff_name(num, denom,
                                                   variableLabels)
                effName = get_eff_name(num, denom)
                if _efficiencyBin and binName not in _efficiencyBin:
                    continue

                for fitShift in config.fitShifts():
                    # without any fit/shift selection everything is kept,
                    # otherwise the shift must be listed in one of them
                    selected = (
                        (not _fitType and not _shiftType)
                        or (_fitType and fitShift in _fitType)
                        or (_shiftType and fitShift in _shiftType)
                    )
                    if not selected:
                        continue
                    params = config.fitShift(fitShift)
                    jobs += jobs_for_bin(binName, effName, extEffName,
                                         params['fitType'],
                                         params['shiftType'],
                                         params['inType'],
                                         fitShift)

    return jobs