def distribution(data_, args, feat, pt_range, mass_range, title=None): """ Perform study of substructure variable distributions. Saves plot `figures/distribution_[feat].pdf` Arguments: data: Pandas data frame from which to read data. args: Namespace holding command-line arguments. feat: Feature for which to plot signal- and background distributions. """ # Select data if pt_range is not None: data = data_[(data_['pt'] > pt_range[0]) & (data_['pt'] < pt_range[1])] else: data = data_ pass if mass_range is not None: data = data[(data['m'] > mass_range[0]) & (data['m'] < mass_range[1])] pass # Define bins xmin = wpercentile(data[feat].values, 1, weights=data['weight_test'].values) xmax = wpercentile(data[feat].values, 99, weights=data['weight_test'].values) if feat == 'D2-k#minusNN': print "distribution: kNN feature '{}'".format(feat) xmin, xmax = -1., 2. elif feat.lower().startswith('d2'): print "distribution: D2 feature '{}'".format(feat) xmin, xmax = 0., 3. elif 'tau21' in feat.lower(): xmin, xmax = 0., 1. pass snap = 0.5 # Snap to nearest multiple in appropriate direction xmin = np.floor(xmin / snap) * snap xmax = np.ceil(xmax / snap) * snap bins = np.linspace(xmin, xmax, 50 + 1, endpoint=True) # Perform plotting c = plot(args, data, feat, bins, pt_range, mass_range) # Output mkdir('figures/distribution/') path = 'figures/distribution/distribution_{}{}{}.pdf'.format( standardise(feat), '__pT{:.0f}_{:.0f}'.format(pt_range[0], pt_range[1]) if pt_range is not None else '', '__mass{:.0f}_{:.0f}'.format( mass_range[0], mass_range[1]) if mass_range is not None else '') c.save(path=path) #this was actually missing, lol return c, args, path
def jetmass (data, args, feat, eff_sig=50):
    """
    Study the jet mass distribution before and after a substructure cut.

    The cut on `feat` is placed at the `eff_sig` signal efficiency. The
    drawing is delegated to `plot`. This function does not call `save`
    itself; it returns the intended output path
    `figures/jetmass_[feat]__eff_sig_[eff_sig].pdf`.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature on which the cut is imposed.
        eff_sig: Signal efficiency, in percent, at which to impose the cut.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the output path.
    """

    # Signal / background masks
    signal = data['signal'] == 1
    msk_bkg = ~signal

    # The percentile to evaluate depends on which side of the cut signal
    # populates
    low = signal_low(feat)
    percentile = eff_sig if low else 100 - eff_sig

    sig_values = data.loc[signal, feat].values
    sig_weights = data.loc[signal, 'weight_test'].values
    cut = wpercentile(sig_values, percentile, weights=sig_weights)

    # Events pass above the cut, unless signal sits at low values
    above = data[feat] > cut
    msk_pass = ~above if low else above

    # Perform plotting
    c = plot(data, args, feat, msk_pass, msk_bkg, eff_sig)

    # Output
    path = 'figures/jetmass_{}__eff_sig_{:d}.pdf'.format(
        standardise(feat), int(eff_sig))

    return c, args, path
def efficiency(data, args, feat, title=None):
    """
    Study the background efficiency versus jet mass for a set of inclusive
    background-efficiency cuts on `feat`.

    Saves the plot as `figures/efficiency_[feat].pdf`, or as
    `figures/[title]_efficiency_[feat].pdf` when `title` is given.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to study efficiencies.
        title: Optional prefix for the output file name.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the path the
        figure was saved to.
    """

    # Background selection and target inclusive efficiencies (in percent)
    msk = data['signal'] == 0
    effs = [5, 10, 20, 40, 80]

    bkg_values = data.loc[msk, feat].values
    bkg_mass = data.loc[msk, 'm'].values
    bkg_weights = data.loc[msk, 'weight_test'].values

    # One cut value per target efficiency
    cuts = [
        wpercentile(bkg_values,
                    eff if signal_low(feat) else 100 - eff,
                    weights=bkg_weights)
        for eff in effs
    ]

    # One pass-fraction-vs-mass profile per cut
    profiles = []
    for cut in cuts:

        # Pass mask, flipped when signal sits at low values
        above = data[feat] > cut
        msk_pass = ~above if signal_low(feat) else above

        profile = ROOT.TProfile('profile_{}_{}'.format(feat, cut), "",
                                len(MASSBINS) - 1, MASSBINS)

        # Columns: jet mass, pass flag
        M = np.vstack((bkg_mass, msk_pass[msk])).T
        root_numpy.fill_profile(profile, M, weights=bkg_weights)

        profiles.append(profile)

    # Perform plotting
    c = plot(args, data, feat, profiles, cuts, effs)

    # Output
    filename = '_efficiency_{}.pdf' if title is not None else 'efficiency_{}.pdf'
    filename = filename.format(standardise(feat))
    if title is not None:
        path = 'figures/' + title + filename
    else:
        path = 'figures/' + filename
    c.save(path=path)

    return c, args, path
def fill_profile(data):
    """
    Fill a ROOT.TH2F with the measured, weighted `100 - EFF` percentile of
    `VAR` in bins of the two variables in `VARS`.

    Bins holding 20 or fewer events get no percentile: they are left
    unfilled in the histogram and are dropped from the returned arrays.

    Arguments:
        data: Pandas data frame holding the `VARS`, `VAR` and
            `TotalEventWeight` columns.

    Returns:
        Tuple of the filled ROOT.TH2F and a tuple `(x, y, z)` of flat
        arrays with the bin centres, passed through `standardise`, and the
        percentile in each sufficiently populated bin.
    """

    # Define arrays
    shape = (AXIS[VARX][0], AXIS[VARY][0])
    bins = [
        np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1,
                    endpoint=True) for var in VARS
    ]
    x, y, z = (np.zeros(shape) for _ in range(3))

    # Create `profile` histogram
    profile = ROOT.TH2F('profile', "",
                        len(bins[0]) - 1, bins[0].flatten('C'),
                        len(bins[1]) - 1, bins[1].flatten('C'))

    # Fill profile
    for i, j in itertools.product(*map(range, shape)):

        # Bin edges in x and y
        edges = [axis_edges[idx:idx + 2]
                 for idx, axis_edges in zip([i, j], bins)]

        # Masks
        msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                for dim, var in enumerate(VARS)]
        msk = reduce(lambda lhs, rhs: lhs & rhs, msks)

        # Percentile; only computed with sufficient statistics
        perc = np.nan
        if np.sum(msk) > 20:
            perc = wpercentile(
                data=data.loc[msk, VAR].values,
                percents=100 - EFF,
                weights=data.loc[msk, 'TotalEventWeight'].values)

        x[i, j] = np.mean(edges[0])
        y[i, j] = np.mean(edges[1])
        z[i, j] = perc

        # Set bin content only for a valid percentile. NaN never compares
        # equal to anything, so `perc != np.nan` would always be True.
        if not np.isnan(perc):
            profile.SetBinContent(i + 1, j + 1, perc)

    # Normalise arrays
    x, y = standardise(x, y, rank=None)

    # Filter out NaNs
    msk = ~np.isnan(z)
    x, y, z = x[msk], y[msk], z[msk]

    return profile, (x, y, z)
def distribution(data_, args, feat, pt_range, mass_range):
    """
    Plot the distribution of a single substructure variable.

    Applies the optional pT- and mass selections, derives the histogram
    range from the data and delegates the drawing to `plot`. This function
    does not call `save` itself; it returns the intended output path
    `figures/distribution_[feat][__pT..][__mass..].pdf`.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to plot signal- and background
            distributions.
        pt_range: Pair of (exclusive) lower and upper bounds on the `pt`
            column, or None for no pT selection.
        mass_range: Pair of (exclusive) lower and upper bounds on the `m`
            column, or None for no mass selection.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the output path.
    """

    # Kinematic selection; both boundaries are exclusive
    data = data_
    if pt_range is not None:
        pt = data_['pt']
        data = data_[(pt > pt_range[0]) & (pt < pt_range[1])]
    if mass_range is not None:
        mass = data['m']
        data = data[(mass > mass_range[0]) & (mass < mass_range[1])]

    # Axis range: 1st to 99th percentile of the selected events
    values = data[feat].values
    weights = data['weight_test'].values
    xmin = wpercentile(values, 1, weights=weights)
    xmax = wpercentile(values, 99, weights=weights)

    # Widen the range outwards to the nearest multiple of `step`
    step = 0.5
    lower = np.floor(xmin / step) * step
    upper = np.ceil(xmax / step) * step
    bins = np.linspace(lower, upper, 50 + 1, endpoint=True)

    # Perform plotting
    c = plot(args, data, feat, bins, pt_range, mass_range)

    # Output
    name = standardise(feat)
    pt_tag = ''
    if pt_range is not None:
        pt_tag = '__pT{:.0f}_{:.0f}'.format(pt_range[0], pt_range[1])
    mass_tag = ''
    if mass_range is not None:
        mass_tag = '__mass{:.0f}_{:.0f}'.format(mass_range[0], mass_range[1])
    path = 'figures/distribution_{}{}{}.pdf'.format(name, pt_tag, mass_tag)

    return c, args, path
def fill_profile (data, variable, bg_eff, signal_above=False):
    """
    Fill a ROOT.TH2F with the measured, weighted `bg_eff` percentile of
    `variable` in bins of the two variables in `VARS`.

    Bins holding 20 or fewer events get no percentile: they are left
    unfilled in the histogram and are dropped from the returned arrays.

    Arguments:
        data: Pandas data frame holding the `VARS`, `variable` and
            `weight_test` columns.
        variable: Name of the column whose percentile is profiled.
        bg_eff: Percentile, in percent, to evaluate in each bin.
        signal_above: If True, the percentile `100 - bg_eff` is used
            instead, so that the region above the cut is counted as signal.

    Returns:
        Tuple of the filled ROOT.TH2F and a tuple `(x, y, z)` of flat
        arrays with the bin centres, passed through `standardise`, and the
        percentile in each sufficiently populated bin.
    """

    # Ensures that the region above the cut is counted as signal, not below
    if signal_above:
        bg_eff = 100. - bg_eff

    # Define arrays
    shape = (AXIS[VARX][0], AXIS[VARY][0])
    bins = [
        np.linspace(AXIS[var][1], AXIS[var][2], AXIS[var][0] + 1,
                    endpoint=True) for var in VARS
    ]
    x, y, z = (np.zeros(shape) for _ in range(3))

    # Create `profile` histogram
    profile = ROOT.TH2F('profile', "",
                        len(bins[0]) - 1, bins[0].flatten('C'),
                        len(bins[1]) - 1, bins[1].flatten('C'))

    # Fill profile
    for i, j in itertools.product(*map(range, shape)):

        # Bin edges in x and y
        edges = [axis_edges[idx:idx + 2]
                 for idx, axis_edges in zip([i, j], bins)]

        # Masks
        msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                for dim, var in enumerate(VARS)]
        msk = reduce(lambda lhs, rhs: lhs & rhs, msks)

        # Percentile; only computed with sufficient statistics
        perc = np.nan
        if np.sum(msk) > 20:
            perc = wpercentile(data=data.loc[msk, variable].values,
                               percents=bg_eff,
                               weights=data.loc[msk, 'weight_test'].values)

        x[i, j] = np.mean(edges[0])
        y[i, j] = np.mean(edges[1])
        z[i, j] = perc

        # Set bin content only for a valid percentile. NaN never compares
        # equal to anything, so `perc != np.nan` would always be True.
        if not np.isnan(perc):
            profile.SetBinContent(i + 1, j + 1, perc)

    # Normalise arrays
    x, y = standardise(x, y)

    # Filter out NaNs
    msk = ~np.isnan(z)
    x, y, z = x[msk], y[msk], z[msk]

    return profile, (x, y, z)
def jetmasscomparison(data, args, features, eff_sig=50):
    """
    Compare jet mass distributions before and after a substructure cut for
    several taggers.

    For each feature the cut is placed at the `eff_sig` signal efficiency.
    The drawing is delegated to `plot` (combined figure) and
    `plot_individual` (one figure per feature). This function does not
    call `save` itself; it returns the intended output path
    `figures/jetmasscomparison__eff_sig_[eff_sig].pdf`.

    Arguments:
        data: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        features: Features on which cuts are imposed.
        eff_sig: Signal efficiency, in percent, at which to impose the cut.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the output path.
    """

    signal = data['signal'] == 1

    # Pass mask for each feature
    msks_pass = {}
    for feat in features:
        # The percentile to evaluate depends on which side of the cut
        # signal populates
        percentile = eff_sig if signal_low(feat) else 100 - eff_sig
        sig_values = data.loc[signal, feat].values
        sig_weights = data.loc[signal, 'weight_test'].values
        cut = wpercentile(sig_values, percentile, weights=sig_weights)

        # Events pass above the cut, unless signal sits at low values
        above = data[feat] > cut
        msks_pass[feat] = ~above if signal_low(feat) else above

    # Perform plotting
    c = plot(data, args, features, msks_pass, eff_sig)

    # Perform plotting on individual figures
    plot_individual(data, args, features, msks_pass, eff_sig)

    # Output
    path = 'figures/jetmasscomparison__eff_sig_{:d}.pdf'.format(int(eff_sig))

    return c, args, path
def main(args): # ... # Load data data_, features, _ = load_data(args.input + 'data.h5', train=True) for pt_bin in [(200., 500.), (500., 1000.)]: # Impose pT-cut data = data_[(data_['pt'] >= pt_bin[0]) & (data_['pt'] < pt_bin[1])] var = 'Tau21' msk_sig = (data['signal'] == 1) x = data[var].values m = data['m'].values w = data['weight_test'].values # Get cut value cut = wpercentile(x[msk_sig], 50., weights=w) print "Cut value: {:.2f}".format(cut) # Discard signal x = x[~msk_sig] m = m[~msk_sig] w = w[~msk_sig] # Get pass mask msk_pass = x < cut print "Background efficiency: {:.1f}%".format( 100. * w[msk_pass].sum() / w.sum()) # Canvas offset = 0.06 margin = 0.3 # @NOTE # A = Height of pad 0 # B = Height of pads 1,2 # C = Height of pad 3 # --> # A = 0.5 # # (1. - 2 * offset) * B = (1. - 2*offset - margin) * C # ==> # B = C * (1. - 2*offset - margin) / (1. - 2 * offset) # ==> # B = C * (1 - margin / (1. - 2 * offset)) # # A + 2 * B + C = 1 # ==> # A + 2 * C * (1 - margin / (1. - 2 * offset)) + C = 1 # ==> # C = (1 - A) / (1 + 2 * (1 - margin / (1. - 2 * offset))) A = 0.5 C = (1 - A) / (1 + 2 * (1 - margin / (1. - 2 * offset))) B = C * (1 - margin / (1. 
- 2 * offset)) c = rp.canvas(batch=True, num_pads=4, fraction=(A, B, B, C), size=(600, 700)) # Set pad margins c.pad(0)._bare().SetBottomMargin(offset) c.pad(1)._bare().SetTopMargin(offset) c.pad(1)._bare().SetBottomMargin(offset) c.pad(2)._bare().SetTopMargin(offset) c.pad(2)._bare().SetBottomMargin(offset) c.pad(3)._bare().SetTopMargin(offset) c.pad(3)._bare().SetBottomMargin(offset + margin) # Styling HISTSTYLE[True]['label'] = 'Passing cut, #it{{P}}'.format( latex(var, ROOT=True)) HISTSTYLE[False]['label'] = 'Failing cut, #it{{F}}'.format( latex(var, ROOT=True)) # Histograms F = c.hist(m[~msk_pass], bins=MASSBINS, weights=w[~msk_pass], normalise=True, **HISTSTYLE[False]) P = c.hist(m[msk_pass], bins=MASSBINS, weights=w[msk_pass], normalise=True, **HISTSTYLE[True]) P, F = map(root_numpy.hist2array, [P, F]) M = (P + F) / 2 c.hist(M, bins=MASSBINS, normalise=True, linewidth=3, linecolor=ROOT.kViolet, linestyle=2, label='Average, #it{M}') # Compute divergences KL_PM = -P * np.log2(M / P) KL_FM = -F * np.log2(M / F) JSD = (KL_PM + KL_FM) / 2. JSDsum = np.cumsum(JSD) opts = dict(bins=MASSBINS, fillcolor=ROOT.kGray, alpha=0.5) # Draw divergences c.pad(1).hist(KL_PM, **opts) c.pad(1).ylim(-0.12, 0.05) c.pad(1).yline(0.) c.pad(2).hist(KL_FM, **opts) c.pad(2).ylim(-0.05, 0.12) c.pad(2).yline(0.) c.pad(3).hist(JSD, **opts) c.pad(3).ylim(0., 0.03) c.pad(3).yline(0.) o = rp.overlay(c.pad(3), color=ROOT.kViolet, ndiv=502) o.hist(JSDsum, bins=MASSBINS, linecolor=ROOT.kViolet) o.label("#sum_{i #leq n} JSD(P #parallel F)") o.lim(0, 0.2) #o._update_overlay() # Styling axes c.pad(0)._xaxis().SetTitleOffset(999.) c.pad(1)._xaxis().SetTitleOffset(999.) c.pad(2)._xaxis().SetTitleOffset(999.) c.pad(3)._xaxis().SetTitleOffset(5.) c.pad(0)._xaxis().SetLabelOffset(999.) c.pad(1)._xaxis().SetLabelOffset(999.) c.pad(2)._xaxis().SetLabelOffset(999.) 
c.pad(0)._yaxis().SetNdivisions(505) c.pad(1)._yaxis().SetNdivisions(502) c.pad(2)._yaxis().SetNdivisions(502) c.pad(3)._yaxis().SetNdivisions(502) c.pad(0).ylim(0, 0.20) c.pad(0).cd() c.pad(0)._get_first_primitive().Draw('SAME AXIS') # Decorations c.text(TEXT + [ "Multijets, training dataset", "Cut on {:s} at #varepsilon_{{sig}}^{{rel}} = 50%".format( latex(var, ROOT=True)), "p_{{T}} #in [{:.0f}, {:.0f}] GeV".format(*pt_bin) ], qualifier='Simulation Internal') c.legend(width=0.25) c.xlabel("Large-#it{R} jet mass [GeV]") c.ylabel("Fraction of jets") c.pad(1).ylabel('KL(P #parallel M)') c.pad(2).ylabel('KL(F #parallel M)') c.pad(3).ylabel('JSD(P #parallel F)') # Save c.save('figures/massdecorrelationmetric_{:s}__pT{:.0f}_{:.0f}GeV.pdf'. format(var, *pt_bin)) pass return 0
def jetmasscomparison(data_, args, features, pt_range, eff_sig=50, title=None):
    """
    Compare jet mass distributions before and after a substructure cut for
    several taggers, optionally within a pT range.

    For each feature the cut is placed at the `eff_sig` signal efficiency.
    The drawing is delegated to `plot` (combined figure) and
    `plot_individual` (one figure per feature). This function does not
    call `save` itself; it returns the intended output path
    `figures/[title_]jetmasscomparison[_pT..to..]__eff_sig_[eff_sig].pdf`.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        features: Features on which cuts are imposed.
        pt_range: Pair of (exclusive) lower and upper bounds on the `pt`
            column, or None for no pT selection.
        eff_sig: Signal efficiency, in percent, at which to impose the cut.
        title: Optional prefix for the output file name; also forwarded to
            `plot_individual`.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the output path.
    """

    # Select pT-range; both boundaries are exclusive
    data = data_
    if pt_range is not None:
        pt = data_['pt']
        data = data_[(pt > pt_range[0]) & (pt < pt_range[1])]

    signal = data['signal'] == 1

    # Pass mask for each feature
    msks_pass = {}
    for feat in features:
        # The percentile to evaluate depends on which side of the cut
        # signal populates
        percentile = eff_sig if signal_low(feat) else 100 - eff_sig
        sig_values = data.loc[signal, feat].values
        sig_weights = data.loc[signal, 'weight_test'].values
        cut = wpercentile(sig_values, percentile, weights=sig_weights)

        # Events pass above the cut, unless signal sits at low values
        above = data[feat] > cut
        msks_pass[feat] = ~above if signal_low(feat) else above

    # Perform plotting
    c = plot(data, args, features, msks_pass, eff_sig, pt_range)

    # Perform plotting on individual figures
    plot_individual(data, args, features, msks_pass, eff_sig, pt_range, title)

    # Output: optional title prefix and optional pT tag around a fixed stem
    if title is None:
        prefix = 'figures/'
    else:
        prefix = 'figures/' + title + '_'

    if pt_range is not None:
        pt_tag = '_pT{}to{}'.format(pt_range[0], pt_range[1])
    else:
        pt_tag = ''

    path = prefix + 'jetmasscomparison' + pt_tag + \
        '__eff_sig_{:d}.pdf'.format(int(eff_sig))

    return c, args, path
def jsd(data_, args, feature_dict, pt_range, title=None):
    """
    Study the Jensen-Shannon divergence between the background jet mass
    spectra passing and failing a tagger cut, as a function of the
    inclusive background efficiency of that cut.

    Saves the plot as `figures/jsd[__pT..].pdf`, or as
    `figures/[title]_jsd[__pT..].pdf` when `title` is given.

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments; `args.show`
            controls whether the canvases run in batch mode.
        feature_dict: Dictionary mapping each base variable name to the
            suffixes to append to it; each `basevar + suffix` is one
            feature to study.
        pt_range: Pair of (exclusive) lower and upper bounds on the `pt`
            column, or None for no pT selection.
        title: Optional prefix for the output file name.

    Returns:
        Tuple of the object returned by `plot`, `args`, and the path the
        figure was saved to.
    """

    # Extract features and count appearance of each base variable
    features = []
    appearances = []
    for basevar, suffixes in feature_dict.items():
        features.extend(basevar + suffix for suffix in suffixes)
        appearances.append(len(suffixes))

    # Select data; both boundaries are exclusive
    data = data_
    if pt_range is not None:
        pt = data_['pt']
        data = data_[(pt > pt_range[0]) & (pt < pt_range[1])]

    # Create local histogram style dict. Each entry is copied so that
    # setting the labels below does not modify the shared `HISTSTYLE`.
    histstyle = {flag: dict(style) for flag, style in HISTSTYLE.items()}
    histstyle[True]['label'] = "Pass"
    histstyle[False]['label'] = "Fail"

    # Define common variables: background mask and the inclusive
    # efficiencies 5, 10, ..., 95 (in percent)
    msk = data['signal'] == 0
    effs = np.linspace(0, 100, 10 * 2, endpoint=False)[1:].astype(int)

    # Loop tagger features
    jsd_values = {feat: [] for feat in features}
    for feat in features:

        # Duplicate feature; already processed
        if jsd_values[feat]:
            continue

        # Define cuts
        bkg_values = data.loc[msk, feat].values
        bkg_weights = data.loc[msk, 'weight_test'].values
        cuts = [
            wpercentile(bkg_values,
                        eff if signal_low(feat) else 100 - eff,
                        weights=bkg_weights)
            for eff in effs
        ]

        # Compute JSD for successive cuts
        for cut in cuts:

            # Pass mask, flipped when signal sits at low values
            msk_pass = data[feat] > cut
            if signal_low(feat):
                msk_pass = ~msk_pass

            # The canvas is only used to build the two normalised
            # histograms; it is neither decorated nor saved.
            c = rp.canvas(batch=not args.show)
            h_pass = c.hist(
                data.loc[msk_pass & msk, 'm'].values,
                bins=MASSBINS,
                weights=data.loc[msk_pass & msk, 'weight_test'].values,
                normalise=True, **histstyle[True])
            h_fail = c.hist(
                data.loc[~msk_pass & msk, 'm'].values,
                bins=MASSBINS,
                weights=data.loc[~msk_pass & msk, 'weight_test'].values,
                normalise=True, **histstyle[False])

            # Convert to numpy arrays
            p = root_numpy.hist2array(h_pass)
            f = root_numpy.hist2array(h_fail)

            # Compute Jensen-Shannon divergence
            jsd_values[feat].append(JSD(p, f, base=2))

    # Compute meaningful limit on JSD, on a sigmoid-spaced efficiency grid
    grid = 1. / (1. + np.exp(-np.linspace(-5, 5, 20 + 1, endpoint=True)))
    jsd_limits = []
    for eff in grid:
        limits = jsd_limit(data[msk], eff, num_bootstrap=5)
        jsd_limits.append((eff, np.mean(limits), np.std(limits)))

    # Perform plotting
    c = plot(args, data, effs, jsd_values, jsd_limits, features, pt_range,
             appearances)

    # Output
    pt_tag = '' if pt_range is None else '__pT{:.0f}_{:.0f}'.format(*pt_range)
    if title is None:
        path = 'figures/jsd{}.pdf'.format(pt_tag)
    else:
        path = 'figures/' + title + '_jsd{}.pdf'.format(pt_tag)
    c.save(path=path)

    return c, args, path
def fill_profile_1D(data): """Fill ROOT.TH2F with the measured, weighted values of the `EFF`-percentile of the background `VAR`. """ # Define arrays #bins = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0] + 1, endpoint=True) # Make variable sized bins #bins = np.linspace(AXIS[VARX][1], 4000, 40, endpoint=True) #bins = np.append(bins, [4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000]) # Build bin structure with at least ?50 event in each bin # and bin widths of at least AXIS[VARX][0] minBinSize = 100 #AXIS[VARX][0] binEdge = AXIS[VARX][2] binList = [] binList.append(binEdge) k = 1 while binEdge - k * minBinSize > AXIS[VARX][1]: msk = (data[VARX] > binEdge - k * minBinSize) & (data[VARX] <= binEdge) if (np.sum(msk) * EFF / 100. > MIN_STAT): binEdge -= k * minBinSize binList.append(binEdge) k = 1 else: k += 1 binList.append(AXIS[VARX][1]) binList.reverse() bins = np.array(binList) print "Bins: ", len(bins), bins shape = len(bins) - 1 #AXIS[VARX][0] # x, y, e = (np.zeros(shape) for _ in range(3)) # Create `profile` histogram profile = ROOT.TH1F('profile', "", len(bins) - 1, bins) #if INPUT == "mc": # data.loc[:,'TotalEventWeight'] /= 139000000. # Fill profile for i in (range(shape)): # Masks msk = (data[VARX] > bins[i]) & (data[VARX] <= bins[i + 1]) # Percentile #perc = np.nan #if np.sum(msk) > 20: # Ensure sufficient statistics for meaningful percentile. Was 20 perc = wpercentile( data=data.loc[msk, VAR].values, percents=100 - EFF, weights=data.loc[msk, 'TotalEventWeight'].values) #wpercentile # pass x[i] = np.mean([bins[i], bins[i + 1]]) y[i] = perc if np.sum(msk) > 0: e[i] = np.sqrt(np.sum(msk)) / np.sum(msk) else: print "Bin ", i, " has np.sum(msk) < 20. Weird." e[i] = 0 # Set non-zero bin content if perc != np.nan: profile.SetBinContent(i + 1, perc) pass pass # Normalise array # x = standardise(x, rank=None) # Filter out NaNs msk = ~np.isnan(y) x, y, e = x[msk], y[msk], y[msk] return profile, (x, y, e)
# Test script: fits a k-nearest-neighbours regressor to the `100 - EFF`
# percentile of `VAR`, measured for background in 19 equal-width bins of
# `VARX`, and draws the fit evaluated on a fine grid.
# NOTE(review): this block is followed by a dangling triple quote, which
# suggests it may sit inside a disabled (string-quoted) region -- confirm
# before relying on it.
def main (args):

    # Definitions
    histstyle = dict(**HISTSTYLE)

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data('data/' + args.input) #, test=True)

    # outFile = ROOT.TFile.Open("figures/knn_jet_ungrtrk500_eff{}_data.root".format(knn_eff),"RECREATE")

    # Percentile target (in percent), profiled variable and binning variable
    EFF = 0.5
    VAR = 'jet_ungrtrk500'
    VARX = 'dijetmass'

    FIT_RANGE = (0, 6000) # Necessary?

    #eff_sig = 0.50
    #fpr, tpr, thresholds = roc_curve(data['signal'], data[kNN_basevar], sample_weight=data['weight'])
    #idx = np.argmin(np.abs(tpr - eff_sig))
    #print "Background acceptance @ {:.2f}% sig. eff.: {:.2f}% ({} > {:.2f})".format(eff_sig * 100., (fpr[idx]) * 100., kNN_basevar, thresholds[idx]) #changed from 1-fpr[idx]
    #print "Chosen target efficiency: {:.2f}%".format(kNN_eff)

    weight = 'weight'  # 'weight_test' / 'weight'

    # Coarse bin edges for the measurement and fine grid for the prediction
    bins_mjj = np.linspace(100, 8000, 20)
    fineBins = np.linspace(100, 8000, 7900)
    fineBinsRe = fineBins.reshape(-1,1)

    # Percentile of `VAR` for background events in each coarse bin; bins
    # with 20 or fewer events get 0 instead
    percs = []
    for i in range(1, len(bins_mjj)):

        msk = (data[VARX] > bins_mjj[i-1]) & (data[VARX] <= bins_mjj[i]) & (data['signal']==0)

        if np.sum(msk) > 20:   # Ensure sufficient statistics for meaningful percentile. Was 20
            percs.append( wpercentile(data=data.loc[msk, VAR].values, percents=100-EFF, weights=data.loc[msk, weight].values) )#wpercentile
        else:
            percs.append(0)

    print "Length of percs: ", len(percs), percs

    # Drop the last measurement and pair each remaining one with the upper
    # edge of its bin, so that X and percs have equal length
    percs = percs[0:-1]
    bins_mjj = bins_mjj[0:-1]

    X = bins_mjj.reshape(-1,1)
    X = X[1:len(bins_mjj)]

    print len(X), len(percs)

    # Fit parameters
    # NOTE(review): these three values are not used below; the regressor is
    # built with n_neighbors=5 and weights='distance', while the plot label
    # says "uniform".
    knn_neighbors = 2
    knn_weights = 'uniform'
    fit_deg = 1

    knn = KNeighborsRegressor(n_neighbors=5, weights='distance')
    y_knn = knn.fit(X, percs).predict(fineBinsRe)

    c = rp.canvas(batch=True)
    knnFit = c.plot(y_knn, bins=fineBins, linecolor=ROOT.kRed+2, linewidth=2, linestyle=1, label="knn fit, uniform", option='L')

    # The format string has no replacement field, so the arguments to
    # `format` are ignored
    c.save('figures/distributions/percentile_test.pdf'.format(EFF, args.input))

    # NOTE(review): `outFile` is only created in the commented-out line
    # above, so this raises a NameError unless it is defined elsewhere --
    # confirm.
    outFile.cd()
    knnFit.SetName("kNNfit")
    knnFit.Write()
    outFile.Close()
"""
def jetmasscomparison(data, args, features, eff_sig=25): """ Perform study of jet mass distributions before and after subtructure cut for different substructure taggers. Saves plot `figures/jetmasscomparison__eff_sig_[eff_sig].pdf` Arguments: data: Pandas data frame from which to read data. args: Namespace holding command-line arguments. features: Features for which to plot signal- and background distributions. eff_sig: Signal efficiency at which to impose cut. """ # Define masks and direction-dependent cut value msk_sig = data['sigType'] == 1 cuts, msks_pass = dict(), dict() lead_features = [] print "Features: ", features for feat in features: eff_cut = eff_sig if signal_low(feat) else 100 - eff_sig if (not 'lead' in feat) and (not 'sub' in feat): print "hej" cut = wpercentile(data.loc[msk_sig, feat].values, eff_cut, weights=data.loc[msk_sig, 'weight'].values) msk = (data[feat] > cut) fpr, tpr, thresholds = roc_curve(data['signal'], data[feat], sample_weight=data['weight']) idx = np.argmin(np.abs(tpr - eff_sig / 100.)) print "Pass criteria:", feat, " > ", cut print "Background acceptance @ {:.2f}% sig. eff.: {:.5f}% ({} > {:.2f})".format( eff_sig, (fpr[idx]) * 100., feat, thresholds[idx]) msks_pass[feat] = msk lead_features.append(feat) else: if 'lead' in feat: cut1 = wpercentile(data.loc[msk_sig, feat].values, eff_cut, weights=data.loc[msk_sig, 'weight'].values) msk1 = (data[feat] > cut1) fpr, tpr, thresholds = roc_curve(data['signal'], data[feat], sample_weight=data['weight']) idx = np.argmin(np.abs(tpr - eff_sig / 100.)) print "H Pass criteria:", feat, " > ", cut1 print "H Background acceptance @ {:.2f}% sig. 
eff.: {:.6f}% ({} > {:.2f})".format( eff_sig, (fpr[idx]) * 100., feat, thresholds[idx]) lead_features.append(feat) subfeat = feat.replace("lead", "sub") data1 = data[msk1] cut2 = wpercentile(data1.loc[msk_sig, subfeat].values, eff_cut, weights=data1.loc[msk_sig, 'weight'].values) fpr, tpr, thresholds = roc_curve(data1['signal'], data1[subfeat], sample_weight=data1['weight']) idx = np.argmin(np.abs(tpr - eff_sig / 100.)) idy = np.argmin(np.abs(thresholds - cut1)) print "H Pass criteria:", subfeat, " > ", cut2, idy, len( thresholds) print "H Background acceptance @ {:.5f}% sig. eff.: {:.5f}% ({} > {:.5f})".format( (tpr[idy]) * 100, (fpr[idy]) * 100., subfeat, thresholds[idy]) #msks_pass[feat]=(data[feat]>cut1) | (data[subfeat]>cut1) msks_pass[feat] = (data[feat] > cut1) & (data[subfeat] > cut1) # Ensure correct cut direction if signal_low(feat): msks_pass[feat] = ~msks_pass[feat] pass pass # Perform plotting #c = plot(data, args, features, msks_pass, eff_sig) # Perform plotting on individual figures c = plot_individual(data, args, lead_features, msks_pass, eff_sig) # Output path = 'figures/jetmasscomparison__eff_sig_{:d}_{}.pdf'.format( int(eff_sig), MODEL) path = 'figures/jetmasscomparison__eff_sig_{:d}_{}.eps'.format( int(eff_sig), MODEL) return c, args, path