Exemplo n.º 1
0
    def wrapper(*args, **kwargs):
        """
        Run the wrapped study, then optionally save and/or show its canvas.

        The wrapped callable `f` must return a `(canvas, args, path)` tuple.
        When the extension of `path` is shorter than four characters the
        canvas is saved as `.eps`, `.pdf` and `.C` next to each other;
        otherwise it is saved to `path` as given.
        """
        # Run study
        canvas, namespace, path = f(*args, **kwargs)

        # Save
        if namespace.save:
            # Everything up to the last '/' is the output directory
            mkdir(path.rpartition('/')[0])

            # Text before/after the last '.'; no '.' yields an empty stem
            stem, _, suffix = path.rpartition('.')
            if len(suffix) < 4:
                for extension in ('.eps', '.pdf', '.C'):
                    canvas.save(stem + extension)
            else:
                canvas.save(path)

        # Show
        if namespace.show:
            canvas.show()

        return
Exemplo n.º 2
0
def distribution(data_, args, feat, pt_range, mass_range, title=None):
    """
    Perform study of substructure variable distributions.

    Saves plot
    `figures/distribution/distribution_[feat][__pT<lo>_<hi>][__mass<lo>_<hi>].pdf`

    Arguments:
        data_: Pandas data frame from which to read data.
        args: Namespace holding command-line arguments.
        feat: Feature for which to plot signal- and background distributions.
        pt_range: Pair of (exclusive) bounds applied to the 'pt' column, or
            None for no selection.
        mass_range: Pair of (exclusive) bounds applied to the 'm' column, or
            None for no selection.
        title: Not used by this function.

    Returns:
        Tuple of (canvas, `args`, path of the saved figure).
    """

    # Select data
    data = data_
    if pt_range is not None:
        pt = data['pt']
        data = data[(pt > pt_range[0]) & (pt < pt_range[1])]

    if mass_range is not None:
        mass = data['m']
        data = data[(mass > mass_range[0]) & (mass < mass_range[1])]

    # Define bins: default to the weighted 1st and 99th percentiles...
    values = data[feat].values
    weights = data['weight_test'].values
    xmin = wpercentile(values, 1, weights=weights)
    xmax = wpercentile(values, 99, weights=weights)

    # ... unless the feature has a fixed, hard-coded axis range
    lowered = feat.lower()
    if feat == 'D2-k#minusNN':
        print("distribution: kNN feature '{}'".format(feat))
        xmin, xmax = -1., 2.
    elif lowered.startswith('d2'):
        print("distribution: D2  feature '{}'".format(feat))
        xmin, xmax = 0., 3.
    elif 'tau21' in lowered:
        xmin, xmax = 0., 1.

    # Snap outwards to the nearest multiple of `snap`
    snap = 0.5
    xmin = np.floor(xmin / snap) * snap
    xmax = np.ceil(xmax / snap) * snap

    bins = np.linspace(xmin, xmax, 50 + 1, endpoint=True)

    # Perform plotting
    c = plot(args, data, feat, bins, pt_range, mass_range)

    # Output
    mkdir('figures/distribution/')

    pt_tag = ''
    if pt_range is not None:
        pt_tag = '__pT{:.0f}_{:.0f}'.format(pt_range[0], pt_range[1])

    mass_tag = ''
    if mass_range is not None:
        mass_tag = '__mass{:.0f}_{:.0f}'.format(mass_range[0], mass_range[1])

    path = 'figures/distribution/distribution_{}{}{}.pdf'.format(
        standardise(feat), pt_tag, mass_tag)

    c.save(path=path)

    return c, args, path
Exemplo n.º 3
0
def plot2D (*argv):
    """
    Method for delegating 2D plotting.

    Draws the signal and background contours together with the DDT linear
    fit and the LDA decision boundary, and saves the result to
    `figures/ddt/ddt_<variable>_2d.pdf`.

    Arguments (positional, in this order):
        data: Unpacked but not used by this function.
        ddt: Fitted object providing `intercept_`, `offset_` and `coef_`.
        lda: Fitted object providing `intercept_` and `coef_`.
        contours: Pair of 2D histograms indexed as [background, signal].
        binsx: Bin edges along x; first and last entry set the axis range.
        binsy: Bin edges along y; first and last entry set the y-limits.
        variable: Variable identifier; selects the y-axis label and is
            inserted into the output file name.
    """

    # Unpack arguments
    data, ddt, lda, contours, binsx, binsy, variable = argv

    with TemporaryStyle() as style:

        # Style
        style.SetNumberContours(10)

        # Canvas
        c = rp.canvas(batch=True)

        # Axes: invisible histogram spanning the full x-range
        c.hist([binsy[0]], bins=[binsx[0], binsx[-1]], linestyle=0, linewidth=0)

        # Plotting contours
        for sig in [0, 1]:
            c.hist2d(contours[sig], linecolor=rp.colours[1 + 3 * sig], label="Signal" if sig else "Background", option='CONT3', legend_option='L')

        # Linear fit, drawn between two fixed x-positions
        x1, x2 = 1.5, 5.0
        intercept, coef = ddt.intercept_ + ddt.offset_, ddt.coef_
        y1 = intercept + x1 * coef
        y2 = intercept + x2 * coef
        c.plot([y1, y2], bins=[x1, x2], color=rp.colours[-1], label='DDT transform fit', linewidth=1, linestyle=1, option='L')

        # LDA decision boundary
        y1 = lda.intercept_ + x1 * lda.coef_
        y2 = lda.intercept_ + x2 * lda.coef_
        c.plot([y1, y2], bins=[x1, x2],  label='LDA boundary', linewidth=1, linestyle=2, option='L')

        # Decorations
        c.text(["#sqrt{s} = 13 TeV"], qualifier=QUALIFIER, ATLAS=False)
        c.legend()
        c.ylim(binsy[0], binsy[-1])
        c.xlabel("Large-#it{R} jet " + latex('rhoDDT', ROOT=True))

        # The y-axis label depends on the variable being decorrelated. This
        # chain used to be indented with a mix of tabs and spaces, which
        # Python 3 rejects; it is now indented with spaces only.
        if variable == VAR_TAU21:
            c.ylabel("Large-#it{R} jet " + latex('#tau_{21}',  ROOT=True))
        elif variable == VAR_N2:
            c.ylabel("Large-#it{R} jet " + latex('N_{2}',  ROOT=True))
        elif variable == VAR_DECDEEP:
            c.ylabel("Large-#it{R} jet " + latex('dec_deepWvsQCD',  ROOT=True))
        elif variable == VAR_DEEP:
            c.ylabel("Large-#it{R} jet " + latex('deepWvsQCD',  ROOT=True))

        # Save
        mkdir('figures/ddt')
        c.save('figures/ddt/ddt_{}_2d.pdf'.format(variable))

    return
Exemplo n.º 4
0
def plot(*argv):
    """
    Method for delegating plotting.

    Draws the evaluated points, markers for evaluations above the visible
    y-range, and the running best result, then saves the canvas to
    `figures/optimisation/optimisation_<experiment>.pdf`.

    Arguments (positional, in this order):
        experiment: Name inserted into the output file name.
        means: Sequence of evaluation means, one per step.
        graph: Graph object holding the evaluations, passed to `c.graph`.
        idx_improvements: Index (array) of the steps that improved the best
            result; used to index `best_mean` and `bins`.
        best_mean: Array of the best result as a function of step.
        bins: Array of step positions.
    """

    # Unpack arguments
    experiment, means, graph, idx_improvements, best_mean, bins = argv

    # Plot results
    c = rp.canvas(batch=True)
    ymin, ymax = 0.3, 1.0

    # Steps whose mean lies above the visible range are drawn as markers
    # close to the top of the frame. A list (not a lazy `map`/`filter`
    # object) is required for `np.ones_like` to work on Python 3 as well.
    oobx = [step for step, mean in enumerate(means) if mean > ymax]
    ooby = np.ones_like(oobx) * 0.96 * (ymax - ymin) + ymin

    # Plots
    c.graph(graph,
            markercolor=rp.colours[1],
            linecolor=rp.colours[1],
            markersize=0.7,
            option='AP',
            label='Evaluations',
            legend_option='PE')
    c.graph(ooby,
            bins=oobx,
            markercolor=rp.colours[1],
            markerstyle=22,
            option='P')
    c.graph(best_mean,
            bins=bins,
            linecolor=rp.colours[5],
            linewidth=2,
            option='L',
            label='Best result')
    c.graph(best_mean[idx_improvements],
            bins=bins[idx_improvements],
            markercolor=rp.colours[5],
            markersize=0.5,
            option='P')

    # Decorations
    c.pad()._yaxis().SetNdivisions(505)
    c.xlabel("Bayesian optimisation step")
    c.ylabel("Cross-validation optimisation metric, L_{clf}^{val}")
    c.xlim(0, len(bins))
    # Same limits as used for placing the out-of-bounds markers above
    c.ylim(ymin, ymax)
    c.legend(width=0.22, ymax=0.816)
    c.text(["#sqrt{s} = 13 TeV", "Neural network (NN) classifier"],
           qualifier=QUALIFIER)

    # Save
    mkdir('figures/optimisation/')
    c.save('figures/optimisation/optimisation_{}.pdf'.format(experiment))

    return
Exemplo n.º 5
0
def plot(profile, fit):
    """
    Method for delegating plotting.

    Draws `profile` as a 'COLZ' map with the two `BOUNDS` curves on top and
    saves it as both PDF and EPS under `figures/knn/`.

    Arguments:
        profile: ROOT histogram to draw; its axes are styled in place.
        fit: Whether `profile` is the k-NN fit (True) or the measured
            profile (False); selects the z-axis title and the file name.
    """

    # rootplotting
    canvas = rp.canvas(batch=True)
    tpad = canvas.pads()[0]._bare()
    tpad.cd()
    tpad.SetRightMargin(0.20)
    tpad.SetLeftMargin(0.15)
    tpad.SetTopMargin(0.10)

    # Styling
    xaxis = profile.GetXaxis()
    yaxis = profile.GetYaxis()
    zaxis = profile.GetZaxis()

    kind = "#it{k}-NN fitted" if fit else "Measured"
    xaxis.SetTitle(latex(VARX, ROOT=True) + " [GeV]")
    yaxis.SetTitle(latex(VARY, ROOT=True) + " [GeV]")
    zaxis.SetTitle("%s %s^{(%s%%)}" % (kind, latex(VAR, ROOT=True), EFF))

    yaxis.SetNdivisions(505)
    zaxis.SetNdivisions(505)
    xaxis.SetTitleOffset(1.4)
    yaxis.SetTitleOffset(1.8)
    zaxis.SetTitleOffset(1.3)
    if ZRANGE:
        zaxis.SetRangeUser(*ZRANGE)
    profile.SetContour(NB_CONTOUR)

    # Draw
    profile.Draw('COLZ')
    for bound in (BOUNDS[0], BOUNDS[1]):
        bound.DrawCopy("SAME")

    # Decorations
    canvas.text(["#sqrt{s} = 13 TeV", "Multijets"],
                ATLAS=False,
                textcolor=ROOT.kWhite)

    # Save, once per output format
    mkdir('figures/knn/')
    stem = 'figures/knn/knn_{}_{:s}_{}_{}'.format(
        'fit' if fit else 'profile', VAR, EFF, MODEL)
    canvas.save(stem + '.pdf')
    canvas.save(stem + '.eps')
Exemplo n.º 6
0
    def wrapper(*args, **kwargs):
        """
        Run the wrapped study, then optionally save and/or show its canvas.

        The wrapped callable `f` must return a `(canvas, args, path)` tuple.
        """
        # Run study
        canvas, namespace, path = f(*args, **kwargs)

        # Save
        if namespace.save:
            # Everything up to the last '/' is the output directory
            mkdir(path.rpartition('/')[0])
            canvas.save(path)

        # Show
        if namespace.show:
            canvas.show()

        return
Exemplo n.º 7
0
Arquivo: test.py Projeto: nethemis/ANN
def plot1D (*argv):
    """
    Method for delegating 1D plotting.

    Draws the original and transformed profiles together with the linear
    DDT fit and the fit-range markers, and saves the canvas to
    `figures/ddt/ddt.pdf`.

    Arguments (positional, in this order):
        graphs: Mapping with the keys 'Tau21' and 'Tau21DDT', each holding a
            graph to be drawn.
        ddt: Fitted object providing `intercept_`, `offset_` and `coef_`.
        arr_x: Sequence of x-values; its minimum and maximum set the range
            over which the fitted line is drawn.
    """

    # Unpack arguments
    graphs, ddt, arr_x = argv

    # Style
    ROOT.gStyle.SetTitleOffset(1.4, 'x')

    # Canvas
    c = rp.canvas(batch=True)

    # Setup
    pad = c.pads()[0]._bare()
    pad.cd()
    # NOTE(review): the top margin used to be set twice in a row with the
    # same value; the second call was redundant and has been dropped. If a
    # different margin was intended there, it needs to be added explicitly.
    pad.SetTopMargin(0.10)

    # Profiles
    c.graph(graphs['Tau21'],    label="Original, #tau_{21}",          linecolor=rp.colours[4], markercolor=rp.colours[4], markerstyle=24, legend_option='PE')
    c.graph(graphs['Tau21DDT'], label="Transformed, #tau_{21}^{DDT}", linecolor=rp.colours[1], markercolor=rp.colours[1], markerstyle=20, legend_option='PE')

    # Fit
    x1, x2 = min(arr_x), max(arr_x)
    intercept, coef = ddt.intercept_ + ddt.offset_, ddt.coef_
    y1 = intercept + x1 * coef
    y2 = intercept + x2 * coef
    c.plot([y1, y2], bins=[x1, x2], color=rp.colours[-1], label='Linear fit', linewidth=1, linestyle=1, option='L')

    # Decorations
    c.xlabel("Large-#it{R} jet #rho^{DDT} = log(m^{2}/ p_{T} / 1 GeV)")
    c.ylabel("#LT#tau_{21}#GT, #LT#tau_{21}^{DDT}#GT")

    c.text(["#sqrt{s} = 13 TeV,  Multijets"], qualifier=QUALIFIER)
    c.legend(width=0.25, xmin=0.57, ymax=None if "Internal" in QUALIFIER else 0.85)

    c.ylim(0, 1.4)
    c.latex("Fit range", sum(FIT_RANGE) / 2., 0.08, textsize=13, textcolor=ROOT.kGray + 2)
    c.xline(FIT_RANGE[0], ymax=0.82, text_align='BR', linecolor=ROOT.kGray + 2)
    c.xline(FIT_RANGE[1], ymax=0.82, text_align='BL', linecolor=ROOT.kGray + 2)

    # Save
    mkdir('figures/ddt/')
    c.save('figures/ddt/ddt.pdf')
    return
Exemplo n.º 8
0
def save_hdf5 (data, path, name='dataset', gzip=True):
    """
    Write a numpy recarray to a single dataset of an HDF5 file.

    The directory part of `path` (if any) is created first; the file itself
    is opened in 'w' mode.

    Arguments:
        data: Numpy recarray to be saved to file.
        path: Path to HDF5 save file.
        name: Name of dataset in which to store the data.
        gzip: Whether to apply gzip compression to the dataset.
    """

    # Ensure directory exists; a bare file name has no directory to create
    parent = path.rpartition('/')[0]
    if parent:
        mkdir(parent)

    # Save array to HDF5 file
    compression = "gzip" if gzip else None
    with h5py.File(path, 'w') as handle:
        handle.create_dataset(name, data=data, compression=compression)

    return
Exemplo n.º 9
0
def save_patch(patch, filename):
    """
    Dump a patch to a JSON file, echoing it to stdout first.

    The directory part of `filename` (if any) is created before writing.
    The JSON output is indented by four spaces with keys sorted.

    Arguments:
        patch: JSON-serialisable object to be written.
        filename: Path of the JSON file to (over)write.
    """

    # @TEMP: Debug
    print("- " * 40)
    print("Saving the following patch to '{}':".format(filename))
    print(patch)
    print("- " * 40)

    # Make sure target directory exists. A bare file name has no directory
    # part, in which case there is nothing to create (same guard as used in
    # `save_hdf5`).
    directory = filename.rpartition('/')[0]
    if directory:
        mkdir(directory)

    # Dump patch to JSON file
    with open(filename, 'w') as f:
        json.dump(patch, f, indent=4, sort_keys=True)

    return
Exemplo n.º 10
0
def main(args):
    """
    Produce the k-NN profile figures and the local selection-efficiency maps.

    In order, this function: loads `data.h5` from `args.input`; fills the
    measured profile from the background entries; adds the 'knn' feature and
    loads the k-NN classifier; evaluates the classifier on a re-binned grid
    and stores the result in a `ROOT.TH2F`; plots both the measured and the
    fitted profile; and finally, separately for signal and background,
    computes the 'weight_test'-weighted fraction of entries with 'knn' < 0 in
    bins of (`VARX`, `VARY`), draws it and saves it to
    `figures/knn/knn_eff_<sig|bkg>_<VAR>_<EFF>.pdf`.

    Arguments:
        args: Passed to `initialise`; the object it returns must provide an
            `input` attribute holding the input path prefix.
    """

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data(args.input + 'data.h5', train=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # -------------------------------------------------------------------------
    ####
    #### # Initialise Keras backend
    #### initialise_backend(args)
    ####
    #### # Neural network-specific initialisation of the configuration dict
    #### initialise_config(args, cfg)
    ####
    #### # Keras import(s)
    #### from keras.models import load_model
    ####
    #### # NN
    #### from run.adversarial.common import add_nn
    #### with Profile("NN"):
    ####     classifier = load_model('models/adversarial/classifier/full/classifier.h5')
    ####     add_nn(data, classifier, 'NN')
    ####     pass
    # -------------------------------------------------------------------------

    # Fill measured profile
    profile_meas, _ = fill_profile(data[msk_bkg])

    # Add k-NN variable
    # NOTE(review): this path formats EFF with '{}' whereas the `loadclf` call
    # below uses '{:.0f}'; the two only name the same file if EFF prints
    # without a fractional part (e.g. an int) -- confirm.
    knnfeat = 'knn'
    add_knn(data,
            newfeat=knnfeat,
            path='models/knn/knn_{}_{}.pkl.gz'.format(VAR, EFF))

    # Loading KNN classifier
    knn = loadclf('models/knn/knn_{:s}_{:.0f}.pkl.gz'.format(VAR, EFF))

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        # Each original bin is split into `rebin` equally wide sub-bins
        rebin = 8
        edges, centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges  @TODO: Make standardised right away?
            edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            centres[ax] = edges[ax][:-1] + 0.5 * np.diff(edges[ax])
            pass

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(centres['x'], centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        # One row per grid point, columns (x, y). The prediction is reshaped
        # to the meshgrid layout (y along the first axis) and transposed so
        # that x comes first.
        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(edges['x']) - 1, edges['x'].flatten('C'),
                                len(edges['y']) - 1, edges['y'].flatten('C'))
        root_numpy.array2hist(fit, profile_fit)
        pass

    # Plotting
    with Profile("Plotting"):
        # NOTE: the loop variable re-uses the name `fit`, shadowing the
        # prediction array above (which is no longer needed at this point).
        for fit in [False, True]:

            # Select correct profile
            profile = profile_fit if fit else profile_meas

            # Plot
            plot(profile, fit)
            pass
        pass

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):

        # Colour table: signal uses the plain 9-colour palette on evenly
        # spaced stops; background prepends one extra colour at stop 0 and
        # squeezes the remaining nine into [EFF/100, 1].
        if sig:
            rgbs = [(247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.linspace(0, 1, nb_cols, endpoint=True)
        else:
            rgbs = [(255 / 255., 51 / 255., 4 / 255.),
                    (247 / 255., 251 / 255., 255 / 255.),
                    (222 / 255., 235 / 255., 247 / 255.),
                    (198 / 255., 219 / 255., 239 / 255.),
                    (158 / 255., 202 / 255., 225 / 255.),
                    (107 / 255., 174 / 255., 214 / 255.),
                    (66 / 255., 146 / 255., 198 / 255.),
                    (33 / 255., 113 / 255., 181 / 255.),
                    (8 / 255., 81 / 255., 156 / 255.),
                    (8 / 255., 48 / 255., 107 / 255.)]

            red, green, blue = map(np.array, zip(*rgbs))
            nb_cols = len(rgbs)
            stops = np.array([0] + list(
                np.linspace(0, 1, nb_cols - 1, endpoint=True) *
                (1. - EFF / 100.) + EFF / 100.))
            pass

        ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue,
                                             NB_CONTOUR)

        # Define arrays
        shape = (AXIS[VARX][0], AXIS[VARY][0])
        bins = [
            np.linspace(AXIS[var][1],
                        AXIS[var][2],
                        AXIS[var][0] + 1,
                        endpoint=True) for var in VARS
        ]
        # NOTE(review): `x`, `y` and `z` are not used anywhere below.
        x, y, z = (np.zeros(shape) for _ in range(3))

        # Create `profile` histogram
        profile = ROOT.TH2F('profile', "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        # NOTE(review): both bin edges are exclusive here ('>' and '<'),
        # whereas the per-cell masks further down use '>' and '<='.
        # NOTE(review): `num / den` is not guarded against `den == 0`.
        effs = list()
        for edges in zip(bins[1][:-1], bins[1][1:]):
            msk_bin = (data[VARY] > edges[0]) & (data[VARY] < edges[1])
            msk_pass = data[knnfeat] < 0
            num = data.loc[msk & msk_bin & msk_pass,
                           'weight_test'].values.sum()
            den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
            effs.append(num / den)
            pass

        # Fill profile
        for i, j in itertools.product(*map(range, shape)):

            # Bin edges in x and y
            edges = [bin[idx:idx + 2] for idx, bin in zip([i, j], bins)]

            # Masks
            # NOTE(review): bare `reduce` is a builtin on Python 2 only.
            msks = [(data[var] > edges[dim][0]) & (data[var] <= edges[dim][1])
                    for dim, var in enumerate(VARS)]
            msk_bin = reduce(lambda x, y: x & y, msks)
            data_ = data[msk & msk_bin]

            # Set non-zero bin content
            # NOTE(review): `msk_pass` is computed on the subset `data_` but
            # combined with masks over the full `data`; this presumably
            # relies on pandas aligning the operands by index -- verify.
            if np.sum(msk & msk_bin):
                msk_pass = data_[knnfeat] < 0
                num = data.loc[msk & msk_bin & msk_pass,
                               'weight_test'].values.sum()
                den = data.loc[msk & msk_bin, 'weight_test'].values.sum()
                eff = num / den
                profile.SetBinContent(i + 1, j + 1, eff)
                pass
            pass

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARX, ROOT=True) +
                                    " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" %
                                    (latex(VAR, ROOT=True), EFF))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        zrange = (0., 1.)
        if zrange:
            profile.GetZaxis().SetRangeUser(*zrange)
            pass
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"],
               ATLAS=False)

        # -- Efficiencies
        # One label per `VARY` bin, placed at the low edge of the last x-bin;
        # only the label of the first bin carries the symbol prefix.
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            tlatex.DrawLatex(
                xt, yt, "%s%.1f%%" %
                ("#bar{#varepsilon}^{rel}_{%s} = " %
                 ('sig' if sig else 'bkg') if ibin == 1 else '', eff * 100.))
            pass

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",
                -4.5,
                BOUNDS[0].Eval(-4.5) + 30,
                align=21,
                angle=-37,
                textsize=13,
                textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV",
                -2.5,
                BOUNDS[1].Eval(-2.5) - 30,
                align=23,
                angle=-57,
                textsize=13,
                textcolor=ROOT.kGray + 3)

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            'sig' if sig else 'bkg', VAR, EFF))
        pass

    return
Exemplo n.º 11
0
Arquivo: loss.py Projeto: nethemis/ANN
def plot_classifier_training_loss(
        num_folds, basedir='models/adversarial/classifier/crossval/'):
    """
    Plot the classifier training loss.

    Reads every `history__crossval_classifier__*of<num_folds>.json` file in
    `basedir`, averages the 'loss' and 'val_loss' histories across files and
    draws mean +/- standard deviation per epoch. The figure is saved to
    `figures/loss_classifier.pdf`. If no history file is found, a message is
    printed and nothing is plotted.

    Arguments:
        num_folds: Number of cross-validation folds, as it appears in the
            history file names.
        basedir: Directory holding the history files; a trailing '/' is
            appended if missing.
    """

    # Check(s)
    if not basedir.endswith('/'):
        basedir += '/'

    # Get paths to classifier training losses. `basedir` already ends in '/'
    # at this point, so the pattern must not start with another separator.
    paths = sorted(
        glob.glob(
            basedir +
            'history__crossval_classifier__*of{}.json'.format(num_folds)))

    if len(paths) == 0:
        print("No models found for classifier CV study.")
        return

    # Read losses from files
    losses = {'train': list(), 'val': list()}
    for path in paths:
        with open(path, 'r') as f:
            d = json.load(f)

        # Validation losses within 0.02 of 0.72 are treated as outliers and
        # masked out of the average.
        loss = np.array(d['val_loss'])
        outliers = np.abs(loss - 0.72) < 0.02
        print("Outliers: {}".format(loss[outliers]))
        loss[outliers] = np.nan  # @FIXME: This probably isn't completely kosher
        losses['val'].append(loss)

        loss = np.array(d['loss'])
        losses['train'].append(loss)

    # Define variable(s). The number of epochs is taken from the last
    # training history read above.
    # NOTE(review): the histogram edges are offset by +0.5 here, whereas
    # `plot_adversarial_training_loss` uses -0.5 -- confirm which is intended.
    num_epochs = len(loss)
    bins = np.arange(num_epochs)
    histbins = np.arange(num_epochs + 1) + 0.5

    # Canvas
    c = rp.canvas(batch=True)

    # Plots
    categories = list()

    for name, key, colour, linestyle in zip(['Validation', 'Training'],
                                            ['val', 'train'],
                                            [rp.colours[4], rp.colours[1]],
                                            [1, 2]):

        # Histograms: mean as bin content, standard deviation as bin error
        loss_mean = np.nanmean(losses[key], axis=0)
        loss_std = np.nanstd(losses[key], axis=0)
        hist = ROOT.TH1F(key + '_loss', "", len(histbins) - 1, histbins)
        for idx in range(len(loss_mean)):
            hist.SetBinContent(idx + 1, loss_mean[idx])
            hist.SetBinError(idx + 1, loss_std[idx])

        c.hist([0], bins=[0, max(bins)], linewidth=0,
               linestyle=0)  # Force correct x-axis
        c.hist(hist, fillcolor=colour, alpha=0.3, option='LE3')
        c.hist(hist,
               linecolor=colour,
               linewidth=3,
               linestyle=linestyle,
               option='HISTL')

        categories += [(name, {
            'linestyle': linestyle,
            'linewidth': 3,
            'linecolor': colour,
            'fillcolor': colour,
            'alpha': 0.3,
            'option': 'FL'
        })]

    # Decorations
    c.pads()[0]._yaxis().SetNdivisions(505)
    c.xlabel("Training epoch")
    c.ylabel("Cross-validation classifier loss, L_{clf}")
    c.xlim(0, max(bins))
    c.ylim(0.3, 0.5)
    c.legend(categories=categories, width=0.25)  # ..., xmin=0.475
    c.text(TEXT + ["#it{W} jet tagging", "Neural network (NN) classifier"],
           qualifier=QUALIFIER)

    # Save
    mkdir('figures/')
    c.save('figures/loss_classifier.pdf')
    return
Exemplo n.º 12
0
Arquivo: loss.py Projeto: nethemis/ANN
def plot_adversarial_training_loss(
        lambda_reg,
        num_folds,
        pretrain_epochs,
        H_prior=None,
        basedir='models/adversarial/combined/crossval/'):
    """
    Plot the classifier, adversary, and combined losses for the adversarial
    training of the jet classifier.

    The three losses are drawn in three vertically stacked pads, averaged
    over all matching history files. The figure is saved to
    `figures/loss_adversarial_lambda<lambda>_<full|cv>.pdf`. If no history
    file is found, nothing is plotted.

    Arguments:
        lambda_reg: Regularisation parameter; used both to locate the history
            files and to compute the combined loss.
        num_folds: Number of cross-validation folds. If falsy, the single
            (non-cross-validation) history file is read instead.
        pretrain_epochs: Number of adversary pre-training epochs; marked by a
            shaded box and a vertical line.
        H_prior: Reference value drawn as a horizontal line in the adversary
            pad and used for the "ideal" line in the combined pad. If None,
            these reference lines are omitted.
        basedir: Directory holding the history files; a trailing '/' is
            appended if missing.
    """

    # Check(s)
    if not basedir.endswith('/'):
        basedir += '/'

    # Define variable(s): `lambda_reg` formatted with just enough decimals,
    # and '.' replaced by 'p' for use in file names.
    digits = int(np.ceil(max(-np.log10(lambda_reg), 0)))
    lambda_str = '{l:.{d:d}f}'.format(d=digits, l=lambda_reg).replace('.', 'p')

    # Get paths to all cross-validation adversarially trained classifiers
    if num_folds:
        paths = sorted(
            glob.glob(basedir +
                      'history__combined_lambda{}__*of{}.json'.format(
                          lambda_str, num_folds)))
    else:
        paths = glob.glob(basedir +
                          'history__combined_lambda{}.json'.format(lambda_str))

    print("Found {} paths.".format(len(paths)))
    if len(paths) == 0:
        return

    # Store losses
    keys = [
        'train_comb', 'train_clf', 'train_adv', 'val_comb', 'val_clf',
        'val_adv'
    ]
    losses = {key: list() for key in keys}
    for path in paths:
        with open(path, 'r') as f:
            d = json.load(f)

        # Loop loss classes
        for name, prefix in zip(['train', 'val'], ['', 'val_']):
            try:
                # Classifier; values above 7.0 are masked out of the average
                loss = np.array(d[prefix + 'classifier_loss'])
                loss[loss > 7.0] = np.nan
                losses[name + '_clf'].append(loss)

                # Adversary
                loss = np.array(d[prefix + 'adversary_loss'])
                losses[name + '_adv'].append(loss)

                # Combined
                losses[name +
                       '_comb'].append(losses[name + '_clf'][-1] -
                                       lambda_reg * losses[name + '_adv'][-1])
            except KeyError:
                pass  # No validation

    # Plot results. The number of epochs is taken from the last loss array
    # read above.
    c = rp.canvas(batch=True, num_pads=3, ratio=False, size=(600, 800))
    bins = np.arange(len(loss))
    histbins = np.arange(len(loss) + 1) - 0.5

    # Axes
    for idx in range(3):
        c.pads()[idx].hist([0],
                           bins=[0, len(bins) - 1],
                           linewidth=0,
                           linestyle=0)  # Force correct x-axis

    # Plots
    categories = list()
    for ityp, typ in enumerate(['val', 'train']):
        for igrp, grp in enumerate(['clf', 'adv', 'comb']):
            key = '{}_{}'.format(typ, grp)
            colour = rp.colours[1 if typ == 'train' else 4]

            # Create histogram: mean as content, standard deviation as error
            try:
                loss_mean = np.nanmean(losses[key], axis=0)
                loss_std = np.nanstd(losses[key], axis=0)
                hist = ROOT.TH1F(key, "", len(histbins) - 1, histbins)
                for ibin in range(len(loss_mean)):
                    hist.SetBinContent(ibin + 1, loss_mean[ibin])
                    hist.SetBinError(ibin + 1, loss_std[ibin])

                c.pads()[igrp].hist(hist,
                                    fillcolor=colour,
                                    linestyle=ityp + 1,
                                    linewidth=0,
                                    alpha=0.3,
                                    option='LE3')
                c.pads()[igrp].hist(hist,
                                    fillcolor=0,
                                    fillstyle=0,
                                    linecolor=colour,
                                    linestyle=ityp + 1,
                                    linewidth=3,
                                    option='HISTL')
            except TypeError:
                pass  # No validation

            if igrp == 0:
                categories += [('Training' if typ == 'train' else 'Validation',
                                {
                                    'linestyle': ityp + 1,
                                    'linewidth': 3,
                                    'fillcolor': colour,
                                    'alpha': 0.3,
                                    'linecolor': colour,
                                    'option': 'FL'
                                })]

    # Formatting pads
    margin = 0.2
    ymins, ymaxs = list(), list()
    clf_opt_val = None
    for ipad, pad in enumerate(c.pads()):
        tpad = pad._bare()  # ROOT.TPad
        f = ipad / float(len(c.pads()) - 1)
        tpad.SetLeftMargin(0.20)
        tpad.SetBottomMargin(f * margin)
        tpad.SetTopMargin((1 - f) * margin)
        pad._xaxis().SetNdivisions(505)
        pad._yaxis().SetNdivisions(505)
        if ipad < len(c.pads()) - 1:  # Not bottom pad
            pad._xaxis().SetLabelOffset(9999.)
            pad._xaxis().SetTitleOffset(9999.)
        else:
            pad._xaxis().SetTitleOffset(3.5)

        ymin, ymax = list(), list()
        for hist in pad._primitives:
            if not isinstance(hist, ROOT.TGraph):
                ymin.append(get_min(hist))
                ymax.append(get_max(hist))

        # Get reference-line value. The stand-alone classifier value is read
        # from the first bin of the second primitive in the top pad.
        clf_opt_val = clf_opt_val or c.pads()[0]._primitives[1].GetBinContent(
            1)

        # The lower two pads only have a reference line when `H_prior` is
        # given; without it there is nothing to include in the y-range (and
        # `lambda_reg * None` would raise a TypeError).
        if ipad == 0:
            refs = [clf_opt_val]
        elif H_prior is None:
            refs = []
        elif ipad == 1:
            refs = [H_prior]
        else:
            refs = [clf_opt_val - lambda_reg * H_prior]

        ymin = min(ymin + refs)
        ymax = max(ymax + refs)

        # Pad the range so that lines and labels do not touch the frame
        ydiff = ymax - ymin
        ymin -= ydiff * 0.2
        ymax += ydiff * (0.7 if ipad == 0 else (0.7 if ipad == 1 else 0.2))

        if ipad == 0:
            #    ymin = 0.25
            ymax *= 1.2

        pad.ylim(ymin, ymax)

        ymins.append(ymin)
        ymaxs.append(ymax)

    c._bare().Update()

    # Pre-training boxes
    boxes = list()
    for ipad, pad in enumerate(c.pads()):
        pad._bare().cd()
        boxes.append(ROOT.TBox(0, ymins[ipad], pretrain_epochs, ymaxs[ipad]))
        boxes[-1].SetFillColorAlpha(ROOT.kBlack, 0.05)
        boxes[-1].Draw("SAME")

    # Vertical lines
    for ipad in range(len(c.pads())):
        align = 'TR' if ipad < 2 else 'BR'
        c.pads()[ipad].xline(
            pretrain_epochs,
            ymin=ymins[ipad],
            ymax=ymaxs[ipad],
            text='  Adv. pre-training  ' if ipad == 0 else None,
            text_align=align,
            linestyle=1,
            linecolor=ROOT.kGray + 2)

    # Horizontal lines
    c.pads()[0].yline(clf_opt_val)
    if H_prior is not None:
        c.pads()[1].yline(H_prior)
        c.pads()[2].yline(clf_opt_val - lambda_reg * (H_prior))

    opts = dict(align=31, textcolor=ROOT.kGray + 2, textsize=14)
    c.pads()[0].latex("Stand-alone NN  ", bins[-1] * 0.98,
                      clf_opt_val + (ymaxs[0] - ymins[0]) * 0.03, **opts)

    if H_prior is not None:
        c.pads()[1].latex("#it{H}(prior)  ", bins[-1] * 0.98,
                          H_prior + (ymaxs[1] - ymins[1]) * 0.03, **opts)
        opts['align'] = 33
        c.pads()[2].latex(
            "Ideal  ", bins[-1] * 0.98, clf_opt_val - lambda_reg * (H_prior) -
            (ymaxs[2] - ymins[2]) * 0.03, **opts)

    # Decorations
    ROOT.gStyle.SetTitleOffset(2.0, 'y')  # 2.2
    c.xlabel("Training epoch")
    c.pads()[0].ylabel("#it{L}_{clf.}")
    c.pads()[1].ylabel("#it{L}_{adv.}")
    c.pads()[2].ylabel("#it{L}_{clf.} #minus #lambda #it{L}_{adv.}")
    for pad in c.pads():
        pad.xlim(0, max(bins) - 1)

    c.pads()[0].text([], xmin=0.2, ymax=0.85, qualifier=QUALIFIER)

    c.pads()[1].text([
        "#sqrt{s} = 13 TeV", "#it{W} jet tagging",
        "Adversarial training (#lambda = %s)" % (lambda_str.replace('p', '.'))
    ],
                     ATLAS=False,
                     ymax=0.70,
                     xmin=0.27)
    c.pads()[0].legend(xmin=0.60, ymax=0.70, categories=categories)

    # Save
    # NOTE(review): the file name distinguishes on `num_folds is None`,
    # whereas the path lookup above treats any falsy value (e.g. 0) as "no
    # folds" -- confirm which is intended.
    mkdir('figures/')
    c.save('figures/loss_adversarial_lambda{}_{}.pdf'.format(
        lambda_str, 'full' if num_folds is None else 'cv'))
    return
Exemplo n.º 13
0
def test(data, variable, bg_eff, signal_above=False):
    """
    Plot measured/fitted k-NN profiles and local selection efficiencies.

    Steps, in order:
      1. fill the measured profile from the background part of `data`;
      2. add the k-NN variable (column 'knn') to `data` via `add_knn`;
      3. load the stored k-NN model and evaluate it on a grid that is
         `rebin` times finer than the (`VARX`, `VARY`) binning in `AXIS`;
      4. delegate drawing of the measured and fitted profiles to `plot`;
      5. for signal and background separately, draw the weighted
         efficiency of the cut on the k-NN variable in (`VARX`, `VARY`)
         bins and save it as PDF.

    Arguments:
        data: Pandas data frame. Columns read here are 'signal',
            'weight_test', `VARX`, `VARY` and (after `add_knn`) 'knn'.
        variable: Feature name forwarded to the profile/k-NN helpers and
            used in model and figure file names.
        bg_eff: Background efficiency in percent; used in file names, in
            the z-axis title and in the background colour stops.
        signal_above: If True the passing selection is `knn > 0`,
            otherwise `knn < 0`.

    Note:
        `args.output` is read from a name `args` that is not defined in
        this function, i.e. it has to exist at module level.
    """

    # `reduce` is a builtin on Python 2 only; this import works on both.
    from functools import reduce

    # Shout out to Cynthia Brewer and Mark Harrower
    # [http://colorbrewer2.org]. Palette is colorblind-safe.
    blues = [(247 / 255., 251 / 255., 255 / 255.),
             (222 / 255., 235 / 255., 247 / 255.),
             (198 / 255., 219 / 255., 239 / 255.),
             (158 / 255., 202 / 255., 225 / 255.),
             (107 / 255., 174 / 255., 214 / 255.),
             (66 / 255., 146 / 255., 198 / 255.),
             (33 / 255., 113 / 255., 181 / 255.),
             (8 / 255., 81 / 255., 156 / 255.),
             (8 / 255., 48 / 255., 107 / 255.)]

    red, green, blue = map(np.array, zip(*blues))
    nb_cols = len(blues)
    stops = np.linspace(0, 1, nb_cols, endpoint=True)
    ROOT.TColor.CreateGradientColorTable(nb_cols, stops, red, green, blue,
                                         NB_CONTOUR)

    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # The same model file is written to by `add_knn` and read by `loadclf`
    model_path = args.output + \
        '/models/knn_{:s}_{:.0f}.pkl.gz'.format(variable, bg_eff)

    # Fill measured profile
    with Profile("filling profile"):
        profile_meas, _ = fill_profile(data[msk_bkg],
                                       variable,
                                       bg_eff,
                                       signal_above=signal_above)

    # Add k-NN variable
    with Profile("adding variable"):
        knnfeat = 'knn'
        add_knn(data, feat=variable, newfeat=knnfeat, path=model_path)

    # Loading KNN classifier
    with Profile("loading model"):
        knn = loadclf(model_path)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8
        fine_edges, fine_centres = dict(), dict()
        for ax, var in zip(['x', 'y'], [VARX, VARY]):

            # Short-hands
            vbins, vmin, vmax = AXIS[var]

            # Re-binned bin edges: `rebin` sub-bins per original bin
            fine_edges[ax] = np.interp(
                np.linspace(0, vbins, vbins * rebin + 1, endpoint=True),
                range(vbins + 1),
                np.linspace(vmin, vmax, vbins + 1, endpoint=True))

            # Re-binned bin centres
            fine_centres[ax] = fine_edges[ax][:-1] + \
                0.5 * np.diff(fine_edges[ax])

        # Get predictions evaluated at re-binned bin centres
        g = dict()
        g['x'], g['y'] = np.meshgrid(fine_centres['x'], fine_centres['y'])
        g['x'], g['y'] = standardise(g['x'], g['y'])

        X = np.vstack((g['x'].flatten(), g['y'].flatten())).T
        fit_values = knn.predict(X).reshape(g['x'].shape).T

        # Fill ROOT "profile"
        profile_fit = ROOT.TH2F('profile_fit', "",
                                len(fine_edges['x']) - 1,
                                fine_edges['x'].flatten('C'),
                                len(fine_edges['y']) - 1,
                                fine_edges['y'].flatten('C'))
        root_numpy.array2hist(fit_values, profile_fit)

    # Plotting: measured profile first, then the fitted one
    for is_fit in [False, True]:
        plot(profile_fit if is_fit else profile_meas, is_fit, variable,
             bg_eff)

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- The passing selection does not depend on the bin or the sample, so
    #    it is evaluated once instead of once per bin.
    if signal_above:
        msk_pass = data[knnfeat] > 0  # ensure correct cut direction
    else:
        msk_pass = data[knnfeat] < 0

    # -- Binning is identical for signal and background
    shape = (AXIS[VARX][0], AXIS[VARY][0])
    bins = [
        np.linspace(AXIS[var][1],
                    AXIS[var][2],
                    AXIS[var][0] + 1,
                    endpoint=True) for var in VARS
    ]

    # -- Compute signal efficiency
    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):
        tag = 'sig' if sig else 'bkg'
        print("working on signal" if sig else "working on bg")

        if sig:
            rgbs = blues
            stops = np.linspace(0, 1, len(rgbs), endpoint=True)
        else:
            # One extra leading colour at stop 0; the blue gradient is
            # rescaled onto [bg_eff / 100, 1].
            rgbs = [(255 / 255., 51 / 255., 4 / 255.)] + blues
            stops = np.array([0] + list(
                np.linspace(0, 1, len(rgbs) - 1, endpoint=True) *
                (1. - bg_eff / 100.) + bg_eff / 100.))

        # Install the colour table for *both* samples. Previously this call
        # sat inside the `else` branch, so the signal settings computed
        # above were never applied.
        red, green, blue = map(np.array, zip(*rgbs))
        ROOT.TColor.CreateGradientColorTable(len(rgbs), stops, red, green,
                                             blue, NB_CONTOUR)

        # Create `profile` histogram; a per-sample name avoids re-using the
        # name of the histogram created in the previous iteration.
        profile = ROOT.TH2F('profile_{}'.format(tag), "",
                            len(bins[0]) - 1, bins[0].flatten('C'),
                            len(bins[1]) - 1, bins[1].flatten('C'))

        # Compute inclusive efficiency in bins of `VARY`
        # NOTE(review): both edges are exclusive here, whereas the 2D fill
        # below includes the upper edge -- kept as in the original.
        effs = list()
        for low, high in zip(bins[1][:-1], bins[1][1:]):
            msk_den = msk & (data[VARY] > low) & (data[VARY] < high)
            num = data.loc[msk_den & msk_pass, 'weight_test'].values.sum()
            den = data.loc[msk_den, 'weight_test'].values.sum()
            effs.append(num / den)

        # Fill profile
        with Profile("Fill profile"):
            for i, j in itertools.product(*map(range, shape)):
                # Bin edges in x and y
                bin_edges = [
                    axis_bins[idx:idx + 2]
                    for idx, axis_bins in zip([i, j], bins)
                ]

                # Masks
                msks = [(data[var] > bin_edges[dim][0]) &
                        (data[var] <= bin_edges[dim][1])
                        for dim, var in enumerate(VARS)]
                msk_den = msk & reduce(lambda a, b: a & b, msks)

                # Set non-zero bin content
                if np.sum(msk_den):
                    num = data.loc[msk_den & msk_pass,
                                   'weight_test'].values.sum()
                    den = data.loc[msk_den, 'weight_test'].values.sum()
                    profile.SetBinContent(i + 1, j + 1, num / den)

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.20)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile.GetXaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARX, ROOT=True) +
                                    " = log(m^{2}/p_{T}^{2})")
        profile.GetYaxis().SetTitle("Large-#it{R} jet " +
                                    latex(VARY, ROOT=True) + " [GeV]")
        profile.GetZaxis().SetTitle("Selection efficiency for %s^{(%s%%)}" %
                                    (latex(variable, ROOT=True), bg_eff))

        profile.GetYaxis().SetNdivisions(505)
        profile.GetZaxis().SetNdivisions(505)
        profile.GetXaxis().SetTitleOffset(1.4)
        profile.GetYaxis().SetTitleOffset(1.8)
        profile.GetZaxis().SetTitleOffset(1.3)
        profile.GetZaxis().SetRangeUser(0., 1.)
        profile.SetContour(NB_CONTOUR)

        # Draw
        profile.Draw('COLZ')

        # Decorations
        c.text(qualifier=QUALIFIER, ymax=0.92, xmin=0.15, ATLAS=False)
        c.text(["#sqrt{s} = 13 TeV", "#it{W} jets" if sig else "Multijets"],
               ATLAS=False)

        # -- Efficiencies: one label per `VARY` bin, right-aligned at the
        #    low edge of the last x-bin; only the first label is prefixed.
        xaxis = profile.GetXaxis()
        yaxis = profile.GetYaxis()
        tlatex = ROOT.TLatex()
        tlatex.SetTextColor(ROOT.kGray + 2)
        tlatex.SetTextSize(0.023)
        tlatex.SetTextFont(42)
        tlatex.SetTextAlign(32)
        xt = xaxis.GetBinLowEdge(xaxis.GetNbins())
        for eff, ibin in zip(effs, range(1, yaxis.GetNbins() + 1)):
            yt = yaxis.GetBinCenter(ibin)
            prefix = ("#bar{#varepsilon}^{rel}_{%s} = " % tag
                      if ibin == 1 else '')
            tlatex.DrawLatex(xt, yt, "%s%.1f%%" % (prefix, eff * 100.))

        # -- Bounds
        BOUNDS[0].DrawCopy("SAME")
        BOUNDS[1].DrawCopy("SAME")
        c.latex("m > 50 GeV",
                -4.5,
                BOUNDS[0].Eval(-4.5) + 30,
                align=21,
                angle=-37,
                textsize=13,
                textcolor=ROOT.kGray + 3)
        c.latex("m < 300 GeV",
                -2.5,
                BOUNDS[1].Eval(-2.5) - 30,
                align=23,
                angle=-57,
                textsize=13,
                textcolor=ROOT.kGray + 3)

        # Save, both to the fixed local directory and below `args.output`
        mkdir('knn_fitter/figures/')
        c.save('knn_fitter/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            tag, variable, bg_eff))
        mkdir(args.output + '/figures/')
        c.save(args.output + '/figures/knn_eff_{}_{:s}_{:.0f}.pdf'.format(
            tag, variable, bg_eff))

    return
Exemplo n.º 14
0
def plot(*argv):
    """
    Draw the progress of a Bayesian optimisation and save it as PDF.

    Expects exactly six positional arguments, in this order:
    `experiment, means, graph, idx_improvements, best_mean, bins`.
    The figure is written to
    `figures/optimisation/optimisation_<experiment>.pdf`.
    """

    # Unpack arguments
    experiment, means, graph, idx_improvements, best_mean, bins = argv
    is_classifier = experiment == 'classifier'

    # Canvas and y-axis window (depends on the experiment)
    c = rp.canvas(batch=True)
    ymin, ymax = (0.3, 1.0) if is_classifier else (0.0, 1500.0)

    # Steps whose mean lies above the window are flagged at 96% of its height
    oobx = [step for step, mean in enumerate(means) if mean > ymax]
    ooby = np.ones_like(oobx) * 0.96 * (ymax - ymin) + ymin

    # Short-hands for the two colours in use
    col_eval = rp.colours[1]
    col_best = rp.colours[5]
    markersize = 0.8

    # All evaluations
    c.graph(graph,
            markercolor=col_eval,
            linecolor=col_eval,
            markerstyle=20,
            markersize=markersize,
            option='AP',
            label='Evaluations',
            legend_option='PE')

    # Out-of-bounds markers, if any
    if len(ooby):
        c.graph(ooby,
                bins=oobx,
                markercolor=col_eval,
                markerstyle=22,
                option='P')

    # Running best value: line through all steps, markers at improvements
    c.graph(best_mean, bins=bins, linecolor=col_best, linewidth=2,
            option='L')
    c.graph(best_mean[idx_improvements],
            bins=bins[idx_improvements],
            markercolor=col_best,
            markerstyle=24,
            markersize=markersize,
            option='P')

    # Decorations
    if is_classifier:
        metric = "L_{clf}^{val}"
        description = ["Neural network (NN) classifier"]
    else:
        metric = '1/#varepsilon_{bkg}^{rel} + #lambda/JSD'
        description = ["Adversarial neural network (ANN)", "classifier"]

    c.pad()._yaxis().SetNdivisions(505)
    c.xlabel("Bayesian optimisation step")
    c.ylabel("Cross-val. optimisation metric, " + metric)
    c.xlim(0, len(bins))
    c.ylim(ymin, ymax)

    best_style = dict(linecolor=col_best,
                      linewidth=2,
                      markercolor=col_best,
                      markerstyle=24,
                      option='LP')
    c.legend(width=0.22,
             ymax=0.816,
             categories=[('Best result', best_style)])
    c.text(["#sqrt{s} = 13 TeV"] + description, qualifier=QUALIFIER)

    # Save
    mkdir('figures/optimisation/')
    c.save('figures/optimisation/optimisation_{}.pdf'.format(experiment))

    return
Exemplo n.º 15
0
def main (args):
    """
    Fit the 1D efficiency profile and plot the resulting selection efficiency.

    Steps, in order:
      1. fill the measured 1D profile from the background part of the data;
      2. add the k-NN variable (column 'knn') and load the stored model;
      3. evaluate the model (or `func` when 'erf' is in `FIT`) on a grid
         `rebin` times finer than the `AXIS[VARX]` binning;
      4. fit a weighted second-order polynomial to the measured points and
         write its coefficients plus the fitted profile to a ROOT file;
      5. delegate the profile plot to `plot`;
      6. for signal and background separately, draw the weighted
         efficiency (in percent) of `knn > 0` and of `VAR > 70` versus
         `VARX` and save it as PDF and EPS.

    Arguments:
        args: Command-line arguments, handed to `initialise`.
    """

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, _, _ = load_data('data/' + args.input) #, test=True)
    msk_sig = data['signal'] == 1
    msk_bkg = ~msk_sig

    # NOTE: A Keras-based 'NN' variable (load_model + add_nn from
    # run.adversarial.common) used to be added here; that step is disabled.

    # Fill measured profile
    _, (x, percs, err) = fill_profile_1D(data[msk_bkg])
    weights = 1/err

    # Add k-NN variable
    knnfeat = 'knn'
    orgfeat = VAR
    model_path = 'models/knn/{}_{:s}_{}_{}.pkl.gz'.format(FIT, VAR, EFF, MODEL)
    add_knn(data, newfeat=knnfeat, path=model_path)

    # Loading KNN classifier
    knn = loadclf(model_path)

    X = x.reshape(-1,1)

    # Filling fitted profile
    with Profile("Filling fitted profile"):
        rebin = 8

        # Short-hands
        vbins, vmin, vmax = AXIS[VARX]

        # Re-binned bin edges and centres
        fineBins = np.linspace(vmin, vmax, vbins * rebin + 1, endpoint=True)
        fineCentres = fineBins[:-1] + 0.5 * np.diff(fineBins)

        # Get predictions evaluated at re-binned bin centres. For 'erf' fits
        # the loaded object is indexed as three parameters of `func`.
        if 'erf' in FIT:
            fit = func(fineCentres, knn[0], knn[1], knn[2])
            print("Check:  {}".format(
                func([1500, 2000], knn[0], knn[1], knn[2])))
        else:
            fit = knn.predict(fineCentres.reshape(-1,1))

        # Fill ROOT "profile"
        profile_fit = ROOT.TH1F('profile_fit', "", len(fineBins) - 1, fineBins.flatten('C'))
        root_numpy.array2hist(fit, profile_fit)

        # Weighted second-order polynomial fit to the measured points. The
        # constant term is part of the polynomial features, hence
        # `fit_intercept=False`.
        poly = PolynomialFeatures(degree=2)
        X_poly = poly.fit_transform(X)
        reg = LinearRegression(fit_intercept=False)
        reg.fit(X_poly, percs, weights)
        coef = reg.coef_
        intercept = reg.intercept_
        print("COEFFICIENTS:  {} {}".format(coef, intercept))

        # Store coefficients and fitted profile
        TCoef = ROOT.TVector3(coef[0], coef[1], coef[2])
        outFile = ROOT.TFile.Open("models/{}_jet_ungrtrk500_eff{}_stat{}_{}.root".format(FIT, EFF, MIN_STAT, MODEL),"RECREATE")
        outFile.cd()
        TCoef.Write()
        profile_fit.SetName("kNNfit")
        profile_fit.Write()
        outFile.Close()

        # Measured points as a graph
        profile_meas2 = ROOT.TGraph(len(x), x, percs)

    # Plotting
    with Profile("Plotting"):
        plot(profile_meas2, profile_fit)

    # Plotting local selection efficiencies for D2-kNN < 0
    # -- Compute signal efficiency

    # MC weights are scaled with lumi. This is just for better comparison
    #if INPUT =="mc":
    #    data.loc[:,'TotalEventWeight'] /=  139000000.

    # The two selections do not depend on the sample or the bin, so they are
    # evaluated once up front.
    msk_pass     = data[knnfeat] > 0 # <?
    msk_pass_org = data[orgfeat] > 70 # <?

    for sig, msk in zip([True, False], [msk_sig, msk_bkg]):
        tag = 'sig' if sig else 'bkg'

        # Define arrays
        shape   = AXIS[VARX][0]
        bins    = np.linspace(AXIS[VARX][1], AXIS[VARX][2], AXIS[VARX][0]+ 1, endpoint=True)

        print("HERE:  {}".format(bins))

        # Create the two efficiency histograms. They need distinct names:
        # both used to be called 'profile', so the second replaced the first
        # in ROOT's current directory.
        profile_knn = ROOT.TH1F('profile_knn_{}'.format(tag), "", len(bins) - 1, bins)
        profile_org = ROOT.TH1F('profile_org_{}'.format(tag), "", len(bins) - 1, bins)

        # Compute inclusive efficiency in bins of `VARX`; bins without any
        # weight are left empty.
        for i in range(shape):
            msk_den = msk & (data[VARX] > bins[i]) & (data[VARX] <= bins[i+1])
            den = data.loc[msk_den, 'TotalEventWeight'].values.sum()
            if den > 0:
                num     = data.loc[msk_den & msk_pass,     'TotalEventWeight'].values.sum()
                num_org = data.loc[msk_den & msk_pass_org, 'TotalEventWeight'].values.sum()
                profile_knn.SetBinContent(i + 1, num/den *100.)
                profile_org.SetBinContent(i + 1, num_org/den *100.)

        c = rp.canvas(batch=True)
        pad = c.pads()[0]._bare()
        pad.cd()
        pad.SetRightMargin(0.10)
        pad.SetLeftMargin(0.15)
        pad.SetTopMargin(0.10)

        # Styling
        profile_knn.SetLineColor(rp.colours[1])
        profile_org.SetLineColor(rp.colours[2])
        profile_knn.SetMarkerStyle(24)

        # Axis settings are applied to both histograms: only the one drawn
        # first (without "same") provides the visible axes, and previously
        # the titles/offsets were split between the two.
        for hist in (profile_org, profile_knn):
            hist.GetXaxis().SetTitle("#it{m}_{jj} [GeV]")
            hist.GetYaxis().SetTitle("Selection efficiency (%)")
            hist.GetYaxis().SetNdivisions(505)
            hist.GetXaxis().SetTitleOffset(1.4)
            hist.GetYaxis().SetTitleOffset(1.8)
            hist.GetXaxis().SetRangeUser(*XRANGE)
            hist.GetYaxis().SetRangeUser(0., EFF*3)

        # Draw
        profile_org.Draw()
        profile_knn.Draw("same")

        # Legend, drawn after the histograms (same order as in `plot`) so
        # the non-"same" Draw() above cannot clear it from the pad.
        leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)
        leg.AddEntry(profile_knn, "#it{n}_{trk}^{#varepsilon=%s%%} > 0" % ( EFF), "l")
        leg.AddEntry(profile_org, "#it{n}_{trk} > 70", "l")
        leg.Draw()

        # Save
        mkdir('figures/knn/')
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.pdf'.format(FIT, tag, VAR, EFF, MODEL+INPUT, MIN_STAT))
        c.save('figures/knn/{}_eff_{}_{:s}_{}_{}_stat{}.eps'.format(FIT, tag, VAR, EFF, MODEL+INPUT, MIN_STAT))
        del c

    return
Exemplo n.º 16
0
def plot (profile, fit):
    """
    Draw a measured profile together with its fit and save the figure.

    Arguments:
        profile: ROOT object holding the measured points; drawn with
            option "AP" and listed in the legend as markers.
        fit: ROOT object holding the fit; drawn with option "SAME" and
            listed in the legend as a line.

    The legend texts and output file names are built from the module-level
    settings `INPUT`, `FIT`, `MODEL`, `VAR`, `EFF` and `MIN_STAT`; the axis
    ranges come from `XRANGE` and `YRANGE`. The figure is saved as PDF and
    EPS under `figures/knn/`.
    """

    # rootplotting
    c = rp.canvas(batch=True)
    pad = c.pads()[0]._bare()
    pad.cd()
    pad.SetRightMargin(0.20)
    pad.SetLeftMargin(0.15)
    pad.SetTopMargin(0.10)

    # Styling
    profile.SetMarkerColor(4)
    profile.SetMarkerStyle(20)
    fit.SetLineColor(2)
    fit.SetMarkerColor(4)
    fit.SetMarkerStyle(20)
    profile.GetXaxis().SetTitle( "#it{m}_{jj} [GeV]" )
    profile.GetYaxis().SetTitle( "#it{P}^{#varepsilon=%s%%}" % (EFF) )

    profile.GetYaxis().SetNdivisions(505)
    profile.GetXaxis().SetTitleOffset(1.4)
    profile.GetYaxis().SetTitleOffset(1.4)
    profile.GetXaxis().SetRangeUser(*XRANGE)

    if YRANGE:
        profile.GetYaxis().SetRangeUser(*YRANGE)

    # Draw
    profile.Draw("AP")
    fit.Draw("SAME")

    leg = ROOT.TLegend(0.2, 0.75, 0.5, 0.85)

    # Display names of the known samples
    sample_names = {'data': "CR Data", 'mcCR': "CR MC", 'mc': "Full MC"}

    # Measured points: no entry is added for an unknown `INPUT`
    if INPUT in sample_names:
        leg.AddEntry(profile, sample_names[INPUT], "p")

    # Fit description: the first keyword contained in `FIT` wins. Start from
    # a generic text so that an unrecognised `FIT` no longer leaves
    # `fitLegend` unbound (which raised an UnboundLocalError below).
    fitLegend = "{} fit ".format(FIT)
    for keyword, text in (('knn',   "k-NN fit "),
                          ('poly2', "2. order polynomial fit "),
                          ('poly3', "3. order polynomial fit "),
                          ('erf',   "Error function fit ")):
        if keyword in FIT:
            fitLegend = text
            break

    if MODEL in sample_names:
        fitLegend += "to " + sample_names[MODEL]

    leg.AddEntry(fit, fitLegend, "l")
    leg.Draw()

    # Save
    mkdir('figures/knn/')
    c.save('figures/knn/{}_profile_{:s}_{}_{}_stat{}.pdf'.format( FIT, VAR, EFF, MODEL+INPUT, MIN_STAT))
    c.save('figures/knn/{}_profile_{:s}_{}_{}_stat{}.eps'.format( FIT, VAR, EFF, MODEL+INPUT, MIN_STAT))

    del c
Exemplo n.º 17
0
def perform_optimisation(var, bins, data):
    """
    ...
    """

    # Fill 2D substructure profile
    profile2d = fill_2d_profile(data, var, bins, "m", MASS_BINS)

    # Get 1D profile for lowest mass bin
    profile0 = profile2d.ProjectionY("%s_lowMass" % profile2d.GetName(), 1, 1)
    profile0 = kde(profile0)
    normalise(profile0, density=True)

    # Perform the optimisation
    bestShapeVal = 0
    bestSumChi2 = 1e20
    for shapeVal in SHAPEVAL_RANGE:
        print "Shape value: ", shapeVal
        sumChi2 = 0.

        # Each mass bin needs to be optimized over omega
        for mass in range(len(MASS_BINS) - 1):
            print "   Mass bin: ", mass

            # Get 1D profile for current mass bin
            profile = profile2d.ProjectionY(
                "%s_bin_%i" % (profile2d.GetName(), mass), mass + 1, mass + 1)

            # Fit current profile to low-mass profile
            chi2, bestOmega, _, _ = fit(profile, shapeVal, profile0,
                                        "%.2f" % mass)

            # Accumulate chi2
            sumChi2 += chi2
            pass

        # Update chi2 for current `shapeVal`
        print "-- sumChi2: {} (cp. {})".format(sumChi2, bestSumChi2)
        if sumChi2 < bestSumChi2:
            bestSumChi2 = sumChi2
            bestShapeVal = shapeVal
            pass
        pass

    # Saving CSS transforms
    with Profile("Saving CSS transform"):

        # Ensure model directory exists
        mkdir('models/css/')
        mkdir(
            'figures/css/'
        )  ## put in by me because errors were eturned when saving the pdfs

        # Get the optimal, measured `omega`s for each mass-bin
        bestOmegas = list()
        for mass in range(len(MASS_BINS) - 1):
            profile = profile2d.ProjectionY(
                "%s_bin_%i_final" % (profile2d.GetName(), mass), mass + 1,
                mass + 1)
            sumChi2, bestOmega, profile_css, profile0rebin = fit(
                profile, bestShapeVal, profile0, "%.2f" % mass)

            # Test-plot distributions used for fitting!
            # -- Canvas
            c = rp.canvas(batch=True)

            # -- Plot
            profile = kde(profile)
            normalise(profile, density=True)

            lowmassbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(
                MASS_BINS[0], MASS_BINS[1]).replace('.0', '')
            massbin = "#it{{m}} #in  [{:.1f}, {:.1f}] GeV".format(
                MASS_BINS[mass], MASS_BINS[mass + 1]).replace('.0', '')
            c.hist(profile0rebin,
                   label=latex(var, ROOT=True) + ",    {}".format(lowmassbin),
                   linecolor=rp.colours[1],
                   fillcolor=rp.colours[1],
                   alpha=0.5,
                   option='HISTL',
                   legend_option='FL')
            c.hist(profile,
                   label=latex(var, ROOT=True) + ",    {}".format(massbin),
                   linecolor=rp.colours[4],
                   linestyle=2,
                   option='HISTL')
            c.hist(profile_css,
                   label=latex(var + 'CSS', ROOT=True) +
                   ", {}".format(massbin),
                   linecolor=rp.colours[3],
                   option='HISTL')

            # -- Decorations
            c.xlabel(
                latex(var, ROOT=True) + ", " + latex(var + 'CSS', ROOT=True))
            c.ylabel("Number of jets p.d.f.")
            c.legend(xmin=0.45, ymax=0.76, width=0.25)
            c.text(["#sqrt{s} = 13 TeV,  Multijets", "KDE smoothed"],
                   qualifier=QUALIFIER,
                   ATLAS=False)
            c.pad()._xaxis().SetTitleOffset(1.3)
            c.pad()._yaxis().SetNdivisions(105)
            c.pad()._primitives[-1].Draw('SAME AXIS')
            c.padding(0.50)

            # -- Save
            c.save('figures/css/css_test_{}_mass{}.pdf'.format(var, mass))

            # Store best-fit omega in array
            print mass, bestOmega
            bestOmegas.append(bestOmega)
            pass

        # Fit best omega vs. mass
        x = MASS_BINS[:-1] + 0.5 * np.diff(MASS_BINS)
        y = np.array(bestOmegas)

        h = ROOT.TH1F('hfit', "", len(MASS_BINS) - 1, MASS_BINS)
        root_numpy.array2hist(y, h)
        for ibin in range(1, len(x) + 1):
            h.SetBinError(
                ibin,
                0.02)  # Just some value to ensure equal errors on all points
            pass

        m0 = 0.5 * (MASS_BINS[0] + MASS_BINS[1])
        f = ROOT.TF1(
            "fit",
            "[0] * (1./{m0}  - 1./x) + [1] * TMath::Log(x/{m0})".format(m0=m0),
            m0, 300)
        f.SetLineColor(rp.colours[4])
        f.SetLineStyle(2)
        h.Fit(f)

        # Write out the optimal configuration for each mass bin
        for mass in range(len(MASS_BINS) - 1):
            profile = profile2d.ProjectionY(
                "%s_bin_%i_final" % (profile2d.GetName(), mass), mass + 1,
                mass + 1)
            profile = kde(profile)
            normalise(profile, density=True)
            bestOmegaFitted_ = f.Eval(
                h.GetBinCenter(mass + 1)) + np.finfo(float).eps
            bestOmegaFitted = max(bestOmegaFitted_, 1E-04)
            #bestOmegaFitted = h.GetBinContent(mass + 1)
            print "bestOmegaFitted[{}] = {} --> {}".format(
                mass, bestOmegaFitted_, bestOmegaFitted)
            F, Ginv = get_css_fns(bestShapeVal, bestOmegaFitted, profile, "")

            # Save classifier
            saveclf(F, 'models/css/css_%s_F_%i.pkl.gz' % (var, mass))
            saveclf(Ginv, 'models/css/css_%s_Ginv_%i.pkl.gz' % (var, mass))
            pass

        # Plot best omega vs. mass
        # -- Canvas
        c = rp.canvas(batch=True)

        # -- Plots
        #c.hist(bestOmegas, bins=MASS_BINS, linecolor=rp.colours[1])
        c.hist(h, linecolor=rp.colours[1], option='HIST', label="Measured")
        f.Draw('SAME')

        # -- Decorations
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Best-fit #Omega_{D}")
        c.text([
            "#sqrt{s} = 13 TeV,  Multijets", "CSS applied to {}".format(
                latex(var, ROOT=True)),
            "Best-fit #alpha = {:.1f}".format(bestShapeVal)
        ],
               qualifier=QUALIFIER,
               ATLAS=False)
        c.legend(categories=[('Functional fit', {
            'linewidth': 2,
            'linestyle': 2,
            'linecolor': rp.colours[4]
        })])
        # Save
        c.save('figures/css/cssBestOmega_{}.pdf'.format(var))
        pass

    return 0
Exemplo n.º 18
0
def main (args):
    """
    Plot jet-mass distributions in slices of jet pT.

    For each 100 GeV-wide pT slice between 200 and 2000 GeV, the inclusive
    mass distribution, the in-slice distribution with 'weight_adv' and the
    in-slice distribution with 'weight_test' are drawn and saved to
    `figures/temp_mass_pT<lo>_<hi>.pdf`.

    Arguments:
        args: Command-line arguments, handed to `initialise`.

    Note:
        The function returns right after the pT-slice loop; the distribution
        and 2D-histogram studies further down are currently unreachable.
    """

    # Definitions
    # -- Copy the per-category style dicts as well: labels are assigned below
    #    and a shallow `dict(**HISTSTYLE)` would write them into the shared
    #    `HISTSTYLE` entries.
    histstyle = {key: dict(style) for key, style in HISTSTYLE.items()}

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5', background=True, train=True)

    pt_bins = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = zip(pt_bins[:-1], pt_bins[1:])
    bins = np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True)

    # Make sure the output directory exists before the first save
    mkdir('figures/')

    histstyle[True]['label'] = 'Inclusive'
    for pt_bin in pt_bins:

        histstyle[False]['label'] = 'p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(*pt_bin)

        # Canvas
        c = rp.canvas(batch=True)

        # Plots
        # NOTE(review): the keyword is `weight` here but `weights` in the
        # unreachable section below -- confirm which one `c.hist` expects.
        msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        c.hist(data['m'].values,      bins=bins, weight=data['weight_adv'] .values,      normalise=True, **histstyle[True])
        c.hist(data['m'].values[msk], bins=bins, weight=data['weight_adv'] .values[msk], normalise=True, **histstyle[False])
        c.hist(data['m'].values[msk], bins=bins, weight=data['weight_test'].values[msk], normalise=True, label="Testing weight", linewidth=2, linecolor=ROOT.kGreen)

        # Decorations
        c.legend()
        c.xlabel("Large-#it{R} jet mass [GeV]")
        c.ylabel("Fraction of jets")

        # Save
        c.save('figures/temp_mass_pT{:.0f}_{:.0f}.pdf'.format(*pt_bin))

    return

    # NOTE: Everything below is unreachable because of the `return` above;
    # it is kept as in the original.

    # Perform selection  @NOTE: For Rel. 20.7 only
    #data = data[(data['m']  >  50) & (data['m']  <  300)]
    #data = data[(data['pt'] > 200) & (data['pt'] < 2000)]

    # Add variables  @NOTE: For Rel. 20.7 only
    #data['rho']    = pd.Series(np.log(np.square(data['m']) / np.square(data['pt'])), index=data.index)
    #data['rhoDDT'] = pd.Series(np.log(np.square(data['m']) / data['pt'] / 1.), index=data.index)

    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Check variable distributions; entries are (number of bins, min, max)
    axes = {
        'pt':   (45, 200, 2000),
        'm':    (50,  50,  300),
        'rho':  (50,  -8,    0),
        'logm': (50,  np.log(50),  np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)
    msk_pt = (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        bins = np.linspace(axes[var][1], axes[var][2], axes[var][0] + 1, endpoint=True)
        for adv in [0,1]:
            msk  = data['signal'] == 0   # @TEMP signal
            msk &= msk_pt
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            if adv:
                h1 = c.hist(data.loc[msk, var].values, bins=bins, weights=data.loc[msk, weight].values, **opts)
            else:
                h2 = c.hist(data.loc[msk, var].values, bins=bins, weights=data.loc[msk, 'weight_test'].values, **opts)

        # Ratio
        c.pads()[1].ylim(0,2)
        c.ratio_plot((h1,h2), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        #c.logy()
        c.text(TEXT + ['p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(pt_range[0], pt_range[1])], qualifier=QUALIFIER)

        # Save
        mkdir('figures/distributions')
        c.save('figures/distributions/incl_{}.pdf'.format(var))

    # 2D histograms
    msk = data['signal'] == 0
    axisvars = sorted(list(axes))
    for i,varx in enumerate(axisvars):
        for vary in axisvars[i+1:]:
            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "", *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk, [varx, vary]].values, 100. * data.loc[msk, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))

    return
Exemplo n.º 19
0
def plot1D(*argv):
    """
    Method for delegating 1D plotting.

    Draws the original and DDT-transformed profile graphs for one variable
    together with the fitted linear trend, and saves the canvas as
    `figures/ddt/ddt_[variable].pdf`.

    Arguments (exactly five, passed positionally):
        graphs: Mapping holding the profile graphs; `graphs[variable]` and
            `graphs[variable + 'DDT']` are drawn.
        ddt: Fitted linear model; its `intercept_`, `offset_` and `coef_`
            attributes are read to draw and annotate the fit.
        arr_x: Values whose minimum and maximum give the end points of the
            drawn fit line.
        variable: Name of the variable to plot. Profiles and a y-axis title
            are only drawn when it equals one of VAR_TAU21, VAR_N2,
            VAR_DECDEEP or VAR_DEEP.
        fit_range: Pair (lower, upper) marked with vertical lines and
            labelled "Fit range".
    """

    # Unpack arguments
    graphs, ddt, arr_x, variable, fit_range = argv

    # ROOT-formatted symbol of the variable; shared by the legend entries and
    # the y-axis title so that the two cannot drift apart.
    if variable == VAR_TAU21:
        symbol = "#tau_{21}"
    elif variable == VAR_N2:
        symbol = "N_{2}"
    elif variable == VAR_DECDEEP:
        symbol = "dec_deepWvsQCD"
    elif variable == VAR_DEEP:
        symbol = "deepWvsQCD"
    else:
        # Unknown variable: as before, no profiles and no y-axis title.
        symbol = None

    # Style
    ROOT.gStyle.SetTitleOffset(1.4, 'x')

    # Canvas
    c = rp.canvas(batch=True)

    # Setup
    pad = c.pads()[0]._bare()
    pad.cd()
    # NOTE(review): the original set the top margin twice in a row; possibly
    # another margin was intended for the second call -- confirm.
    pad.SetTopMargin(0.10)

    # Profiles
    if symbol is not None:
        c.graph(graphs[variable],
                label="Original, " + symbol,
                linecolor=rp.colours[4], markercolor=rp.colours[4],
                markerstyle=24, legend_option='PE')
        c.graph(graphs[variable + 'DDT'],
                label="Transformed, " + symbol + "^{DDT}",
                linecolor=rp.colours[1], markercolor=rp.colours[1],
                markerstyle=20, legend_option='PE')

    # Fit
    x1, x2 = min(arr_x), max(arr_x)
    intercept, coef = ddt.intercept_ + ddt.offset_, ddt.coef_
    y1 = intercept + x1 * coef
    y2 = intercept + x2 * coef
    c.plot([y1, y2], bins=[x1, x2], color=rp.colours[-1], label='Linear fit',
           linewidth=1, linestyle=1, option='L')

    # Decorations
    c.xlabel("jet #rho^{DDT} = log[m^{2} / (p_{T} #times 1 GeV)]")
    if symbol is not None:
        c.ylabel("#LT" + symbol + "#GT, #LT" + symbol + "^{DDT}#GT")

    c.text(["#sqrt{s} = 13 TeV,  Multijets"], qualifier=QUALIFIER, ATLAS=False)
    c.legend(width=0.25, xmin=0.57, ymax=0.86)

    c.xlim(0, 6.0)
    # N2 gets a smaller y-axis range than the other variables.
    ymax = 0.8 if variable == VAR_N2 else 1.4
    c.ylim(0, ymax)

    c.latex("Fit range", sum(fit_range) / 2., 0.08, textsize=13,
            textcolor=ROOT.kGray + 2)
    c.latex("Fit parameters:", 0.37, 0.7 * ymax, align=11, textsize=14,
            textcolor=ROOT.kBlack)
    c.latex("  intercept = {:7.4f}".format(intercept[0]), 0.37, 0.65 * ymax,
            align=11, textsize=14, textcolor=ROOT.kBlack)
    c.latex("  coef = {:7.4f}".format(coef[0]), 0.37, 0.6 * ymax,
            align=11, textsize=14, textcolor=ROOT.kBlack)
    c.xline(fit_range[0], ymax=0.82, text_align='BR', linecolor=ROOT.kGray + 2)
    c.xline(fit_range[1], ymax=0.82, text_align='BL', linecolor=ROOT.kGray + 2)

    # Save
    mkdir('figures/ddt/')
    c.save('figures/ddt/ddt_{}.pdf'.format(variable))
    return
Exemplo n.º 20
0
def main(args):
    """
    Train one uBoost classifier per uniforming rate and save each to disk.

    Loads the training subset of `data_1M_10M.h5`, fits the classifiers in
    parallel threads, and pickles every fitted classifier (gzip-compressed)
    under `models/uboost/`.

    Arguments:
        args: Command-line arguments, forwarded to `initialise`; the returned
            namespace must provide `input` (prefix of the input file path).

    Returns:
        0 on completion.
    """

    # Initialising
    # --------------------------------------------------------------------------
    args, cfg = initialise(args)

    # Loading data
    # --------------------------------------------------------------------------
    data, features, _ = load_data(args.input + 'data_1M_10M.h5')
    data = data[data['train'] == 1]

    # Reduce size of data. `DataFrame.drop` returns a new frame, so the result
    # has to be assigned; the original discarded it, which left every column
    # in place and made this step a no-op.
    keep = set(features + ['m', 'signal', 'weight_adv'])
    drop_features = [feat for feat in list(data) if feat not in keep]
    data = data.drop(drop_features, axis=1)

    cfg['uBoost']['train_features'] = features
    cfg['uBoost']['random_state'] = SEED
    cfg['DecisionTreeClassifier']['random_state'] = SEED

    # Arrays
    X = data
    w = np.array(data['weight_adv']).flatten()
    y = np.array(data['signal']).flatten()

    def train_uBoost(X, y, w, cfg, uniforming_rate):
        """
        Fit and return a single uBoost classifier.

        Arguments:
            X: Training data, passed to `uBoostBDT.fit`.
            y: Target labels.
            w: Per-sample weights, passed as `sample_weight`.
            cfg: Configuration with 'DecisionTreeClassifier' and 'uBoost'
                keyword-argument dicts.
            uniforming_rate: Value set as the `uniforming_rate` keyword of
                the uBoost classifier, overriding any value in `cfg`.

        Returns:
            The fitted `uBoostBDT` instance.
        """

        # Create base classifier
        base_tree = DecisionTreeClassifier(**cfg['DecisionTreeClassifier'])

        # Update training configuration on a copy, so that the shared `cfg`
        # is not modified by concurrently running jobs.
        these_cfg = dict(cfg['uBoost'])
        these_cfg['uniforming_rate'] = uniforming_rate

        # Create and fit uBoost classifier
        uboost = uBoostBDT(base_estimator=base_tree, **these_cfg)
        uboost.fit(X, y, sample_weight=w)

        return uboost

    # Fit uBoost classifier
    # --------------------------------------------------------------------------
    with Profile("Fitting uBoost classifier"):

        # @NOTE: There might be an issue with the sample weights, because the
        #        local efficiencies computed using kNN does not seem to take the
        #        sample weights into account.
        #
        #        See:
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/uboost.py#L247-L248
        #        and
        #          https://github.com/arogozhnikov/hep_ml/blob/master/hep_ml/metrics_utils.py#L159-L176
        #        with `divided_weights` not set.
        #
        #        `sample_weight` seems to be used only as a starting point for
        #        the boosting, and so not used for the efficiency calculation.
        #
        #        If this is indeed the case, it would be possible to simply
        #        sample MC events by their weight, and use `sample_weight = 1`
        #        for all samples passed to uBoost.
        #
        # @NOTE: I have gotten less sure of the above, so probably no panic.

        # Wider scan previously used:
        #   [0.0, 0.01, 0.1, 0.3, 1.0, 3.0, 10.0, 30.0, 100.0]
        uniforming_rates = [0.0, 0.01, 0.1, 0.3, 0.5, 1.0]
        n_jobs = min(7, len(uniforming_rates))

        jobs = [
            delayed(train_uBoost, check_pickle=False)(X, y, w, cfg,
                                                      uniforming_rate)
            for uniforming_rate in uniforming_rates
        ]

        result = Parallel(n_jobs=n_jobs, backend="threading")(jobs)
        pass

    # Saving classifiers
    # --------------------------------------------------------------------------
    # Ensure model directory exists
    mkdir('models/uboost/')

    # The target efficiency is the same for every classifier
    suffix_te = "te_{:d}".format(int(cfg['uBoost']['target_efficiency'] * 100))

    for uboost, uniforming_rate in zip(result, uniforming_rates):
        with Profile("Saving classifiers"):

            # E.g. 0.01 -> "ur_0p01"
            suffix_ur = "ur_{:s}".format(
                ("%.2f" % uniforming_rate).replace('.', 'p'))

            # Save uBoost classifier
            path = ('models/uboost/'
                    'uboost_{}_{}_rel21_fixed_def_cfg_1000boost.pkl.gz'
                    .format(suffix_ur, suffix_te))
            with gzip.open(path, 'w') as f:
                pickle.dump(uboost, f)
                pass
            pass
        pass

    return 0
Exemplo n.º 21
0
def _plot_distributions(data):
    """
    Plot inclusive 1D and pairwise 2D distributions of kinematic variables.

    Not called from `main`: in the original this code sat after an
    unconditional `return` and was therefore never executed. It is kept here
    as a separate helper so that it can be re-enabled explicitly.

    Saves `figures/distributions/incl_[var].pdf` for each variable and
    `figures/distributions/2d_[varx]_[vary].pdf` for each pair of variables.

    Arguments:
        data: Pandas data frame. Columns 'pt', 'm', 'rho', 'signal',
            'weight_adv' and 'weight_test' are read; a 'logm' column is
            added in place.
    """

    data['logm'] = pd.Series(np.log(data['m']), index=data.index)

    # Axis definitions: (number of bins, lower edge, upper edge)
    axes = {
        'pt': (45, 200, 2000),
        'm': (50, 50, 300),
        'rho': (50, -8, 0),
        'logm': (50, np.log(50), np.log(300)),
    }
    weight = 'weight_adv'  # 'weight_test' / 'weight'
    pt_range = (200., 2000.)

    # Only entries with signal == 0 are plotted  # @TEMP signal
    msk_bkg = data['signal'] == 0
    msk = msk_bkg & (data['pt'] > pt_range[0]) & (data['pt'] < pt_range[1])

    mkdir('figures/distributions')

    # 1D histograms
    for var in axes:

        # Canvas
        c = rp.canvas(num_pads=2, batch=True)

        # Plot
        nbins, lower, upper = axes[var]
        bins = np.linspace(lower, upper, nbins + 1, endpoint=True)
        values = data.loc[msk, var].values

        hists = {}
        for adv in [0, 1]:
            opts = dict(normalise=True, **HISTSTYLE[adv])  # @TEMP signal
            opts['label'] = 'adv' if adv else 'test'
            column = weight if adv else 'weight_test'
            hists[adv] = c.hist(values,
                                bins=bins,
                                weights=data.loc[msk, column].values,
                                **opts)
            pass

        # Ratio (adv / test)
        c.pads()[1].ylim(0, 2)
        c.ratio_plot((hists[1], hists[0]), oob=True)

        # Decorations
        c.legend()
        c.xlabel(latex(var, ROOT=True))
        c.ylabel("Fraction of jets")
        c.pads()[1].ylabel("adv/test")
        c.text(TEXT + [
            'p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(pt_range[0],
                                                       pt_range[1])
        ],
               qualifier=QUALIFIER)

        # Save
        c.save('figures/distributions/incl_{}.pdf'.format(var))
        pass

    # 2D histograms (no pT selection applied here)
    axisvars = sorted(axes)
    for i, varx in enumerate(axisvars):
        for vary in axisvars[i + 1:]:
            # Canvas
            c = ROOT.TCanvas()
            c.SetRightMargin(0.20)

            # Create, fill histogram
            h2 = ROOT.TH2F('{}_{}'.format(varx, vary), "",
                           *(axes[varx] + axes[vary]))
            root_numpy.fill_hist(h2, data.loc[msk_bkg, [varx, vary]].values,
                                 100. * data.loc[msk_bkg, weight].values)

            # Draw
            h2.Draw("COLZ")

            # Decorations
            h2.GetXaxis().SetTitle(latex(varx, ROOT=True))
            h2.GetYaxis().SetTitle(latex(vary, ROOT=True))
            c.SetLogz()

            # Save
            c.SaveAs('figures/distributions/2d_{}_{}.pdf'.format(varx, vary))
            pass
        pass

    return


def main(args):
    """
    Compare training- and testing-weight distributions of jet mass and pT.

    For each of the variables 'm' and 'pt', for the inclusive sample and for
    each of 18 equal-width pT bins between 200 and 2000, and for both
    logarithmic and linear y-axis, draws the normalised distribution weighted
    by 'weight_test' and by 'weight_adv'.

    Saves plots `figures/weighting_[mass|pt][_pT<lo>_<hi>][_log].pdf`.

    Arguments:
        args: Command-line arguments, forwarded to `initialise`; the returned
            namespace must provide `input` (prefix of the input file path).
    """

    # Definitions
    # Copy the per-key style dicts as well, so that setting the labels below
    # does not modify the module-level HISTSTYLE shared with other studies.
    histstyle = {key: dict(HISTSTYLE[key]) for key in HISTSTYLE}
    histstyle[True]['label'] = 'Training weight'
    histstyle[False]['label'] = 'Testing weight'

    # Initialise
    args, cfg = initialise(args)

    # Load data
    data, features, _ = load_data(args.input + 'data.h5',
                                  background=True,
                                  train=True)

    # `None` is the inclusive selection; `list(...)` keeps the concatenation
    # valid also where `zip` returns an iterator.
    pt_edges = np.linspace(200, 2000, 18 + 1, endpoint=True)
    pt_bins = [None] + list(zip(pt_edges[:-1], pt_edges[1:]))

    # Histogram binning per variable: 10-wide bins in mass, 50-wide in pT
    binning = {
        'm': np.linspace(50, 300, (300 - 50) // 10 + 1, endpoint=True),
        'pt': np.linspace(200, 2000, (2000 - 200) // 50 + 1, endpoint=True),
    }
    variables = ['m', 'pt']

    # Make sure the output directory exists before saving
    mkdir('figures/')

    for var, pt_bin, log in itertools.product(variables, pt_bins,
                                              [True, False]):

        bins = binning[var]

        # Canvas
        c = rp.canvas(batch=True)

        # Selection
        if pt_bin is not None:
            msk = (data['pt'] > pt_bin[0]) & (data['pt'] < pt_bin[1])
        else:
            msk = np.ones(data.shape[0], dtype=bool)
            pass

        # Plots: testing weight first, then training weight
        for is_training, column in [(False, 'weight_test'),
                                    (True, 'weight_adv')]:
            c.hist(data[var].values[msk],
                   bins=bins,
                   weights=data[column].values[msk],
                   normalise=True,
                   **histstyle[is_training])
            pass

        # Decorations
        c.text(TEXT + ["Multijets", "Training dataset"] +
               (['p_{{T}} #in  [{:.0f}, {:.0f}] GeV'.format(
                   *pt_bin)] if pt_bin is not None else []),
               qualifier='Simulation Internal')
        c.legend()
        c.xlabel("Large-#it{{R}} jet {:s} [GeV]".format('mass' if var ==
                                                        'm' else 'p_{T}'))
        c.ylabel("Fraction of jets")
        if log:
            c.logy()
            pass

        # Save
        c.save('figures/weighting_{}{:s}{}.pdf'.format(
            'mass' if var == 'm' else var,
            '_pT{:.0f}_{:.0f}'.format(*pt_bin) if pt_bin is not None else '',
            '_log' if log else ''))
        pass

    return