Exemplo n.º 1
0
def test_blocks_hist2(cmdopt, data_gen):
    """Regression-check a weighted ``bins='blocks'`` histogram.

    Parameters
    ----------
    cmdopt : str
        ``"generate"`` writes the first two entries of the ``hist`` return
        value to ``answers_blocks_hist2.npz`` (keys ``bc`` and ``be``) and
        shows the figure; ``"test"`` compares them element-wise against that
        stored file.  Any other value skips both steps.
    data_gen : sequence
        Indexable fixture; entry 0 is histogrammed, entry 2 is used as the
        weights and entry 1 is only used in the failure check below.

    Regardless of ``cmdopt``, the test finally checks that a two-dataset call
    with ``bins='blocks'`` and ``gamma=0.1`` raises ``ValueError``.
    """
    output = skh_plt.hist(data_gen[0],
                          weights=data_gen[2],
                          bins='blocks',
                          scale='binwidth',
                          color='green',
                          p0=0.1)

    if cmdopt == "generate":
        with open(answer_dir + '/answers_blocks_hist2.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_blocks_hist2')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz returns a lazy NpzFile that keeps the archive
        # open; the context manager closes it once the comparisons are done.
        with np.load(answer_dir + '/answers_blocks_hist2.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])

    with pytest.raises(ValueError):
        skh_plt.hist([data_gen[0], data_gen[1]],
                     bins='blocks',
                     scale='binwidth',
                     color='green',
                     gamma=0.1)
Exemplo n.º 2
0
def test_ratio_plot_stacked(cmdopt, data_gen):
    """Regression-check ``ratio_plot`` for two stacked inputs with ``bins='blocks'``.

    Parameters
    ----------
    cmdopt : str
        ``"generate"`` stores ``output[1][0]``, ``output[1][1]``,
        ``output[2][0]`` and ``output[2][1]`` in
        ``answers_ratio_plot_stacked.npz`` (keys ``bc1``, ``be1``, ``bc2``,
        ``be2``), titles ``output[0][0]`` and shows the figure; ``"test"``
        compares the same four entries element-wise against the stored file.
    data_gen : sequence
        Indexable fixture; entries 0 and 1 are the stacked datasets, entry 2
        is used as the weights of both datasets in the second histogram.
    """
    output = skh_plt.ratio_plot(dict(x=[data_gen[0], data_gen[1]],
                                     stacked=True,
                                     errorbars=True),
                                dict(x=[data_gen[0], data_gen[1]],
                                     weights=[data_gen[2], data_gen[2]],
                                     stacked=True,
                                     errorbars=True,
                                     err_style='line'),
                                range=(-5, 5),
                                bins='blocks')

    if cmdopt == "generate":
        with open(answer_dir + '/answers_ratio_plot_stacked.npz', 'wb') as f:
            np.savez(f,
                     bc1=output[1][0],
                     be1=output[1][1],
                     bc2=output[2][0],
                     be2=output[2][1])
        output[0][0].set_title('test_ratio_plot_stacked')
        plt.show()
    elif cmdopt == "test":
        # Close the lazily-read NpzFile as soon as the comparisons are done.
        with np.load(answer_dir + '/answers_ratio_plot_stacked.npz') as answers:
            assert np.all(output[1][0] == answers['bc1'])
            assert np.all(output[1][1] == answers['be1'])
            assert np.all(output[2][0] == answers['bc2'])
            assert np.all(output[2][1] == answers['be2'])
Exemplo n.º 3
0
def test_ratio_plot_log(cmdopt, data_gen):
    """Regression-check ``ratio_plot`` with ``logx=True`` and a fixed ``ratio_range``.

    Parameters
    ----------
    cmdopt : str
        ``"generate"`` stores ``output[1][0]``, ``output[1][1]``,
        ``output[2][0]`` and ``output[2][1]`` in
        ``answers_ratio_plot_log.npz`` (keys ``bc1``, ``be1``, ``bc2``,
        ``be2``), titles ``output[0][0]`` and shows the figure; ``"test"``
        compares the same four entries element-wise against the stored file.
    data_gen : sequence
        Indexable fixture; entry 0 feeds the first (marker, log) histogram,
        entry 1 the second one, weighted by entry 2.
    """
    output = skh_plt.ratio_plot(dict(x=data_gen[0],
                                     errorbars=True,
                                     histtype='marker',
                                     log=True,
                                     err_x=False),
                                dict(x=data_gen[1],
                                     weights=data_gen[2],
                                     errorbars=True),
                                logx=True,
                                ratio_range=(0, 10))

    if cmdopt == "generate":
        with open(answer_dir + '/answers_ratio_plot_log.npz', 'wb') as f:
            np.savez(f,
                     bc1=output[1][0],
                     be1=output[1][1],
                     bc2=output[2][0],
                     be2=output[2][1])
        output[0][0].set_title('test_ratio_plot_log')
        plt.show()
    elif cmdopt == "test":
        # Close the lazily-read NpzFile as soon as the comparisons are done.
        with np.load(answer_dir + '/answers_ratio_plot_log.npz') as answers:
            assert np.all(output[1][0] == answers['bc1'])
            assert np.all(output[1][1] == answers['be1'])
            assert np.all(output[2][0] == answers['bc2'])
            assert np.all(output[2][1] == answers['be2'])
Exemplo n.º 4
0
def test_simple_hist1(cmdopt, data_gen):
    """Regression-check ``hist`` called with default options on ``data_gen[0]``.

    With ``cmdopt == "generate"`` the first two entries of the return value
    are written to ``answers_simple_hist1.npz`` (keys ``bc`` and ``be``) and
    the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0])

    if cmdopt == "generate":
        with open(answer_dir + '/answers_simple_hist1.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_simple_hist1')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir + '/answers_simple_hist1.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
Exemplo n.º 5
0
def test_blocks_hist(cmdopt, data_gen):
    """Regression-check an unweighted ``bins='blocks'`` histogram scaled by bin width.

    With ``cmdopt == "generate"`` the first two entries of the return value
    are written to ``answers_blocks_hist.npz`` (keys ``bc`` and ``be``) and
    the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0], bins='blocks', scale='binwidth', color='green')

    if cmdopt == "generate":
        with open(answer_dir+'/answers_blocks_hist.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_blocks_hist')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_blocks_hist.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
Exemplo n.º 6
0
def test_simple_hist4(cmdopt, data_gen):
    """Regression-check a weighted, normalised bar histogram on ``bins=range(5)``.

    With ``cmdopt == "generate"`` the first two entries of the return value
    are written to ``answers_simple_hist4.npz`` (keys ``bc`` and ``be``) and
    the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0], weights=data_gen[2], bins=range(5), normed=True,
                          scale='binwidth', color='red', histtype='bar')

    if cmdopt == "generate":
        with open(answer_dir+'/answers_simple_hist4.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_simple_hist4')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_simple_hist4.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
Exemplo n.º 7
0
def test_error_bars(cmdopt, data_gen):
    """Regression-check a 20-bin histogram with error bars and ``scale=5``.

    ``err_return=True`` is requested, and the first three entries of the
    return value are the ones checked.  With ``cmdopt == "generate"`` they are
    written to ``answers_error_bars.npz`` (keys ``bc``, ``be``, ``berr``) and
    the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0], bins=20, errorbars=True, err_return=True, scale=5)

    if cmdopt == "generate":
        with open(answer_dir+'/answers_error_bars.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_error_bars.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
Exemplo n.º 8
0
def test_ratio_plot_quick(cmdopt, data_gen):
    """Quick checks of ``bins`` / ``range`` given inside the ``ratio_plot`` dictionaries."""
    first, second = data_gen[0], data_gen[1]

    # --- bins ---
    # Different ``bins`` values in the two dictionaries must raise KeyError.
    with pytest.raises(KeyError):
        skh_plt.ratio_plot({'x': first, 'bins': 10}, {'x': second, 'bins': 11})
    # ``bins`` given in one dictionary only: output[1][0] must have 11 entries.
    result = skh_plt.ratio_plot({'x': first}, {'x': second, 'bins': 11})
    assert len(result[1][0]) == 11

    # --- range ---
    # Different ``range`` values in the two dictionaries must raise KeyError.
    with pytest.raises(KeyError):
        skh_plt.ratio_plot({'x': first, 'range': (0, 1)},
                           {'x': second, 'range': (1, 2)})

    # ``range`` given in either dictionary alone: the first and last entries
    # of output[1][1] must stay inside it.
    lower, upper = -0.1, 0.1
    result = skh_plt.ratio_plot({'x': first, 'range': (lower, upper)}, {'x': second})
    edges = result[1][1]
    assert edges[0] >= lower and edges[-1] <= upper

    result = skh_plt.ratio_plot({'x': first}, {'x': second, 'range': (lower, upper)})
    edges = result[1][1]
    assert edges[0] >= lower and edges[-1] <= upper
Exemplo n.º 9
0
def test_error_bars4(cmdopt, data_gen):
    """Regression-check a 50-bin step histogram with Poisson error bars.

    The call also sets ``suppress_zero=True`` and ``scale='binwidth'``.  With
    ``cmdopt == "generate"`` the first three entries of the return value are
    written to ``answers_error_bars4.npz`` (keys ``bc``, ``be``, ``berr``)
    and the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0], bins=50, errorbars=True, err_return=True,
                          histtype='step', err_type='poisson', suppress_zero=True, scale='binwidth')

    if cmdopt == "generate":
        with open(answer_dir+'/answers_error_bars4.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars4')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_error_bars4.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
Exemplo n.º 10
0
def test_error_bars2(cmdopt, data_gen):
    """Regression-check a single-bin, normalised histogram with Poisson error bars.

    The call uses ``bins=1``, ``scale=0.5`` and ``normed=True``.  With
    ``cmdopt == "generate"`` the first three entries of the return value are
    written to ``answers_error_bars2.npz`` (keys ``bc``, ``be``, ``berr``)
    and the figure is shown; with ``cmdopt == "test"`` they are compared
    element-wise against that stored file.
    """
    output = skh_plt.hist(data_gen[0], bins=1, errorbars=True, scale=0.5, normed=True,
                          err_color='k', alpha=0.1, err_type='poisson', err_return=True)

    if cmdopt == "generate":
        with open(answer_dir+'/answers_error_bars2.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars2')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_error_bars2.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
Exemplo n.º 11
0
def test_error_bars_stacked3(cmdopt, data_gen):
    """Regression-check a stacked, weighted, normalised step histogram with error bars.

    Two datasets (``data_gen[0]`` and ``data_gen[1]``) are stacked, both
    weighted by ``data_gen[2]``, with ``bins=20`` and ``scale=2``.  With
    ``cmdopt == "generate"`` the first three entries of the return value are
    written to ``answers_error_bars_stacked3.npz`` (keys ``bc``, ``be``,
    ``berr``) and the figure is shown; with ``cmdopt == "test"`` they are
    compared element-wise against that stored file.
    """
    output = skh_plt.hist([data_gen[0], data_gen[1]], bins=20, histtype='step', stacked=True,
                          weights=[data_gen[2], data_gen[2]], errorbars=True, err_return=True,
                          normed=True, scale=2)

    if cmdopt == "generate":
        with open(answer_dir+'/answers_error_bars_stacked3.npz', 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        # Title now matches this test's name (it previously read
        # '..._stacked2', a copy-paste leftover).
        plt.title('test_error_bars_stacked3')
        plt.show()
    elif cmdopt == "test":
        # np.load on an .npz keeps the archive open until closed; use the
        # context manager so the handle is released even if an assert fails.
        with np.load(answer_dir+'/answers_error_bars_stacked3.npz') as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
Exemplo n.º 12
0
 def plot_binned_data_error(self, axis, bin_edges, data, wgt_sqrd, *args,
                            **kwargs):
     """Draw already-binned ``data`` with error bars through ``skh_plt.hist``.

     The bin centres are passed as the sample values and ``data`` as their
     weights, so the histogram reproduces the binned contents.  Errors are
     ``sqrt(wgt_sqrd)``; when ``density`` is passed as a keyword and equals
     ``True`` they are divided by ``sum(data)`` and by the width of the first
     bin.  Extra positional and keyword arguments are forwarded to
     ``skh_plt.hist``, whose return value is returned unchanged.

     NOTE(review): ``wgt_sqrd`` must provide ``.reindex`` after ``np.sqrt``
     (presumably a pandas Series labelled by bin number) - confirm with callers.
     """
     # Width of the first bin only; it is applied to every bin's error.
     first_bin_width = bin_edges[1] - bin_edges[0]

     bin_errors = np.sqrt(wgt_sqrd)
     if kwargs.get('density') == True:
         bin_errors = bin_errors / np.sum(data) / first_bin_width
     # Align to labels 1 .. len(bin_edges) - 1; missing labels get error 0.
     bin_labels = np.arange(1, len(bin_edges))
     bin_errors = bin_errors.reindex(bin_labels, fill_value=0)

     # Sample values are the bin centres, weighted by the binned contents.
     centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0
     return skh_plt.hist(centres,
                         *args,
                         ax=axis,
                         bins=bin_edges,
                         weights=data,
                         errorbars=bin_errors,
                         **kwargs)
Exemplo n.º 13
0
 def plot_stacked_binned_data_error(self, axis, bin_edges, data, wgt_sqrd,
                                    *args, **kwargs):
     """Draw several already-binned datasets as one stacked histogram with errors.

     The entries of ``wgt_sqrd`` are summed (missing labels count as 0), the
     square root is taken, and the result is aligned to labels
     ``1 .. len(bin_edges) - 1`` to give the error bars.  Each dataset in
     ``data`` is drawn by passing the bin centres as sample values and the
     transposed ``data`` as weights.  Extra positional and keyword arguments
     are forwarded to ``skh_plt.hist``, whose return value is returned
     unchanged.

     NOTE(review): the entries of ``wgt_sqrd`` must provide ``.add`` and
     ``.reindex`` (presumably pandas Series) - confirm with callers.
     """
     # Accumulate the squared weights of all datasets.
     summed_sq = wgt_sqrd[0]
     for idx in range(1, len(wgt_sqrd)):
         summed_sq = summed_sq.add(wgt_sqrd[idx], fill_value=0)

     bin_labels = np.arange(1, len(bin_edges))
     stacked_errors = np.array(
         np.sqrt(summed_sq).reindex(bin_labels, fill_value=0))

     # One column of bin centres per dataset: shape (n_bins, n_datasets).
     centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0
     samples = np.transpose(np.array([centres]).repeat(len(data), axis=0))

     # The weights are the y-values of the input binned data.
     return skh_plt.hist(samples,
                         *args,
                         ax=axis,
                         bins=bin_edges,
                         weights=np.transpose(data),
                         errorbars=stacked_errors,
                         stacked=True,
                         **kwargs)
Exemplo n.º 14
0
def test_hist_fails(cmdopt, data_gen):
    """Check which ``hist`` calls must raise and that two degenerate inputs succeed."""
    # (expected exception, positional args, keyword args), run in order.
    failing_calls = [
        (ValueError, ([data_gen[0], data_gen[1]],),
         {'stacked': True, 'histtype': 'marker'}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'histtype': 'marker'}),
        (KeyError, (1,), {'err_return': True}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'weights': data_gen[2]}),
        (ValueError, (data_gen[0],), {'weights': data_gen[2][0:10]}),
        (KeyError, (data_gen[0],), {'err_type': 'fake', 'errorbars': True}),
    ]
    for expected_exc, call_args, call_kwargs in failing_calls:
        with pytest.raises(expected_exc):
            skh_plt.hist(*call_args, **call_kwargs)

    # A single scalar: every entry of the first return value equals 1.
    scalar_result = skh_plt.hist(5)
    assert np.all(scalar_result[0] == 1)
    # An empty input with an explicit range: every entry equals 0.
    empty_result = skh_plt.hist([], range=(0, 1))
    assert np.all(empty_result[0] == 0)
Exemplo n.º 15
0
def plotScores():
    """Plot normalised train/test DNN score distributions for each output directory.

    For every directory in ``outDirAfterDilep`` the four score arrays
    (train/test x signal/background) are loaded from
    ``<dir>/<modelName>/yhat_*.npy``, drawn as density-normalised histograms
    on a log y-axis, and saved to ``<dir>/<modelName>/MC_TrainTest_Score.pdf``.
    When ``isBlindAnalysis`` is false, ``yhat_data.npy`` is additionally
    loaded and overlaid as markers with error bars.

    Returns
    -------
    None
    """
    isBlindAnalysis = True
    modelName = 'llqqDNN_100_60_2_0'
    outDirAfterDilep = [
        'Out_AfterDilepton_TrainggF1000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF2000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF3000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF700_FullStat_1FatJet'
    ]

    # Alternative selection; iterate over this list instead of
    # outDirAfterDilep below to plot it.
    outDirAfterggF = [
        'Out_AfterggFMerged_TrainggF1000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF2000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF3000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF700_FullStat_1FatJet'
    ]

    # (file name, histtype, colour, legend label), in drawing order.
    mc_samples = [
        ("yhat_train_signal.npy", 'step', 'deepskyblue', 'TrainSignal'),
        ("yhat_test_signal.npy", 'stepfilled', 'turquoise', 'TestSignal'),
        ("yhat_train_background.npy", 'step', 'deeppink', 'TrainBackground'),
        ("yhat_test_background.npy", 'stepfilled', 'plum', 'TestBackground'),
    ]
    bins = np.linspace(0, 1, 50)

    for idir in outDirAfterDilep:
        model_dir = os.path.join(idir, modelName)

        for file_name, histtype, color, label in mc_samples:
            scores = np.load(os.path.join(model_dir, file_name))
            # ``density`` replaces Matplotlib's removed ``normed`` keyword.
            plt.hist(scores,
                     bins=bins,
                     histtype=histtype,
                     lw=2,
                     alpha=0.5,
                     color=color,
                     label=label,
                     density=True)

        if not isBlindAnalysis:
            yhat_data = np.load(os.path.join(model_dir, "yhat_data.npy"))
            # NOTE(review): skh_plt.hist is left with its own ``normed``
            # keyword - confirm against the installed version of that package.
            skh_plt.hist(yhat_data,
                         bins=bins,
                         errorbars=True,
                         histtype='marker',
                         label='Data',
                         color='black',
                         normed=True)

        plt.legend(loc="upper center")
        plt.ylabel('Norm. Entries')
        plt.xlabel('DNN score')
        plt.yscale('log')
        plt.savefig(os.path.join(model_dir, "MC_TrainTest_Score.pdf"))
        # Clear the current figure so the next directory starts from scratch.
        plt.clf()
Exemplo n.º 16
0
def test_hist_fails(cmdopt, data_gen):
    """Check which ``hist`` calls must raise and how degenerate inputs behave."""

    def expect_failure(exc_type, *call_args, **call_kwargs):
        # Run one hist call that is required to raise ``exc_type``.
        with pytest.raises(exc_type):
            skh_plt.hist(*call_args, **call_kwargs)

    expect_failure(ValueError, [data_gen[0], data_gen[1]],
                   stacked=True, histtype='marker')
    expect_failure(ValueError, [data_gen[0], data_gen[1]], histtype='marker')
    expect_failure(KeyError, 1, err_return=True)
    expect_failure(ValueError, [data_gen[0], data_gen[1]], weights=data_gen[2])
    expect_failure(ValueError, data_gen[0], weights=data_gen[2][0:10])
    expect_failure(KeyError, data_gen[0], err_type='fake', errorbars=True)

    # A single scalar: every entry of the first return value equals 1.
    scalar_result = skh_plt.hist(5)
    assert np.all(scalar_result[0] == 1)

    # The histogram method does not support empty lists as input on
    # Python 2.6, so there the call is expected to raise instead.
    if sys.version_info >= (2, 7):
        empty_result = skh_plt.hist([], range=(0, 1))
        assert np.all(empty_result[0] == 0)
    else:
        expect_failure(ValueError, [], range=(0, 1))
Exemplo n.º 17
0
def comp_study(input_data,
               n_events,
               xlims=None,
               resamples=100,
               dist_name='2Gauss'):
    """Compare nine histogram binning methods on one sample and save two PDFs.

    A nominal sample ``data_nom`` and ``resamples`` comparison samples are
    either generated ('Gauss', '2LP') or drawn from ``input_data`` (every
    other ``dist_name``).  The nominal sample is histogrammed with the
    Sturges, Doane, Scott, Freedman-Diaconis, Knuth, Rice, sqrt,
    equal-population and Bayesian-blocks binnings on a 3x3 grid.  For each
    binning ``rough`` is evaluated on the plotted histogram and ``err_li`` on
    the nominal sample and on every resample (the latter averaged).  A second
    figure shows a scatter of the two metrics and a stacked bar chart of
    their ranks.

    Parameters
    ----------
    input_data : array-like
        Pool the samples are drawn from; not read when ``dist_name`` is
        'Gauss' or '2LP'.
    n_events : int
        Size of the nominal sample and of each resample.
    xlims : sequence, optional
        If truthy, the first and last Bayesian-blocks edges are overwritten
        with ``xlims[0]`` and ``xlims[-1]``.
    resamples : int
        Number of comparison samples.
    dist_name : str
        Selects the sample source and RNG seed, and is used in titles and
        output file names.
        NOTE(review): the default '2Gauss' matches none of the explicit
        branches and therefore takes the generic ``else`` branch - confirm
        this is intended.

    Returns
    -------
    None
        Output is written to
        ``<bb_dir>/plots/bin_comp/hists_<dist_name>_<n_events>.pdf`` and
        ``.../metric_<dist_name>_<n_events>.pdf``.
    """
    # NOTE(review): hard-coded, machine-specific output directory.
    bb_dir = os.path.join('/Users/brianpollack/Coding/BayesianBlocks')
    do_log = True

    # Build the nominal sample and the resamples.  Each branch seeds NumPy's
    # global RNG first, so the order of the draws below matters.
    # data_nom = input_data[:n_events]
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        do_log = False

    elif dist_name == '2LP':
        # Mixture: 65% and 25% Laplace components plus a 10% uniform one.
        np.random.seed(33)
        data_nom = np.concatenate(
            (np.random.laplace(loc=90, scale=5, size=int(n_events * 0.65)),
             np.random.laplace(loc=110, scale=1.5, size=int(n_events * 0.25)),
             np.random.uniform(low=80, high=120, size=int(n_events * 0.10))))
        resample_list = np.concatenate(
            (np.random.laplace(
                loc=90, scale=5, size=(resamples, int(n_events * 0.65))),
             np.random.laplace(
                 loc=110, scale=1.5, size=(resamples, int(n_events * 0.25))),
             np.random.uniform(
                 low=80, high=120, size=(resamples, int(n_events * 0.10)))),
            axis=1)
        do_log = False

    elif dist_name == 'jPT':
        # Nominal sample drawn without replacement, resamples with replacement.
        np.random.seed(11)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)

    elif dist_name == 'DY':
        np.random.seed(200)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)
    else:
        np.random.seed(1)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)

    # 3x3 grid: one panel per binning method.
    fig_hist, axes_hist = plt.subplots(3,
                                       3,
                                       sharex=True,
                                       sharey=False,
                                       constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)
    # fig_hist.text(-0.03, 0.5, 'Entries/Bin Width', va='center', rotation='vertical', fontsize=20)
    # axes_hist[2][0].get_xaxis().set_ticks([])
    # axes_hist[2][1].get_xaxis().set_ticks([])
    # axes_hist[2][2].get_xaxis().set_ticks([])

    axes_hist[0][0].set_title('Sturges')
    hist_sturges_bw = skh_plt.hist(x=data_nom,
                                   histtype='stepfilled',
                                   bins='sturges',
                                   errorbars=False,
                                   alpha=0.5,
                                   log=do_log,
                                   scale='binwidth',
                                   err_type='gaussian',
                                   ax=axes_hist[0][0])

    axes_hist[0][1].set_title('Doane')
    hist_doane_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins='doane',
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[0][1])

    axes_hist[0][2].set_title('Scott')
    hist_scott_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins='scott',
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[0][2])

    axes_hist[1][0].set_title('Freedman Diaconis')
    axes_hist[1][0].set_ylabel('Entries/Bin Width', fontsize=20)
    hist_fd_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins='fd',
                              errorbars=False,
                              alpha=0.5,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[1][0])

    axes_hist[1][1].set_title('Knuth')
    # Knuth bin edges are computed explicitly and reused for np.histogram below.
    _, bk = knuth_bin_width(data_nom, return_bins=True)
    hist_knuth_bw = skh_plt.hist(x=data_nom,
                                 histtype='stepfilled',
                                 bins=bk,
                                 errorbars=False,
                                 alpha=0.5,
                                 log=do_log,
                                 scale='binwidth',
                                 err_type='gaussian',
                                 ax=axes_hist[1][1])

    axes_hist[1][2].set_title('Rice')
    hist_rice_bw = skh_plt.hist(x=data_nom,
                                histtype='stepfilled',
                                bins='rice',
                                errorbars=False,
                                alpha=0.5,
                                log=do_log,
                                scale='binwidth',
                                err_type='gaussian',
                                ax=axes_hist[1][2])

    axes_hist[2][0].set_title('Sqrt(N)')
    hist_sqrt_bw = skh_plt.hist(x=data_nom,
                                histtype='stepfilled',
                                bins='sqrt',
                                errorbars=False,
                                alpha=0.5,
                                log=do_log,
                                scale='binwidth',
                                err_type='gaussian',
                                ax=axes_hist[2][0])

    # bep = bep_optimizer(data_nom)
    # _, bep = pd.qcut(data_nom, nep, retbins=True)

    # Plain (unscaled) histograms with the same binnings, used by err_li.
    hist_sturges = np.histogram(data_nom, bins='sturges')
    hist_doane = np.histogram(data_nom, bins='doane')
    hist_scott = np.histogram(data_nom, bins='scott')
    hist_fd = np.histogram(data_nom, bins='fd')
    hist_knuth = np.histogram(data_nom, bins=bk)
    hist_rice = np.histogram(data_nom, bins='rice')
    hist_sqrt = np.histogram(data_nom, bins='sqrt')

    # ``rough`` metric on the bin-width-scaled histograms (plotted later on
    # the axis labelled "Wiggles").
    r_sturges = rough(hist_sturges_bw, plot=False)
    r_doane = rough(hist_doane_bw)
    r_scott = rough(hist_scott_bw)
    r_fd = rough(hist_fd_bw)
    r_knuth = rough(hist_knuth_bw, plot=False)
    r_rice = rough(hist_rice_bw)
    r_sqrt = rough(hist_sqrt_bw, plot=False)

    # ``err_li`` metric of the nominal sample against its own histograms.
    eli_sturges = err_li(data_nom, hist_sturges)
    eli_doane = err_li(data_nom, hist_doane)
    eli_scott = err_li(data_nom, hist_scott)
    eli_fd = err_li(data_nom, hist_fd)
    eli_knuth = err_li(data_nom, hist_knuth)
    eli_rice = err_li(data_nom, hist_rice)
    eli_sqrt = err_li(data_nom, hist_sqrt)

    # ``err_li`` of every resample against the nominal histograms, averaged
    # below (plotted later on the axis labelled "Average Error").
    avg_eli_sturges = []
    avg_eli_doane = []
    avg_eli_scott = []
    avg_eli_fd = []
    avg_eli_knuth = []
    avg_eli_rice = []
    avg_eli_sqrt = []
    for i in resample_list:
        avg_eli_sturges.append(err_li(i, hist_sturges))
        avg_eli_doane.append(err_li(i, hist_doane))
        avg_eli_scott.append(err_li(i, hist_scott))
        avg_eli_fd.append(err_li(i, hist_fd))
        avg_eli_knuth.append(err_li(i, hist_knuth))
        avg_eli_rice.append(err_li(i, hist_rice))
        avg_eli_sqrt.append(err_li(i, hist_sqrt))

    avg_eli_sturges = np.mean(avg_eli_sturges)
    avg_eli_doane = np.mean(avg_eli_doane)
    avg_eli_scott = np.mean(avg_eli_scott)
    avg_eli_fd = np.mean(avg_eli_fd)
    avg_eli_knuth = np.mean(avg_eli_knuth)
    avg_eli_rice = np.mean(avg_eli_rice)
    avg_eli_sqrt = np.mean(avg_eli_sqrt)

    # The three lists below share the method order used by ``xs`` further down.
    avg_eli_list = [
        avg_eli_sturges, avg_eli_doane, avg_eli_scott, avg_eli_fd,
        avg_eli_knuth, avg_eli_rice, avg_eli_sqrt
    ]
    r_list = [r_sturges, r_doane, r_scott, r_fd, r_knuth, r_rice, r_sqrt]

    elis_list = [
        eli_sturges, eli_doane, eli_scott, eli_fd, eli_knuth, eli_rice,
        eli_sqrt
    ]

    # Equal-population binning: the optimiser receives the metrics of the
    # seven fixed-rule methods computed above.
    axes_hist[2][1].set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins=bep,
                              errorbars=False,
                              alpha=0.5,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[2][1])
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = []
    for i in resample_list:
        avg_eli_ep.append(err_li(i, hist_ep))
    avg_eli_ep = np.mean(avg_eli_ep)

    # Bayesian blocks: ``p0`` comes from the optimiser, which sees the same
    # seven-method metric lists (the EP results are appended only afterwards).
    axes_hist[2][2].set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        # Overwrite the outermost block edges with the requested limits.
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = skh_plt.hist(x=data_nom,
                              histtype='stepfilled',
                              bins=bb,
                              errorbars=False,
                              alpha=1,
                              log=do_log,
                              scale='binwidth',
                              err_type='gaussian',
                              ax=axes_hist[2][2])
    # if n_events == 1000 and dist_name == '2LP':
    # axes_hist[2][2].set_ylim((0, 100))
    hist_bb = np.histogram(data_nom, bins=bb)
    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = []
    for i in resample_list:
        avg_eli_bb.append(err_li(i, hist_bb))
    avg_eli_bb = np.mean(avg_eli_bb)

    r_list.append(r_ep)
    r_list.append(r_bb)
    avg_eli_list.append(avg_eli_ep)
    avg_eli_list.append(avg_eli_bb)
    elis_list.append(eli_ep)
    elis_list.append(eli_bb)
    # Saves pyplot's current figure - presumably fig_hist, the most recently
    # created figure at this point (assuming the helpers above open none).
    plt.savefig(bb_dir + f'/plots/bin_comp/hists_{dist_name}_{n_events}.pdf')

    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]

    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    # NOTE(review): this re-titles fig_hist after it has already been saved;
    # fig_metric may have been intended - confirm.
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}')
    # elis_list is only used for its length here; the Bayesian-blocks point
    # is drawn as a larger black star.
    for i in range(len(elis_list)):
        if xs[i] == 'BB':
            axes_metric[0].scatter(avg_eli_list[i],
                                   r_list[i],
                                   label=xs[i],
                                   s=400,
                                   marker='*',
                                   c='k')
        else:
            axes_metric[0].scatter(avg_eli_list[i],
                                   r_list[i],
                                   label=xs[i],
                                   s=200)
    axes_metric[0].set_ylabel(r'$W_n$ (Wiggles)')
    axes_metric[0].set_xlabel(r'$\hat{E}$ (Average Error)')
    # ax = plt.gca()
    # ax.set_yscale('log')
    # ax.set_xscale('log')
    # ax.relim()
    # ax.autoscale_view()
    axes_metric[0].grid()
    axes_metric[0].legend(ncol=1,
                          bbox_to_anchor=(1.05, 1.15),
                          loc='upper left')
    axes_metric[0].set_title(f'{dist_name} Distribution, N={n_events}',
                             fontsize=22)
    # plt.savefig(bb_dir+f'/plots/bin_comp/scat_{dist_name}_{n_events}.pdf')

    # plt.figure()
    # Rank each method per metric; ties share the lowest rank.
    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')

    # Stacked bars: rank by ``rough`` with the average-error rank on top.
    # The last bar (BB) is made fully opaque.
    cont = axes_metric[1].bar(xs,
                              rank_rough,
                              0.35,
                              label=r'$W_n$ Ranking',
                              alpha=0.5)
    cont[-1].set_alpha(1)
    cont = axes_metric[1].bar(xs,
                              rank_avg_eli,
                              0.35,
                              bottom=rank_rough,
                              label=r'$\hat{E}$ Ranking',
                              alpha=0.5)
    cont[-1].set_alpha(1)
    axes_metric[1].legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    # axes_metric[1].set_title(f'Combined Ranking, {dist_name} Distribution, N={n_events}')
    axes_metric[1].set_xlabel('Binning Method')
    axes_metric[1].set_ylabel('Rank')
    plt.savefig(bb_dir + f'/plots/bin_comp/metric_{dist_name}_{n_events}.pdf')
Exemplo n.º 18
0
def train_and_validate(steps=10000, minibatch=128, LRrange=[0.0001, 0.00001, 10000, 0], beta1=0.9, beta2=0.999, nafdim=16, depth=2, \
    savedir='abcdnn', seed=100, retrain=False, train=True):
    rawinputs, normedinputs, inputmeans, inputsigma, ncat_per_feature = prepdata(
    )
    print(ncat_per_feature)
    inputdim = 4
    ncat_per_feature = ncat_per_feature[0:inputdim]
    conddim = normedinputs.shape[1] - inputdim

    issignal = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] >= 3
                                           )  # signal_selection
    isbackground = ~issignal
    bkgnormed = normedinputs[isbackground]
    bkg = rawinputs[isbackground]
    xmax = np.reshape(inputmeans + 5 * inputsigma, inputmeans.shape[1])

    m = ABCDdnn(ncat_per_feature, inputdim, minibatch=minibatch, conddim=conddim, LRrange=LRrange, \
        beta1=beta1, beta2=beta2, nafdim=nafdim, depth=depth, savedir=savedir, retrain=retrain, seed=seed)
    m.setrealdata(bkgnormed)
    m.savehyperparameters()
    m.monitorevery = 100

    if train:
        m.train(steps)
        m.display_training()

    nj9cut = True
    if nj9cut:
        ncol = 3  # for plots below
        condlist = [[[
            1.,
            0.,
            0.,
            1.,
            0.,
        ]], [[
            0.,
            1.,
            0.,
            1.,
            0.,
        ]], [[
            0.,
            0.,
            1.,
            1.,
            0.,
        ]], [[
            1.,
            0.,
            0.,
            0.,
            1.,
        ]], [[
            0.,
            1.,
            0.,
            0.,
            1.,
        ]], [[
            0.,
            0.,
            1.,
            0.,
            1.,
        ]]]
        select0 = (rawinputs['njet'] == 7) & (rawinputs['nbtag'] == 2)
        select1 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] == 2)
        select2 = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] == 2)
        select3 = (rawinputs['njet'] == 7) & (rawinputs['nbtag'] >= 3)
        select4 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] >= 3)
        select5 = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] >= 3)
        select_data = [select0, select1, select2, select3, select4, select5]

        plottextlist = [
            f'$N_j=7, N_b=2$', f'$N_j=8, N_b=2$', f'$N_j\geq 9, N_b=2$',
            f'$N_j=7, N_b\geq 3$', f'$N_j=8, N_b\geq 3$',
            f'$N_j\geq 9, N_b\geq 3$'
        ]
        njlist = [7, 8, 9, 7, 8, 9]
        nblist = [2, 2, 2, 3, 3, 3]

    else:
        ncol = 3  # for plots
        condlist = [[[
            0.,
            1.,
            0.,
            0.,
            1.,
            0.,
        ]], [[
            0.,
            0.,
            1.,
            0.,
            1.,
            0.,
        ]], [[
            0.,
            0.,
            0.,
            1.,
            1.,
            0.,
        ]], [[
            0.,
            1.,
            0.,
            0.,
            0.,
            1.,
        ]], [[
            0.,
            0.,
            1.,
            0.,
            0.,
            1.,
        ]], [[
            0.,
            0.,
            0.,
            1.,
            0.,
            1.,
        ]]]
        select0 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] == 2)
        select1 = (rawinputs['njet'] == 9) & (rawinputs['nbtag'] == 2)
        select2 = (rawinputs['njet'] >= 10) & (rawinputs['nbtag'] == 2)
        select3 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] >= 3)
        select4 = (rawinputs['njet'] == 9) & (rawinputs['nbtag'] >= 3)
        select5 = (rawinputs['njet'] >= 10) & (rawinputs['nbtag'] >= 3)
        select_data = [select0, select1, select2, select3, select4, select5]

        plottextlist = [
            f'$N_j=8, N_b=2$', f'$N_j=9, N_b=2$', f'$N_j\geq 10, N_b=2$',
            f'$N_j=8, N_b\geq 3$', f'$N_j=9, N_b\geq 3$',
            f'$N_j\geq 10, N_b\geq 3$'
        ]

        njlist = [8, 9, 10, 8, 9, 10]
        nblist = [2, 2, 2, 3, 3, 3]

    # create fake data

    fakedatalist = []
    for cond, nj, nb in zip(condlist, njlist, nblist):
        nmcbatches = int(bkgnormed.shape[0] / minibatch)
        nmcremain = bkgnormed.shape[0] % minibatch
        fakelist = []
        cond_to_append = np.repeat(cond, minibatch, axis=0)
        for _ib in range(nmcbatches):
            xin = bkgnormed[_ib * minibatch:(_ib + 1) * minibatch, :inputdim]
            xin = np.hstack(
                (xin,
                 cond_to_append))  # append conditional to the feature inputs
            xgen = m.model.predict(xin)
            #xgen = m.generate_sample(cond)
            fakelist.append(xgen)
        # last batch
        xin = bkgnormed[nmcbatches * minibatch:, :inputdim]
        xin = np.hstack(
            (xin,
             np.repeat(cond, nmcremain,
                       axis=0)))  # append conditional to the feature inputs
        xgen = m.model.predict(xin)
        fakelist.append(xgen)

        # all data
        fakedata = np.vstack(fakelist)
        fakedata = fakedata * inputsigma[:, :inputdim] + inputmeans[:, :
                                                                    inputdim]
        nfakes = fakedata.shape[0]

        fakedata = np.hstack((fakedata, np.array([nj]*nfakes).reshape((nfakes,1))\
                , np.array([nb]*nfakes).reshape(nfakes,1) )
        )
        fakedatalist.append(fakedata)

    labelsindices = [['MET', 'met', 0.0, xmax[0]], ['H_T', 'ht', 0.0, xmax[1]],\
        ['p_{T5}', 'pt5', 0.0, xmax[2]], ['p_{T6}', 'pt6', 0.0, xmax[3]]]
    nbins = 20
    runplots = True
    if runplots:
        yscales = ['log', 'linear']
        for yscale in yscales:
            for li in labelsindices:
                pos = featurevars.index(li[1])
                fig, ax = plt.subplots(2, ncol, figsize=(3 * ncol, 6))
                iplot = 0
                for fakedata, seld, plottext in zip(fakedatalist, select_data,
                                                    plottextlist):
                    input_data = rawinputs[seld]
                    # Make ratio plots
                    plotaxes = MplPlotter.ratio_plot(dict(x=input_data[li[1]], bins=nbins, range=(li[2], li[3]), errorbars=True, normed=True, histtype='marker'), \
                        dict(x=fakedata[:, pos], bins=nbins, range=(li[2], li[3]), errorbars=True, normed=True), ratio_range=(0.25, 1.9))

                    plotfig = plotaxes[0][0].get_figure()
                    plotaxes[0][0].set_yscale(yscale)
                    plotfig.set_size_inches(5, 5)
                    plotfig.savefig(
                        os.path.join(
                            savedir,
                            f'result_{li[1]}_{iplot}_{yscale}_ratio.pdf'))

                    # make matrix of plots
                    row = iplot // ncol
                    col = iplot % ncol
                    iplot += 1
                    plt.sca(ax[row, col])
                    ax[row, col].set_yscale(yscale)
                    ax[row, col].set_xlabel(f"${li[0]}$ (GeV)")
                    MplPlotter.hist(input_data[li[1]],
                                    bins=nbins,
                                    alpha=0.5,
                                    range=(li[2], li[3]),
                                    errorbars=True,
                                    histtype='marker',
                                    normed=True)
                    MplPlotter.hist(fakedata[:, pos],
                                    bins=nbins,
                                    alpha=0.5,
                                    range=(li[2], li[3]),
                                    errorbars=True,
                                    normed=True)
                    MplPlotter.hist(bkg[li[1]],
                                    bins=nbins,
                                    alpha=0.5,
                                    range=(li[2], li[3]),
                                    histtype='step',
                                    normed=True)
                    plt.text(0.6,
                             0.8,
                             plottext,
                             transform=ax[row, col].transAxes,
                             fontsize=10)

                fig.tight_layout()
                fig.savefig(
                    os.path.join(savedir,
                                 f'result_matrix_{li[1]}_{yscale}.pdf'))

    generatesigsample = True
    if generatesigsample:
        bkgsigfakedata = np.vstack(fakedatalist)

        datadict = {}
        for var, idx in zip(featurevars, range(len(featurevars))):
            datadict[var] = bkgsigfakedata[:, idx]

        writetorootfile(os.path.join(savedir, 'fakedata_NAF.root'), datadict)
    pass
Exemplo n.º 19
0
def _saveRocPlot(fpr, tpr, rocAuc, xlabel, ylabel, title, outPath):
    """Draw one ROC curve plus the chance diagonal, save it, clear the figure.

    Args:
        fpr: false-positive rates (x values of the curve).
        tpr: true-positive rates (y values of the curve).
        rocAuc: area under the curve, shown in the legend label.
        xlabel: x-axis label.
        ylabel: y-axis label.
        title: plot title.
        outPath: file path handed to ``plt.savefig``.
    """
    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=2,
             label='Full curve (area = %0.2f)' % rocAuc)
    # Chance-level reference line: the diagonal from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.ylabel(ylabel)
    plt.xlabel(xlabel)
    plt.title(title)
    plt.legend(loc="lower right")
    plt.savefig(outPath)
    plt.clf()


def PlotResults(setupClient, model, X_train, X_test, y_train, y_test, w_train,
                w_test, ix_train, ix_test):
    """Evaluate a trained model and write its artefacts and diagnostic plots.

    Steps, each writing into ``setupClient.ModelSavePath``:

    * for run modes 'binary', 'param' and 'SimpleRNN', print the test loss
      and accuracy returned by ``model.evaluate``;
    * save the architecture (``architecture.json``), the weights
      (``ModelWeights.h5``) and the scaler constants (``Scaling.txt``);
    * predict on the train and test samples and derive class indices
      (argmax for 'multi', rounded first output otherwise);
    * depending on the ``doConfusionMatrix``, ``doEfficiency``, ``doScore``
      and ``doROC`` flags of ``setupClient``: plot the normalised confusion
      matrix, print weighted efficiencies, plot the train/test score
      distributions (optionally with data) and plot/save the unweighted and
      weighted ROC curves together with their ``.npy`` arrays.

    Args:
        setupClient: configuration object. This function reads ``runMode``,
            ``Params['BatchSize']``, ``ModelSavePath``, ``TrainedModelPath``,
            ``InputDNNVariables``, ``VarSet``, ``PDPath``,
            ``MixPD_TrainTestTag``, ``InputFilesSB``, ``unblind`` and the
            ``do*`` flags listed above.
        model: trained model exposing ``evaluate``, ``predict``, ``to_json``
            and ``save_weights`` (presumably a Keras model -- confirm).
        X_train: training inputs, indexable by a boolean mask on ``y_train``.
        X_test: test inputs, indexable by a boolean mask on ``y_test``.
        y_train: training labels; 1 is treated as signal.
        y_test: test labels; 1 is treated as signal.
        w_train: training weights (not used here).
        w_test: test weights used for the confusion matrix, the
            efficiencies and the weighted ROC curve.
        ix_train: not used here.
        ix_test: not used here.

    The process is terminated through ``quit()`` when the pickled setup file
    ``DNN_Setup`` is not found under ``setupClient.TrainedModelPath``.
    """
    # Run modes whose model has a single score output.
    scoreModes = ('binary', 'SimpleRNN', 'param')
    batchSize = setupClient.Params['BatchSize']
    # One path for both the existence check and the open() call; they used
    # to be built differently ('/DNN_Setup' vs 'DNN_Setup').
    setupPath = os.path.join(setupClient.TrainedModelPath, 'DNN_Setup')

    print(Fore.BLUE + "--------------------------")
    print(Back.BLUE + "         RESULTS          ")
    print(Fore.BLUE + "--------------------------")

    if setupClient.runMode in scoreModes:
        print('Evaluating model on X_test, y_test')
        score = model.evaluate(X_test, y_test, batch_size=batchSize)
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test loss', score[0]))
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test accuracy',
                                        score[1]))

    # Save the architecture as a json string.
    archPath = os.path.join(setupClient.ModelSavePath, 'architecture.json')
    arch = model.to_json()
    with open(archPath, 'w') as arch_file:
        print('Saving model as json', archPath)
        arch_file.write(arch)
    # Now save the weights as an HDF5 file.
    model.save_weights(os.path.join(setupClient.ModelSavePath,
                                    'ModelWeights.h5'),
                       overwrite=True)

    if not os.path.isfile(setupPath):
        print("Pickle file not found!")
        quit()
    # NOTE(review): pickle.load can execute arbitrary code; TrainedModelPath
    # must only ever point at trusted files.
    with open(setupPath, "rb") as setupFile:
        savedSetupClient = pickle.load(setupFile)
    scalerMean = savedSetupClient.Scaler.mean_
    scalerVar = savedSetupClient.Scaler.var_
    minusMean = np.multiply(-1, scalerMean)
    OneOverStd = np.divide(1, np.sqrt(scalerVar))

    with open(os.path.join(setupClient.ModelSavePath, 'Scaling.txt'),
              'w') as scaleFileOut:
        scaleFileOut.write(
            str(setupClient.InputDNNVariables[setupClient.VarSet]) + '\n')
        scaleFileOut.write('Mean\n' + str(scalerMean) + '\n')
        scaleFileOut.write('minusMean\n' + str(minusMean) + '\n')
        scaleFileOut.write('Var\n' + str(scalerVar) + '\n')
        scaleFileOut.write('sqrtVar\n' + str(np.sqrt(scalerVar)) + '\n')
        scaleFileOut.write('OneOverStd\n' + str(OneOverStd) + '\n')

    print('\nRunning model prediction on X train/test samples')
    yResult_test = model.predict(X_test, verbose=True, batch_size=batchSize)
    yResult_train = model.predict(X_train, verbose=True, batch_size=batchSize)

    if setupClient.runMode == 'multi':
        # Index of the output node with the highest score.
        yResult_test_cls = np.argmax(yResult_test, axis=1)
        yResult_train_cls = np.argmax(yResult_train, axis=1)
        theClasses = ['Zjets', 'Signal', 'Diboson', 'Top']
    else:
        yResult_test_cls = np.array([int(round(x[0])) for x in yResult_test])
        yResult_train_cls = np.array([int(round(x[0])) for x in yResult_train])
        theClasses = ['Background', 'Signal']

    if setupClient.doConfusionMatrix:
        # Plot the (weighted) confusion matrix of the test sample.
        plt.clf()
        cnf_matrix = confusion_matrix(y_test,
                                      yResult_test_cls,
                                      sample_weight=w_test)
        np.set_printoptions(precision=2)
        plot_confusion_matrix(setupClient,
                              cnf_matrix,
                              classes=theClasses,
                              normalize=True,
                              title='Normalized confusion matrix')

    if setupClient.doEfficiency:
        print('Calculating Efficiencies on Test sample')
        if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN':
            s_eff = w_test[(y_test == 1) & (
                yResult_test_cls > 0.5)].sum() / w_test[y_test == 1].sum()
            b_eff = w_test[(y_test != 1) & (
                yResult_test_cls > 0.5)].sum() / w_test[y_test != 1].sum()
            print(" ")
            print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Signal efficiency',
                                            s_eff))
            print('{:<35} {:<25.3f}'.format(
                Fore.GREEN + 'Background efficiency:', b_eff))
            print('{:<35} {:<25.3f}'.format(
                Fore.GREEN + 'Background rejection:', 1.0 / b_eff))
        if setupClient.runMode == 'multi':
            # channelDic is defined outside this function and maps a channel
            # name to its class index.
            channelEffi = channelDic.copy()
            for channel, i in channelDic.items():
                channelEffi[channel] = w_test[(y_test == i) & (
                    yResult_test_cls == 1)].sum() / w_test[y_test == i].sum()
            for channel, eff in channelEffi.items():
                print('{:<35} {:<25.3f}'.format(
                    Fore.GREEN + channel + ' efficiency', eff))

            b_eff = w_test[(y_test != 1) & (
                yResult_test_cls == 1)].sum() / w_test[y_test != 1].sum()
            print('{:<30} {:<20.3f}'.format('Background efficiency', b_eff))
            print('{:<30} {:<20.3f}'.format('Background rejection',
                                            1.0 / b_eff))
        print(" ")

    if setupClient.doScore:
        # The per-class score arrays only exist for single-score run modes;
        # the histograms below are skipped otherwise instead of failing on
        # undefined names.
        hasScores = setupClient.runMode in scoreModes
        if hasScores:
            # Split train and test inputs into signal-only and
            # background-only samples.
            Xtrain_signal = X_train[y_train == 1]
            Xtrain_background = X_train[y_train != 1]
            Xtest_signal = X_test[y_test == 1]
            Xtest_background = X_test[y_test != 1]

            # Predictions on the -train- samples.
            print('Running model prediction on Xtrain_signal')
            yhat_train_signal = model.predict(Xtrain_signal,
                                              batch_size=batchSize)
            print('Running model prediction on Xtrain_background')
            yhat_train_background = model.predict(Xtrain_background,
                                                  batch_size=batchSize)

            # Predictions on the -test- samples.
            print('Running model prediction on Xtest_signal')
            yhat_test_signal = model.predict(Xtest_signal,
                                             batch_size=batchSize)
            print('Running model prediction on Xtest_background')
            yhat_test_background = model.predict(Xtest_background,
                                                 batch_size=batchSize)

        hasData = False
        if setupClient.runMode == 'binary' and setupClient.unblind == True:
            # Get the data PD file
            dataFileName = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Data.pkl'
            if os.path.isfile(dataFileName):
                hasData = True
                print('Reading Data file:', dataFileName)
                data_full = pd.read_pickle(dataFileName)
                # .values replaces the removed DataFrame.as_matrix().
                data_full_matrix = data_full[setupClient.InputDNNVariables[
                    setupClient.VarSet]].values

                print('{:<45} {:<15}'.format(
                    'Getting Scaler of Training sample from file',
                    Fore.GREEN + setupPath))
                # Reuse the setup unpickled above: it is the same file.
                data_full_matrix = savedSetupClient.Scaler.transform(
                    data_full_matrix)

                # Get predictions on data
                print('Running model prediction on data')
                yhat_data = model.predict(data_full_matrix,
                                          verbose=True,
                                          batch_size=batchSize)
            else:
                print('Data file:', dataFileName,
                      ' not found. Will proceed to MC only')

        if setupClient.runMode == 'SimpleRNN':
            # The input tensor has 6 slots of 4 components each, matching
            # the four names ('pt', 'eta', 'phi', 'E') passed to
            # PrepareData.scale below.
            nObjects, nComponents = 6, 4
            for ifile in setupClient.InputFilesSB['Data']:
                dataFileName = setupClient.PDPath + ifile + '_FullNoRandom.pkl'
                if os.path.isfile(dataFileName):
                    # Data scores are only written to file in this mode and
                    # are not overlaid on the score plot.
                    hasData = False
                    print('Reading Data file:', dataFileName)
                    data_full = pd.read_pickle(dataFileName)

                    VariablesSet = setupClient.InputDNNVariables[
                        setupClient.VarSet]
                    flatFeatures = data_full[VariablesSet].values
                    nEvents, nFeatures = flatFeatures.shape
                    # Column j of the flat table goes to slot
                    # [j // 4, j % 4]; slots without a column stay zero.
                    data_full_matrix = np.zeros(
                        (nEvents, nObjects, nComponents))
                    data_full_matrix.reshape(
                        nEvents,
                        nObjects * nComponents)[:, :nFeatures] = flatFeatures

                    # Return value is not used; presumably scales the
                    # array in place -- confirm against PrepareData.
                    PrepareData.scale(data_full_matrix,
                                      ['pt', 'eta', 'phi', 'E'], False,
                                      setupClient)

                    # Get predictions on data
                    print('Running model prediction on data')
                    yhat_data = model.predict(data_full_matrix,
                                              verbose=True,
                                              batch_size=batchSize)

                    data_full['RNN_Score'] = yhat_data
                    print(data_full.shape)
                    np.save(
                        os.path.join(setupClient.ModelSavePath,
                                     "ResultsDataMLPD_" + ifile + ".npy"),
                        data_full)
                    np.save(
                        os.path.join(
                            setupClient.ModelSavePath,
                            "rootBranchSubSampleForDataML_" + ifile + ".npy"),
                        data_full.columns.values)

                else:
                    print('Data file:', dataFileName,
                          ' not found. Will proceed to MC only')

        if hasScores:
            sns.set_palette("coolwarm", 4)
            # Plot scores. `density` is the replacement for matplotlib's
            # removed `normed` keyword.
            bins = np.linspace(0, 1, 50)
            plt.hist(yhat_train_signal,
                     bins=bins,
                     histtype='step',
                     lw=2,
                     alpha=0.5,
                     label=[r'Signal Train'],
                     density=True)
            plt.hist(yhat_test_signal,
                     bins=bins,
                     histtype='stepfilled',
                     lw=2,
                     alpha=0.5,
                     label=[r'Signal Test'],
                     density=True)
            plt.hist(yhat_test_background,
                     bins=bins,
                     histtype='stepfilled',
                     lw=2,
                     alpha=0.5,
                     label=[r'Background Test'],
                     density=True)
            plt.hist(yhat_train_background,
                     bins=bins,
                     histtype='step',
                     lw=2,
                     alpha=0.5,
                     label=[r'Background Train'],
                     density=True)
            if hasData and setupClient.unblind == True:
                # Plot the data as well. Using skh_plt because matplotlib
                # does not come with markers for the hist class; `normed`
                # here is skh_plt's own keyword.
                skh_plt.hist(yhat_data,
                             bins=bins,
                             errorbars=True,
                             histtype='marker',
                             label='Data',
                             color='black',
                             normed=True)
            plt.ylabel('Norm. Entries')
            plt.xlabel('DNN score')
            plt.legend(loc="upper center")
            plt.savefig(
                os.path.join(setupClient.ModelSavePath,
                             "MC_Data_TrainTest_Score.png"))
            plt.yscale('log')
            plt.savefig(
                os.path.join(setupClient.ModelSavePath,
                             "MC_Data_TrainTest_Score_log.png"))
            plt.clf()

    if setupClient.doROC and setupClient.runMode in scoreModes:
        # Receiver operating characteristic (ROC) and its area (AUC).
        fpr, tpr, thresholds = roc_curve(y_test, yResult_test)
        roc_auc = auc(fpr, tpr)
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC', roc_auc))
        _saveRocPlot(fpr,
                     tpr,
                     roc_auc,
                     xlabel='False Positive Rate',
                     ylabel='True Positive Rate',
                     title='ROC curves for Signal vs Background',
                     outPath=os.path.join(setupClient.ModelSavePath,
                                          "ROC.png"))

        # Now the weighted ROC curve.
        fpr_w, tpr_w, thresholds_w = roc_curve(y_test,
                                               yResult_test,
                                               sample_weight=w_test)
        # Sort by (fpr, tpr) before integrating: this is what the removed
        # `reorder=True` option of auc() did.
        order = np.lexsort((tpr_w, fpr_w))
        roc_auc_w = auc(fpr_w[order], tpr_w[order])
        print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC weighted',
                                        roc_auc_w))
        _saveRocPlot(fpr_w,
                     tpr_w,
                     roc_auc_w,
                     xlabel='False Positive Rate (weighted)',
                     ylabel='True Positive Rate (weighted)',
                     title='ROC curve for Signal vs Background',
                     outPath=os.path.join(setupClient.ModelSavePath,
                                          "ROC_weighted.png"))

        np.save(os.path.join(setupClient.ModelSavePath, "tpr_w.npy"), tpr_w)
        np.save(os.path.join(setupClient.ModelSavePath, "fpr_w.npy"), fpr_w)
        np.save(os.path.join(setupClient.ModelSavePath, "thresholds_w.npy"),
                thresholds_w)
        np.save(os.path.join(setupClient.ModelSavePath, "thresholds.npy"),
                thresholds)
        np.save(os.path.join(setupClient.ModelSavePath, "tpr.npy"), tpr)
        np.save(os.path.join(setupClient.ModelSavePath, "fpr.npy"), fpr)

        np.save(os.path.join(setupClient.ModelSavePath, "AUC.npy"), roc_auc)
        np.save(os.path.join(setupClient.ModelSavePath, "AUC_w.npy"),
                roc_auc_w)
Exemplo n.º 20
0
def plotDataMC(setupClient):
    """Plot stacked MC backgrounds, the ggF signal and data per variable.

    Every file name listed in ``setupClient.InputFilesSB`` is matched against
    the tags 'Top', 'Data', 'Zjets', 'Diboson', 'ggF' and 'Wjets' (substring
    match; a name containing several tags contributes to each of them). For
    MC tags both the '_Train' and '_Test' frames are loaded through
    ``getDFEvents`` and concatenated; for 'Data' the last matching file wins.

    For each variable in ``setupClient.VariablesToPlot`` a weighted stacked
    histogram of the backgrounds is drawn with the signal as a dashed line
    and, when a data file was found, the data as markers with error bars.
    The figure is saved twice under ``setupClient.VarPlotPath``:
    ``<var>_DataMC.png`` (linear y axis) and ``<var>_DataMC_log.png``.

    Args:
        setupClient: configuration object providing ``InputFilesSB``,
            ``PDPath``, ``VariablesToPlot`` and ``VarPlotPath``.

    Raises:
        ValueError: from ``pd.concat`` when one of the MC categories has no
            matching input file.
    """
    # Tags in the order they are tested against each file name.
    sampleTags = ('Top', 'Data', 'Zjets', 'Diboson', 'ggF', 'Wjets')
    framesByTag = {tag: [] for tag in sampleTags if tag != 'Data'}
    # Stays None when no data file is configured, so the data overlay can be
    # skipped instead of failing on an unbound name.
    dataDF = None

    for fileList in setupClient.InputFilesSB.values():
        for ifile in fileList:
            print(ifile)
            for tag in sampleTags:
                if tag not in ifile:
                    continue
                if tag == 'Data':
                    dataDF = getDFEvents(setupClient.PDPath, ifile, 'Data')
                else:
                    framesByTag[tag] += [
                        getDFEvents(setupClient.PDPath, ifile, '_Train'),
                        getDFEvents(setupClient.PDPath, ifile, '_Test'),
                    ]

    topDF = pd.concat(framesByTag['Top'], ignore_index=True)
    zjetsDF = pd.concat(framesByTag['Zjets'], ignore_index=True)
    wjetsDF = pd.concat(framesByTag['Wjets'], ignore_index=True)
    dibosonDF = pd.concat(framesByTag['Diboson'], ignore_index=True)
    signalDF = pd.concat(framesByTag['ggF'], ignore_index=True)

    for var in setupClient.VariablesToPlot:
        print("Plotting variable", var)
        # The binning range is taken from the diboson sample. Series.min/max
        # are vectorised and skip NaN entries.
        bins = np.linspace(dibosonDF[var].min(), dibosonDF[var].max(), 20)

        # No `normed` argument: un-normalised is matplotlib's default and the
        # keyword has been removed from recent releases.
        plt.hist([topDF[var], dibosonDF[var], zjetsDF[var], wjetsDF[var]],
                 histtype='stepfilled',
                 bins=bins,
                 weights=[
                     topDF['weight'], dibosonDF['weight'], zjetsDF['weight'],
                     wjetsDF['weight']
                 ],
                 label=[
                     'Top',
                     'Diboson',
                     'Z + jets',
                     'W + jets',
                 ],
                 stacked=True)

        plt.hist(signalDF[var],
                 histtype='step',
                 bins=bins,
                 weights=signalDF['weight'],
                 label=r'ggF',
                 linewidth=1,
                 color='red',
                 linestyle='dashed')

        if dataDF is not None:
            # skh_plt is used because plain matplotlib histograms have no
            # marker style with error bars.
            _ = skh_plt.hist(dataDF[var],
                             bins=bins,
                             errorbars=True,
                             histtype='marker',
                             label='Data',
                             color='black')

        plt.legend(loc='best', prop={'size': 10})
        plt.xlabel(var, fontsize=14)
        plt.savefig(setupClient.VarPlotPath + "/" + var + "_DataMC.png")
        plt.yscale('log')
        plt.savefig(setupClient.VarPlotPath + "/" + var + "_DataMC_log.png")
        plt.clf()