def test_blocks_hist2(cmdopt, data_gen):
    """Check a weighted 'blocks' histogram against the stored answers.

    ``skh_plt.hist`` is called on ``data_gen[0]`` with ``data_gen[2]`` as
    weights, ``bins='blocks'`` and ``p0=0.1``.  With ``cmdopt == "generate"``
    the first two returned items are written to the answers file and the
    plot is shown; with ``cmdopt == "test"`` they are compared with the
    stored ``bc``/``be`` arrays.  Afterwards, a call that passes ``gamma``
    with a list of two datasets is expected to raise ``ValueError``.
    """
    output = skh_plt.hist(data_gen[0], weights=data_gen[2], bins='blocks',
                          scale='binwidth', color='green', p0=0.1)
    answers_path = answer_dir + '/answers_blocks_hist2.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_blocks_hist2')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])

    with pytest.raises(ValueError):
        skh_plt.hist([data_gen[0], data_gen[1]], bins='blocks',
                     scale='binwidth', color='green', gamma=0.1)
def test_ratio_plot_stacked(cmdopt, data_gen):
    """Check a stacked ``skh_plt.ratio_plot`` against the stored answers.

    Both histogram specifications stack ``data_gen[0]`` and ``data_gen[1]``;
    the second one is additionally weighted with ``data_gen[2]``.  In
    "generate" mode ``output[1][0:2]`` and ``output[2][0:2]`` are saved as
    ``bc1``/``be1``/``bc2``/``be2`` and the plot is shown; in "test" mode the
    same items are compared with the stored arrays.
    """
    output = skh_plt.ratio_plot(
        dict(x=[data_gen[0], data_gen[1]], stacked=True, errorbars=True),
        dict(x=[data_gen[0], data_gen[1]], weights=[data_gen[2], data_gen[2]],
             stacked=True, errorbars=True, err_style='line'),
        range=(-5, 5), bins='blocks')
    answers_path = answer_dir + '/answers_ratio_plot_stacked.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc1=output[1][0], be1=output[1][1],
                     bc2=output[2][0], be2=output[2][1])
        output[0][0].set_title('test_ratio_plot_stacked')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[1][0] == answers['bc1'])
            assert np.all(output[1][1] == answers['be1'])
            assert np.all(output[2][0] == answers['bc2'])
            assert np.all(output[2][1] == answers['be2'])
def test_ratio_plot_log(cmdopt, data_gen):
    """Check a ``skh_plt.ratio_plot`` drawn with log options against answers.

    The first histogram uses ``histtype='marker'`` with ``log=True``; the
    second is weighted with ``data_gen[2]``.  ``logx=True`` and
    ``ratio_range=(0, 10)`` are passed to ``ratio_plot`` itself.  In
    "generate" mode the histogram outputs are saved and the plot is shown;
    in "test" mode they are compared with the stored arrays.
    """
    output = skh_plt.ratio_plot(
        dict(x=data_gen[0], errorbars=True, histtype='marker', log=True,
             err_x=False),
        dict(x=data_gen[1], weights=data_gen[2], errorbars=True),
        logx=True, ratio_range=(0, 10))
    answers_path = answer_dir + '/answers_ratio_plot_log.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc1=output[1][0], be1=output[1][1],
                     bc2=output[2][0], be2=output[2][1])
        output[0][0].set_title('test_ratio_plot_log')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[1][0] == answers['bc1'])
            assert np.all(output[1][1] == answers['be1'])
            assert np.all(output[2][0] == answers['bc2'])
            assert np.all(output[2][1] == answers['be2'])
def test_simple_hist1(cmdopt, data_gen):
    """Check ``skh_plt.hist`` with default options against stored answers.

    In "generate" mode the first two returned items are saved as ``bc`` and
    ``be`` and the plot is shown; in "test" mode they are compared with the
    stored arrays.
    """
    output = skh_plt.hist(data_gen[0])
    answers_path = answer_dir + '/answers_simple_hist1.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_simple_hist1')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
def test_blocks_hist(cmdopt, data_gen):
    """Check an unweighted ``bins='blocks'`` histogram against stored answers.

    In "generate" mode the first two returned items are saved as ``bc`` and
    ``be`` and the plot is shown; in "test" mode they are compared with the
    stored arrays.
    """
    output = skh_plt.hist(data_gen[0], bins='blocks', scale='binwidth',
                          color='green')
    answers_path = answer_dir + '/answers_blocks_hist.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_blocks_hist')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
def test_simple_hist4(cmdopt, data_gen):
    """Check a weighted, normalised bar histogram against stored answers.

    ``skh_plt.hist`` is called with ``bins=range(5)``, ``normed=True`` and
    ``scale='binwidth'``.  In "generate" mode the first two returned items
    are saved as ``bc`` and ``be`` and the plot is shown; in "test" mode
    they are compared with the stored arrays.
    """
    output = skh_plt.hist(data_gen[0], weights=data_gen[2], bins=range(5),
                          normed=True, scale='binwidth', color='red',
                          histtype='bar')
    answers_path = answer_dir + '/answers_simple_hist4.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1])
        plt.title('test_simple_hist4')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
def test_error_bars(cmdopt, data_gen):
    """Check a histogram with returned error bars against stored answers.

    ``skh_plt.hist`` is called with ``errorbars=True``, ``err_return=True``
    and ``scale=5``.  In "generate" mode the first three returned items are
    saved as ``bc``, ``be`` and ``berr`` and the plot is shown; in "test"
    mode they are compared with the stored arrays.
    """
    output = skh_plt.hist(data_gen[0], bins=20, errorbars=True,
                          err_return=True, scale=5)
    answers_path = answer_dir + '/answers_error_bars.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
def test_ratio_plot_quick(cmdopt, data_gen):
    """Quick checks of how ``skh_plt.ratio_plot`` handles ``bins`` and ``range``.

    Conflicting ``bins`` or ``range`` values in the two histogram dicts are
    expected to raise ``KeyError``.  A value given in only one dict must be
    reflected in the first histogram's output (``output[1]``).
    """
    numer = data_gen[0]
    denom = data_gen[1]

    # Bins: two different bin counts are rejected ...
    with pytest.raises(KeyError):
        skh_plt.ratio_plot({'x': numer, 'bins': 10}, {'x': denom, 'bins': 11})

    # ... while a single specification yields 11 entries in output[1][0].
    result = skh_plt.ratio_plot({'x': numer}, {'x': denom, 'bins': 11})
    assert len(result[1][0]) == 11

    # Range: two different ranges are rejected ...
    with pytest.raises(KeyError):
        skh_plt.ratio_plot({'x': numer, 'range': (0, 1)},
                           {'x': denom, 'range': (1, 2)})

    # ... while a range given on either side bounds the first histogram's edges.
    one_sided_specs = (
        ({'x': numer, 'range': (-0.1, 0.1)}, {'x': denom}),
        ({'x': numer}, {'x': denom, 'range': (-0.1, 0.1)}),
    )
    for spec_a, spec_b in one_sided_specs:
        result = skh_plt.ratio_plot(spec_a, spec_b)
        edges = result[1][1]
        assert edges[0] >= -0.1 and edges[-1] <= 0.1
def test_error_bars4(cmdopt, data_gen):
    """Check a step histogram with ``err_type='poisson'`` against answers.

    ``skh_plt.hist`` is called with ``suppress_zero=True`` and
    ``scale='binwidth'``.  In "generate" mode the first three returned items
    are saved as ``bc``, ``be`` and ``berr`` and the plot is shown; in
    "test" mode they are compared with the stored arrays.
    """
    output = skh_plt.hist(data_gen[0], bins=50, errorbars=True,
                          err_return=True, histtype='step',
                          err_type='poisson', suppress_zero=True,
                          scale='binwidth')
    answers_path = answer_dir + '/answers_error_bars4.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars4')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
def test_error_bars2(cmdopt, data_gen):
    """Check a single-bin, normalised histogram with error bars.

    ``skh_plt.hist`` is called with ``bins=1``, ``scale=0.5``,
    ``normed=True`` and ``err_type='poisson'``.  In "generate" mode the
    first three returned items are saved as ``bc``, ``be`` and ``berr`` and
    the plot is shown; in "test" mode they are compared with the stored
    arrays.
    """
    output = skh_plt.hist(data_gen[0], bins=1, errorbars=True, scale=0.5,
                          normed=True, err_color='k', alpha=0.1,
                          err_type='poisson', err_return=True)
    answers_path = answer_dir + '/answers_error_bars2.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        plt.title('test_error_bars2')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
def test_error_bars_stacked3(cmdopt, data_gen):
    """Check a stacked, weighted, normalised step histogram with error bars.

    Two datasets (``data_gen[0]`` and ``data_gen[1]``) are stacked, each
    weighted with ``data_gen[2]``.  In "generate" mode the first three
    returned items are saved as ``bc``, ``be`` and ``berr`` and the plot is
    shown; in "test" mode they are compared with the stored arrays.
    """
    output = skh_plt.hist([data_gen[0], data_gen[1]], bins=20,
                          histtype='step', stacked=True,
                          weights=[data_gen[2], data_gen[2]], errorbars=True,
                          err_return=True, normed=True, scale=2)
    answers_path = answer_dir + '/answers_error_bars_stacked3.npz'

    if cmdopt == "generate":
        with open(answers_path, 'wb') as f:
            np.savez(f, bc=output[0], be=output[1], berr=output[2])
        # Title now matches this test (was a copy-paste of "..._stacked2").
        plt.title('test_error_bars_stacked3')
        plt.show()
    elif cmdopt == "test":
        # The .npz archive keeps its file open until closed, so scope it.
        with np.load(answers_path) as answers:
            assert np.all(output[0] == answers['bc'])
            assert np.all(output[1] == answers['be'])
            assert np.all(output[2] == answers['berr'])
def plot_binned_data_error(self, axis, bin_edges, data, wgt_sqrd, *args, **kwargs):
    """Draw already-binned data with error bars through ``skh_plt.hist``.

    The bin centres are used as the sample values and ``data`` as their
    weights, so the histogram reproduces the binned contents.  Errors are
    ``sqrt(wgt_sqrd)``; when ``density`` is passed as True they are divided
    by ``sum(data)`` and by the width of the first bin.

    Extra positional and keyword arguments are forwarded to
    ``skh_plt.hist``, whose return value is returned unchanged.
    """
    # Only the first bin is measured.
    # NOTE(review): this presumably assumes equal-width bins - confirm.
    first_bin_width = bin_edges[1] - bin_edges[0]

    bin_errors = np.sqrt(wgt_sqrd)
    # Equality (not truthiness) is kept deliberately to match prior behaviour.
    if kwargs.get('density') == True:  # noqa: E712
        bin_errors = bin_errors / np.sum(data) / first_bin_width

    # Align the errors on labels 1..n_bins, filling missing bins with zero.
    # NOTE(review): relies on wgt_sqrd offering `.reindex` (e.g. a pandas
    # Series) - confirm against callers.
    bin_labels = np.arange(1, len(bin_edges))
    bin_errors = bin_errors.reindex(bin_labels, fill_value=0)

    bin_centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0

    return skh_plt.hist(bin_centres, *args, ax=axis, bins=bin_edges,
                        weights=data, errorbars=bin_errors, **kwargs)
def plot_stacked_binned_data_error(self, axis, bin_edges, data, wgt_sqrd, *args, **kwargs):
    """Draw several already-binned datasets as one stacked histogram.

    The squared weights of all datasets are summed bin by bin (missing
    bins count as zero) and the square root of that sum is used as the
    error bar.  Each dataset is drawn by giving the bin centres as sample
    values and the dataset's contents as weights.

    Extra positional and keyword arguments are forwarded to
    ``skh_plt.hist``, whose return value is returned unchanged.
    """
    # Accumulate the squared weights over every dataset.
    summed_sq = wgt_sqrd[0]
    for idx in range(1, len(wgt_sqrd)):
        summed_sq = summed_sq.add(wgt_sqrd[idx], fill_value=0)

    # Align on labels 1..n_bins, fill the gaps, and hand over a plain array.
    bin_labels = np.arange(1, len(bin_edges))
    stack_errors = np.array(np.sqrt(summed_sq).reindex(bin_labels, fill_value=0))

    # One column of bin centres per dataset.
    bin_centres = (bin_edges[1:] + bin_edges[:-1]) / 2.0
    centres_per_set = np.transpose(
        np.array([bin_centres]).repeat(len(data), axis=0))

    # The binned contents act as the weights, transposed to match the centres.
    contents_per_set = np.transpose(data)

    return skh_plt.hist(centres_per_set, *args, ax=axis, bins=bin_edges,
                        weights=contents_per_set, errorbars=stack_errors,
                        stacked=True, **kwargs)
def test_hist_fails(cmdopt, data_gen):
    """Exercise the error paths and degenerate inputs of ``skh_plt.hist``.

    Each entry of the table below is a call expected to raise the listed
    exception.  After that, a scalar input and an empty input are
    histogrammed and their bin contents checked.
    """
    # (expected exception, positional args, keyword args), in call order.
    failing_calls = [
        (ValueError, ([data_gen[0], data_gen[1]],),
         {'stacked': True, 'histtype': 'marker'}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'histtype': 'marker'}),
        (KeyError, (1,), {'err_return': True}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'weights': data_gen[2]}),
        (ValueError, (data_gen[0],), {'weights': data_gen[2][0:10]}),
        (KeyError, (data_gen[0],), {'err_type': 'fake', 'errorbars': True}),
    ]
    for expected_error, call_args, call_kwargs in failing_calls:
        with pytest.raises(expected_error):
            skh_plt.hist(*call_args, **call_kwargs)

    scalar_result = skh_plt.hist(5)
    assert np.all(scalar_result[0] == 1)

    empty_result = skh_plt.hist([], range=(0, 1))
    assert np.all(empty_result[0] == 0)
def plotScores():
    """Plot train/test DNN score distributions for each training output.

    For every directory in ``outDirAfterDilep`` the saved ``yhat_*.npy``
    score arrays of the model ``modelName`` are loaded, drawn as normalised
    histograms on a log y-axis, and saved as ``MC_TrainTest_Score.pdf``
    inside that model directory.  Data scores are loaded and overlaid only
    when ``isBlindAnalysis`` is False.
    """
    isBlindAnalysis = True
    modelName = 'llqqDNN_100_60_2_0'
    outDirAfterDilep = [
        'Out_AfterDilepton_TrainggF1000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF2000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF3000_FullStat_1FatJet',
        'Out_AfterDilepton_TrainggF700_FullStat_1FatJet'
    ]
    # Alternative input set; swap it into the loop below to use it.
    outDirAfterggF = [
        'Out_AfterggFMerged_TrainggF1000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF2000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF3000_FullStat_1FatJet',
        'Out_AfterggFMerged_TrainggF700_FullStat_1FatJet'
    ]

    # (score file, histtype, colour, legend label), in drawing order.
    mcSamples = [
        ('yhat_train_signal.npy', 'step', 'deepskyblue', 'TrainSignal'),
        ('yhat_test_signal.npy', 'stepfilled', 'turquoise', 'TestSignal'),
        ('yhat_train_background.npy', 'step', 'deeppink', 'TrainBackground'),
        ('yhat_test_background.npy', 'stepfilled', 'plum', 'TestBackground'),
    ]
    # The binning is identical for every directory, so build it once.
    bins = np.linspace(0, 1, 50)

    for idir in outDirAfterDilep:
        # for idir in outDirAfterggF:
        modelDir = os.path.join(idir, modelName)

        if not isBlindAnalysis:
            yhat_data = np.load(os.path.join(modelDir, "yhat_data.npy"))

        for fileName, histtype, color, label in mcSamples:
            scores = np.load(os.path.join(modelDir, fileName))
            # `density` replaces matplotlib's `normed` keyword, which was
            # removed in matplotlib 3.1.
            plt.hist(scores, bins=bins, histtype=histtype, lw=2, alpha=0.5,
                     color=color, label=label, density=True)

        if not isBlindAnalysis:
            # skh_plt.hist takes its own `normed` keyword, left unchanged.
            skh_plt.hist(yhat_data, bins=bins, errorbars=True,
                         histtype='marker', label='Data', color='black',
                         normed=True)

        plt.legend(loc="upper center")
        plt.ylabel('Norm. Entries')
        plt.xlabel('DNN score')
        plt.yscale('log')
        plt.savefig(os.path.join(modelDir, "MC_TrainTest_Score.pdf"))
        # plt.show()
        plt.clf()
def test_hist_fails(cmdopt, data_gen):
    """Exercise the error paths and degenerate inputs of ``skh_plt.hist``.

    Each entry of the table below is a call expected to raise the listed
    exception.  A scalar input is then histogrammed, followed by an empty
    input, which is expected to fail on Python older than 2.7 and to give
    all-zero bin contents otherwise.
    """
    # (expected exception, positional args, keyword args), in call order.
    failing_calls = [
        (ValueError, ([data_gen[0], data_gen[1]],),
         {'stacked': True, 'histtype': 'marker'}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'histtype': 'marker'}),
        (KeyError, (1,), {'err_return': True}),
        (ValueError, ([data_gen[0], data_gen[1]],), {'weights': data_gen[2]}),
        (ValueError, (data_gen[0],), {'weights': data_gen[2][0:10]}),
        (KeyError, (data_gen[0],), {'err_type': 'fake', 'errorbars': True}),
    ]
    for expected_error, call_args, call_kwargs in failing_calls:
        with pytest.raises(expected_error):
            skh_plt.hist(*call_args, **call_kwargs)

    scalar_result = skh_plt.hist(5)
    assert np.all(scalar_result[0] == 1)

    # Empty lists are not supported as histogram input on Python 2.6.
    runs_on_py26 = sys.version_info < (2, 7)
    if runs_on_py26:
        with pytest.raises(ValueError):
            skh_plt.hist([], range=(0, 1))
    else:
        empty_result = skh_plt.hist([], range=(0, 1))
        assert np.all(empty_result[0] == 0)
def comp_study(input_data, n_events, xlims=None, resamples=100, dist_name='2Gauss'):
    """Compare histogram binning rules on one dataset and save summary plots.

    A nominal sample and ``resamples`` resampled datasets are drawn
    (generated for 'Gauss' and '2LP', otherwise chosen from ``input_data``
    with a seed that depends on ``dist_name``).  The nominal sample is
    histogrammed with seven fixed rules plus the optimised equal-population
    and Bayesian-blocks binnings on a 3x3 grid.  For every binning the
    ``rough`` and ``err_li`` metrics are evaluated, the latter also averaged
    over the resamples.  Two PDFs are written below ``bb_dir``: the grid of
    histograms, and a scatter/rank summary of the metrics.

    If ``xlims`` is given, the outer Bayesian-blocks edges are replaced by
    its first and last entries.
    """
    bb_dir = os.path.join('/Users/brianpollack/Coding/BayesianBlocks')
    do_log = True

    # ------------------------------------------------------------------
    # Nominal sample and resamples
    # ------------------------------------------------------------------
    if dist_name == 'Gauss':
        np.random.seed(88)
        data_nom = np.random.normal(125, 2, size=n_events)
        resample_list = np.random.normal(125, 2, size=(resamples, n_events))
        do_log = False
    elif dist_name == '2LP':
        np.random.seed(33)
        n_wide = int(n_events * 0.65)
        n_narrow = int(n_events * 0.25)
        n_flat = int(n_events * 0.10)
        data_nom = np.concatenate((
            np.random.laplace(loc=90, scale=5, size=n_wide),
            np.random.laplace(loc=110, scale=1.5, size=n_narrow),
            np.random.uniform(low=80, high=120, size=n_flat),
        ))
        resample_list = np.concatenate((
            np.random.laplace(loc=90, scale=5, size=(resamples, n_wide)),
            np.random.laplace(loc=110, scale=1.5, size=(resamples, n_narrow)),
            np.random.uniform(low=80, high=120, size=(resamples, n_flat)),
        ), axis=1)
        do_log = False
    else:
        # Every remaining distribution is sampled from input_data; only the
        # seed differs.
        if dist_name == 'jPT':
            seed = 11
        elif dist_name == 'DY':
            seed = 200
        else:
            seed = 1
        np.random.seed(seed)
        data_nom = np.random.choice(input_data, size=n_events, replace=False)
        resample_list = np.random.choice(input_data,
                                         size=(resamples, n_events),
                                         replace=True)

    # ------------------------------------------------------------------
    # Grid of histograms
    # ------------------------------------------------------------------
    fig_hist, axes_hist = plt.subplots(3, 3, sharex=True, sharey=False,
                                       constrained_layout=True)
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}', fontsize=22)

    def draw(axis, bins, alpha=0.5):
        # Shared drawing options for every panel of the grid.
        return skh_plt.hist(x=data_nom, histtype='stepfilled', bins=bins,
                            errorbars=False, alpha=alpha, log=do_log,
                            scale='binwidth', err_type='gaussian', ax=axis)

    def mean_resample_error(hist):
        # err_li of one histogram averaged over all resamples.
        return np.mean([err_li(sample, hist) for sample in resample_list])

    # (panel title, bin rule, grid position, keyword arguments for rough).
    # A rule of None marks the Knuth panel, whose edges are computed just
    # before it is drawn.
    rules = [
        ('Sturges', 'sturges', (0, 0), {'plot': False}),
        ('Doane', 'doane', (0, 1), {}),
        ('Scott', 'scott', (0, 2), {}),
        ('Freedman Diaconis', 'fd', (1, 0), {}),
        ('Knuth', None, (1, 1), {'plot': False}),
        ('Rice', 'rice', (1, 2), {}),
        ('Sqrt(N)', 'sqrt', (2, 0), {'plot': False}),
    ]

    bin_specs = []
    drawn = []
    for title, rule, (row, col), _rough_kwargs in rules:
        axis = axes_hist[row][col]
        axis.set_title(title)
        if title == 'Freedman Diaconis':
            axis.set_ylabel('Entries/Bin Width', fontsize=20)
        if rule is None:
            _, rule = knuth_bin_width(data_nom, return_bins=True)
        bin_specs.append(rule)
        drawn.append(draw(axis, rule))

    # Plain numpy histograms with the same binnings, used by err_li.
    counted = [np.histogram(data_nom, bins=spec) for spec in bin_specs]

    r_list = [rough(hist_bw, **entry[3])
              for hist_bw, entry in zip(drawn, rules)]
    elis_list = [err_li(data_nom, hist) for hist in counted]

    # Resample-major order: each resample is scored against every rule
    # before moving on to the next resample.
    per_rule_errors = [[] for _ in counted]
    for sample in resample_list:
        for bucket, hist in zip(per_rule_errors, counted):
            bucket.append(err_li(sample, hist))
    avg_eli_list = [np.mean(bucket) for bucket in per_rule_errors]

    # ------------------------------------------------------------------
    # Optimised binnings (both see only the seven fixed-rule metrics)
    # ------------------------------------------------------------------
    ep_axis = axes_hist[2][1]
    ep_axis.set_title('Equal Population')
    bep = bep_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    hist_ep_bw = draw(ep_axis, bep)
    hist_ep = np.histogram(data_nom, bins=bep)
    r_ep = rough(hist_ep_bw)
    eli_ep = err_li(data_nom, hist_ep)
    avg_eli_ep = mean_resample_error(hist_ep)

    bb_axis = axes_hist[2][2]
    bb_axis.set_title('Bayesian Blocks')
    p0 = bb_optimizer(data_nom, resample_list, r_list, avg_eli_list)
    bb = bayesian_blocks(data_nom, p0=p0)
    if xlims:
        bb[0] = xlims[0]
        bb[-1] = xlims[-1]
    hist_bb_bw = draw(bb_axis, bb, alpha=1)
    hist_bb = np.histogram(data_nom, bins=bb)
    r_bb = rough(hist_bb_bw, plot=False)
    eli_bb = err_li(data_nom, hist_bb)
    avg_eli_bb = mean_resample_error(hist_bb)

    r_list.extend([r_ep, r_bb])
    avg_eli_list.extend([avg_eli_ep, avg_eli_bb])
    elis_list.extend([eli_ep, eli_bb])

    plt.savefig(bb_dir + f'/plots/bin_comp/hists_{dist_name}_{n_events}.pdf')

    # ------------------------------------------------------------------
    # Metric summary
    # ------------------------------------------------------------------
    xs = [
        'Sturges', 'Doane', 'Scott', 'FD', 'Knuth', 'Rice', 'Sqrt', 'EP', 'BB'
    ]

    fig_metric, axes_metric = plt.subplots(2, 1, constrained_layout=True)
    # NOTE(review): this re-titles fig_hist, not fig_metric - looks like a
    # typo, kept as-is; confirm the intended figure.
    fig_hist.suptitle(f'{dist_name} Distribution, N={n_events}')

    scatter_ax = axes_metric[0]
    rank_ax = axes_metric[1]

    for label, avg_err, wiggles, _ in zip(xs, avg_eli_list, r_list, elis_list):
        if label == 'BB':
            # Bayesian Blocks is highlighted with a large black star.
            marker_style = dict(s=400, marker='*', c='k')
        else:
            marker_style = dict(s=200)
        scatter_ax.scatter(avg_err, wiggles, label=label, **marker_style)

    scatter_ax.set_ylabel(r'$W_n$ (Wiggles)')
    scatter_ax.set_xlabel(r'$\hat{E}$ (Average Error)')
    scatter_ax.grid()
    scatter_ax.legend(ncol=1, bbox_to_anchor=(1.05, 1.15), loc='upper left')
    scatter_ax.set_title(f'{dist_name} Distribution, N={n_events}',
                         fontsize=22)

    rank_rough = rankdata(r_list, method='min')
    rank_avg_eli = rankdata(avg_eli_list, method='min')

    # Stacked rank bars; the last bar (BB) is drawn fully opaque.
    cont = rank_ax.bar(xs, rank_rough, 0.35, label=r'$W_n$ Ranking', alpha=0.5)
    cont[-1].set_alpha(1)
    cont = rank_ax.bar(xs, rank_avg_eli, 0.35, bottom=rank_rough,
                       label=r'$\hat{E}$ Ranking', alpha=0.5)
    cont[-1].set_alpha(1)

    rank_ax.legend(loc='upper left', bbox_to_anchor=(1.0, 0.8))
    rank_ax.set_xlabel('Binning Method')
    rank_ax.set_ylabel('Rank')

    plt.savefig(bb_dir + f'/plots/bin_comp/metric_{dist_name}_{n_events}.pdf')
def train_and_validate(steps=10000, minibatch=128, LRrange=None, beta1=0.9, beta2=0.999, nafdim=16, depth=2,
                       savedir='abcdnn', seed=100, retrain=False, train=True):
    """Train the ABCDdnn model on background events and produce validation output.

    The data returned by ``prepdata()`` are split into a signal selection
    (``njet >= 9`` and ``nbtag >= 3``) and its complement, which is used as
    background.  The model is set up on the normalised background, trained
    if ``train`` is true, and then used to generate fake data for six
    (njet, nbtag) conditions.  Ratio plots and a matrix of comparison plots
    are saved under ``savedir`` for every feature and y-scale, and the
    generated samples are written to ``fakedata_NAF.root``.

    Parameters
    ----------
    steps : int
        Passed to ``m.train``.
    minibatch : int
        Batch size handed to ``ABCDdnn``; also the chunk size used when
        running predictions.
    LRrange : list, optional
        Passed to ``ABCDdnn``.  ``None`` (the default) stands for
        ``[0.0001, 0.00001, 10000, 0]``.
    beta1, beta2, nafdim, depth, savedir, seed, retrain
        Passed to ``ABCDdnn`` unchanged; ``savedir`` is also the output
        directory for the plots and the ROOT file.
    train : bool
        Whether to run training (and display it) before generating data.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if LRrange is None:
        LRrange = [0.0001, 0.00001, 10000, 0]

    rawinputs, normedinputs, inputmeans, inputsigma, ncat_per_feature = prepdata()
    print(ncat_per_feature)

    inputdim = 4
    ncat_per_feature = ncat_per_feature[0:inputdim]
    # Columns after the first `inputdim` are the conditional inputs.
    conddim = normedinputs.shape[1] - inputdim

    issignal = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] >= 3)  # signal_selection
    isbackground = ~issignal
    bkgnormed = normedinputs[isbackground]
    bkg = rawinputs[isbackground]
    # Upper plot limits: mean + 5 sigma per column.
    xmax = np.reshape(inputmeans + 5 * inputsigma, inputmeans.shape[1])

    m = ABCDdnn(ncat_per_feature, inputdim, minibatch=minibatch, conddim=conddim, LRrange=LRrange,
                beta1=beta1, beta2=beta2, nafdim=nafdim, depth=depth, savedir=savedir, retrain=retrain, seed=seed)
    m.setrealdata(bkgnormed)
    m.savehyperparameters()
    m.monitorevery = 100

    if train:
        m.train(steps)
        m.display_training()

    # Raw strings are used for the LaTeX labels below: "\g" is not a valid
    # escape sequence in a normal string literal.
    nj9cut = True
    if nj9cut:
        ncol = 3  # for plots below
        condlist = [[[1., 0., 0., 1., 0.]],
                    [[0., 1., 0., 1., 0.]],
                    [[0., 0., 1., 1., 0.]],
                    [[1., 0., 0., 0., 1.]],
                    [[0., 1., 0., 0., 1.]],
                    [[0., 0., 1., 0., 1.]]]
        select0 = (rawinputs['njet'] == 7) & (rawinputs['nbtag'] == 2)
        select1 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] == 2)
        select2 = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] == 2)
        select3 = (rawinputs['njet'] == 7) & (rawinputs['nbtag'] >= 3)
        select4 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] >= 3)
        select5 = (rawinputs['njet'] >= 9) & (rawinputs['nbtag'] >= 3)
        select_data = [select0, select1, select2, select3, select4, select5]
        plottextlist = [
            r'$N_j=7, N_b=2$', r'$N_j=8, N_b=2$', r'$N_j\geq 9, N_b=2$',
            r'$N_j=7, N_b\geq 3$', r'$N_j=8, N_b\geq 3$',
            r'$N_j\geq 9, N_b\geq 3$'
        ]
        njlist = [7, 8, 9, 7, 8, 9]
        nblist = [2, 2, 2, 3, 3, 3]
    else:
        ncol = 3  # for plots
        condlist = [[[0., 1., 0., 0., 1., 0.]],
                    [[0., 0., 1., 0., 1., 0.]],
                    [[0., 0., 0., 1., 1., 0.]],
                    [[0., 1., 0., 0., 0., 1.]],
                    [[0., 0., 1., 0., 0., 1.]],
                    [[0., 0., 0., 1., 0., 1.]]]
        select0 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] == 2)
        select1 = (rawinputs['njet'] == 9) & (rawinputs['nbtag'] == 2)
        select2 = (rawinputs['njet'] >= 10) & (rawinputs['nbtag'] == 2)
        select3 = (rawinputs['njet'] == 8) & (rawinputs['nbtag'] >= 3)
        select4 = (rawinputs['njet'] == 9) & (rawinputs['nbtag'] >= 3)
        select5 = (rawinputs['njet'] >= 10) & (rawinputs['nbtag'] >= 3)
        select_data = [select0, select1, select2, select3, select4, select5]
        plottextlist = [
            r'$N_j=8, N_b=2$', r'$N_j=9, N_b=2$', r'$N_j\geq 10, N_b=2$',
            r'$N_j=8, N_b\geq 3$', r'$N_j=9, N_b\geq 3$',
            r'$N_j\geq 10, N_b\geq 3$'
        ]
        njlist = [8, 9, 10, 8, 9, 10]
        nblist = [2, 2, 2, 3, 3, 3]

    # create fake data
    # The batch split depends only on the background size, so compute it once.
    nbkg = bkgnormed.shape[0]
    nmcbatches = nbkg // minibatch
    nmcremain = nbkg % minibatch

    fakedatalist = []
    for cond, nj, nb in zip(condlist, njlist, nblist):
        fakelist = []
        cond_to_append = np.repeat(cond, minibatch, axis=0)
        for _ib in range(nmcbatches):
            xin = bkgnormed[_ib * minibatch:(_ib + 1) * minibatch, :inputdim]
            # append conditional to the feature inputs
            xin = np.hstack((xin, cond_to_append))
            fakelist.append(m.model.predict(xin))

        # last (partial) batch; skipped when the background size is an exact
        # multiple of the batch size, so predict never sees an empty input
        if nmcremain > 0:
            xin = bkgnormed[nmcbatches * minibatch:, :inputdim]
            xin = np.hstack((xin, np.repeat(cond, nmcremain, axis=0)))
            fakelist.append(m.model.predict(xin))

        # all data, mapped back through inputsigma / inputmeans
        fakedata = np.vstack(fakelist)
        fakedata = fakedata * inputsigma[:, :inputdim] + inputmeans[:, :inputdim]

        # append constant njet and nbtag columns for this condition
        nfakes = fakedata.shape[0]
        fakedata = np.hstack((fakedata,
                              np.array([nj] * nfakes).reshape((nfakes, 1)),
                              np.array([nb] * nfakes).reshape(nfakes, 1)))
        fakedatalist.append(fakedata)

    # [LaTeX label, feature name, lower plot limit, upper plot limit]
    labelsindices = [['MET', 'met', 0.0, xmax[0]], ['H_T', 'ht', 0.0, xmax[1]],
                     ['p_{T5}', 'pt5', 0.0, xmax[2]], ['p_{T6}', 'pt6', 0.0, xmax[3]]]
    nbins = 20

    runplots = True
    if runplots:
        yscales = ['log', 'linear']
        for yscale in yscales:
            for li in labelsindices:
                pos = featurevars.index(li[1])
                fig, ax = plt.subplots(2, ncol, figsize=(3 * ncol, 6))
                iplot = 0
                for fakedata, seld, plottext in zip(fakedatalist, select_data, plottextlist):
                    input_data = rawinputs[seld]

                    # Make ratio plots
                    plotaxes = MplPlotter.ratio_plot(
                        dict(x=input_data[li[1]], bins=nbins, range=(li[2], li[3]),
                             errorbars=True, normed=True, histtype='marker'),
                        dict(x=fakedata[:, pos], bins=nbins, range=(li[2], li[3]),
                             errorbars=True, normed=True),
                        ratio_range=(0.25, 1.9))
                    plotfig = plotaxes[0][0].get_figure()
                    plotaxes[0][0].set_yscale(yscale)
                    plotfig.set_size_inches(5, 5)
                    plotfig.savefig(
                        os.path.join(savedir, f'result_{li[1]}_{iplot}_{yscale}_ratio.pdf'))
                    # The ratio figure is saved; release it so figures do not
                    # pile up across the nested loops.
                    plt.close(plotfig)

                    # make matrix of plots
                    row = iplot // ncol
                    col = iplot % ncol
                    iplot += 1
                    plt.sca(ax[row, col])
                    ax[row, col].set_yscale(yscale)
                    ax[row, col].set_xlabel(f"${li[0]}$ (GeV)")
                    MplPlotter.hist(input_data[li[1]], bins=nbins, alpha=0.5,
                                    range=(li[2], li[3]), errorbars=True,
                                    histtype='marker', normed=True)
                    MplPlotter.hist(fakedata[:, pos], bins=nbins, alpha=0.5,
                                    range=(li[2], li[3]), errorbars=True, normed=True)
                    MplPlotter.hist(bkg[li[1]], bins=nbins, alpha=0.5,
                                    range=(li[2], li[3]), histtype='step', normed=True)
                    plt.text(0.6, 0.8, plottext, transform=ax[row, col].transAxes, fontsize=10)

                fig.tight_layout()
                fig.savefig(os.path.join(savedir, f'result_matrix_{li[1]}_{yscale}.pdf'))
                plt.close(fig)

    generatesigsample = True
    if generatesigsample:
        bkgsigfakedata = np.vstack(fakedatalist)
        # One entry per feature name, taken column by column.
        datadict = {var: bkgsigfakedata[:, idx] for idx, var in enumerate(featurevars)}
        writetorootfile(os.path.join(savedir, 'fakedata_NAF.root'), datadict)
def PlotResults(setupClient, model, X_train, X_test, y_train, y_test, w_train, w_test, ix_train, ix_test): print(Fore.BLUE + "--------------------------") print(Back.BLUE + " RESULTS ") print(Fore.BLUE + "--------------------------") if setupClient.runMode == 'binary' or setupClient.runMode == 'param' or setupClient.runMode == 'SimpleRNN': print('Evaluating model on X_test, y_test') score = model.evaluate(X_test, y_test, batch_size=setupClient.Params['BatchSize']) # testLoss = 'Test loss:%0.3f' % score[0] # testAccuracy = 'Test accuracy:%0.3f' % score[1] print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test loss', score[0])) print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Test accuracy', score[1])) # get the architecture as a json string arch = model.to_json() with open(os.path.join(setupClient.ModelSavePath, 'architecture.json'), 'w') as arch_file: print('Saving model as json', os.path.join(setupClient.ModelSavePath, 'architecture.json')) arch_file.write(arch) # now save the weights as an HDF5 file model.save_weights(os.path.join(setupClient.ModelSavePath, 'ModelWeights.h5'), overwrite=True) if not os.path.isfile(setupClient.TrainedModelPath + '/DNN_Setup'): print("Pickle file not found!") quit() foo = open(setupClient.TrainedModelPath + 'DNN_Setup', "rb") bla = pickle.load(foo) minusMean = np.multiply(-1, bla.Scaler.mean_) OneOverStd = np.divide(1, np.sqrt(bla.Scaler.var_)) with open(os.path.join(setupClient.ModelSavePath, 'Scaling.txt'), 'w') as scaleFileOut: scaleFileOut.write( str(setupClient.InputDNNVariables[setupClient.VarSet]) + '\n') scaleFileOut.write('Mean\n' + str(bla.Scaler.mean_) + '\n') scaleFileOut.write('minusMean\n' + str(minusMean) + '\n') scaleFileOut.write('Var\n' + str(bla.Scaler.var_) + '\n') scaleFileOut.write('sqrtVar\n' + str(np.sqrt(bla.Scaler.var_)) + '\n') scaleFileOut.write('OneOverStd\n' + str(OneOverStd) + '\n') theClasses = [] print('\nRunning model prediction on X train/test samples') yResult_test_cls = [] yResult_train_cls = [] 
yResult_test = model.predict(X_test, verbose=True, batch_size=setupClient.Params['BatchSize']) yResult_train = model.predict(X_train, verbose=True, batch_size=setupClient.Params['BatchSize']) #insert the score result back into the original file # ix_test['DNN_Score'] = yResult_test # ix_train['DNN_Score'] = yResult_train # ix_test.to_pickle(setupClient.ModelSavePath+'/ResultsTestPD.pkl',protocol=2) # ix_train.to_pickle(setupClient.ModelSavePath+'/ResultsTrainPD.pkl',protocol=2) # np.save( os.path.join(setupClient.ModelSavePath, "ResultsTestPD.npy") , ix_test ) # antonio # np.save( os.path.join(setupClient.ModelSavePath, "ResultsTrainPD.npy") , ix_train ) # antonio # np.save( os.path.join(setupClient.ModelSavePath, "rootBranchSubSample.npy") , ix_test.columns.values) # antonio if setupClient.runMode == 'multi': yResult_test_cls = np.argmax( yResult_test, axis=1) #stores the element with max score yResult_train_cls = np.argmax( yResult_train, axis=1) #stores the element with max score theClasses = ['Zjets', 'Signal', 'Diboson', 'Top'] else: yResult_test_cls = np.array([int(round(x[0])) for x in yResult_test]) yResult_train_cls = np.array([int(round(x[0])) for x in yResult_train]) theClasses = ['Background', 'Signal'] # print(X_test[:20]) # print ('') # # print(ix_test[:20]) # print ('') # print(yResult_test) # quit() # # print(yResult_test_cls) # print ('') # print(yResult_train) # print ('') # print(yResult_train_cls) if setupClient.doConfusionMatrix: # Plot the confusion matrix plt.clf() # The class method is: sklearn.metrics.confusion_matrix(y_true, y_pred, labels=None, sample_weight=None) cnf_matrix = confusion_matrix(y_test, yResult_test_cls, sample_weight=w_test) np.set_printoptions(precision=2) plot_confusion_matrix(setupClient, cnf_matrix, classes=theClasses, normalize=True, title='Normalized confusion matrix') if setupClient.doEfficiency: print('Calculating Efficiencies on Test sample') if setupClient.runMode == 'binary' or setupClient.runMode == 
'SimpleRNN': s_eff = w_test[(y_test == 1) & ( yResult_test_cls > 0.5)].sum() / w_test[y_test == 1].sum() b_eff = w_test[(y_test != 1) & ( yResult_test_cls > 0.5)].sum() / w_test[y_test != 1].sum() print(" ") print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'Signal efficiency', s_eff)) print('{:<35} {:<25.3f}'.format( Fore.GREEN + 'Background efficiency:', b_eff)) print('{:<35} {:<25.3f}'.format( Fore.GREEN + 'Background rejection:', 1.0 / b_eff)) if setupClient.runMode == 'multi': channelEffi = channelDic.copy() for channel, i in channelDic.items(): channelEffi[channel] = w_test[(y_test == i) & ( yResult_test_cls == 1)].sum() / w_test[y_test == i].sum() for channel, eff in channelEffi.items(): print('{:<35} {:<25.3f}'.format( Fore.GREEN + channel + ' efficiency', eff)) b_eff = w_test[(y_test != 1) & ( yResult_test_cls == 1)].sum() / w_test[y_test != 1].sum() print('{:<30} {:<20.3f}'.format('Background efficiency', b_eff)) print('{:<30} {:<20.3f}'.format('Background rejection', 1.0 / b_eff)) print(" ") if setupClient.doScore: if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param': # First create one sample of X_train only from signal and one only from background events Xtrain_signal = X_train[y_train == 1] Xtrain_background = X_train[y_train != 1] # Then do the same for Xtest Xtest_signal = X_test[y_test == 1] Xtest_background = X_test[y_test != 1] # Get predictions of the model on these -train- samples print('Running model prediction on Xtrain_signal') yhat_train_signal = model.predict( Xtrain_signal, batch_size=setupClient.Params['BatchSize']) print('Running model prediction on Xtrain_background') yhat_train_background = model.predict( Xtrain_background, batch_size=setupClient.Params['BatchSize']) # Get predictions of the model on these -test- samples print('Running model prediction on Xtest_signal') yhat_test_signal = model.predict( Xtest_signal, batch_size=setupClient.Params['BatchSize']) print('Running model 
prediction on Xtest_background') yhat_test_background = model.predict( Xtest_background, batch_size=setupClient.Params['BatchSize']) hasData = False if setupClient.runMode == 'binary' and setupClient.unblind == True: # Get the data PD file dataFileName = setupClient.PDPath + setupClient.MixPD_TrainTestTag + '_Data.pkl' if os.path.isfile(dataFileName): hasData = True print('Reading Data file:', dataFileName) data_full = pd.read_pickle(dataFileName) data_full_matrix = data_full[setupClient.InputDNNVariables[ setupClient.VarSet]].as_matrix() print('{:<45} {:<15}'.format( 'Getting Scaler of Training sample from file', Fore.GREEN + setupClient.TrainedModelPath + 'DNN_Setup')) if not os.path.isfile(setupClient.TrainedModelPath + '/DNN_Setup'): print("Pickle file not found!") quit() f = open(setupClient.TrainedModelPath + 'DNN_Setup', "rb") savedSetupClient = pickle.load(f) data_full_matrix = savedSetupClient.Scaler.transform( data_full_matrix) # Get predictions on data print('Running model prediction on data') yhat_data = model.predict( data_full_matrix, verbose=True, batch_size=setupClient.Params['BatchSize']) yhat_data_rounded = np.array([round(x[0]) for x in yhat_data]) # Save as numpy array # np.save( os.path.join(setupClient.ModelSavePath,"yhat_data.npy") , yhat_data) else: print('Data file:', dataFileName, ' not found. 
Will proceed to MC only') if setupClient.runMode == 'SimpleRNN': # antonio for ifile in setupClient.InputFilesSB['Data']: dataFileName = setupClient.PDPath + ifile + '_FullNoRandom.pkl' if os.path.isfile(dataFileName): hasData = False print('Reading Data file:', dataFileName) data_full = pd.read_pickle(dataFileName) VariablesSet = setupClient.InputDNNVariables[ setupClient.VarSet] data_full_matrix = data_full[VariablesSet].copy() var_names = data_full_matrix.keys() new_data_full_matrix = np.zeros( (data_full_matrix.shape[0], 6, 4)) for i in range(0, data_full_matrix.shape[0]): for j in range(0, data_full_matrix.shape[1]): new_data_full_matrix[i, int(j / 4), j % 4] = data_full_matrix.iloc[i, j] data_full_matrix = new_data_full_matrix PrepareData.scale(data_full_matrix, ['pt', 'eta', 'phi', 'E'], False, setupClient) # apply scaling to test set # Get predictions on data print('Running model prediction on data') yhat_data = model.predict( data_full_matrix, verbose=True, batch_size=setupClient.Params['BatchSize']) data_full['RNN_Score'] = yhat_data print(data_full.shape) np.save( os.path.join(setupClient.ModelSavePath, "ResultsDataMLPD_" + ifile + ".npy"), data_full) # antonio np.save( os.path.join( setupClient.ModelSavePath, "rootBranchSubSampleForDataML_" + ifile + ".npy"), data_full.columns.values) # antonio else: print('Data file:', dataFileName, ' not found. 
Will proceed to MC only') sns.set_palette("coolwarm", 4) # Plot scores bins = np.linspace(0, 1, 50) plt.hist(yhat_train_signal, bins=bins, histtype='step', lw=2, alpha=0.5, label=[r'Signal Train'], normed=True) plt.hist(yhat_test_signal, bins=bins, histtype='stepfilled', lw=2, alpha=0.5, label=[r'Signal Test'], normed=True) plt.hist(yhat_test_background, bins=bins, histtype='stepfilled', lw=2, alpha=0.5, label=[r'Background Test'], normed=True) plt.hist(yhat_train_background, bins=bins, histtype='step', lw=2, alpha=0.5, label=[r'Background Train'], normed=True) if hasData and setupClient.unblind == True: # Plot the data as well. Using skh_plt because matplotlib does not come with markers for hist class skh_plt.hist(yhat_data, bins=bins, errorbars=True, histtype='marker', label='Data', color='black', normed=True) plt.ylabel('Norm. Entries') plt.xlabel('DNN score') plt.legend(loc="upper center") plt.savefig(setupClient.ModelSavePath + "/MC_Data_TrainTest_Score.png") plt.yscale('log') plt.savefig(setupClient.ModelSavePath + "/MC_Data_TrainTest_Score_log.png") plt.clf() if setupClient.doROC: if setupClient.runMode == 'binary' or setupClient.runMode == 'SimpleRNN' or setupClient.runMode == 'param': # Get 'Receiver operating characteristic' (ROC) fpr, tpr, thresholds = roc_curve(y_test, yResult_test) # Compute Area Under the Curve (AUC) from prediction scores roc_auc = auc(fpr, tpr) print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC', roc_auc)) # print "ROC AUC: %0.3f" % roc_auc plt.plot(fpr, tpr, color='darkorange', lw=2, label='Full curve (area = %0.2f)' % roc_auc) plt.plot([0, 0], [1, 1], color='navy', lw=2, linestyle='--') plt.xlim([-0.05, 1.0]) plt.ylim([0.0, 1.05]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.title('ROC curves for Signal vs Background') plt.legend(loc="lower right") # plt.plot([0.038], [0.45], marker='*', color='red',markersize=5, label="Cut-based",linestyle="None") # plt.plot([0.038, 0.038], [0,1], color='red', lw=1, 
linestyle='--') # same background rejection point plt.savefig(setupClient.ModelSavePath + "/ROC.png") plt.clf() ### NOW try the weighted ROC curve fpr_w, tpr_w, thresholds_w = roc_curve(y_test, yResult_test, sample_weight=w_test) roc_auc_w = auc(fpr_w, tpr_w, reorder=True) print('{:<35} {:<25.3f}'.format(Fore.GREEN + 'ROC AUC weighted', roc_auc_w)) plt.plot(fpr_w, tpr_w, color='darkorange', lw=2, label='Full curve (area = %0.2f)' % roc_auc_w) plt.plot([0, 0], [1, 1], color='navy', lw=2, linestyle='--') plt.xlim([-0.05, 1.0]) plt.ylim([0.0, 1.05]) plt.ylabel('True Positive Rate (weighted)') plt.xlabel('False Positive Rate (weighted)') plt.title('ROC curve for Signal vs Background') plt.legend(loc="lower right") # plt.plot([0.038], [0.45], marker='*', color='red',markersize=5, label="Cut-based",linestyle="None") # plt.plot([0.038, 0.038], [0,1], color='red', lw=1, linestyle='--') # same background rejection point plt.savefig(setupClient.ModelSavePath + "/ROC_weighted.png") plt.clf() np.save(os.path.join(setupClient.ModelSavePath, "tpr_w.npy"), tpr_w) np.save(os.path.join(setupClient.ModelSavePath, "fpr_w.npy"), fpr_w) np.save( os.path.join(setupClient.ModelSavePath, "thresholds_w.npy"), thresholds_w) np.save(os.path.join(setupClient.ModelSavePath, "thresholds.npy"), thresholds) np.save(os.path.join(setupClient.ModelSavePath, "tpr.npy"), tpr) np.save(os.path.join(setupClient.ModelSavePath, "fpr.npy"), fpr) np.save(os.path.join(setupClient.ModelSavePath, "AUC.npy"), roc_auc) np.save(os.path.join(setupClient.ModelSavePath, "AUC_w.npy"), roc_auc_w)
def plotDataMC(setupClient):
    """Draw stacked background / signal / data histograms per variable.

    Every file name listed in ``setupClient.InputFilesSB`` is matched by
    substring against a fixed set of sample tags ('Top', 'Data', 'Zjets',
    'Diboson', 'ggF', 'Wjets').  Matching files are loaded through
    ``getDFEvents`` and concatenated per sample.  For each entry of
    ``setupClient.VariablesToPlot`` two PNG files are written to
    ``setupClient.VarPlotPath``: ``<var>_DataMC.png`` (linear y axis) and
    ``<var>_DataMC_log.png`` (log y axis).

    Args:
        setupClient: configuration object; this function reads its
            ``InputFilesSB``, ``PDPath``, ``VariablesToPlot`` and
            ``VarPlotPath`` attributes.

    Raises:
        ValueError: if no input file matched one of the simulated samples
            (Top, Zjets, Wjets, Diboson, ggF).
    """
    # (substring looked for in the file name, suffixes handed to getDFEvents).
    # The order mirrors the order in which files are matched and loaded.
    # NOTE(review): simulated samples are read with '_Train' and '_Test'
    # suffixes while data uses the bare 'Data' suffix -- presumably this
    # matches the pickle naming scheme; confirm against getDFEvents.
    mc_suffixes = ('_Train', '_Test')
    sample_table = (
        ('Top', mc_suffixes),
        ('Data', ('Data', )),
        ('Zjets', mc_suffixes),
        ('Diboson', mc_suffixes),
        ('ggF', mc_suffixes),
        ('Wjets', mc_suffixes),
    )
    frames = dict((tag, []) for tag, _ in sample_table)

    for file_names in setupClient.InputFilesSB.values():
        for ifile in file_names:
            print(ifile)
            for tag, suffixes in sample_table:
                if tag not in ifile:
                    continue
                for suffix in suffixes:
                    frames[tag].append(
                        getDFEvents(setupClient.PDPath, ifile, suffix))

    # All simulated samples are needed for the stack / signal overlay; fail
    # with an explicit message instead of pandas' "No objects to concatenate".
    required = ('Top', 'Zjets', 'Wjets', 'Diboson', 'ggF')
    missing = [tag for tag in required if not frames[tag]]
    if missing:
        raise ValueError('No input files found for sample(s): ' +
                         ', '.join(missing))

    topDF = pd.concat(frames['Top'], ignore_index=True)
    zjetsDF = pd.concat(frames['Zjets'], ignore_index=True)
    wjetsDF = pd.concat(frames['Wjets'], ignore_index=True)
    dibosonDF = pd.concat(frames['Diboson'], ignore_index=True)
    signalDF = pd.concat(frames['ggF'], ignore_index=True)

    # Data is optional: without a data file only the simulation is drawn.
    # All data files are merged, consistent with the simulated samples.
    dataDF = None
    if frames['Data']:
        dataDF = pd.concat(frames['Data'], ignore_index=True)

    for var in setupClient.VariablesToPlot:
        print("Plotting variable", var)

        # The binning is taken from the range of the diboson sample.
        bins = np.linspace(dibosonDF[var].min(), dibosonDF[var].max(), 20)

        # Stacked, weighted backgrounds.  No 'normed' keyword is passed: raw
        # counts are the default and the keyword no longer exists in recent
        # matplotlib releases.
        plt.hist([topDF[var], dibosonDF[var], zjetsDF[var], wjetsDF[var]],
                 histtype='stepfilled',
                 bins=bins,
                 weights=[
                     topDF['weight'], dibosonDF['weight'],
                     zjetsDF['weight'], wjetsDF['weight']
                 ],
                 label=[
                     'Top',
                     'Diboson',
                     'Z + jets',
                     'W + jets',
                 ],
                 stacked=True)

        # Signal overlaid as a dashed outline.
        plt.hist(signalDF[var],
                 histtype='step',
                 bins=bins,
                 weights=signalDF['weight'],
                 label=r'ggF',
                 linewidth=1,
                 color='red',
                 linestyle='dashed')

        # Data drawn as markers with error bars through skh_plt.
        if dataDF is not None:
            skh_plt.hist(dataDF[var],
                         bins=bins,
                         errorbars=True,
                         histtype='marker',
                         label='Data',
                         color='black')

        plt.legend(loc='best', prop={'size': 10})
        plt.xlabel(var, fontsize=14)
        plt.savefig(setupClient.VarPlotPath + "/" + var + "_DataMC.png")
        # Same figure again with a logarithmic y axis.
        plt.yscale('log')
        plt.savefig(setupClient.VarPlotPath + "/" + var + "_DataMC_log.png")
        plt.clf()