def load_input_file(args):
    if not os.path.isfile(args.input):
        print("ERROR provided input file (={}) is not found or is not a regular file".format(args.input))
        sys.exit()

    samples_group_name = "samples"
    scaling_group_name = "scaling"
    scaling_data_name = "scaling_data"

    samples = []
    data_scaler = None

    with h5py.File(args.input, 'r', libver='latest') as input_file:

        # look up the scaling first
        if scaling_group_name in input_file:
            scaling_group = input_file[scaling_group_name]
            scaling_dataset = scaling_group[scaling_data_name]
            data_scaler = DataScaler(scaling_dataset=scaling_dataset, ignore_features=['eventweight'])
            print("DataScaler found {} features to use as inputs (there were {} total features in the input)"
                  .format(len(data_scaler.feature_list()), len(data_scaler.raw_feature_list())))
        else:
            print("scaling group (={}) not found in file".format(scaling_group_name))
            sys.exit()

        # build the samples
        if samples_group_name in input_file:
            sample_group = input_file[samples_group_name]
            for p in sample_group:
                process_group = sample_group[p]
                class_label = process_group.attrs['training_label']
                s = Sample(name=p,
                           class_label=int(class_label),
                           input_data=floatify(
                               process_group['validation_features'][tuple(data_scaler.feature_list())],
                               data_scaler.feature_list()))
                s.eventweights = floatify(
                    process_group['validation_features'][tuple(['eventweight'])], ['eventweight'])
                samples.append(s)
        else:
            print("samples group (={}) not found in file".format(samples_group_name))
            sys.exit()

    return samples, data_scaler
def get_single_nn_histo(sample, scaler, model):
    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []
    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6:
                break  # cap the number of rows processed per sample

            weights = chunk['eventweight']

            # lowest common denominator (LCD) selection
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)
            chunk = chunk[lcd_idx]

            # tighter selection for the histogrammed discriminant
            more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 110) & (chunk['mbb'] < 140)
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            # scale the inputs and evaluate the network
            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            den_data = scores[:, 1]
            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]
            weights = weights[ok_idx]

            # log-ratio discriminant, dropping non-finite entries
            data = np.log(num_data / den_data)
            ok_idx = (data > -np.inf) & (data < np.inf)
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data
    return h
def get_data(sample, kind, scaler=None, model=None):
    lcd = 0.0
    histo_data = []
    weight_data = []
    weight2_data = []
    use_stored_model = scaler and model
    total_read = 0.0
    total_pass_raw = 0
    total_pass_w = 0.

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print("ERROR superNt dataset not found in input file (={})".format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            #dataset = dataset[8800:]
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1000000.:
                break
            #print("TOTAL READ = {}".format(total_read))

            weights = chunk['eventweight']

            # count the total number of weighted events at the base, denominator selection
            #lcd_idx = (chunk['nBJets']>=1) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_llbb']>100) & (chunk['mt2_llbb']<140) & (chunk['dRll']<0.9)
            lcd_idx = (chunk['nBJets'] >= 1)  #& (chunk['mt2_llbb']<140) #& (chunk['mbb']<140) # & (chunk['dRll']<0.9)
            weights_lcd = weights[lcd_idx] * 36.1
            lcd += np.sum(weights_lcd)

            # now get the discriminants we want to scan over
            if kind == 'nn':
                chunk = chunk[lcd_idx]
                weights = weights[lcd_idx] * 36.1

                # tighter selection on top of the LCD
                #more_idx = (chunk['nBMJets'] >= 2) & (chunk['nSJets']>0) & (chunk['mt2_bb_bm'] > 65)  # & (chunk['mbb_bm'] > 100) & (chunk['mbb_bm'] < 140)
                #more_idx = (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)  #(chunk['mt2_bb']>30)  # & (chunk['met']>45) & (chunk['l1_pt']>15)
                #more_idx = (chunk['met']>50)  # & (chunk['l1_pt']>20)
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 110) & (chunk['mbb'] < 140) & (chunk['mt2_bb'] > 65)
                chunk = chunk[more_idx]
                weights = weights[more_idx]

                if use_stored_model:
                    input_features = chunk[scaler.feature_list()]
                    input_features = floatify(input_features, scaler.feature_list())
                    input_features = (input_features - scaler.mean()) / scaler.scale()
                    scores = model.predict(input_features)

                    num_data = scores[:, 0]  # get the HH score from NN output
                    den_data = scores[:, 1]
                    print('WARNING ONLY GRABBING ONE SCORE')
                    #den_data += scores[:,2]
                    #den_data += scores[:,3]  # * 0.1

                    ok_idx = den_data != 0
                    num_data = num_data[ok_idx]
                    den_data = den_data[ok_idx]
                    weights = weights[ok_idx]

                    data = np.log(num_data / den_data)
                    ok_idx = (data > -np.inf) & (data < np.inf)
                    data = data[ok_idx]
                    weights = weights[ok_idx]
                    print("MIN MAX = {} {}".format(np.min(data), np.max(data)))
                    total_pass_raw += data.size
                    total_pass_w += np.sum(weights)
                else:
                    data = chunk['nn_p_hh']  # target HH score from NN stored in the ntuple

                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)

            elif kind == 'cut':
                sel_idx = ((chunk['mbb'] > 100) & (chunk['mbb'] < 140) &
                           (chunk['mt2_llbb'] > 100) & (chunk['mt2_llbb'] < 140) &
                           (chunk['dRll'] < 0.9) & (chunk['HT2Ratio'] > 0.8) &
                           (chunk['nBJets'] == 2) & (chunk['l1_pt'] > 20.) & (chunk['mll'] > 20.))
                data = chunk[sel_idx]
                weights = weights[sel_idx] * 36.1
                data = data['mt2_bb']  # we are going to scan over mt2_bb in the cut-based strategy
                histo_data.extend(data)
                weight_data.extend(weights)
                weight2_data.extend(weights**2)
                total_pass_raw += data.size
                total_pass_w += np.sum(weights)

    print("Total pass for {} : {} ({})".format(sample.name, total_pass_w, total_pass_raw))
    return lcd, histo_data, weight_data, weight2_data
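
# Illustrative sketch (not part of the analysis code): the 'nn' branch above builds a
# log-ratio discriminant d = log(p_num / p_den) from the network scores and drops
# zero-denominator and non-finite entries before histogramming. The helper name and the
# toy score/weight values below are made up for demonstration only.
def _example_log_ratio_discriminant():
    import numpy as np
    scores = np.array([[0.90, 0.10],
                       [0.50, 0.50],
                       [0.98, 0.00]])          # toy network outputs (num, den)
    weights = np.array([1.2, 0.8, 1.0])        # toy event weights
    num, den = scores[:, 0], scores[:, 1]
    ok = den != 0                              # protect against division by zero
    num, den, weights = num[ok], den[ok], weights[ok]
    d = np.log(num / den)
    finite = (d > -np.inf) & (d < np.inf)      # same finite-value filter as above
    return d[finite], weights[finite]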
def make_plots(args):
    sample = Sample("sample", args.input, "")
    data_scaler, model = load_stored_model(args.nn_dir)

    lwtnn_data = []
    otf_data = []
    weight_data = []
    weight2_data = []

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR "superNt" dataset not found in input file (={})'.format(sample.filename))
            sys.exit(1)
        dataset = sample_file['superNt']
        for chunk in chunk_generator(dataset):
            weights = chunk['eventweight']

            # LWTNN var
            lwtnn_var = chunk['NN_p_hh']

            # OTF
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features, data_scaler.feature_list())
            input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)
            nn_p_hh = scores[:, 0]
            nn_p_tt = scores[:, 1]
            nn_p_wt = scores[:, 2]
            nn_p_zjets = scores[:, 3]
            otf_var = nn_p_hh
            #otf_var = np.log( nn_p_tt / (nn_p_hh + nn_p_wt + nn_p_zjets) )
            #ok_idx = valid_idx(otf_var)
            #otf_var = otf_var[ok_idx]
            #weights = weights[ok_idx]
            #lwtnn_var = lwtnn_var[ok_idx]

            lwtnn_data.extend(lwtnn_var)
            otf_data.extend(otf_var)
            weight_data.extend(weights)
            weight2_data.extend(weights**2)

    ## histos
    bw = 0.05
    bins = np.arange(0, 1 + bw, bw)
    hist_lwtnn, _ = np.histogram(lwtnn_data, bins=bins, weights=weight_data)
    hist_otf, _ = np.histogram(otf_data, bins=bins, weights=weight_data)
    sumw2_hist, _ = np.histogram(lwtnn_data, bins=bins, weights=weight2_data)
    print('lwtnn: {}'.format(hist_lwtnn[:20]))
    print('otf : {}'.format(hist_otf[:20]))

    bin_centers = (bins + 0.5 * bw)[:-1]
    ratio_hist = hist_lwtnn / hist_otf

    fig, ax = plt.subplots(2, 1)
    #ax[0].hist( [otf_data], bins = bins, weights = [weight_data], label = ['otf'], histtype = 'step', color = ['b'] )
    ax[0].hist([lwtnn_data, otf_data],
               bins=bins,
               weights=[weight_data, weight_data],
               label=['lwtnn', 'otf'],
               histtype='step',
               color=['r', 'b'])
    ax[1].plot(bin_centers, ratio_hist, label='lwtnn/otf')
    ax[0].set_ylabel('Entries')
    ax[1].set_xlabel('$hh$ NN score')
    ax[1].set_ylabel('lwtnn compute / keras compute')
    fig.savefig('test_otf_lwtnn_comp.pdf', bbox_inches='tight', dpi=200)
def get_process_inputs(data_scaler):
    # NOTE: 'training_dir' is assumed to be defined at module level elsewhere in this script
    idx = -1
    if training_dir.endswith('/'):
        idx = -2
    input_file_dir = '/'.join(training_dir.split('/')[:idx])
    input_file = '%s/wwbb_preprocessed.h5' % input_file_dir
    if not os.path.isdir(input_file_dir):
        print('ERROR could not locate file dir (={})'.format(input_file_dir))
        sys.exit()
    if not os.path.isfile(input_file):
        print('ERROR could not locate file (={})'.format(input_file))
        sys.exit()

    samples_group_name = 'samples'
    scaling_group_name = 'scaling'
    scaling_data_name = 'scaling_data'

    sample_dict = {}
    with h5py.File(input_file, 'r', libver='latest') as input_file:
        if samples_group_name in input_file:
            sample_group = input_file[samples_group_name]
            n_per_sample = -1
            for p in sample_group:
                if p == 'ttbar' or p == 'hh':
                    sample_dict[p] = {}
                    process_group = sample_group[p]
                    class_label = process_group.attrs['training_label']

                    # get the "test" sample
                    test_sample = Sample(name='%s_test' % p,
                                         class_label=int(class_label),
                                         input_data=floatify(
                                             process_group['validation_features'][tuple(data_scaler.feature_list())],
                                             data_scaler.feature_list()))
                    test_sample.eventweights = floatify(
                        process_group['validation_features'][tuple(['eventweight'])], ['eventweight'])

                    # get the "training" sample (which has our validation data in it)
                    training_data = floatify(
                        process_group['train_features'][tuple(data_scaler.feature_list())],
                        data_scaler.feature_list())
                    training_weights = floatify(
                        process_group['train_features'][tuple(['eventweight'])], ['eventweight'])

                    if p != 'hh':
                        if n_per_sample < 0:
                            print('ERROR Did not get number to split for train/validation from signal')
                            sys.exit()
                    elif p == 'hh':
                        n_per_sample = int(training_data.shape[0])

                    # randomize and split into train/validation subsets
                    randomize = np.arange(len(training_data))
                    np.random.shuffle(randomize)
                    shuffled_training_data = training_data[randomize]
                    shuffled_training_weights = training_weights[randomize]

                    fraction_for_validation = 0.2
                    total_n = len(shuffled_training_data)
                    n_for_validation = int(fraction_for_validation * total_n)
                    split_train_data = shuffled_training_data[n_for_validation:]
                    split_train_weights = shuffled_training_weights[n_for_validation:]
                    split_val_data = shuffled_training_data[:n_for_validation]
                    split_val_weights = shuffled_training_weights[:n_for_validation]

                    train_sample = Sample(name='%s_train' % p,
                                          class_label=int(class_label),
                                          input_data=split_train_data)
                    train_sample.eventweights = split_train_weights

                    val_sample = Sample(name='%s_val' % p,
                                        class_label=int(class_label),
                                        input_data=split_val_data)
                    val_sample.eventweights = split_val_weights

                    print('Loaded sample %s: n train = %d, n val = %d, n_test = %d' %
                          (p, len(train_sample.data()), len(val_sample.data()), len(test_sample.data())))

                    sample_dict[p]['test'] = test_sample
                    sample_dict[p]['train'] = train_sample
                    sample_dict[p]['val'] = val_sample

    return sample_dict
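
# Illustrative sketch (not part of the analysis code): get_process_inputs() shuffles the
# training rows and carves off the first 20% as a validation set. The helper name and the
# toy arrays below are made up for demonstration only.
def _example_train_val_split(fraction_for_validation=0.2):
    import numpy as np
    training_data = np.arange(10, dtype=float).reshape(10, 1)    # toy feature matrix
    training_weights = np.ones(10)                               # toy event weights
    randomize = np.arange(len(training_data))
    np.random.shuffle(randomize)                                 # shuffle row indices in place
    shuffled_data = training_data[randomize]
    shuffled_weights = training_weights[randomize]
    n_val = int(fraction_for_validation * len(shuffled_data))
    # first n_val rows -> validation, remainder -> training (same convention as above)
    return (shuffled_data[n_val:], shuffled_weights[n_val:],
            shuffled_data[:n_val], shuffled_weights[:n_val])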
def get_total_bkg_disc(sample, scaler=None, model=None, add_mt2bb_cut=False):
    lcd = 0.0
    histo_data = []
    weight_data = []
    w2_data = []
    total_read = 0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6:
                break

            weights = chunk['eventweight']
            lcd_idx = (chunk['nBJets'] >= 1)
            weights = weights[lcd_idx] * 36.1
            lcd += np.sum(weights)
            chunk = chunk[lcd_idx]

            if add_mt2bb_cut:
                print('get_total_bkg_disc ADDING mt2_bb cut to selection')
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 100) & (chunk['mbb'] < 140) & (chunk['mt2_bb'] > 50)
            else:
                more_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 100) & (chunk['mbb'] < 140)
            chunk = chunk[more_idx]
            weights = weights[more_idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            num_data = scores[:, 0]
            den_data = scores[:, 1]
            if scores.shape[1] > 2:
                den_data += scores[:, 2]
                den_data += scores[:, 3]
            ok_idx = den_data != 0
            num_data = num_data[ok_idx]
            den_data = den_data[ok_idx]
            weights = weights[ok_idx]

            data = np.log(num_data / den_data)
            ok_idx = (data > -np.inf) & (data < np.inf)
            data = data[ok_idx]
            weights = weights[ok_idx]

            histo_data.extend(data)
            weight_data.extend(weights)
            w2_data.extend(np.power(weights, 2))

    h = Histo(sample.name)
    h.lcd = lcd
    h.weights = weight_data
    h.histo_data = histo_data
    h.sumw2_histo_data = w2_data
    return h
def get_yields(args, kind=''):
    if not kind:
        print('did not provide kind')
        sys.exit()

    filename = {'reco': reco_sig, 'truth': truth_sig}[kind]
    treename = {'reco': 'superNt', 'truth': 'truth'}[kind]
    sample = Sample(kind, filename, '')
    data_scaler, model = load_stored_model(args.nn_dir)

    total_counts_raw = 0
    total_counts_weighted = 0.0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if treename not in sample_file:
            print('ERROR treename (={}) is not found in input file (={})'.format(treename, sample.filename))
            sys.exit()
        dataset = sample_file[treename]
        for chunk in chunk_generator(dataset):
            weights = chunk['eventweight'] * lumi_factor
            if not args.cut_based:
                # calculate the NN discriminant on the fly
                input_features = chunk[data_scaler.feature_list()]
                input_features = floatify(input_features, data_scaler.feature_list())
                input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
                scores = model.predict(input_features)
                nn_p_hh = scores[:, 0]
                nn_p_tt = scores[:, 1]
                nn_p_wt = scores[:, 2]
                nn_p_zjets = scores[:, 3]
                nn_d_hh = np.log(nn_p_hh / (nn_p_tt + nn_p_wt + nn_p_zjets))

                ok_idx = (nn_d_hh > -np.inf) & (nn_d_hh < np.inf)
                weights = weights[ok_idx]
                chunk = chunk[ok_idx]
                nn_d_hh = nn_d_hh[ok_idx]  # keep the discriminant aligned with the filtered chunk

                selection_idx = (chunk['nBJets'] >= 2) & (chunk['mbb'] > 110) & (chunk['mbb'] < 140) & (chunk['mt2_bb'] > 65)
                nn_idx = nn_d_hh > 6.2
                #selection_idx = (chunk['nBJets']>=2) & (chunk['mbb']>100) & (chunk['mbb']<140) & (chunk['mt2_bb']>65)
                #nn_idx = nn_d_hh > 6.3
                print('nn_idx = {}'.format(nn_idx.any()))
                selection_idx = nn_idx & selection_idx
                weights = weights[selection_idx]
                data = chunk[selection_idx]
                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)
            else:
                selection_idx = ((chunk['mll'] > 20.) & (chunk['l1_pt'] > 20.) & (chunk['nBJets'] == 2) &
                                 (chunk['dRll'] < 0.9) & (chunk['HT2Ratio'] > 0.8) & (chunk['mt2_bb'] > 150.) &
                                 (chunk['mbb'] > 100) & (chunk['mbb'] < 140) &
                                 (chunk['mt2_llbb'] > 100.) & (chunk['mt2_llbb'] < 140.))
                weights = weights[selection_idx]
                data = chunk[selection_idx]
                total_counts_raw += data.size
                total_counts_weighted += np.sum(weights)

    print('yields for {}: {} ({})'.format(kind, total_counts_weighted, total_counts_raw))
    return total_counts_weighted, total_counts_raw
def dump_scores(input_file, model, data_scaler, args):
    """
    From the input HDF5 file, go through it and get the NN output
    for the features, storing them to a single output file whose
    name is based on the input filename.

    Args:
        input_file : input filename for HDF5 file to be opened and processed
        model : loaded Keras model
        data_scaler : loaded DataScaler object used to scale the input
            features prior to network evaluation
        args : command line inputs
    """

    outname = input_file.split("/")[-1].replace(".h5", "").replace(".hdf5", "")
    outname += "_scores.h5"
    if args.outdir != "":
        mkdir_p(args.outdir)
        outname = "{}/{}".format(args.outdir, outname)

    #out_ds_created = False
    #out_ds = None
    #gen = chunk_generator(input_file, dataset_name = args.dataset)
    #chunk = next(gen)
    #chunk = chunk[ (chunk['nBJets']==2) ]
    #row_count = chunk.shape[0]
    #weights = chunk['eventweight']
    #input_features = chunk[data_scaler.feature_list()]
    #input_features = floatify(chunk[data_scaler.feature_list()], data_scaler.feature_list())
    #input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
    #scores = model.predict(input_features)
    #n_outputs = scores.shape[1]
    #ds = np.array( list(weights), dtype = [('eventweight', float)] )
    #for io in range(n_outputs) :
    #    ds = recfunctions.append_fields( ds, names = 'nn_score_{}'.format(io), data = scores[:,io], dtypes = float )
    #dtype = ds.dtype
    #row_count = ds.shape[0]

    dataset_id = 0
    with h5py.File(outname, 'w', libver='latest') as outfile:
        for chunk in chunk_generator(input_file, dataset_name=args.dataset):

            # apply the selection here
            chunk = chunk[(chunk['nBJets'] >= 1) & (chunk['HT2Ratio'] > 0.5)]  # & (chunk['mt2_bb'] > 65) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 55) & (chunk['l1_pt']>20.) & (chunk['mll']>20.) ]
            #chunk = chunk[ (chunk['nBJets'] >= 1) & (chunk['mt2_bb'] > 65) & (chunk['HT2Ratio']>0.5) ]
            if chunk.size == 0:
                continue

            weights = chunk['eventweight']
            input_features = chunk[data_scaler.feature_list()]
            input_features = floatify(input_features, data_scaler.feature_list())
            input_features = (input_features - data_scaler.mean()) / data_scaler.scale()
            scores = model.predict(input_features)
            n_outputs = scores.shape[1]
            discriminants = build_discriminant_array(scores, n_outputs)

            ds = np.array(list(weights), dtype=[('eventweight', float)])
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds, names='nn_score_{}'.format(io),
                                                data=scores[:, io], dtypes=np.float64)
            for io in range(n_outputs):
                ds = recfunctions.append_fields(ds, names='nn_disc_{}'.format(io),
                                                data=discriminants[io], dtypes=np.float64)

            maxshape = (None,) + ds.shape[1:]
            dsname = "nn_scores_{}".format(dataset_id)
            out_ds = outfile.create_dataset(dsname, shape=ds.shape, maxshape=maxshape,
                                            chunks=ds.shape, dtype=ds.dtype)
            out_ds[:] = ds
            dataset_id += 1

    print(" > output saved : {}".format(os.path.abspath(outname)))
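
# Illustrative sketch (not part of the analysis code): dump_scores() packs the event weight
# plus one column per network output into a structured array via
# numpy.lib.recfunctions.append_fields and writes it to a new HDF5 dataset. The helper name,
# toy scores, and output filename below are made up for demonstration only; usemask=False is
# chosen here so a plain structured array (rather than a masked array) is handed to h5py.
def _example_dump_scores_structure(outname='example_scores.h5'):
    import numpy as np
    from numpy.lib import recfunctions
    import h5py
    weights = np.array([1.0, 0.5, 2.0])                          # toy event weights
    scores = np.array([[0.7, 0.3], [0.2, 0.8], [0.6, 0.4]])      # toy network outputs
    # one tuple per row for the structured dtype
    ds = np.array([(wgt,) for wgt in weights], dtype=[('eventweight', np.float64)])
    for io in range(scores.shape[1]):
        ds = recfunctions.append_fields(ds, names='nn_score_{}'.format(io),
                                        data=scores[:, io], dtypes=np.float64,
                                        usemask=False)
    with h5py.File(outname, 'w', libver='latest') as outfile:
        out_ds = outfile.create_dataset('nn_scores_0', shape=ds.shape,
                                        maxshape=(None,) + ds.shape[1:],
                                        chunks=ds.shape, dtype=ds.dtype)
        out_ds[:] = ds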
def get_data(sample, scaler, model, to_do):
    data = []
    w = []
    name = ""
    total_read = 0.0

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        if 'hh' in sample.name:
            dataset = dataset[19000:]
        for chunk in chunk_generator(dataset):
            total_read += chunk.size
            if total_read > 1e6:
                break
            print('{} > {}'.format(sample.name, total_read))

            chunk = chunk[(chunk['nBJets'] >= 1)]
            weights = chunk['eventweight'] * 36.1

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)

            #to_do = "hh"
            p_hh = scores[:, 0]
            p_tt = scores[:, 1]
            p_wt = scores[:, 2]
            p_z = scores[:, 3]

            hist_data = None
            if 'd_' in to_do:
                num_data = None
                den_data = None
                if to_do == 'd_hh':
                    num_data = p_hh
                    den_data = (p_tt + p_wt + p_z)
                elif to_do == 'd_tt':
                    num_data = p_tt
                    den_data = (p_hh + p_wt + p_z)
                elif to_do == 'd_wt':
                    num_data = p_wt
                    den_data = (p_hh + p_tt + p_z)
                elif to_do == 'd_z':
                    num_data = p_z
                    den_data = (p_hh + p_tt + p_wt)
                d = np.log(num_data / den_data)
                idx = (d > -np.inf) & (d < np.inf)
                d = d[idx]
                weights = weights[idx]
                hist_data = d[:]
            else:
                hist_data = {"hh": p_hh, "tt": p_tt, "wt": p_wt, "z": p_z}[to_do]

            name = {"hh": "$p_{hh}$",
                    "tt": "$p_{t\\bar{t}}$",
                    "wt": "$p_{Wt}$",
                    "z": "$p_{Z}$"}[to_do.replace('d_', '')]
            if 'd_' in to_do:
                name = name.replace('p_', 'd_')

            data.extend(hist_data)
            w.extend(weights)

    return data, w, name
def make_plot(var_dict, sample, scaler, model, args):
    x_data = []
    y_data = []
    w_data = []
    total_read = 0

    disc_for_x = args.varX

    with h5py.File(sample.filename, 'r', libver='latest') as sample_file:
        if 'superNt' not in sample_file:
            print('ERROR superNt dataset not found in input file (={})'.format(sample.filename))
            sys.exit()
        dataset = sample_file['superNt']
        is_first = True
        if 'data' in sample.filename:
            is_first = False
        for chunk in chunk_generator(dataset, chunksize=19000):
            if is_first:
                is_first = False
                continue
            total_read += chunk.size
            if total_read > 1e6:
                break

            idx = (chunk['nBJets'] >= 2)
            weights = chunk['eventweight'][idx]  # per-event weights for the selected rows
            chunk = chunk[idx]

            input_features = chunk[scaler.feature_list()]
            input_features = floatify(input_features, scaler.feature_list())
            input_features = (input_features - scaler.mean()) / scaler.scale()
            scores = model.predict(input_features)
            p_hh = scores[:, 0]
            p_tt = scores[:, 1]
            p_wt = scores[:, 2]
            p_z = scores[:, 3]

            i_x_data = p_hh
            if 'p_' in disc_for_x:
                i_x_data = {'p_hh': p_hh, 'p_tt': p_tt, 'p_wt': p_wt, 'p_z': p_z}[disc_for_x]
            elif 'd_' in disc_for_x:
                num_data = {'d_hh': p_hh, 'd_tt': p_tt, 'd_wt': p_wt, 'd_z': p_z}[disc_for_x]
                den_data = {'d_hh': (p_tt + p_wt + p_z),
                            'd_tt': (p_wt + p_hh + p_z),
                            'd_wt': (p_hh + p_tt + p_z),
                            'd_z': (p_tt + p_wt + p_hh)}[disc_for_x]
                disc = np.log(num_data / den_data)
                idx = valid_idx(disc)
                p_hh = p_hh[idx]
                p_tt = p_tt[idx]
                p_wt = p_wt[idx]
                p_z = p_z[idx]
                disc = disc[idx]
                weights = weights[idx]
                chunk = chunk[idx]
                i_x_data = disc

            i_y_data = None
            if 'p_' in args.varY or 'd_' in args.varY:
                if 'p_' in args.varY:
                    i_y_data = {'p_hh': p_hh, 'p_tt': p_tt, 'p_wt': p_wt, 'p_z': p_z}[args.varY]
                elif 'd_' in args.varY:
                    num_data = {'d_hh': p_hh, 'd_tt': p_tt, 'd_wt': p_wt, 'd_z': p_z}[args.varY]
                    den_data = {'d_hh': (p_tt + p_wt + p_z),
                                'd_tt': (p_wt + p_hh + p_z),
                                'd_wt': (p_hh + p_tt + p_z),
                                'd_z': (p_tt + p_wt + p_hh)}[args.varY]
                    y_disc = np.log(num_data / den_data)
                    #idx = valid_idx(y_disc)
                    #y_disc = y_disc[idx]
                    #weights = weights[idx]
                    #chunk = chunk[idx]
                    i_y_data = y_disc
            else:
                i_y_data = chunk[args.varY]

            x_data.extend(list(i_x_data))
            y_data.extend(list(i_y_data))
            w_data.extend(list(weights))

    x_data = np.array(x_data)
    y_data = np.array(y_data)
    w_data = np.array(w_data)

    fig, ax = plt.subplots(1, 1)
    ax.grid(color='k', which='both', linestyle='-', lw=0.5, alpha=0.1)
    ax.tick_params(axis='both', which='both', direction='in',
                   labelleft=True, bottom=True, top=True, right=True, left=True)

    var_dict = all_vars()
    x_bounds = var_dict[disc_for_x]['bounds']
    y_bounds = var_dict[args.varY]['bounds']
    x_label = var_dict[disc_for_x]['name']
    y_label = var_dict[args.varY]['name']
    x_edges = np.arange(x_bounds[1], x_bounds[2] + x_bounds[0], x_bounds[0])
    y_edges = np.arange(y_bounds[1], y_bounds[2] + y_bounds[0], y_bounds[0])
    bins = [x_edges, y_edges]

    ax.set_xlabel(x_label, horizontalalignment='right', x=1)
    ax.set_ylabel(y_label, horizontalalignment='right', y=1)

    print('x_data shape = {}'.format(x_data.shape))
    print('y_data shape = {}'.format(y_data.shape))

    h, x, y = np.histogram2d(x_data, y_data, bins=bins, normed=False)
    #integral = h.sum()
    #h = h / integral

    imextent = list((min(x_edges), max(x_edges))) + list((min(y_edges), max(y_edges)))
    ax.set_facecolor('lightgrey')
    h = h.T
    im = ax.imshow(h, origin='lower', cmap='coolwarm', aspect='auto',
                   interpolation='nearest', extent=imextent, norm=LogNorm())
    ax.contour(h, levels=[1, 3, 10], colors='black', extent=imextent)
    cb = fig.colorbar(im)

    process = ''
    if 'wt' in sample.filename:
        process = 'wt'
    elif '123456' in sample.filename:
        process = 'hh'
    elif 'ttbar' in sample.filename or '410009' in sample.filename:
        process = 'ttbar'
    elif 'zll' in sample.filename or 'zjets' in sample.filename:
        process = 'zjets'
    elif 'ztt' in sample.filename:
        process = 'zjets_tt'
    elif 'data' in sample.filename:
        process = 'data'
    ax.text(0.85, 0.93, process, weight='bold', transform=ax.transAxes)

    outname = './plots_input_output/input_output_2D_{}_{}_{}.pdf'.format(process, disc_for_x, args.varY)
    print(' >> saving plot to: {}'.format(os.path.abspath(outname)))
    fig.savefig(outname, bbox_inches='tight', dpi=200)
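
# Illustrative sketch (not part of the analysis code): make_plot() bins the two chosen
# quantities with np.histogram2d, transposes the counts, and draws them with imshow on a
# log color scale. The helper name, toy data, and output filename below are made up for
# demonstration only.
def _example_2d_histogram_plot(outname='example_2d.pdf'):
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.colors import LogNorm
    rng = np.random.RandomState(1)
    x = rng.normal(0.0, 1.0, 5000)      # toy x-axis quantity
    y = rng.normal(0.0, 2.0, 5000)      # toy y-axis quantity
    x_edges = np.arange(-4, 4.25, 0.25)
    y_edges = np.arange(-8, 8.5, 0.5)
    h, _, _ = np.histogram2d(x, y, bins=[x_edges, y_edges])
    fig, ax = plt.subplots(1, 1)
    imextent = [x_edges[0], x_edges[-1], y_edges[0], y_edges[-1]]
    # histogram2d returns counts indexed as [x, y]; imshow expects [row, column] = [y, x]
    im = ax.imshow(h.T, origin='lower', cmap='coolwarm', aspect='auto',
                   interpolation='nearest', extent=imextent, norm=LogNorm())
    fig.colorbar(im)
    fig.savefig(outname, bbox_inches='tight', dpi=200)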