def PFN_AUC_calculation(jet_array_1, jet_array_2, train_size, test_size, *,
                        num_epoch=10, batch_size=500, use_pids=True,
                        Phi_sizes=(100, 100, 128), F_sizes=(100, 100, 100)):
    """Train a PFN classifier on two jet samples and return its test ROC AUC.

    The two samples are stacked (``jet_array_1`` labelled 1, ``jet_array_2``
    labelled 0), preprocessed, split into train/validation/test sets, and a
    fresh ``PFN`` is trained on them. The AUC is computed on the test set
    from the predicted score of class 1.

    Args:
        jet_array_1: Jets labelled as class 1. Indexed as
            ``[jet, particle, feature]``; only the first four features are
            used. Feature 0 is treated as pT (mask and weights), features
            1-2 are centered per jet, and feature 3 is passed to
            ``remap_pids`` as the particle-id channel.
        jet_array_2: Jets labelled as class 0, same layout.
        train_size: Number of jets intended for training. It is only used
            to derive the validation size; ``data_split`` receives the
            validation and test sizes. NOTE(review): presumably the
            remainder becomes the training set - confirm against
            ``data_split``.
        test_size: Number of jets held out for the AUC evaluation.
        num_epoch: Number of training epochs.
        batch_size: Training batch size.
        use_pids: If true, remap the particle-id channel with
            ``remap_pids``; otherwise drop it and train on three features.
        Phi_sizes: Layer sizes forwarded to ``PFN`` as ``Phi_sizes``.
        F_sizes: Layer sizes forwarded to ``PFN`` as ``F_sizes``.

    Returns:
        The area under the ROC curve on the test split.

    Raises:
        ValueError: If ``train_size + test_size`` exceeds the total number
            of jets, which would give a negative validation size.
    """
    # np.concatenate allocates a new array, so the in-place preprocessing
    # below never modifies the caller's inputs.
    X = np.concatenate([jet_array_1, jet_array_2])[:, :, :4]
    y = np.concatenate([np.ones(len(jet_array_1)),
                        np.zeros(len(jet_array_2))])

    # Whatever is not claimed by train/test becomes the validation set.
    val = X.shape[0] - train_size - test_size
    if val < 0:
        raise ValueError(
            "train_size + test_size (%d) exceeds the number of jets (%d)"
            % (train_size + test_size, X.shape[0])
        )

    # convert labels to categorical
    Y = to_categorical(y, num_classes=2)

    # preprocess by centering jets and normalizing pts
    for x in X:
        mask = x[:, 0] > 0
        # A jet without any positive-pT particle has zero total weight and
        # np.average would raise ZeroDivisionError; leave it untouched.
        if not mask.any():
            continue
        yphi_avg = np.average(x[mask, 1:3], weights=x[mask, 0], axis=0)
        x[mask, 1:3] -= yphi_avg
        x[mask, 0] /= x[:, 0].sum()

    # handle particle id channel
    if use_pids:
        remap_pids(X, pid_i=3)
    else:
        X = X[:, :, :3]

    # do train/val/test split
    (X_train, X_val, X_test,
     Y_train, Y_val, Y_test) = data_split(X, Y, val=val, test=test_size)

    # build architecture (construction output is silenced)
    with suppress_stdout():
        pfn = PFN(input_dim=X.shape[-1], Phi_sizes=Phi_sizes, F_sizes=F_sizes)

    # train model
    pfn.fit(X_train, Y_train,
            epochs=num_epoch,
            batch_size=batch_size,
            validation_data=(X_val, Y_val),
            verbose=0)

    # get predictions on test data
    preds = pfn.predict(X_test, batch_size=1000)

    # get area under the ROC curve
    return roc_auc_score(Y_test[:, 1], preds[:, 1])
# convert labels to categorical
Y = to_categorical(y, num_classes=2)

print('Loaded quark and gluon jets')

# preprocess by centering jets and normalizing pts
for x in X:
    mask = x[:, 0] > 0
    # A jet without any positive-pT particle has zero total weight and
    # np.average would raise ZeroDivisionError; leave it untouched.
    if not mask.any():
        continue
    yphi_avg = np.average(x[mask, 1:3], weights=x[mask, 0], axis=0)
    x[mask, 1:3] -= yphi_avg
    x[mask, 0] /= x[:, 0].sum()

# handle particle id channel
if use_pids:
    remap_pids(X, pid_i=3)
else:
    X = X[:, :, :3]

print('Finished preprocessing')

# do train/val/test split
(X_train, X_val, X_test,
 Y_train, Y_val, Y_test) = data_split(X, Y, val=val, test=test)

print('Done train/val/test split')
print('Model summary:')

# build architecture
def load_data(cache_dir, pt_lower, pt_upper, eta, quality, pad, x_dim=3,
              momentum_scale=250, n=100000, amount=1,
              max_particle_select=None, frac=1.0, return_pfcs=True):
    """Load simulated jets via ``ef.mod.load`` and build model-ready arrays.

    Jets are selected with cuts on generated pT, |eta| and quality, randomly
    sub-sampled, and trimmed to at most ``n`` jets. Optionally a zero-padded
    per-jet particle array is built and centered.

    Args:
        cache_dir: Forwarded to ``ef.mod.load`` as ``cache_dir``.
        pt_lower: Lower bound of the ``gen_jet_pts`` selection (inclusive).
        pt_upper: Upper bound of the ``gen_jet_pts`` selection (inclusive).
        eta: Upper bound (exclusive) on ``abs_jet_eta``.
        quality: Minimum value of the ``quality`` selection.
        pad: Number of particle slots per jet in the particle array; jets
            with more particles keep only the ``pad`` largest by column 0.
        x_dim: Number of features per particle. With ``x_dim == 4`` the
            fourth feature is taken from particle column 4 (PID) and
            remapped with ``remap_pids``.
        momentum_scale: Divisor applied to jet and particle momenta.
        n: Maximum number of jets kept after shuffling.
        amount: Forwarded to ``ef.mod.load`` as ``amount``.
        max_particle_select: If not ``None``, keep only jets with strictly
            fewer particles than this.
        frac: Fraction of the loaded jets to sample (without replacement)
            before trimming to ``n``.
        return_pfcs: If true, fill and return the particle array.

    Returns:
        If ``return_pfcs`` is true: ``(X, dataset, Y, C, particle_counts)``;
        otherwise ``(X, Y, C, particle_counts)``.

        * ``X``: float32, shape ``(jets, 3)`` - scaled ``jet_pt``,
          ``jet_eta``, ``jet_phi``.
        * ``dataset``: float32, shape ``(jets, pad, x_dim)`` - particles
          sorted by descending column 0, zero-padded, columns 1-2 centered
          on their column-0-weighted mean.
        * ``Y``: float32, shape ``(jets, 1)`` - scaled ``gen_jet_pt``.
        * ``C``: the ``jec`` column of ``sim.jets_f``.
        * ``particle_counts``: number of particles per jet before padding
          or truncation. It is populated regardless of ``return_pfcs`` so
          that ``max_particle_select`` works in both modes.

    Note:
        Uses NumPy's global RNG (one ``np.random.choice`` call) for the
        shuffle; seed it beforehand for reproducible output.
    """
    # Load data
    specs = [
        f'{pt_lower} <= gen_jet_pts <= {pt_upper}',
        f'abs_jet_eta < {eta}',
        f'quality >= {quality}',
    ]
    sim = ef.mod.load(*specs, cache_dir=cache_dir, dataset='sim',
                      amount=amount)
    jets = sim.jets_f
    n_jets = jets.shape[0]

    # Gen_pt for Y
    Y = np.zeros((n_jets, 1), dtype=np.float32)
    Y[:, 0] = jets[:, sim.gen_jet_pt] / momentum_scale

    # Sim_pt for X
    X = np.zeros((n_jets, 3), dtype=np.float32)
    X[:, 0] = jets[:, sim.jet_pt] / momentum_scale
    X[:, 1] = jets[:, sim.jet_eta]
    X[:, 2] = jets[:, sim.jet_phi]

    # CMS JEC's
    C = jets[:, sim.jec]

    # PFC's
    pfcs = sim.particles

    # Shuffle and trim in one step: indexing with the first n shuffled
    # indices equals shuffling everything and then slicing, without the
    # full-size intermediate copies.
    keep = np.random.choice(np.arange(pfcs.shape[0]),
                            size=int(pfcs.shape[0] * frac),
                            replace=False)[:n]
    pfcs = pfcs[keep]
    Y = Y[keep]
    X = X[keep]
    C = C[keep]

    # Always record the true particle multiplicities so the optional
    # max_particle_select filter also works when return_pfcs is False.
    particle_counts = np.array([jet.shape[0] for jet in pfcs], dtype=int)

    dataset = np.zeros((pfcs.shape[0], pad, x_dim), dtype=np.float32)
    if return_pfcs:
        for i, jet in enumerate(pfcs):
            size = min(jet.shape[0], pad)
            # Sorting on the negated column gives descending order.
            top = (-jet[:, 0]).argsort()[:size]
            dataset[i, :size, 0] = jet[top, 0] / momentum_scale
            dataset[i, :size, 1] = jet[top, 1]
            dataset[i, :size, 2] = jet[top, 2]
            if x_dim == 4:
                dataset[i, :size, 3] = jet[top, 4]  # PID

        if x_dim == 4:
            remap_pids(dataset, pid_i=3, error_on_unknown=False)

        # Center columns 1-2 of every jet on their column-0-weighted mean.
        for x in dataset:
            mask = x[:, 0] > 0
            # np.average raises ZeroDivisionError when the weights sum to
            # zero, so jets without positive entries are left untouched.
            if not mask.any():
                continue
            yphi_avg = np.average(x[mask, 1:3], weights=x[mask, 0], axis=0)
            x[mask, 1:3] -= yphi_avg

    # Optional multiplicity cut
    if max_particle_select is not None:
        selected = particle_counts < max_particle_select
        dataset = dataset[selected]
        Y = Y[selected]
        X = X[selected]
        C = C[selected]
        particle_counts = particle_counts[selected]

    print("X: ", X.shape, X.dtype)
    print("Y: ", Y.shape, Y.dtype)
    print("PFCs: ", dataset.shape, dataset.dtype)

    if not return_pfcs:
        return X, Y, C, particle_counts

    # default=0 keeps the summary line from failing on an empty selection.
    print("Max # of particles: %d" % max(particle_counts, default=0))
    return X, dataset, Y, C, particle_counts