import math

import torch
from torch.utils.data import random_split
from torch_geometric.data import DataListLoader

# GraphDataset, batch_size, and collate are assumed to come from the
# surrounding module

def get_data():
    # load the merged graph dataset and split it 80/10/10 into train/valid/test
    gdata = GraphDataset(root='/anomalyvol/data/gnn_node_global_merge', bb=0)
    fulllen = len(gdata)
    tv_frac = 0.10
    tv_num = math.ceil(fulllen * tv_frac)
    torch.manual_seed(0)  # lock the seed so the split is reproducible
    train_dataset, valid_dataset, test_dataset = random_split(
        gdata, [fulllen - 2 * tv_num, tv_num, tv_num])

    train_loader = DataListLoader(train_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=True)
    train_loader.collate_fn = collate
    valid_loader = DataListLoader(valid_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=False)
    valid_loader.collate_fn = collate
    test_loader = DataListLoader(test_dataset, batch_size=batch_size,
                                 pin_memory=True, shuffle=False)
    test_loader.collate_fn = collate

    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)
    test_samples = len(test_dataset)
    return (train_loader, valid_loader, test_loader,
            train_samples, valid_samples, test_samples)
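# Illustrative call site (a sketch, not from the original; it relies on the
# same module-level batch_size and collate that get_data() itself uses):
if __name__ == '__main__':
    (train_loader, valid_loader, test_loader,
     train_samples, valid_samples, test_samples) = get_data()
    print('samples: train=%d valid=%d test=%d'
          % (train_samples, valid_samples, test_samples))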
def from_data_to_loader(full_dataset, n_train, n_val, batch_size):
    # np, torch, Batch, DataListLoader, and the multi_gpu flag are assumed to
    # come from the surrounding module
    train_dataset = torch.utils.data.Subset(full_dataset,
                                            np.arange(start=0, stop=n_train))
    valid_dataset = torch.utils.data.Subset(
        full_dataset, np.arange(start=n_train, stop=n_train + n_val))

    # preprocess train_dataset into a format that passes correct batches of
    # events to the GNN: flatten the per-file lists, then wrap each event in
    # its own single-element list
    train_dataset_batched = []
    for i in range(len(train_dataset)):
        train_dataset_batched += train_dataset[i]
    train_dataset_batched = [[i] for i in train_dataset_batched]

    # same preprocessing for valid_dataset
    valid_dataset_batched = []
    for i in range(len(valid_dataset)):
        valid_dataset_batched += valid_dataset[i]
    valid_dataset_batched = [[i] for i in valid_dataset_batched]

    # hack for multi-GPU training: DataParallel expects a plain list of Data
    # objects, while single-GPU training takes one collated Batch
    if not multi_gpu:
        def collate(items):
            l = sum(items, [])
            return Batch.from_data_list(l)
    else:
        def collate(items):
            l = sum(items, [])
            return l

    train_loader = DataListLoader(train_dataset_batched, batch_size,
                                  pin_memory=True, shuffle=True)
    train_loader.collate_fn = collate
    valid_loader = DataListLoader(valid_dataset_batched, batch_size,
                                  pin_memory=True, shuffle=False)
    valid_loader.collate_fn = collate
    return train_loader, valid_loader
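# Minimal self-contained sketch (not part of the original) of what the collate
# functions above do: sum(items, []) concatenates a list of per-event lists
# into one flat list, which is then (optionally) turned into a Batch.
items = [['event0'], ['event1'], ['event2', 'event3']]  # hypothetical batch
flat = sum(items, [])
assert flat == ['event0', 'event1', 'event2', 'event3']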
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

def collate(items):
    # collate function for the data loaders (flattens a list of lists)
    l = sum(items, [])
    return Batch.from_data_list(l)

# train / valid / test split
torch.manual_seed(0)  # lock seed for random_split
train_dataset, valid_dataset, test_dataset = random_split(
    gdata, [fulllen - 2 * tv_num, tv_num, tv_num])

# the sparse-loss and VAE models train on single Batch objects (DataLoader);
# everything else uses DataListLoader with the list-flattening collate above
if not use_sparseloss and not use_vae:
    train_loader = DataListLoader(train_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=True)
    train_loader.collate_fn = collate
    valid_loader = DataListLoader(valid_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=False)
    valid_loader.collate_fn = collate
    test_loader = DataListLoader(test_dataset, batch_size=batch_size,
                                 pin_memory=True, shuffle=False)
    test_loader.collate_fn = collate
else:
    train_loader = DataLoader(train_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=True)
    valid_loader = DataLoader(valid_dataset, batch_size=batch_size,
                              pin_memory=True, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             pin_memory=True, shuffle=False)

train_samples = len(train_dataset)
valid_samples = len(valid_dataset)
test_samples = len(test_dataset)

# load in model
modpath = osp.join('/anomalyvol/models/', model_fname + '.best.pth')
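# Standalone sketch (not from the original) of the distinction the branch
# above hinges on, using tiny random graphs and the pre-2.0 PyG import layout
# used elsewhere in this file: DataLoader collates Data objects into a single
# Batch, while DataListLoader yields a plain Python list of Data objects (the
# format DataParallel expects).
import torch
from torch_geometric.data import Data, DataLoader, DataListLoader

toy = [Data(x=torch.randn(3, 4)) for _ in range(8)]        # hypothetical graphs
batch = next(iter(DataLoader(toy, batch_size=4)))          # one Batch of 4 graphs
data_list = next(iter(DataListLoader(toy, batch_size=4)))  # list of 4 Data objects
print(batch.num_graphs, len(data_list))                    # -> 4 4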
val_dataset = torch.utils.data.Subset(
    full_dataset, np.arange(start=args.n_train, stop=args.n_train + args.n_val))
print("train_dataset", len(train_dataset))
print("val_dataset", len(val_dataset))

# hack for multi-GPU training: DataParallel expects a plain list of Data
# objects, while single-GPU training takes one collated Batch
if not multi_gpu:
    def collate(items):
        l = sum(items, [])
        return Batch.from_data_list(l)
else:
    def collate(items):
        l = sum(items, [])
        return l

train_loader = DataListLoader(train_dataset, batch_size=args.batch_size,
                              pin_memory=False, shuffle=False)
train_loader.collate_fn = collate
val_loader = DataListLoader(val_dataset, batch_size=args.batch_size,
                            pin_memory=False, shuffle=False)
val_loader.collate_fn = collate

model_class = model_classes[args.model]
model_kwargs = {'input_dim': input_dim,
                'hidden_dim': args.hidden_dim,
                'encoding_dim': args.encoding_dim,
                'output_dim_id': output_dim_id,
                'output_dim_p4': output_dim_p4,
                'dropout_rate': args.dropout,
                'convlayer': args.convlayer,
                'convlayer2': args.convlayer2,
                'radius': args.radius,
                'space_dim': args.space_dim,
                'activation': args.activation}
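# Hedged aside (not in the excerpt above): model_kwargs is later unpacked
# straight into the model constructor, so each key must match an __init__
# parameter of model_class, e.g.
#   model = model_class(**model_kwargs).to(device)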
def create_models(features, spectators, labels, nfeatures, nspectators,
                  nlabels, ntracks, train_files, test_files, val_files,
                  batch_size, remove_mass_pt_window, remove_unlabeled,
                  max_entry):
    # imports
    from tensorflow.keras.models import Model
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
    from tensorflow.keras.layers import Input, Dense, BatchNormalization, Conv1D, Flatten, Lambda

    # data generators for use in model training and testing
    train_generator = DataGenerator(train_files, features, labels, spectators,
                                    batch_size=batch_size, n_dim=ntracks,
                                    remove_mass_pt_window=remove_mass_pt_window,
                                    remove_unlabeled=remove_unlabeled,
                                    max_entry=max_entry)
    val_generator = DataGenerator(val_files, features, labels, spectators,
                                  batch_size=batch_size, n_dim=ntracks,
                                  remove_mass_pt_window=remove_mass_pt_window,
                                  remove_unlabeled=remove_unlabeled,
                                  max_entry=max_entry)
    test_generator = DataGenerator(test_files, features, labels, spectators,
                                   batch_size=batch_size, n_dim=ntracks,
                                   remove_mass_pt_window=remove_mass_pt_window,
                                   remove_unlabeled=remove_unlabeled,
                                   max_entry=max_entry)

    # per-class weights for training
    training_weights = {0: 3.479, 1: 4.002, 2: 3.246, 3: 2.173, 4: 0.253, 5: 1.360}

    # FULLY CONNECTED NEURAL NET CLASSIFIER
    # define dense keras model
    inputs = Input(shape=(ntracks, nfeatures,), name='input')
    x = BatchNormalization(name='bn_1')(inputs)
    x = Flatten(name='flatten_1')(x)
    x = Dense(64, name='dense_1', activation='relu')(x)
    x = Dense(32, name='dense_2', activation='relu')(x)
    x = Dense(32, name='dense_3', activation='relu')(x)
    outputs = Dense(nlabels, name='output', activation='softmax')(x)
    keras_model_dense = Model(inputs=inputs, outputs=outputs)
    keras_model_dense.compile(optimizer='adam', loss='categorical_crossentropy',
                              metrics=['accuracy'])
    keras_model_dense.summary()

    # define callbacks
    early_stopping = EarlyStopping(monitor='val_loss', patience=35)
    reduce_lr = ReduceLROnPlateau(patience=5, factor=0.5)
    model_checkpoint = ModelCheckpoint('keras_model_dense_best.h5',
                                       monitor='val_loss', save_best_only=True)
    callbacks = [early_stopping, model_checkpoint, reduce_lr]

    # fit keras model
    history_dense = keras_model_dense.fit_generator(train_generator,
                                                    validation_data=val_generator,
                                                    steps_per_epoch=len(train_generator),
                                                    validation_steps=len(val_generator),
                                                    max_queue_size=5,
                                                    epochs=50,
                                                    class_weight=training_weights,
                                                    shuffle=False,
                                                    callbacks=callbacks,
                                                    verbose=0)

    # reload best weights
    keras_model_dense.load_weights('keras_model_dense_best.h5')
    visualize_loss(history_dense)
    visualize('fcnn_loss.png')

    from tensorflow.keras.layers import GlobalAveragePooling1D

    # define Deep Sets model with Conv1D Keras layers
    inputs = Input(shape=(ntracks, nfeatures,), name='input')
    x = BatchNormalization(name='bn_1')(inputs)
    x = Conv1D(64, 1, strides=1, padding='same', name='conv1d_1', activation='relu')(x)
    x = Conv1D(32, 1, strides=1, padding='same', name='conv1d_2', activation='relu')(x)
    x = Conv1D(32, 1, strides=1, padding='same', name='conv1d_3', activation='relu')(x)
    # average over tracks
    x = GlobalAveragePooling1D(name='pool_1')(x)
    x = Dense(100, name='dense_1', activation='relu')(x)
    outputs = Dense(nlabels, name='output', activation='softmax')(x)
    keras_model_conv1d = Model(inputs=inputs, outputs=outputs)
    keras_model_conv1d.compile(optimizer='adam', loss='categorical_crossentropy',
                               metrics=['accuracy'])
    keras_model_conv1d.summary()
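    # Aside (a sketch, not from the original script): a Conv1D with
    # kernel_size=1, as used above, applies the same dense map to every track
    # independently, so the stack stays permutation-equivariant until pooling.
    # Quick shape check with hypothetical sizes:
    import tensorflow as tf
    _demo = Conv1D(4, 1)(tf.random.normal((2, 10, 8)))  # (batch, tracks, feats)
    assert _demo.shape == (2, 10, 4)  # per-track map shared, new feature dim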
    # define callbacks
    from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler

    early_stopping = EarlyStopping(monitor='val_loss', patience=35)

    # define a learning-rate decay schedule: lr is divided by (1 + decay * epoch)
    num_epochs = 100
    initial_learning_rate = 0.01
    decay = initial_learning_rate / num_epochs
    learn_rate_decay = lambda epoch, lr: lr / (1 + decay * epoch)
    reduce_lr = LearningRateScheduler(learn_rate_decay)
    # the plateau-based schedule is the one actually passed to the callbacks
    reduce_lr2 = ReduceLROnPlateau(patience=5, factor=0.5)

    model_checkpoint = ModelCheckpoint('keras_model_conv1d_best.h5',
                                       monitor='val_loss', save_best_only=True)
    callbacks = [early_stopping, model_checkpoint, reduce_lr2]

    # fit keras model (training_weights as defined above)
    history_conv1d = keras_model_conv1d.fit_generator(train_generator,
                                                      validation_data=val_generator,
                                                      steps_per_epoch=len(train_generator),
                                                      validation_steps=len(val_generator),
                                                      max_queue_size=5,
                                                      epochs=num_epochs,
                                                      class_weight=training_weights,
                                                      shuffle=False,
                                                      callbacks=callbacks,
                                                      verbose=0)

    # reload best weights
    keras_model_conv1d.load_weights('keras_model_conv1d_best.h5')
    visualize_loss(history_conv1d)
    visualize('conv1d_loss.png')

    # GNN START
    # load data
    graph_dataset = GraphDataset('gdata_train', features, labels, spectators,
                                 n_events=1000, n_events_merge=1,
                                 file_names=train_files)
    graph_dataset.process()

    # train/validation split
    from torch_geometric.data import Data, DataListLoader, Batch
    from torch.utils.data import random_split

    torch.manual_seed(0)
    valid_frac = 0.20
    full_length = len(graph_dataset)
    valid_num = int(valid_frac * full_length)
    batch_size = 32

    train_dataset, valid_dataset = random_split(
        graph_dataset, [full_length - valid_num, valid_num])

    def collate(items):
        # flatten a list of per-event lists into one Batch (same collate as
        # used elsewhere in this repository)
        l = sum(items, [])
        return Batch.from_data_list(l)

    train_loader = DataListLoader(train_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=True)
    train_loader.collate_fn = collate
    valid_loader = DataListLoader(valid_dataset, batch_size=batch_size,
                                  pin_memory=True, shuffle=False)
    valid_loader.collate_fn = collate
    train_samples = len(train_dataset)
    valid_samples = len(valid_dataset)

    # create gnn model
    import torch.nn as nn
    import torch.nn.functional as F
    import torch_geometric.transforms as T
    from torch_geometric.nn import EdgeConv, global_mean_pool, MetaLayer
    from torch.nn import Sequential as Seq, Linear as Lin, ReLU, BatchNorm1d
    from torch_scatter import scatter_mean

    model = InteractionNetwork().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    import os.path as osp
    n_epochs = 20
    stale_epochs = 0
    best_valid_loss = 99999
    patience = 5
    t = tqdm(range(0, n_epochs))

    # calculate class weights: wi = (# jets) / (# classes * # jets in class)
    # (see the numeric sketch after the training loop below)
    s = None
    for data in graph_dataset:
        d = data[0].y[0]
        if s is None:
            s = d
        else:
            s = s + d
    weights = []
    for w in s:
        den = w.item() * 6
        num = sum(s).item()
        weights += [num / den]

    for epoch in t:
        loss = train(model, optimizer, train_loader, train_samples, batch_size,
                     leave=bool(epoch == n_epochs - 1), weights=weights)
        valid_loss = test(model, valid_loader, valid_samples, batch_size,
                          leave=bool(epoch == n_epochs - 1))
        print('Epoch: {:02d}, Training Loss:   {:.4f}'.format(epoch, loss))
        print('           Validation Loss: {:.4f}'.format(valid_loss))

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            modpath = osp.join('interactionnetwork_best.pth')
            print('New best model saved to:', modpath)
            torch.save(model.state_dict(), modpath)
            stale_epochs = 0
        else:
            print('Stale epoch')
            stale_epochs += 1
        if stale_epochs >= patience:
            print('Early stopping after %i stale epochs' % patience)
            break
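    # Numeric sanity check (a sketch with hypothetical per-class jet counts,
    # not part of the original) of the class-weight formula used above,
    # wi = (# jets) / (# classes * # jets in class):
    _counts = [100, 87, 107, 160, 1374, 256]  # hypothetical n_i, 6 classes
    _total = sum(_counts)
    _weights = [_total / (6 * n) for n in _counts]  # rare classes get larger wi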
    # load test data
    test_dataset = GraphDataset('data', features, labels, spectators,
                                n_events=1000, n_events_merge=1,
                                file_names=test_files)
    test_dataset.process()
    test_loader = DataListLoader(test_dataset, batch_size=batch_size,
                                 pin_memory=True, shuffle=False)
    test_loader.collate_fn = collate
    test_samples = len(test_dataset)

    # model evaluation
    model.eval()
    t = tqdm(enumerate(test_loader), total=len(test_loader))
    y_test = []
    y_predict = []
    for i, data in t:
        data = data.to(device)
        batch_output = model(data.x, data.edge_index, data.batch)
        y_predict.append(batch_output.detach().cpu().numpy())
        y_test.append(data.y.cpu().numpy())
    y_test = np.concatenate(y_test)
    y_predict = np.concatenate(y_predict)
    # GNN END

    # COMPARING MODELS
    predict_array_dnn = []
    predict_array_cnn = []
    label_array_test = []
    for item in test_generator:
        label_array_test.append(item[1])
        predict_array_dnn.append(keras_model_dense.predict(item[0]))
        predict_array_cnn.append(keras_model_conv1d.predict(item[0]))
    predict_array_dnn = np.concatenate(predict_array_dnn, axis=0)
    predict_array_cnn = np.concatenate(predict_array_cnn, axis=0)
    label_array_test = np.concatenate(label_array_test, axis=0)

    fpr_dnn, tpr_dnn = [], []
    fpr_cnn, tpr_cnn = [], []
    fpr_gnn, tpr_gnn = [], []
    # create ROC curves for each class
    for i in range(nlabels):
        t_fpr_d, t_tpr_d, thresh_d = roc_curve(label_array_test[:, i], predict_array_dnn[:, i])
        t_fpr_c, t_tpr_c, thresh_c = roc_curve(label_array_test[:, i], predict_array_cnn[:, i])
        t_fpr_g, t_tpr_g, thresh_g = roc_curve(y_test[:, i], y_predict[:, i])
        fpr_dnn.append(t_fpr_d)
        tpr_dnn.append(t_tpr_d)
        fpr_cnn.append(t_fpr_c)
        tpr_cnn.append(t_tpr_c)
        fpr_gnn.append(t_fpr_g)
        tpr_gnn.append(t_tpr_g)

    # plot ROC curves
    visualize_roc(fpr_cnn, tpr_cnn, fpr_dnn, tpr_dnn, fpr_gnn, tpr_gnn)
    visualize('fnn_vs_conv1d.pdf')
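    # Aside (a sketch, not in the original): per-class AUCs from the ROC
    # arrays computed above, via sklearn.metrics.auc.
    from sklearn.metrics import auc
    for i in range(nlabels):
        print('class %d AUC: DNN %.3f, CNN %.3f, GNN %.3f'
              % (i, auc(fpr_dnn[i], tpr_dnn[i]),
                 auc(fpr_cnn[i], tpr_cnn[i]),
                 auc(fpr_gnn[i], tpr_gnn[i])))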
if __name__ == "__main__":
    # torch, pickle, np, DataListLoader, collate, train_end2end, and
    # graph_data are assumed to be imported at the top of this script
    args = parse_args()
    device = torch.device("cuda")
    epoch = args.epoch
    path = args.path

    # rebuild the model from its saved kwargs, then load the trained weights
    weights = torch.load("{}/epoch_{}/weights.pth".format(path, epoch))
    with open('{}/model_kwargs.pkl'.format(path), 'rb') as f:
        model_kwargs = pickle.load(f)
    model_class = train_end2end.model_classes[args.model]
    model = model_class(**model_kwargs)
    model.load_state_dict(weights)
    model = model.to(device)
    model.eval()

    print(args.dataset)
    full_dataset = graph_data.PFGraphDataset(root=args.dataset)
    test_dataset = torch.utils.data.Subset(
        full_dataset, np.arange(start=args.start, stop=args.stop))
    loader = DataListLoader(test_dataset, batch_size=1,
                            pin_memory=False, shuffle=False)
    loader.collate_fn = collate

    big_df = train_end2end.prepare_dataframe(model, loader)
    big_df.to_pickle("{}/test.pkl.bz2".format(path))
    print(big_df[big_df["cand_pid"] != 1].head())
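# Hypothetical invocation (a sketch: the script name and flag spellings are
# inferred from the args.* attributes used above; parse_args is not shown):
#   python evaluate.py --path experiments/run1 --epoch 20 --model PFNet \
#       --dataset /path/to/pfgraphs --start 0 --stop 100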