def view_model_param(MODEL_NAME, net_params):
    """Build the model named MODEL_NAME and print/return its parameter count.

    Args:
        MODEL_NAME: model key understood by ``gnn_model``.
        net_params: network hyper-parameter dict passed through to ``gnn_model``.

    Returns:
        int: total number of scalar parameters in the model.

    NOTE(review): a second ``view_model_param`` defined later in this file
    shadows this one at import time.
    """
    model = gnn_model(MODEL_NAME, net_params)
    model = model.float()
    total_param = 0
    print("MODEL DETAILS:\n")
    for param in model.parameters():
        # numel() is always an exact int; the previous
        # np.prod(list(param.data.size())) returned the float 1.0 for
        # zero-dimensional parameters because np.prod([]) == 1.0.
        total_param += param.numel()
    print('MODEL/Total parameters:', MODEL_NAME, total_param)
    return total_param
def view_model_param(MODEL_NAME, net_params, verbose=False):
    """Build the model named MODEL_NAME and print/return its parameter count.

    Args:
        MODEL_NAME: model key understood by ``gnn_model``.
        net_params: network hyper-parameter dict passed through to ``gnn_model``.
        verbose: if True, additionally print ``net_params`` and the model
            structure (``repr(model)``).

    Returns:
        int: total number of scalar parameters in the model.
    """
    model = gnn_model(MODEL_NAME, net_params)
    total_param = 0
    print("MODEL DETAILS:\n")
    for param in model.parameters():
        # numel() is always an exact int; the previous
        # np.prod(list(param.data.size())) returned the float 1.0 for
        # zero-dimensional parameters because np.prod([]) == 1.0.
        total_param += param.numel()
    print('MODEL/Total parameters:', MODEL_NAME, total_param)
    if verbose:
        print('\n== Net Params:')
        print(net_params)
        print('\n== Model Structure:')
        print(model)
    return total_param
def inference(MODEL_NAME, dataset, params, net_params, model_path):
    """Load a trained checkpoint and print train/val/test MAE for the splits.

    Args:
        MODEL_NAME: model key understood by ``gnn_model`` (e.g. 'GCN', 'GAT').
        dataset: dataset object exposing .train/.val/.test and .collate.
        params: run hyper-parameters; only 'batch_size' is used here.
        net_params: network hyper-parameters; must include 'device'.
        model_path: path of the saved ``state_dict`` checkpoint.
    """
    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()
    trainset, valset, testset = dataset.train, dataset.val, dataset.test

    device = net_params['device']
    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)
    # map_location keeps a checkpoint saved on one device loadable on another
    # (e.g. GPU-saved weights on a CPU-only host); without it torch.load
    # restores tensors onto the device they were saved from.
    model.load_state_dict(torch.load(model_path, map_location=device))

    # batching exception for Diffpool
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              drop_last=drop_last, collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            drop_last=drop_last, collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             drop_last=drop_last, collate_fn=dataset.collate)

    # NOTE(review): evaluate_network is called without an epoch argument here,
    # unlike the training pipelines below — confirm both signatures exist.
    _, train_mae = evaluate_network(model, device, train_loader)
    _, val_mae = evaluate_network(model, device, val_loader)
    _, test_mae = evaluate_network(model, device, test_loader)

    print("Train MAE: {:.4f}".format(train_mae))
    print("Val MAE: {:.4f}".format(val_mae))
    print("Test MAE: {:.4f}".format(test_mae))
def train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs):
    """Train/validate/test loop for molecule graph regression.

    Trains MODEL_NAME on `dataset`, logs losses/MAEs to TensorBoard,
    checkpoints every epoch (keeping only the two most recent), and writes a
    final results file. Dense WL-style models (RingGNN, 3WLGNN) use dedicated
    dense train/eval functions and collate.

    Args:
        MODEL_NAME: model key understood by ``gnn_model``.
        dataset: dataset exposing .train/.val/.test, .collate(_dense_gnn), etc.
        params: optimization hyper-parameters (seed, init_lr, epochs, ...).
        net_params: network hyper-parameters; must include 'device'.
        dirs: (root_log_dir, root_ckpt_dir, write_file_name, write_config_file).

    NOTE(review): a second ``train_val_pipeline`` defined later in this file
    shadows this one at import time.
    """
    t0 = time.time()
    per_epoch_time = []
    DATASET_NAME = dataset.name

    # Optional graph preprocessing, mutates the dataset in place.
    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()
    if MODEL_NAME in ['GatedGCN']:
        if net_params['pos_enc']:
            print("[!] Adding graph positional encoding.")
            dataset._add_positional_encodings(net_params['pos_enc_dim'])
            print('Time PE:', time.time() - t0)

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds (python, numpy, torch CPU, and torch CUDA when applicable)
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    # LR is reduced on a plateau of the *validation loss* (stepped below).
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    # batching exception for Diffpool
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    if MODEL_NAME in ['RingGNN', '3WLGNN']:
        # import train functions specific for WLGNNs
        from train.train_molecules_graph_regression import train_epoch_dense as train_epoch, evaluate_network_dense as evaluate_network
        from functools import partial  # util function to pass edge_feat to collate function

        # No batch_size: DataLoader default of 1 — dense batching is
        # presumably handled inside train_epoch_dense via params['batch_size'].
        train_loader = DataLoader(trainset, shuffle=True,
                                  collate_fn=partial(dataset.collate_dense_gnn,
                                                     edge_feat=net_params['edge_feat']))
        val_loader = DataLoader(valset, shuffle=False,
                                collate_fn=partial(dataset.collate_dense_gnn,
                                                   edge_feat=net_params['edge_feat']))
        test_loader = DataLoader(testset, shuffle=False,
                                 collate_fn=partial(dataset.collate_dense_gnn,
                                                    edge_feat=net_params['edge_feat']))
    else:
        # import train functions for all other GNNs
        from train.train_molecules_graph_regression import train_epoch_sparse as train_epoch, evaluate_network_sparse as evaluate_network

        train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                                  drop_last=drop_last, collate_fn=dataset.collate)
        val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                                drop_last=drop_last, collate_fn=dataset.collate)
        test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                                 drop_last=drop_last, collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                if MODEL_NAME in ['RingGNN', '3WLGNN']:  # since different batch training function for RingGNN
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch, params['batch_size'])
                else:  # for all other models common train function
                    epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                        model, optimizer, device, train_loader, epoch)

                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)
                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae)
                epoch_val_MAEs.append(epoch_val_mae)

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('test/_mae', epoch_test_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                t.set_postfix(time=time.time() - start, lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae, val_MAE=epoch_val_mae,
                              test_MAE=epoch_test_mae)

                per_epoch_time.append(time.time() - start)

                # Saving checkpoint, then prune everything older than the
                # previous epoch (keeps at most the two most recent files).
                ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")
                if not os.path.exists(ckpt_dir):
                    os.makedirs(ckpt_dir)
                torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/epoch_" + str(epoch)))

                files = glob.glob(ckpt_dir + '/*.pkl')
                for file in files:
                    epoch_nb = file.split('_')[-1]
                    epoch_nb = int(epoch_nb.split('.')[0])
                    if epoch_nb < epoch - 1:
                        os.remove(file)

                scheduler.step(epoch_val_loss)

                # Early stop once the plateau scheduler has decayed LR below the floor.
                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping".format(params['max_time']))
                    break
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    # Final evaluation uses the *last* model state (not a best-val snapshot).
    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    print("Test MAE: {:.4f}".format(test_mae))
    print("Train MAE: {:.4f}".format(train_mae))
    print("Convergence Time (Epochs): {:.4f}".format(epoch))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    writer.close()

    """
        Write the results in out_dir/results folder
    """
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Convergence Time (Epochs): {:.4f}\nTotal Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  test_mae, train_mae, epoch, (time.time()-t0)/3600, np.mean(per_epoch_time)))
# NOTE(review): fragment of a larger benchmarking script — `args`, `params`,
# `net_params`, `dataset`, `set_seed` and the remainder of the epoch loop are
# defined outside the visible region; the inner loop body is truncated here.
test_loader = DataLoader(dataset.test, batch_size=args.batch_size, collate_fn=dataset.collate)
# One run per net_params entry: clone the config, fill in dataset-derived
# sizes, re-seed, and train a fresh model on cuda:0.
for key, _net_param in net_params.items():
    print(f'Starting {args.net}{key} on {args.dataset}')
    time_start = time.perf_counter()
    net_param = deepcopy(_net_param)  # avoid mutating the shared template
    net_param['num_atom_type'] = dataset.num_atom_type
    net_param['num_bond_type'] = dataset.num_bond_type
    net_param['device'] = 'cuda:0'  # NOTE(review): hard-coded GPU — confirm intended
    set_seed(params['seed'])
    net = gnn_model(args.net, net_param)
    net.cuda()
    optimizer = optim.Adam(net.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'])
    with tqdm(range(params['epochs'])) as epochs:
        for e in epochs:
            epochs.set_description(f'Epoch #{e+1}')
def train_val_pipeline(MODEL_NAME, dataset, params, net_params, dirs):
    """Train/validate/test loop for molecule graph regression, tracking the
    best-validation checkpoint.

    Unlike the earlier variant in this file, this version saves a "best.pkl"
    snapshot whenever validation MAE improves and reloads it for the final
    evaluation; it also optionally emails the results.

    Args:
        MODEL_NAME: model key understood by ``gnn_model``.
        dataset: dataset exposing .train/.val/.test and .collate.
        params: optimization hyper-parameters (seed, init_lr, epochs, ...).
        net_params: network hyper-parameters; must include 'device'.
        dirs: (root_log_dir, root_ckpt_dir, write_file_name, write_config_file).

    Returns:
        (val_mae, test_mae) of the best-validation checkpoint.
    """
    t0 = time.time()
    per_epoch_time = []
    DATASET_NAME = dataset.name

    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_log_dir, root_ckpt_dir, write_file_name, write_config_file = dirs
    device = net_params['device']

    # Write the network and optimization hyper-parameters in folder config/
    with open(write_config_file + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n\nTotal Parameters: {}\n\n"""
                .format(DATASET_NAME, MODEL_NAME, params, net_params, net_params['total_param']))

    log_dir = os.path.join(root_log_dir, "RUN_" + str(0))
    writer = SummaryWriter(log_dir=log_dir)

    # setting seeds
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    # NOTE(review): string comparison — the earlier pipeline in this file uses
    # `device.type == 'cuda'`; this check is False for torch.device('cuda:0'),
    # leaving the CUDA RNG unseeded in that case. Confirm what 'device' holds.
    if device == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                           weight_decay=params['weight_decay'])
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                     factor=params['lr_reduce_factor'],
                                                     patience=params['lr_schedule_patience'],
                                                     verbose=True)

    epoch_train_losses, epoch_val_losses = [], []
    epoch_train_MAEs, epoch_val_MAEs = [], []

    # batching exception for Diffpool
    drop_last = True if MODEL_NAME == 'DiffPool' else False

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              drop_last=drop_last, collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            drop_last=drop_last, collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             drop_last=drop_last, collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            best_val_mae = 10000  # sentinel: any real MAE should beat this
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_mae, optimizer = train_epoch(
                    model, optimizer, device, train_loader, epoch)
                epoch_val_loss, epoch_val_mae = evaluate_network(model, device, val_loader, epoch)

                epoch_train_losses.append(epoch_train_loss)
                epoch_val_losses.append(epoch_val_loss)
                epoch_train_MAEs.append(epoch_train_mae)
                epoch_val_MAEs.append(epoch_val_mae)

                writer.add_scalar('train/_loss', epoch_train_loss, epoch)
                writer.add_scalar('val/_loss', epoch_val_loss, epoch)
                writer.add_scalar('train/_mae', epoch_train_mae, epoch)
                writer.add_scalar('val/_mae', epoch_val_mae, epoch)
                writer.add_scalar('learning_rate', optimizer.param_groups[0]['lr'], epoch)

                # Test MAE is monitored per-epoch but never used for selection.
                _, epoch_test_mae = evaluate_network(model, device, test_loader, epoch)

                t.set_postfix(time=time.time() - start, lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_MAE=epoch_train_mae.item(), val_MAE=epoch_val_mae.item(),
                              test_MAE=epoch_test_mae.item())

                per_epoch_time.append(time.time() - start)

                # Saving checkpoint
                ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")
                if not os.path.exists(ckpt_dir):
                    os.makedirs(ckpt_dir)
                torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/epoch_" + str(epoch)))

                # Keep a separate snapshot of the best-validation model.
                if best_val_mae > epoch_val_mae:
                    best_val_mae = epoch_val_mae
                    torch.save(model.state_dict(), '{}.pkl'.format(ckpt_dir + "/best"))

                # Prune old per-epoch checkpoints, preserving best.pkl.
                files = glob.glob(ckpt_dir + '/*.pkl')
                for file in files:
                    if file[-8:] == 'best.pkl':
                        continue
                    else:
                        epoch_nb = file.split('_')[-1]
                        epoch_nb = int(epoch_nb.split('.')[0])
                        if epoch_nb < epoch - 1:
                            os.remove(file)

                scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping".format(params['max_time']))
                    break
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    # Final metrics come from the best-validation snapshot, not the last epoch.
    model.load_state_dict(torch.load('{}.pkl'.format(ckpt_dir + "/best")))
    _, val_mae = evaluate_network(model, device, val_loader, epoch)
    _, test_mae = evaluate_network(model, device, test_loader, epoch)
    _, train_mae = evaluate_network(model, device, train_loader, epoch)
    print("Test MAE: {:.4f}".format(test_mae))
    print("Train MAE: {:.4f}".format(train_mae))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    writer.close()

    """
        Write the results in out_dir/results folder
    """
    with open(write_file_name + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  np.mean(np.array(test_mae.cpu())), np.array(train_mae.cpu()),
                  (time.time()-t0)/3600, np.mean(per_epoch_time)))

    # send results to gmail
    # NOTE(review): the bare `except: pass` deliberately makes emailing
    # best-effort, but it also hides real failures (auth, formatting).
    # `except Exception:` with a logged warning would be safer.
    try:
        from gmail import send
        subject = 'Result for Dataset: {}, Model: {}'.format(DATASET_NAME, MODEL_NAME)
        body = """Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST MAE: {:.4f}\nTRAIN MAE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  np.mean(np.array(test_mae.cpu())), np.array(train_mae.cpu()),
                  (time.time()-t0)/3600, np.mean(per_epoch_time))
        send(subject, body)
    except:
        pass
    return val_mae, test_mae
def train_val_pipeline_classification(MODEL_NAME, DATASET_NAME, dataset, config,
                                      params, net_params, dirs):
    """Train/evaluate a graph classifier with SWA/SWAG weight averaging.

    Trains `model` with an SWA learning-rate schedule, periodically collects
    weights into a SWAG wrapper, then evaluates by averaging scores over
    multiple SWAG weight samples. Writes metrics and pickled predictions.

    Args:
        MODEL_NAME / DATASET_NAME: identifiers for logging/reporting.
        dataset: dataset exposing .train/.val/.test and .collate.
        config: run config; 'save_params' controls checkpointing.
        params: hyper-parameters incl. 'optimizer', 'swag', 'swa_start',
            'swa_c_epochs', 'swa_lr_alpha1/2', 'swag_eval_*', seeds, epochs.
        net_params: network hyper-parameters; must include 'device'.
        dirs: (root_ckpt_dir, write_file_name, root_output_dir).

    NOTE(review): the training loop and final evaluation use `swag_model`,
    `swa_n` and the SWA schedule unconditionally — if params['swag'] is not
    True, `swag_model` is never created and this raises NameError. Confirm
    this pipeline is only ever invoked with swag=True.

    NOTE(review): a second ``train_val_pipeline_classification`` defined later
    in this file shadows this one at import time.
    """
    t0 = time.time()
    per_epoch_time = []

    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_ckpt_dir, write_file_name, root_output_dir = dirs
    device = net_params['device']

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    # Choose optmizer
    if params['optimizer'] == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                               weight_decay=params['weight_decay'])
    elif params['optimizer'] == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=params['init_lr'],
                              weight_decay=params['weight_decay'])
    else:
        raise NameError('No optimizer given')
    print("optimizer: " + str(params['optimizer']))

    # second model called swa_model in order to move-average parame
    if params['swag'] is True:
        swag_model = SWAG(gnn_model(MODEL_NAME, net_params),
                          no_cov_mat=False, max_num_models=20)
        swag_model = swag_model.to(device)
        swa_n = 0
        start_epoch = 0

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             collate_fn=dataset.collate)

    # At any point you can hit Ctrl + C to break out of training early.
    # NOTE(review): redundant re-initialization — swa_n/start_epoch were
    # already set above when swag is True.
    if params['swag'] is True:
        swa_n = 0
        start_epoch = 0
    try:
        with tqdm(range(params['epochs'])) as t:
            for epoch in t:
                epoch += 1  # 1-based epochs for the SWA schedule arithmetic

                # SWA LR adjustin
                if (epoch >= params['swa_start']):
                    if (params['swa_lr_alpha1'] != params['swa_lr_alpha2']):
                        # Using cyclic learning rate for SWA; presumably
                        # applied per-batch inside train_epoch_classification.
                        cyclic_schedule = swa_utils.cyclic_learning_rate(
                            epoch, params['swa_c_epochs'],
                            params['swa_lr_alpha1'], params['swa_lr_alpha2'])
                    else:
                        # Using fixed learning rate for SWA
                        cyclic_schedule = None
                        lr = params['swa_lr_alpha1']
                        swa_utils.adjust_learning_rate(optimizer, lr)
                else:
                    cyclic_schedule = None
                    lr = swa_utils.schedule(epoch, params)
                    swa_utils.adjust_learning_rate(optimizer, lr)

                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_perf, optimizer, train_scores, train_targets = \
                    train_epoch_classification(model, optimizer, device, train_loader, epoch,
                                               params, cyclic_schedule)
                epoch_val_loss, epoch_val_perf, val_scores, val_targets, val_smiles = \
                    evaluate_network_classification(model, device, val_loader, epoch, params)
                _, epoch_test_perf, test_scores, test_targets, test_smiles = \
                    evaluate_network_classification(model, device, test_loader, epoch, params)

                # SWA update of parameters: collect weights every
                # swa_c_epochs epochs after the SWA start point.
                if epoch > params['swa_start'] and (
                        epoch - (params['swa_start'])) % params['swa_c_epochs'] == 0:
                    swag_model.collect_model(model)
                    swa_n += 1

                t.set_postfix(time=time.time() - start, lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_AUC=epoch_train_perf['auroc'], val_AUC=epoch_val_perf['auroc'],
                              test_AUC=epoch_test_perf['auroc'], train_ECE=epoch_train_perf['ece'],
                              val_ECE=epoch_val_perf['ece'], test_ECE=epoch_test_perf['ece'])

                per_epoch_time.append(time.time() - start)

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping".format(params['max_time']))
                    break
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    if config['save_params'] is True:
        swa_utils.save_checkpoint(root_ckpt_dir, epoch, params,
                                  state_dict=model.state_dict(),
                                  swag_state_dict=swag_model.state_dict(),
                                  swa_n=swa_n,
                                  optimizer=optimizer.state_dict())

    # SWAG prediction with 30 samples of models
    test_scores_list = []
    test_targets = []
    train_scores_list = []
    train_targets = []
    scale = params['swag_eval_scale']
    num_samples = params['swag_eval_num_samples']
    for i in range(num_samples):
        # Draw one weight sample from the SWAG posterior, then score
        # both splits with it; targets/smiles are identical across samples.
        swag_model.sample(scale, cov=True)
        test_loss, test_perf, test_scores, test_targets, test_smiles = \
            evaluate_network_classification(swag_model, device, test_loader, epoch, params)
        train_loss, train_perf, train_scores, train_targets, train_smiles = \
            evaluate_network_classification(swag_model, device, train_loader, epoch, params)
        test_scores_list.append(test_scores.detach().cpu().numpy())
        train_scores_list.append(train_scores.detach().cpu().numpy())
    test_scores = np.mean(test_scores_list, axis=0)
    train_scores = np.mean(train_scores_list, axis=0)
    # NOTE(review): test_perfs/train_perfs (metrics of the *averaged* scores)
    # are computed but the prints and report below use test_perf/train_perf,
    # i.e. the metrics of the LAST individual SWAG sample — confirm intended.
    test_perfs = binary_class_perfs(test_scores, test_targets.detach().cpu().numpy())
    train_perfs = binary_class_perfs(train_scores, train_targets.detach().cpu().numpy())

    # additional metrics for tox21: accuracy, auc, precision, recall, f1, + ECE
    print("Test AUC: {:.4f}".format(test_perf['auroc']))
    print("Test ECE: {:.4f}".format(test_perf['ece']))
    print("Train AUC: {:.4f}".format(train_perf['auroc']))
    print("Train ECE: {:.4f}".format(train_perf['ece']))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    """
        Write the results in out_dir/results folder
    """
    with open(write_file_name + '_seed_' + str(params['seed']) + '_dtseed_' +
              str(params['data_seed']) + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST ACC: {:.4f}\nTEST AUROC: {:.4f}\nTEST Precision: {:.4f}\nTEST Recall: {:.4f}\nTEST F1: {:.4f}\nTEST AUPRC: {:.4f}\nTEST ECE: {:.4f}\nTRAIN ACC: {:.4f}\nTRAIN AUROC: {:.4f}\nTRAIN Precision: {:.4f}\nTRAIN Recall: {:.4f}\nTRAIN F1: {:.4f}\nTRAIN AUPRC: {:.4f}\nTRAIN ECE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  np.mean(np.array(test_perf['accuracy'])), np.mean(np.array(test_perf['auroc'])),
                  np.mean(np.array(test_perf['precision'])), np.mean(np.array(test_perf['recall'])),
                  np.mean(np.array(test_perf['f1'])), np.mean(np.array(test_perf['auprc'])),
                  np.mean(np.array(test_perf['ece'])),
                  np.mean(np.array(train_perf['accuracy'])), np.mean(np.array(train_perf['auroc'])),
                  np.mean(np.array(train_perf['precision'])), np.mean(np.array(train_perf['recall'])),
                  np.mean(np.array(train_perf['f1'])), np.mean(np.array(train_perf['auprc'])),
                  np.mean(np.array(train_perf['ece'])),
                  (time.time()-t0)/3600, np.mean(per_epoch_time)))

    # Saving predicted outputs (train/test scores are the SWAG-averaged
    # numpy arrays; val scores come from the last training epoch).
    predictions = {}
    predictions['train_smiles'] = train_smiles
    predictions['train_scores'] = train_scores
    predictions['train_targets'] = train_targets.detach().cpu().numpy()
    predictions['val_smiles'] = val_smiles
    predictions['val_scores'] = val_scores
    predictions['val_targets'] = val_targets.detach().cpu().numpy()
    predictions['test_smiles'] = test_smiles
    predictions['test_scores'] = test_scores
    predictions['test_targets'] = test_targets.detach().cpu().numpy()

    with open('{}.pkl'.format(root_output_dir + '_seed_' + str(params['seed']) +
                              '_dtseed_' + str(params['data_seed'])), 'wb') as f:
        pickle.dump(predictions, f)
def train_val_pipeline_classification(MODEL_NAME, DATASET_NAME, dataset, config,
                                      params, net_params, dirs):
    """Train/evaluate a graph classifier with optional MC-dropout or BBP
    (Bayes-by-backprop) uncertainty estimation.

    Selects train/eval functions and model loader based on params['bbp'],
    trains with a step or plateau LR scheduler, then evaluates either by
    averaging multiple stochastic forward passes (mcdropout), by BBP sampling,
    or deterministically. Writes metrics and pickled predictions.

    Args:
        MODEL_NAME / DATASET_NAME: identifiers for logging/reporting.
        dataset: dataset exposing .train/.val/.test and .collate.
        config: run config; 'save_params' controls checkpointing.
        params: hyper-parameters incl. 'bbp', 'mcdropout', 'optimizer',
            'scheduler', 'mc_eval_num_samples', 'bbp_eval_Nsample', seeds.
        net_params: network hyper-parameters; must include 'device'.
        dirs: (root_ckpt_dir, write_file_name, root_output_dir).
    """
    # Function-scope imports: the bbp flag swaps in Bayesian train/eval
    # functions and a matching model loader.
    if params['bbp'] == True:
        from train.train_molecules_graph_classification_bbp import \
            train_epoch_classification, evaluate_network_classification  # import train functions
        from nets.molecules_graph_regression.load_bbp_net import gnn_model
    else:
        from train.train_molecules_graph_classification import \
            train_epoch_classification, evaluate_network_classification  # import train functions
        from nets.molecules_graph_regression.load_net import gnn_model

    t0 = time.time()
    per_epoch_time = []

    if MODEL_NAME in ['GCN', 'GAT']:
        if net_params['self_loop']:
            print("[!] Adding graph self-loops for GCN/GAT models (central node trick).")
            dataset._add_self_loops()

    trainset, valset, testset = dataset.train, dataset.val, dataset.test
    root_ckpt_dir, write_file_name, root_output_dir = dirs
    device = net_params['device']

    print("Training Graphs: ", len(trainset))
    print("Validation Graphs: ", len(valset))
    print("Test Graphs: ", len(testset))

    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    # Choose optmizer
    if params['optimizer'] == 'ADAM':
        optimizer = optim.Adam(model.parameters(), lr=params['init_lr'],
                               weight_decay=params['weight_decay'])
    elif params['optimizer'] == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=params['init_lr'],
                              weight_decay=params['weight_decay'])
    else:
        raise NameError('No optimizer given')

    # Choose learning rate scheduler
    if params['scheduler'] == 'step':
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=params['step_size'],
                                              gamma=params['lr_reduce_factor'])
    else:
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                         factor=params['lr_reduce_factor'],
                                                         patience=params['lr_schedule_patience'],
                                                         verbose=True)

    train_loader = DataLoader(trainset, batch_size=params['batch_size'], shuffle=True,
                              collate_fn=dataset.collate)
    val_loader = DataLoader(valset, batch_size=params['batch_size'], shuffle=False,
                            collate_fn=dataset.collate)
    test_loader = DataLoader(testset, batch_size=params['batch_size'], shuffle=False,
                             collate_fn=dataset.collate)

    """
        Training / Evaluating
    """
    # At any point you can hit Ctrl + C to break out of training early.
    try:
        with tqdm(range(params['epochs'])) as t:
            for epoch in t:
                t.set_description('Epoch %d' % epoch)
                start = time.time()

                epoch_train_loss, epoch_train_perf, optimizer, train_scores, train_targets = \
                    train_epoch_classification(model, optimizer, device, train_loader, epoch, params)
                epoch_val_loss, epoch_val_perf, val_scores, val_targets, val_smiles = \
                    evaluate_network_classification(model, device, val_loader, epoch, params)
                _, epoch_test_perf, test_scores, test_targets, test_smiles = \
                    evaluate_network_classification(model, device, test_loader, epoch, params)

                t.set_postfix(time=time.time() - start, lr=optimizer.param_groups[0]['lr'],
                              train_loss=epoch_train_loss, val_loss=epoch_val_loss,
                              train_AUC=epoch_train_perf['auroc'], val_AUC=epoch_val_perf['auroc'],
                              test_AUC=epoch_test_perf['auroc'], train_ECE=epoch_train_perf['ece'],
                              val_ECE=epoch_val_perf['ece'], test_ECE=epoch_test_perf['ece'])

                per_epoch_time.append(time.time() - start)

                # StepLR steps unconditionally; plateau steps on val loss.
                if params['scheduler'] == 'step':
                    scheduler.step()
                else:
                    scheduler.step(epoch_val_loss)

                if optimizer.param_groups[0]['lr'] < params['min_lr']:
                    print("\n!! LR EQUAL TO MIN LR SET.")
                    break

                # Stop training after params['max_time'] hours
                if time.time() - t0 > params['max_time'] * 3600:
                    print('-' * 89)
                    print("Max_time for training elapsed {:.2f} hours, so stopping".format(params['max_time']))
                    break
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early because of KeyboardInterrupt')

    # Saving checkpoint
    if config['save_params'] is True:
        ckpt_dir = os.path.join(root_ckpt_dir, "RUN_")
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)
        torch.save(model.state_dict(),
                   '{}.pkl'.format(ckpt_dir + '/seed_' + str(params['seed']) + '_dtseed_' +
                                   str(params['data_seed']) + "_epoch_" + str(epoch)))

    # Evaluate train & test set based on trained models
    if params['mcdropout'] == True:
        # get 30 predicts from different dropout models.
        # NOTE(review): averaged-score metrics test_perfs/train_perfs are
        # computed but the prints/report below use test_perf/train_perf from
        # the LAST stochastic pass; val_scores also remains an un-detached
        # tensor on this path when pickled — confirm intended.
        test_scores_list = []
        test_targets = []
        train_scores_list = []
        train_targets = []
        for i in range(params['mc_eval_num_samples']):
            test_loss, test_perf, test_scores, test_targets, test_smiles = \
                evaluate_network_classification(model, device, test_loader, epoch, params)
            train_loss, train_perf, train_scores, train_targets, train_smiles = \
                evaluate_network_classification(model, device, train_loader, epoch, params)
            test_scores_list.append(test_scores.detach().cpu().numpy())
            train_scores_list.append(train_scores.detach().cpu().numpy())
        test_scores = np.mean(test_scores_list, axis=0)
        train_scores = np.mean(train_scores_list, axis=0)
        test_perfs = binary_class_perfs(test_scores, test_targets.detach().cpu().numpy())
        train_perfs = binary_class_perfs(train_scores, train_targets.detach().cpu().numpy())
    else:
        if params['bbp'] == True:
            # BBP averages Nsamples weight draws inside the eval function.
            test_loss, test_perf, test_scores, test_targets, test_smiles = \
                evaluate_network_classification(model, device, test_loader, epoch, params,
                                                Nsamples=int(params['bbp_eval_Nsample']))
            train_loss, train_perf, train_scores, train_targets, train_smiles = \
                evaluate_network_classification(model, device, train_loader, epoch, params,
                                                Nsamples=int(params['bbp_eval_Nsample']))
        else:
            test_loss, test_perf, test_scores, test_targets, test_smiles = \
                evaluate_network_classification(model, device, test_loader, epoch, params)
            train_loss, train_perf, train_scores, train_targets, train_smiles = \
                evaluate_network_classification(model, device, train_loader, epoch, params)
        test_scores = test_scores.detach().cpu().numpy()
        val_scores = val_scores.detach().cpu().numpy()
        train_scores = train_scores.detach().cpu().numpy()

    # additional metrics for tox21: accuracy, auc, precision, recall, f1, + ECE
    print("Test AUC: {:.4f}".format(test_perf['auroc']))
    print("Test ECE: {:.4f}".format(test_perf['ece']))
    print("Train AUC: {:.4f}".format(train_perf['auroc']))
    print("Train ECE: {:.4f}".format(train_perf['ece']))
    print("TOTAL TIME TAKEN: {:.4f}s".format(time.time() - t0))
    print("AVG TIME PER EPOCH: {:.4f}s".format(np.mean(per_epoch_time)))

    """
        Write the results in out_dir/results folder
    """
    with open(write_file_name + '_seed_' + str(params['seed']) + '_dtseed_' +
              str(params['data_seed']) + '.txt', 'w') as f:
        f.write("""Dataset: {},\nModel: {}\n\nparams={}\n\nnet_params={}\n\n{}\n\nTotal Parameters: {}\n\n
    FINAL RESULTS\nTEST ACC: {:.4f}\nTEST AUROC: {:.4f}\nTEST Precision: {:.4f}\nTEST Recall: {:.4f}\nTEST F1: {:.4f}\nTEST AUPRC: {:.4f}\nTEST ECE: {:.4f}\nTRAIN ACC: {:.4f}\nTRAIN AUROC: {:.4f}\nTRAIN Precision: {:.4f}\nTRAIN Recall: {:.4f}\nTRAIN F1: {:.4f}\nTRAIN AUPRC: {:.4f}\nTRAIN ECE: {:.4f}\n\n
    Total Time Taken: {:.4f} hrs\nAverage Time Per Epoch: {:.4f} s\n\n\n"""\
          .format(DATASET_NAME, MODEL_NAME, params, net_params, model, net_params['total_param'],
                  np.mean(np.array(test_perf['accuracy'])), np.mean(np.array(test_perf['auroc'])),
                  np.mean(np.array(test_perf['precision'])), np.mean(np.array(test_perf['recall'])),
                  np.mean(np.array(test_perf['f1'])), np.mean(np.array(test_perf['auprc'])),
                  np.mean(np.array(test_perf['ece'])),
                  np.mean(np.array(train_perf['accuracy'])), np.mean(np.array(train_perf['auroc'])),
                  np.mean(np.array(train_perf['precision'])), np.mean(np.array(train_perf['recall'])),
                  np.mean(np.array(train_perf['f1'])), np.mean(np.array(train_perf['auprc'])),
                  np.mean(np.array(train_perf['ece'])),
                  (time.time()-t0)/3600, np.mean(per_epoch_time)))

    # Saving predicted outputs
    predictions = {}
    predictions['train_smiles'] = train_smiles
    predictions['train_scores'] = train_scores
    predictions['train_targets'] = train_targets.detach().cpu().numpy()
    predictions['val_smiles'] = val_smiles
    predictions['val_scores'] = val_scores
    predictions['val_targets'] = val_targets.detach().cpu().numpy()
    predictions['test_smiles'] = test_smiles
    predictions['test_scores'] = test_scores
    predictions['test_targets'] = test_targets.detach().cpu().numpy()

    with open('{}.pkl'.format(root_output_dir + '_seed_' + str(params['seed']) +
                              '_dtseed_' + str(params['data_seed'])), 'wb') as f:
        pickle.dump(predictions, f)