def main():
    """Run test-set inference for a trained GNN on PCQM4M (DGL backend).

    Loads the raw SMILES test split, rebuilds graphs on the fly, restores the
    model from ``<checkpoint_dir>/checkpoint.pt`` and writes a submission file
    to ``save_test_dir`` via the official PCQM4M evaluator.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with DGL')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed to use (default: 42)')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN to use, which can be from '
        '[gin, gin-virtual, gcn, gcn-virtual] (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Seed every RNG we rely on for reproducible inference.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    ### automatic data loading and splitting
    ### Read in the raw SMILES strings
    smiles_dataset = PCQM4MDataset(root='dataset/', only_smiles=True)
    split_idx = smiles_dataset.get_idx_split()

    test_smiles_dataset = [smiles_dataset[i] for i in split_idx['test']]
    onthefly_dataset = OnTheFlyPCQMDataset(test_smiles_dataset)
    test_loader = DataLoader(onthefly_dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_dgl)

    ### automatic evaluator.
    evaluator = PCQM4MEvaluator()

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }
    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]
    model = GNN(gnn_type=gnn_type, virtual_node=virtual_node,
                **shared_params).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pt')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    ## reading in checkpoint
    # map_location so a checkpoint saved on GPU also loads on CPU-only hosts.
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])

    print('Predicting on test data...')
    y_pred = test(model, device, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
def main():
    """Train a GNN baseline on an OGB graph-property dataset over multiple runs.

    Relies on module-level ``args`` (parsed CLI options) plus the ``train``,
    ``eval`` and ``add_zeros`` helpers defined elsewhere in this file.  Prints
    mean ± std of best-validation and corresponding test scores across runs.
    """
    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0
        # Fresh model and optimizer per run so runs are independent.
        model = GNN(gnn_type=gnn_type,
                    num_class=dataset.num_classes,
                    num_layer=args.num_layer,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=virtual_node).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)
            # Evaluate only in the second half of training, every test_freq
            # epochs, and always at the last epoch (evaluation is slow, ~4min).
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                # Model selection: report the test score observed at the
                # best-validation evaluation point.
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
idx_dev = idx_dev.cuda() idx_test = idx_test.cuda() idx_all = idx_all.cuda() idx_unlabeled = idx_unlabeled.cuda() inputs_q = inputs_q.cuda() target_q = target_q.cuda() inputs_p = inputs_p.cuda() target_p = target_p.cuda() gnn = GNN(opt, adj) trainer = Trainer(opt, gnn) # Build the ema model gnn_ema = GNN(opt, adj) for ema_param, param in zip(gnn_ema.parameters(), gnn.parameters()): ema_param.data= param.data for param in gnn_ema.parameters(): param.detach_() trainer_ema = Trainer(opt, gnn_ema, ema = False) def init_data(): inputs_q.copy_(inputs) temp = torch.zeros(idx_train.size(0), target_q.size(1)).type_as(target_q) temp.scatter_(1, torch.unsqueeze(target[idx_train], 1), 1.0) target_q[idx_train] = temp def update_ema_variables(model, ema_model, alpha, epoch):
def main():
    """Train a GNN on an OGB source-code dataset (method-name sequence prediction).

    Builds a token vocabulary from the training split, attaches AST-specific
    transforms (edge augmentation + target-sequence encoding), then trains for
    ``args.runs`` independent runs, reporting mean ± std of best-validation and
    corresponding test scores.  Relies on module-level ``args`` and on the
    ``train``/``eval``/``get_vocab_mapping``/``augment_edge``/``encode_y_to_arr``/
    ``decode_arr_to_seq`` helpers defined elsewhere in this file.
    """
    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset,
                                      root='/cmlscratch/kong/datasets/ogb')

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target seqence less or equal to {} is {}%.'.format(
        args.max_seq_len,
        np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    ### building vocabulary for sequence predition. Only use training data.
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx['train']], args.num_vocab)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array
    # representation of a sequence.
    dataset.transform = transforms.Compose([
        augment_edge,
        lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    ])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used:
    # 1. node type  2. node attribute  3. node depth
    node_encoder = ASTNodeEncoder(
        args.emb_dim,
        num_nodetypes=len(nodetypes_mapping['type']),
        num_nodeattributes=len(nodeattributes_mapping['attr']),
        max_depth=20)

    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]

    # Build the sequence decoder once instead of a fresh lambda per eval call.
    def arr_to_seq(arr):
        return decode_arr_to_seq(arr, idx2vocab)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0
        # Fresh model and optimizer per run so runs are independent.
        model = GNN(num_vocab=len(vocab2idx),
                    max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder,
                    num_layer=args.num_layer,
                    gnn_type=gnn_type,
                    emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio,
                    virtual_node=virtual_node).to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)
            # Evaluate only in the second half of training, every test_freq
            # epochs, and always at the last epoch (evaluation is slow, ~4min).
            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) or epoch == args.epochs:
                train_perf = eval(model, device, train_loader, evaluator,
                                  arr_to_seq=arr_to_seq)
                valid_perf = eval(model, device, valid_loader, evaluator,
                                  arr_to_seq=arr_to_seq)
                test_perf = eval(model, device, test_loader, evaluator,
                                 arr_to_seq=arr_to_seq)
                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                # Model selection: report the test score observed at the
                # best-validation evaluation point.
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main():
    """Train a GNN baseline (with LAF pooling options) on ogbg-ppa.

    Logs per-epoch metrics to ``<filename>.log``, checkpoints the model to
    ``<filename>.mdl`` whenever validation improves, and saves a summary of the
    best-epoch scores to ``<filename>.res``.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppa data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--pooling', type=str, default='mean',
                        help='Pooling tecnhnique for graph embedding')
    parser.add_argument('--laf', type=str, default='mean',
                        help='Init function if laf pooling is specified')
    parser.add_argument(
        '--laf_layers', type=str, default='false',
        help='If set to true, internal layers will be initialized with laf function')
    parser.add_argument(
        '--emb_dim', type=int, default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-ppa",
                        help='dataset name (default: ogbg-ppa)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    parser.add_argument('--seed', type=int, default=92, help='torch seed')
    args = parser.parse_args()
    print(args)

    # Fix: --seed was accepted but never applied; seed torch for reproducibility.
    torch.manual_seed(args.seed)

    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]
    model = GNN(gnn_type=gnn_type,
                emb_dim=args.emb_dim,
                drop_ratio=args.drop_ratio,
                virtual_node=virtual_node,
                graph_pooling=args.pooling,
                laf_fun=args.laf,
                device=args.device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    best_val = 0

    # Fix: the log file was opened and never closed; use a context manager so
    # it is flushed and closed even if training raises.
    with open(args.filename + ".log", 'w') as flog:
        flog.write("{}\n".format(args))
        for epoch in range(1, args.epochs + 1):
            start = time.time()
            print("=====Epoch {}".format(epoch))
            flog.write("=====Epoch {}\n".format(epoch))
            print('Training...')
            train(model, device, train_loader, optimizer)

            print('Evaluating...')
            train_perf = eval(model, device, train_loader, evaluator)
            valid_perf = eval(model, device, valid_loader, evaluator)
            test_perf = eval(model, device, test_loader, evaluator)

            print({
                'Train': train_perf,
                'Validation': valid_perf,
                'Test': test_perf
            })
            print("Time {:.4f}s".format(time.time() - start))
            flog.write("{}\tTime: {}s\n".format(
                {
                    'Train': train_perf,
                    'Validation': valid_perf,
                    'Test': test_perf
                }, time.time() - start))
            flog.flush()

            train_curve.append(train_perf['acc'])
            valid_curve.append(valid_perf['acc'])
            test_curve.append(test_perf['acc'])

            # Checkpoint whenever validation improves (ties included).
            if valid_perf[dataset.eval_metric] >= best_val:
                best_val = valid_perf[dataset.eval_metric]
                if not args.filename == '':
                    torch.save(model.state_dict(),
                               '{}.mdl'.format(args.filename))

        # Report scores from the epoch with the best validation accuracy.
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
        print('Finished training!')
        print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
        print('Test score: {}'.format(test_curve[best_val_epoch]))
        flog.write('Finished training!\n')
        flog.write('Best validation score: {}\n'.format(
            valid_curve[best_val_epoch]))
        flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
        flog.flush()

    if not args.filename == '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename + ".res")
def main():
    """Train a GNN regressor on PCQM4M (PyTorch Geometric).

    Checkpoints to ``checkpoint_dir`` whenever validation MAE improves, logs to
    TensorBoard when ``log_dir`` is set, and writes a test-dev submission file
    when ``save_test_dir`` is set.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Fixed seed for reproducibility across all RNGs in use.
    np.random.seed(42)
    torch.manual_seed(42)
    random.seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(42)

    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygPCQM4MDataset(root='dataset/')

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        # Use a fixed 10% random subset of the training split (quick runs).
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(
            split_idx["train"]))[:int(subset_ratio * len(split_idx["train"]))]
        train_loader = DataLoader(dataset[split_idx["train"][subset_idx]],
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers)
    else:
        train_loader = DataLoader(dataset[split_idx["train"]],
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers)

    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)

    if args.save_test_dir != '':
        test_loader = DataLoader(dataset[split_idx["test-dev"]],
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers)

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }
    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]
    model = GNN(gnn_type=gnn_type, virtual_node=virtual_node,
                **shared_params).to(device)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    # Fix: use +inf instead of an arbitrary 1000 sentinel so the first epoch
    # always counts as an improvement regardless of its MAE.
    best_valid_mae = float('inf')

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, device, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_val_mae': best_valid_mae,
                    'num_params': num_params
                }
                torch.save(checkpoint,
                           os.path.join(args.checkpoint_dir, 'checkpoint.pt'))

            if args.save_test_dir != '':
                # Regenerate the submission whenever validation improves.
                print('Predicting on test data...')
                y_pred = test(model, device, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir,
                                               mode='test-dev')

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

    if args.log_dir != '':
        writer.close()
def main():
    """Run test-set inference for a trained GNN on PCQM4M (PaddlePaddle/PGL).

    Restores the model from ``<checkpoint_dir>/checkpoint.pdparams`` and writes
    a submission file to ``save_test_dir`` via the official PCQM4M evaluator.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Fixed seed for reproducible inference.
    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        # Minimal stand-in for the dataset's expected config object.
        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)
    split_idx = ds.get_idx_split()
    test_ds = Subset(ds, split_idx['test'])
    # Fix: typo in user-facing output ("exapmles" -> "examples").
    print("Test examples: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    test_loader = Dataloader(test_ds,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=CollateFn())

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }
    # Map the --gnn flag to (gnn_type, virtual_node) instead of four
    # near-identical construction branches.
    gnn_configs = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_configs:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_configs[args.gnn]
    model = GNN(gnn_type=gnn_type, virtual_node=virtual_node, **shared_params)

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    checkpoint_path = os.path.join(args.checkpoint_dir, 'checkpoint.pdparams')
    if not os.path.exists(checkpoint_path):
        raise RuntimeError(f'Checkpoint file not found at {checkpoint_path}')

    model.set_state_dict(paddle.load(checkpoint_path))

    print('Predicting on test data...')
    y_pred = test(model, test_loader)
    print('Saving test submission file...')
    evaluator.save_test_submission({'y_pred': y_pred}, args.save_test_dir)
def train(dataset):
    """Train the GNN sequence labeler on ``dataset`` with periodic validation,
    checkpointing the best model and early-stopping on validation F1.

    Relies on module-level globals: ``args``, ``label2id``, ``d_output``,
    ``WORD_VOCAB_SIZE``, ``CHAR_VOCAB_SIZE``, ``result_obj`` and the helpers
    ``to_var``, ``evaluate``, ``test``.  Returns ``cross_res`` (per-label
    result lists; left empty by the current single-fold loop).
    """
    print('random seed:', args.seed)
    # Seed every RNG and force deterministic cuDNN for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.enabled = False
    cross_res = {label: [] for label in label2id if label != 'O'}
    # NOTE(review): cross-validation is disabled — range(1) runs a single fold.
    for cross_valid in range(1):
        # print('cross_valid', cross_valid)
        model = GNN(word_vocab_size=WORD_VOCAB_SIZE,
                    char_vocab_size=CHAR_VOCAB_SIZE,
                    d_output=d_output,
                    args=args)
        model.cuda()
        # print vocab_size
        # print('split dataset')
        # dataset.split_train_valid_test_bycase([0.5, 0.1, 0.4], 5, cross_valid)
        print('train:', len(dataset.train), 'valid:', len(dataset.valid),
              'test:', len(dataset.test))
        sys.stdout.flush()
        train_dataloader = DataLoader(dataset.train,
                                      batch_size=args.batch,
                                      shuffle=True)
        valid_dataloader = DataLoader(dataset.valid, batch_size=args.batch)
        test_dataloader = DataLoader(dataset.test, batch_size=args.batch)
        # Class weighting: non-'O' labels get double weight to counter the
        # dominance of the 'O' (outside) class.
        weight = torch.zeros(len(label2id))
        for label, idx in label2id.items():
            weight[idx] = 1 if label == 'O' else 2
        # NOTE(review): `reduce=False` is the deprecated spelling of
        # `reduction='none'` — per-token losses are masked and averaged below.
        loss_function = nn.CrossEntropyLoss(weight.cuda(), reduce=False)
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            model.parameters()),
                                     lr=args.lr,
                                     weight_decay=args.wd)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)
        best_acc = -1     # best validation score seen so far (mean F1)
        wait = 0          # epochs/checks since last improvement (early stopping)
        batch_cnt = 0     # global batch counter; validation runs every 20000 batches
        for epoch in range(args.epochs):
            total_loss = 0
            pending_loss = None
            model.train()
            # random.shuffle(dataset.train)
            load_time, forward_time, backward_time = 0, 0, 0
            model.clear_time()
            # tqdm writes its progress bar into a per-epoch log file instead
            # of stdout; the file is removed once the epoch finishes.
            train_log = open(args.save_path + '_train.log', 'w')
            for tensors, batch in tqdm(train_dataloader,
                                       file=train_log,
                                       mininterval=60):
                # print(batch[0].case_id, batch[0].doc_id, batch[0].page_id)
                start = time.time()
                data, data_word, pos, length, mask, label, adjs = to_var(
                    tensors, cuda=args.cuda)
                batch_size, docu_len, sent_len, word_len = data.size()
                load_time += (time.time() - start)
                start = time.time()
                logit = model(data, data_word, pos, length, mask, adjs)
                forward_time += (time.time() - start)
                start = time.time()
                if args.crf:
                    # CRF path: flatten documents into sentences and score the
                    # whole label sequence; ignore zero-length sentences.
                    logit = logit.view(batch_size * docu_len, sent_len, -1)
                    mask = mask.view(batch_size * docu_len, -1)
                    length = length.view(batch_size * docu_len)
                    label = label.view(batch_size * docu_len, -1)
                    loss = -model.crf_layer.loglikelihood(
                        logit, mask, length, label)
                    loss = torch.masked_select(loss,
                                               torch.gt(length, 0)).mean()
                else:
                    # Token-level cross entropy, averaged over unmasked tokens.
                    loss = loss_function(logit.view(-1, d_output),
                                         label.view(-1))
                    loss = torch.masked_select(loss, mask.view(-1)).mean()
                total_loss += loss.data.sum()
                # print(total_loss, batch[0].case_id, batch[0].doc_id, batch[0].page_id)
                if math.isnan(total_loss):
                    print('Loss is NaN!')
                    exit()
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                backward_time += (time.time() - start)
                batch_cnt += 1
                # Validate only every 20000 batches; otherwise keep training.
                if batch_cnt % 20000 != 0:
                    continue
                # print('load %f forward %f backward %f'%(load_time, forward_time, backward_time))
                # model.print_time()
                valid_acc, valid_prec, valid_recall, valid_f1 = evaluate(
                    model, valid_dataloader, args=args)
                print('Epoch %d: Train Loss: %.3f Valid Acc: %.5f' %
                      (epoch, total_loss, valid_acc))
                # print(acc_to_str(valid_f1))
                # scheduler.step()
                # Selection metric is mean per-label F1, not raw accuracy.
                acc = np.mean(list(valid_f1.values()))  # valid_acc
                print(acc)
                # Save on ties (>=) so the latest equally-good model is kept,
                # but only a strict improvement (>) resets the patience counter.
                if acc >= best_acc:
                    obj = {'args': args, 'model': model.state_dict()}
                    torch.save(obj, args.save_path + '.model')
                    result_obj['valid_prec'] = np.mean(
                        list(valid_prec.values()))
                    result_obj['valid_recall'] = np.mean(
                        list(valid_recall.values()))
                    result_obj['valid_f1'] = np.mean(list(valid_f1.values()))
                wait = 0 if acc > best_acc else wait + 1
                best_acc = max(acc, best_acc)
                # evaluate() leaves the model in eval mode; switch back.
                model.train()
                sys.stdout.flush()
                if wait >= args.patience:
                    break
            train_log.close()
            os.remove(args.save_path + '_train.log')
            if wait >= args.patience:
                break
        # Reload the best checkpoint before the final test evaluation.
        obj = torch.load(args.save_path + '.model')
        model.load_state_dict(obj['model'])
        test(test_dataloader, model)
    # print("Cross Validation Result:")
    # for label in cross_res:
    #     cross_res[label] = np.mean(cross_res[label])
    # print(acc_to_str(cross_res))
    return cross_res
def main():
    """Train and evaluate GNN baselines on ogbg-code2 (PyTorch Geometric).

    Parses command-line settings, builds the token vocabulary from the
    training split only, trains the selected GNN for ``--epochs`` epochs,
    tracks the train/valid/test metric per epoch, and optionally saves the
    scores at the best-validation epoch to ``--filename``.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-code2 data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gcn-virtual',
        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gcn-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument('--max_seq_len', type=int, default=5,
                        help='maximum sequence length to predict (default: 5)')
    parser.add_argument(
        '--num_vocab', type=int, default=5000,
        help='the number of vocabulary used for sequence prediction (default: 5000)')
    parser.add_argument(
        '--num_layer', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='input batch size for training (default: 128)')
    parser.add_argument('--epochs', type=int, default=25,
                        help='number of epochs to train (default: 25)')
    parser.add_argument('--random_split', action='store_true')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-code2",
                        help='dataset name (default: ogbg-code2)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()
    print(args)

    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() \
        else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    # BUG FIX: the message claims a percentage but the raw fraction was
    # printed; scale by 100 so the printed number matches the '%' label.
    print('Target seqence less or equal to {} is {}%.'.format(
        args.max_seq_len,
        100.0 * np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()
    if args.random_split:
        # Replace the official scaffold split with a random permutation of
        # the same sizes (ablation / sanity-check mode).
        print('Using random split')
        perm = torch.randperm(len(dataset))
        num_train, num_valid, num_test = (len(split_idx['train']),
                                          len(split_idx['valid']),
                                          len(split_idx['test']))
        split_idx['train'] = perm[:num_train]
        split_idx['valid'] = perm[num_train:num_train + num_valid]
        split_idx['test'] = perm[num_train + num_valid:]
        assert len(split_idx['train']) == num_train
        assert len(split_idx['valid']) == num_valid
        assert len(split_idx['test']) == num_test

    ### building vocabulary for sequence prediction. Only use training data.
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx['train']], args.num_vocab)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to the PyG data object — the array
    # representation of the target sequence, truncated to max_seq_len.
    dataset.transform = transforms.Compose([
        augment_edge,
        lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    ])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))
    print(nodeattributes_mapping)

    ### Encoding node features into emb_dim vectors.
    ### Three node features are used: node type, node attribute, node depth.
    node_encoder = ASTNodeEncoder(
        args.emb_dim,
        num_nodetypes=len(nodetypes_mapping['type']),
        num_nodeattributes=len(nodeattributes_mapping['attr']),
        max_depth=20)

    # All four variants share every constructor argument except
    # gnn_type / virtual_node; factor the shared ones out once.
    shared_params = dict(num_vocab=len(vocab2idx),
                         max_seq_len=args.max_seq_len,
                         node_encoder=node_encoder,
                         num_layer=args.num_layer,
                         emb_dim=args.emb_dim,
                         drop_ratio=args.drop_ratio)
    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True,
                    **shared_params).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True,
                    **shared_params).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print(f'#Params: {sum(p.numel() for p in model.parameters())}')

    # Hoisted: the same decoding closure was previously rebuilt for each of
    # the three eval calls in every epoch.
    arr_to_seq = lambda arr: decode_arr_to_seq(arr, idx2vocab)

    valid_curve = []
    test_curve = []
    train_curve = []
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator,
                          arr_to_seq=arr_to_seq)
        valid_perf = eval(model, device, valid_loader, evaluator,
                          arr_to_seq=arr_to_seq)
        test_perf = eval(model, device, test_loader, evaluator,
                         arr_to_seq=arr_to_seq)
        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    print('F1')
    # Model selection: report the test score at the best-validation epoch.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)
    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        result_dict = {
            'Val': valid_curve[best_val_epoch],
            'Test': test_curve[best_val_epoch],
            'Train': train_curve[best_val_epoch],
            'BestTrain': best_train
        }
        torch.save(result_dict, args.filename)
def main():
    """Train GNN baselines on PCQM4M (PyTorch Geometric).

    Trains the selected GNN regressor, evaluates validation MAE every epoch,
    checkpoints the best model, and optionally writes a test submission file
    and TensorBoard logs.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description="GNN baselines on pcqm4m with Pytorch Geometrics")
    parser.add_argument("--device", type=int, default=0,
                        help="which gpu to use if any (default: 0)")
    parser.add_argument(
        "--gnn", type=str, default="gin-virtual",
        help="GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)",
    )
    parser.add_argument(
        "--graph_pooling", type=str, default="sum",
        help="graph pooling strategy mean or sum (default: sum)",
    )
    parser.add_argument("--drop_ratio", type=float, default=0,
                        help="dropout ratio (default: 0)")
    parser.add_argument(
        "--num_layers", type=int, default=5,
        help="number of GNN message passing layers (default: 5)",
    )
    parser.add_argument(
        "--emb_dim", type=int, default=600,
        help="dimensionality of hidden units in GNNs (default: 600)",
    )
    parser.add_argument("--train_subset", action="store_true")
    parser.add_argument(
        "--batch_size", type=int, default=256,
        help="input batch size for training (default: 256)",
    )
    parser.add_argument(
        "--epochs", type=int, default=100,
        help="number of epochs to train (default: 100)",
    )
    parser.add_argument("--num_workers", type=int, default=0,
                        help="number of workers (default: 0)")
    parser.add_argument("--log_dir", type=str, default="",
                        help="tensorboard log directory")
    parser.add_argument("--checkpoint_dir", type=str, default="",
                        help="directory to save checkpoint")
    parser.add_argument(
        "--save_test_dir", type=str, default="",
        help="directory to save test submission file",
    )
    args = parser.parse_args()
    print(args)

    # Fixed seed for reproducibility. NOTE(review): torch.cuda.manual_seed is
    # called even without CUDA; torch treats it as a no-op in that case.
    np.random.seed(42)
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    random.seed(42)

    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))

    ### automatic dataloading and splitting
    dataset = PygPCQM4MDataset(root="dataset/")

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        # Train on a random 10% subset (used together with the longer
        # 1000-epoch schedule below).
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(split_idx["train"]))[
            :int(subset_ratio * len(split_idx["train"]))]
        train_loader = DataLoader(
            dataset[split_idx["train"][subset_idx]],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
        )
    else:
        train_loader = DataLoader(
            dataset[split_idx["train"]],
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.num_workers,
        )

    valid_loader = DataLoader(
        dataset[split_idx["valid"]],
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )

    # BUG FIX: the original used `args.save_test_dir is not ""` (and the same
    # pattern below). `is` compares object identity, not string equality, so
    # the result depends on CPython string interning and CPython emits a
    # SyntaxWarning. Use `!=` for value comparison.
    if args.save_test_dir != "":
        test_loader = DataLoader(
            dataset[split_idx["test"]],
            batch_size=args.batch_size,
            shuffle=False,
            num_workers=args.num_workers,
        )

    if args.checkpoint_dir != "":
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        "num_layers": args.num_layers,
        "emb_dim": args.emb_dim,
        "drop_ratio": args.drop_ratio,
        "graph_pooling": args.graph_pooling,
    }

    if args.gnn == "gin":
        model = GNN(gnn_type="gin", virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == "gin-virtual":
        model = GNN(gnn_type="gin", virtual_node=True,
                    **shared_params).to(device)
    elif args.gnn == "gcn":
        model = GNN(gnn_type="gcn", virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == "gcn-virtual":
        model = GNN(gnn_type="gcn", virtual_node=True,
                    **shared_params).to(device)
    else:
        raise ValueError("Invalid GNN type")

    num_params = sum(p.numel() for p in model.parameters())
    print(f"#Params: {num_params}")

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != "":
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        # Subset training uses a much longer schedule with slower LR decay.
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print("Training...")
        train_mae = train(model, device, train_loader, optimizer)

        print("Evaluating...")
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({"Train": train_mae, "Validation": valid_mae})

        if args.log_dir != "":
            writer.add_scalar("valid/mae", valid_mae, epoch)
            writer.add_scalar("train/mae", train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != "":
                print("Saving checkpoint...")
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state_dict": scheduler.state_dict(),
                    "best_val_mae": best_valid_mae,
                    "num_params": num_params,
                }
                torch.save(checkpoint,
                           os.path.join(args.checkpoint_dir, "checkpoint.pt"))

            if args.save_test_dir != "":
                # Regenerate the submission whenever validation improves so
                # the saved file always matches the best checkpoint.
                print("Predicting on test data...")
                y_pred = test(model, device, test_loader)
                print("Saving test submission file...")
                evaluator.save_test_submission({"y_pred": y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f"Best validation MAE so far: {best_valid_mae}")

    if args.log_dir != "":
        writer.close()
def main():
    """Run the benchmark ``args.runs`` times and report mean/std accuracy.

    Each run trains a freshly initialized model; in the second half of
    training the model is evaluated every ``args.test_freq`` epochs and the
    best-validation weights are written to ``models/<name>/model-best.pth``.
    """
    seed = args.seed
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    model_save_dir = f'models/{args.name}'
    os.makedirs(model_save_dir, exist_ok=True)

    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() \
        else torch.device("cpu")
    print("Training")
    # writer = SummaryWriter(model_save_dir)

    # Persist the full argument set next to the checkpoints.
    with open(f'{model_save_dir}/arguments.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # (gnn_type, virtual_node) for each GNN variant name.
    gnn_variants = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn in gnn_variants:
            gnn_type, virtual = gnn_variants[args.gnn]
            model = GNN(gnn_type=gnn_type,
                        num_class=dataset.num_classes,
                        num_layer=args.num_layer,
                        emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio,
                        virtual_node=virtual,
                        topological=args.topological).to(device)
        elif args.gnn == 'controller':
            model = ControllerTransformer().to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)

            # Only evaluate in the second half of training, every
            # test_freq epochs, plus always on the final epoch.
            if epoch > args.epochs // 2 and epoch % args.test_freq == 0 \
                    or epoch == args.epochs:  # 4min
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                _, val, tst = (train_perf[dataset.eval_metric],
                               valid_perf[dataset.eval_metric],
                               test_perf[dataset.eval_metric])
                if val > best_val:
                    # New best validation score: snapshot the weights and
                    # remember the matching test score.
                    torch.save(model.state_dict(),
                               os.path.join(model_save_dir, 'model-best.pth'))
                    best_val, final_test = val, tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main():
    """Train GNN baselines on PCQM4M with PaddlePaddle (PGL).

    Trains the selected GNN regressor, evaluates validation MAE every epoch,
    saves the best parameters, and optionally writes a test submission file
    and TensorBoard logs.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with PGL')
    parser.add_argument('--use_cuda', action='store_true')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=1,
                        help='number of workers (default: 1)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    # Fixed seed for reproducibility.
    random.seed(42)
    np.random.seed(42)
    paddle.seed(42)

    if not args.use_cuda:
        paddle.set_device("cpu")

    ### automatic dataloading and splitting
    class Config():
        """Minimal config object expected by MolDataset."""

        def __init__(self):
            self.base_data_path = "./dataset"

    config = Config()
    ds = MolDataset(config)

    split_idx = ds.get_idx_split()
    train_ds = Subset(ds, split_idx['train'])
    valid_ds = Subset(ds, split_idx['valid'])
    test_ds = Subset(ds, split_idx['test'])

    print("Train exapmles: ", len(train_ds))
    print("Valid exapmles: ", len(valid_ds))
    print("Test exapmles: ", len(test_ds))

    ### automatic evaluator. takes dataset name as input
    evaluator = PCQM4MEvaluator()

    train_loader = Dataloader(train_ds,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())
    valid_loader = Dataloader(valid_ds,
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=CollateFn())

    # BUG FIX: the original used `args.save_test_dir is not ''` (and the same
    # pattern below). `is` checks object identity, not string equality, so the
    # outcome depends on string interning; use `!=` for value comparison.
    if args.save_test_dir != '':
        test_loader = Dataloader(test_ds,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=CollateFn())

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False, **shared_params)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True, **shared_params)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False, **shared_params)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True, **shared_params)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.001,
                                              step_size=300,
                                              gamma=0.25)
    optimizer = paddle.optimizer.Adam(learning_rate=scheduler,
                                      parameters=model.parameters())

    msg = "ogbg_lsc_paddle_baseline\n"
    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, train_loader, optimizer)

        print('Evaluating...')
        valid_mae = eval(model, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                paddle.save(
                    model.state_dict(),
                    os.path.join(args.checkpoint_dir, 'checkpoint.pdparams'))

            if args.save_test_dir != '':
                print('Predicting on test data...')
                y_pred = test(model, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

        # BUG FIX: a bare `except:` here also swallowed KeyboardInterrupt and
        # SystemExit; narrow it so only ordinary errors in the best-effort log
        # formatting are ignored.
        try:
            msg += "Epoch: %d | Train: %.6f | Valid: %.6f | Best Valid: %.6f\n" \
                % (epoch, train_mae, valid_mae, best_valid_mae)
            print(msg)
        except Exception:
            continue

    if args.log_dir != '':
        writer.close()
def main():
    """Run the OGB molecule benchmark for ``args.runs`` runs.

    Each run trains a freshly initialized model, evaluates train/valid/test
    in the second half of training (every ``args.test_freq`` epochs and at
    the final epoch), writes per-run results to ``results/``, and finally
    reports mean/std of the best-validation and matching test scores.
    """
    device = torch.device(
        "cuda:" + str(args.device)) if torch.cuda.is_available() \
        else torch.device("cpu")

    write_file_name = 'results/result_'

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        # Arguments common to every model variant.
        base_kwargs = dict(num_tasks=dataset.num_tasks,
                           num_layer=args.num_layer,
                           emb_dim=args.emb_dim,
                           drop_ratio=args.drop_ratio)
        if args.gnn == 'gin':
            model = GNN(gnn_type='gin', virtual_node=False,
                        **base_kwargs).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin', virtual_node=True,
                        **base_kwargs).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn', virtual_node=False,
                        **base_kwargs).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn', virtual_node=True,
                        **base_kwargs).to(device)
        elif args.gnn == 'randomgin':
            model = GNN(gnn_type='randomgin',
                        drop_path_p=args.drop_path_p,
                        virtual_node=False,
                        **base_kwargs).to(device)
        elif args.gnn == 'randomgin-virtual':
            model = GNN(gnn_type='randomgin',
                        JK=args.JK,
                        drop_path_p=args.drop_path_p,
                        virtual_node=True,
                        **base_kwargs).to(device)
        else:
            raise ValueError('Invalid GNN type')

        tot_params = sum(p.numel() for p in model.parameters()
                         if p.requires_grad)
        print("No. params: %d" % (tot_params, ))

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        def _evaluate_all():
            # Evaluate and print all three splits; returns the perf dicts.
            perf = {
                'Train': eval(model, device, train_loader, evaluator),
                'Validation': eval(model, device, valid_loader, evaluator),
                'Test': eval(model, device, test_loader, evaluator)
            }
            print(perf)
            return perf

        for epoch in range(1, args.epochs + 1):
            print("=====Epoch {}".format(epoch))
            print('Training...')
            loss = train(model, device, train_loader, optimizer,
                         dataset.task_type, args)

            # Only evaluate in the second half of training, every
            # test_freq epochs, plus always on the final epoch.
            if epoch > args.epochs // 2 and epoch % args.test_freq == 0 \
                    or epoch == args.epochs:
                print('Evaluating...')
                perf = _evaluate_all()
                val = perf['Validation'][dataset.eval_metric]
                tst = perf['Test'][dataset.eval_metric]
                if val > best_val:
                    best_val, final_test = val, tst

            if epoch == 1:
                # Extra sanity-check evaluation after the very first epoch.
                print('Evaluating...')
                _evaluate_all()

        print(f'Run{run} val:{best_val}, test:{final_test}')
        with open(write_file_name + '_' + args.JK + '_run' + str(run) +
                  '.txt', 'w') as f:
            f.write("""Run: {}\nVal {:.4f}\nTest: {:.4f}\n\n\n""".format(
                run, best_val, final_test))
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main():
    """Train GNN baselines on PCQM4M with DGL, with checkpoint resume.

    Trains the selected GNN regressor, evaluates validation MAE every epoch,
    checkpoints the best model, and optionally writes a test submission file
    and TensorBoard logs. If a checkpoint already exists in
    ``--checkpoint_dir``, training resumes from it.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on pcqm4m with DGL')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed to use (default: 42)')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help='GNN to use, which can be from '
        '[gin, gin-virtual, gcn, gcn-virtual] (default: gin-virtual)')
    parser.add_argument(
        '--graph_pooling', type=str, default='sum',
        help='graph pooling strategy mean or sum (default: sum)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=600,
        help='dimensionality of hidden units in GNNs (default: 600)')
    parser.add_argument('--train_subset', action='store_true',
                        help='use 10% of the training set for training')
    parser.add_argument('--batch_size', type=int, default=256,
                        help='input batch size for training (default: 256)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--log_dir', type=str, default="",
                        help='tensorboard log directory. If not specified, '
                        'tensorboard will not be used.')
    parser.add_argument('--checkpoint_dir', type=str, default='',
                        help='directory to save checkpoint')
    parser.add_argument('--save_test_dir', type=str, default='',
                        help='directory to save test submission file')
    args = parser.parse_args()
    print(args)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = SampleDglPCQM4MDataset(root='dataset/')

    # split_idx['train'], split_idx['valid'], split_idx['test']
    # separately gives a 1D int64 tensor
    split_idx = dataset.get_idx_split()
    split_idx["train"] = split_idx["train"].type(torch.LongTensor)
    split_idx["test"] = split_idx["test"].type(torch.LongTensor)
    split_idx["valid"] = split_idx["valid"].type(torch.LongTensor)

    ### automatic evaluator.
    evaluator = PCQM4MEvaluator()

    if args.train_subset:
        # Train on a random 10% subset (paired with the 1000-epoch
        # schedule configured below).
        subset_ratio = 0.1
        subset_idx = torch.randperm(len(split_idx["train"]))[
            :int(subset_ratio * len(split_idx["train"]))]
        train_loader = DataLoader(dataset[split_idx["train"][subset_idx]],
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_dgl)
    else:
        train_loader = DataLoader(dataset[split_idx["train"]],
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_dgl)

    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collate_dgl)

    # BUG FIX: the original used `args.save_test_dir is not ''` (and the same
    # pattern in four more places below). `is` compares object identity, not
    # string equality, so the result depends on CPython interning and CPython
    # emits a SyntaxWarning. Use `!=` for value comparison.
    if args.save_test_dir != '':
        test_loader = DataLoader(dataset[split_idx["test"]],
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_dgl)

    if args.checkpoint_dir != '':
        os.makedirs(args.checkpoint_dir, exist_ok=True)

    shared_params = {
        'num_layers': args.num_layers,
        'emb_dim': args.emb_dim,
        'drop_ratio': args.drop_ratio,
        'graph_pooling': args.graph_pooling
    }

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', virtual_node=True,
                    **shared_params).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', virtual_node=False,
                    **shared_params).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', virtual_node=True,
                    **shared_params).to(device)
    elif args.gnn == 'gin-virtual-diffpool':
        model = DiffPoolGNN(gnn_type='gin', virtual_node=True,
                            **shared_params).to(device)
    elif args.gnn == 'gin-virtual-bayes-diffpool':
        model = BayesDiffPoolGNN(gnn_type='gin', virtual_node=True,
                                 **shared_params).to(device)
    else:
        raise ValueError('Invalid GNN type')

    num_params = sum(p.numel() for p in model.parameters())
    print(f'#Params: {num_params}')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    if args.log_dir != '':
        writer = SummaryWriter(log_dir=args.log_dir)

    best_valid_mae = 1000

    if args.train_subset:
        scheduler = StepLR(optimizer, step_size=300, gamma=0.25)
        args.epochs = 1000
    else:
        scheduler = StepLR(optimizer, step_size=30, gamma=0.25)

    """ load from latest checkpoint """
    # start epoch (default = 1, unless resuming training)
    firstEpoch = 1
    # check if checkpoint exist -> load model
    checkpointFile = os.path.join(args.checkpoint_dir, 'checkpoint.pt')
    if os.path.exists(checkpointFile):
        # load checkpoint file
        checkpointData = torch.load(checkpointFile)
        # NOTE(review): the stored epoch is the one that produced the
        # checkpoint, so resuming re-runs that epoch once — confirm whether
        # `checkpointData["epoch"] + 1` was intended.
        firstEpoch = checkpointData["epoch"]
        model.load_state_dict(checkpointData["model_state_dict"])
        optimizer.load_state_dict(checkpointData["optimizer_state_dict"])
        scheduler.load_state_dict(checkpointData["scheduler_state_dict"])
        best_valid_mae = checkpointData["best_val_mae"]
        num_params = checkpointData["num_params"]
        print(
            "Loaded existing weights from {}. Continuing from epoch: {} with best valid MAE: {}"
            .format(checkpointFile, firstEpoch, best_valid_mae))

    for epoch in range(firstEpoch, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train_mae = train(model, device, train_loader, optimizer, args.gnn)

        print('Evaluating...')
        valid_mae = eval(model, device, valid_loader, evaluator)

        print({'Train': train_mae, 'Validation': valid_mae})

        if args.log_dir != '':
            writer.add_scalar('valid/mae', valid_mae, epoch)
            writer.add_scalar('train/mae', train_mae, epoch)

        if valid_mae < best_valid_mae:
            best_valid_mae = valid_mae
            if args.checkpoint_dir != '':
                print('Saving checkpoint...')
                checkpoint = {
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                    'best_val_mae': best_valid_mae,
                    'num_params': num_params
                }
                torch.save(checkpoint,
                           os.path.join(args.checkpoint_dir, 'checkpoint.pt'))

            if args.save_test_dir != '':
                # Regenerate the submission whenever validation improves so
                # the saved file always matches the best checkpoint.
                print('Predicting on test data...')
                y_pred = test(model, device, test_loader)
                print('Saving test submission file...')
                evaluator.save_test_submission({'y_pred': y_pred},
                                               args.save_test_dir)

        scheduler.step()

        print(f'Best validation MAE so far: {best_valid_mae}')

    if args.log_dir != '':
        writer.close()
def train(seed: int) -> dict:
    """Run 5-fold cross-validation training of the GNN tagger.

    Returns the mean per-label test accuracy across the five folds.

    Relies on the module-level ``args`` namespace (``lr``, ``wd``,
    ``epochs``, ``patience``, ``output``, ``globalnode``) and requires CUDA.

    NOTE(review): ``output_file`` is opened here but never closed; its
    contents are only guaranteed flushed at interpreter exit.
    """
    print('random seed:', seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.enabled = False
    dataset = read_data('../data', '../graph')
    label2id = dataset.label2id
    print(label2id)
    vocab_size = dataset.vocab_size
    output_dim = len(label2id)

    def acc_to_str(acc):
        # Format a {label: accuracy} dict as "{label:0.123, ...}" for logging.
        s = ['%s:%.3f' % (label, acc[label]) for label in acc]
        return '{' + ', '.join(s) + '}'

    # Per-label test accuracies collected over the 5 folds. The 'O' label
    # (presumably an "outside"/background tag — TODO confirm) is never scored.
    cross_res = {label: [] for label in label2id if label != 'O'}
    # Log of misclassified words for later inspection.
    output_file = open('%s.mistakes' % args.output, 'w')
    for cross_valid in range(5):
        model = GNN(vocab_size=vocab_size, output_dim=output_dim, args=args)
        model.cuda()
        # print vocab_size
        # 80/10/10 split, rotated by the fold index for cross-validation.
        dataset.split_train_valid_test([0.8, 0.1, 0.1], 5, cross_valid)
        print('train:', len(dataset.train), 'valid:', len(dataset.valid),
              'test:', len(dataset.test))

        def evaluate(model, datalist, output_file=None):
            # Per-label accuracy: for each label, predict the single position
            # with the highest masked probability of that label; count it
            # correct when the gold tag at that position matches. Mistakes
            # are appended to `output_file` when one is given.
            if output_file != None:
                output_file.write(
                    '#############################################\n')
            correct = {label: 0 for label in label2id if label != 'O'}
            total = len(datalist)
            model.eval()
            print_cnt = 0  # unused
            for data in datalist:
                word, feat = Variable(data.input_word).cuda(), Variable(
                    data.input_feat).cuda()
                # a_ud / a_lr look like up-down and left-right adjacency
                # tensors — TODO confirm against read_data.
                a_ud, a_lr = Variable(
                    data.a_ud, requires_grad=False).cuda(), Variable(
                        data.a_lr, requires_grad=False).cuda()
                mask = Variable(data.mask, requires_grad=False).cuda()
                if args.globalnode:
                    # Global-node variant also returns a form prediction,
                    # which is ignored during evaluation.
                    logprob, form = model(word, feat, mask, a_ud, a_lr)
                    logprob = logprob.data.view(-1, output_dim)
                else:
                    logprob = model(word, feat, mask, a_ud,
                                    a_lr).data.view(-1, output_dim)
                mask = mask.data.view(-1)
                y_pred = torch.LongTensor(output_dim)
                for i in range(output_dim):
                    # Most probable position for label i among unmasked slots.
                    prob = logprob[:, i].exp() * mask
                    y_pred[i] = prob.topk(k=1)[1][0]
                # y_pred = logprob.topk(k=1,dim=0)[1].view(-1)
                for label in label2id:
                    if label == 'O':
                        continue
                    labelid = label2id[label]
                    if data.output.view(-1)[y_pred[labelid]] == labelid:
                        correct[label] += 1
                    else:
                        if output_file != None:
                            # Recover the mispredicted word's text from the
                            # flat index for the mistakes log.
                            num_sent, sent_len, word_len = data.input_word.size(
                            )
                            id = y_pred[label2id[label]]
                            word = data.words[data.sents[int(
                                id / sent_len)][id % sent_len]]
                            output_file.write(
                                '%d %d %s %s\n' %
                                (data.set_id, data.fax_id, label, word))
            return {label: float(correct[label]) / total for label in correct}

        batch = 1  # gradient-accumulation steps; 1 == update every example
        # Class weights: every non-'O' label weighted 10x relative to 'O'.
        weight = torch.zeros(len(label2id))
        for label, id in label2id.items():
            weight[id] = 1 if label == 'O' else 10
        # reduce=False keeps per-token losses so they can be masked below
        # (deprecated spelling; modern PyTorch uses reduction='none').
        loss_function = nn.NLLLoss(weight.cuda(), reduce=False)
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            model.parameters()),
                                     lr=args.lr / float(batch),
                                     weight_decay=args.wd)
        # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.8)
        best_acc = -1
        wait = 0  # epochs since last improvement (early-stopping counter)
        for epoch in range(args.epochs):
            sum_loss = 0
            model.train()
            # random.shuffle(dataset.train)
            for idx, data in enumerate(dataset.train):
                word, feat = Variable(data.input_word).cuda(), Variable(
                    data.input_feat).cuda()
                a_ud, a_lr = Variable(
                    data.a_ud, requires_grad=False).cuda(), Variable(
                        data.a_lr, requires_grad=False).cuda()
                mask = Variable(data.mask, requires_grad=False).cuda()
                true_output = Variable(data.output).cuda()
                if args.globalnode:
                    logprob, form = model(word, feat, mask, a_ud, a_lr)
                else:
                    logprob = model(word, feat, mask, a_ud, a_lr)
                # Masked token-level NLL, averaged over all positions.
                loss = torch.mean(
                    mask.view(-1) *
                    loss_function(logprob.view(-1, output_dim),
                                  true_output.view(-1)))
                if args.globalnode:
                    # Auxiliary form/set-id classification loss (weight 0.1).
                    true_form = Variable(torch.LongTensor([data.set_id - 1
                                                           ])).cuda()
                    loss = loss + 0.1 * F.nll_loss(form, true_form)
                sum_loss += loss.data.sum()
                loss.backward()
                # Optimizer step every `batch` examples and at the epoch end.
                if (idx + 1) % batch == 0 or idx + 1 == len(dataset.train):
                    optimizer.step()
                    optimizer.zero_grad()
            train_acc = evaluate(model, dataset.train)
            valid_acc = evaluate(model, dataset.valid)
            test_acc = evaluate(model, dataset.test)
            print('Epoch %d: Train Loss: %.3f Train: %s Valid: %s Test: %s' \
                % (epoch, sum_loss, acc_to_str(train_acc),
                   acc_to_str(valid_acc), acc_to_str(test_acc)))
            # scheduler.step()
            # Model-selection criterion: sum of log per-label validation
            # accuracies (i.e. log of their product).
            acc = np.log(list(valid_acc.values())).sum()
            if epoch < 6:
                continue  # warm-up: no checkpointing or early stopping yet
            if acc >= best_acc:
                torch.save(model.state_dict(), args.output + '.model')
            wait = 0 if acc > best_acc else wait + 1
            best_acc = max(acc, best_acc)
            if wait >= args.patience:
                break  # early stopping
        # Reload the best checkpoint and score this fold's test split,
        # logging the mistakes.
        model.load_state_dict(torch.load(args.output + '.model'))
        test_acc = evaluate(model, dataset.test, output_file=output_file)
        print('########', acc_to_str(test_acc))
        for label in test_acc:
            cross_res[label].append(test_acc[label])
    print("Cross Validation Result:")
    for label in cross_res:
        # Replace each fold list with its mean accuracy.
        cross_res[label] = np.mean(cross_res[label])
    print(acc_to_str(cross_res))
    return cross_res
def main():
    """Train a GNN baseline on ogbg-ppi and report the test accuracy at the
    epoch with the best validation accuracy.

    Bug fix: ``--num_layer`` was parsed but never forwarded to ``GNN`` (the
    model silently fell back to its constructor default), unlike the sibling
    ogbg-ppa script in this file. It is now passed through.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument(
        '--gnn', type=str, default='gin-virtual',
        help=
        'GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument(
        '--num_layer', type=int, default=5,
        help='number of GNN message passing layers (default: 5)')
    parser.add_argument(
        '--emb_dim', type=int, default=300,
        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)

    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size,
                              shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    # Map the --gnn flag onto the two GNN constructor knobs instead of four
    # near-identical branches.
    gnn_variants = {
        'gin': ('gin', False),
        'gin-virtual': ('gin', True),
        'gcn': ('gcn', False),
        'gcn-virtual': ('gcn', True),
    }
    if args.gnn not in gnn_variants:
        raise ValueError('Invalid GNN type')
    gnn_type, virtual_node = gnn_variants[args.gnn]
    model = GNN(
        gnn_type=gnn_type,
        num_class=37,  # ogbg-ppi is a 37-way classification task
        num_layer=args.num_layer,  # bug fix: flag was previously ignored
        emb_dim=args.emb_dim,
        drop_ratio=args.drop_ratio,
        virtual_node=virtual_node).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({
            'Train': train_perf,
            'Validation': valid_perf,
            'Test': test_perf
        })

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    # Model selection: report the test score at the best-validation epoch.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if args.filename != '':
        torch.save(
            {
                'Val': valid_curve[best_val_epoch],
                'Test': test_curve[best_val_epoch],
                'Train': train_curve[best_val_epoch],
                'BestTrain': best_train
            }, args.filename)
def main():
    """Train a GNN baseline (GIN/GCN, optionally with a virtual node) on
    ogbg-ppa, then report the test score at the best-validation epoch and
    optionally persist the result curves with ``torch.save``.
    """
    # Training settings
    parser = argparse.ArgumentParser(
        description="GNN baselines on ogbg-ppa data with Pytorch Geometrics"
    )
    parser.add_argument("--device", type=int, default=0,
                        help="which gpu to use if any (default: 0)")
    parser.add_argument("--gnn", type=str, default="gin-virtual",
                        help="GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)")
    parser.add_argument("--drop_ratio", type=float, default=0.5,
                        help="dropout ratio (default: 0.5)")
    parser.add_argument("--num_layer", type=int, default=5,
                        help="number of GNN message passing layers (default: 5)")
    parser.add_argument("--emb_dim", type=int, default=300,
                        help="dimensionality of hidden units in GNNs (default: 300)")
    parser.add_argument("--batch_size", type=int, default=32,
                        help="input batch size for training (default: 32)")
    parser.add_argument("--epochs", type=int, default=100,
                        help="number of epochs to train (default: 100)")
    parser.add_argument("--num_workers", type=int, default=0,
                        help="number of workers (default: 0)")
    parser.add_argument("--dataset", type=str, default="ogbg-ppa",
                        help="dataset name (default: ogbg-ppa)")
    parser.add_argument("--filename", type=str, default="",
                        help="filename to output result (default: )")
    args = parser.parse_args()

    if torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    def make_loader(split, shuffle):
        # One DataLoader per OGB split, sharing batch size and worker count.
        return DataLoader(
            dataset[split_idx[split]],
            batch_size=args.batch_size,
            shuffle=shuffle,
            num_workers=args.num_workers,
        )

    train_loader = make_loader("train", True)
    valid_loader = make_loader("valid", False)
    test_loader = make_loader("test", False)

    # Decode the --gnn flag into the (gnn_type, virtual_node) pair.
    variants = {
        "gin": ("gin", False),
        "gin-virtual": ("gin", True),
        "gcn": ("gcn", False),
        "gcn-virtual": ("gcn", True),
    }
    if args.gnn not in variants:
        raise ValueError("Invalid GNN type")
    gnn_type, virtual_node = variants[args.gnn]
    model = GNN(
        gnn_type=gnn_type,
        num_class=dataset.num_classes,
        num_layer=args.num_layer,
        emb_dim=args.emb_dim,
        drop_ratio=args.drop_ratio,
        virtual_node=virtual_node,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_curve, valid_curve, test_curve = [], [], []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print("Training...")
        train(model, device, train_loader, optimizer)

        print("Evaluating...")
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({"Train": train_perf, "Validation": valid_perf, "Test": test_perf})

        # Track the dataset's own metric (e.g. accuracy) per epoch.
        metric = dataset.eval_metric
        train_curve.append(train_perf[metric])
        valid_curve.append(valid_perf[metric])
        test_curve.append(test_perf[metric])

    # Model selection: epoch with the best validation score.
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print("Finished training!")
    print("Best validation score: {}".format(valid_curve[best_val_epoch]))
    print("Test score: {}".format(test_curve[best_val_epoch]))

    if args.filename != "":
        torch.save(
            {
                "Val": valid_curve[best_val_epoch],
                "Test": test_curve[best_val_epoch],
                "Train": train_curve[best_val_epoch],
                "BestTrain": best_train,
            },
            args.filename,
        )