def load_dataset():
    '''
    Load raw datasets.

    :return: a list of networkx/deepsnap graphs, plus additional info if needed
    '''
    format = cfg.dataset.format
    name = cfg.dataset.name
    # dataset_dir = '{}/{}'.format(cfg.dataset.dir, name)
    dataset_dir = cfg.dataset.dir

    # Try to load customized data format
    for func in register.loader_dict.values():
        graphs = func(format, name, dataset_dir)
        if graphs is not None:
            return graphs

    # Load from PyTorch Geometric dataset
    if format == 'PyG':
        graphs = load_pyg(name, dataset_dir)
    # Load from networkx formatted data
    # todo: clean nx dataloader
    elif format == 'nx':
        graphs = load_nx(name, dataset_dir)
    # Load from OGB formatted data
    elif cfg.dataset.format == 'OGB':
        if cfg.dataset.name == 'ogbg-molhiv':
            dataset = PygGraphPropPredDataset(name=cfg.dataset.name)
            graphs = GraphDataset.pyg_to_graphs(dataset)
            # Note: this is only used for custom splits from OGB
            split_idx = dataset.get_idx_split()
            return graphs, split_idx
        raise ValueError('Unsupported OGB dataset: {}'.format(cfg.dataset.name))
    else:
        raise ValueError('Unknown data format: {}'.format(cfg.dataset.format))
    return graphs
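
# A hedged sketch of how a custom loader could be registered so that the loop
# over register.loader_dict in load_dataset() above picks it up. The 'custom'
# format tag and the my_read_graphs() helper are illustrative assumptions,
# not part of the original code.
def load_my_graphs(format, name, dataset_dir):
    if format != 'custom':
        return None  # fall through to the built-in PyG/nx/OGB loaders
    return my_read_graphs(dataset_dir, name)  # hypothetical helper


register.loader_dict['my_loader'] = load_my_graphs
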
def get_molhiv():
    path = osp.dirname(osp.realpath(__file__))
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv', root=path)
    split_idx = dataset.get_idx_split()
    max_num_nodes = torch.tensor(dataset.data.num_nodes).max().item()
    return (dataset[split_idx["train"]], dataset[split_idx["valid"]],
            dataset[split_idx["test"]], max_num_nodes)
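
# A minimal usage sketch for get_molhiv() above; the batch size and the
# torch_geometric DataLoader import are assumptions, not part of the snippet.
from torch_geometric.loader import DataLoader

train_set, valid_set, test_set, max_num_nodes = get_molhiv()
print(f'train={len(train_set)} valid={len(valid_set)} '
      f'test={len(test_set)} max_nodes={max_num_nodes}')
train_loader = DataLoader(train_set, batch_size=32, shuffle=True)
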
def main():
    args = ArgsInit().args

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    print(args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_result = eval(model, device, train_loader, evaluator)[dataset.eval_metric]
    valid_result = eval(model, device, valid_loader, evaluator)[dataset.eval_metric]
    test_result = eval(model, device, test_loader, evaluator)[dataset.eval_metric]

    print({'Train': train_result,
           'Validation': valid_result,
           'Test': test_result})
    model.print_params(final=True)
def setup(self, stage: Optional[str] = None):
    """Load data. Set variables: self.data_train, self.data_val, self.data_test."""
    if not self.data_train and not self.data_val and not self.data_test:
        dataset = PygGraphPropPredDataset(name="ogbg-molpcba",
                                          root=self.data_dir,
                                          transform=self.transform)
        split_idx = dataset.get_idx_split()
        self.data_train = dataset[split_idx["train"]]
        self.data_val = dataset[split_idx["valid"]]
        self.data_test = dataset[split_idx["test"]]
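
# A hedged sketch of the dataloader hooks that would typically accompany the
# setup() above in a LightningDataModule; the batch_size and num_workers
# attributes are assumptions about the surrounding class, not shown here.
def train_dataloader(self):
    return DataLoader(self.data_train, batch_size=self.batch_size,
                      shuffle=True, num_workers=self.num_workers)

def val_dataloader(self):
    return DataLoader(self.data_val, batch_size=self.batch_size,
                      shuffle=False, num_workers=self.num_workers)
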
def mol_pred_GNN_prepare(batch_size=50):
    dataset_name = 'ogbg-molhiv'
    dataset = PygGraphPropPredDataset(name=dataset_name)
    evaluator = Evaluator(name=dataset_name)
    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=batch_size, shuffle=True)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=batch_size, shuffle=False)
    return train_loader, valid_loader, test_loader, evaluator
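
# A minimal usage sketch for mol_pred_GNN_prepare() above; only the
# loader/evaluator plumbing comes from that function, the batch inspection
# is illustrative.
train_loader, valid_loader, test_loader, evaluator = mol_pred_GNN_prepare(batch_size=50)
batch = next(iter(train_loader))
# each batch is a torch_geometric Batch with x, edge_index, edge_attr and y
print(batch.num_graphs, batch.x.shape, batch.y.shape)
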
def __init__(self, train):
    super(Mol_pred_DNN_dataset, self).__init__()
    self.train = train
    dataset_name = 'ogbg-molhiv'
    mol_origin_dataset = PygGraphPropPredDataset(name=dataset_name)
    evaluator = Evaluator(name=dataset_name)
    split_idx = mol_origin_dataset.get_idx_split()
    if self.train:
        self.mol_origin_dataset = mol_origin_dataset[split_idx["train"]]
    else:
        self.mol_origin_dataset = mol_origin_dataset[split_idx["test"]]
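
# A hedged sketch of the __len__/__getitem__ methods a torch Dataset subclass
# like Mol_pred_DNN_dataset would need; they are not shown in the original
# snippet, and the returned fields are assumptions.
def __len__(self):
    return len(self.mol_origin_dataset)

def __getitem__(self, idx):
    data = self.mol_origin_dataset[idx]
    return data.x, data.edge_index, data.y
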
def main():
    args = ArgsInit().args

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    else:
        extract_node_feature_func = partial(extract_node_feature, reduce=args.aggr)
        dataset = PygGraphPropPredDataset(name=args.dataset,
                                          transform=extract_node_feature_func)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    train_accuracy = eval(model, device, train_loader, evaluator)
    valid_accuracy = eval(model, device, valid_loader, evaluator)
    test_accuracy = eval(model, device, test_loader, evaluator)

    print({'Train': train_accuracy,
           'Validation': valid_accuracy,
           'Test': test_accuracy})
    model.print_params(final=True)
def train_dataloader(self):
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    train_data = dataset[split_idx["train"]]
    train_loader = DataLoader(train_data,
                              batch_size=self.configuration["batch_size"],
                              shuffle=True,
                              num_workers=self.configuration["num_workers"])
    self._train_data = train_data
    self._train_loader = train_loader
    return train_loader
def mol_data(root, dataset, batch_size=32, num_workers=4):
    dataset = PygGraphPropPredDataset(name=f"ogbg-mol{dataset}", root=root)
    split_idx = dataset.get_idx_split()
    loaders = dict()
    for split in ["train", "valid", "test"]:
        loaders[split] = DataLoader(
            dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=(split == "train"),
            num_workers=num_workers,
        )
    return loaders
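
# A usage sketch for mol_data() above; the "data" root and the "hiv" suffix
# (expanding to ogbg-molhiv) are assumptions.
loaders = mol_data(root="data", dataset="hiv", batch_size=32, num_workers=0)
for split, loader in loaders.items():
    print(split, len(loader.dataset))
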
def val_dataloader(self):
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    val_data = dataset[split_idx["valid"]]
    validation_loader = DataLoader(val_data,
                                   batch_size=self.configuration["batch_size"],
                                   shuffle=False,
                                   num_workers=self.configuration["num_workers"])
    self._validation_data = val_data
    self._validation_loader = validation_loader
    return validation_loader
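
# A hedged sketch of the matching test_dataloader hook, mirroring the two
# methods above; it is not part of the original snippets.
def test_dataloader(self):
    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    test_data = dataset[split_idx["test"]]
    test_loader = DataLoader(test_data,
                             batch_size=self.configuration["batch_size"],
                             shuffle=False,
                             num_workers=self.configuration["num_workers"])
    self._test_data = test_data
    self._test_loader = test_loader
    return test_loader
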
def load_graphs(ogb_name):
    dataset = PygGraphPropPredDataset(ogb_name, root='data', transform=preproc)
    out_dim = dataset[0].y.shape[1]
    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = (split_idx["train"], split_idx["valid"],
                                      split_idx["test"])

    print("Preprocessing Graphs...")
    train_graphs = list(tqdm(dataset[train_idx]))
    # drop training graphs without any edges
    train_graphs = [d for d in train_graphs if d.num_edges > 0]
    valid_graphs = list(dataset[valid_idx])
    test_graphs = list(dataset[test_idx])
    return out_dim, train_graphs, valid_graphs, test_graphs
def code_data(
    root,
    batch_size=128,
    num_vocab=VOCAB_SIZE,
    seq_len=SEQ_LEN,
    use_old_code_dataset=False,
):
    dataset = PygGraphPropPredDataset(
        "ogbg-code" if use_old_code_dataset else "ogbg-code2", root=root)
    split_idx = dataset.get_idx_split()
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx["train"]], num_vocab)
    dataset.transform = transforms.Compose(
        [augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, seq_len)])
    loaders = dict()
    for split in ["train", "valid", "test"]:
        loaders[split] = DataLoader(
            dataset[split_idx[split]],
            batch_size=batch_size,
            shuffle=(split == "train"),
            num_workers=2,
        )
    return loaders, idx2vocab
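
# A usage sketch for code_data() above; the "data" root is an assumption, and
# y_arr is the per-graph token-index target added by encode_y_to_arr.
loaders, idx2vocab = code_data(root="data", batch_size=128)
print(len(idx2vocab), "vocabulary entries")
batch = next(iter(loaders["train"]))
print(batch.y_arr.shape)  # [num_graphs_in_batch, seq_len]
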
def run(rank, world_size: int, dataset_name: str, root: str):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group('nccl', rank=rank, world_size=world_size)

    dataset = Dataset(dataset_name, root,
                      pre_transform=T.ToSparseTensor(attr='edge_attr'))
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(dataset_name)

    train_dataset = dataset[split_idx['train']]
    train_sampler = DistributedSampler(train_dataset, num_replicas=world_size,
                                       rank=rank)
    train_loader = DataLoader(train_dataset, batch_size=128,
                              sampler=train_sampler)

    torch.manual_seed(12345)
    model = GIN(128, dataset.num_tasks, num_layers=3, dropout=0.5).to(rank)
    model = DistributedDataParallel(model, device_ids=[rank])
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.BCEWithLogitsLoss()

    if rank == 0:
        val_loader = DataLoader(dataset[split_idx['valid']], batch_size=256)
        test_loader = DataLoader(dataset[split_idx['test']], batch_size=256)

    for epoch in range(1, 51):
        model.train()
        total_loss = 0
        for data in train_loader:
            data = data.to(rank)
            optimizer.zero_grad()
            logits = model(data.x, data.adj_t, data.batch)
            loss = criterion(logits, data.y.to(torch.float))
            loss.backward()
            optimizer.step()
            total_loss += float(loss) * logits.size(0)
        loss = total_loss / len(train_loader.dataset)

        dist.barrier()

        if rank == 0:  # We evaluate on a single GPU for now.
            model.eval()

            y_pred, y_true = [], []
            for data in val_loader:
                data = data.to(rank)
                with torch.no_grad():
                    y_pred.append(model.module(data.x, data.adj_t, data.batch))
                y_true.append(data.y)
            val_rocauc = evaluator.eval({
                'y_pred': torch.cat(y_pred, dim=0),
                'y_true': torch.cat(y_true, dim=0),
            })['rocauc']

            y_pred, y_true = [], []
            for data in test_loader:
                data = data.to(rank)
                with torch.no_grad():
                    y_pred.append(model.module(data.x, data.adj_t, data.batch))
                y_true.append(data.y)
            test_rocauc = evaluator.eval({
                'y_pred': torch.cat(y_pred, dim=0),
                'y_true': torch.cat(y_true, dim=0),
            })['rocauc']

            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, '
                  f'Val: {val_rocauc:.4f}, Test: {test_rocauc:.4f}')

        dist.barrier()

    dist.destroy_process_group()
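
# A hedged launcher sketch for the DDP run() above, following the usual
# torch.multiprocessing pattern; the dataset name and root path are
# assumptions, and Dataset is whatever class run() was written against
# (e.g. PygGraphPropPredDataset).
import torch
import torch.multiprocessing as mp

if __name__ == '__main__':
    world_size = torch.cuda.device_count()
    mp.spawn(run, args=(world_size, 'ogbg-molhiv', 'data/OGB'),
             nprocs=world_size, join=True)
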
def main():
    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    write_file_name = 'results/result_'

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True).to(device)
        elif args.gnn == 'randomgin':
            model = GNN(gnn_type='randomgin', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, drop_path_p=args.drop_path_p,
                        virtual_node=False).to(device)
        elif args.gnn == 'randomgin-virtual':
            model = GNN(gnn_type='randomgin', num_tasks=dataset.num_tasks,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        JK=args.JK, drop_ratio=args.drop_ratio,
                        drop_path_p=args.drop_path_p, virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        tot_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        print("No. params: %d" % (tot_params, ))

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            print("=====Epoch {}".format(epoch))
            print('Training...')
            loss = train(model, device, train_loader, optimizer,
                         dataset.task_type, args)

            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) \
                    or epoch == args.epochs:
                print('Evaluating...')
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                print({'Train': train_perf,
                       'Validation': valid_perf,
                       'Test': test_perf})

                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

            if epoch == 1:
                print('Evaluating...')
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)
                print({'Train': train_perf,
                       'Validation': valid_perf,
                       'Test': test_perf})

        print(f'Run{run} val:{best_val}, test:{final_test}')
        with open(write_file_name + '_' + args.JK + '_run' + str(run) + '.txt',
                  'w') as f:
            f.write("""Run: {}\nVal {:.4f}\nTest: {:.4f}\n\n\n""".format(
                run, best_val, final_test))
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
from ogb.graphproppred import PygGraphPropPredDataset
import os

root_folder = '/vol/deform/gbouritsas/datasets/'
datasets = ['ogbg-molpcba', 'ogbg-molhiv', 'ogbg-ppa']

for name in datasets:
    dataset = PygGraphPropPredDataset(name=name,
                                      root=os.path.join(root_folder, 'ogb',
                                                        '{}'.format(name)))
    split_idx = dataset.get_idx_split()
    for split_name in ['train', 'valid', 'test']:
        idxs = split_idx[split_name]
        split_name = split_name if split_name != 'valid' else 'val'
        save_folder = os.path.join(root_folder, 'ogb', '{}'.format(name),
                                   '10fold_idx')
        if not os.path.exists(save_folder):
            os.makedirs(save_folder)
        with open(os.path.join(save_folder,
                               '{}_idx-0.txt'.format(split_name)), 'w') as handle:
            for idx in idxs:
                handle.write('{}\n'.format(idx))
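
# A hedged sketch for reading the exported index files back; it mirrors the
# writer above and assumes the same path layout.
def load_split_idx(name, split_name, fold=0):
    path = os.path.join(root_folder, 'ogb', name, '10fold_idx',
                        '{}_idx-{}.txt'.format(split_name, fold))
    with open(path) as handle:
        return [int(line) for line in handle]
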
def main():
    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)

            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) \
                    or epoch == args.epochs:  # 4min
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)

                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main():
    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    sub_dir = 'BS_{}-NF_{}'.format(args.batch_size, args.feature)

    dataset = PygGraphPropPredDataset(name=args.dataset)
    args.num_tasks = dataset.num_tasks
    logging.info('%s' % args)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    evaluator = Evaluator(args.dataset)
    split_idx = dataset.get_idx_split()

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    model = DeeperGCN(args).to(device)
    logging.info(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)

    results = {'highest_valid': 0,
               'final_train': 0,
               'final_test': 0,
               'highest_train': 0}

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):
        logging.info("=====Epoch {}".format(epoch))
        logging.info('Training...')

        # epoch_loss = train(model, device, train_loader, optimizer, dataset.task_type)
        epoch_loss = train_flag(model, device, train_loader, optimizer,
                                dataset.task_type, args)

        logging.info('Evaluating...')
        train_result = eval(model, device, train_loader, evaluator)[dataset.eval_metric]
        valid_result = eval(model, device, valid_loader, evaluator)[dataset.eval_metric]
        test_result = eval(model, device, test_loader, evaluator)[dataset.eval_metric]

        logging.info({'Train': train_result,
                      'Validation': valid_result,
                      'Test': test_result})

        model.print_params(epoch=epoch)

        if train_result > results['highest_train']:
            results['highest_train'] = train_result

        if valid_result > results['highest_valid']:
            results['highest_valid'] = valid_result
            results['final_train'] = train_result
            results['final_test'] = test_result

            # save_ckpt(model, optimizer,
            #           round(epoch_loss, 4), epoch,
            #           args.model_save_path,
            #           sub_dir, name_post='valid_best')

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))
def main():
    args = get_args()
    config = process_config(args)
    print(config)

    if config.get('seed') is not None:
        torch.manual_seed(config.seed)
        np.random.seed(config.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=config.dataset_name)

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target sequence less or equal to {} is {}%.'.format(
        config.max_seq_len,
        np.sum(seq_len_list <= config.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    # print(split_idx['train'])
    # print(split_idx['valid'])
    # print(split_idx['test'])

    # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']]
    # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']]
    # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']]
    # print('#train')
    # print(len(train_method_name))
    # print('#valid')
    # print(len(valid_method_name))
    # print('#test')
    # print(len(test_method_name))

    # train_method_name_set = set(train_method_name)
    # valid_method_name_set = set(valid_method_name)
    # test_method_name_set = set(test_method_name)

    # # unique method name
    # print('#unique train')
    # print(len(train_method_name_set))
    # print('#unique valid')
    # print(len(valid_method_name_set))
    # print('#unique test')
    # print(len(test_method_name_set))

    # # unique valid/test method name
    # print('#valid unseen during training')
    # print(len(valid_method_name_set - train_method_name_set))
    # print('#test unseen during training')
    # print(len(test_method_name_set - train_method_name_set))

    ### building vocabulary for sequence prediction. Only use training data.
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx['train']], config.num_vocab)

    # test encoder and decoder
    # for data in dataset:
    #     # PyG >= 1.5.0
    #     print(data.y)
    #
    #     # PyG 1.4.3
    #     # print(data.y[0])
    #     data = encode_y_to_arr(data, vocab2idx, config.max_seq_len)
    #     print(data.y_arr[0])
    #     decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab)
    #     print(decoded_seq)
    #     print('')

    ## test augment_edge
    # data = dataset[2]
    # print(data)
    # data_augmented = augment_edge(data)
    # print(data_augmented)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence.
    dataset.transform = transforms.Compose(
        [augment_edge,
         lambda data: encode_y_to_arr(data, vocab2idx, config.max_seq_len)])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(config.dataset_name)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=True, num_workers=config.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=False, num_workers=config.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=config.hyperparams.batch_size,
                             shuffle=False, num_workers=config.num_workers)

    nodetypes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used.
    # 1. node type
    # 2. node attribute
    # 3. node depth
    node_encoder = ASTNodeEncoder(config.architecture.hidden,
                                  num_nodetypes=len(nodetypes_mapping['type']),
                                  num_nodeattributes=len(nodeattributes_mapping['attr']),
                                  max_depth=20)

    model = Net(config.architecture,
                num_vocab=len(vocab2idx),
                max_seq_len=config.max_seq_len,
                node_encoder=node_encoder).to(device)

    # optimizer = optim.Adam(model.parameters(), lr=0.001)
    optimizer = optim.Adam(model.parameters(), lr=config.hyperparams.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=config.hyperparams.step_size,
                                                gamma=config.hyperparams.decay_rate)

    valid_curve = []
    test_curve = []
    train_curve = []
    trainL_curve = []

    writer = SummaryWriter(config.directory)

    ts_fk_algo_hp = str(config.time_stamp) + '_' \
                    + str(config.commit_id[0:7]) + '_' \
                    + str(config.architecture.nonlinear_conv) + '_' \
                    + str(config.architecture.variants.fea_activation) + '_' \
                    + str(config.architecture.pooling) + '_' \
                    + str(config.architecture.JK) + '_' \
                    + str(config.architecture.layers) + '_' \
                    + str(config.architecture.hidden) + '_' \
                    + str(config.architecture.variants.BN) + '_' \
                    + str(config.architecture.dropout) + '_' \
                    + str(config.hyperparams.learning_rate) + '_' \
                    + str(config.hyperparams.step_size) + '_' \
                    + str(config.hyperparams.decay_rate) + '_' \
                    + 'B' + str(config.hyperparams.batch_size) + '_' \
                    + 'S' + str(config.seed)

    for epoch in range(1, config.hyperparams.epochs + 1):
        print("Epoch {} training...".format(epoch))
        train_loss = train(model, device, train_loader, optimizer)
        scheduler.step()

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator,
                          arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        valid_perf = eval(model, device, valid_loader, evaluator,
                          arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        test_perf = eval(model, device, test_loader, evaluator,
                         arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))

        # print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
        print('Train:', train_perf[dataset.eval_metric],
              'Validation:', valid_perf[dataset.eval_metric],
              'Test:', test_perf[dataset.eval_metric],
              'Train loss:', train_loss)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])
        trainL_curve.append(train_loss)

        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/traP': train_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/valP': valid_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/tstP': test_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/traL': train_loss},
                           epoch)
    writer.close()

    print('F1')
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    print('Finished test: {}, Validation: {}, Train: {}, epoch: {}, best train: {}, best loss: {}'
          .format(test_curve[best_val_epoch], valid_curve[best_val_epoch],
                  train_curve[best_val_epoch], best_val_epoch, best_train,
                  min(trainL_curve)))
def main():
    seed = args.seed
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    model_save_dir = f'models/{args.name}'
    os.makedirs(model_save_dir, exist_ok=True)

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    print("Training")
    # writer = SummaryWriter(model_save_dir)
    with open(f'{model_save_dir}/arguments.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    vals, tests = [], []
    for run in range(args.runs):
        best_val, final_test = 0, 0

        if args.gnn == 'gin':
            model = GNN(gnn_type='gin', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False,
                        topological=args.topological).to(device)
        elif args.gnn == 'gin-virtual':
            model = GNN(gnn_type='gin', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True,
                        topological=args.topological).to(device)
        elif args.gnn == 'gcn':
            model = GNN(gnn_type='gcn', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=False,
                        topological=args.topological).to(device)
        elif args.gnn == 'gcn-virtual':
            model = GNN(gnn_type='gcn', num_class=dataset.num_classes,
                        num_layer=args.num_layer, emb_dim=args.emb_dim,
                        drop_ratio=args.drop_ratio, virtual_node=True,
                        topological=args.topological).to(device)
        elif args.gnn == 'controller':
            model = ControllerTransformer().to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, args.epochs + 1):
            loss = train(model, device, train_loader, optimizer, args)

            if (epoch > args.epochs // 2 and epoch % args.test_freq == 0) \
                    or epoch == args.epochs:  # 4min
                train_perf = eval(model, device, train_loader, evaluator)
                valid_perf = eval(model, device, valid_loader, evaluator)
                test_perf = eval(model, device, test_loader, evaluator)

                result = (train_perf[dataset.eval_metric],
                          valid_perf[dataset.eval_metric],
                          test_perf[dataset.eval_metric])
                _, val, tst = result
                if val > best_val:
                    torch.save(model.state_dict(),
                               os.path.join(model_save_dir, 'model-best.pth'))
                    best_val = val
                    final_test = tst

        print(f'Run{run} val:{best_val}, test:{final_test}')
        vals.append(best_val)
        tests.append(final_test)

    print('')
    print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}")
    print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main(): device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu") ### automatic dataloading and splitting dataset = PygGraphPropPredDataset(name = args.dataset, root='/cmlscratch/kong/datasets/ogb') seq_len_list = np.array([len(seq) for seq in dataset.data.y]) print('Target seqence less or equal to {} is {}%.'.format(args.max_seq_len, np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list))) split_idx = dataset.get_idx_split() # print(split_idx['train']) # print(split_idx['valid']) # print(split_idx['test']) # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']] # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']] # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']] # print('#train') # print(len(train_method_name)) # print('#valid') # print(len(valid_method_name)) # print('#test') # print(len(test_method_name)) # train_method_name_set = set(train_method_name) # valid_method_name_set = set(valid_method_name) # test_method_name_set = set(test_method_name) # # unique method name # print('#unique train') # print(len(train_method_name_set)) # print('#unique valid') # print(len(valid_method_name_set)) # print('#unique test') # print(len(test_method_name_set)) # # unique valid/test method name # print('#valid unseen during training') # print(len(valid_method_name_set - train_method_name_set)) # print('#test unseen during training') # print(len(test_method_name_set - train_method_name_set)) ### building vocabulary for sequence predition. Only use training data. vocab2idx, idx2vocab = get_vocab_mapping([dataset.data.y[i] for i in split_idx['train']], args.num_vocab) # test encoder and decoder # for data in dataset: # # PyG >= 1.5.0 # print(data.y) # # # PyG 1.4.3 # # print(data.y[0]) # data = encode_y_to_arr(data, vocab2idx, args.max_seq_len) # print(data.y_arr[0]) # decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab) # print(decoded_seq) # print('') ## test augment_edge # data = dataset[2] # print(data) # data_augmented = augment_edge(data) # print(data_augmented) ### set the transform function # augment_edge: add next-token edge as well as inverse edges. add edge attributes. # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence. dataset.transform = transforms.Compose([augment_edge, lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)]) ### automatic evaluator. takes dataset name as input evaluator = Evaluator(args.dataset) train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size, shuffle=True, num_workers = args.num_workers) valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers) test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size, shuffle=False, num_workers = args.num_workers) nodetypes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz')) nodeattributes_mapping = pd.read_csv(os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz')) ### Encoding node features into emb_dim vectors. ### The following three node features are used. # 1. node type # 2. node attribute # 3. 
node depth node_encoder = ASTNodeEncoder(args.emb_dim, num_nodetypes = len(nodetypes_mapping['type']), num_nodeattributes = len(nodeattributes_mapping['attr']), max_depth = 20) vals, tests = [], [] for run in range(args.runs): best_val, final_test = 0, 0 if args.gnn == 'gin': model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder, num_layer=args.num_layer, gnn_type='gin', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False).to(device) elif args.gnn == 'gin-virtual': model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder, num_layer=args.num_layer, gnn_type='gin', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device) elif args.gnn == 'gcn': model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder, num_layer=args.num_layer, gnn_type='gcn', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=False).to(device) elif args.gnn == 'gcn-virtual': model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len, node_encoder=node_encoder, num_layer=args.num_layer, gnn_type='gcn', emb_dim=args.emb_dim, drop_ratio=args.drop_ratio, virtual_node=True).to(device) else: raise ValueError('Invalid GNN type') optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) for epoch in range(1, args.epochs+1): loss = train(model, device, train_loader, optimizer, args) if epoch > args.epochs // 2 and epoch % args.test_freq == 0 or epoch == args.epochs: #4min train_perf = eval(model, device, train_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab)) valid_perf = eval(model, device, valid_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab)) test_perf = eval(model, device, test_loader, evaluator, arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab)) result = (train_perf[dataset.eval_metric], valid_perf[dataset.eval_metric], test_perf[dataset.eval_metric]) _, val, tst = result if val > best_val: best_val = val final_test = tst print(f'Run{run} val:{best_val}, test:{final_test}') vals.append(best_val) tests.append(final_test) print('') print(f"Average val accuracy: {np.mean(vals)} ± {np.std(vals)}") print(f"Average test accuracy: {np.mean(tests)} ± {np.std(tests)}")
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbgmol* data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--pooling', type=str, default='mean',
                        help='Pooling technique for graph embedding')
    parser.add_argument('--laf', type=str, default='mean',
                        help='Init function if laf pooling is specified')
    parser.add_argument('--laf_layers', type=str, default='false',
                        help='If set to true, internal layers will be initialized with laf function')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--feature', type=str, default="full",
                        help='full feature or simple feature')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    parser.add_argument('--seed', type=int, default=92,
                        help='torch seed')
    parser.add_argument('--alternate', type=str, default='false',
                        help='use alternate learning with laf')
    args = parser.parse_args()
    print(args)

    torch.manual_seed(args.seed)

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    laf_layers=args.laf_layers, device=device, lafgrad=True).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    laf_layers=args.laf_layers, device=device, lafgrad=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    laf_layers=args.laf_layers, device=device, lafgrad=True).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    laf_layers=args.laf_layers, device=device, lafgrad=True).to(device)
    elif args.gnn == 'gat':
        model = GNN(gnn_type='gat', num_tasks=dataset.num_tasks, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    laf_layers=args.laf_layers, device=device, lafgrad=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    # model.load_state_dict(torch.load("{}_fixed_training.mdl".format(args.filename)))

    model_params = []
    laf_params = []
    for n, p in model.named_parameters():
        if n in ('pool.weights', 'pool.alpha', 'pool.beta', 'pool.N', 'pool.M'):
            laf_params.append(p)
        else:
            model_params.append(p)

    optimizer = optim.Adam(model_params, lr=0.001)
    if laf_params == []:
        optimizerlaf = None
    else:
        optimizerlaf = optim.Adam(laf_params, lr=0.0001)

    flog = open(args.filename + ".log", 'a')

    valid_curve = []
    test_curve = []
    train_curve = []

    if 'classification' in dataset.task_type:
        best_val = 0
    else:
        best_val = 1e12

    flog.write("{}\n".format(args))
    bflag = True

    for epoch in range(1, args.epochs + 1):
        start = time.time()
        print("=====Epoch {}".format(epoch))
        flog.write("=====Epoch {}\n".format(epoch))
        print('Training...')
        # if args.alternate == 'false':
        #     train_perf = train(model, device, train_loader, optimizer, optimizerlaf, dataset.task_type, evaluator)
        # else:
        #     train_perf = train(model, device, train_loader, optimizer, None, dataset.task_type, evaluator)
        # if args.alternate == 'false':
        train_perf = train(model, device, train_loader, optimizer, optimizerlaf,
                           dataset.task_type, evaluator, alternate=args.alternate)

        print('Evaluating...')
        # train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
        print("Time {:.4f}s".format(time.time() - start))
        if laf_params != []:
            print("{}\n".format(torch.norm(model.pool.weights)))

        flog.write("{}\n".format({'Train': train_perf,
                                  'Validation': valid_perf,
                                  'Test': test_perf}))
        flog.write("Time: {}\n".format(time.time() - start))
        if laf_params != []:
            flog.write("Laf weights norm: {}\n".format(
                torch.norm(model.pool.weights, dim=0)))
        flog.flush()

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

        if 'classification' in dataset.task_type:
            if valid_perf[dataset.eval_metric] >= best_val:
                best_val = valid_perf[dataset.eval_metric]
                if not args.filename == '':
                    if args.alternate == 'true':
                        torch.save(model.state_dict(),
                                   '{}_fixed_training.mdl'.format(args.filename))
                    else:
                        torch.save(model.state_dict(),
                                   '{}.mdl'.format(args.filename))
        else:
            if valid_perf[dataset.eval_metric] <= best_val:
                best_val = valid_perf[dataset.eval_metric]
                if not args.filename == '':
                    if args.alternate == 'true':
                        torch.save(model.state_dict(),
                                   '{}_fixed_training.mdl'.format(args.filename))
                    else:
                        torch.save(model.state_dict(),
                                   '{}.mdl'.format(args.filename))

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    flog.write('Finished training!\n')
    flog.write('Best validation score: {}\n'.format(valid_curve[best_val_epoch]))
    flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
    flog.flush()

    if not args.filename == '':
        torch.save({'Val': valid_curve[best_val_epoch],
                    'Test': test_curve[best_val_epoch],
                    'Train': train_curve[best_val_epoch],
                    'BestTrain': best_train},
                   args.filename + "_fixed_training.res")

    # if args.alternate == 'true' and optimizerlaf:
    #     args.alternate = 'false'
    #     flog.write("===================LAF TRAINING=================\n")
    #     valid_curve = []
    #     test_curve = []
    #     train_curve = []
    #     if 'classification' in dataset.task_type:
    #         best_val = 0
    #     else:
    #         best_val = 1e12
    #     for epoch in range(1, args.epochs + 1):
    #         start = time.time()
    #         print("=====Epoch {}".format(epoch))
    #         flog.write("=====Epoch {}\n".format(epoch))
    #         print('Training...')
    #         train_perf = train(model, device, train_loader, optimizerlaf, None, dataset.task_type, evaluator)
    #         print('Evaluating...')
    #         # train_perf = eval(model, device, train_loader, evaluator)
    #         valid_perf = eval(model, device, valid_loader, evaluator)
    #         test_perf = eval(model, device, test_loader, evaluator)
    #         print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
    #         print("Time {:.4f}s".format(time.time() - start))
    #         # print("{}\n".format(torch.norm(model.pool.weights)))
    #         flog.write("{}\n".format({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf}))
    #         flog.write("Time: {}\n".format(time.time() - start))
    #         # flog.write("Laf weights norm: {}\n".format(torch.norm(model.pool.weights, dim=0)))
    #         flog.flush()
    #         train_curve.append(train_perf[dataset.eval_metric])
    #         valid_curve.append(valid_perf[dataset.eval_metric])
    #         test_curve.append(test_perf[dataset.eval_metric])
    #         if 'classification' in dataset.task_type:
    #             if valid_perf[dataset.eval_metric] >= best_val:
    #                 best_val = valid_perf[dataset.eval_metric]
    #                 if not args.filename == '':
    #                     torch.save(model.state_dict(), '{}_laf_training.mdl'.format(args.filename))
    #         else:
    #             if valid_perf[dataset.eval_metric] <= best_val:
    #                 best_val = valid_perf[dataset.eval_metric]
    #                 if not args.filename == '':
    #                     torch.save(model.state_dict(), '{}_laf_training.mdl'.format(args.filename))
    #     if 'classification' in dataset.task_type:
    #         best_val_epoch = np.argmax(np.array(valid_curve))
    #         best_train = max(train_curve)
    #     else:
    #         best_val_epoch = np.argmin(np.array(valid_curve))
    #         best_train = min(train_curve)
    #     print('Finished training!')
    #     print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    #     print('Test score: {}'.format(test_curve[best_val_epoch]))
    #     flog.write('Finished training!\n')
    #     flog.write('Best validation score: {}\n'.format(valid_curve[best_val_epoch]))
    #     flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
    #     flog.flush()
    #     if not args.filename == '':
    #         torch.save({'Val': valid_curve[best_val_epoch], 'Test': test_curve[best_val_epoch],
    #                     'Train': train_curve[best_val_epoch], 'BestTrain': best_train},
    #                    args.filename + "_laf_training.res")

    flog.close()
def main():
    args = get_args()
    config = process_config(args)
    print(config)

    if config.get('seed') is not None:
        random.seed(config.seed)
        torch.manual_seed(config.seed)
        np.random.seed(config.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(config.seed)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ### automatic dataloading and splitting
    sys.stdin = In()
    dataset = PygGraphPropPredDataset(name=config.dataset_name)

    if config.feature == 'full':
        pass
    elif config.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(config.dataset_name)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=True, num_workers=config.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=config.hyperparams.batch_size,
                              shuffle=False, num_workers=config.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=config.hyperparams.batch_size,
                             shuffle=False, num_workers=config.num_workers)

    model = Net(config.architecture, num_tasks=dataset.num_tasks).to(device)

    optimizer = optim.Adam(model.parameters(), lr=config.hyperparams.learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer,
        step_size=config.hyperparams.step_size,
        gamma=config.hyperparams.decay_rate)

    valid_curve = []
    test_curve = []
    train_curve = []
    trainL_curve = []

    writer = SummaryWriter(config.directory)

    ts_fk_algo_hp = str(config.time_stamp) + '_' \
                    + str(config.commit_id[0:7]) + '_' \
                    + str(config.architecture.methods) + '_' \
                    + str(config.architecture.pooling) + '_' \
                    + str(config.architecture.JK) + '_' \
                    + str(config.architecture.layers) + '_' \
                    + str(config.architecture.hidden) + '_' \
                    + str(config.architecture.variants.BN) + '_' \
                    + str(config.architecture.dropout) + '_' \
                    + str(config.hyperparams.learning_rate) + '_' \
                    + str(config.hyperparams.step_size) + '_' \
                    + str(config.hyperparams.decay_rate) + '_' \
                    + 'B' + str(config.hyperparams.batch_size) + '_' \
                    + 'S' + str(config.seed if config.get('seed') is not None else "na") + '_' \
                    + 'W' + str(config.num_workers if config.get('num_workers') is not None else "na")

    for epoch in range(1, config.hyperparams.epochs + 1):
        print("Epoch {} training...".format(epoch))
        train_loss = train(model, device, train_loader, optimizer, dataset.task_type)
        scheduler.step()

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print('Train:', train_perf[dataset.eval_metric],
              'Validation:', valid_perf[dataset.eval_metric],
              'Test:', test_perf[dataset.eval_metric],
              'Train loss:', train_loss)

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])
        trainL_curve.append(train_loss)

        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/traP': train_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/valP': valid_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/tstP': test_perf[dataset.eval_metric]},
                           epoch)
        writer.add_scalars(config.dataset_name,
                           {ts_fk_algo_hp + '/traL': train_loss},
                           epoch)
    writer.close()

    if 'classification' in dataset.task_type:
        best_val_epoch = np.argmax(np.array(valid_curve))
        best_train = max(train_curve)
    else:
        best_val_epoch = np.argmin(np.array(valid_curve))
        best_train = min(train_curve)

    print('Finished test: {}, Validation: {}, epoch: {}, best train: {}, best loss: {}'
          .format(test_curve[best_val_epoch], valid_curve[best_val_epoch],
                  best_val_epoch, best_train, min(trainL_curve)))
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-code2 data with Pytorch Geometrics')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gcn-virtual',
                        help='GNN gin, gin-virtual, or gcn, or gcn-virtual (default: gcn-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0,
                        help='dropout ratio (default: 0)')
    parser.add_argument('--max_seq_len', type=int, default=5,
                        help='maximum sequence length to predict (default: 5)')
    parser.add_argument('--num_vocab', type=int, default=5000,
                        help='the number of vocabulary used for sequence prediction (default: 5000)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='input batch size for training (default: 128)')
    parser.add_argument('--epochs', type=int, default=25,
                        help='number of epochs to train (default: 25)')
    parser.add_argument('--random_split', action='store_true')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-code2",
                        help='dataset name (default: ogbg-code2)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()
    print(args)

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    seq_len_list = np.array([len(seq) for seq in dataset.data.y])
    print('Target sequence less or equal to {} is {}%.'.format(
        args.max_seq_len,
        np.sum(seq_len_list <= args.max_seq_len) / len(seq_len_list)))

    split_idx = dataset.get_idx_split()

    if args.random_split:
        print('Using random split')
        perm = torch.randperm(len(dataset))
        num_train, num_valid, num_test = (len(split_idx['train']),
                                          len(split_idx['valid']),
                                          len(split_idx['test']))
        split_idx['train'] = perm[:num_train]
        split_idx['valid'] = perm[num_train:num_train + num_valid]
        split_idx['test'] = perm[num_train + num_valid:]

        assert len(split_idx['train']) == num_train
        assert len(split_idx['valid']) == num_valid
        assert len(split_idx['test']) == num_test

    # print(split_idx['train'])
    # print(split_idx['valid'])
    # print(split_idx['test'])

    # train_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['train']]
    # valid_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['valid']]
    # test_method_name = [' '.join(dataset.data.y[i]) for i in split_idx['test']]
    # print('#train')
    # print(len(train_method_name))
    # print('#valid')
    # print(len(valid_method_name))
    # print('#test')
    # print(len(test_method_name))

    # train_method_name_set = set(train_method_name)
    # valid_method_name_set = set(valid_method_name)
    # test_method_name_set = set(test_method_name)

    # # unique method name
    # print('#unique train')
    # print(len(train_method_name_set))
    # print('#unique valid')
    # print(len(valid_method_name_set))
    # print('#unique test')
    # print(len(test_method_name_set))

    # # unique valid/test method name
    # print('#valid unseen during training')
    # print(len(valid_method_name_set - train_method_name_set))
    # print('#test unseen during training')
    # print(len(test_method_name_set - train_method_name_set))

    ### building vocabulary for sequence prediction. Only use training data.
    vocab2idx, idx2vocab = get_vocab_mapping(
        [dataset.data.y[i] for i in split_idx['train']], args.num_vocab)

    # test encoder and decoder
    # for data in dataset:
    #     # PyG >= 1.5.0
    #     print(data.y)
    #
    #     # PyG 1.4.3
    #     # print(data.y[0])
    #     data = encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    #     print(data.y_arr[0])
    #     decoded_seq = decode_arr_to_seq(data.y_arr[0], idx2vocab)
    #     print(decoded_seq)
    #     print('')

    ## test augment_edge
    # data = dataset[2]
    # print(data)
    # data_augmented = augment_edge(data)
    # print(data_augmented)

    ### set the transform function
    # augment_edge: add next-token edge as well as inverse edges. add edge attributes.
    # encode_y_to_arr: add y_arr to PyG data object, indicating the array representation of a sequence.
    dataset.transform = transforms.Compose([
        augment_edge,
        lambda data: encode_y_to_arr(data, vocab2idx, args.max_seq_len)
    ])

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]], batch_size=args.batch_size,
                              shuffle=True, num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]], batch_size=args.batch_size,
                              shuffle=False, num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]], batch_size=args.batch_size,
                             shuffle=False, num_workers=args.num_workers)

    nodetypes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'typeidx2type.csv.gz'))
    nodeattributes_mapping = pd.read_csv(
        os.path.join(dataset.root, 'mapping', 'attridx2attr.csv.gz'))
    print(nodeattributes_mapping)

    ### Encoding node features into emb_dim vectors.
    ### The following three node features are used.
    # 1. node type
    # 2. node attribute
    # 3. node depth
    node_encoder = ASTNodeEncoder(args.emb_dim,
                                  num_nodetypes=len(nodetypes_mapping['type']),
                                  num_nodeattributes=len(nodeattributes_mapping['attr']),
                                  max_depth=20)

    if args.gnn == 'gin':
        model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder, num_layer=args.num_layer,
                    gnn_type='gin', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder, num_layer=args.num_layer,
                    gnn_type='gin', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder, num_layer=args.num_layer,
                    gnn_type='gcn', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(num_vocab=len(vocab2idx), max_seq_len=args.max_seq_len,
                    node_encoder=node_encoder, num_layer=args.num_layer,
                    gnn_type='gcn', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print(f'#Params: {sum(p.numel() for p in model.parameters())}')

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator,
                          arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        valid_perf = eval(model, device, valid_loader, evaluator,
                          arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))
        test_perf = eval(model, device, test_loader, evaluator,
                         arr_to_seq=lambda arr: decode_arr_to_seq(arr, idx2vocab))

        print({'Train': train_perf,
               'Validation': valid_perf,
               'Test': test_perf})

        train_curve.append(train_perf[dataset.eval_metric])
        valid_curve.append(valid_perf[dataset.eval_metric])
        test_curve.append(test_perf[dataset.eval_metric])

    print('F1')
    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        result_dict = {'Val': valid_curve[best_val_epoch],
                       'Test': test_curve[best_val_epoch],
                       'Train': train_curve[best_val_epoch],
                       'BestTrain': best_train}
        torch.save(result_dict, args.filename)
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-mol* data with PyTorch Geometric')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN architecture: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-molhiv",
                        help='dataset name (default: ogbg-molhiv)')
    parser.add_argument('--feature', type=str, default="full",
                        help='full feature or simple feature')
    parser.add_argument("--verbose", "-v", action="store_true")
    parser.add_argument('--mu', type=float, default=0.5, help='hyperparameter')
    parser.add_argument('--num_seeds', type=int, default=10,
                        help='number of seeds (default: 10)')
    args = parser.parse_args()

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")
    mu = args.mu

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset)

    if args.feature == 'full':
        pass
    elif args.feature == 'simple':
        print('using simple feature')
        # only retain the top two node/edge features
        dataset.data.x = dataset.data.x[:, :2]
        dataset.data.edge_attr = dataset.data.edge_attr[:, :2]

    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    num_seeds = args.num_seeds
    seeds = list(range(num_seeds))

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    # rows: best epoch, train score, valid score, test score; one column per seed
    result = np.zeros((4, num_seeds))
    for seed in seeds:
        torch.manual_seed(seed)
        if args.gnn == 'gin':
            model = PREGGNN(gnn_type='gin', num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                            virtual_node=False).to(device)
        elif args.gnn == 'gin-virtual':
            model = PREGGNN(gnn_type='gin', num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                            virtual_node=True).to(device)
        elif args.gnn == 'gcn':
            model = PREGGNN(gnn_type='gcn', num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                            virtual_node=False).to(device)
        elif args.gnn == 'gcn-virtual':
            model = PREGGNN(gnn_type='gcn', num_tasks=dataset.num_tasks,
                            emb_dim=args.emb_dim, drop_ratio=args.drop_ratio,
                            virtual_node=True).to(device)
        else:
            raise ValueError('Invalid GNN type')

        optimizer = optim.Adam(model.parameters(), lr=0.001)

        valid_curve = []
        test_curve = []
        train_curve = []

        for epoch in range(1, args.epochs + 1):
            print("=====Epoch {}".format(epoch))
            print('Training...')
            train(model, device, train_loader, optimizer, dataset.task_type, mu)

            print('Evaluating...')
            train_perf = eval(model, device, train_loader, evaluator)
            valid_perf = eval(model, device, valid_loader, evaluator)
            test_perf = eval(model, device, test_loader, evaluator)

            print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})

            train_curve.append(train_perf[dataset.eval_metric])
            valid_curve.append(valid_perf[dataset.eval_metric])
            test_curve.append(test_perf[dataset.eval_metric])

        if 'classification' in dataset.task_type:
            best_val_epoch = np.argmax(np.array(valid_curve))
            best_train = max(train_curve)
        else:
            best_val_epoch = np.argmin(np.array(valid_curve))
            best_train = min(train_curve)

        print('Finished training!')
        print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
        print('Test score: {}'.format(test_curve[best_val_epoch]))

        result[0][seed] = best_val_epoch
        result[1][seed] = train_curve[best_val_epoch]
        result[2][seed] = valid_curve[best_val_epoch]
        result[3][seed] = test_curve[best_val_epoch]

        if not args.verbose:
            if not os.path.exists("result"):
                os.makedirs("result")
            torch.save(
                {'Val': valid_curve[best_val_epoch],
                 'Test': test_curve[best_val_epoch],
                 'Train': train_curve[best_val_epoch],
                 'BestTrain': best_train,
                 'mu': mu,
                 'valid_curve': valid_curve,
                 'test_curve': test_curve,
                 'train_curve': train_curve,
                 'dataset': args.dataset,
                 'model': 'iad' + args.gnn,
                 'epochs': args.epochs},
                'result/' + args.dataset + "_preg" + args.gnn + "_" + str(mu) +
                "_" + str(args.epochs) + "_" + str(seed) + "_" +
                str(num_seeds) + ".pth")
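# The 4 x num_seeds `result` array above is filled but never summarized. A small
# follow-up sketch, assuming the same row layout (best epoch, train, valid,
# test), that reports mean and standard deviation across seeds:
import numpy as _np


def summarize_seeds(result):
    """Print per-row mean +/- std of a (4, num_seeds) result array."""
    for name, row in zip(['best_epoch', 'train', 'valid', 'test'], result):
        print(f'{name}: {_np.mean(row):.4f} +/- {_np.std(row):.4f}')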
def main():
    args = ArgsInit().save_exp()

    if args.use_gpu:
        device = torch.device("cuda:" + str(args.device)) \
            if torch.cuda.is_available() else torch.device("cpu")
    else:
        device = torch.device('cpu')

    sub_dir = 'BS_{}'.format(args.batch_size)

    if args.not_extract_node_feature:
        dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    else:
        if args.aggr == 'add':
            dataset = PygGraphPropPredDataset(name=args.dataset,
                                              transform=extract_node_feature_add)
        elif args.aggr == 'mean':
            dataset = PygGraphPropPredDataset(name=args.dataset,
                                              transform=extract_node_feature_mean)
        elif args.aggr == 'max':
            dataset = PygGraphPropPredDataset(name=args.dataset,
                                              transform=extract_node_feature_max)
        else:
            raise Exception('Unknown Aggregation Type')
        sub_dir = sub_dir + '-NF_{}'.format(args.aggr)

    args.num_tasks = dataset.num_classes
    evaluator = Evaluator(args.dataset)

    logging.info('%s' % args)

    split_idx = dataset.get_idx_split()
    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    model = DeeperGCN(args).to(device)
    logging.info(model)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    criterion = torch.nn.CrossEntropyLoss()

    results = {'highest_valid': 0,
               'final_train': 0,
               'final_test': 0,
               'highest_train': 0}

    start_time = time.time()
    evaluate = True

    for epoch in range(1, args.epochs + 1):
        logging.info("=====Epoch {}".format(epoch))
        logging.info('Training...')
        epoch_loss = train(model, device, train_loader, optimizer, criterion)

        # for very deep models, only evaluate every args.eval_steps epochs
        if args.num_layers > args.num_layers_threshold:
            if epoch % args.eval_steps != 0:
                evaluate = False
            else:
                evaluate = True

        model.print_params(epoch=epoch)

        if evaluate:
            logging.info('Evaluating...')
            train_accuracy = eval(model, device, train_loader, evaluator)
            valid_accuracy = eval(model, device, valid_loader, evaluator)
            test_accuracy = eval(model, device, test_loader, evaluator)

            logging.info({'Train': train_accuracy,
                          'Validation': valid_accuracy,
                          'Test': test_accuracy})

            if train_accuracy > results['highest_train']:
                results['highest_train'] = train_accuracy

            if valid_accuracy > results['highest_valid']:
                results['highest_valid'] = valid_accuracy
                results['final_train'] = train_accuracy
                results['final_test'] = test_accuracy
                save_ckpt(model, optimizer, round(epoch_loss, 4), epoch,
                          args.model_save_path, sub_dir, name_post='valid_best')

            logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    logging.info('Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time))))
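# The `add_zeros` / `extract_node_feature_*` transforms referenced above are
# imported from elsewhere and not shown here. A plausible sketch under the
# assumption of ogbg-ppa-style graphs (no input node features, float edge
# attributes); the function bodies are illustrative, not the originals:
import torch as _torch


def _add_zeros(data):
    # give every node a dummy integer feature so an embedding layer can consume it
    data.x = _torch.zeros(data.num_nodes, dtype=_torch.long)
    return data


def _extract_node_feature_add(data):
    # sum the attributes of incoming edges into a per-node feature vector
    x = _torch.zeros(data.num_nodes, data.edge_attr.size(1))
    x.index_add_(0, data.edge_index[1], data.edge_attr)
    data.x = x
    return data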
class OGBPCBADataset(WILDSDataset):
    """The OGB-molpcba dataset.

    This dataset is directly adopted from Open Graph Benchmark and was
    originally curated by MoleculeNet.

    Supported `split_scheme`:
        'official' or 'scaffold', which are equivalent.

    Input (x):
        Molecular graphs represented as PyTorch Geometric data objects.

    Label (y):
        y represents 128-class binary labels.

    Metadata:
        - scaffold
          Each molecule is annotated with the ID of the scaffold group it is
          assigned to.

    Website:
        https://ogb.stanford.edu/docs/graphprop/#ogbg-mol

    Original publications:
        @article{hu2020ogb,
            title={Open Graph Benchmark: Datasets for Machine Learning on Graphs},
            author={W. {Hu} and M. {Fey} and M. {Zitnik} and Y. {Dong} and
                    H. {Ren} and B. {Liu} and M. {Catasta} and J. {Leskovec}},
            journal={arXiv preprint arXiv:2005.00687},
            year={2020}
        }

        @article{wu2018moleculenet,
            title={MoleculeNet: a benchmark for molecular machine learning},
            author={Z. {Wu} and B. {Ramsundar} and E. V. {Feinberg} and
                    J. {Gomes} and C. {Geniesse} and A. S. {Pappu} and
                    K. {Leswing} and V. {Pande}},
            journal={Chemical science},
            volume={9},
            number={2},
            pages={513--530},
            year={2018},
            publisher={Royal Society of Chemistry}
        }

    License:
        This dataset is distributed under the MIT license.
        https://github.com/snap-stanford/ogb/blob/master/LICENSE
    """
    _dataset_name = 'ogbg-molpcba'
    _versions_dict = {'1.0': {'download_url': None, 'compressed_size': None}}

    def __init__(self, version=None, root_dir='data', download=False,
                 split_scheme='official'):
        self._version = version
        if version is not None:
            raise ValueError(
                'Versioning for OGB-MolPCBA is handled through the OGB package. '
                'Please set version=None.')
        # internally call the ogb package
        self.ogb_dataset = PygGraphPropPredDataset(name='ogbg-molpcba',
                                                   root=root_dir)

        # set variables
        self._data_dir = self.ogb_dataset.root
        if split_scheme == 'official':
            split_scheme = 'scaffold'
        self._split_scheme = split_scheme
        # although the task is binary classification, the prediction target
        # contains NaN values, so we need float
        self._y_type = 'float'
        self._y_size = self.ogb_dataset.num_tasks
        self._n_classes = self.ogb_dataset.__num_classes__

        self._split_array = torch.zeros(len(self.ogb_dataset)).long()
        split_idx = self.ogb_dataset.get_idx_split()
        self._split_array[split_idx['train']] = 0
        self._split_array[split_idx['valid']] = 1
        self._split_array[split_idx['test']] = 2

        self._y_array = self.ogb_dataset.data.y
        self._metadata_fields = ['scaffold']

        metadata_file_path = os.path.join(self.ogb_dataset.root, 'raw',
                                          'scaffold_group.npy')
        if not os.path.exists(metadata_file_path):
            download_url(
                'https://snap.stanford.edu/ogb/data/misc/ogbg_molpcba/scaffold_group.npy',
                os.path.join(self.ogb_dataset.root, 'raw'))
        self._metadata_array = torch.from_numpy(
            np.load(metadata_file_path)).reshape(-1, 1).long()

        if torch_geometric.__version__ >= '1.7.0':
            self._collate = PyGCollater(follow_batch=[], exclude_keys=[])
        else:
            self._collate = PyGCollater(follow_batch=[])

        self._metric = Evaluator('ogbg-molpcba')

        super().__init__(root_dir, download, split_scheme)

    def get_input(self, idx):
        return self.ogb_dataset[int(idx)]

    def eval(self, y_pred, y_true, metadata, prediction_fn=None):
        """Computes all evaluation metrics.

        Args:
            - y_pred (FloatTensor): Binary logits from a model
            - y_true (LongTensor): Ground-truth labels
            - metadata (Tensor): Metadata
            - prediction_fn (function): A function that turns y_pred into
              predicted labels. Only None is supported because OGB evaluators
              accept binary logits.
        Output:
            - results (dict): Dictionary of evaluation metrics
            - results_str (str): String summarizing the evaluation metrics
        """
        assert prediction_fn is None, \
            "OGBPCBADataset.eval() does not support prediction_fn. Only binary logits are accepted."
        input_dict = {"y_true": y_true, "y_pred": y_pred}
        results = self._metric.eval(input_dict)
        return results, f"Average precision: {results['ap']:.3f}\n"
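# Hypothetical usage of the wrapper above, following the generic WILDS dataset
# interface (`get_subset` and (x, y, metadata) items); adjust root_dir to your
# data location:
def _wilds_demo():
    dataset = OGBPCBADataset(root_dir='data')
    train_data = dataset.get_subset('train')
    print(len(train_data), 'training molecules')
    graph, y, metadata = train_data[0]  # WILDS subsets yield (x, y, metadata)
    print(graph, y.shape, metadata)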
def main():
    parser = argparse.ArgumentParser(description='OGBG-MolHiv')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--num_layers', type=int, default=5)
    parser.add_argument('--emb_dim', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--eval', action='store_true',
                        help='If not set, we will only do the training part.')
    parser.add_argument('--eval_batch_size', type=int, default=2048)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(f"Running on {device}")

    dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(name='ogbg-molhiv')

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    val_loader = DataLoader(dataset[split_idx["valid"]],
                            batch_size=args.eval_batch_size, shuffle=False,
                            num_workers=0)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.eval_batch_size, shuffle=False,
                             num_workers=0)

    model = GCN(args.emb_dim, num_classes=dataset.num_tasks,
                num_layers=args.num_layers, dropout=args.dropout).to(device)

    logger = Logger(args.runs, args)
    dur = []
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, args.epochs + 1):
            t0 = time.time()
            loss = train(model, device, train_loader, optimizer)
            # skip the first two epochs when averaging, to exclude warm-up cost
            if epoch >= 3:
                dur.append(time.time() - t0)
                print('Training time/epoch {}'.format(np.mean(dur)))

            if not args.eval:
                continue

            val_rocauc = test(model, device, val_loader,
                              evaluator)[dataset.eval_metric]
            test_rocauc = test(model, device, test_loader,
                               evaluator)[dataset.eval_metric]
            logger.add_result(run, (0.0, val_rocauc, test_rocauc))

            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Valid: {val_rocauc:.2f} '
                      f'Test: {test_rocauc:.2f}')

        if args.eval:
            logger.print_statistics(run)
    if args.eval:
        logger.print_statistics()
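# For reference, the binary-classification evaluator used above expects 2-D
# (num_graphs, num_tasks) tensors. A minimal sketch with made-up predictions:
def _rocauc_demo():
    evaluator = Evaluator(name='ogbg-molhiv')
    y_true = torch.tensor([[0], [1], [1], [0]])
    y_pred = torch.tensor([[0.1], [0.9], [0.4], [0.2]])
    # returns a dict keyed by the dataset's eval metric, e.g. {'rocauc': ...}
    print(evaluator.eval({'y_true': y_true, 'y_pred': y_pred}))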
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppa data with PyTorch Geometric')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN architecture: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--pooling', type=str, default='mean',
                        help='Pooling technique for graph embedding')
    parser.add_argument('--laf', type=str, default='mean',
                        help='Init function if laf pooling is specified')
    parser.add_argument('--laf_layers', type=str, default='false',
                        help='If set to true, internal layers will be initialized with the laf function')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-ppa",
                        help='dataset name (default: ogbg-ppa)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    parser.add_argument('--seed', type=int, default=92, help='torch seed')
    args = parser.parse_args()
    print(args)

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    split_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[split_idx["train"]],
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[split_idx["valid"]],
                              batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[split_idx["test"]],
                             batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    device=args.device).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True,
                    graph_pooling=args.pooling, laf_fun=args.laf,
                    device=args.device).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []
    best_val = 0

    flog = open(args.filename + ".log", 'w')
    flog.write("{}\n".format(args))

    for epoch in range(1, args.epochs + 1):
        start = time.time()
        print("=====Epoch {}".format(epoch))
        flog.write("=====Epoch {}\n".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})
        print("Time {:.4f}s".format(time.time() - start))
        flog.write("{}\tTime: {}s\n".format(
            {'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf},
            time.time() - start))
        flog.flush()

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

        # checkpoint whenever validation accuracy matches or beats the best so far
        if valid_perf[dataset.eval_metric] >= best_val:
            best_val = valid_perf[dataset.eval_metric]
            if not args.filename == '':
                torch.save(model.state_dict(), '{}.mdl'.format(args.filename))

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))
    flog.write('Finished training!\n')
    flog.write('Best validation score: {}\n'.format(valid_curve[best_val_epoch]))
    flog.write('Test score: {}\n'.format(test_curve[best_val_epoch]))
    flog.flush()
    flog.close()

    if not args.filename == '':
        torch.save({'Val': valid_curve[best_val_epoch],
                    'Test': test_curve[best_val_epoch],
                    'Train': train_curve[best_val_epoch],
                    'BestTrain': best_train},
                   args.filename + ".res")
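# The loop above saves '{filename}.mdl' whenever validation accuracy improves.
# Restoring that best checkpoint afterwards is a short sketch, using the same
# args.filename convention as above:
def load_best(model, filename):
    model.load_state_dict(torch.load('{}.mdl'.format(filename),
                                     map_location='cpu'))
    model.eval()
    return model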
def main():
    # Training settings
    parser = argparse.ArgumentParser(
        description='GNN baselines on ogbg-ppi data with PyTorch Geometric')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--gnn', type=str, default='gin-virtual',
                        help='GNN architecture: gin, gin-virtual, gcn, or gcn-virtual (default: gin-virtual)')
    parser.add_argument('--drop_ratio', type=float, default=0.5,
                        help='dropout ratio (default: 0.5)')
    parser.add_argument('--num_layer', type=int, default=5,
                        help='number of GNN message passing layers (default: 5)')
    parser.add_argument('--emb_dim', type=int, default=300,
                        help='dimensionality of hidden units in GNNs (default: 300)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--num_workers', type=int, default=0,
                        help='number of workers (default: 0)')
    parser.add_argument('--dataset', type=str, default="ogbg-ppi",
                        help='dataset name (default: ogbg-ppi)')
    parser.add_argument('--filename', type=str, default="",
                        help='filename to output result (default: )')
    args = parser.parse_args()

    device = torch.device("cuda:" + str(args.device)) \
        if torch.cuda.is_available() else torch.device("cpu")

    ### automatic dataloading and splitting
    dataset = PygGraphPropPredDataset(name=args.dataset, transform=add_zeros)
    splitted_idx = dataset.get_idx_split()

    ### automatic evaluator. takes dataset name as input
    evaluator = Evaluator(args.dataset)

    train_loader = DataLoader(dataset[splitted_idx["train"]],
                              batch_size=args.batch_size, shuffle=True,
                              num_workers=args.num_workers)
    valid_loader = DataLoader(dataset[splitted_idx["valid"]],
                              batch_size=args.batch_size, shuffle=False,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset[splitted_idx["test"]],
                             batch_size=args.batch_size, shuffle=False,
                             num_workers=args.num_workers)

    if args.gnn == 'gin':
        model = GNN(gnn_type='gin', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gin-virtual':
        model = GNN(gnn_type='gin', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    elif args.gnn == 'gcn':
        model = GNN(gnn_type='gcn', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=False).to(device)
    elif args.gnn == 'gcn-virtual':
        model = GNN(gnn_type='gcn', num_class=37, emb_dim=args.emb_dim,
                    drop_ratio=args.drop_ratio, virtual_node=True).to(device)
    else:
        raise ValueError('Invalid GNN type')

    optimizer = optim.Adam(model.parameters(), lr=0.001)

    valid_curve = []
    test_curve = []
    train_curve = []

    for epoch in range(1, args.epochs + 1):
        print("=====Epoch {}".format(epoch))
        print('Training...')
        train(model, device, train_loader, optimizer)

        print('Evaluating...')
        train_perf = eval(model, device, train_loader, evaluator)
        valid_perf = eval(model, device, valid_loader, evaluator)
        test_perf = eval(model, device, test_loader, evaluator)

        print({'Train': train_perf, 'Validation': valid_perf, 'Test': test_perf})

        train_curve.append(train_perf['acc'])
        valid_curve.append(valid_perf['acc'])
        test_curve.append(test_perf['acc'])

    best_val_epoch = np.argmax(np.array(valid_curve))
    best_train = max(train_curve)

    print('Finished training!')
    print('Best validation score: {}'.format(valid_curve[best_val_epoch]))
    print('Test score: {}'.format(test_curve[best_val_epoch]))

    if not args.filename == '':
        torch.save({'Val': valid_curve[best_val_epoch],
                    'Test': test_curve[best_val_epoch],
                    'Train': train_curve[best_val_epoch],
                    'BestTrain': best_train},
                   args.filename)
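# The result dicts saved by these scripts can be reloaded and compared later.
# A small sketch; the path depends on the --filename flag used above:
def print_result(path):
    res = torch.load(path, map_location='cpu')
    print('Val {Val:.4f} | Test {Test:.4f} | Train {Train:.4f}'.format(**res))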
    'Transferred from pretrained Mol-BBBP model (Damaged features)',
    'self-transfer':
    'Transferred from Mol-HIV source split',
    'self-transfer-damaged':
    'Transferred from Mol-HIV source split (Damaged features)'
}

BATCH_SIZE = 64

# ---------------------------------------------------
# Data
# ---------------------------------------------------

# Mol-BBBP
bbbp_dataset = PygGraphPropPredDataset(name='ogbg-molbbbp')
bbbp_split_idx = bbbp_dataset.get_idx_split()

train_loader = DataLoader(bbbp_dataset[bbbp_split_idx["train"]],
                          batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(bbbp_dataset[bbbp_split_idx["valid"]],
                          batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(bbbp_dataset[bbbp_split_idx["test"]],
                         batch_size=BATCH_SIZE, shuffle=False)

bbbp_evaluator = Evaluator('ogbg-molbbbp')

# Mol-HIV
dataset = PygGraphPropPredDataset(name='ogbg-molhiv')
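# A minimal sketch of the transfer setups named in the dict above: copy
# pretrained weights into a fresh model, skipping the task head so the target
# dataset learns its own. 'pretrained.pt' and the 'graph_pred_linear' prefix
# are illustrative names only, not guaranteed to match the actual checkpoints
# or model definitions used in these experiments.
def transfer_weights(model, ckpt_path='pretrained.pt'):
    state = torch.load(ckpt_path, map_location='cpu')
    # drop the task-specific output head; everything else is transferred
    state = {k: v for k, v in state.items()
             if not k.startswith('graph_pred_linear')}
    missing, unexpected = model.load_state_dict(state, strict=False)
    return model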