# Shared imports for the snippets below. Each snippet was collected from a
# different codebase and additionally assumes its own repo-local helpers
# (model classes, Logger, train/test, ...) as well as the relevant
# torch_geometric / dgl utilities it calls.
import argparse
import os
import os.path as osp
import time

import numpy as np
import torch
import torch.nn.functional as F
import torch_geometric.transforms as T
from ogb.nodeproppred import PygNodePropPredDataset, Evaluator


def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--kind', type=str, default="ReLU")
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout, kind=args.kind).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout, kind=args.kind).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
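# `train` and `test` above are repo-local helpers. A plausible minimal sketch
# for this full-batch setup, assuming the model returns log-probabilities and
# that the ogbn-arxiv Evaluator reports 'acc'; the exact loss handling belongs
# to the original code.
def train(model, data, train_idx, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.adj_t)[train_idx]
    loss = F.nll_loss(out, data.y.squeeze(1)[train_idx])
    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, data, split_idx, evaluator):
    model.eval()
    y_pred = model(data.x, data.adj_t).argmax(dim=-1, keepdim=True)
    return [evaluator.eval({'y_true': data.y[split_idx[split]],
                            'y_pred': y_pred[split_idx[split]]})['acc']
            for split in ('train', 'valid', 'test')]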
def load_data(dataset_name="Cora", seed=10, n_splits=5):
    # Path in which the data will be stored.
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)

    if dataset_name in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path, dataset_name, split='public',
                            transform=T.NormalizeFeatures())
    elif dataset_name in ["Arxiv", "Papers", "Products"]:
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path,
                                         transform=T.NormalizeFeatures())
    elif dataset_name == "MAG":
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path)
    else:
        raise Exception("[!] Dataset not found: " + str(dataset_name))

    if dataset_name in ogb_datasets:
        data = split_ogb_data(dataset, dataset_name)
    else:
        data = dataset[0]  # PyG graph object
        data = split_data(data, seed, n_splits)

    data.num_classes = dataset.num_classes
    return data
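# `ogb_data_name_conv`, `ogb_datasets`, `split_ogb_data` and `split_data` are
# module-level helpers in the original file and are not shown here. Plausible
# sketches for the first two, inferred from how they are used above (the split
# helpers depend on the repo's experiment protocol):
ogb_data_name_conv = {"Arxiv": "ogbn-arxiv", "Papers": "ogbn-papers100M",
                      "Products": "ogbn-products", "MAG": "ogbn-mag"}
ogb_datasets = {"Arxiv", "Papers", "Products", "MAG"}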
def eval_with_partition(args):
    model_load_path = args.model_load_path
    print("Starting evaluating model stored at", model_load_path)

    device = torch.device("cuda")
    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]
    adj = SparseTensor(row=graph.edge_index[0], col=graph.edge_index[1])

    if args.self_loop:
        adj = adj.set_diag()
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]

    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes
    # print('%s' % args)

    model = DeeperGCN(args).to(device)
    ckpt = torch.load(model_load_path)
    model.load_state_dict(ckpt['model_state_dict'])

    res = test_with_partition(model, graph, adj, split_idx,
                              num_clusters=args.eval_cluster_number,
                              partition_method=args.partition_method,
                              evaluator=evaluator,
                              device=device)
    print(res)
    return res
def __init__(self, path): dataset = "ogbn-proteins" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) PygNodePropPredDataset(name=dataset, root=path) super(OGBNproteinsDataset, self).__init__(dataset, path) dataset_t = PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) # Move edge features to node features. self.data.x = dataset_t[0].adj_t.mean(dim=1) # dataset_t[0].adj_t.set_value_(None) del dataset_t setattr(OGBNproteinsDataset, "metric", "ROC-AUC") setattr(OGBNproteinsDataset, "loss", "binary_cross_entropy_with_logits") split_idx = self.get_idx_split() datalist = [] for d in self: setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) datalist.append(d) self.data, self.slices = self.collate(datalist)
def get_product_clusters():
    dataset_name = "ogbn-products"
    dataset = PygNodePropPredDataset(name=dataset_name)
    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))

    data = dataset[0]
    print(data)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train']
    val_idx = split_idx['valid']
    test_idx = split_idx['test']

    train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    train_mask[train_idx] = True
    data['train_mask'] = train_mask

    val_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    val_mask[val_idx] = True
    data['valid_mask'] = val_mask

    test_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
    test_mask[test_idx] = True
    data['test_mask'] = test_mask

    cluster_data = ClusterData(data, num_parts=15000, save_dir="dataset")
    return cluster_data, dataset, data, split_idx
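# One way to consume the partitions returned above: PyG's ClusterLoader
# batches whole clusters into subgraphs (the import lives in
# torch_geometric.loader on recent PyG, torch_geometric.data on older
# releases). Batch size and worker count below are illustrative.
from torch_geometric.loader import ClusterLoader

cluster_data, dataset, data, split_idx = get_product_clusters()
loader = ClusterLoader(cluster_data, batch_size=32, shuffle=True,
                       num_workers=4)
for batch in loader:
    print(batch)  # a subgraph induced by a group of clusters
    break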
def main():
    parser = argparse.ArgumentParser(description='OGBN-Proteins (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Average incoming edge features into node features.
    x = scatter(data.edge_attr, data.edge_index[0], dim=0,
                dim_size=data.num_nodes, reduce='mean').to('cpu')

    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, 112, args.num_layers,
                args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)

            if epoch % args.eval_steps == 0:
                result = test(model, x, y_true, split_idx, evaluator)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}%, '
                          f'Test: {100 * test_rocauc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
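# The MLP model used by these scripts is defined alongside them; a minimal
# sketch that returns raw logits (the accuracy scripts would then train with
# cross-entropy, and ogbn-proteins with BCE-with-logits). The OGB reference
# examples also add batch normalization, omitted here for brevity.
class MLP(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout):
        super().__init__()
        self.lins = torch.nn.ModuleList()
        self.lins.append(torch.nn.Linear(in_channels, hidden_channels))
        for _ in range(num_layers - 2):
            self.lins.append(torch.nn.Linear(hidden_channels, hidden_channels))
        self.lins.append(torch.nn.Linear(hidden_channels, out_channels))
        self.dropout = dropout

    def reset_parameters(self):
        for lin in self.lins:
            lin.reset_parameters()

    def forward(self, x):
        for lin in self.lins[:-1]:
            x = F.relu(lin(x))
            x = F.dropout(x, p=self.dropout, training=self.training)
        return self.lins[-1](x)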
def main():
    args = ArgsInit().args

    dataset = PygNodePropPredDataset(name=args.dataset)
    graph = dataset[0]

    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]

    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes
    print(args)

    model = DeeperGCN(args)
    print(model)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])

    result = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                  evaluator)
    print(result)
    model.print_params(final=True)
def main(): parser = argparse.ArgumentParser(description="OGBN-Arxiv (MLP)") parser.add_argument("--device", type=int, default=0) parser.add_argument("--log_steps", type=int, default=1) parser.add_argument("--use_node_embedding", action="store_true") parser.add_argument("--num_layers", type=int, default=3) parser.add_argument("--hidden_channels", type=int, default=256) parser.add_argument("--dropout", type=float, default=0.5) parser.add_argument("--lr", type=float, default=0.01) parser.add_argument("--epochs", type=int, default=500) parser.add_argument("--runs", type=int, default=10) args = parser.parse_args() print(args) device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu" device = torch.device(device) dataset = PygNodePropPredDataset(name="ogbn-arxiv") split_idx = dataset.get_idx_split() data = dataset[0] x = data.x if args.use_node_embedding: embedding = torch.load("embedding.pt", map_location="cpu") x = torch.cat([x, embedding], dim=-1) x = x.to(device) y_true = data.y.to(device) train_idx = split_idx["train"].to(device) model = MLP( x.size(-1), args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout, ).to(device) evaluator = Evaluator(name="ogbn-arxiv") logger = Logger(args.runs, args) for run in range(args.runs): model.reset_parameters() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) for epoch in range(1, 1 + args.epochs): loss = train(model, x, y_true, train_idx, optimizer) result = test(model, x, y_true, split_idx, evaluator) logger.add_result(run, result) if epoch % args.log_steps == 0: train_acc, valid_acc, test_acc = result print(f"Run: {run + 1:02d}, " f"Epoch: {epoch:02d}, " f"Loss: {loss:.4f}, " f"Train: {100 * train_acc:.2f}%, " f"Valid: {100 * valid_acc:.2f}%, " f"Test: {100 * test_acc:.2f}%") logger.print_statistics(run) logger.print_statistics()
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=3000)
    parser.add_argument('--runs', type=int, default=1)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root='/mnt/ogbdata',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    # Print split sizes relative to the training split (counting nonzero
    # entries of each index vector).
    print(split_idx['train'].nonzero().size(0) / len(split_idx['train']),
          split_idx['valid'].nonzero().size(0) / len(split_idx['train']),
          split_idx['test'].nonzero().size(0) / len(split_idx['train']))
def eval_model(params):
    model_load_path, args = params
    if os.path.isdir(args.model_load_path):
        model_load_dir = args.model_load_path
        model_load_path = os.path.join(model_load_dir, model_load_path)
    print("Starting evaluating model stored at", model_load_path)

    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]

    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]

    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args)
    ckpt = torch.load(model_load_path, map_location=torch.device('cpu'))
    model.load_state_dict(ckpt['model_state_dict'])

    test_res = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                    evaluator)
    test_res["model_load_path"] = model_load_path
    return test_res
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = np.load('./embed_results/embeddings.npy')
        embedding = torch.from_numpy(embedding).float()
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()

    total_params = sum(p.numel() for p in model.parameters())
    print(f'mlp total params are {total_params}')
def main_fixed_mask(args):
    device = torch.device("cuda:" + str(args.device))
    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args.num_layers)

    # Freeze the pruning masks so this run is a fixed-mask baseline.
    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0,
               'highest_train': 0, 'epoch': 0}

    start_epoch = 1
    for epoch in range(start_epoch, args.epochs + 1):
        epoch_loss = train_fixed(model, x, edge_index, y_true, train_idx,
                                 optimizer, args)
        result = test(model, x, edge_index, y_true, split_idx, evaluator)
        train_accuracy, valid_accuracy, test_accuracy = result

        if valid_accuracy > results['highest_valid']:
            results['highest_valid'] = valid_accuracy
            results['final_train'] = train_accuracy
            results['final_test'] = test_accuracy
            results['epoch'] = epoch

        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'Baseline (FIX Mask) Epoch:[{}/{}]\t LOSS:[{:.4f}] '
              'Train:[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | '
              'Update Test:[{:.2f}] at epoch:[{}]'
              .format(epoch, args.epochs, epoch_loss,
                      train_accuracy * 100, valid_accuracy * 100,
                      test_accuracy * 100,
                      results['final_test'] * 100, results['epoch']))

    print("=" * 120)
    print("syd final: Baseline, Train:[{:.2f}] Best Val:[{:.2f}] at "
          "epoch:[{}] | Final Test Acc:[{:.2f}]"
          .format(results['final_train'] * 100,
                  results['highest_valid'] * 100,
                  results['epoch'],
                  results['final_test'] * 100))
    print("=" * 120)
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (SIGN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = SIGN(args.num_layers)(dataset[0])  # This might take a while.

    xs = [data.x] + [data[f'x{i}'] for i in range(1, args.num_layers + 1)]
    xs_train = [x[split_idx['train']].to(device) for x in xs]
    xs_valid = [x[split_idx['valid']].to(device) for x in xs]
    xs_test = [x[split_idx['test']].to(device) for x in xs]

    y_train_true = data.y[split_idx['train']].to(device)
    y_valid_true = data.y[split_idx['valid']].to(device)
    y_test_true = data.y[split_idx['test']].to(device)

    model = MLP(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, xs_train, y_train_true, optimizer)

            train_acc = test(model, xs_train, y_train_true, evaluator)
            valid_acc = test(model, xs_valid, y_valid_true, evaluator)
            test_acc = test(model, xs_test, y_test_true, evaluator)
            result = (train_acc, valid_acc, test_acc)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}%, '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
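# Hedged sketch of the SIGN train/test helpers assumed above: they feed the
# model the list of propagated feature matrices for one split at a time,
# assuming a model whose forward accepts that list.
def train(model, xs, y_true, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(xs)
    loss = F.cross_entropy(out, y_true.squeeze(1))
    loss.backward()
    optimizer.step()
    return loss.item()


@torch.no_grad()
def test(model, xs, y_true, evaluator):
    model.eval()
    y_pred = model(xs).argmax(dim=-1, keepdim=True)
    return evaluator.eval({'y_true': y_true, 'y_pred': y_pred})['acc']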
def load_ogb_2(dataset):
    ## Load the dataset
    ## Setup PyTorch
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name=dataset,
                                     transform=T.ToSparseTensor())
    ogb_data = dataset[0]
    # TODO: Not sure how to format adj_t...
    ogb_data.adj_t = ogb_data.adj_t.to_symmetric()
    ogb_data = ogb_data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = (split_idx["train"], split_idx["valid"],
                                      split_idx["test"])
    train_idx, valid_idx, test_idx = (train_idx.numpy(), valid_idx.numpy(),
                                      test_idx.numpy())

    # Convert OGB's data split pytorch index vectors to Wang's data split
    # numpy boolean masks.
    train_mask_2 = indexes2booleanvec(ogb_data.num_nodes, train_idx)
    val_mask_2 = indexes2booleanvec(ogb_data.num_nodes, valid_idx)
    test_mask_2 = indexes2booleanvec(ogb_data.num_nodes, test_idx)

    # Add 1's down the diagonal of adj_t.
    adj_t = ogb_data.adj_t.to_torch_sparse_coo_tensor()
    adj_t = adj_t + sparse_identity(adj_t.shape[0])

    # Convert OGB's adjacency SparseTensor to Wang's adjacency index
    # matrix (Nx2).
    adj_2_0 = adj_t.coalesce().indices().numpy()
    adj_2_0 = adj_2_0.T.astype('int32')
    ##adj_2_0 = np.vstack((adj_2_0, np.array([[i,i] for i in range(ogb_data.num_nodes)])))
    adj_2_1 = adj_t.coalesce().values().numpy().astype('float64')
    adj_2_2 = tuple(adj_t.size())
    # TODO: Fix the adjacency matrix; it should already be symmetric with
    # identity.
    adj_2 = (adj_2_0, adj_2_1, adj_2_2)

    from sklearn.preprocessing import OneHotEncoder
    labels_2 = ogb_data.y.numpy()
    labels_2 = OneHotEncoder(sparse=False).fit_transform(labels_2)

    # TODO: I don't know if this feature vector will work.
    # OGB used a skip-gram encoding, whereas Wang's Citeseer just used
    # normalized rows with 1-0 for different words.
    x = ogb_data.x + 1.5
    norm_x = np.apply_along_axis(np.linalg.norm, 1, x)
    x = x / norm_x[:, None]
    x = x.to_sparse()

    features_2_0 = x.indices().numpy().T.astype('int32')
    features_2_1 = x.values().numpy()
    #features_2_1 = 1.5 + features_2_1
    features_2_1 = features_2_1.astype('float64')
    features_2_2 = tuple(x.size())
    features_2 = features_2_0, features_2_1, features_2_2

    data2 = features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2
    return data2
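# Hedged sketches of two helpers assumed above: `indexes2booleanvec` builds a
# boolean mask from index positions, and `sparse_identity` returns an n x n
# identity matrix as a torch sparse COO tensor.
def indexes2booleanvec(size, indices):
    mask = np.zeros(size, dtype=bool)
    mask[indices] = True
    return mask


def sparse_identity(n):
    i = torch.arange(n)
    return torch.sparse_coo_tensor(torch.stack([i, i]), torch.ones(n), (n, n))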
def main():
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=6)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for key, idx in splitted_idx.items():
        mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        mask[idx] = True
        data[f'{key}_mask'] = mask

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)
    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # ogbn-products has 47 classes.
    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout).to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}')

        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
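# Hedged sketch of the Cluster-GCN train step used above: one optimizer step
# per partition batch, with the loss restricted to training nodes inside the
# batch (clusters without training nodes are skipped).
def train(model, loader, optimizer, device):
    model.train()
    total_loss = total_examples = 0
    for batch in loader:
        batch = batch.to(device)
        if batch.train_mask.sum() == 0:
            continue
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index)
        loss = F.cross_entropy(out[batch.train_mask],
                               batch.y.squeeze(1)[batch.train_mask])
        loss.backward()
        optimizer.step()
        num = int(batch.train_mask.sum())
        total_loss += float(loss) * num
        total_examples += num
    return total_loss / total_examples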
def arxiv_data(root):
    # Keep the same data loading logic for all architectures.
    dataset = PygNodePropPredDataset(
        name="ogbn-arxiv",
        root=root,
        # transform=T.ToSparseTensor(),
    )
    data = dataset[0]
    # data.adj_t = data.adj_t.to_symmetric()
    data.edge_index = to_undirected(data.edge_index)
    split_idx = dataset.get_idx_split()
    return data, split_idx
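# Hypothetical usage of the loader above; the "data" root is an assumption,
# point it wherever ogbn-arxiv should be downloaded or cached.
data, split_idx = arxiv_data(root="data")
train_idx = split_idx["train"]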
def get_data(args):
    dataset = PygNodePropPredDataset(name=args['dataset_name'],
                                     transform=T.ToSparseTensor())
    evaluator = Evaluator(name=args['dataset_name'])
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    split_idx = dataset.get_idx_split()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data = data.to(device)
    for setname in ['train', 'valid', 'test']:
        split_idx[setname] = split_idx[setname].to(device)

    return data, dataset, split_idx, evaluator
def get_dataset(path, name):
    assert name in ['Cora', 'CiteSeer', 'PubMed', 'DBLP', 'Karate', 'WikiCS',
                    'Coauthor-CS', 'Coauthor-Phy', 'Amazon-Computers',
                    'Amazon-Photo', 'ogbn-arxiv', 'ogbg-code']
    name = 'dblp' if name == 'DBLP' else name
    root_path = osp.expanduser('~/datasets')

    if name == 'Coauthor-CS':
        return Coauthor(root=path, name='cs',
                        transform=T.NormalizeFeatures())
    if name == 'Coauthor-Phy':
        return Coauthor(root=path, name='physics',
                        transform=T.NormalizeFeatures())
    if name == 'WikiCS':
        return WikiCS(root=path, transform=T.NormalizeFeatures())
    if name == 'Amazon-Computers':
        return Amazon(root=path, name='computers',
                      transform=T.NormalizeFeatures())
    if name == 'Amazon-Photo':
        return Amazon(root=path, name='photo',
                      transform=T.NormalizeFeatures())
    if name.startswith('ogbn'):
        return PygNodePropPredDataset(root=osp.join(root_path, 'OGB'),
                                      name=name,
                                      transform=T.NormalizeFeatures())
    return (CitationFull if name == 'dblp' else Planetoid)(
        osp.join(root_path, 'Citation'), name,
        transform=T.NormalizeFeatures())
def __init__(self, path): dataset = "ogbn-products" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) PygNodePropPredDataset(name=dataset, root=path) super(OGBNproductsDataset, self).__init__(dataset, path) # Pre-compute GCN normalization. # adj_t = self.data.adj_t.set_diag() # deg = adj_t.sum(dim=1).to(torch.float) # deg_inv_sqrt = deg.pow(-0.5) # deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0 # adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1) # self.data.adj_t = adj_t setattr(OGBNproductsDataset, "metric", "Accuracy") setattr(OGBNproductsDataset, "loss", "nll_loss") split_idx = self.get_idx_split() datalist = [] for d in self: setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0])) setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0])) setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0])) datalist.append(d) self.data, self.slices = self.collate(datalist)
def __init__(self, args=None):
    dataset = "ogbn-mag"
    path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data",
                    dataset)
    if not osp.exists(path):
        PygNodePropPredDataset(dataset, path)
    super(OGBMAGDataset, self).__init__(path, dataset)
def __init__(self, path): dataset = "ogbn-mag" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) PygNodePropPredDataset(name=dataset, root=path) super(OGBNmagDataset, self).__init__(dataset, path) # Preprocess rel_data = self[0] # We are only interested in paper <-> paper relations. self.data = Data( x=rel_data.x_dict["paper"], edge_index=rel_data.edge_index_dict[("paper", "cites", "paper")], y=rel_data.y_dict["paper"], ) # self.data = T.ToSparseTensor()(data) # self[0].adj_t = self[0].adj_t.to_symmetric() setattr(OGBNmagDataset, "metric", "Accuracy") setattr(OGBNmagDataset, "loss", "nll_loss") split_idx = self.get_idx_split() datalist = [] for d in self: setattr(d, "train_mask", index_to_mask(split_idx["train"], d.y.shape[0])) setattr(d, "val_mask", index_to_mask(split_idx["valid"], d.y.shape[0])) setattr(d, "test_mask", index_to_mask(split_idx["test"], d.y.shape[0])) datalist.append(d) self.data, self.slices = self.collate(datalist)
def __init__(self): dataset = "ogbn-papers100M" path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) if not osp.exists(path): PygNodePropPredDataset(dataset, path) super(OGBArxivDataset, self).__init__(path, dataset)
def load_ogb(name, dataset_dir):
    if name[:4] == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        for i, key in enumerate(splits.keys()):
            mask = index2mask(splits[key], size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, split_names[i], mask, len(mask))
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])
    elif name[:4] == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_graph_index', 'val_graph_index',
                       'test_graph_index']
        for i, key in enumerate(splits.keys()):
            id = splits[key]
            set_dataset_attr(dataset, split_names[i], id, len(id))
    elif name[:4] == 'ogbl':
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()
        id = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', id, id.shape[1])
            # TODO: applying the transform for negative sampling is very slow.
            dataset.transform = neg_sampling_transform
        else:
            id_neg = negative_sampling(edge_index=id,
                                       num_nodes=dataset.data.num_nodes[0],
                                       num_neg_samples=id.shape[1])
            id_all = torch.cat([id, id_neg], dim=-1)
            label = get_link_label(id, id_neg)
            set_dataset_attr(dataset, 'train_edge_index', id_all,
                             id_all.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))

        id, id_neg = splits['valid']['edge'].T, splits['valid']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'val_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'val_edge_label', label, len(label))

        id, id_neg = splits['test']['edge'].T, splits['test']['edge_neg'].T
        id_all = torch.cat([id, id_neg], dim=-1)
        label = get_link_label(id, id_neg)
        set_dataset_attr(dataset, 'test_edge_index', id_all, id_all.shape[1])
        set_dataset_attr(dataset, 'test_edge_label', label, len(label))
    else:
        raise ValueError('OGB dataset: {} does not exist'.format(name))
    return dataset
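# Hedged sketches of two helpers the loader above assumes (GraphGym defines
# its own versions; these mirror the obvious semantics):
def index2mask(index, size):
    mask = torch.zeros(size, dtype=torch.bool)
    mask[index] = True
    return mask


def get_link_label(pos_edge_index, neg_edge_index):
    # 1 for positive edges, 0 for sampled negative edges.
    num = pos_edge_index.size(1) + neg_edge_index.size(1)
    label = torch.zeros(num, dtype=torch.float)
    label[:pos_edge_index.size(1)] = 1.
    return label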
def load_data(args, datapath):
    if args.dataset in ['arxiv'] and args.task == 'lp':
        data = {}
        dataset = PygNodePropPredDataset(name='ogbn-{}'.format(args.dataset),
                                         root='/pasteur/u/jeffgu/hgcn/data')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = (split_idx["train"],
                                          split_idx["valid"],
                                          split_idx["test"])

        induced_edges_train, _ = subgraph(train_idx, dataset[0].edge_index)
        induced_edges_valid, _ = subgraph(valid_idx, dataset[0].edge_index)
        induced_edges_test, _ = subgraph(test_idx, dataset[0].edge_index)

        neg_edges_train = negative_sampling(induced_edges_train)
        neg_edges_valid = negative_sampling(induced_edges_valid)
        neg_edges_test = negative_sampling(induced_edges_test)

        data['adj_train'] = to_scipy_sparse_matrix(
            dataset[0].edge_index).tocsr()
        data['features'] = dataset[0].x
        data['train_edges'], data['train_edges_false'] = (
            induced_edges_train, neg_edges_train)
        data['val_edges'], data['val_edges_false'] = (
            induced_edges_valid, neg_edges_valid)
        data['test_edges'], data['test_edges_false'] = (
            induced_edges_test, neg_edges_test)
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        adj = data['adj_train']
        if args.task == 'lp':
            (adj_train, train_edges, train_edges_false, val_edges,
             val_edges_false, test_edges, test_edges_false) = mask_edges(
                adj, args.val_prop, args.test_prop, args.split_seed)
            data['adj_train'] = adj_train
            data['train_edges'], data['train_edges_false'] = (
                train_edges, train_edges_false)
            data['val_edges'], data['val_edges_false'] = (
                val_edges, val_edges_false)
            data['test_edges'], data['test_edges_false'] = (
                test_edges, test_edges_false)

    data['adj_train_norm'], data['features'] = process(
        data['adj_train'], data['features'], args.normalize_adj,
        args.normalize_feats)
    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
def create_dataset(name):
    if name.startswith("ogbn"):
        dataset = name[5:]
        if dataset in ["arxiv", "products", "papers100M", "proteins"]:
            return PygNodePropPredDataset(name)
    elif name.startswith("saint"):
        dataset = name[6:]
        return SAINTDataset(dataset)
    # Fall through: the name matched no known dataset family.
    raise ValueError("Unknown dataset: " + name)
def main():
    args = ArgsInit().args

    if args.use_gpu:
        device = (torch.device("cuda:" + str(args.device))
                  if torch.cuda.is_available() else torch.device("cpu"))
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes
    print(args)

    model = DeeperGCN(args)
    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    result = test(model, x, edge_index, y_true, split_idx, evaluator)
    train_accuracy, valid_accuracy, test_accuracy = result
    print({'Train': train_accuracy,
           'Validation': valid_accuracy,
           'Test': test_accuracy})

    model.print_params(final=True)
def load_ogb_graph(dataset_name):
    if not os.path.isfile('torch_geometric_data/dgl_' + dataset_name):
        dataset = PygNodePropPredDataset(name="ogbn-" + dataset_name,
                                         root='torch_geometric_data/')
        split_idx = dataset.get_idx_split()
        train_idx, valid_idx, test_idx = (split_idx["train"],
                                          split_idx["valid"],
                                          split_idx["test"])
        edge = dataset[0].edge_index
        num_classes = len(np.unique(dataset[0].y))
        print("Nodes: %d, edges: %d, features: %d, classes: %d. \n" %
              (dataset[0].y.shape[0], len(edge[0]) / 2,
               len(dataset[0].x[0]), num_classes))

        graph = dgl.DGLGraph((edge[0], edge[1]))
        graph.ndata['features'] = dataset[0].x
        graph.ndata['labels'] = dataset[0].y
        dgl.data.utils.save_graphs('torch_geometric_data/dgl_' + dataset_name,
                                   graph)
        torch.save(train_idx, 'torch_geometric_data/ogbn_' + dataset_name +
                   '/train_' + dataset_name + '.pt')
        torch.save(valid_idx, 'torch_geometric_data/ogbn_' + dataset_name +
                   '/valid_' + dataset_name + '.pt')
        torch.save(test_idx, 'torch_geometric_data/ogbn_' + dataset_name +
                   '/test_' + dataset_name + '.pt')

        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        # Append a constant 1 column to the features.
        features = torch.hstack([features,
                                 torch.ones([features.shape[0], 1])])
    else:
        graph = dgl.data.utils.load_graphs('torch_geometric_data/dgl_' +
                                           dataset_name)[0][0]
        labels = graph.ndata.pop('labels')
        features = graph.ndata.pop('features')
        features = torch.hstack([features,
                                 torch.ones([features.shape[0], 1])])
        train_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/train_' + dataset_name + '.pt')
        valid_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                               '/valid_' + dataset_name + '.pt')
        test_idx = torch.load('torch_geometric_data/ogbn_' + dataset_name +
                              '/test_' + dataset_name + '.pt')

    num_classes = len(torch.unique(labels))
    return graph, features, labels, num_classes, train_idx, valid_idx, test_idx
def __init__(self, path): dataset = "ogbn-papers100M" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) super(OGBNpapers100MDataset, self).__init__(dataset, path, transform=T.ToSparseTensor()) setattr(OGBNpapers100MDataset, "metric", "Accuracy") setattr(OGBNpapers100MDataset, "loss", "nll_loss")
def __init__(self, path): dataset = "ogbn-proteins" # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset) PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor()) super(OGBNproteinsDataset, self).__init__(dataset, path, transform=T.ToSparseTensor()) setattr(OGBNproteinsDataset, "metric", "ROC-AUC") setattr(OGBNproteinsDataset, "loss", "BCEWithLogitsLoss")
def __init__(self): dataset = "ogbn-arxiv" path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data") if not osp.exists(path): PygNodePropPredDataset(dataset, path) super(OGBArxivDataset, self).__init__(path, dataset) #to_symmetric rev_edge_index = self.data.edge_index[[1, 0]] edge_index = torch.cat([self.data.edge_index, rev_edge_index], dim = 1).to(dtype=torch.int64) self.data.edge_index, self.data.edge_attr = coalesce(edge_index, None, self.data.num_nodes, self.data.num_nodes)