def __init__(self, path):
    dataset = "ogbn-papers100M"
    # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    # Instantiated here, presumably to trigger download/processing before the
    # parent class loads the data.
    PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
    super(OGBNpapers100MDataset, self).__init__(dataset, path,
                                                transform=T.ToSparseTensor())
    setattr(OGBNpapers100MDataset, "metric", "Accuracy")
    setattr(OGBNpapers100MDataset, "loss", "nll_loss")
def __init__(self, path):
    dataset = "ogbn-proteins"
    # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    PygNodePropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
    super(OGBNproteinsDataset, self).__init__(dataset, path,
                                              transform=T.ToSparseTensor())
    setattr(OGBNproteinsDataset, "metric", "ROC-AUC")
    setattr(OGBNproteinsDataset, "loss", "BCEWithLogitsLoss")
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--kind', type=str, default="ReLU")
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = 'cpu'  # note: this overrides the CUDA device selected above
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers, args.dropout,
                     kind=args.kind).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers, args.dropout,
                    kind=args.kind).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
def __init__(
    self,
    N: int,
    dim: int,
    graph_info: dict,
    length_range: tuple,  # e.g. (5, 50)
    noise_range: tuple,   # e.g. (0.1, 0.5)
    model_types: list,    # e.g. ["fBM", "CTRW"]
    drift_range: tuple,   # e.g. (0., 0.3)
    seed_offset: int,     # e.g. 0
):
    self.N = N
    self.seed_offset = seed_offset
    self.graph_info = graph_info
    self.generators = generators[dim]
    self.model_types = model_types
    self.length_range = length_range
    self.drift_range = drift_range
    self.noise_range = noise_range
    self.dim = dim
    if not self.graph_info["features_on_edges"]:
        transform = Transforms.ToSparseTensor()
    else:
        transform = None
    super(TrajDataSet, self).__init__(transform=transform)
def __init__(self, path):
    dataset = "ogbn-proteins"
    # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    PygNodePropPredDataset(name=dataset, root=path)
    super(OGBNproteinsDataset, self).__init__(dataset, path)

    dataset_t = PygNodePropPredDataset(name=dataset, root=path,
                                       transform=T.ToSparseTensor())
    # Move edge features to node features.
    self.data.x = dataset_t[0].adj_t.mean(dim=1)
    # dataset_t[0].adj_t.set_value_(None)
    del dataset_t

    setattr(OGBNproteinsDataset, "metric", "ROC-AUC")
    setattr(OGBNproteinsDataset, "loss", "binary_cross_entropy_with_logits")

    split_idx = self.get_idx_split()
    datalist = []
    for d in self:
        setattr(d, "train_mask", index_to_mask(split_idx['train'], d.y.shape[0]))
        setattr(d, "val_mask", index_to_mask(split_idx['valid'], d.y.shape[0]))
        setattr(d, "test_mask", index_to_mask(split_idx['test'], d.y.shape[0]))
        datalist.append(d)
    self.data, self.slices = self.collate(datalist)
def load_test_data() -> Tuple[pyg.data.Data, pyg.data.Data,
                              Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
    '''
    Returns Tuple
        valid_graph  Graph containing all training edges
        test_graph   Graph containing all training edges, plus validation edges
        valid_edges  Dict of positive and negative edges from the validation
                     edge split (not in train_graph)
        test_edges   Dict of positive and negative edges from the test edge
                     split (not in valid_graph)
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    valid_edges = edge_split['valid']
    test_edges = edge_split['test']

    valid_graph = dataset[0]
    test_graph = valid_graph.clone()

    # Add validation edges (in both directions) to the test graph for inference.
    valid_edge_index = torch.cat([
        test_graph.edge_index,
        valid_edges['edge'].T,
        valid_edges['edge'][:, [1, 0]].T,
    ], dim=1)
    test_graph.edge_index = valid_edge_index

    valid_graph = transform(valid_graph)
    test_graph = transform(test_graph)
    return valid_graph, test_graph, valid_edges, test_edges
def load_training_data() -> Tuple[pyg.data.Data, pyg.data.Data,
                                  Dict[str, torch.Tensor],
                                  Dict[str, torch.Tensor],
                                  Dict[str, torch.Tensor]]:
    '''
    Returns Tuple
        train_graph  Graph containing a subset of the training edges
        valid_graph  Graph containing all training edges
        train_edges  Dict of positive edges across the entire train split
        eval_edges   Dict of positive edges from the train split that are not
                     in train_graph
        valid_edges  Dict of positive and negative edges not in train_graph
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    train_edges = edge_split['train']
    valid_edges = edge_split['valid']

    train_graph = dataset[0]
    valid_graph = train_graph.clone()

    # Partition training edges: hold out as many edges for evaluation as there
    # are validation edges.
    torch.manual_seed(12345)
    perm = torch.randperm(train_edges['edge'].shape[0])
    eval_idxs = perm[:valid_edges['edge'].shape[0]]
    train_idxs = perm[valid_edges['edge'].shape[0]:]
    eval_edges = {'edge': train_edges['edge'][eval_idxs]}
    train_edges = {'edge': train_edges['edge'][train_idxs]}

    # Update the graph object to hold the edge subset (both directions) and adj_t.
    train_edge_index = torch.cat(
        [train_edges['edge'].T, train_edges['edge'][:, [1, 0]].T], dim=1)
    train_graph.edge_index = train_edge_index

    train_graph = transform(train_graph)
    valid_graph = transform(valid_graph)
    return train_graph, valid_graph, edge_split['train'], eval_edges, valid_edges
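A minimal sketch of how the two loaders above might be consumed; the consumer shape is an assumption, not part of the original code (ogbl-ddi split dicts expose 'edge' and, for valid/test, 'edge_neg' keys):

# Hypothetical consumer of the loaders above.
train_graph, valid_graph, train_edges, eval_edges, valid_edges = load_training_data()
pos_train = train_edges['edge']      # positive pairs across the full train split
pos_eval = eval_edges['edge']        # held-out training pairs for evaluation
pos_valid = valid_edges['edge']      # positive validation pairs
neg_valid = valid_edges['edge_neg']  # negative validation pairs
# Message passing runs on train_graph.adj_t; supervision uses the pairs above.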
def main():
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=3000)
    parser.add_argument('--runs', type=int, default=1)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root='/mnt/ogbdata',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    print(split_idx['train'].nonzero().size(0) / len(split_idx['train']),
          split_idx['valid'].nonzero().size(0) / len(split_idx['train']),
          split_idx['test'].nonzero().size(0) / len(split_idx['train']))
def __init__(self, data_dir):
    super().__init__()
    self.data_dir = data_dir
    self.transform = T.Compose([
        T.OneHotDegree(self.num_features - 1),
        T.ToSparseTensor(),
    ])
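For context, T.OneHotDegree(max_degree) builds one-hot node-degree features with max_degree + 1 columns, which is why num_features - 1 is passed above (num_features is presumably defined by the enclosing data module). A standalone toy sketch; the three-node graph is invented for illustration:

import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data

toy = Data(edge_index=torch.tensor([[0, 0, 1], [1, 2, 2]]), num_nodes=3)
toy = T.OneHotDegree(3)(toy)  # node degrees [2, 1, 0] -> toy.x of shape [3, 4]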
def load_ogb_2(dataset):
    ## Load the dataset
    ## Setup PyTorch
    device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device_name)

    dataset = PygNodePropPredDataset(name=dataset, transform=T.ToSparseTensor())
    ogb_data = dataset[0]
    # TODO: Not sure how to format adj_t...
    ogb_data.adj_t = ogb_data.adj_t.to_symmetric()
    ogb_data = ogb_data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx, valid_idx, test_idx = (split_idx["train"], split_idx["valid"],
                                      split_idx["test"])
    # Fixed: the original assigned the third element to `split_idx`,
    # clobbering the split dict and leaving `test_idx` a tensor.
    train_idx, valid_idx, test_idx = (train_idx.numpy(), valid_idx.numpy(),
                                      test_idx.numpy())

    # Convert OGB's data split index vectors to Wang's data split numpy
    # boolean masks.
    train_mask_2 = indexes2booleanvec(ogb_data.num_nodes, train_idx)
    val_mask_2 = indexes2booleanvec(ogb_data.num_nodes, valid_idx)
    test_mask_2 = indexes2booleanvec(ogb_data.num_nodes, test_idx)

    # Add 1's down the diagonal of adj_t.
    adj_t = ogb_data.adj_t.to_torch_sparse_coo_tensor()
    adj_t = adj_t + sparse_identity(adj_t.shape[0])

    # Convert OGB's adjacency SparseTensor to Wang's adjacency index matrix (Nx2).
    # .cpu() guards the .numpy() calls when the graph lives on GPU.
    adj_2_0 = adj_t.coalesce().indices().cpu().numpy()
    adj_2_0 = adj_2_0.T.astype('int32')
    ##adj_2_0 = np.vstack((adj_2_0, np.array([[i,i] for i in range(ogb_data.num_nodes)])))
    adj_2_1 = adj_t.coalesce().values().cpu().numpy().astype('float64')
    adj_2_2 = tuple(adj_t.size())
    # TODO: Fix the adjacency matrix, because it probably is symmetric with identity.
    adj_2 = (adj_2_0, adj_2_1, adj_2_2)

    from sklearn.preprocessing import OneHotEncoder
    labels_2 = ogb_data.y.cpu().numpy()
    labels_2 = OneHotEncoder(sparse=False).fit_transform(labels_2)  # sparse_output=False on sklearn >= 1.2

    # TODO: I don't know if this feature vector will work.
    # OGB used a skip-gram encoding, whereas Wang's Citeseer just used
    # normalized rows with 1-0 for different words.
    x = ogb_data.x.cpu() + 1.5
    norm_x = np.apply_along_axis(np.linalg.norm, 1, x)
    x = x / norm_x[:, None]
    x = x.to_sparse()
    features_2_0 = x.indices().numpy().T.astype('int32')
    features_2_1 = x.values().numpy()
    #features_2_1 = 1.5 + features_2_1
    features_2_1 = features_2_1.astype('float64')
    features_2_2 = tuple(x.size())
    features_2 = features_2_0, features_2_1, features_2_2

    data2 = features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2
    return data2
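indexes2booleanvec and sparse_identity are not defined in this snippet; minimal sketches consistent with how they are used above (assumptions, not the original helpers):

import numpy as np
import torch

def indexes2booleanvec(num_nodes, idx):
    # Index vector -> boolean mask of length num_nodes.
    mask = np.zeros(num_nodes, dtype=bool)
    mask[idx] = True
    return mask

def sparse_identity(n):
    # n x n identity as a torch sparse COO tensor, addable to adj_t above.
    eye = torch.arange(n)
    return torch.sparse_coo_tensor(torch.stack([eye, eye]),
                                   torch.ones(n), (n, n))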
def __init__(self, path):
    dataset = "ogbl-biokg"
    # path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data", dataset)
    PygLinkPropPredDataset(name=dataset, root=path, transform=T.ToSparseTensor())
    super(OGBLbiokgDataset, self).__init__(dataset, path)
    setattr(OGBLbiokgDataset, "metric", "MRR")
    setattr(OGBLbiokgDataset, "loss", "pos_neg_loss")
def get_data(args):
    dataset = PygNodePropPredDataset(name=args['dataset_name'],
                                     transform=T.ToSparseTensor())
    evaluator = Evaluator(name=args['dataset_name'])
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    split_idx = dataset.get_idx_split()

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    data = data.to(device)
    for setname in ['train', 'valid', 'test']:
        split_idx[setname] = split_idx[setname].to(device)
    return data, dataset, split_idx, evaluator
def forward(self, data):
    data = T.ToSparseTensor()(data)
    x, adj_t = data.x, data.adj_t
    # gcn_norm on a SparseTensor returns the normalized SparseTensor directly
    # (not a tuple), so it is passed as-is below rather than indexed.
    adj_t = gcn_norm(adj_t)

    x = F.dropout(x, self.dropout, training=self.training)
    x = x_0 = self.lins[0](x).relu()

    for conv in self.convs:
        x = F.dropout(x, self.dropout, training=self.training)
        x = conv(x=x, x_0=x_0, edge_index=adj_t)
        x = x.relu()

    z = x
    x = F.dropout(x, self.dropout, training=self.training)
    x = self.lins[1](x)
    return z, x.log_softmax(dim=-1)
def __init__(
    self,
    dim: int,
    graph_info: dict,
    trajs: list,
):
    self.N = len(trajs)
    self.trajs = trajs
    self.graph_info = graph_info
    self.dim = dim
    if not self.graph_info["features_on_edges"]:
        transform = Transforms.ToSparseTensor()
    else:
        transform = None
    super(ExpTrajDataSet, self).__init__(transform=transform)
def get_dataset(name, root, use_sparse_tensor):
    path = osp.join(osp.dirname(osp.realpath(__file__)), root, name)
    transform = T.ToSparseTensor() if use_sparse_tensor else None
    if name == 'ogbn-mag':
        if transform is None:
            transform = T.ToUndirected(merge=True)
        else:
            transform = T.Compose([T.ToUndirected(merge=True), transform])
        dataset = OGB_MAG(root=path, preprocess='metapath2vec',
                          transform=transform)
    elif name == 'ogbn-products':
        dataset = PygNodePropPredDataset('ogbn-products', root=path,
                                         transform=transform)
    elif name == 'Reddit':
        dataset = Reddit(root=path, transform=transform)

    return dataset[0], dataset.num_classes
def setup_ogb(self):
    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root=self.root,
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    self.metric = 'Accuracy'
    self.num_classes = dataset.num_classes
    self.split_idx = dataset.get_idx_split()
    self.x = data.x
    self.y = data.y
    self.adj_t = data.adj_t.to_symmetric()
    self.num_nodes = data.num_nodes
    if self.make_edge_index:
        row = self.adj_t.storage.row()
        col = self.adj_t.storage.col()
        self.edge_index = torch.stack((row, col), dim=0)
    self.criterion = torch.nn.CrossEntropyLoss()
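The adj_t -> edge_index conversion above can be sanity-checked on a toy graph; a standalone sketch, not part of the original class:

import torch
from torch_sparse import SparseTensor

edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
adj_t = SparseTensor.from_edge_index(edge_index).t().to_symmetric()
row, col = adj_t.storage.row(), adj_t.storage.col()
recovered = torch.stack((row, col), dim=0)  # symmetric edge set of the toy graph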
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=50)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)

    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = GCNWithAttention(args.hidden_channels, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)
    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    print("model parameters {}".format(
        sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(
        sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(
        data.num_nodes * args.hidden_channels +
        sum(p.numel() for p in model.parameters()) +
        sum(p.numel() for p in predictor.parameters())))

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters()) +
            list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj_t, split_edge,
                         optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, emb.weight, adj_t, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
import os.path as osp

from ogb.nodeproppred import PygNodePropPredDataset, Evaluator

import torch_geometric.transforms as T
from torch_geometric.nn import LabelPropagation

root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OGB')
dataset = PygNodePropPredDataset('ogbn-arxiv', root,
                                 transform=T.Compose([
                                     T.ToUndirected(),
                                     T.ToSparseTensor(),
                                 ]))
split_idx = dataset.get_idx_split()
evaluator = Evaluator(name='ogbn-arxiv')
data = dataset[0]

model = LabelPropagation(num_layers=3, alpha=0.9)
out = model(data.y, data.adj_t, mask=split_idx['train'])
y_pred = out.argmax(dim=-1, keepdim=True)

val_acc = evaluator.eval({
    'y_true': data.y[split_idx['valid']],
    'y_pred': y_pred[split_idx['valid']],
})['acc']
test_acc = evaluator.eval({
    'y_true': data.y[split_idx['test']],
    'y_pred': y_pred[split_idx['test']],
})['acc']
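The script computes both accuracies but never reports them; a minimal addition to surface the result:

print(f'Val: {val_acc:.4f}, Test: {test_acc:.4f}')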
def __init__(self, data_dir):
    super().__init__()
    self.data_dir = data_dir
    self.transform = T.ToSparseTensor()
def main():
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=100)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)

    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    model = GCNWithAttention(data.num_features, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    print("model parameters {}".format(
        sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(
        sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(
        sum(p.numel() for p in model.parameters()) +
        sum(p.numel() for p in predictor.parameters())))

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
import os.path as osp

import torch
from torch.nn import Linear
import torch.nn.functional as F
from sklearn.metrics import f1_score

from torch_geometric.datasets import PPI
import torch_geometric.transforms as T
from torch_geometric.nn import GCN2Conv
from torch_geometric.loader import DataLoader

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'GCN2_PPI')
pre_transform = T.Compose([T.GCNNorm(), T.ToSparseTensor()])
train_dataset = PPI(path, split='train', pre_transform=pre_transform)
val_dataset = PPI(path, split='val', pre_transform=pre_transform)
test_dataset = PPI(path, split='test', pre_transform=pre_transform)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False)


class Net(torch.nn.Module):
    def __init__(self, hidden_channels, num_layers, alpha, theta,
                 shared_weights=True, dropout=0.0):
        super(Net, self).__init__()

        self.lins = torch.nn.ModuleList()
        self.lins.append(Linear(train_dataset.num_features, hidden_channels))
        self.lins.append(Linear(hidden_channels, train_dataset.num_classes))

        self.convs = torch.nn.ModuleList()
def main(): parser = argparse.ArgumentParser(description="OGBL-COLLAB (GNN)") parser.add_argument("--device", type=int, default=0) parser.add_argument("--log_steps", type=int, default=1) parser.add_argument("--use_sage", action="store_true") parser.add_argument("--use_valedges_as_input", action="store_true") parser.add_argument("--num_layers", type=int, default=3) parser.add_argument("--hidden_channels", type=int, default=256) parser.add_argument("--dropout", type=float, default=0.0) parser.add_argument("--batch_size", type=int, default=64 * 1024) parser.add_argument("--lr", type=float, default=0.001) parser.add_argument("--epochs", type=int, default=400) parser.add_argument("--eval_steps", type=int, default=1) parser.add_argument("--runs", type=int, default=1) parser.add_argument("--seed",type=int,default=1) args = parser.parse_args() print(args) device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu" device = torch.device(device) dataset = PygLinkPropPredDataset(name="ogbl-collab") data = dataset[0] edge_index = data.edge_index data.edge_weight = data.edge_weight.view(-1).to(torch.float) data = T.ToSparseTensor()(data) split_edge = dataset.get_edge_split() # Use training + validation edges for inference on test set. if args.use_valedges_as_input: val_edge_index = split_edge["valid"]["edge"].t() full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1) data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t() data.full_adj_t = data.full_adj_t.to_symmetric() else: data.full_adj_t = data.adj_t data = data.to(device) if args.use_sage: model = SAGE( data.num_features, args.hidden_channels, args.hidden_channels, args.num_layers, args.dropout, ).to(device) else: model = GCN( data.num_features, args.hidden_channels, args.hidden_channels, args.num_layers, args.dropout, ).to(device) predictor = LinkPredictor( args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout ).to(device) evaluator = Evaluator(name="ogbl-collab") loggers = { "Hits@10": Logger(args.runs, args), "Hits@50": Logger(args.runs, args), "Hits@100": Logger(args.runs, args), } for run in tqdm(range(args.runs)): torch.manual_seed(args.seed + run) np.random.seed(args.seed+run) model.reset_parameters() predictor.reset_parameters() optimizer = torch.optim.Adam( list(model.parameters()) + list(predictor.parameters()), lr=args.lr ) for epoch in range(1, 1 + args.epochs): loss = train(model, predictor, data, split_edge, optimizer, args.batch_size) if epoch % args.eval_steps == 0: results = test( model, predictor, data, split_edge, evaluator, args.batch_size ) for key, result in results.items(): loggers[key].add_result(run, result) if epoch % args.log_steps == 0: for key, result in results.items(): train_hits, valid_hits, test_hits = result print(key) print( f"Run: {run + 1:02d}, " f"Epoch: {epoch:02d}, " f"Loss: {loss:.4f}, " f"Train: {100 * train_hits:.2f}%, " f"Valid: {100 * valid_hits:.2f}%, " f"Test: {100 * test_hits:.2f}%" ) print("---") for key in loggers.keys(): print(key) loggers[key].print_statistics(run) for key in loggers.keys(): print(key) loggers[key].print_statistics()
def __init__(self, final_graph, x_data, y_data):
    super(Dataset, self).__init__()
    self.final_graph = final_graph
    self.x_data = x_data
    self.y_data = y_data
    self.transform = T.ToSparseTensor(remove_edge_index=False)
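By default ToSparseTensor deletes edge_index after building adj_t; remove_edge_index=False keeps both representations. A standalone toy sketch of that behavior:

import torch
import torch_geometric.transforms as T
from torch_geometric.data import Data

toy = Data(edge_index=torch.tensor([[0, 1], [1, 0]]), num_nodes=2)
toy = T.ToSparseTensor(remove_edge_index=False)(toy)
assert toy.edge_index is not None and toy.adj_t is not None  # both kept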
def main():
    parser = argparse.ArgumentParser(description='OGBN-Proteins (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=1000)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-proteins',
                                     transform=T.ToSparseTensor())
    data = dataset[0]

    # Move edge features to node features.
    data.x = data.adj_t.mean(dim=1)
    data.adj_t.set_value_(None)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels, 112,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels, 112,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj_t = data.adj_t.set_diag()
        deg = adj_t.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
        data.adj_t = adj_t

    data = data.to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)

            if epoch % args.eval_steps == 0:
                result = test(model, data, split_idx, evaluator)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_rocauc, valid_rocauc, test_rocauc = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {100 * train_rocauc:.2f}%, '
                          f'Valid: {100 * valid_rocauc:.2f}% '
                          f'Test: {100 * test_rocauc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
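The block above hand-rolls the symmetric GCN normalization D^{-1/2}(A + I)D^{-1/2}. For reference, PyG ships the same computation as gcn_norm (import path as in recent PyG versions; a sketch, not part of the original script):

from torch_geometric.nn.conv.gcn_conv import gcn_norm

# Equivalent to set_diag() followed by the degree scaling above.
adj_norm = gcn_norm(data.adj_t)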
import os.path as osp

import torch

from ogb.nodeproppred import Evaluator, PygNodePropPredDataset

import torch_geometric.transforms as T
from torch_geometric.nn import MLP, CorrectAndSmooth

root = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OGB')
dataset = PygNodePropPredDataset('ogbn-products', root,
                                 transform=T.ToSparseTensor())
evaluator = Evaluator(name='ogbn-products')
split_idx = dataset.get_idx_split()
data = dataset[0]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MLP([dataset.num_features, 200, 200, dataset.num_classes],
            dropout=0.5, batch_norm=True, relu_first=True).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

x, y = data.x.to(device), data.y.to(device)
train_idx = split_idx['train'].to(device)
val_idx = split_idx['valid'].to(device)
test_idx = split_idx['test'].to(device)
x_train, y_train = x[train_idx], y[train_idx]
# NOTE: argparse/osp/Evaluator/OGBG were used but not imported in the original
# snippet; OGBG is assumed to alias OGB's PygGraphPropPredDataset, matching
# its use with 'ogbg-molhiv' below.
import argparse
import os.path as osp

from ogb.graphproppred import Evaluator
from ogb.graphproppred import PygGraphPropPredDataset as OGBG
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch.nn import BatchNorm1d, Linear, ReLU, Sequential
from torch.optim.lr_scheduler import ReduceLROnPlateau

import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from torch_geometric.nn import EGConv, global_mean_pool

parser = argparse.ArgumentParser()
parser.add_argument('--use_multi_aggregators', action='store_true',
                    help='Switch between EGC-S and EGC-M')
args = parser.parse_args()

path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'OGB')
dataset = OGBG('ogbg-molhiv', path, pre_transform=T.ToSparseTensor())
evaluator = Evaluator('ogbg-molhiv')

split_idx = dataset.get_idx_split()
train_dataset = dataset[split_idx['train']]
val_dataset = dataset[split_idx['valid']]
test_dataset = dataset[split_idx['test']]

train_loader = DataLoader(train_dataset, batch_size=32, num_workers=4,
                          shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=256)
test_loader = DataLoader(test_dataset, batch_size=256)
def run(file, data_name, model_name, lr):
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--use_nd', action='store_true')
    parser.add_argument('--use_lgae', action='store_true')
    parser.add_argument('--use_vgae', action='store_true')
    parser.add_argument('--model', type=str, default='')
    parser.add_argument('--dataset', type=str, default='Citeseer')
    args = parser.parse_args()

    if data_name is not None and model_name is not None and lr is not None:
        args.dataset = data_name
        args.model = model_name
        args.lr = lr
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # device = 'cpu'
    device = torch.device(device)

    dataset = CitationFull(os.path.join('citation_data', args.dataset),
                           name=args.dataset, transform=T.ToSparseTensor())
    num_training = int(dataset.__len__() * 0.8)
    num_val = int(dataset.__len__() * 0.1)
    num_test = dataset.__len__() - (num_training + num_val)

    data = dataset[0]
    adj_t = data.adj_t.to(device)
    edge_index, edge_type = utils.dense_to_sparse(adj_t.to_dense())
    data.edge_index = edge_index
    data.x = data.x.to(device)
    num_nodes = data.x.shape[0]
    num_edges = data.edge_index.shape[1]
    print(data)
    # nx_data = to_networkx(data, to_undirected=True)
    # print('graph density=' + str(2 * num_edges / (num_nodes * (num_nodes - 1))))
    # print('clustering coefficient=' + str(nx.average_clustering(nx_data)))

    decoder_enable = args.model[-3:]
    if args.model[-3:] == '-nd':
        model_name = args.model[:-3]

    if model_name == 'lgae':
        model = LGAE(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'vgae':
        model = DeepVGAE(data.num_features, args.hidden_channels,
                         args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'gae':
        model = GraphAutoEncoder(data.num_features, args.hidden_channels,
                                 args.hidden_channels, args.num_layers,
                                 args.dropout)
    elif model_name == 'arga':
        model = AdversarialGAE(data.num_features, args.hidden_channels,
                               args.hidden_channels, args.num_layers,
                               args.dropout)
    elif model_name == 'arvga':
        model = AdversarialVGAE(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)
    elif model_name == 'lrga':
        model = LRGA(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers, args.dropout)
    elif model_name == 'sage':
        model = SAGEAutoEncoder(data.num_features, args.hidden_channels,
                                args.hidden_channels, args.num_layers,
                                args.dropout)

    if decoder_enable == '-nd':
        model.decoder = NeuralDecoder(args.hidden_channels,
                                      args.hidden_channels, 1,
                                      args.num_layers, args.dropout)

    evaluator = Evaluator(name='ogbl-ddi')
    model = model.to(device)

    loggers = {}
    K_list = ['20', '50', '100']
    for k in K_list:
        loggers['Hits@' + k] = Logger(args.runs, args)

    for run in range(args.runs):
        torch.manual_seed(run)
        split_edge = utils.train_test_split_edges(data)
        # print(split_edge.train_pos_edge_index.shape)
        # print(split_edge.val_pos_edge_index.shape)
        # exit()
        split_edge.edge_index = edge_index
        # emb.weight.data = features
        model.reset_parameters()

        if args.model in ['arga', 'arga-nd', 'arvga', 'arvga-nd']:
            args.lr = 0.005
        optimizer = torch.optim.Adam(list(model.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x, adj_t, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, data.x, adj_t, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        (train_hits, valid_hits, test_hits,
                         test_auc, test_ap, val_auc, val_ap) = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'auc: {100 * test_auc:.2f}%, '
                              f'ap: {100 * test_ap:.2f}%, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        toWrite = loggers[key].print_statistics()
        file.write(str(args.lr) + ' ' + key + ' ' + args.model + "'" +
                   str(toWrite) + '\n')
        file.flush()
def main(): parser = argparse.ArgumentParser(description="OGBN-Arxiv (GNN)") parser.add_argument("--device", type=int, default=0) parser.add_argument("--log_steps", type=int, default=1) parser.add_argument("--use_sage", action="store_true") parser.add_argument("--num_layers", type=int, default=3) parser.add_argument("--hidden_channels", type=int, default=256) parser.add_argument("--dropout", type=float, default=0.5) parser.add_argument("--lr", type=float, default=0.01) parser.add_argument("--epochs", type=int, default=500) parser.add_argument("--runs", type=int, default=10) args = parser.parse_args() print(args) device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu" device = torch.device(device) dataset = PygNodePropPredDataset(name="ogbn-arxiv", transform=T.ToSparseTensor()) data = dataset[0] data.adj_t = data.adj_t.to_symmetric() data = data.to(device) split_idx = dataset.get_idx_split() train_idx = split_idx["train"].to(device) if args.use_sage: model = SAGE( data.num_features, args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout, ).to(device) else: model = GCN( data.num_features, args.hidden_channels, dataset.num_classes, args.num_layers, args.dropout, ).to(device) evaluator = Evaluator(name="ogbn-arxiv") logger = Logger(args.runs, args) for run in range(args.runs): model.reset_parameters() optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) for epoch in range(1, 1 + args.epochs): loss = train(model, data, train_idx, optimizer) result = test(model, data, split_idx, evaluator) logger.add_result(run, result) if epoch % args.log_steps == 0: train_acc, valid_acc, test_acc = result print(f"Run: {run + 1:02d}, " f"Epoch: {epoch:02d}, " f"Loss: {loss:.4f}, " f"Train: {100 * train_acc:.2f}%, " f"Valid: {100 * valid_acc:.2f}% " f"Test: {100 * test_acc:.2f}%") logger.print_statistics(run) logger.print_statistics()
def main():
    parser = argparse.ArgumentParser(description='gen_models')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='arxiv')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model', type=str, default='mlp')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--use_embeddings', action='store_true')
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name=f'ogbn-{args.dataset}',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()

    x = data.x
    split_idx = dataset.get_idx_split()
    preprocess_data = PygNodePropPredDataset(name=f'ogbn-{args.dataset}')[0]
    if args.dataset == 'arxiv':
        embeddings = torch.cat([
            preprocess(preprocess_data, 'diffusion', post_fix=args.dataset),
            preprocess(preprocess_data, 'spectral', post_fix=args.dataset),
        ], dim=-1)
    elif args.dataset == 'products':
        embeddings = preprocess(preprocess_data, 'spectral',
                                post_fix=args.dataset)

    if args.use_embeddings:
        x = torch.cat([x, embeddings], dim=-1)

    if args.dataset == 'arxiv':
        x = (x - x.mean(0)) / x.std(0)

    if args.model == 'mlp':
        model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                    args.num_layers, 0.5,
                    args.dataset == 'products').to(device)
    elif args.model == 'linear':
        model = MLPLinear(x.size(-1), dataset.num_classes).to(device)
    elif args.model == 'plain':
        model = MLPLinear(x.size(-1), dataset.num_classes).to(device)
    elif args.model == 'sgc':
        model = SGC(x.size(-1), dataset.num_classes).to(device)

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model_dir = prepare_folder(f'{args.dataset}_{args.model}', model)

    evaluator = Evaluator(name=f'ogbn-{args.dataset}')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        import gc
        gc.collect()
        print(sum(p.numel() for p in model.parameters()))
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        best_valid = 0
        best_out = None
        for epoch in range(1, args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result, out = test(model, x, y_true, split_idx, evaluator)
            train_acc, valid_acc, test_acc = result
            if valid_acc > best_valid:
                best_valid = valid_acc
                best_out = out.cpu().exp()

            if epoch % 10 == 0:
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')
            logger.add_result(run, result)

        logger.print_statistics(run)
        torch.save(best_out, f'{model_dir}/{run}.pt')

    logger.print_statistics()
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model', type=str, default='MAD_GCN',
                        choices=['GCN_Linear', 'SAGE_Linear', 'MAD_GCN',
                                 'MAD_SAGE', 'MAD_Model'])
    parser.add_argument('--train_batch_size', type=int, default=4096)
    parser.add_argument('--test_batch_size', type=int, default=1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=5)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)
    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = models.get_model(args.model)(data.num_nodes, adj_t).to(device)
    print(f"Parameters: {count_parameters(model)}")

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, adj_t, split_edge, optimizer,
                         args.train_batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, adj_t, split_edge, evaluator,
                               args.test_batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')
            print(f'Finished epoch {epoch}')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()