示例#1
0
def main():
    """Train and evaluate a GCN or GraphSAGE model on the ogbn-arxiv dataset.

    Hyper-parameters are read from the command line. For each of
    ``args.runs`` runs the model parameters are reset, the model is trained
    with Adam for ``args.epochs`` epochs and evaluated after every epoch.
    Results are collected by ``Logger``, which prints per-run statistics and
    a final summary over all runs.
    """
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--kind', type=str, default="ReLU")
    args = parser.parse_args()
    print(args)

    # Honour --device: use the requested GPU when CUDA is available and fall
    # back to the CPU otherwise. Do not force 'cpu' here, or the --device
    # flag silently has no effect.
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name='ogbn-arxiv',
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    # Symmetrise the sparse adjacency before moving the graph to the device.
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    train_idx = split_idx['train'].to(device)

    # Both model classes take the same constructor arguments; --use_sage
    # only selects which one is built.
    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     dataset.num_classes, args.num_layers,
                     args.dropout, kind=args.kind).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    dataset.num_classes, args.num_layers,
                    args.dropout, kind=args.kind).to(device)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        # Every run starts from freshly initialised weights and optimizer.
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data, train_idx, optimizer)
            result = test(model, data, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {100 * train_acc:.2f}%, '
                      f'Valid: {100 * valid_acc:.2f}% '
                      f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
示例#2
0
def load_data(dataset_name="Cora", seed=10, n_splits=5):
    """Load a node-classification dataset by name and attach its splits.

    Args:
        dataset_name: One of "CS", "Physics", "Computers", "Photo", "Cora",
            "Citeseer", "Pubmed", "Arxiv", "Papers", "Products" or "MAG".
        seed: Forwarded to ``split_data`` for datasets that are not listed
            in ``obg_datasets``.
        n_splits: Forwarded to ``split_data`` for datasets that are not
            listed in ``obg_datasets``.

    Returns:
        The object returned by ``split_ogb_data`` for datasets listed in
        ``obg_datasets``; otherwise the first graph of the dataset after
        ``split_data``, with ``num_classes`` copied over from the dataset.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    # Path in which the data will be stored
    path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data',
                    dataset_name)
    if dataset_name in ["CS", "Physics"]:
        dataset = Coauthor(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Computers", "Photo"]:
        dataset = Amazon(path, dataset_name, T.NormalizeFeatures())
    elif dataset_name in ["Cora", "Citeseer", "Pubmed"]:
        dataset = Planetoid(path,
                            dataset_name,
                            split='public',
                            transform=T.NormalizeFeatures())
    elif dataset_name in ["Arxiv", "Papers", "Products"]:
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path,
                                         transform=T.NormalizeFeatures())
    elif dataset_name == "MAG":
        # MAG is loaded without the feature-normalisation transform.
        dataset = PygNodePropPredDataset(name=ogb_data_name_conv[dataset_name],
                                         root=path)
    else:
        # Build a single message string; passing two positional arguments
        # to the exception would render the message as a tuple.
        raise ValueError(f"[!] Dataset not found: {dataset_name}")
    if dataset_name in obg_datasets:
        data = split_ogb_data(dataset, dataset_name)
    else:
        data = dataset[0]  # pyg graph object
        data = split_data(data, seed, n_splits)
        data.num_classes = dataset.num_classes
    return data
示例#3
0
def eval_with_partition(args):
    """Evaluate a saved DeeperGCN checkpoint with partition-based testing.

    Args:
        args: Namespace providing ``model_load_path``, ``dataset``,
            ``data_folder``, ``self_loop``, ``eval_cluster_number`` and
            ``partition_method``. ``in_channels`` and ``num_tasks`` are
            written onto it before the model is built.

    Returns:
        Whatever ``test_with_partition`` returns for the loaded model.
    """
    model_load_path = args.model_load_path
    print("Starting evaluating model stored at", model_load_path)

    device = torch.device("cuda")

    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]
    adj = SparseTensor(row=graph.edge_index[0], col=graph.edge_index[1])
    if args.self_loop:
        # Keep the sparse adjacency and the edge list consistent: both get
        # self-loops.
        adj = adj.set_diag()
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    # The model constructor reads its input/output sizes from ``args``.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    # Remap the stored tensors onto the evaluation device; without
    # map_location they are restored onto the device they were saved from,
    # which may not exist on this machine.
    ckpt = torch.load(model_load_path, map_location=device)
    model.load_state_dict(ckpt['model_state_dict'])

    res = test_with_partition(model,
                              graph,
                              adj,
                              split_idx,
                              num_clusters=args.eval_cluster_number,
                              partition_method=args.partition_method,
                              evaluator=evaluator,
                              device=device)
    print(res)
    return res
示例#4
0
文件: ogb.py 项目: zdcuob/AutoGL
    def __init__(self, path):
        """Set up the ogbn-proteins dataset rooted at ``path``.

        Node features are replaced by the per-row mean of the sparse
        adjacency values, the class-level ``metric`` and ``loss`` names are
        set, and boolean train/val/test masks derived from the official
        split are attached to every graph before re-collating.
        """
        name = "ogbn-proteins"
        # Instantiated and discarded before the parent constructor runs
        # (presumably to make sure the files exist under ``path``).
        PygNodePropPredDataset(name=name, root=path)
        super(OGBNproteinsDataset, self).__init__(name, path)

        sparse_view = PygNodePropPredDataset(name=name,
                                             root=path,
                                             transform=T.ToSparseTensor())
        # Move edge features to node features.
        self.data.x = sparse_view[0].adj_t.mean(dim=1)
        del sparse_view

        OGBNproteinsDataset.metric = "ROC-AUC"
        OGBNproteinsDataset.loss = "binary_cross_entropy_with_logits"

        split_idx = self.get_idx_split()
        # (attribute on the graph, key in the split dictionary)
        mask_specs = (
            ("train_mask", "train"),
            ("val_mask", "valid"),
            ("test_mask", "test"),
        )
        datalist = []
        for graph in self:
            for attr_name, split_key in mask_specs:
                mask = index_to_mask(split_idx[split_key], graph.y.shape[0])
                setattr(graph, attr_name, mask)
            datalist.append(graph)
        self.data, self.slices = self.collate(datalist)
示例#5
0
def get_product_clusters():
    """Load ogbn-products, attach split masks and partition the graph.

    Returns:
        A tuple ``(cluster_data, dataset, data, split_idx)`` where ``data``
        carries boolean ``train_mask``, ``valid_mask`` and ``test_mask``
        entries and ``cluster_data`` is the ``ClusterData`` partitioning of
        ``data`` into 15000 parts.
    """
    dataset_name = "ogbn-products"
    dataset = PygNodePropPredDataset(name=dataset_name)

    print('The {} dataset has {} graph'.format(dataset_name, len(dataset)))

    data = dataset[0]
    print(data)
    split_idx = dataset.get_idx_split()

    # One boolean node mask per split, stored as '<split>_mask'.
    for split_name in ('train', 'valid', 'test'):
        node_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        node_mask[split_idx[split_name]] = True
        data[split_name + '_mask'] = node_mask

    cluster_data = ClusterData(data, num_parts=15000, save_dir="dataset")
    return cluster_data, dataset, data, split_idx
示例#6
0
def main():
    """Train and evaluate an MLP on ogbn-proteins.

    Node inputs are the mean of the edge attributes scattered by source
    node index, optionally concatenated with embeddings loaded from
    ``embedding.pt``. The model is evaluated every ``--eval_steps`` epochs
    and results are collected by ``Logger``.
    """
    parser = argparse.ArgumentParser(description='OGBN-Proteins (MLP)')
    cli_options = [
        ('--device', {'type': int, 'default': 0}),
        ('--log_steps', {'type': int, 'default': 1}),
        ('--use_node_embedding', {'action': 'store_true'}),
        ('--num_layers', {'type': int, 'default': 3}),
        ('--hidden_channels', {'type': int, 'default': 256}),
        ('--dropout', {'type': float, 'default': 0.5}),
        ('--lr', {'type': float, 'default': 0.01}),
        ('--epochs', {'type': int, 'default': 1000}),
        ('--eval_steps', {'type': int, 'default': 5}),
        ('--runs', {'type': int, 'default': 10}),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-proteins')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    # Aggregate edge attributes onto nodes (mean over edge_index[0]).
    x = scatter(data.edge_attr, data.edge_index[0], dim=0,
                dim_size=data.num_nodes, reduce='mean')
    x = x.to('cpu')

    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)

    x = x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, 112, args.num_layers,
                args.dropout)
    model = model.to(device)

    evaluator = Evaluator(name='ogbn-proteins')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)

            # Evaluation (and therefore logging) only happens on
            # evaluation epochs.
            if epoch % args.eval_steps != 0:
                continue
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps != 0:
                continue
            train_rocauc, valid_rocauc, test_rocauc = result
            summary = (f'Run: {run + 1:02d}, '
                       f'Epoch: {epoch:02d}, '
                       f'Loss: {loss:.4f}, '
                       f'Train: {100 * train_rocauc:.2f}%, '
                       f'Valid: {100 * valid_rocauc:.2f}% '
                       f'Test: {100 * test_rocauc:.2f}%')
            print(summary)

        logger.print_statistics(run)
    logger.print_statistics()
示例#7
0
def main():
    """Load a saved DeeperGCN checkpoint and evaluate it on the full graph.

    Arguments come from ``ArgsInit``. The model and the graph both stay on
    the CPU; the evaluation result and the model parameters are printed.
    """

    args = ArgsInit().args

    dataset = PygNodePropPredDataset(name=args.dataset)
    graph = dataset[0]

    if args.self_loop:
        graph.edge_index = add_self_loops(edge_index=graph.edge_index,
                                          num_nodes=graph.num_nodes)[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    # The model constructor reads its input/output sizes from ``args``.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    print(args)

    model = DeeperGCN(args)

    print(model)

    # The model is never moved off the CPU, so restore the checkpoint onto
    # the CPU as well; otherwise a checkpoint saved from a GPU cannot be
    # loaded on a machine without CUDA.
    ckpt = torch.load(args.model_load_path, map_location=torch.device('cpu'))
    model.load_state_dict(ckpt['model_state_dict'])
    result = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                  evaluator)
    print(result)
    model.print_params(final=True)
0
def main():
    parser = argparse.ArgumentParser(description="OGBN-Arxiv (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=500)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygNodePropPredDataset(name="ogbn-arxiv")
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx["train"].to(device)

    model = MLP(
        x.size(-1),
        args.hidden_channels,
        dataset.num_classes,
        args.num_layers,
        args.dropout,
    ).to(device)

    evaluator = Evaluator(name="ogbn-arxiv")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_acc, valid_acc, test_acc = result
                print(f"Run: {run + 1:02d}, "
                      f"Epoch: {epoch:02d}, "
                      f"Loss: {loss:.4f}, "
                      f"Train: {100 * train_acc:.2f}%, "
                      f"Valid: {100 * valid_acc:.2f}%, "
                      f"Test: {100 * test_acc:.2f}%")

        logger.print_statistics(run)
    logger.print_statistics()
示例#9
0
def main():
    """Load ogbn-arxiv and print three split ratios.

    For each of the train/valid/test index tensors, the number of non-zero
    entries is divided by the length of the *train* index tensor and the
    three resulting values are printed on one line.
    """
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (GNN)')
    cli_options = [
        ('--device', {'type': int, 'default': 0}),
        ('--log_steps', {'type': int, 'default': 1}),
        ('--use_sage', {'action': 'store_true'}),
        ('--num_layers', {'type': int, 'default': 3}),
        ('--hidden_channels', {'type': int, 'default': 256}),
        ('--dropout', {'type': float, 'default': 0.5}),
        ('--lr', {'type': float, 'default': 0.01}),
        ('--epochs', {'type': int, 'default': 3000}),
        ('--runs', {'type': int, 'default': 1}),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-arxiv', root='/mnt/ogbdata',
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_idx = dataset.get_idx_split()
    # NOTE(review): all three ratios use len(train) as the denominator;
    # confirm that this is intended for the valid and test splits.
    ratios = []
    for split_name in ('train', 'valid', 'test'):
        nonzero_count = split_idx[split_name].nonzero().size(0)
        ratios.append(nonzero_count / len(split_idx['train']))
    print(*ratios)
示例#10
0
def eval_model(params):
    """Evaluate one DeeperGCN checkpoint on the CPU.

    Args:
        params: A ``(model_load_path, args)`` pair. When
            ``args.model_load_path`` is a directory, ``model_load_path`` is
            taken as a file name inside that directory.

    Returns:
        The mapping returned by ``test`` with an extra ``"model_load_path"``
        entry holding the resolved checkpoint path.
    """
    model_load_path, args = params
    if os.path.isdir(args.model_load_path):
        model_load_path = os.path.join(args.model_load_path, model_load_path)
    print("Starting evaluating model stored at", model_load_path)

    dataset = PygNodePropPredDataset(name=args.dataset, root=args.data_folder)
    graph = dataset[0]

    if args.self_loop:
        with_loops = add_self_loops(edge_index=graph.edge_index,
                                    num_nodes=graph.num_nodes)
        graph.edge_index = with_loops[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    # The model constructor reads its input/output sizes from ``args``.
    args.in_channels = graph.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args)
    checkpoint = torch.load(model_load_path,
                            map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])

    test_res = test(model, graph.x, graph.edge_index, graph.y, split_idx,
                    evaluator)
    test_res["model_load_path"] = model_load_path
    return test_res
示例#11
0
文件: mlp.py 项目: haczqyf/GraphZoom
def main():
    """Train and evaluate an MLP on ogbn-products node features.

    Features may be extended with embeddings read from
    ``./embed_results/embeddings.npy``. Each run resets the model, trains
    with Adam and evaluates after every epoch. The total parameter count of
    the model is printed at the end.
    """
    parser = argparse.ArgumentParser(description='OGBN-Products (MLP)')
    cli_options = [
        ('--device', {'type': int, 'default': 0}),
        ('--log_steps', {'type': int, 'default': 1}),
        ('--use_node_embedding', {'action': 'store_true'}),
        ('--num_layers', {'type': int, 'default': 3}),
        ('--hidden_channels', {'type': int, 'default': 256}),
        ('--dropout', {'type': float, 'default': 0.0}),
        ('--lr', {'type': float, 'default': 0.01}),
        ('--epochs', {'type': int, 'default': 300}),
        ('--runs', {'type': int, 'default': 10}),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = dataset[0]

    features = data.x
    if args.use_node_embedding:
        loaded = np.load('./embed_results/embeddings.npy')
        loaded = torch.from_numpy(loaded).float()
        features = torch.cat([features, loaded], dim=-1)
    x = features.to(device)

    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    model = MLP(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout)
    model = model.to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, x, y_true, train_idx, optimizer)
            result = test(model, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            if epoch % args.log_steps != 0:
                continue
            train_acc, valid_acc, test_acc = result
            summary = (f'Run: {run + 1:02d}, '
                       f'Epoch: {epoch:02d}, '
                       f'Loss: {loss:.4f}, '
                       f'Train: {100 * train_acc:.2f}%, '
                       f'Valid: {100 * valid_acc:.2f}%, '
                       f'Test: {100 * test_acc:.2f}%')
            print(summary)

        logger.print_statistics(run)
    logger.print_statistics()

    total_params = sum(weights.numel() for weights in model.parameters())
    print(f'mlp total params are {total_params}')
示例#12
0
def main_fixed_mask(args):
    """Train DeeperGCN with frozen pruning masks and report the best epoch.

    Masks are added through ``pruning.add_mask`` and every parameter whose
    name contains ``'mask'`` is excluded from gradient updates. After each
    epoch the model is evaluated; the train/test accuracy at the epoch with
    the highest validation accuracy is tracked and printed at the end.

    Args:
        args: Namespace providing ``device``, ``dataset``, ``self_loop``,
            ``num_layers``, ``lr`` and ``epochs``. ``in_channels`` and
            ``num_tasks`` are written onto it before the model is built.
    """
    device = torch.device("cuda:" + str(args.device))
    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)
    train_idx = split_idx['train'].to(device)

    edge_index = to_undirected(data.edge_index.to(device), data.num_nodes)
    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    # The model constructor reads its input/output sizes from ``args``.
    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args.num_layers)

    # Freeze every mask parameter so that only the weights are trained.
    for param_name, tensor in model.named_parameters():
        if 'mask' not in param_name:
            continue
        tensor.requires_grad = False

    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
    results = {'highest_valid': 0, 'final_train': 0, 'final_test': 0, 'highest_train': 0, 'epoch': 0}

    for epoch in range(1, args.epochs + 1):
        epoch_loss = train_fixed(model, x, edge_index, y_true, train_idx, optimizer, args)
        train_accuracy, valid_accuracy, test_accuracy = test(
            model, x, edge_index, y_true, split_idx, evaluator)

        # Model selection on validation accuracy.
        if valid_accuracy > results['highest_valid']:
            results.update(highest_valid=valid_accuracy,
                           final_train=train_accuracy,
                           final_test=test_accuracy,
                           epoch=epoch)

        timestamp = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        progress = 'Baseline (FIX Mask) Epoch:[{}/{}]\t LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}]'.format(
            epoch,
            args.epochs,
            epoch_loss,
            train_accuracy * 100,
            valid_accuracy * 100,
            test_accuracy * 100,
            results['final_test'] * 100,
            results['epoch'])
        print(timestamp + ' | ' + progress)

    separator = "=" * 120
    print(separator)
    print("syd final: Baseline, Train:[{:.2f}]  Best Val:[{:.2f}] at epoch:[{}] | Final Test Acc:[{:.2f}]".format(
        results['final_train'] * 100,
        results['highest_valid'] * 100,
        results['epoch'],
        results['final_test'] * 100))
    print(separator)
示例#13
0
def main():
    """Train and evaluate an MLP on SIGN-preprocessed ogbn-products features.

    The ``SIGN`` transform is applied once to the graph. The resulting
    feature matrices (``x``, ``x1`` ... ``x<num_layers>``) and the labels
    are sliced per split and moved to the device, so training and
    evaluation operate on pre-split tensors.
    """
    parser = argparse.ArgumentParser(description='OGBN-Products (SIGN)')
    cli_options = [
        ('--device', {'type': int, 'default': 0}),
        ('--log_steps', {'type': int, 'default': 1}),
        ('--num_layers', {'type': int, 'default': 3}),
        ('--hidden_channels', {'type': int, 'default': 256}),
        ('--dropout', {'type': float, 'default': 0.5}),
        ('--lr', {'type': float, 'default': 0.01}),
        ('--epochs', {'type': int, 'default': 200}),
        ('--runs', {'type': int, 'default': 10}),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-products')
    split_idx = dataset.get_idx_split()
    data = SIGN(args.num_layers)(dataset[0])  # This might take a while.

    xs = [data.x]
    for hop in range(1, args.num_layers + 1):
        xs.append(data[f'x{hop}'])

    xs_train = [feat[split_idx['train']].to(device) for feat in xs]
    xs_valid = [feat[split_idx['valid']].to(device) for feat in xs]
    xs_test = [feat[split_idx['test']].to(device) for feat in xs]

    y_train_true = data.y[split_idx['train']].to(device)
    y_valid_true = data.y[split_idx['valid']].to(device)
    y_test_true = data.y[split_idx['test']].to(device)

    model = MLP(data.x.size(-1), args.hidden_channels, dataset.num_classes,
                args.num_layers, args.dropout)
    model = model.to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, xs_train, y_train_true, optimizer)

            train_acc = test(model, xs_train, y_train_true, evaluator)
            valid_acc = test(model, xs_valid, y_valid_true, evaluator)
            test_acc = test(model, xs_test, y_test_true, evaluator)
            logger.add_result(run, (train_acc, valid_acc, test_acc))

            if epoch % args.log_steps != 0:
                continue
            summary = (f'Run: {run + 1:02d}, '
                       f'Epoch: {epoch:02d}, '
                       f'Loss: {loss:.4f}, '
                       f'Train: {100 * train_acc:.2f}%, '
                       f'Valid: {100 * valid_acc:.2f}%, '
                       f'Test: {100 * test_acc:.2f}%')
            print(summary)

        logger.print_statistics(run)
    logger.print_statistics()
示例#14
0
def load_ogb_2(dataset):
    """Load an OGB node dataset and convert it to a tuple-of-arrays layout.

    Args:
        dataset: Name of the OGB dataset, passed to
            ``PygNodePropPredDataset``.

    Returns:
        A 6-tuple ``(features, labels, adj, train_mask, val_mask,
        test_mask)``. ``features`` and ``adj`` are ``(indices, values,
        shape)`` triples built from sparse tensors, ``labels`` is the
        one-hot encoded label matrix and the three masks are produced by
        ``indexes2booleanvec`` from the official split indices.
    """
    ## Load the dataset
    dataset = PygNodePropPredDataset(name=dataset,
                                     transform=T.ToSparseTensor())
    ogb_data = dataset[0]
    # TODO: Not sure how to format adj_t...
    ogb_data.adj_t = ogb_data.adj_t.to_symmetric()
    # The graph deliberately stays on the CPU: everything below is converted
    # with ``.numpy()``, which is not available for CUDA tensors.

    split_idx = dataset.get_idx_split()
    # Convert all three index vectors to NumPy. The test indices must be
    # bound to ``test_idx`` so that the test mask is built from an array
    # like the other two.
    train_idx = split_idx["train"].numpy()
    valid_idx = split_idx["valid"].numpy()
    test_idx = split_idx["test"].numpy()

    # Convert OGB's data split pytorch index vectors to Wang's data split numpy boolean masks
    train_mask_2 = indexes2booleanvec(ogb_data.num_nodes, train_idx)
    val_mask_2 = indexes2booleanvec(ogb_data.num_nodes, valid_idx)
    test_mask_2 = indexes2booleanvec(ogb_data.num_nodes, test_idx)

    # Add 1's down the diagonal of adj_t
    adj_t = ogb_data.adj_t.to_torch_sparse_coo_tensor()
    adj_t = adj_t + sparse_identity(adj_t.shape[0])
    # Convert OGB's adjacency SparseTensor to Wang's adjacency index matrix (Nx2)
    adj_2_0 = adj_t.coalesce().indices().numpy()
    adj_2_0 = adj_2_0.T.astype('int32')
    adj_2_1 = adj_t.coalesce().values().numpy().astype('float64')
    adj_2_2 = tuple(adj_t.size())
    #TODO: Fix the adjacency matrix, bc it probably is symmetric with identity
    adj_2 = (adj_2_0, adj_2_1, adj_2_2)

    from sklearn.preprocessing import OneHotEncoder
    labels_2 = ogb_data.y.numpy()
    # NOTE(review): newer scikit-learn releases rename ``sparse`` to
    # ``sparse_output`` - confirm against the pinned version.
    labels_2 = OneHotEncoder(sparse=False).fit_transform(labels_2)

    #TODO: I don't know if this feature vector will work
    # OGB used a skip-gram encoding,
    # whereas Wang's Citeseer just used normalized rows with 1-0 for different words
    x = ogb_data.x + 1.5
    # Scale every row to unit Euclidean norm before sparsifying.
    norm_x = np.apply_along_axis(np.linalg.norm, 1, x)
    x = x / norm_x[:, None]
    x = x.to_sparse()
    features_2_0 = x.indices().numpy().T.astype('int32')
    features_2_1 = x.values().numpy()
    features_2_1 = features_2_1.astype('float64')
    features_2_2 = tuple(x.size())
    features_2 = features_2_0, features_2_1, features_2_2

    data2 = features_2, labels_2, adj_2, train_mask_2, val_mask_2, test_mask_2

    return data2
示例#15
0
def main():
    """Train GraphSAGE on ogbn-products with Cluster-GCN mini-batching.

    The split indices are turned into boolean node masks on the graph, the
    graph is partitioned with ``ClusterData`` and batches of partitions are
    served by ``ClusterLoader``. The model is evaluated once per run, after
    the last training epoch.
    """
    parser = argparse.ArgumentParser(description='OGBN-Products (Cluster-GCN)')
    cli_options = [
        ('--device', {'type': int, 'default': 0}),
        ('--log_steps', {'type': int, 'default': 1}),
        ('--num_partitions', {'type': int, 'default': 15000}),
        ('--num_workers', {'type': int, 'default': 6}),
        ('--num_layers', {'type': int, 'default': 3}),
        ('--hidden_channels', {'type': int, 'default': 256}),
        ('--dropout', {'type': float, 'default': 0.0}),
        ('--batch_size', {'type': int, 'default': 256}),
        ('--lr', {'type': float, 'default': 0.01}),
        ('--epochs', {'type': int, 'default': 20}),
        ('--runs', {'type': int, 'default': 10}),
    ]
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    print(args)

    device = torch.device(
        f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu')

    dataset = PygNodePropPredDataset(name='ogbn-products')
    splitted_idx = dataset.get_idx_split()
    data = dataset[0]

    # Convert split indices to boolean masks and add them to `data`.
    for split_name, node_ids in splitted_idx.items():
        node_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
        node_mask[node_ids] = True
        data[f'{split_name}_mask'] = node_mask

    cluster_data = ClusterData(data,
                               num_parts=args.num_partitions,
                               recursive=False,
                               save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data,
                           batch_size=args.batch_size,
                           shuffle=True,
                           num_workers=args.num_workers)

    model = SAGE(data.x.size(-1), args.hidden_channels, 47, args.num_layers,
                 args.dropout)
    model = model.to(device)

    evaluator = Evaluator(name='ogbn-products')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, loader, optimizer, device)
            if epoch % args.log_steps != 0:
                continue
            summary = (f'Run: {run + 1:02d}, '
                       f'Epoch: {epoch:02d}, '
                       f'Loss: {loss:.4f}')
            print(summary)

        # Evaluation happens once per run, on the full graph.
        result = test(model, data, evaluator)
        logger.add_result(run, result)
        logger.print_statistics(run)
    logger.print_statistics()
示例#16
0
文件: configs.py 项目: jingmouren/egc
def arxiv_data(root):
    """Load ogbn-arxiv from ``root`` with an undirected edge list.

    The loading logic is shared by all architectures: the raw
    ``edge_index`` is kept (no sparse-tensor transform) and made undirected.

    Returns:
        A ``(data, split_idx)`` pair: the single graph of the dataset and
        the official split dictionary.
    """
    dataset = PygNodePropPredDataset(name="ogbn-arxiv", root=root)
    graph = dataset[0]
    undirected_edges = to_undirected(graph.edge_index)
    graph.edge_index = undirected_edges
    return graph, dataset.get_idx_split()
示例#17
0
def get_data(args):
    """Load an OGB node dataset and move the graph and splits to the device.

    Args:
        args: Mapping with a ``'dataset_name'`` entry.

    Returns:
        A tuple ``(data, dataset, split_idx, evaluator)``. ``data`` has a
        symmetrised ``adj_t``; ``data`` and the train/valid/test index
        tensors live on CUDA when it is available, otherwise on the CPU.
    """
    dataset_name = args['dataset_name']
    dataset = PygNodePropPredDataset(name=dataset_name,
                                     transform=T.ToSparseTensor())
    evaluator = Evaluator(name=dataset_name)

    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    split_idx = dataset.get_idx_split()

    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    data = data.to(device)
    for split_name in ('train', 'valid', 'test'):
        split_idx[split_name] = split_idx[split_name].to(device)

    return data, dataset, split_idx, evaluator
示例#18
0
文件: dataset.py 项目: CreaterLL/GCA
def get_dataset(path, name):
    """Build the dataset object for ``name`` with feature normalisation.

    Args:
        path: Root directory used for the Coauthor, WikiCS and Amazon
            datasets.
        name: Dataset name; must be one of the names accepted by the
            assertion below.

    Returns:
        The constructed dataset. OGB node datasets and the citation
        datasets are rooted under ``~/datasets`` rather than ``path``.
    """
    assert name in ['Cora', 'CiteSeer', 'PubMed', 'DBLP', 'Karate', 'WikiCS', 'Coauthor-CS', 'Coauthor-Phy',
                    'Amazon-Computers', 'Amazon-Photo', 'ogbn-arxiv', 'ogbg-code']
    if name == 'DBLP':
        name = 'dblp'
    root_path = osp.expanduser('~/datasets')

    # Datasets stored under ``path``; the constructors are wrapped in
    # lambdas so that only the requested dataset is actually built.
    path_rooted = {
        'Coauthor-CS': lambda: Coauthor(root=path, name='cs', transform=T.NormalizeFeatures()),
        'Coauthor-Phy': lambda: Coauthor(root=path, name='physics', transform=T.NormalizeFeatures()),
        'WikiCS': lambda: WikiCS(root=path, transform=T.NormalizeFeatures()),
        'Amazon-Computers': lambda: Amazon(root=path, name='computers', transform=T.NormalizeFeatures()),
        'Amazon-Photo': lambda: Amazon(root=path, name='photo', transform=T.NormalizeFeatures()),
    }
    builder = path_rooted.get(name)
    if builder is not None:
        return builder()

    if name.startswith('ogbn'):
        return PygNodePropPredDataset(root=osp.join(root_path, 'OGB'), name=name, transform=T.NormalizeFeatures())

    # Everything else is treated as a citation dataset.
    if name == 'dblp':
        citation_cls = CitationFull
    else:
        citation_cls = Planetoid
    return citation_cls(osp.join(root_path, 'Citation'), name, transform=T.NormalizeFeatures())
示例#19
0
文件: ogb.py 项目: zdcuob/AutoGL
    def __init__(self, path):
        """Set up the ogbn-products dataset rooted at ``path``.

        Sets the class-level ``metric`` and ``loss`` names and attaches
        boolean train/val/test masks, derived from the official split, to
        every graph before re-collating.
        """
        name = "ogbn-products"
        # Instantiated and discarded before the parent constructor runs
        # (presumably to make sure the files exist under ``path``).
        PygNodePropPredDataset(name=name, root=path)
        super(OGBNproductsDataset, self).__init__(name, path)

        OGBNproductsDataset.metric = "Accuracy"
        OGBNproductsDataset.loss = "nll_loss"

        split_idx = self.get_idx_split()
        # (attribute on the graph, key in the split dictionary)
        mask_specs = (
            ("train_mask", "train"),
            ("val_mask", "valid"),
            ("test_mask", "test"),
        )
        datalist = []
        for graph in self:
            for attr_name, split_key in mask_specs:
                mask = index_to_mask(split_idx[split_key], graph.y.shape[0])
                setattr(graph, attr_name, mask)
            datalist.append(graph)
        self.data, self.slices = self.collate(datalist)
示例#20
0
 def __init__(self, args=None):
     """Initialise the ogbn-mag dataset under the package's ``data`` folder.

     ``PygNodePropPredDataset`` is instantiated first when the target
     directory does not exist yet; the parent constructor then receives the
     directory and the dataset name.
     """
     dataset = "ogbn-mag"
     module_dir = osp.dirname(osp.realpath(__file__))
     path = osp.join(module_dir, "../..", "data", dataset)
     directory_missing = not osp.exists(path)
     if directory_missing:
         PygNodePropPredDataset(dataset, path)
     super(OGBMAGDataset, self).__init__(path, dataset)
示例#21
0
文件: ogb.py 项目: Frozenmad/AutoGL
    def __init__(self, path):
        """Set up ogbn-mag restricted to paper-cites-paper relations.

        The first graph is reduced to a homogeneous ``Data`` object holding
        the paper features, the ("paper", "cites", "paper") edges and the
        paper labels. The class-level ``metric`` and ``loss`` names are set
        and boolean train/val/test masks are attached to every graph before
        re-collating.
        """
        name = "ogbn-mag"
        # Instantiated and discarded before the parent constructor runs
        # (presumably to make sure the files exist under ``path``).
        PygNodePropPredDataset(name=name, root=path)
        super(OGBNmagDataset, self).__init__(name, path)

        # Preprocess: we are only interested in paper <-> paper relations.
        hetero = self[0]
        paper_features = hetero.x_dict["paper"]
        citation_edges = hetero.edge_index_dict[("paper", "cites", "paper")]
        paper_labels = hetero.y_dict["paper"]
        self.data = Data(x=paper_features,
                         edge_index=citation_edges,
                         y=paper_labels)

        OGBNmagDataset.metric = "Accuracy"
        OGBNmagDataset.loss = "nll_loss"
        split_idx = self.get_idx_split()

        # (attribute on the graph, key in the split dictionary)
        mask_specs = (
            ("train_mask", "train"),
            ("val_mask", "valid"),
            ("test_mask", "test"),
        )
        datalist = []
        for graph in self:
            for attr_name, split_key in mask_specs:
                mask = index_to_mask(split_idx[split_key], graph.y.shape[0])
                setattr(graph, attr_name, mask)
            datalist.append(graph)
        self.data, self.slices = self.collate(datalist)
示例#22
0
 def __init__(self):
     """Initialise the ogbn-papers100M dataset under the package's ``data`` folder.

     ``PygNodePropPredDataset`` is instantiated first when the target
     directory does not exist yet; the parent constructor then receives the
     directory and the dataset name.
     """
     dataset = "ogbn-papers100M"
     module_dir = osp.dirname(osp.realpath(__file__))
     path = osp.join(module_dir, "../..", "data", dataset)
     directory_missing = not osp.exists(path)
     if directory_missing:
         PygNodePropPredDataset(dataset, path)
     super(OGBArxivDataset, self).__init__(path, dataset)
示例#23
0
def load_ogb(name, dataset_dir):
    """Load an OGB dataset and register its splits as dataset attributes.

    The first four characters of ``name`` select the loader:

    * ``ogbn`` -- node task.  Every split is turned into a boolean mask with
      ``index2mask`` and stored as ``train_mask`` / ``val_mask`` /
      ``test_mask``; ``edge_index`` is replaced by its ``to_undirected``
      version.
    * ``ogbg`` -- graph task.  The split index tensors are stored as
      ``train_graph_index`` / ``val_graph_index`` / ``test_graph_index``.
    * ``ogbl`` -- link task.  Validation and test edges are stored as the
      concatenation of positive and negative edges plus a label vector from
      ``get_link_label``.  Training edges are either stored as positives
      only, with ``neg_sampling_transform`` installed as the dataset
      transform (``cfg.dataset.resample_negative`` truthy), or combined with
      negatives sampled once up front.

    Args:
        name: OGB dataset name, e.g. ``"ogbn-arxiv"``.
        dataset_dir: Root directory passed to the OGB loader.

    Returns:
        The loaded dataset object with the extra attributes attached.

    Raises:
        ValueError: If ``name`` does not start with ``ogbn``, ``ogbg`` or
            ``ogbl``.  The message includes the offending name.
    """
    prefix = name[:4]

    if prefix == 'ogbn':
        dataset = PygNodePropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = ['train_mask', 'val_mask', 'test_mask']
        # Pairs attribute names with splits by position, i.e. this relies on
        # the split dict iterating in train / valid / test order.
        for attr_name, split_index in zip(split_names, splits.values()):
            mask = index2mask(split_index, size=dataset.data.y.shape[0])
            set_dataset_attr(dataset, attr_name, mask, len(mask))
        edge_index = to_undirected(dataset.data.edge_index)
        set_dataset_attr(dataset, 'edge_index', edge_index,
                         edge_index.shape[1])

    elif prefix == 'ogbg':
        dataset = PygGraphPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_idx_split()
        split_names = [
            'train_graph_index', 'val_graph_index', 'test_graph_index'
        ]
        # Same positional pairing as in the node branch.
        for attr_name, graph_index in zip(split_names, splits.values()):
            set_dataset_attr(dataset, attr_name, graph_index,
                             len(graph_index))

    elif prefix == 'ogbl':
        dataset = PygLinkPropPredDataset(name=name, root=dataset_dir)
        splits = dataset.get_edge_split()

        # Transposed so that the edge count ends up in ``shape[1]``.
        pos_edges = splits['train']['edge'].T
        if cfg.dataset.resample_negative:
            set_dataset_attr(dataset, 'train_pos_edge_index', pos_edges,
                             pos_edges.shape[1])
            # todo: applying transform for negative sampling is very slow
            dataset.transform = neg_sampling_transform
        else:
            neg_edges = negative_sampling(
                edge_index=pos_edges,
                num_nodes=dataset.data.num_nodes[0],
                num_neg_samples=pos_edges.shape[1])
            all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
            label = get_link_label(pos_edges, neg_edges)
            set_dataset_attr(dataset, 'train_edge_index', all_edges,
                             all_edges.shape[1])
            set_dataset_attr(dataset, 'train_edge_label', label, len(label))

        # Validation and test use the negatives that ship with the split.
        for split_key, attr_prefix in (('valid', 'val'), ('test', 'test')):
            pos_edges = splits[split_key]['edge'].T
            neg_edges = splits[split_key]['edge_neg'].T
            all_edges = torch.cat([pos_edges, neg_edges], dim=-1)
            label = get_link_label(pos_edges, neg_edges)
            set_dataset_attr(dataset, attr_prefix + '_edge_index', all_edges,
                             all_edges.shape[1])
            set_dataset_attr(dataset, attr_prefix + '_edge_label', label,
                             len(label))

    else:
        raise ValueError('OGB dataset: {} non-exist'.format(name))

    return dataset
示例#24
0
def load_data(args, datapath):
    """Assemble the data dictionary for the task selected in ``args``.

    Three paths exist:

    * ``args.dataset == 'arxiv'`` with ``args.task == 'lp'``: ``ogbn-arxiv``
      is loaded through OGB; for each official node split the induced edges
      (``subgraph``) become the positive edges and ``negative_sampling`` on
      them provides the negatives.  The adjacency is built from the full
      edge index.
    * ``args.task == 'nc'``: delegated to ``load_data_nc``.
    * anything else: ``load_data_lp`` is used and, when the task is ``'lp'``,
      the adjacency is split with ``mask_edges``.

    In every case ``process`` then adds ``adj_train_norm`` and replaces
    ``features``; for the ``airport`` dataset the features are additionally
    passed through ``augment``.

    Args:
        args: Namespace providing ``dataset``, ``task`` and, depending on the
            path, ``use_feats``, ``split_seed``, ``val_prop``, ``test_prop``,
            ``normalize_adj`` and ``normalize_feats``.
        datapath: Data location forwarded to ``load_data_nc`` /
            ``load_data_lp``.

    Returns:
        dict: The populated data dictionary.
    """
    if args.dataset == 'arxiv' and args.task == 'lp':
        # NOTE(review): the OGB root is a hard-coded absolute path and
        # ``datapath`` is not used on this path -- confirm that is intended.
        ogb_dataset = PygNodePropPredDataset(
            name=f'ogbn-{args.dataset}',
            root='/pasteur/u/jeffgu/hgcn/data')
        splits = ogb_dataset.get_idx_split()
        node_splits = {
            'train': splits["train"],
            'valid': splits["valid"],
            'test': splits["test"],
        }
        graph = ogb_dataset[0]

        # Positive edges: edges induced by each node split.
        positives = {
            part: subgraph(nodes, graph.edge_index)[0]
            for part, nodes in node_splits.items()
        }
        # Negative edges: sampled per split, in train / valid / test order.
        negatives = {
            part: negative_sampling(edges)
            for part, edges in positives.items()
        }

        data = {
            'adj_train': to_scipy_sparse_matrix(graph.edge_index).tocsr(),
            'features': graph.x,
            'train_edges': positives['train'],
            'train_edges_false': negatives['train'],
            'val_edges': positives['valid'],
            'val_edges_false': negatives['valid'],
            'test_edges': positives['test'],
            'test_edges_false': negatives['test'],
        }
    elif args.task == 'nc':
        data = load_data_nc(args.dataset, args.use_feats, datapath,
                            args.split_seed)
    else:
        data = load_data_lp(args.dataset, args.use_feats, datapath)
        # Looked up unconditionally, exactly like the edge split below needs.
        full_adj = data['adj_train']
        if args.task == 'lp':
            (adj_train,
             train_pos, train_neg,
             val_pos, val_neg,
             test_pos, test_neg) = mask_edges(full_adj, args.val_prop,
                                              args.test_prop,
                                              args.split_seed)
            data.update({
                'adj_train': adj_train,
                'train_edges': train_pos,
                'train_edges_false': train_neg,
                'val_edges': val_pos,
                'val_edges_false': val_neg,
                'test_edges': test_pos,
                'test_edges_false': test_neg,
            })

    normalized_adj, features = process(data['adj_train'],
                                       data['features'],
                                       args.normalize_adj,
                                       args.normalize_feats)
    data['adj_train_norm'] = normalized_adj
    data['features'] = features

    if args.dataset == 'airport':
        data['features'] = augment(data['adj_train'], data['features'])
    return data
示例#25
0
文件: misc.py 项目: duzx16/lcgnn
def create_dataset(name):
    """Build the dataset object that corresponds to a prefixed dataset name.

    * Names starting with ``"saint"`` yield ``SAINTDataset`` built from the
      text after the first six characters.
    * Names starting with ``"ogbn"`` yield ``PygNodePropPredDataset(name)``
      when the text after the first five characters is one of ``arxiv``,
      ``products``, ``papers100M`` or ``proteins``.

    Args:
        name: Dataset identifier such as ``"ogbn-arxiv"``.

    Returns:
        The dataset object, or ``None`` when the name matches neither rule.
    """
    if name.startswith("saint"):
        return SAINTDataset(name[6:])

    if not name.startswith("ogbn"):
        return None

    supported = ("arxiv", "products", "papers100M", "proteins")
    if name[5:] in supported:
        return PygNodePropPredDataset(name)
    return None
示例#26
0
def main():
    """Evaluate a saved DeeperGCN checkpoint on an OGB node-property dataset.

    Reads the run configuration from ``ArgsInit().args``, loads the dataset
    named by ``args.dataset``, makes the graph undirected (optionally adding
    self loops), restores the model weights from ``args.model_load_path`` and
    prints train / validation / test accuracy followed by the model's
    parameter summary.
    """
    args = ArgsInit().args

    # CUDA is used only when it is both requested and available.
    if args.use_gpu and torch.cuda.is_available():
        device = torch.device("cuda:" + str(args.device))
    else:
        device = torch.device('cpu')

    dataset = PygNodePropPredDataset(name=args.dataset)
    data = dataset[0]
    split_idx = dataset.get_idx_split()

    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)
    y_true = data.y.to(device)

    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)

    if args.self_loop:
        edge_index = add_self_loops(edge_index, num_nodes=data.num_nodes)[0]

    # The model reads its input/output sizes from ``args``.
    args.in_channels = data.x.size(-1)
    args.num_tasks = dataset.num_classes

    print(args)

    model = DeeperGCN(args)

    # Load onto the CPU first so a checkpoint written on a GPU machine can
    # still be restored when no CUDA device is present; the model is moved
    # to the target device right afterwards.
    checkpoint = torch.load(args.model_load_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)

    result = test(model, x, edge_index, y_true, split_idx, evaluator)
    train_accuracy, valid_accuracy, test_accuracy = result

    print({
        'Train': train_accuracy,
        'Validation': valid_accuracy,
        'Test': test_accuracy
    })

    model.print_params(final=True)
示例#27
0
def load_ogb_graph(dataset_name):
    """Return an OGB node dataset as a DGL graph, caching it on disk.

    On the first call for a dataset (no ``torch_geometric_data/dgl_<name>``
    file) the data is loaded through ``PygNodePropPredDataset``, converted to
    a DGL graph carrying ``features`` and ``labels`` node data, and written
    to disk together with the three split index tensors.  Later calls read
    the graph and the splits back from those files.

    In both cases the node data is popped off the graph before returning and
    a column of ones is appended to the feature matrix.

    Args:
        dataset_name: OGB node dataset name without the ``ogbn-`` prefix.

    Returns:
        tuple: ``(graph, features, labels, num_classes, train_idx,
        valid_idx, test_idx)``.
    """
    graph_file = 'torch_geometric_data/dgl_' + dataset_name
    split_dir = 'torch_geometric_data/ogbn_' + dataset_name
    train_file = split_dir + '/train_' + dataset_name + '.pt'
    valid_file = split_dir + '/valid_' + dataset_name + '.pt'
    test_file = split_dir + '/test_' + dataset_name + '.pt'

    if not os.path.isfile(graph_file):
        dataset = PygNodePropPredDataset(name="ogbn-" + dataset_name,
                                         root='torch_geometric_data/')
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]
        valid_idx = split_idx["valid"]
        test_idx = split_idx["test"]

        # Index the dataset once instead of on every attribute access.
        pyg_graph = dataset[0]
        edge = pyg_graph.edge_index
        num_classes = len(np.unique(pyg_graph.y))
        print("Nodes: %d, edges: %d, features: %d, classes: %d. \n" %
              (pyg_graph.y.shape[0], len(edge[0]) / 2, len(pyg_graph.x[0]),
               num_classes))

        graph = dgl.DGLGraph((edge[0], edge[1]))
        graph.ndata['features'] = pyg_graph.x
        graph.ndata['labels'] = pyg_graph.y

        # Persist the graph (with its node data) and the splits for reuse.
        dgl.data.utils.save_graphs(graph_file, graph)
        torch.save(train_idx, train_file)
        torch.save(valid_idx, valid_file)
        torch.save(test_idx, test_file)
    else:
        graph = dgl.data.utils.load_graphs(graph_file)[0][0]
        train_idx = torch.load(train_file)
        valid_idx = torch.load(valid_file)
        test_idx = torch.load(test_file)
        num_classes = len(torch.unique(graph.ndata['labels']))

    # Shared post-processing: detach node data from the returned graph and
    # append a constant column of ones to the features.
    labels = graph.ndata.pop('labels')
    features = graph.ndata.pop('features')
    features = torch.hstack([features, torch.ones([features.shape[0], 1])])

    return graph, features, labels, num_classes, train_idx, valid_idx, test_idx
示例#28
0
文件: ogb.py 项目: zhuyawen/AutoGL
 def __init__(self, path):
     """Initialise the ``ogbn-papers100M`` dataset rooted at ``path``.

     ``PygNodePropPredDataset`` is instantiated first (result unused), then
     the parent initialiser is run with ``(dataset_name, path)``.  Both
     receive their own ``T.ToSparseTensor()`` transform.  Finally the class
     attributes ``metric`` and ``loss`` are set.

     Args:
         path: Root directory handed to the OGB loader and the parent class.
     """
     name = "ogbn-papers100M"
     # Instantiated only for its side effects; the object itself is dropped.
     PygNodePropPredDataset(name=name, root=path, transform=T.ToSparseTensor())
     super(OGBNpapers100MDataset, self).__init__(
         name, path, transform=T.ToSparseTensor())
     # Stored on the class, not on the instance.
     OGBNpapers100MDataset.metric = "Accuracy"
     OGBNpapers100MDataset.loss = "nll_loss"
示例#29
0
文件: ogb.py 项目: zhuyawen/AutoGL
 def __init__(self, path):
     """Initialise the ``ogbn-proteins`` dataset rooted at ``path``.

     ``PygNodePropPredDataset`` is instantiated first (result unused), then
     the parent initialiser is run with ``(dataset_name, path)``.  Both
     receive their own ``T.ToSparseTensor()`` transform.  Finally the class
     attributes ``metric`` and ``loss`` are set.

     Args:
         path: Root directory handed to the OGB loader and the parent class.
     """
     name = "ogbn-proteins"
     # Instantiated only for its side effects; the object itself is dropped.
     PygNodePropPredDataset(name=name, root=path, transform=T.ToSparseTensor())
     super(OGBNproteinsDataset, self).__init__(
         name, path, transform=T.ToSparseTensor())
     # Stored on the class, not on the instance.
     OGBNproteinsDataset.metric = "ROC-AUC"
     OGBNproteinsDataset.loss = "BCEWithLogitsLoss"
示例#30
0
文件: ogb.py 项目: yunyoonaer/cogdl
    def __init__(self):
        dataset = "ogbn-arxiv"
        path = osp.join(osp.dirname(osp.realpath(__file__)), "../..", "data")
        if not osp.exists(path):
            PygNodePropPredDataset(dataset, path)
        super(OGBArxivDataset, self).__init__(path, dataset)

        #to_symmetric
        rev_edge_index = self.data.edge_index[[1, 0]]
        edge_index = torch.cat([self.data.edge_index, rev_edge_index], dim = 1).to(dtype=torch.int64)
        self.data.edge_index, self.data.edge_attr = coalesce(edge_index, None, self.data.num_nodes, self.data.num_nodes)