Пример #1
0
def load_test_data(
) -> Tuple[pyg.torch_geometric.data.Data, pyg.torch_geometric.data.Data, Dict[
    str, torch.Tensor], Dict[str, torch.Tensor]]:
    '''
    Returns Tuple
        valid_graph Graph containing all training edges
        test_graph Graph containing all training edges, plus validation edges
        valid_edges Dict of positive and negative edges from validation edge split (not in train_graph)
        test_edges Dict of positive and negative edges from test edge split (not in valid_graph)
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    valid_edges = edge_split['valid']
    test_edges = edge_split['test']
    valid_graph = dataset[0]
    test_graph = valid_graph.clone()

    # Add validation edges to valid_graph for test inference
    valid_edge_index = torch.cat([
        test_graph.edge_index, valid_edges['edge'].T,
        valid_edges['edge'][:, [1, 0]].T
    ],
                                 dim=1)
    test_graph.edge_index = valid_edge_index

    valid_graph = transform(valid_graph)
    test_graph = transform(test_graph)
    return valid_graph, test_graph, valid_edges, test_edges
Пример #2
0
def load_training_data() -> Tuple[pyg.data.Data, pyg.data.Data, Dict[
    str, torch.Tensor], Dict[str, torch.Tensor], Dict[str, torch.Tensor]]:
    '''
    Returns Tuple
        train_graph Graph containing a subset of the training edges
        valid_graph Graph containing all training edges
        train_edges Dict of positive edges across entire train split
        eval_edges Dict of positive edges from the training edges set that aren't in eval_graph
        valid_edges Dict of positive and negative edges not in train_graph.
    '''
    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    transform = T.ToSparseTensor(False)
    edge_split = dataset.get_edge_split()
    train_edges = edge_split['train']
    valid_edges = edge_split['valid']
    train_graph = dataset[0]
    valid_graph = train_graph.clone()

    # Partition training edges
    torch.manual_seed(12345)
    perm = torch.randperm(train_edges['edge'].shape[0])
    eval_idxs, train_idxs = perm[:valid_edges['edge'].
                                 shape[0]], perm[valid_edges['edge'].shape[0]:]
    eval_edges = {'edge': train_edges['edge'][eval_idxs]}
    train_edges = {'edge': train_edges['edge'][train_idxs]}

    # Update graph object to have subset of edges and adj_t matrix
    train_edge_index = torch.cat(
        [train_edges['edge'].T, train_edges['edge'][:, [1, 0]].T], dim=1)
    train_graph.edge_index = train_edge_index
    train_graph = transform(train_graph)
    valid_graph = transform(valid_graph)

    return train_graph, valid_graph, edge_split[
        'train'], eval_edges, valid_edges
Пример #3
0
def main():

    args = ArgsInit().args

    if args.use_gpu:
        device = torch.device("cuda:" +
                              str(args.device)) if torch.cuda.is_available(
                              ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)

    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    print(args)

    model = DeeperGCN(args).to(device)
    predictor = LinkPredictor(args).to(device)

    model.load_state_dict(torch.load(args.model_load_path)['model_state_dict'])
    model.to(device)

    predictor.load_state_dict(
        torch.load(args.predictor_load_path)['model_state_dict'])
    predictor.to(device)

    hits = ['Hits@10', 'Hits@50', 'Hits@100']

    result = test(model, predictor, x, edge_index, split_edge, evaluator,
                  args.batch_size)

    for k in hits:
        train_result, valid_result, test_result = result[k]
        print('{}--Train: {}, Validation: {}, Test: {}'.format(
            k, train_result, valid_result, test_result))
Пример #4
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.0005)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_value(None)
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, split_edge, optimizer,
                         args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, x, adj, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)

    logger.print_statistics()
Пример #5
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model',
                        type=str,
                        default='MAD_GCN',
                        choices=[
                            'GCN_Linear', 'SAGE_Linear', 'MAD_GCN', 'MAD_SAGE',
                            'MAD_Model'
                        ])
    parser.add_argument('--train_batch_size', type=int, default=4096)
    parser.add_argument('--test_batch_size', type=int, default=1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=5)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = models.get_model(args.model)(data.num_nodes, adj_t).to(device)
    print(f"Parameters: {count_parameters(model)}")

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, adj_t, split_edge, optimizer,
                         args.train_batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, adj_t, split_edge, evaluator,
                               args.test_batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')
            print(f'Finished epoch {epoch}')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #6
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.0005)
    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2",
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    data.adj_t = data.adj_t.to_symmetric()
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

        # Pre-compute GCN normalization.
        adj_t = data.adj_t.set_diag()
        deg = adj_t.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float("inf")] = 0
        adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
        data.adj_t = adj_t

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {train_mrr:.4f}, "
                          f"Valid: {valid_mrr:.4f}, "
                          f"Test: {test_mrr:.4f}")

        print("GraphSAGE" if args.use_sage else "GCN")
        logger.print_statistics(run)
    print("GraphSAGE" if args.use_sage else "GCN")
    logger.print_statistics()
Пример #7
0
def main_get_mask(args, imp_num):

    device = torch.device("cuda:" + str(args.device))
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]

    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)

    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)

    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    predictor = LinkPredictor(args).to(device)
    optimizer = torch.optim.Adam(list(model.parameters()) +
                                 list(predictor.parameters()),
                                 lr=args.lr)

    results = {'epoch': 0}
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']

    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_epoch = 1
    for epoch in range(start_epoch, args.mask_epochs + 1):

        t0 = time.time()

        epoch_loss = train.train_fixed(model, predictor, x, edge_index,
                                       split_edge, optimizer, args.batch_size,
                                       args)
        result = train.test(model, predictor, x, edge_index, split_edge,
                            evaluator, args.batch_size, args)

        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]

        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        if valid_result > results['highest_valid'][k]:
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch

        epoch_time = (time.time() - t0) / 60
        print(
            time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
            'IMP:[{}] (GET Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] Time:[{:.2f}min]'
            .format(imp_num, epoch, args.mask_epochs, epoch_loss,
                    train_result * 100, valid_result * 100, test_result *
                    100, results['final_test'][k] *
                    100, results['epoch'], epoch_time))
    print('-' * 100)
    print(
        "syd : IMP:[{}] (FIX Mask) Final Result Train:[{:.2f}]  Valid:[{:.2f}]  Test:[{:.2f}]"
        .format(imp_num, results['final_train'][k] * 100,
                results['highest_valid'][k] * 100,
                results['final_test'][k] * 100))
    print('-' * 100)
Пример #8
0
Файл: gnn.py Проект: tjufan/LRGA
def main():
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=100)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)
    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    model = GCNWithAttention(data.num_features, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    print("model parameters {}".format(
        sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(
        sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(
        sum(p.numel() for p in model.parameters()) +
        sum(p.numel() for p in predictor.parameters())))
    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #9
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--eval_steps", type=int, default=10)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch % args.eval_steps == 0:
                result = test(predictor, x, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {train_mrr:.4f}, "
                          f"Valid: {valid_mrr:.4f}, "
                          f"Test: {test_mrr:.4f}")

        print("Node2vec" if args.use_node_embedding else "MLP")
        logger.print_statistics(run)
    print("Node2vec" if args.use_node_embedding else "MLP")
    logger.print_statistics()
Пример #10
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-DDI (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--eval_steps", type=int, default=5)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-ddi", transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["edge"].size(0))
    idx = idx[: split_edge["valid"]["edge"].size(0)]
    split_edge["eval_train"] = {"edge": split_edge["train"]["edge"][idx]}

    if args.use_sage:
        model = SAGE(
            args.hidden_channels,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            args.hidden_channels,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(
        args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout
    ).to(device)

    evaluator = Evaluator(name="ogbl-ddi")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@20": Logger(args.runs, args),
        "Hits@30": Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters())
            + list(emb.parameters())
            + list(predictor.parameters()),
            lr=args.lr,
        )

        for epoch in range(1, 1 + args.epochs):
            loss = train(
                model,
                predictor,
                emb.weight,
                adj_t,
                split_edge,
                optimizer,
                args.batch_size,
            )

            if epoch % args.eval_steps == 0:
                results = test(
                    model,
                    predictor,
                    emb.weight,
                    adj_t,
                    split_edge,
                    evaluator,
                    args.batch_size,
                )
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(
                            f"Run: {run + 1:02d}, "
                            f"Epoch: {epoch:02d}, "
                            f"Loss: {loss:.4f}, "
                            f"Train: {100 * train_hits:.2f}%, "
                            f"Valid: {100 * valid_hits:.2f}%, "
                            f"Test: {100 * test_hits:.2f}%"
                        )
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #11
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--use_valedges_as_input", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=400)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=1)
    parser.add_argument("--seed",type=int,default=1)
    args = parser.parse_args()
    print(args)
    
    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    data = dataset[0]
    edge_index = data.edge_index
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)

    split_edge = dataset.get_edge_split()

    
    # Use training + validation edges for inference on test set.
    if args.use_valedges_as_input:
        val_edge_index = split_edge["valid"]["edge"].t()
        full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
        data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t()
        data.full_adj_t = data.full_adj_t.to_symmetric()
    else:
        data.full_adj_t = data.adj_t

    data = data.to(device)

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

    predictor = LinkPredictor(
        args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout
    ).to(device)

    evaluator = Evaluator(name="ogbl-collab")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in tqdm(range(args.runs)):
        torch.manual_seed(args.seed + run)
        np.random.seed(args.seed+run)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()), lr=args.lr
        )

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(
                    model, predictor, data, split_edge, evaluator, args.batch_size
                )
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(
                            f"Run: {run + 1:02d}, "
                            f"Epoch: {epoch:02d}, "
                            f"Loss: {loss:.4f}, "
                            f"Train: {100 * train_hits:.2f}%, "
                            f"Valid: {100 * valid_hits:.2f}%, "
                            f"Test: {100 * test_hits:.2f}%"
                        )
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #12
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--use_valedges_as_input', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=400)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    edge_index = data.edge_index
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)

    split_edge = dataset.get_edge_split()

    # Use training + validation edges for inference on test set.
    if args.use_valedges_as_input:
        val_edge_index = split_edge['valid']['edge'].t()
        full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
        data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t()
        data.full_adj_t = data.full_adj_t.to_symmetric()
    else:
        data.full_adj_t = data.adj_t

    data = data.to(device)

    if args.use_sage:
        model = SAGE(data.num_features, args.hidden_channels,
                     args.hidden_channels, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #13
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (MF)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--std', type=float, default=0.2)
    parser.add_argument('--hidden_channels', type=int, default=128)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=300)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    emb = torch.nn.Embedding(data.num_nodes, 96).to(device)
    predictor = LinkPredictor(96, args.hidden_channels, 1, args.num_layers,
                              args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        torch.nn.init.normal_(emb.weight, std=args.std)
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(emb.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, emb.weight, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                result = test(predictor, emb.weight, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        print('MF')
        logger.print_statistics(run)

    print('MF')
    logger.print_statistics()
Пример #14
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-PPA (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=20)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-ppa",
                                     transform=T.ToSparseTensor())

    data = dataset[0]
    data.x = data.x.to(torch.float)
    if args.use_node_embedding:
        data.x = torch.cat([data.x, torch.load("embedding.pt")], dim=-1)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

        # Pre-compute GCN normalization.
        adj_t = data.adj_t.set_diag()
        deg = adj_t.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float("inf")] = 0
        adj_t = deg_inv_sqrt.view(-1, 1) * adj_t * deg_inv_sqrt.view(1, -1)
        data.adj_t = adj_t

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-ppa")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #15
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    data = dataset[0]
    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    x = torch.load('embedding.pt', map_location='cpu').to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, data.edge_index, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #16
0
    def process_PygLinkDataset_homo(self, dataset: PygLinkPropPredDataset):
        data = dataset[0]
        self._name = dataset.name

        self.head_node_type = "entity"
        self.metapaths = [(self.head_node_type, "default", self.head_node_type)
                          ]

        self.edge_index_dict = {self.metapaths[0]: data.edge_index}
        self.num_nodes_dict = self.get_num_nodes_dict(self.edge_index_dict)
        self.node_types = list(self.num_nodes_dict.keys())

        if hasattr(data, "x") and data.x is not None:
            self.x_dict = {self.head_node_type: data.x}
        elif hasattr(data, "x_dict") and data.x_dict is not None:
            self.x_dict = data.x_dict
        else:
            self.x_dict = {}
        self.node_attr_shape = {
            node_type: x.size(1)
            for node_type, x in self.x_dict.items()
        }

        split_idx = dataset.get_edge_split()
        train_triples, valid_triples, test_triples = split_idx[
            "train"], split_idx["valid"], split_idx["test"]

        self.edge_index = []
        self.edge_index.extend(
            [valid_triples["edge"], valid_triples["edge_neg"]])
        self.edge_index.extend(
            [test_triples["edge"], test_triples["edge_neg"]])
        self.edge_index.extend([train_triples["edge"]])
        self.edge_index = torch.cat(self.edge_index, dim=0)

        self.edge_reltype = []
        self.edge_reltype.extend([
            torch.ones(
                valid_triples["edge"].size(0)),  # Ones correspond to pos edges
            torch.zeros(valid_triples["edge_neg"].size(
                0)),  # Zeroes correspond to neg edges
            torch.ones(test_triples["edge"].size(0)),
            torch.zeros(test_triples["edge_neg"].size(0)),
            torch.ones(train_triples["edge"].size(0))
        ])
        self.edge_reltype = torch.cat(self.edge_reltype, dim=0).to(torch.int)

        # Build train/test/valid idx
        self.start_idx = {
            "valid": 0,
            "test": len(valid_triples["edge"]) + len(valid_triples["edge_neg"])
        }
        self.start_idx["train"] = self.start_idx["test"] + len(
            test_triples["edge"]) + len(test_triples["edge_neg"])

        self.validation_idx = torch.arange(self.start_idx["valid"],
                                           self.start_idx["test"])
        self.testing_idx = torch.arange(self.start_idx["test"],
                                        self.start_idx["train"])
        self.training_idx = torch.arange(
            self.start_idx["train"],
            self.start_idx["train"] + train_triples["edge"].size(0))

        assert self.validation_idx.max() < self.testing_idx.min()
        assert self.testing_idx.max() < self.training_idx.min()
Пример #17
0
def main_get_mask(args, imp_num, rewind_weight_mask=None, rewind_predict_weight=None, resume_train_ckpt=None):

    device = torch.device("cuda:" + str(args.device))
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)

    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)
    predictor = LinkPredictor(args).to(device)
    pruning.add_trainable_mask_noise(model, args, c=1e-4)
    optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr)

    results = {'epoch': 0 }
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']

    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_epoch = 1
    if resume_train_ckpt:
        
        start_epoch = resume_train_ckpt['epoch']
        rewind_weight_mask = resume_train_ckpt['rewind_weight_mask']
        ori_model_dict = model.state_dict()
        over_lap = {k : v for k, v in resume_train_ckpt['model_state_dict'].items() if k in ori_model_dict.keys()}
        ori_model_dict.update(over_lap)
        model.load_state_dict(ori_model_dict)
        print("Resume at IMP:[{}] epoch:[{}] len:[{}/{}]!".format(imp_num, resume_train_ckpt['epoch'], len(over_lap.keys()), len(ori_model_dict.keys())))
        optimizer.load_state_dict(resume_train_ckpt['optimizer_state_dict'])
        adj_spar, wei_spar = pruning.print_sparsity(model, args)
    else:
        rewind_weight_mask = copy.deepcopy(model.state_dict())
        rewind_predict_weight = copy.deepcopy(predictor.state_dict())

    for epoch in range(start_epoch, args.mask_epochs + 1):

        t0 = time.time()     
        
        epoch_loss, prune_info_dict = train.train_mask(model, predictor, 
                                                                 x, 
                                                                 edge_index, 
                                                                 split_edge, 
                                                                 optimizer, args)

        result = train.test(model, predictor, 
                                   x, 
                                   edge_index, 
                                   split_edge, 
                                   evaluator, 
                                   args.batch_size, args)

        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]
        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        if valid_result > results['highest_valid'][k]:
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch
            pruning.save_all(model, predictor, 
                                    rewind_weight_mask, 
                                    optimizer, 
                                    imp_num, 
                                    epoch, 
                                    args.model_save_path, 
                                    'IMP{}_train_ckpt'.format(imp_num))

        epoch_time = (time.time() - t0) / 60
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'IMP:[{}] (GET Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] | Adj[{:.3f}%] Wei[{:.3f}%] Time:[{:.2f}min]'
              .format(imp_num, epoch, args.mask_epochs, 
                                      epoch_loss, 
                                      train_result * 100,
                                      valid_result * 100,
                                      test_result * 100,
                                      results['final_test'][k] * 100,
                                      results['epoch'],
                                      prune_info_dict['adj_spar'], 
                                      prune_info_dict['wei_spar'],
                                      epoch_time))

    rewind_weight_mask, adj_spar, wei_spar = pruning.change(rewind_weight_mask, model, args)
    print('-' * 100)
    print("INFO : IMP:[{}] (GET MASK) Final Result Train:[{:.2f}]  Valid:[{:.2f}]  Test:[{:.2f}] | Adj:[{:.3f}%] Wei:[{:.3f}%]"
        .format(imp_num, results['final_train'][k] * 100,
                         results['highest_valid'][k] * 100,
                         results['final_test'][k] * 100,
                         adj_spar, 
                         wei_spar))
    print('-' * 100)
    return rewind_weight_mask, rewind_predict_weight
Пример #18
0
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='DEA-JKNet on ogbl-ddi')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--jk_mode', type=str, default="max",
                        help='JKNet aggregation method [max, mean, lstm, sum, cat] (default: max)')
    parser.add_argument('--remove_batchnorm', action='store_true',
                        help='remove batchnorm layers')
    parser.add_argument('--use_stored_x', action='store_true',
                        help='use precomputed distance encoding and node statistics')

    parser.add_argument('--embed_dim', type=int, default=256,
                        help='initial node embedding dimension')
    parser.add_argument('--gnn_hidden_dim', type=int, default=256,
                        help='GNN(DEA) hidden layer dimension (default: 256)')
    parser.add_argument('--gnn_num_layers', type=int, default=3,
                        help='number of GNN(DEA) message passing layers (default: 3, must be greater than 1)')
    parser.add_argument('--k', type=int, default=2,
                        help='GNN(DEA) number of hops')
    parser.add_argument('--mlp_hidden_dim', type=int, default=256,
                        help='linear hidden layer dimension for edge prediction (default: 256)')
    parser.add_argument('--mlp_num_layers', type=int, default=2,
                        help='number of linear layers for edge prediction (default: 2, must be greater than 1)')
    parser.add_argument('--dropout', type=float, default=0.5,
                        help='dropout rate (default: 0.5)')

    parser.add_argument('--batch_size', type=int, default=1024*64,
                        help='input batch size for training (default: 1024*64)')
    parser.add_argument('--epochs', type=int, default=400,
                        help='number of epochs to train (default: 400)')
    parser.add_argument('--runs', type=int, default=10,
                        help='number of runs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.005,
                        help='Adam learning rate (default: 0.005')

    args = parser.parse_args()
    print(args)

    device = torch.device("cuda:" + str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    print('Device: {}'.format(device))

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    graph = dataset[0]
    split_idx = dataset.get_edge_split()
    evaluator = Evaluator(name='ogbl-ddi')
    edge_index = graph.edge_index
    adj_t = torch_sparse.SparseTensor.from_edge_index(edge_index)

    print('Train edges:', split_idx['train']['edge'].shape)
    print('Val edges:', split_idx['valid']['edge'].shape)
    print('Val negative edges:', split_idx['valid']['edge_neg'].shape)
    print('Test edges:', split_idx['test']['edge'].shape)
    print('Test negative edges:', split_idx['test']['edge_neg'].shape)

    if not args.use_stored_x:
        print('start computing extra features (~30 min):')
        nx_graph = to_networkx(graph, to_undirected=True)
        nx_degree = nx.degree(nx_graph)
        nx_pagerank = nx.pagerank(nx_graph)

        nx_clustering = nx.clustering(nx_graph)
        nx_centrality = nx.closeness_centrality(nx_graph)

        # S=200 nodes
        # np.random.seed(0)
        node_subset = np.random.choice(nx_graph.number_of_nodes(), size=200, replace=False)
        spd_feature = get_spd_matrix(G=nx_graph, S=node_subset, max_spd=5)

        # S = all nodes
        lp_feature = get_lp_matrix(adj_t.to_dense(), max_steps=5)

        # Convert to tensor
        tensor_degree = torch.Tensor([t[1] for t in nx_degree]).unsqueeze(1)
        tensor_pagerank = torch.Tensor([t[1] for t in nx_pagerank.items()]).unsqueeze(1)
        tensor_clustering = torch.Tensor([t[1] for t in nx_clustering.items()]).unsqueeze(1)
        tensor_centrality = torch.Tensor([t[1] for t in nx_centrality.items()]).unsqueeze(1)

        tensor_spd = torch.Tensor(spd_feature)
        tensor_lp = torch.Tensor(lp_feature)

        # Concat
        feature_tensor_list = [tensor_degree, tensor_pagerank, tensor_clustering,
                               tensor_centrality, tensor_spd, tensor_lp]
        x_feature = torch.cat(feature_tensor_list, dim=1)
        print('extra feature shape:', x_feature.shape)
    else:
        print('load extra features:')
        x_df = pd.read_csv('x_feature.csv')
        x_feature_numpy = x_df.to_numpy()
        x_feature = torch.Tensor(x_feature_numpy)
        print('extra feature shape:', x_feature.shape)

    # Normalize to 0-1
    x_max = torch.max(x_feature, dim=0, keepdim=True)[0]
    x_min = torch.min(x_feature, dim=0, keepdim=True)[0]
    x_feature = (x_feature - x_min) / (x_max - x_min + 1e-6)

    edge_index = edge_index.to(device)
    adj_t = adj_t.to(device)
    x_feature = x_feature.to(device)

    gnn_in_dim = args.embed_dim + x_feature.shape[1]

    if args.jk_mode == 'cat':
        mlp_in_dim = args.gnn_hidden_dim * args.gnn_num_layers
    else:
        mlp_in_dim = args.gnn_hidden_dim
    model = DEA_GNN_JK(num_nodes=graph.num_nodes, embed_dim=args.embed_dim,
                       gnn_in_dim=gnn_in_dim, gnn_hidden_dim=args.gnn_hidden_dim, gnn_out_dim=args.gnn_hidden_dim,
                       gnn_num_layers=args.gnn_num_layers, mlp_in_dim=mlp_in_dim, mlp_hidden_dim=args.mlp_hidden_dim,
                       mlp_out_dim=1, mlp_num_layers=args.mlp_num_layers,
                       dropout=args.dropout, gnn_batchnorm=not args.remove_batchnorm,
                       mlp_batchnorm=not args.remove_batchnorm,
                       K=args.k, jk_mode=args.jk_mode).to(device)

    print(model)
    print('Number of parameters:', sum(p.numel() for p in model.parameters()))

    # Multiple runs
    RUNS = args.runs
    best_val_scores = np.zeros((RUNS,))
    best_test_scores = np.zeros((RUNS,))

    for i in range(RUNS):
        random.seed(i + 1)
        torch.manual_seed(i + 1)
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        result = train(model, optimizer, evaluator, graph, x_feature, edge_index, adj_t, split_idx,
                       device=device, batch_size=args.batch_size, num_epochs=args.epochs, save_model=False)

        best_val_scores[i] = result[1]
        best_test_scores[i] = result[2]

        print('Run', i + 1, 'done.')

    log = 'Mean Val Hits: {:.4f}, SD Val Hits: {:.4f}'
    print(log.format(np.mean(best_val_scores), np.std(best_val_scores, ddof=1)))
    log = 'Mean Test Hits: {:.4f}, SD Test Hits: {:.4f}'
    print(log.format(np.mean(best_test_scores), np.std(best_test_scores, ddof=1)))
Пример #19
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=16 * 1024)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--num_steps', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    loader = GraphSAINTRandomWalkSampler(data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    run_idx = 0

    while run_idx < args.runs:
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        run_success = True
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(
                f'Run: {run_idx + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}'
            )
            if loss > 2.:
                run_success = False
                logger.reset(run_idx)
                print('Learning failed. Rerun...')
                break

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model,
                              predictor,
                              data,
                              split_edge,
                              evaluator,
                              batch_size=64 * 1024,
                              device=device)
                logger.add_result(run_idx, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run_idx + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('GraphSAINT')
        if run_success:
            logger.print_statistics(run_idx)
            run_idx += 1

    print('GraphSAINT')
    logger.print_statistics()
Пример #20
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-PPA (MF)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--epochs", type=int, default=1000)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-ppa")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-ppa")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        emb.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(emb.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(emb.weight, predictor, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(emb.weight, predictor, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #21
0
if not args.keep_old:
    # Backup python files.
    copy('seal_link_pred.py', args.res_dir)
    copy('utils.py', args.res_dir)
log_file = os.path.join(args.res_dir, 'log.txt')
# Save command line input.
cmd_input = 'python ' + ' '.join(sys.argv) + '\n'
with open(os.path.join(args.res_dir, 'cmd_input.txt'), 'a') as f:
    f.write(cmd_input)
print('Command line input: ' + cmd_input + ' is saved.')
with open(log_file, 'a') as f:
    f.write('\n' + cmd_input)

if args.dataset.startswith('ogbl'):
    dataset = PygLinkPropPredDataset(name=args.dataset)
    split_edge = dataset.get_edge_split()
    data = dataset[0]

else:
    path = osp.join('dataset', args.dataset)
    dataset = Planetoid(path, args.dataset)
    split_edge = do_edge_split(dataset)
    print(split_edge)
    data = dataset[0]
    #data.edge_index = split_edge['train']['edge'].t()

if args.dataset.startswith('ogbl-citation'):
    args.eval_metric = 'mrr'
    directed = True
elif args.dataset.startswith('ogbl-bvg'):
    # args.eval_metric = 'auc'
Пример #22
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    data = dataset[0]
    splitted_edge = dataset.get_edge_split()

    if args.use_node_embedding:
        x = data.x.to(torch.float)
        x = torch.cat([x, torch.load('embedding.pt')], dim=-1)
        x = x.to(device)
    else:
        x = data.x.to(torch.float).to(device)

    edge_index = data.edge_index.to(device)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_diag()
        deg = adj.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, x, adj, splitted_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #23
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Пример #24
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-PPA (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    splitted_edge = dataset.get_edge_split()
    data = dataset[0]

    x = data.x.to(torch.float)
    embedding = torch.load('embedding.pt', map_location='cpu')
    x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, splitted_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #25
0
def main():
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-collab")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #26
0
def main():

    args = ArgsInit().save_exp()

    if args.use_tensor_board:
        writer = SummaryWriter(log_dir=args.save)

    if args.use_gpu:
        device = torch.device("cuda:" +
                              str(args.device)) if torch.cuda.is_available(
                              ) else torch.device("cpu")
    else:
        device = torch.device('cpu')

    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)

    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    logging.info('%s' % args)

    model = DeeperGCN(args).to(device)
    predictor = LinkPredictor(args).to(device)

    logging.info(model)
    logging.info(predictor)

    optimizer = torch.optim.Adam(list(model.parameters()) +
                                 list(predictor.parameters()),
                                 lr=args.lr)

    results = {}
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']

    for key in keys:
        results[key] = {k: 0 for k in hits}

    start_time = time.time()

    for epoch in range(1, args.epochs + 1):

        epoch_loss = train(model, predictor, x, edge_index, split_edge,
                           optimizer, args.batch_size)
        logging.info('Epoch {}, training loss {:.4f}'.format(
            epoch, epoch_loss))
        model.print_params(epoch=epoch)

        result = test(model, predictor, x, edge_index, split_edge, evaluator,
                      args.batch_size)

        for k in hits:
            # return a tuple
            train_result, valid_result, test_result = result[k]

            if args.use_tensor_board and k == 'Hits@50':
                writer.add_scalar('stats/train_loss', epoch_loss, epoch)
                writer.add_scalar('stats/train_Hits@50', train_result, epoch)
                writer.add_scalar('stats/valid_Hits@50', valid_result, epoch)
                writer.add_scalar('stats/test_Hits@50', test_result, epoch)

            if train_result > results['highest_train'][k]:
                results['highest_train'][k] = train_result

            if valid_result > results['highest_valid'][k]:
                results['highest_valid'][k] = valid_result
                results['final_train'][k] = train_result
                results['final_test'][k] = test_result

                save_ckpt(model,
                          optimizer,
                          round(epoch_loss, 4),
                          epoch,
                          args.model_save_path,
                          k,
                          name_post='valid_best')
                save_ckpt(predictor,
                          optimizer,
                          round(epoch_loss, 4),
                          epoch,
                          args.model_save_path,
                          k,
                          name_post='valid_best_link_predictor')

        logging.info(result)

    logging.info("%s" % results)

    end_time = time.time()
    total_time = end_time - start_time
    time_used = 'Total time: {}'.format(
        time.strftime('%H:%M:%S', time.gmtime(total_time)))
    logging.info(time_used)
Пример #27
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=50)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)
    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = GCNWithAttention(args.hidden_channels, args.hidden_channels,
                args.hidden_channels, args.num_layers,
                args.dropout, args.k).to(device)

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    print("model parameters {}".format(sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(data.num_nodes*args.hidden_channels + 
    sum(p.numel() for p in model.parameters())+sum(p.numel() for p in predictor.parameters())))
    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters()) +
            list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj_t, split_edge,
                         optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, emb.weight, adj_t, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Пример #28
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps == 0:
                result = test(predictor, x, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        print('Node2vec' if args.use_node_embedding else 'MLP')
        logger.print_statistics(run)
    print('Node2vec' if args.use_node_embedding else 'MLP')
    logger.print_statistics()
Пример #29
0
def main_fixed_mask(args, imp_num, resume_train_ckpt=None):

    device = torch.device("cuda:" + str(args.device))
    dataset = PygLinkPropPredDataset(name=args.dataset)
    data = dataset[0]
    # Data(edge_index=[2, 2358104], edge_weight=[2358104, 1], edge_year=[2358104, 1], x=[235868, 128])
    split_edge = dataset.get_edge_split()
    evaluator = Evaluator(args.dataset)

    x = data.x.to(device)

    edge_index = data.edge_index.to(device)

    args.in_channels = data.x.size(-1)
    args.num_tasks = 1

    model = DeeperGCN(args).to(device)
    pruning.add_mask(model, args)
    predictor = LinkPredictor(args).to(device)
    
    rewind_weight_mask, adj_spar, wei_spar = pruning.resume_change(resume_train_ckpt, model, args)
    model.load_state_dict(rewind_weight_mask)
    predictor.load_state_dict(resume_train_ckpt['predictor_state_dict'])

    # model.load_state_dict(rewind_weight_mask)
    # predictor.load_state_dict(rewind_predict_weight)
    adj_spar, wei_spar = pruning.print_sparsity(model, args)

    for name, param in model.named_parameters():
        if 'mask' in name:
            param.requires_grad = False

    optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr)
    #results = {}
    results = {'epoch': 0 }
    keys = ['highest_valid', 'final_train', 'final_test', 'highest_train']
    hits = ['Hits@10', 'Hits@50', 'Hits@100']
    
    for key in keys:
        results[key] = {k: 0 for k in hits}
    results['adj_spar'] = adj_spar
    results['wei_spar'] = wei_spar
    
    start_epoch = 1
    
    for epoch in range(start_epoch, args.fix_epochs + 1):

        t0 = time.time()
        epoch_loss = train.train_fixed(model, predictor, x, edge_index, split_edge, optimizer, args.batch_size, args)
        result = train.test(model, predictor, x, edge_index, split_edge, evaluator, args.batch_size, args)
        # return a tuple
        k = 'Hits@50'
        train_result, valid_result, test_result = result[k]

        if train_result > results['highest_train'][k]:
            results['highest_train'][k] = train_result

        if valid_result > results['highest_valid'][k]:
            results['highest_valid'][k] = valid_result
            results['final_train'][k] = train_result
            results['final_test'][k] = test_result
            results['epoch'] = epoch
            pruning.save_all(model, predictor, 
                                    rewind_weight_mask, 
                                    optimizer, 
                                    imp_num, 
                                    epoch, 
                                    args.model_save_path, 
                                    'IMP{}_fixed_ckpt'.format(imp_num))

        epoch_time = (time.time() - t0) / 60
        print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + ' | ' +
              'IMP:[{}] (FIX Mask) Epoch:[{}/{}] LOSS:[{:.4f}] Train :[{:.2f}] Valid:[{:.2f}] Test:[{:.2f}] | Update Test:[{:.2f}] at epoch:[{}] Time:[{:.2f}min]'
              .format(imp_num, epoch, args.fix_epochs, epoch_loss, train_result * 100,
                                                               valid_result * 100,
                                                               test_result * 100, 
                                                               results['final_test'][k] * 100,
                                                               results['epoch'],
                                                               epoch_time))
    print("=" * 120)
    print("syd final: IMP:[{}], Train:[{:.2f}]  Best Val:[{:.2f}] at epoch:[{}] | Final Test Acc:[{:.2f}] Adj:[{:.2f}%] Wei:[{:.2f}%]"
        .format(imp_num,    results['final_train'][k] * 100,
                            results['highest_valid'][k] * 100,
                            results['epoch'],
                            results['final_test'][k] * 100,
                            results['adj_spar'],
                            results['wei_spar']))
    print("=" * 120)
Пример #30
0
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    edge_index = data.edge_index
    adj = SparseTensor(row=edge_index[0], col=edge_index[1]).to(device)

    if args.use_sage:
        model = SAGE(args.hidden_channels, args.hidden_channels,
                     args.hidden_channels, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(args.hidden_channels, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(emb.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj, data.edge_index,
                         split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, emb.weight, adj, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()