Example #1
# Constructor of OgbEvaluator (shown in full in Example #6); imports assumed:
# from ogb.linkproppred import LinkPropPredDataset, Evaluator
class OgbEvaluator(object):
    def __init__(self):
        d_name = "ogbl-ppa"
        dataset = LinkPropPredDataset(name=d_name)
        splitted_edge = dataset.get_edge_split()
        graph = dataset[0]
        self.num_nodes = graph["num_nodes"]
        self.ogb_evaluator = Evaluator(name="ogbl-ppa")
Example #2
def main(_):
    ds = LinkPropPredDataset(FLAGS.dataset)
    split_edge = ds.get_edge_split()
    train_edges = split_edge['train']['edge']
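    # Append reversed edges so the training graph is treated as undirected.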
    train_edges = np.concatenate([train_edges, train_edges[:, ::-1]], axis=0)

    # Build a sparse adjacency matrix from the edge list (duplicate pairs sum).
    spa = scipy.sparse.csr_matrix(
        (np.ones([len(train_edges)]), (train_edges[:, 0], train_edges[:, 1])))
    mult_f = tf_fsvd.WYSDeepWalkPF(spa,
                                   window=FLAGS.wys_window,
                                   mult_degrees=False,
                                   neg_sample_coef=FLAGS.wys_neg_coef)

    tt = tqdm.tqdm(range(FLAGS.num_runs))
    test_metrics = []
    val_metrics = []
    for run in tt:
        u, s, v = tf_fsvd.fsvd(mult_f,
                               FLAGS.k,
                               n_iter=FLAGS.svd_iters,
                               n_redundancy=FLAGS.k * 3)

        dataset = LinkPropPredDataset(FLAGS.dataset)
        evaluator = Evaluator(name=FLAGS.dataset)
        evaluator.K = FLAGS.hits
        split_edge = dataset.get_edge_split()

        metrics = []
        for split in ('test', 'valid'):
            pos_edges = split_edge[split]['edge']
            neg_edges = split_edge[split]['edge_neg']

            # Score edge (i, j) by the low-rank bilinear product <u_i * s, v_j>.
            pos_scores = tf.reduce_sum(tf.gather(u * s, pos_edges[:, 0]) *
                                       tf.gather(v, pos_edges[:, 1]),
                                       axis=1).numpy()
            neg_scores = tf.reduce_sum(tf.gather(u * s, neg_edges[:, 0]) *
                                       tf.gather(v, neg_edges[:, 1]),
                                       axis=1).numpy()
            metric = evaluator.eval({
                'y_pred_pos': pos_scores,
                'y_pred_neg': neg_scores
            })
            metrics.append(metric['hits@%i' % FLAGS.hits])
        test_metrics.append(metrics[0])
        val_metrics.append(metrics[1])

        tt.set_description(
            'HITS@%i: validate=%g; test=%g' %
            (FLAGS.hits, np.mean(val_metrics), np.mean(test_metrics)))

    print('\n\n *** Trained %i times; average metrics are:' % FLAGS.num_runs)
    print('HITS@%i test: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(test_metrics), np.std(test_metrics)))
    print('HITS@%i validate: mean=%g; std=%g' %
          (FLAGS.hits, np.mean(val_metrics), np.std(val_metrics)))
Example #3
 def __init__(self, name):
     start = time.time()
     print("[I] Loading dataset %s..." % (name))
     self.name = name
     self.dataset = DglLinkPropPredDataset(name='ogbl-collab')
     
     self.graph = self.dataset[0]  # single DGL graph
     
     # Create edge feat by concatenating weight and year
     self.graph.edata['feat'] = torch.cat( 
         [self.graph.edata['edge_weight'], self.graph.edata['edge_year']], 
         dim=1 
     )
     
     self.split_edge = self.dataset.get_edge_split()
     self.train_edges = self.split_edge['train']['edge']  # positive train edges
     self.val_edges = self.split_edge['valid']['edge']  # positive val edges
     self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
     self.test_edges = self.split_edge['test']['edge']  # positive test edges
     self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges
     
     self.evaluator = Evaluator(name='ogbl-collab')
     
     print("[I] Finished loading.")
     print("[I] Data load time: {:.4f}s".format(time.time()-start))
Example #4
def evaluate_hits(name, pos_pred, neg_pred, K):
    """
    Compute hits
    Args:
        name(str): name of dataset
        pos_pred(Tensor): predict value of positive edges
        neg_pred(Tensor): predict value of negative edges
        K(int): num of hits

    Returns:
        hits(float): score of hits


    """
    evaluator = Evaluator(name)
    evaluator.K = K
    hits = evaluator.eval({
        'y_pred_pos': pos_pred,
        'y_pred_neg': neg_pred,
    })[f'hits@{K}']

    return hits
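
A minimal usage sketch for evaluate_hits, assuming ogb is installed and Evaluator is imported from ogb.linkproppred in the module defining evaluate_hits; the score tensors are random stand-ins:

import torch

# Hypothetical predictions: a higher score means "more likely a true edge".
pos_pred = torch.rand(1000)  # scores of held-out positive edges
neg_pred = torch.rand(5000)  # scores of sampled negative edges

# Hits@50 = fraction of positives scored above the 50th-highest negative.
print(evaluate_hits('ogbl-collab', pos_pred, neg_pred, K=50))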
Example #5
    def __init__(self, name):
        start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')
        
        self.graph = self.dataset[0]  # single DGL graph

        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges
        
        self.evaluator = Evaluator(name='ogbl-ddi')
        
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-start))
Example #6
class OgbEvaluator(object):
    def __init__(self):
        d_name = "ogbl-ppa"
        dataset = LinkPropPredDataset(name=d_name)
        splitted_edge = dataset.get_edge_split()
        graph = dataset[0]
        self.num_nodes = graph["num_nodes"]
        self.ogb_evaluator = Evaluator(name="ogbl-ppa")

    def eval(self, scores, labels, phase):
        labels = np.reshape(labels, [-1])
        ret = {}
        pos = scores[labels > 0.5].squeeze(-1)
        neg = scores[labels < 0.5].squeeze(-1)
        for K in [10, 50, 100]:
            self.ogb_evaluator.K = K
            ret['%s_hits@%s' % (phase, K)] = self.ogb_evaluator.eval({
                'y_pred_pos': pos,
                'y_pred_neg': neg,
            })[f'hits@{K}']
        return ret
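
A hedged usage sketch for OgbEvaluator above; the scores and labels are random stand-ins, and constructing the class downloads ogbl-ppa on first use:

import numpy as np

evaluator = OgbEvaluator()
scores = np.random.rand(6000, 1)  # model scores, shape [num_edges, 1]
labels = np.concatenate([np.ones(1000), np.zeros(5000)])  # 1 = positive edge
print(evaluator.eval(scores, labels, phase='valid'))
# -> {'valid_hits@10': ..., 'valid_hits@50': ..., 'valid_hits@100': ...}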
Example #7
File: mlp.py Project: rryoung98/ogb
def main():
    parser = argparse.ArgumentParser(description="OGBL-Citation2 (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=100)
    parser.add_argument("--eval_steps", type=int, default=10)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-citation2")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge["train"]["source_node"].numel())[:86596]
    split_edge["eval_train"] = {
        "source_node": split_edge["train"]["source_node"][idx],
        "target_node": split_edge["train"]["target_node"][idx],
        "target_node_neg": split_edge["valid"]["target_node_neg"],
    }

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-citation2")
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)
            print(f"Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}")

            if epoch % args.eval_steps == 0:
                result = test(predictor, x, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f"Run: {run + 1:02d}, "
                          f"Epoch: {epoch:02d}, "
                          f"Loss: {loss:.4f}, "
                          f"Train: {train_mrr:.4f}, "
                          f"Valid: {valid_mrr:.4f}, "
                          f"Test: {test_mrr:.4f}")

        print("Node2vec" if args.use_node_embedding else "MLP")
        logger.print_statistics(run)
    print("Node2vec" if args.use_node_embedding else "MLP")
    logger.print_statistics()
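
For reference, the ogbl-citation2 Evaluator used above takes one positive score per source node plus a matrix of negative-target scores and returns per-node reciprocal ranks; a hedged sketch with random stand-ins:

import torch
from ogb.linkproppred import Evaluator

evaluator = Evaluator(name='ogbl-citation2')
y_pred_pos = torch.rand(100)        # score of the true target per source
y_pred_neg = torch.rand(100, 1000)  # scores of 1000 negatives per source
result = evaluator.eval({'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg})
print(result['mrr_list'].mean())    # mean reciprocal rank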
Example #8
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=12)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    cluster_data = ClusterData(data, num_parts=args.num_partitions,
                               recursive=False, save_dir=dataset.processed_dir)

    loader = ClusterLoader(cluster_data, batch_size=args.batch_size,
                           shuffle=True, num_workers=args.num_workers)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model, predictor, data, split_edge, evaluator,
                              batch_size=64 * 1024, device=device)
                logger.add_result(run, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #9
File: gnn.py Project: tjufan/LRGA
def main():
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=5e-4)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=100)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)
    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-collab')
    data = dataset[0]
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)
    data = data.to(device)

    split_edge = dataset.get_edge_split()

    model = GCNWithAttention(data.num_features, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    print("model parameters {}".format(
        sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(
        sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(
        sum(p.numel() for p in model.parameters()) +
        sum(p.numel() for p in predictor.parameters())))
    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #10
    args.eval_metric = 'hits'
    directed = False
else:  # assume other datasets are undirected
    args.eval_metric = 'auc'
    directed = False

# The OGB rules for ogbl-collab allow folding validation edges into the input
# graph at test time; this flag opts into that setting.
if args.use_valedges_as_input:
    val_edge_index = split_edge['valid']['edge'].t()
    if not directed:
        val_edge_index = to_undirected(val_edge_index)
    data.edge_index = torch.cat([data.edge_index, val_edge_index], dim=-1)
    val_edge_weight = torch.ones([val_edge_index.size(1), 1], dtype=int)
    data.edge_weight = torch.cat([data.edge_weight, val_edge_weight], 0)

if args.dataset.startswith('ogbl'):
    evaluator = Evaluator(name=args.dataset)
if args.eval_metric == 'hits':
    loggers = {
        'Hits@20': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }
elif args.eval_metric == 'mrr':
    loggers = {
        'MRR': Logger(args.runs, args),
    }
elif args.eval_metric == 'auc':
    loggers = {
        'AUC': Logger(args.runs, args),
    }
    
Example #11
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    edge_index = data.edge_index
    adj = SparseTensor(row=edge_index[0], col=edge_index[1]).to(device)

    if args.use_sage:
        model = SAGE(args.hidden_channels, args.hidden_channels,
                     args.hidden_channels, args.num_layers,
                     args.dropout).to(device)
    else:
        model = GCN(args.hidden_channels, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout).to(device)

        # Pre-compute GCN normalization.
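        # That is, adj <- D^{-1/2} (A + I) D^{-1/2}: set_diag adds self-loops,
        # then the degrees scale rows and columns symmetrically.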
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(emb.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj, data.edge_index,
                         split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, emb.weight, adj, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #12
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--k', type=int, default=50)
    parser.add_argument('--gpu_id', type=int, default=0)
    args = parser.parse_args()
    print(args)
    device = gpu_setup(args.gpu_id)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = GCNWithAttention(args.hidden_channels, args.hidden_channels,
                             args.hidden_channels, args.num_layers,
                             args.dropout, args.k).to(device)

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)
    print("model parameters {}".format(sum(p.numel() for p in model.parameters())))
    print("predictor parameters {}".format(sum(p.numel() for p in predictor.parameters())))
    print("total parameters {}".format(data.num_nodes*args.hidden_channels + 
    sum(p.numel() for p in model.parameters())+sum(p.numel() for p in predictor.parameters())))
    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        torch.nn.init.xavier_uniform_(emb.weight)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(emb.parameters()) +
            list(predictor.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, emb.weight, adj_t, split_edge,
                         optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, emb.weight, adj_t, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #13
def main():
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    data = dataset[0]
    splitted_edge = dataset.get_edge_split()

    if args.use_node_embedding:
        x = data.x.to(torch.float)
        x = torch.cat([x, torch.load('embedding.pt')], dim=-1)
        x = x.to(device)
    else:
        x = data.x.to(torch.float).to(device)

    edge_index = data.edge_index.to(device)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_diag()
        deg = adj.sum(dim=1).to(torch.float)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()),
            lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, x, adj, splitted_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #14
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (GraphSAINT)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=16 * 1024)
    parser.add_argument('--walk_length', type=int, default=3)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--num_steps', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    data.edge_index = to_undirected(data.edge_index, data.num_nodes)

    loader = GraphSAINTRandomWalkSampler(data,
                                         batch_size=args.batch_size,
                                         walk_length=args.walk_length,
                                         num_steps=args.num_steps,
                                         sample_coverage=0,
                                         save_dir=dataset.processed_dir)

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    model = GCN(data.x.size(-1), args.hidden_channels, args.hidden_channels,
                args.num_layers, args.dropout).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    run_idx = 0

    while run_idx < args.runs:
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        run_success = True
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, loader, optimizer, device)
            print(
                f'Run: {run_idx + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}'
            )
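            # Divergence guard: a run whose loss blows up is discarded and restarted.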
            if loss > 2.:
                run_success = False
                logger.reset(run_idx)
                print('Learning failed. Rerun...')
                break

            if epoch > 49 and epoch % args.eval_steps == 0:
                result = test(model,
                              predictor,
                              data,
                              split_edge,
                              evaluator,
                              batch_size=64 * 1024,
                              device=device)
                logger.add_result(run_idx, result)

                train_mrr, valid_mrr, test_mrr = result
                print(f'Run: {run_idx + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_mrr:.4f}, '
                      f'Valid: {valid_mrr:.4f}, '
                      f'Test: {test_mrr:.4f}')

        print('GraphSAINT')
        if run_success:
            logger.print_statistics(run_idx)
            run_idx += 1

    print('GraphSAINT')
    logger.print_statistics()
Example #15
torch.manual_seed(12345)
idx = torch.randperm(split_edge['train']['edge'].size(0))
idx = idx[:split_edge['valid']['edge'].size(0)]
split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}
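# 'eval_train' is a training-edge subsample, sized like the validation split,
# so training Hits@K can be tracked at a cost comparable to validation.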


# Load model, initial embeddings, the link predictor, and the evaluator

model_hgsage = HG_SAGE(data.num_nodes, dropout=dropout, hidden_dim=hidden_channels).to(device)
model = model_hgsage
emb = torch.nn.Embedding(data.num_nodes, hidden_channels).to(device)
predictor = LinkPredictor(hidden_channels, hidden_channels, 1,
                          num_layers, dropout).to(device)

evaluator = Evaluator(name='ogbl-ddi')

# Train and evaluate model

best_train, best_val, best_test = [], [], []
for run in range(runs):
    best_train_r, best_val_r, best_test_r = None, None, None
    torch.nn.init.xavier_uniform_(emb.weight)
    predictor.reset_parameters()
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(emb.parameters()) +
        list(predictor.parameters()), lr=lr)
        
    for epoch in range(1, 1 + epochs):
        loss = train(model, predictor, emb.weight, adj_t, theta, split_edge,
                      optimizer, batch_size)
Example #16
def main():
    parser = argparse.ArgumentParser(description='OGBL-DDI')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--model',
                        type=str,
                        default='MAD_GCN',
                        choices=[
                            'GCN_Linear', 'SAGE_Linear', 'MAD_GCN', 'MAD_SAGE',
                            'MAD_Model'
                        ])
    parser.add_argument('--train_batch_size', type=int, default=4096)
    parser.add_argument('--test_batch_size', type=int, default=1024)
    parser.add_argument('--lr', type=float, default=0.005)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=5)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ddi',
                                     transform=T.ToSparseTensor())
    data = dataset[0]
    adj_t = data.adj_t.to(device)

    split_edge = dataset.get_edge_split()

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['edge'].size(0))
    idx = idx[:split_edge['valid']['edge'].size(0)]
    split_edge['eval_train'] = {'edge': split_edge['train']['edge'][idx]}

    model = models.get_model(args.model)(data.num_nodes, adj_t).to(device)
    print(f"Parameters: {count_parameters(model)}")

    evaluator = Evaluator(name='ogbl-ddi')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@20': Logger(args.runs, args),
        'Hits@30': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, adj_t, split_edge, optimizer,
                         args.train_batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, adj_t, split_edge, evaluator,
                               args.test_batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')
                    print('---')
            print(f'Finished epoch {epoch}')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #17
File: gnn.py Project: yelongshen/ogb
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.0005)
    parser.add_argument('--epochs', type=int, default=50)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    x = data.x.to(device)
    edge_index = data.edge_index.to(device)
    edge_index = to_undirected(edge_index, data.num_nodes)
    adj = SparseTensor(row=edge_index[0], col=edge_index[1])

    if args.use_sage:
        model = SAGE(x.size(-1), args.hidden_channels, args.hidden_channels,
                     args.num_layers, args.dropout).to(device)
    else:
        model = GCN(x.size(-1), args.hidden_channels, args.hidden_channels,
                    args.num_layers, args.dropout).to(device)

        # Pre-compute GCN normalization.
        adj = adj.set_value(None)
        adj = adj.set_diag()
        deg = adj.sum(dim=1)
        deg_inv_sqrt = deg.pow(-0.5)
        deg_inv_sqrt[deg_inv_sqrt == float('inf')] = 0
        adj = deg_inv_sqrt.view(-1, 1) * adj * deg_inv_sqrt.view(1, -1)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, x, adj, split_edge, optimizer,
                         args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps == 0:
                result = test(model, predictor, x, adj, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)

    logger.print_statistics()
Example #18
    os.path.join(log_dir, "%s.log" % (dataset_name + "_" + setting_name))
])
sys.stderr = Logger([
    "%s.log" % (dataset_name + setting_name),
    os.path.join(log_dir, "%s.log" % (dataset_name + "_" + setting_name))
])
snapshot_dir = os.path.join(log_dir, "snapshot")
if not os.path.isdir(snapshot_dir):
    os.makedirs(snapshot_dir)
print("Process Id:", os.getpid())
print(os.path.join(log_dir, sys.argv[0]))
print(args)
shutil.copyfile(__file__, os.path.join(log_dir, "train.py"))
shutil.copyfile(model_file + ".py", os.path.join(log_dir, model_file + ".py"))

evaluator = Evaluator(name="ogbl-citation")
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)

dataset = DglLinkPropPredDataset(name="ogbl-citation")
split_edge = dataset.get_edge_split()
num_worker = 16
train_edge, valid_edge, test_edge = (split_edge["train"], split_edge["valid"],
                                     split_edge["test"])
graph = dataset[0]
origin_graph = copy.deepcopy(graph)
graph.readonly(False)
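# Add reverse edges (bidirecting the citation graph) and one self-loop per node
# for GCN-style message passing.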
graph.add_edges(graph.edges()[1], graph.edges()[0])
graph.add_edges(
    torch.arange(0, graph.number_of_nodes()).long(),
    torch.arange(0, graph.number_of_nodes()).long())
Example #19
File: mlp.py Project: rryoung98/ogb
def main():
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (MLP)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_node_embedding", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.01)
    parser.add_argument("--epochs", type=int, default=200)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load("embedding.pt", map_location="cpu")
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-collab")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
Example #20
def run(file, data_name, model_name, lr):
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64*1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--use_nd', action='store_true')
    parser.add_argument('--use_lgae', action='store_true')
    parser.add_argument('--use_vgae', action='store_true')
    parser.add_argument('--model', type=str, default='')

    parser.add_argument('--dataset', type=str, default='Citeseer')

    args = parser.parse_args()
    if data_name is not None and model_name is not None and lr is not None:
        args.dataset = data_name
        args.model = model_name
        args.lr = lr
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = CitationFull(os.path.join('citation_data', args.dataset),
                           name=args.dataset, transform=T.ToSparseTensor())
    num_training = int(len(dataset) * 0.8)
    num_val = int(len(dataset) * 0.1)
    num_test = len(dataset) - (num_training + num_val)

    data = dataset[0]
    adj_t = data.adj_t.to(device)
    edge_index, edge_type = utils.dense_to_sparse(adj_t.to_dense())
    data.edge_index = edge_index
    data.x = data.x.to(device)
    num_nodes = data.x.shape[0]
    num_edges = data.edge_index.shape[1]
    print(data)

    # A '-nd' suffix on --model selects the neural-decoder variant.
    decoder_enable = args.model[-3:]
    if decoder_enable == '-nd':
        model_name = args.model[:-3]

    if model_name == 'lgae':
        model = LGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'vgae':
        model = DeepVGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'gae':
        model = GraphAutoEncoder(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'arga':
        model = AdversarialGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'arvga':
        model = AdversarialVGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'lrga':
        model = LRGA(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'sage':
        model = SAGEAutoEncoder(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    if decoder_enable == '-nd':
        model.decoder = NeuralDecoder(args.hidden_channels, args.hidden_channels,
                                      1, args.num_layers, args.dropout)

    evaluator = Evaluator(name='ogbl-ddi')

    model = model.to(device)

    loggers = {}
    for k in ('20', '50', '100'):
        loggers['Hits@' + k] = Logger(args.runs, args)

    for run in range(args.runs):
        torch.manual_seed(run)
        split_edge = utils.train_test_split_edges(data)
        split_edge.edge_index = edge_index

        model.reset_parameters()

        if args.model in ['arga', 'arga-nd', 'arvga', 'arvga-nd']:
            args.lr = 0.005
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x, adj_t, split_edge,
                         optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, data.x, adj_t, split_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits, test_auc, test_ap, val_auc, val_ap = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'auc: {100 * test_auc:.2f}%, '
                              f'ap: {100 * test_ap:.2f}%, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%', )
                    print('---')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        toWrite = loggers[key].print_statistics()

        file.write(str(args.lr) + ' ' + key + ' ' + args.model + "'" +
                   str(toWrite) + '\n')
        file.flush()
Example #21
def main():
    parser = argparse.ArgumentParser(
        description='Link prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglLinkPropPredDataset(name=args.dataset)
    split_edge = dataset.get_edge_split()

    # Manually add self-loop link since GCN will wash out the feature of isolated nodes.
    n_nodes = dataset[0].number_of_nodes()
    g_data = dgl.add_self_loop(dataset[0])
    g_data = dgl.to_bidirected(g_data)

    for k in dataset[0].node_attr_schemes().keys():
        g_data.ndata[k] = dataset[0].ndata[k]
    print(g_data.number_of_nodes(), g_data.number_of_edges())

    # Materialize all sparse formats up front so DataLoader workers don't rebuild them.
    g_data.create_formats_()

    cluster_dataset = ClusterIterDataset(args.dataset,
                                         g_data,
                                         args.num_partitions,
                                         use_pp=False)
    cluster_iterator = DataLoader(cluster_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=partial(subgraph_collate_fn,
                                                     g_data,
                                                     negs=args.negs))

    model = GCN(g_data.ndata['feat'].size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0
        for epoch in range(1, 1 + args.epochs):
            loss, c_epoch_time, c_to_device_time, c_ff_time, c_pred_loss_time, c_bp_time, c_io_time, c_memory, c_part1 = train(
                model, predictor, cluster_iterator, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')
            epoch_time += c_epoch_time
            to_device_time += c_to_device_time
            ff_time += c_ff_time
            pred_loss_time += c_pred_loss_time
            bp_time += c_bp_time
            io_time += c_io_time
            part_1 += c_part1
            memory = max(memory, c_memory[0])

            if epoch % args.eval_steps == 0:
                print('Ave')
                print('epoch time: ', epoch_time / args.eval_steps)
                print('to_device_time: ', to_device_time / args.eval_steps)
                print('ff_time: ', ff_time / args.eval_steps)
                print('part1_time: ', part_1 / args.eval_steps)
                print('pred_loss_time: ', pred_loss_time / args.eval_steps)
                print('bp_time: ', bp_time / args.eval_steps)
                print('io_time: ', io_time / args.eval_steps)
                print('max memory', memory)
                print('\n')
                epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0

                result = test(model, predictor, g_data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #22
def main():
    parser = argparse.ArgumentParser(description="OGBL-PPA (MF)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.005)
    parser.add_argument("--epochs", type=int, default=1000)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-ppa")
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    emb = torch.nn.Embedding(data.num_nodes, args.hidden_channels).to(device)

    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name="ogbl-ppa")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in range(args.runs):
        emb.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(emb.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(emb.weight, predictor, split_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(emb.weight, predictor, split_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f"Run: {run + 1:02d}, "
                              f"Epoch: {epoch:02d}, "
                              f"Loss: {loss:.4f}, "
                              f"Train: {100 * train_hits:.2f}%, "
                              f"Valid: {100 * valid_hits:.2f}%, "
                              f"Test: {100 * test_hits:.2f}%")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
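
Note: the "MF" in Example #22 is just a free embedding per node; train() and test() (defined elsewhere in that repo) score an edge by feeding the two endpoint embeddings to the LinkPredictor MLP. A hedged sketch of that scoring step, with score_edges as a hypothetical helper:

import torch

def score_edges(x, predictor, edges):
    # x: [num_nodes, hidden] embedding table (emb.weight above)
    # edges: [batch, 2] LongTensor of (src, dst) node pairs
    # predictor: an MLP over endpoint pairs, one score per edge
    return predictor(x[edges[:, 0]], x[edges[:, 1]]).squeeze(-1)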
Example #23
def run(file, data_name, model_name, lr):
    parser = argparse.ArgumentParser(description='OGBL-DDI (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_sage', action='store_true')
    parser.add_argument('--num_layers', type=int, default=2)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--batch_size', type=int, default=64*1024)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=5)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--use_nd', action='store_true')
    parser.add_argument('--use_lgae', action='store_true')
    parser.add_argument('--use_vgae', action='store_true')
    parser.add_argument('--model', type=str, default='')

    parser.add_argument('--dataset', type=str, default='Citeseer')

    args = parser.parse_args()
    if data_name is not None and model_name is not None and lr is not None:
        args.dataset = data_name
        args.model = model_name
        args.lr = lr
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    # device = 'cpu'
    device = torch.device(device)

    dataset = CitationFull(os.path.join('citation_data', args.dataset),
                           name=args.dataset, transform=T.ToSparseTensor())
    num_training = int(len(dataset) * 0.8)
    num_val = int(len(dataset) * 0.1)
    num_test = len(dataset) - (num_training + num_val)

    data = dataset[0]
    print('data:', vars(data))
    adj_t = data.adj_t.to(device)
    edge_index, edge_type = utils.dense_to_sparse(adj_t.to_dense())
    data.edge_index = edge_index
    data.x = data.x.to(device)
    split_edge = utils.train_test_split_edges(data)
    split_edge.edge_index = edge_index

    print(data)
    print(edge_index.shape)

    decoder_enable = args.model[-3:]
    if decoder_enable == '-nd':
        model_name = args.model[:-3]

    if model_name == 'lgae':
        model = LGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'vgae':
        model = DeepVGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'gae':
        model = GraphAutoEncoder(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'arga':
        model = AdversarialGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)

    elif model_name == 'arvga':
        model = AdversarialVGAE(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'lrga':
        model = LRGA(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    elif model_name == 'sage':
        model = SAGEAutoEncoder(data.num_features, args.hidden_channels,
                    args.hidden_channels, args.num_layers,
                    args.dropout)
    else:
        raise ValueError(f'Unknown model: {model_name}')

    if decoder_enable == '-nd':
        model.decoder = NeuralDecoder( args.hidden_channels,  
            args.hidden_channels, 1, args.num_layers, args.dropout)

    evaluator = Evaluator(name='ogbl-ddi')

    model = model.to(device)

    loggers = {
        'metrics': Logger(args.runs, args)
    }

    for run in range(args.runs):
        torch.manual_seed(run)
        model.reset_parameters()

        if args.model in ['arga', 'arga-nd', 'arvga', 'arvga-nd']:
            args.lr = 0.005
        optimizer = torch.optim.Adam(
                list(model.parameters()), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, data.x, adj_t, split_edge,
                         optimizer, args.batch_size)

        result = test(model, data.x, data, split_edge, evaluator, args.batch_size)
        loggers['metrics'].add_result(run, result)

    for key in loggers.keys():
        print(key)
        toWrite = loggers[key].print_statistics()
        file.write(args.model + '\t' + '\t'.join(toWrite) + '\n')
        file.flush()
        os.fsync(file.fileno())
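
Note: Example #23 relies on PyG's utils.train_test_split_edges, which removes data.edge_index and attaches per-split positive/negative index tensors (default ratios: 5% val, 10% test; newer PyG deprecates it in favor of RandomLinkSplit). A minimal sketch of the resulting fields on a toy graph:

import torch
from torch_geometric.data import Data
from torch_geometric.utils import train_test_split_edges

data = Data(x=torch.randn(50, 8),
            edge_index=torch.randint(0, 50, (2, 200)))
data = train_test_split_edges(data)
print(data.train_pos_edge_index.shape)                       # [2, n_train]
print(data.val_pos_edge_index.shape, data.val_neg_edge_index.shape)
print(data.test_pos_edge_index.shape, data.test_neg_edge_index.shape)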
Example #24
def main():
    parser = argparse.ArgumentParser()
    add_train_args(parser)
    add_common_args(parser)
    args = parser.parse_args()
    add_experiment(args)
    device = model_utils.get_device()

    # Load dataset from disk
    print('Loading train data...')
    train_graph, valid_graph, train_edges, eval_edges, valid_edges = model_utils.load_training_data()
    if args.train_partial_graph:
        train_edges['edge'] = eval_edges['edge']

    train_dl = data.DataLoader(
        data.TensorDataset(train_edges['edge']),
        batch_size=args.train_batch_size,
        shuffle=True,
    )
    dev_dl = data.DataLoader(
        data.TensorDataset(
            torch.cat([valid_edges['edge'], valid_edges['edge_neg']], dim=0),
            torch.cat([torch.ones(valid_edges['edge'].shape[0]),
                       torch.zeros(valid_edges['edge_neg'].shape[0])], dim=0),
        ),
        batch_size=args.val_batch_size,
        shuffle=True,
    )

    # Initialize node embeddings
    print('Computing initial embeddings')
    train_graph = model_utils.initialize_embeddings(
        train_graph, 'train_embeddings.pt', args.refresh_embeddings)
    valid_graph = model_utils.initialize_embeddings(
        valid_graph, 'valid_embeddings.pt', args.refresh_embeddings)
    if not args.train_partial_graph:
        train_graph = valid_graph

    # Stats evaluator
    evaluator = Evaluator(name='ogbl-ddi')

    # Initialize a model
    model = models.get_model(args.model)(
        # train_graph.x.shape, train_graph.adj_t.to(device)
        num_nodes=train_graph.num_nodes, adj_t=train_graph.adj_t.to(device)
    )

    # load from checkpoint if path specified
    if args.load_path is not None:
        model = model_utils.load_model(model, args.load_path)
    print(f"Parameters: {model_utils.count_parameters(model)}")

    # Move model to GPU if necessary
    model.to(device)

    # Initialize optimizer
    optimizer = optim.Adam(
        model.parameters(),
        lr=args.learning_rate,
        weight_decay=args.weight_decay,
    )

    # Scheduler
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='min',
        factor=0.5,
        patience=30,
        verbose=True,
    )

    os.makedirs(f'{args.save_path}/{args.experiment}')
    print(f'Created new experiment: {args.experiment}')
    save_arguments(args, f'{args.save_path}/{args.experiment}/args.txt')

    # Train!
    trained_model = train_model(
        train_graph,
        valid_graph,
        train_dl,
        dev_dl,
        evaluator,
        model,
        optimizer,
        scheduler,
        args,
    )

    # Save trained model
    filename = f'{args.save_path}/{args.experiment}/{model.__class__.__name__}_trained.checkpoint'
    model_utils.save_model(trained_model, filename)
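
Note on the scheduler above: ReduceLROnPlateau must be stepped with the monitored scalar (train_model presumably passes a loss), not unconditionally once per epoch. A minimal, self-contained sketch of that contract:

import torch

param = torch.nn.Parameter(torch.randn(4))
optimizer = torch.optim.Adam([param], lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=30)

for epoch in range(100):
    val_loss = 1.0                # placeholder for a real validation metric
    scheduler.step(val_loss)      # lr is halved after 30 non-improving steps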
Example #25
def main():
    parser = argparse.ArgumentParser(description='OGBL-PPA (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=20)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-ppa')
    splitted_edge = dataset.get_edge_split()
    data = dataset[0]

    x = data.x.to(torch.float)
    embedding = torch.load('embedding.pt', map_location='cpu')
    x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        predictor.reset_parameters()

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(predictor, x, splitted_edge, evaluator,
                               args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
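
Note: Example #25 assumes a precomputed 'embedding.pt' on disk. One plausible way to produce such node embeddings is PyG's Node2Vec (a hedged sketch; the actual file may come from a different pipeline):

import torch
from torch_geometric.nn import Node2Vec

edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])   # toy graph
model = Node2Vec(edge_index, embedding_dim=128, walk_length=20,
                 context_size=10, walks_per_node=10, sparse=True)
loader = model.loader(batch_size=32, shuffle=True)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)
for pos_rw, neg_rw in loader:                       # one epoch of walks
    optimizer.zero_grad()
    model.loss(pos_rw, neg_rw).backward()
    optimizer.step()
torch.save(model.embedding.weight.data.cpu(), 'embedding.pt')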
Example #26
def main():
    parser = argparse.ArgumentParser(description='OGBL-Citation2 (MLP)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_embedding', action='store_true')
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=100)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name='ogbl-citation2')
    split_edge = dataset.get_edge_split()
    data = dataset[0]

    # We randomly pick some training samples that we want to evaluate on:
    torch.manual_seed(12345)
    idx = torch.randperm(split_edge['train']['source_node'].numel())[:86596]
    split_edge['eval_train'] = {
        'source_node': split_edge['train']['source_node'][idx],
        'target_node': split_edge['train']['target_node'][idx],
        'target_node_neg': split_edge['valid']['target_node_neg'],
    }

    x = data.x
    if args.use_node_embedding:
        embedding = torch.load('embedding.pt', map_location='cpu')
        x = torch.cat([x, embedding], dim=-1)
    x = x.to(device)

    predictor = LinkPredictor(x.size(-1), args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-citation2')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(predictor.parameters(), lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, x, split_edge, optimizer, args.batch_size)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')

            if epoch % args.eval_steps == 0:
                result = test(predictor, x, split_edge, evaluator,
                              args.batch_size)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        print('Node2vec' if args.use_node_embedding else 'MLP')
        logger.print_statistics(run)
    print('Node2vec' if args.use_node_embedding else 'MLP')
    logger.print_statistics()
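
For reference, a hedged sketch of how the ogbl-citation2 Evaluator computes MRR: each positive (source, target) score is ranked against that source's negative targets, and 'mrr_list' holds one reciprocal rank per positive edge.

import torch
from ogb.linkproppred import Evaluator

evaluator = Evaluator(name='ogbl-citation2')
pos_scores = torch.rand(100)          # one score per positive edge
neg_scores = torch.rand(100, 1000)    # scores for 1000 negatives per edge
result = evaluator.eval({'y_pred_pos': pos_scores,
                         'y_pred_neg': neg_scores})
mrr = result['mrr_list'].mean().item()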
Example #27
File: run.py  Project: rpatil524/ogb
def main(args):
    if not (args.do_train or args.do_valid or args.do_test
            or args.evaluate_train):
        raise ValueError(
            'At least one of train/valid/test/evaluate_train modes must be chosen.')

    if args.init_checkpoint:
        override_config(args)

    args.save_path = 'log/%s/%s/%s-%s/%s' % (
        args.dataset, args.model, args.hidden_dim, args.gamma,
        time.time()) if args.save_path is None else args.save_path
    writer = SummaryWriter(args.save_path)

    # Write logs to checkpoint and console
    set_logger(args)

    dataset = LinkPropPredDataset(name='ogbl-biokg')
    split_edge = dataset.get_edge_split()
    train_triples, valid_triples, test_triples = split_edge[
        "train"], split_edge["valid"], split_edge["test"]
    nrelation = int(max(train_triples['relation'])) + 1
    entity_dict = dict()
    cur_idx = 0
    for key in dataset[0]['num_nodes_dict']:
        entity_dict[key] = (cur_idx,
                            cur_idx + dataset[0]['num_nodes_dict'][key])
        cur_idx += dataset[0]['num_nodes_dict'][key]
    nentity = sum(dataset[0]['num_nodes_dict'].values())

    evaluator = Evaluator(name=args.dataset)

    args.nentity = nentity
    args.nrelation = nrelation

    logging.info('Model: %s' % args.model)
    logging.info('Dataset: %s' % args.dataset)
    logging.info('#entity: %d' % nentity)
    logging.info('#relation: %d' % nrelation)

    logging.info('#train: %d' % len(train_triples['head']))
    logging.info('#valid: %d' % len(valid_triples['head']))
    logging.info('#test: %d' % len(test_triples['head']))

    train_count, train_true_head, train_true_tail = defaultdict(
        lambda: 4), defaultdict(list), defaultdict(list)
    for i in tqdm(range(len(train_triples['head']))):
        head, relation, tail = train_triples['head'][i], train_triples[
            'relation'][i], train_triples['tail'][i]
        head_type, tail_type = train_triples['head_type'][i], train_triples[
            'tail_type'][i]
        train_count[(head, relation, head_type)] += 1
        train_count[(tail, -relation - 1, tail_type)] += 1
        train_true_head[(relation, tail)].append(head)
        train_true_tail[(head, relation)].append(tail)

    kge_model = KGEModel(
        model_name=args.model,
        nentity=nentity,
        nrelation=nrelation,
        hidden_dim=args.hidden_dim,
        gamma=args.gamma,
        double_entity_embedding=args.double_entity_embedding,
        double_relation_embedding=args.double_relation_embedding,
        evaluator=evaluator)

    logging.info('Model Parameter Configuration:')
    for name, param in kge_model.named_parameters():
        logging.info('Parameter %s: %s, require_grad = %s' %
                     (name, str(param.size()), str(param.requires_grad)))

    if args.cuda:
        kge_model = kge_model.cuda()

    if args.init_checkpoint:
        # Restore model from checkpoint directory
        logging.info('Loading checkpoint %s...' % args.init_checkpoint)
        checkpoint = torch.load(
            os.path.join(args.init_checkpoint, 'checkpoint'))
        entity_dict = checkpoint['entity_dict']

    if args.do_train:
        # Set training dataloader iterator
        train_dataloader_head = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'head-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_dataloader_tail = DataLoader(
            TrainDataset(train_triples, nentity, nrelation,
                         args.negative_sample_size, 'tail-batch', train_count,
                         train_true_head, train_true_tail, entity_dict),
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=max(1, args.cpu_num // 2),
            collate_fn=TrainDataset.collate_fn)

        train_iterator = BidirectionalOneShotIterator(train_dataloader_head,
                                                      train_dataloader_tail)

        # Set training configuration
        current_learning_rate = args.learning_rate
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                            kge_model.parameters()),
                                     lr=current_learning_rate)
        if args.warm_up_steps:
            warm_up_steps = args.warm_up_steps
        else:
            warm_up_steps = args.max_steps // 2

    if args.init_checkpoint:
        # Restore training state from the checkpoint loaded above
        init_step = checkpoint['step']
        kge_model.load_state_dict(checkpoint['model_state_dict'])
        if args.do_train:
            current_learning_rate = checkpoint['current_learning_rate']
            warm_up_steps = checkpoint['warm_up_steps']
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    else:
        logging.info('Randomly Initializing %s Model...' % args.model)
        init_step = 0

    step = init_step

    logging.info('Start Training...')
    logging.info('init_step = %d' % init_step)
    logging.info('batch_size = %d' % args.batch_size)
    logging.info('hidden_dim = %d' % args.hidden_dim)
    logging.info('gamma = %f' % args.gamma)
    logging.info('negative_adversarial_sampling = %s' %
                 str(args.negative_adversarial_sampling))
    if args.negative_adversarial_sampling:
        logging.info('adversarial_temperature = %f' %
                     args.adversarial_temperature)

    if args.do_train:
        logging.info('learning_rate = %f' % current_learning_rate)

        training_logs = []

        # Training loop
        for step in range(init_step, args.max_steps):

            log = kge_model.train_step(kge_model, optimizer, train_iterator,
                                       args)
            training_logs.append(log)

            if step >= warm_up_steps:
                current_learning_rate = current_learning_rate / 10
                logging.info('Change learning_rate to %f at step %d' %
                             (current_learning_rate, step))
                optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                                    kge_model.parameters()),
                                             lr=current_learning_rate)
                warm_up_steps = warm_up_steps * 3

            if step % args.save_checkpoint_steps == 0 and step > 0:  # ~ 41 seconds/saving
                save_variable_list = {
                    'step': step,
                    'current_learning_rate': current_learning_rate,
                    'warm_up_steps': warm_up_steps,
                    'entity_dict': entity_dict
                }
                save_model(kge_model, optimizer, save_variable_list, args)

            if step % args.log_steps == 0:
                metrics = {}
                for metric in training_logs[0].keys():
                    metrics[metric] = sum(
                        [log[metric]
                         for log in training_logs]) / len(training_logs)
                log_metrics('Train', step, metrics, writer)
                training_logs = []

            if args.do_valid and step % args.valid_steps == 0 and step > 0:
                logging.info('Evaluating on Valid Dataset...')
                metrics = kge_model.test_step(kge_model, valid_triples, args,
                                              entity_dict)
                log_metrics('Valid', step, metrics, writer)

        save_variable_list = {
            'step': step,
            'current_learning_rate': current_learning_rate,
            'warm_up_steps': warm_up_steps
        }
        save_model(kge_model, optimizer, save_variable_list, args)

    if args.do_valid:
        logging.info('Evaluating on Valid Dataset...')
        metrics = kge_model.test_step(kge_model, valid_triples, args,
                                      entity_dict)
        log_metrics('Valid', step, metrics, writer)

    if args.do_test:
        logging.info('Evaluating on Test Dataset...')
        metrics = kge_model.test_step(kge_model, test_triples, args,
                                      entity_dict)
        log_metrics('Test', step, metrics, writer)

    if args.evaluate_train:
        logging.info('Evaluating on Training Dataset...')
        small_train_triples = {}
        indices = np.random.choice(len(train_triples['head']),
                                   args.ntriples_eval_train,
                                   replace=False)
        for i in train_triples:
            if 'type' in i:
                small_train_triples[i] = [train_triples[i][x] for x in indices]
            else:
                small_train_triples[i] = train_triples[i][indices]
        metrics = kge_model.test_step(kge_model,
                                      small_train_triples,
                                      args,
                                      entity_dict,
                                      random_sampling=True)
        log_metrics('Train', step, metrics, writer)
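
Note: the entity_dict bookkeeping in Example #27 assigns each ogbl-biokg entity type a contiguous global id range so all entities share one embedding table. A toy illustration (counts are made up):

num_nodes_dict = {'disease': 3, 'drug': 5, 'protein': 2}
entity_dict, cur_idx = {}, 0
for key, count in num_nodes_dict.items():
    entity_dict[key] = (cur_idx, cur_idx + count)
    cur_idx += count
# entity_dict == {'disease': (0, 3), 'drug': (3, 8), 'protein': (8, 10)}
# A local id i of type t maps to global id entity_dict[t][0] + i.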
Example #28
def main():
    parser = argparse.ArgumentParser(description="OGBL-COLLAB (GNN)")
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument("--log_steps", type=int, default=1)
    parser.add_argument("--use_sage", action="store_true")
    parser.add_argument("--use_valedges_as_input", action="store_true")
    parser.add_argument("--num_layers", type=int, default=3)
    parser.add_argument("--hidden_channels", type=int, default=256)
    parser.add_argument("--dropout", type=float, default=0.0)
    parser.add_argument("--batch_size", type=int, default=64 * 1024)
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--epochs", type=int, default=400)
    parser.add_argument("--eval_steps", type=int, default=1)
    parser.add_argument("--runs", type=int, default=1)
    parser.add_argument("--seed",type=int,default=1)
    args = parser.parse_args()
    print(args)
    
    device = f"cuda:{args.device}" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name="ogbl-collab")
    data = dataset[0]
    edge_index = data.edge_index
    data.edge_weight = data.edge_weight.view(-1).to(torch.float)
    data = T.ToSparseTensor()(data)

    split_edge = dataset.get_edge_split()

    
    # Use training + validation edges for inference on test set.
    if args.use_valedges_as_input:
        val_edge_index = split_edge["valid"]["edge"].t()
        full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
        data.full_adj_t = SparseTensor.from_edge_index(full_edge_index).t()
        data.full_adj_t = data.full_adj_t.to_symmetric()
    else:
        data.full_adj_t = data.adj_t

    data = data.to(device)

    if args.use_sage:
        model = SAGE(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)
    else:
        model = GCN(
            data.num_features,
            args.hidden_channels,
            args.hidden_channels,
            args.num_layers,
            args.dropout,
        ).to(device)

    predictor = LinkPredictor(
        args.hidden_channels, args.hidden_channels, 1, args.num_layers, args.dropout
    ).to(device)

    evaluator = Evaluator(name="ogbl-collab")
    loggers = {
        "Hits@10": Logger(args.runs, args),
        "Hits@50": Logger(args.runs, args),
        "Hits@100": Logger(args.runs, args),
    }

    for run in tqdm(range(args.runs)):
        torch.manual_seed(args.seed + run)
        np.random.seed(args.seed + run)
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(
            list(model.parameters()) + list(predictor.parameters()), lr=args.lr
        )

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, split_edge, optimizer, args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(
                    model, predictor, data, split_edge, evaluator, args.batch_size
                )
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(
                            f"Run: {run + 1:02d}, "
                            f"Epoch: {epoch:02d}, "
                            f"Loss: {loss:.4f}, "
                            f"Train: {100 * train_hits:.2f}%, "
                            f"Valid: {100 * valid_hits:.2f}%, "
                            f"Test: {100 * test_hits:.2f}%"
                        )
                    print("---")

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()
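
Note on --use_valedges_as_input in Example #28: validation edges are merged into the adjacency used for message passing at evaluation time, while training still uses the original graph. A toy sketch of that construction:

import torch
from torch_sparse import SparseTensor

edge_index = torch.tensor([[0, 1], [1, 2]])      # toy train edges
val_edge_index = torch.tensor([[2], [0]])        # toy validation edges
full_edge_index = torch.cat([edge_index, val_edge_index], dim=-1)
full_adj_t = SparseTensor.from_edge_index(full_edge_index).t().to_symmetric()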
Example #29
def main():
    parser = argparse.ArgumentParser(description='OGBL-Reviews (MLP)')
    parser.add_argument('--suffix', type=str, default='groc')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--use_node_features', action='store_true')
    parser.add_argument('--hidden_channels', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=64 * 1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--runs', type=int, default=10)
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = PygLinkPropPredDataset(name=f'ogbl-reviews-{args.suffix}')
    splitted_edge = dataset.get_edge_split()
    data = dataset[0]

    product_idx = (data.x[:, 0] == 0).nonzero().flatten()
    user_idx = (data.x[:, 0] == 1).nonzero().flatten()

    user_emb = torch.nn.Parameter(
        torch.Tensor(user_idx.size(0), args.hidden_channels).to(device))

    if args.use_node_features:
        product_x = data.x[product_idx, 1:].to(device)
    else:
        product_x = None

    product_emb = torch.nn.Parameter(
        torch.Tensor(product_idx.size(0), args.hidden_channels).to(device))

    for split in ['train', 'valid', 'test']:
        edge = splitted_edge[f'{split}_edge'].t()
        edge[1] -= product_emb.size(0)
        splitted_edge[f'{split}_edge'] = edge

    predictor = LinkPredictor(
        args.hidden_channels + (300 if product_x is not None else 0), 1,
        args.dropout).to(device)

    evaluator = Evaluator(name=f'ogbl-reviews-{args.suffix}')
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        predictor.reset_parameters()
        glorot(user_emb)
        glorot(product_emb)

        optimizer = torch.optim.Adam([user_emb, product_emb] +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(predictor, product_emb, user_emb, product_x,
                         splitted_edge, optimizer, args.batch_size)
            result = test(predictor, product_emb, user_emb, product_x,
                          splitted_edge, evaluator, args.batch_size)
            logger.add_result(run, result)

            if epoch % args.log_steps == 0:
                train_rmse, valid_rmse, test_rmse = result
                print(f'Run: {run + 1:02d}, '
                      f'Epoch: {epoch:02d}, '
                      f'Loss: {loss:.4f}, '
                      f'Train: {train_rmse:.4f}, '
                      f'Valid: {valid_rmse:.4f}, '
                      f'Test: {test_rmse:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
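
Note: glorot in Example #29 is assumed to be torch_geometric.nn.inits.glorot, which applies Glorot/Xavier-uniform initialization in place:

import torch
from torch_geometric.nn.inits import glorot

user_emb = torch.nn.Parameter(torch.empty(10, 64))
glorot(user_emb)   # roughly equivalent to torch.nn.init.xavier_uniform_(user_emb)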
Example #30
def train_model(
    train_graph: pyg.torch_geometric.data.Data,
    valid_graph: pyg.torch_geometric.data.Data,
    train_dl: data.DataLoader,
    dev_dl: data.DataLoader,
    evaluator: Evaluator,
    model: nn.Module,
    optimizer: optim.Optimizer,
    lr_scheduler: optim.lr_scheduler._LRScheduler,
    args: argparse.Namespace,
) -> nn.Module:

    device = model_utils.get_device()
    loss_fn = nn.functional.binary_cross_entropy
    val_loss_fn = nn.functional.binary_cross_entropy
    best_val_loss = torch.tensor(float('inf'))
    best_val_hits = torch.tensor(0.0)
    saved_checkpoints = []
    writer = SummaryWriter(log_dir=f'{args.log_dir}/{args.experiment}')

    for e in range(1, args.train_epochs + 1):
        print(f'Training epoch {e}...')

        # Training portion
        torch.cuda.empty_cache()
        torch.set_grad_enabled(True)
        with tqdm(total=args.train_batch_size * len(train_dl)) as progress_bar:
            model.train()

            # Load graph into GPU
            adj_t = train_graph.adj_t.to(device)
            edge_index = train_graph.edge_index.to(device)
            x = train_graph.x.to(device)

            pos_pred = []
            neg_pred = []

            for i, (y_pos_edges,) in enumerate(train_dl):
                y_pos_edges = y_pos_edges.to(device).T
                y_neg_edges = negative_sampling(
                    edge_index,
                    num_nodes=train_graph.num_nodes,
                    num_neg_samples=y_pos_edges.shape[1]
                ).to(device)
                y_batch = torch.cat([torch.ones(y_pos_edges.shape[1]), torch.zeros(
                    y_neg_edges.shape[1])], dim=0).to(device)  # Ground truth edge labels (1 or 0)

                # Forward pass on model
                optimizer.zero_grad()
                y_pred = model(adj_t, torch.cat(
                    [y_pos_edges, y_neg_edges], dim=1))
                loss = loss_fn(y_pred, y_batch)

                # Backward pass and optimization
                loss.backward()
                optimizer.step()
                if args.use_scheduler:
                    lr_scheduler.step(loss)

                batch_acc = torch.mean(
                    1 - torch.abs(y_batch.detach() - torch.round(y_pred.detach()))).item()

                pos_pred += [y_pred[y_batch == 1].detach()]
                neg_pred += [y_pred[y_batch == 0].detach()]

                progress_bar.update(y_pos_edges.shape[1])
                progress_bar.set_postfix(loss=loss.item(), acc=batch_acc)
                writer.add_scalar(
                    "train/Loss", loss, ((e - 1) * len(train_dl) + i) * args.train_batch_size)
                writer.add_scalar("train/Accuracy", batch_acc,
                                  ((e - 1) * len(train_dl) + i) * args.train_batch_size)

                del y_pos_edges
                del y_neg_edges
                del y_pred
                del loss

            del adj_t
            del edge_index
            del x

            # Training set evaluation Hits@K Metrics
            pos_pred = torch.cat(pos_pred, dim=0)
            neg_pred = torch.cat(neg_pred, dim=0)
            results = {}
            for K in [10, 20, 30]:
                evaluator.K = K
                hits = evaluator.eval({
                    'y_pred_pos': pos_pred,
                    'y_pred_neg': neg_pred,
                })[f'hits@{K}']
                results[f'Hits@{K}'] = hits
            print()
            print('Train Statistics')
            print('*' * 30)
            for k, v in results.items():
                print(f'{k}: {v}')
                writer.add_scalar(
                    f"train/{k}", v, (pos_pred.shape[0] + neg_pred.shape[0]) * e)
            print('*' * 30)

            del pos_pred
            del neg_pred

        # Validation portion
        torch.cuda.empty_cache()
        torch.set_grad_enabled(False)
        with tqdm(total=args.val_batch_size * len(dev_dl)) as progress_bar:
            model.eval()

            adj_t = valid_graph.adj_t.to(device)
            edge_index = valid_graph.edge_index.to(device)
            x = valid_graph.x.to(device)

            val_loss = 0.0
            accuracy = 0
            num_samples_processed = 0
            pos_pred = []
            neg_pred = []
            for i, (edges_batch, y_batch) in enumerate(dev_dl):
                edges_batch = edges_batch.T.to(device)
                y_batch = y_batch.to(device)

                # Forward pass on model in validation environment
                y_pred = model(adj_t, edges_batch)
                loss = val_loss_fn(y_pred, y_batch)

                num_samples_processed += edges_batch.shape[1]
                batch_acc = torch.mean(
                    1 - torch.abs(y_batch - torch.round(y_pred))).item()
                accuracy += batch_acc * edges_batch.shape[1]
                val_loss += loss.item() * edges_batch.shape[1]

                pos_pred += [y_pred[y_batch == 1].detach()]
                neg_pred += [y_pred[y_batch == 0].detach()]

                progress_bar.update(edges_batch.shape[1])
                progress_bar.set_postfix(
                    val_loss=val_loss / num_samples_processed,
                    acc=accuracy/num_samples_processed)
                writer.add_scalar(
                    "Val/Loss", loss, ((e - 1) * len(dev_dl) + i) * args.val_batch_size)
                writer.add_scalar(
                    "Val/Accuracy", batch_acc, ((e - 1) * len(dev_dl) + i) * args.val_batch_size)

                del edges_batch
                del y_batch
                del y_pred
                del loss

            del adj_t
            del edge_index
            del x

            # Validation evaluation Hits@K Metrics
            pos_pred = torch.cat(pos_pred, dim=0)
            neg_pred = torch.cat(neg_pred, dim=0)
            results = {}
            for K in [10, 20, 30]:
                evaluator.K = K
                hits = evaluator.eval({
                    'y_pred_pos': pos_pred,
                    'y_pred_neg': neg_pred,
                })[f'hits@{K}']
                results[f'Hits@{K}'] = hits
            print()
            print('Validation Statistics')
            print('*' * 30)
            for k, v in results.items():
                print(f'{k}: {v}')
                writer.add_scalar(
                    f"Val/{k}", v, (pos_pred.shape[0] + neg_pred.shape[0]) * e)
            print('*' * 30)

            del pos_pred
            del neg_pred

            # Save model if it's the best one yet.
            if results['Hits@20'] > best_val_hits:
                best_val_hits = results['Hits@20']
                filename = f'{args.save_path}/{args.experiment}/{model.__class__.__name__}_best_val.checkpoint'
                model_utils.save_model(model, filename)
                print('Model saved!')
                print(f'Best validation Hits@20 yet: {best_val_hits}')
            # Save model on checkpoints.
            if e % args.checkpoint_freq == 0:
                filename = f'{args.save_path}/{args.experiment}/{model.__class__.__name__}_epoch_{e}.checkpoint'
                model_utils.save_model(model, filename)
                print('Model checkpoint reached!')
                saved_checkpoints.append(filename)
                # Delete checkpoints if there are too many
                while len(saved_checkpoints) > args.num_checkpoints:
                    os.remove(saved_checkpoints.pop(0))

    return model
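
For reference, a hedged sketch of the negative_sampling helper used in the training loop above (torch_geometric.utils.negative_sampling): it draws node pairs that do not appear in edge_index.

import torch
from torch_geometric.utils import negative_sampling

edge_index = torch.tensor([[0, 1, 2], [1, 2, 0]])
neg_edge_index = negative_sampling(edge_index, num_nodes=4,
                                   num_neg_samples=3)
# neg_edge_index: [2, 3] LongTensor of pairs absent from edge_index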