Example #1
import torch as th


def load_ogbl(name,
              device=th.device('cpu'),
              root='/home/eva_share_users/zhuyu'):
    from ogb.linkproppred import DglLinkPropPredDataset

    print('load', name)
    data = DglLinkPropPredDataset(name=name, root=root)
    print('finish loading', name)
    splitted_idx = data.get_edge_split()
    if name == 'ogbl-citation2':
        # citation2 stores its splits as node-ID tensors; repack them into
        # (num_edges, 2) edge tensors to match the other ogbl datasets.
        splitted_idx['train']['edge'] = th.cat(
            (splitted_idx['train']['source_node'].unsqueeze(1),
             splitted_idx['train']['target_node'].unsqueeze(1)),
            dim=1)
        splitted_idx['valid']['edge'] = th.cat(
            (splitted_idx['valid']['source_node'].unsqueeze(1),
             splitted_idx['valid']['target_node'].unsqueeze(1)),
            dim=1)
        # target_node_neg has shape (N, 1000); flattening it row-major means
        # each source must be repeated 1000 times consecutively, so use
        # repeat_interleave (tiling with .repeat would mis-pair sources with
        # another source's negatives).
        splitted_idx['valid']['neg_edge'] = th.cat(
            (splitted_idx['valid']['source_node'].repeat_interleave(1000).unsqueeze(1),
             splitted_idx['valid']['target_node_neg'].view(-1).unsqueeze(1)),
            dim=1)
        splitted_idx['test']['edge'] = th.cat(
            (splitted_idx['test']['source_node'].unsqueeze(1),
             splitted_idx['test']['target_node'].unsqueeze(1)),
            dim=1)
        splitted_idx['test']['neg_edge'] = th.cat(
            (splitted_idx['test']['source_node'].repeat_interleave(1000).unsqueeze(1),
             splitted_idx['test']['target_node_neg'].view(-1).unsqueeze(1)),
            dim=1)
    graph = data[0]
    return graph, splitted_idx
Example #2
    def __init__(self, name):
        start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')

        self.graph = self.dataset[0]  # single DGL graph

        # Create edge feat by concatenating weight and year
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1
        )

        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-collab')

        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time() - start))
Example #3
def load_ogb_dataset(dataset):
    """
    Load OGB dataset
    Args:
        dataset(str): name of dataset (ogbl-collab, ogbl-ddi, ogbl-citation)

    Returns:
        graph(DGLGraph): graph
        split_edge(dict): split edge

    """
    dataset = DglLinkPropPredDataset(name=dataset)
    split_edge = dataset.get_edge_split()
    graph = dataset[0]

    return graph, split_edge
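
# Hypothetical usage of the loader above; any ogbl-* link dataset name works:
#   graph, split_edge = load_ogb_dataset('ogbl-collab')
#   train_pos = split_edge['train']['edge']  # (num_train_edges, 2) tensor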
Example #4
class DDIDataset(Dataset):
    def __init__(self, name):
        start = time.time()
        print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-ddi')
        
        self.graph = self.dataset[0]  # single DGL graph

        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges
        
        self.evaluator = Evaluator(name='ogbl-ddi')
        
        print("[I] Finished loading.")
        print("[I] Data load time: {:.4f}s".format(time.time()-start))

    def _add_positional_encodings(self, pos_enc_dim):
        
        # Graph positional encoding via Laplacian eigenvectors
        self.graph = positional_encoding(self.graph, pos_enc_dim)
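
# `positional_encoding` is not shown in the source; a minimal sketch, assuming
# the standard Laplacian-PE recipe: take the first pos_enc_dim non-trivial
# eigenvectors of the symmetrically normalized graph Laplacian as node features.
import numpy as np
import torch
from scipy import sparse as sp
from scipy.sparse import linalg as spla

def positional_encoding(g, pos_enc_dim):
    n = g.number_of_nodes()
    src, dst = g.edges()
    A = sp.coo_matrix((np.ones(len(src)), (src.numpy(), dst.numpy())),
                      shape=(n, n)).astype(float)
    deg = np.asarray(A.sum(axis=1)).clip(1).flatten()
    N = sp.diags(deg ** -0.5)
    L = sp.eye(n) - N @ A @ N  # symmetrically normalized Laplacian
    # smallest-real-part eigenpairs; drop the trivial constant eigenvector
    EigVal, EigVec = spla.eigs(L, k=pos_enc_dim + 1, which='SR', tol=1e-2)
    EigVec = EigVec[:, EigVal.argsort()]
    g.ndata['pos_enc'] = torch.from_numpy(
        EigVec[:, 1:pos_enc_dim + 1].real).float()
    return g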
Example #5
def prepare_train_labels() -> Tuple[dgl.DGLHeteroGraph, Tensor, Tensor, Tensor]:
    dataset = DglLinkPropPredDataset(name="ogbl-collab")
    split_edge = dataset.get_edge_split()
    train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
    graph: dgl.DGLGraph = dataset[0]

    train_src_nodes = torch.cat([train_edge["edge"][:, 0], train_edge["edge"][:, 1]], dim=0)
    train_dst_nodes = torch.cat([train_edge["edge"][:, 1], train_edge["edge"][:, 0]], dim=0)
    train_graph = dgl.graph(data=(train_src_nodes, train_dst_nodes), num_nodes=graph.number_of_nodes())

    train_graph.ndata["feat"] = graph.ndata["feat"]

    train_labels = train_edge["edge"]
    train_labels = torch.cat([train_labels, torch.ones(len(train_labels), 1).long()], dim=1)
    valid_labels = get_label_from_split(valid_edge)
    test_labels = get_label_from_split(test_edge)
    return train_graph, train_labels, valid_labels, test_labels
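
# `get_label_from_split` is not shown in the source; a plausible sketch that
# mirrors the train-label construction above: rows are (src, dst, label) with
# label 1 for positive edges and 0 for the provided negatives.
import torch

def get_label_from_split(edge_split):
    pos = torch.cat([edge_split["edge"],
                     torch.ones(len(edge_split["edge"]), 1).long()], dim=1)
    neg = torch.cat([edge_split["edge_neg"],
                     torch.zeros(len(edge_split["edge_neg"]), 1).long()], dim=1)
    return torch.cat([pos, neg], dim=0)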
Example #6
def prepare_ogb(name):
    dataset = DglLinkPropPredDataset(name)

    split_edge = dataset.get_edge_split()
    train_edge, valid_edge, test_edge = split_edge["train"], split_edge[
        "valid"], split_edge["test"]
    g = dataset[0]  # dgl graph object containing only training edges

    train_data = sample_data(1500000, train_edge, sampling=True).numpy()
    valid_data = sample_data(50000, valid_edge)
    test_data = sample_data(50000, test_edge)

    num_nodes = g.number_of_nodes()
    num_rels = len(torch.unique(train_edge['relation']))
    del g

    return train_data, valid_data, test_data, num_nodes, num_rels
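
# `sample_data` is not shown in the source; a plausible sketch, assuming the
# knowledge-graph-style splits (e.g. ogbl-biokg) whose dicts carry 'head',
# 'relation' and 'tail' index tensors. It stacks them into (h, r, t) triples
# and optionally subsamples a fixed number of them.
import torch

def sample_data(num, edge_dict, sampling=False):
    triples = torch.stack(
        [edge_dict['head'], edge_dict['relation'], edge_dict['tail']], dim=1)
    if sampling and len(triples) > num:
        perm = torch.randperm(len(triples))[:num]
        triples = triples[perm]
    return triples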
Example #7
def load_OGB(dataset):
    if dataset == 'mag':
        dataset = DglNodePropPredDataset(name='ogbn-mag')
        return dataset
        # split_idx = dataset.get_idx_split()
        # train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
        # graph, label = dataset[0]  # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
    elif dataset in ['biokg', 'wikikg']:
        d_name = 'ogbl-' + dataset
        dataset = DglLinkPropPredDataset(name=d_name)

        split_edge = dataset.get_edge_split()
        train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        graph = dataset[0]  # dgl graph object containing only training edges
        return graph, split_edge
Example #8
    os.path.join(log_dir, "%s.log" % (dataset_name + "_" + setting_name))
])
snapshot_dir = os.path.join(log_dir, "snapshot")
if not os.path.isdir(snapshot_dir):
    os.makedirs(snapshot_dir)
print("Process Id:", os.getpid())
print(os.path.join(log_dir, sys.argv[0]))
print(args)
shutil.copyfile(__file__, os.path.join(log_dir, "train.py"))
shutil.copyfile(model_file + ".py", os.path.join(log_dir, model_file + ".py"))

evaluator = Evaluator(name="ogbl-citation")
print(evaluator.expected_input_format)
print(evaluator.expected_output_format)

dataset = DglLinkPropPredDataset(name="ogbl-citation")
split_edge = dataset.get_edge_split()
num_worker = 16
train_edge, valid_edge, test_edge = split_edge["train"], split_edge[
    "valid"], split_edge["test"]
graph = dataset[0]
origin_graph = copy.deepcopy(graph)
graph.readonly(False)
graph.add_edges(graph.edges()[1], graph.edges()[0])
graph.add_edges(
    torch.arange(0, graph.number_of_nodes()).long(),
    torch.arange(0, graph.number_of_nodes()).long())
graph.edata["etype"] = torch.cat([
    torch.ones(
        (graph.number_of_edges() - graph.number_of_nodes()) // 2).long(),
    (torch.ones(
Example #9
def main():
    parser = argparse.ArgumentParser(
        description='Link prediction (Cluster-GCN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--dataset', type=str, default='ogbl-citation')
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--num_partitions', type=int, default=15000)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--num_layers', type=int, default=3)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--batch_size', type=int, default=256)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--eval_steps', type=int, default=10)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--negs', type=int, default=1)
    parser.add_argument('--gnn_type', type=str, default='gcn')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglLinkPropPredDataset(name=args.dataset)
    split_edge = dataset.get_edge_split()  # used by test() during evaluation

    # Manually add self-loops, since GCN would otherwise wash out the features of isolated nodes.
    n_nodes = dataset[0].number_of_nodes()
    g_data = dgl.add_self_loop(dataset[0])
    g_data = dgl.to_bidirected(g_data)

    for k in dataset[0].node_attr_schemes().keys():
        g_data.ndata[k] = dataset[0].ndata[k]
    print(g_data.number_of_nodes(), g_data.number_of_edges())

    g_data.create_formats_()

    cluster_dataset = ClusterIterDataset(args.dataset,
                                         g_data,
                                         args.num_partitions,
                                         use_pp=False)
    cluster_iterator = DataLoader(cluster_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=args.num_workers,
                                  collate_fn=partial(subgraph_collate_fn,
                                                     g_data,
                                                     negs=args.negs))

    model = GCN(g_data.ndata['feat'].size(-1),
                args.hidden_channels,
                args.hidden_channels,
                args.num_layers,
                args.dropout,
                gnn_type=args.gnn_type).to(device)
    predictor = LinkPredictor(args.hidden_channels, args.hidden_channels, 1,
                              args.num_layers, args.dropout).to(device)

    evaluator = Evaluator(name=args.dataset)
    logger = Logger(args.runs, args)

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0
        for epoch in range(1, 1 + args.epochs):
            loss, c_epoch_time, c_to_device_time, c_ff_time, c_pred_loss_time, c_bp_time, c_io_time, c_memory, c_part1 = train(
                model, predictor, cluster_iterator, optimizer, device)
            print(f'Run: {run + 1:02d}, Epoch: {epoch:02d}, Loss: {loss:.4f}')
            epoch_time += c_epoch_time
            to_device_time += c_to_device_time
            ff_time += c_ff_time
            pred_loss_time += c_pred_loss_time
            bp_time += c_bp_time
            io_time += c_io_time
            part_1 += c_part1
            memory = max(memory, c_memory[0])

            if epoch % args.eval_steps == 0:
                print('Ave')
                print('epoch time: ', epoch_time / args.eval_steps)
                print('to_device_time: ', to_device_time / args.eval_steps)
                print('ff_time: ', ff_time / args.eval_steps)
                print('part1_time: ', part_1 / args.eval_steps)
                print('pred_loss_time: ', pred_loss_time / args.eval_steps)
                print('bp_time: ', bp_time / args.eval_steps)
                print('io_time: ', io_time / args.eval_steps)
                print('max memory', memory)
                print('\n')
                epoch_time, to_device_time, ff_time, pred_loss_time, bp_time, io_time, memory, part_1 = 0, 0, 0, 0, 0, 0, 0, 0

                result = test(model, predictor, g_data, split_edge, evaluator,
                              64 * 4 * args.batch_size, device)
                logger.add_result(run, result)

                if epoch % args.log_steps == 0:
                    train_mrr, valid_mrr, test_mrr = result
                    print(f'Run: {run + 1:02d}, '
                          f'Epoch: {epoch:02d}, '
                          f'Loss: {loss:.4f}, '
                          f'Train: {train_mrr:.4f}, '
                          f'Valid: {valid_mrr:.4f}, '
                          f'Test: {test_mrr:.4f}')

        logger.print_statistics(run)
    logger.print_statistics()
Example #10
def load_from_ogbl_with_name(name):
    choices = ['ogbl-collab', 'ogbl-ddi', 'ogbl-ppa', 'ogbl-citation']
    assert name in choices, "name must be selected from " + str(choices)
    dataset = DglLinkPropPredDataset(name)
    return dataset[0]
Example #11
eids = np.arange(graph.number_of_edges())
eids = np.random.permutation(eids)
test_pos_u, test_pos_v = u[eids[:50]], v[eids[:50]]
train_pos_u, train_pos_v = u[eids[50:]], v[eids[50:]]

# Find all negative edges and split them for training and testing
adj = sp.coo_matrix((np.ones(len(u)), (u, v)))
adj_neg = 1 - adj.todense() - np.eye(34)
neg_u, neg_v = np.where(adj_neg != 0)
neg_eids = np.random.choice(len(neg_u), 200)
test_neg_u, test_neg_v = neg_u[neg_eids[:50]], neg_v[neg_eids[:50]]
train_neg_u, train_neg_v = neg_u[neg_eids[50:]], neg_v[neg_eids[50:]]

# Code from OGB instructions
# You may need to use the DGL backend; otherwise PyTorch will be used.
dataset = DglLinkPropPredDataset(name='ogbl-collab')
graph = dataset[0]

split_edge = dataset.get_edge_split()
train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], \
                                    split_edge["test"]
"""
Building models
I stack different layers together, here is the full list of supported layers in Tensorflow https://docs.dgl.ai/api/python/nn.tensorflow.html, 
PyTorch https://docs.dgl.ai/api/python/nn.pytorch.html, and MXNet https://docs.dgl.ai/api/python/nn.mxnet.html backends

Below I'm using GraphSage (SAmple and aggreGatE) -- this aggregate feature information from a node's local neighborhood 
(e.g., the degrees or text attributes of nearby nodes)

Paper: https://arxiv.org/pdf/1706.02216.pdf
Blog: https://towardsdatascience.com/an-intuitive-explanation-of-graphsage-6df9437ee64f
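
# A minimal sketch of the GraphSAGE model described above, assuming the DGL
# PyTorch backend; `SAGEConv` comes from dgl.nn.pytorch and the sizes are illustrative.
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import SAGEConv


class GraphSAGE(nn.Module):
    def __init__(self, in_feats, hidden_feats):
        super().__init__()
        # two layers of 'mean' aggregation over each node's neighborhood
        self.conv1 = SAGEConv(in_feats, hidden_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(hidden_feats, hidden_feats, aggregator_type='mean')

    def forward(self, g, x):
        h = F.relu(self.conv1(g, x))
        return self.conv2(g, h)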
Example #12
class COLLABDataset(Dataset):
    def __init__(self, name, norm='none', verbose=True):
        start = time.time()
        if verbose:
            print("[I] Loading dataset %s..." % (name))
        self.name = name
        self.dataset = DglLinkPropPredDataset(name='ogbl-collab')

        self.graph = self.dataset[0]  # single DGL graph
        #self._add_positional_encodings(10, norm)
        self._add_eig(norm=norm, number=6)

        # Create edge feat by concatenating weight and year
        self.graph.edata['feat'] = torch.cat(
            [self.graph.edata['edge_weight'], self.graph.edata['edge_year']],
            dim=1
        )

        self.split_edge = self.dataset.get_edge_split()
        self.train_edges = self.split_edge['train']['edge']  # positive train edges
        self.val_edges = self.split_edge['valid']['edge']  # positive val edges
        self.val_edges_neg = self.split_edge['valid']['edge_neg']  # negative val edges
        self.test_edges = self.split_edge['test']['edge']  # positive test edges
        self.test_edges_neg = self.split_edge['test']['edge_neg']  # negative test edges

        self.evaluator = Evaluator(name='ogbl-collab')
        if verbose:
            print("[I] Finished loading.")
            print("[I] Data load time: {:.4f}s".format(time.time() - start))

    def _add_positional_encodings(self, pos_enc_dim, norm):
        # Graph positional encoding via Laplacian eigenvectors
        self.graph = positional_encoding(self.graph, pos_enc_dim, norm)

    def _add_eig(self, norm='none', number=6):

        dataset = LinkPropPredDataset(name='ogbl-collab')
        graph = dataset[0]
        G = nx.Graph()
        # ogbl-collab has 235,868 nodes
        G.add_nodes_from(range(235868))

        for nod1, nod2 in zip(graph['edge_index'][0], graph['edge_index'][1]):
            G.add_edge(nod1, nod2)

        components = list(nx.connected_components(G))
        list_G = []
        list_nodes = []

        for component in components:
            G_new = nx.Graph()
            G_new.add_nodes_from(list(component))
            list_G.append(G_new)
            list_nodes.append(list(component))
        for i in range(len(list_G)):
            for nod1, nod2 in list(G.edges(list_nodes[i])):
                list_G[i].add_edge(nod1, nod2)

        EigVec_global = np.ones((235868, number))
        for g in list_G:
            node_list = list(g.nodes)
            A = nx.adjacency_matrix(g, nodelist=node_list).astype(float)
            if norm == 'none':
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D - A
            elif norm == 'sym':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-0.5), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A) * D_norm
            elif norm == 'walk':
                D_norm = sp.diags(list(map(lambda x: x[1]**(-1), g.degree())))
                D = sp.diags(list(map(lambda x: x[1], g.degree())))
                L = D_norm * (D - A)

            if len(node_list) > 2:
                EigVal, EigVec = sp.linalg.eigs(L, k=min(len(node_list) - 2, number), which='SR', tol=0)
                EigVec = EigVec[:, EigVal.argsort()] / np.max(EigVec[:, EigVal.argsort()], 0)
                EigVec_global[node_list, : min(len(node_list) - 2, number)] = EigVec[:, :]
            elif len(node_list) == 2:
                EigVec_global[node_list[0], :number] = np.zeros((1, number))
        self.graph.ndata['eig'] = torch.from_numpy(EigVec_global).float()
        print(sorted(self.graph.ndata['eig'][1]))
Example #13
def test_datasetsaver():
    # Smoke-test DatasetSaver: `test_task` selects which task category
    # (graph / node / link / heteronode / heterolink) to exercise.
    test_task = 'link'

    # testing all the dataset objects are working.
    if test_task == 'graph':
        from ogb.graphproppred import PygGraphPropPredDataset, DglGraphPropPredDataset, GraphPropPredDataset
        dataset_name = 'ogbg-molhiv'
        dataset = PygGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglGraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = GraphPropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'node':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-arxiv'  # test ogbn-proteins
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'link':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-collab'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    elif test_task == 'heteronode':
        from ogb.nodeproppred import NodePropPredDataset, PygNodePropPredDataset, DglNodePropPredDataset
        dataset_name = 'ogbn-mag'
        dataset = PygNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = DglNodePropPredDataset(dataset_name)
        dataset.get_idx_split()
        dataset = NodePropPredDataset(dataset_name)
        dataset.get_idx_split()
    elif test_task == 'heterolink':
        from ogb.linkproppred import LinkPropPredDataset, PygLinkPropPredDataset, DglLinkPropPredDataset
        dataset_name = 'ogbl-biokg'
        dataset = PygLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = DglLinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
        dataset = LinkPropPredDataset(dataset_name)
        dataset.get_edge_split()
    else:
        raise ValueError('Invalid task category')

    print(dataset[0])
    if 'link' in test_task:
        print(dataset.get_edge_split())
    else:
        print(dataset.get_idx_split())

    if 'graph' in test_task:
        graph_list = dataset.graphs
    else:
        graph_list = [dataset.graph]

    if 'link' not in test_task:
        labels = dataset.labels

    is_hetero = 'hetero' in test_task
    version = 2 if dataset_name == 'ogbn-mag' else 1
    saver = DatasetSaver(dataset_name, is_hetero, version=version)

    # saving graph objects
    saver.save_graph_list(graph_list)
    # saving target labels
    if 'link' not in test_task:
        saver.save_target_labels(labels)
    # saving split
    if 'link' in test_task:
        split_idx = dataset.get_edge_split()
    else:
        split_idx = dataset.get_idx_split()
    # second argument must be the name of the split
    saver.save_split(split_idx, dataset.meta_info['split'])
    # copying mapping dir
    # saver.copy_mapping_dir(f"dataset/{'_'.join(dataset_name.split('-'))}/mapping/")
    saver.copy_mapping_dir("dataset/{}/mapping/".format('_'.join(
        dataset_name.split('-'))))

    saver.save_task_info(
        dataset.task_type, dataset.eval_metric,
        dataset.num_classes if hasattr(dataset, 'num_classes') else None)

    meta_dict = saver.get_meta_dict()

    print(meta_dict)

    print('Now testing.')

    if 'graph' in test_task:
        print('library agnostic')
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = GraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglGraphPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
    elif 'node' in test_task:
        print('library agnostic')
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = NodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('Pytorch Geometric')
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())
        print('DGL')
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglNodePropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        print(dataset.get_idx_split())

    elif 'link' in test_task:
        print('library agnostic')
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = LinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('Pytorch Geometric')
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = PygLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
        print('DGL')
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        dataset = DglLinkPropPredDataset(dataset_name, meta_dict=meta_dict)
        print(dataset[0])
        # print(dataset.get_edge_split())
    else:
        raise ValueError('Invalid task category')

    # zip
    saver.zip()
    print('Finished zipping!')

    saver.cleanup()
Example #14
def main():
    parser = argparse.ArgumentParser(description='OGBL-COLLAB (GNN)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--log_steps', type=int, default=1)
    parser.add_argument('--eval_steps', type=int, default=1)
    parser.add_argument('--runs', type=int, default=1)
    
    parser.add_argument('--gnn_type', type=str, default='gated-gcn')
    parser.add_argument('--num_layer', type=int, default=3)
    parser.add_argument('--emb_dim', type=int, default=64)
    parser.add_argument('--dropout', type=float, default=0.0)
    
    parser.add_argument('--batch_size', type=int, default=32*1024)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--epochs', type=int, default=200)
    
    args = parser.parse_args()
    print(args)
    
    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)
    print(device)

    dataset = DglLinkPropPredDataset(name='ogbl-collab')
    split_edge = dataset.get_edge_split()
    data = dataset[0]
    print(data)

    model = GNN(gnn_type=args.gnn_type, emb_dim=args.emb_dim, num_layer=args.num_layer, dropout=args.dropout).to(device)
    print(model)
    total_param = 0
    for param in model.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Model parameters: {total_param}')

    predictor = LinkPredictor(emb_dim=args.emb_dim).to(device)
    print(predictor)
    total_param = 0
    for param in predictor.parameters():
        total_param += np.prod(list(param.data.size()))
    print(f'Predictor parameters: {total_param}')

    evaluator = Evaluator(name='ogbl-collab')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }
    
    tb_logger = SummaryWriter(
        os.path.join(
            "logs", 
            f"{args.gnn_type}-L{args.num_layer}-h{args.emb_dim}-d{args.dropout}-LR{args.lr}", 
            time.strftime("%Y%m%dT%H%M%S")
        )
    )
    
    for run in range(args.runs):
        assert args.runs == 1
        # model.reset_parameters()

        optimizer = torch.optim.Adam(list(model.parameters()) + list(predictor.parameters()), lr=args.lr)
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=5, min_lr=1e-5, verbose=True)

        for epoch in range(1, 1 + args.epochs):

            loss = train(model, predictor, data, split_edge, optimizer, args.batch_size, device)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, split_edge, evaluator, args.batch_size, device)

                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    tb_logger.add_scalar('loss', loss, epoch)
                    tb_logger.add_scalar('lr', optimizer.param_groups[0]['lr'], epoch)
                    
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

                        tb_logger.add_scalar(f'{key}/train_hits', 100 * train_hits, epoch)
                        tb_logger.add_scalar(f'{key}/valid_hits', 100 * valid_hits, epoch)
                        tb_logger.add_scalar(f'{key}/test_hits', 100 * test_hits, epoch)     
                        
                    print('---')
                
                scheduler.step(100 * results["Hits@10"][1])
            
            if optimizer.param_groups[0]['lr'] < 1e-5:
                break

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)
Example #15

def evaluate(model, edge_split, device, num_workers):
    with torch.no_grad():
        node_emb = model.inference(graph, device, 4096, num_workers, 'cpu')
        results = []
        for split in ['valid', 'test']:
            src = edge_split[split]['source_node'].to(device)
            dst = edge_split[split]['target_node'].to(device)
            neg_dst = edge_split[split]['target_node_neg'].to(device)
            results.append(
                compute_mrr(model, node_emb, src, dst, neg_dst, device))
    return results
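
# `compute_mrr` is not shown in the source; a minimal sketch, assuming the
# model exposes an MLP scoring head as `model.predictor` (as in the DGL
# GraphSAGE link-prediction example). Column 0 of each candidate row is the
# positive target; the reciprocal of its rank gives the MRR contribution.
import torch

def compute_mrr(model, node_emb, src, dst, neg_dst, device, batch_size=500):
    rr = torch.zeros(src.shape[0])
    for start in range(0, src.shape[0], batch_size):
        end = min(start + batch_size, src.shape[0])
        all_dst = torch.cat([dst[start:end, None], neg_dst[start:end]], dim=1)
        h_src = node_emb[src[start:end]][:, None, :].to(device)
        h_dst = node_emb[all_dst.view(-1)].view(*all_dst.shape, -1).to(device)
        pred = model.predictor(h_src * h_dst).squeeze(-1)  # (batch, 1 + num_negs)
        rank = (pred >= pred[:, :1]).sum(dim=1)  # 1-based rank of the positive
        rr[start:end] = (1.0 / rank.float()).cpu()
    return rr.mean()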


dataset = DglLinkPropPredDataset('ogbl-citation2')
graph = dataset[0]
graph, reverse_eids = to_bidirected_with_reverse_mapping(graph)
graph = graph.to('cuda' if args.pure_gpu else 'cpu')
reverse_eids = reverse_eids.to(device)
seed_edges = torch.arange(graph.num_edges()).to(device)
edge_split = dataset.get_edge_split()

model = SAGE(graph.ndata['feat'].shape[1], 256).to(device)
opt = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)

sampler = dgl.dataloading.NeighborSampler([15, 10, 5],
                                          prefetch_node_feats=['feat'])
sampler = dgl.dataloading.as_edge_prediction_sampler(
    sampler,
    exclude='reverse_id',
Example #16
# -*- coding: utf-8 -*-
# @Time    : 2020-10-20 15:39
# @Author  : xiaorui su
# @Email   :  [email protected]
# @File    : load_kg.py
# @Software : PyCharm

d_name = "ogbl-biokg"
import dgl
import numpy as np
import torch as th
from ogb.linkproppred import DglLinkPropPredDataset
import random

dataset = DglLinkPropPredDataset(name=d_name)
print(dataset[0])
split_edge = dataset.get_edge_split()
# print(split_edge)
# five kinds of nodes: drug, protein, function, disease, side-effect
train_edge, valid_edge, test_edge = split_edge["train"], split_edge[
    "valid"], split_edge["test"]
graph = dataset[0]  # dgl graph object containing only training edges

#head_type,head,relation,tail_type,tail
#ids: head,relation,tail


def construct_entity():
    ## construct entity2id.txt for biokg
    # 10687 disease entities
    disease_entity = "dataset/ogbl_biokg/mapping/disease_entidx2name.csv"


def main():
    parser = argparse.ArgumentParser(description='OGBL-PPA (Full-Batch)')
    parser.add_argument('--use_gpu',
                        action='store_true',
                        help='Use gpu for computation (default: False)')
    parser.add_argument(
        '--log_steps',
        type=int,
        default=1,
        help='Print training progress every {log_steps} epochs (default: 1)')
    parser.add_argument('--use_sage',
                        action='store_true',
                        help='Use GraphSAGE rather than GCN (default: False)')
    parser.add_argument(
        '--num_layers',
        type=int,
        default=3,
        help='Number of GNN layers to use as well as '
        'linear layers to use for final link prediction (default: 3)')
    parser.add_argument('--hidden_feats',
                        type=int,
                        default=256,
                        help='Size for hidden representations (default: 256)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.0,
                        help='Dropout (default: 0.0)')
    parser.add_argument(
        '--batch_size',
        type=int,
        default=64 * 1024,
        help='Batch size to use for link prediction (default: 64 * 1024)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        help='Learning rate (default: 0.01)')
    parser.add_argument('--epochs',
                        type=int,
                        default=20,
                        help='Number of epochs for training (default: 20)')
    parser.add_argument(
        '--eval_steps',
        type=int,
        default=1,
        help='Evaluate hits@100 every {eval_steps} epochs (default: 1)')
    parser.add_argument(
        '--runs',
        type=int,
        default=10,
        help='Number of random experiments to perform (default: 10)')
    args = parser.parse_args()
    print(args)

    if args.use_gpu and torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    dataset = DglLinkPropPredDataset(name='ogbl-ppa')
    # Get DGLGraph
    data = dataset[0]
    data.readonly(False)
    data.add_edges(data.nodes(), data.nodes())
    data = data.to(device)
    splitted_edge = dataset.get_edge_split()
    x = data.ndata['feat'].float().to(device)

    if args.use_sage:
        model = GraphSAGE(
            in_feats=x.size(-1),
            hidden_feats=[args.hidden_feats for _ in range(args.num_layers)],
            activation=[F.relu for _ in range(args.num_layers - 1)] + [None],
            dropout=[0] + [args.dropout
                           for _ in range(args.num_layers - 1)]).to(device)
    else:
        model = GCN(
            in_feats=x.size(-1),
            hidden_feats=[args.hidden_feats for _ in range(args.num_layers)],
            activation=[F.relu for _ in range(args.num_layers - 1)] + [None],
            residual=[False for _ in range(args.num_layers)],
            batchnorm=[False for _ in range(args.num_layers)],
            dropout=[args.dropout
                     for _ in range(args.num_layers - 1)] + [0]).to(device)

    predictor = HadamardLinkPredictor(in_feats=args.hidden_feats,
                                      hidden_feats=args.hidden_feats,
                                      num_layers=args.num_layers,
                                      n_tasks=1,
                                      dropout=args.dropout).to(device)

    evaluator = Evaluator(name='ogbl-ppa')
    loggers = {
        'Hits@10': Logger(args.runs, args),
        'Hits@50': Logger(args.runs, args),
        'Hits@100': Logger(args.runs, args),
    }

    for run in range(args.runs):
        model.reset_parameters()
        predictor.reset_parameters()
        optimizer = torch.optim.Adam(list(model.parameters()) +
                                     list(predictor.parameters()),
                                     lr=args.lr)

        for epoch in range(1, 1 + args.epochs):
            loss = train(model, predictor, data, x, splitted_edge, optimizer,
                         args.batch_size)

            if epoch % args.eval_steps == 0:
                results = test(model, predictor, data, x, splitted_edge,
                               evaluator, args.batch_size)
                for key, result in results.items():
                    loggers[key].add_result(run, result)

                if epoch % args.log_steps == 0:
                    for key, result in results.items():
                        train_hits, valid_hits, test_hits = result
                        print(key)
                        print(f'Run: {run + 1:02d}, '
                              f'Epoch: {epoch:02d}, '
                              f'Loss: {loss:.4f}, '
                              f'Train: {100 * train_hits:.2f}%, '
                              f'Valid: {100 * valid_hits:.2f}%, '
                              f'Test: {100 * test_hits:.2f}%')

        for key in loggers.keys():
            print(key)
            loggers[key].print_statistics(run)

    for key in loggers.keys():
        print(key)
        loggers[key].print_statistics()