示例#1
0
def load_ogb(name):
    """Load an OGB node-property-prediction dataset as a DGL graph.

    Args:
        name: OGB dataset name, e.g. 'ogbn-products'.

    Returns:
        (graph, num_labels): the graph with 'features', 'labels' and 0/1
        train/val/test masks stored in ``ndata``, and the number of
        distinct label values.
    """
    from ogb.nodeproppred import DglNodePropPredDataset

    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # labels arrive as (N, 1); flatten to (N,)

    graph.ndata['features'] = graph.ndata['feat']
    graph.ndata['labels'] = labels
    num_labels = len(th.unique(labels))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    train_mask[train_nid] = 1
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    val_mask[val_nid] = 1
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.int64)
    test_mask[test_nid] = 1
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    # Reuse the count computed above instead of re-running th.unique.
    return graph, num_labels
示例#2
0
文件: utils.py 项目: ziqiaomeng/dgl
def load_ogb_mag():
    """Load ogbn-mag as a DGL heterograph wrapped in an OGBDataset.

    Adds a reverse relation ('rev-<rel>') for every canonical edge type and
    attaches paper features, labels and boolean train/val/test masks.
    """
    name = 'ogbn-mag'
    from ogb.nodeproppred import DglNodePropPredDataset

    # Point './dataset' at the shared download location. Guard the symlink
    # creation so repeated calls do not raise FileExistsError (lexists also
    # catches a dangling link left from a previous run).
    link = os.path.join(os.getcwd(), 'dataset')
    if not os.path.lexists(link):
        os.symlink('/tmp/dataset/', link)

    print('load', name)
    dataset = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    split_idx = dataset.get_idx_split()
    train_idx = split_idx["train"]['paper']
    val_idx = split_idx["valid"]['paper']
    test_idx = split_idx["test"]['paper']
    hg_orig, labels = dataset[0]
    subgs = {}
    for etype in hg_orig.canonical_etypes:
        u, v = hg_orig.all_edges(etype=etype)
        subgs[etype] = (u, v)
        # Add the reversed relation so message passing can go both ways.
        subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
    hg = dgl.heterograph(subgs)
    hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels['paper'].squeeze()
    train_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    train_mask[train_idx] = True
    val_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    val_mask[val_idx] = True
    test_mask = torch.zeros((hg.number_of_nodes('paper'), ), dtype=torch.bool)
    test_mask[test_idx] = True
    hg.nodes['paper'].data['train_mask'] = train_mask
    hg.nodes['paper'].data['val_mask'] = val_mask
    hg.nodes['paper'].data['test_mask'] = test_mask

    num_classes = dataset.num_classes
    return OGBDataset(hg, num_classes, 'paper')
示例#3
0
    def __init__(self, dataset_name='ogbn-arxiv', k=5):
        """Load an OGB node dataset and precompute random-walk contexts.

        Args:
            dataset_name: OGB dataset name (default 'ogbn-arxiv').
            k: random-walk length used to build each node's context.
        """
        super(Ogbn, self).__init__()
        print("Loading dataset {}".format(dataset_name))
        self.dataset_name = dataset_name
        self.dataset_path = os.path.join('./data',
                                         dataset_name.replace('-', '_'))
        ogbn_dataset = DglNodePropPredDataset(dataset_name, root='./data')
        self.graph, self.label = ogbn_dataset[0]
        self.graph = self.graph.add_self_loop()
        self.label = self.label.flatten()

        self.split = ogbn_dataset.get_idx_split()

        self.length = self.graph.num_nodes()
        self.nodes = self.graph.nodes()
        self.edges = self.graph.edges()
        self.k = k
        print("Generate Context...")
        time1 = time.time()
        # One length-k random walk per node; squeeze drops the unit dim.
        self.context = np.squeeze(
            dgl.sampling.random_walk(self.graph, self.nodes, length=self.k)[0])
        time2 = time.time()
        print("Context shape: {}, time: {}s".format(self.context.shape,
                                                    time2 - time1))
        print("Loading WL...")
        self.path_WL = os.path.join(self.dataset_path, 'WL.pkl')
        self.WL = self.load_WL(self.path_WL)
        # max() with default=0 replaces the manual running-max loop and
        # preserves the original result for an empty WL mapping.
        max_wl = max(self.WL.values(), default=0)
        print("Max WL id: ", max_wl)
示例#4
0
def load_ogb_product(name):
    """Load an OGB node dataset and wrap it in an OGBDataset.

    Stores flattened labels under ``graph.ndata['label']`` and boolean
    train/val/test masks in ``graph.ndata``.
    """
    from ogb.nodeproppred import DglNodePropPredDataset

    # Point './dataset' at the shared download location; guard so repeated
    # calls do not raise FileExistsError (lexists also catches a dangling
    # link left behind by a previous run).
    link = os.path.join(os.getcwd(), 'dataset')
    if not os.path.lexists(link):
        os.symlink('/tmp/dataset/', link)

    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)

    graph.ndata['label'] = labels
    # Count classes over the labeled (non-NaN) nodes only.
    num_labels = len(
        torch.unique(labels[torch.logical_not(torch.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx[
        'valid'], splitted_idx['test']
    train_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    train_mask[train_nid] = True
    val_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    val_mask[val_nid] = True
    test_mask = torch.zeros((graph.number_of_nodes(), ), dtype=torch.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask

    return OGBDataset(graph, num_labels)
示例#5
0
def load_ogb(name):
    """Load an OGB node dataset; return (graph, num_labels).

    The graph carries 'features', 'labels' and boolean train/val/test
    masks in ``ndata``.
    """
    from ogb.nodeproppred import DglNodePropPredDataset

    data = DglNodePropPredDataset(name=name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)

    graph.ndata["features"] = graph.ndata["feat"]
    graph.ndata["labels"] = labels
    num_labels = len(th.unique(labels))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = (
        splitted_idx["train"],
        splitted_idx["valid"],
        splitted_idx["test"],
    )
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata["train_mask"] = train_mask
    graph.ndata["val_mask"] = val_mask
    graph.ndata["test_mask"] = test_mask
    # Reuse the class count computed above instead of a second th.unique.
    return graph, num_labels
def load_ogb(name):
    """Load an OGB node dataset with progress logging.

    Renames 'feat' to 'features' (dropping the original key to save
    memory), stores flattened labels, and adds boolean train/val/test
    masks.

    Returns:
        (graph, num_labels)
    """
    from ogb.nodeproppred import DglNodePropPredDataset

    print('load', name)
    data = DglNodePropPredDataset(name=name)
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    graph, labels = data[0]
    labels = labels[:, 0]  # (N, 1) -> (N,)

    graph.ndata['features'] = graph.ndata['feat']
    del graph.ndata['feat']
    graph.ndata['labels'] = labels
    # Count classes over labeled (non-NaN) nodes only.
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    print('finish constructing', name)
    return graph, num_labels
示例#7
0
def load_data(name, ogb_root, seed, device):
    """Load a node-classification dataset and its split.

    Returns (graph, labels, num_classes, train_idx, val_idx, test_idx),
    each moved to ``device``. Datasets without predefined masks get a
    random 20/30/50 split driven by ``seed``.
    """
    if name == 'ogbn-arxiv':
        data = DglNodePropPredDataset('ogbn-arxiv', ogb_root)
        g, labels = data[0]
        split = data.get_idx_split()
        return (g.to(device), labels.squeeze(dim=-1).to(device),
                data.num_classes, split['train'].to(device),
                split['valid'].to(device), split['test'].to(device))

    if name in ('cora', 'citeseer', 'pubmed'):
        data = load_citation_dataset(name)
    elif name == 'cora_full':
        data = gnn_benckmark.CoraFullDataset()
    elif name in ('cs', 'physics'):
        data = gnn_benckmark.Coauthor(name)
    elif name in ('photo', 'computers'):
        data = gnn_benckmark.AmazonCoBuy(name)
    else:
        raise ValueError('Unknown dataset:', name)

    g = data[0].to(device)
    # https://github.com/dmlc/dgl/issues/2479
    num_classes = data.num_classes
    if name in ('photo', 'computers'):
        num_classes = g.ndata['label'].max().item() + 1
    if 'train_mask' in g.ndata:
        train_idx, val_idx, test_idx = (
            g.ndata[key].nonzero(as_tuple=True)[0]
            for key in ('train_mask', 'val_mask', 'test_mask'))
    else:
        train_idx, val_idx, test_idx = split_idx(
            torch.arange(g.num_nodes()), 0.2, 0.3, seed)
    return (g, g.ndata['label'], num_classes, train_idx.to(device),
            val_idx.to(device), test_idx.to(device))
示例#8
0
 def __init__(self, batch_size: int):
     """Prepare ogbn-arxiv for mini-batch training with the given batch size."""
     super().__init__()
     dataset = DglNodePropPredDataset(name='ogbn-arxiv')
     graph, labels = dataset[0]
     graph.ndata["label"] = labels.squeeze()
     self.g = add_self_loop(graph)
     self.split_idx = dataset.get_idx_split()
     self.batch_size = batch_size
示例#9
0
文件: arxiv.py 项目: jcformanek/gnn
def load_dataset():
    """Return (graph, labels, split_idx) for ogbn-arxiv with self-loops added."""
    dataset = DglNodePropPredDataset(name='ogbn-arxiv')

    # there is only one graph in Node Property Prediction datasets
    graph, labels = dataset[0]
    graph = dgl.add_self_loop(graph)

    return graph, labels, dataset.get_idx_split()
示例#10
0
def main():
    """Train and evaluate a full-batch model on OGBN-Arxiv over several runs."""
    parser = argparse.ArgumentParser(description='OGBN-Arxiv (Full-Batch)')
    parser.add_argument('--device', type=int, default=0)
    parser.add_argument('--hidden_channels', type=int, default=256)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--lr', type=float, default=0.01)
    parser.add_argument('--wd', type=float, default=0)
    parser.add_argument('--epochs', type=int, default=500)
    parser.add_argument('--runs', type=int, default=10)
    parser.add_argument('--model', type=str, default='AFFN')
    args = parser.parse_args()
    print(args)

    device = f'cuda:{args.device}' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    dataset = DglNodePropPredDataset(name='ogbn-arxiv')

    split_idx = dataset.get_idx_split()
    graph, label = dataset[0]

    # Convert the directed graph to an undirected one by adding reverse edges.
    g = dgl.DGLGraph((graph.edges()[0], graph.edges()[1]))
    g.add_edges(graph.edges()[1], graph.edges()[0])

    x = graph.ndata['feat'].to(device)
    y_true = label.to(device)

    train_idx = split_idx['train'].to(device)

    model = Net(x.size(-1), args.hidden_channels, dataset.num_classes,
                args.model, args.dropout).to(device)
    print(model)

    evaluator = Evaluator(name='ogbn-arxiv')
    logger = Logger(args.runs, args)

    # Each run re-initializes model parameters and a fresh optimizer.
    for run in range(args.runs):
        model.reset_parameters()
        optimizer = torch.optim.Adam(
            model.parameters(), lr=args.lr, weight_decay=args.wd)
        for epoch in range(1, 1 + args.epochs):
            loss = train(model, g, x, y_true, train_idx, optimizer)
            result = test(model, g, x, y_true, split_idx, evaluator)
            logger.add_result(run, result)

            train_acc, valid_acc, test_acc = result
            print(f'Run: {run + 1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Train: {100 * train_acc:.2f}%, '
                  f'Valid: {100 * valid_acc:.2f}% '
                  f'Test: {100 * test_acc:.2f}%')

        logger.print_statistics(run)
    logger.print_statistics()
示例#11
0
文件: gat.py 项目: lukovnikov/dgl
def load_data(dataset):
    """Load an OGB node dataset together with its Evaluator.

    Returns (graph, labels, train_idx, val_idx, test_idx, evaluator);
    labels are also stored in ``graph.ndata['labels']``.
    """
    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)

    graph, labels = data[0]
    graph.ndata["labels"] = labels

    split = data.get_idx_split()
    return (graph, labels, split["train"], split["valid"], split["test"],
            evaluator)
示例#12
0
def get_pyg_graph(dataset):
    """Return the requested graph converted to PyG form.

    Args:
        dataset: one of 'reddit', 'arxiv' or 'proteins'.

    Raises:
        KeyError: for any other name. Previously an unknown name fell
            through and silently returned None.
    """
    if dataset == 'reddit':
        reddit = dgl.data.RedditDataset()
        return dgl_to_pyg_graph(reddit[0])
    elif dataset == 'arxiv':
        arxiv = DglNodePropPredDataset(name='ogbn-arxiv')
        return dgl_to_pyg_graph(arxiv[0][0])
    elif dataset == 'proteins':
        proteins = DglNodePropPredDataset(name='ogbn-proteins')
        return dgl_to_pyg_graph(proteins[0][0])
    else:
        raise KeyError("Unrecognized dataset name: {}".format(dataset))
示例#13
0
def load_data(path, device):
    """Load ogbn-mag (with reverse edges added) and move labels/splits to device.

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx, evaluator).
    """
    data = DglNodePropPredDataset('ogbn-mag', path)
    g, labels = data[0]
    g = add_reverse_edges(g)
    labels = labels['paper'].to(device)
    split = data.get_idx_split()
    idx = {part: split[part]['paper'].to(device)
           for part in ('train', 'valid', 'test')}
    return (g, labels, data.num_classes, idx['train'], idx['valid'],
            idx['test'], Evaluator(data.name))
示例#14
0
def get_graph(dataset):
    """Return the requested graph converted to heterograph form.

    Supported names: 'reddit', 'arxiv', 'proteins'; anything else raises
    KeyError.
    """
    if dataset == 'reddit':
        return homo_to_hetero(dgl.data.RedditDataset()[0])
    if dataset == 'arxiv':
        return homo_to_hetero(
            DglNodePropPredDataset(name='ogbn-arxiv')[0][0])
    if dataset == 'proteins':
        return homo_to_hetero(
            DglNodePropPredDataset(name='ogbn-proteins')[0][0])
    raise KeyError("Unrecognized dataset name: {}".format(dataset))
示例#15
0
 def __init__(self, dataset_name):
     """Load the node-classification dataset named ``dataset_name``.

     Only 'ogb-mag' is supported; any other name raises ValueError.
     """
     super(NodeClassificationDataset, self).__init__()
     if dataset_name != 'ogb-mag':
         raise ValueError
     dataset = DglNodePropPredDataset(name='ogbn-mag')
     split_idx = dataset.get_idx_split()
     self.num_classes = dataset.num_classes
     self.train_idx = split_idx["train"]
     self.valid_idx = split_idx["valid"]
     self.test_idx = split_idx["test"]
     # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
     self.g, self.label = dataset[0]
     self.category = 'paper'
def load_ogb(name):
    """Load an OGB node dataset with step-by-step timing/memory logging.

    Each stage is bracketed by ``ttt``/``get_memory`` instrumentation.
    Returns (graph, num_labels); the graph carries 'features', 'labels'
    and boolean train/val/test masks in ``ndata``.
    """
    tic_step = time.time()
    get_memory("-" * 40 + "---------------------from ogb.nodeproppred import DglNodePropPredDataset***************************")
    print('load', name)
    data = DglNodePropPredDataset(name=name)
    t1 = ttt(tic_step, "-"*40+"---------------------data = DglNodePropPredDataset(name=name)***************************")
    print('finish loading', name)
    splitted_idx = data.get_idx_split()
    t2 = ttt(t1,"-" * 40 + "---------------------splitted_idx = data.get_idx_split()***************************")
    graph, labels = data[0]
    t3 = ttt(t2, "-" * 40 + "---------------------graph, labels = data[0]***************************")
    print(labels)
    print(data[0])
    print(graph)
    labels = labels[:, 0]  # (N, 1) -> (N,)
    t4 = ttt(t3, "-" * 40 + "---------------------labels = labels[:, 0]***************************")

    graph.ndata['features'] = graph.ndata['feat']
    t5 = ttt(t4, "-" * 40 + "---------------------graph.ndata['features'] = graph.ndata['feat']***************************")
    graph.ndata['labels'] = labels
    t6 = ttt(t5, "-" * 40 + "---------graph.ndata['labels'] = labels******************")
    # Count classes over labeled (non-NaN) nodes only.
    num_labels = len(th.unique(labels[th.logical_not(th.isnan(labels))]))

    # Find the node IDs in the training, validation, and test set.
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    t7 = ttt(t6, "-" * 40 + "---------train_nid, val_nid, test_nid = splitted_idx******************")
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask
    t8 = ttt(t7, "-" * 40 + "---------end of load ogb******************")

    print('finish constructing', name)
    print('load ogb-products time total: '+ str(time.time()-tic_step))
    return graph, num_labels
示例#17
0
def load_dataset(dataset_type, **kwargs):
    """
    Load dataset.
    Args:
        dataset_type: str, support 'proteins', 'cora', 'citeseer', 'pubmed', 'amazon', 'reddit'.

    Returns:
        For 'proteins': (graph, labels, train_idx, val_idx, test_idx, evaluator).
        Otherwise: (graph, features, labels, num_classes, train_mask,
        val_mask, test_mask); for 'amazon', which ships without a
        predefined split, the three masks are None.
    """
    if dataset_type == 'proteins':
        data = DglNodePropPredDataset(name='ogbn-proteins',
                                      root=kwargs['root'])
        evaluator = Evaluator(name='ogbn-proteins')

        splitted_idx = data.get_idx_split()
        train_idx, val_idx, test_idx = splitted_idx["train"], splitted_idx[
            "valid"], splitted_idx["test"]
        graph, labels = data[0]
        # ogbn-proteins nodes carry a 'species' id; one-hot encode it to
        # serve as the input features.
        species = graph.ndata['species']
        features = one_hot_encoder(species)
        graph.ndata['feat'] = features
        graph.ndata['label'] = labels

        return graph, labels, train_idx, val_idx, test_idx, evaluator

    if dataset_type == 'cora':
        dataset = dgl.data.CoraGraphDataset()
    elif dataset_type == 'citeseer':
        dataset = dgl.data.CiteseerGraphDataset()
    elif dataset_type == 'pubmed':
        dataset = dgl.data.PubmedGraphDataset()
    elif dataset_type == 'amazon':
        dataset = dgl.data.AmazonCoBuyComputerDataset()
    elif dataset_type == 'reddit':
        dataset = dgl.data.RedditDataset()
    else:
        raise (KeyError(
            'Dataset type {} not recognized.'.format(dataset_type)))

    num_classes = dataset.num_classes
    graph = dataset[0]
    features = th.FloatTensor(graph.ndata['feat'])
    labels = th.LongTensor(graph.ndata['label'])
    if dataset_type == 'amazon':
        # BUG FIX: this path previously fell through to the return without
        # ever assigning the masks, crashing with NameError. AmazonCoBuy has
        # no predefined split, so return None masks for callers to fill in.
        train_mask = val_mask = test_mask = None
    else:
        train_mask = th.BoolTensor(graph.ndata['train_mask'])
        val_mask = th.BoolTensor(graph.ndata['val_mask'])
        test_mask = th.BoolTensor(graph.ndata['test_mask'])

    return graph, features, labels, num_classes, train_mask, val_mask, test_mask
示例#18
0
def load_OGB(dataset):
    """Load an OGB dataset by short name.

    'mag' loads the ogbn-mag node-prediction dataset and returns it.
    'biokg'/'wikikg' load the corresponding ogbl link-prediction dataset
    and its edge split.

    NOTE(review): the 'biokg'/'wikikg' branch computes ``split_edge`` and
    ``graph`` but falls off the end without returning anything, so callers
    receive None — looks unfinished; confirm the intended return value.
    """
    if dataset == 'mag':
        dataset = DglNodePropPredDataset(name='ogbn-mag')
        return dataset
        # split_idx = dataset.get_idx_split()
        # train_idx, valid_idx, test_idx = split_idx["train"], split_idx["valid"], split_idx["test"]
        # graph, label = dataset[0]  # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
    elif dataset in ['biokg', 'wikikg']:
        d_name = 'ogbl-' + dataset
        dataset = DglLinkPropPredDataset(name=d_name)

        split_edge = dataset.get_edge_split()
        train_edge, valid_edge, test_edge = split_edge["train"], split_edge["valid"], split_edge["test"]
        graph = dataset[0]  # dgl graph object containing only training edges
示例#19
0
def load_data(dataset):
    """Load an OGB node dataset and its Evaluator.

    Also updates the module-level globals ``n_node_feats`` and
    ``n_classes`` as a side effect.
    """
    global n_node_feats, n_classes

    data = DglNodePropPredDataset(name=dataset)
    evaluator = Evaluator(name=dataset)

    graph, labels = data[0]
    split = data.get_idx_split()

    n_node_feats = graph.ndata["feat"].shape[1]
    n_classes = (labels.max() + 1).item()

    return (graph, labels, split["train"], split["valid"], split["test"],
            evaluator)
示例#20
0
def load_data_default(dataset_name):
    """Load a citation or OGB node dataset with self-loops normalized.

    Returns (graph, features, labels, train_mask, val_mask, test_mask,
    num_feats, num_classes). For OGB datasets the three "mask" values are
    the split index tensors rather than boolean masks.
    """
    citation = {
        'cora': CoraGraphDataset,
        'citeseer': CiteseerGraphDataset,
        'pubmed': PubmedGraphDataset,
    }
    if dataset_name in citation:
        dataset = citation[dataset_name]()
        graph = dataset[0]
        graph = graph.remove_self_loop().add_self_loop()
        print(graph)
        features = graph.ndata['feat']
        labels = graph.ndata['label']
        train_mask = graph.ndata['train_mask']
        val_mask = graph.ndata['val_mask']
        test_mask = graph.ndata['test_mask']
        num_feats = features.shape[1]
        num_classes = int(labels.max().item() + 1)
    else:
        dataset = DglNodePropPredDataset(name=dataset_name)
        split = dataset.get_idx_split()
        train_mask = split['train']
        val_mask = split['valid']
        test_mask = split['test']
        graph, labels = dataset[0]
        features = graph.ndata["feat"]
        num_feats = features.shape[1]
        num_classes = (labels.max() + 1).item()
        # Make the graph bidirected, then normalize self-loops.
        srcs, dsts = graph.all_edges()
        graph.add_edges(dsts, srcs)
        graph = graph.remove_self_loop().add_self_loop()

    return graph, features, labels, train_mask, val_mask, test_mask, num_feats, num_classes
示例#21
0
    def process_DglNodeDataset_hetero(self, dataset: DglNodePropPredDataset):
        """Populate this object's fields from a heterogeneous OGB node dataset.

        Records per-type node counts, features and labels, picks the head
        (prediction-target) node type, and stores the train/valid/test
        split indices for that type.
        """
        graph, labels = dataset[0]
        self._name = dataset.name

        # Default to every node type present in the graph.
        if self.node_types is None:
            self.node_types = graph.ntypes

        self.num_nodes_dict = {
            ntype: graph.num_nodes(ntype)
            for ntype in self.node_types
        }
        self.y_dict = labels

        self.x_dict = graph.ndata["feat"]

        # Flatten (num_nodes, 1) label tensors and attach them to the graph.
        # NOTE: the loop variable shadows the `labels` tuple element above,
        # which is not used again afterward.
        for ntype, labels in self.y_dict.items():
            if labels.dim() == 2 and labels.shape[1] == 1:
                labels = labels.squeeze(1)
            graph.nodes[ntype].data["labels"] = labels

        # Head node type: prefer the first labeled type, otherwise fall
        # back to the graph's first node type.
        if self.head_node_type is None:
            if self.y_dict is not None:
                self.head_node_type = list(self.y_dict.keys())[0]
            else:
                self.head_node_type = self.node_types[0]

        self.metapaths = graph.canonical_etypes

        split_idx = dataset.get_idx_split()
        self.training_idx, self.validation_idx, self.testing_idx = split_idx["train"][self.head_node_type], \
                                                                   split_idx["valid"][self.head_node_type], \
                                                                   split_idx["test"][self.head_node_type]

        self.G = graph
示例#22
0
 def __init__(self, dataset_name):
     """Dispatch dataset loading by name.

     Supports RDF datasets (aifb/mutag/bgs/am), HIN datasets
     (acm/imdb/acm1/academic) and 'ogbn-mag'.
     """
     super(NodeClassificationDataset, self).__init__()
     if dataset_name in ['aifb', 'mutag', 'bgs', 'am']:
         self.g, self.category, self.num_classes = self.load_RDF_dgl(
             dataset_name)
     elif dataset_name in ['acm', 'imdb', 'acm1', 'academic']:
         self.g, self.category, self.num_classes = self.load_HIN(
             dataset_name)
     elif dataset_name == 'ogbn-mag':
         # BUG FIX: was `dataset_name in 'ogbn-mag'`, a substring test that
         # also matched names like 'mag'; use equality instead.
         dataset = DglNodePropPredDataset(name='ogbn-mag')
         split_idx = dataset.get_idx_split()
         self.num_classes = dataset.num_classes
         self.train_idx, self.valid_idx, self.test_idx = split_idx[
             "train"], split_idx["valid"], split_idx["test"]
         # graph: dgl graph object, label: torch tensor of shape (num_nodes, num_tasks)
         self.g, self.label = dataset[0]
         self.category = 'paper'
     else:
         # Previously an unknown name fell through and silently left the
         # object uninitialized; fail fast instead.
         raise ValueError('Unknown dataset: {}'.format(dataset_name))
def load_ogb(dataset):
    """Load ogbn-mag as a bidirected heterograph with paper labels/masks.

    Args:
        dataset: must be 'ogbn-mag'.

    Returns:
        The constructed DGL heterograph.

    Raises:
        ValueError: for any other dataset name. (Previously this was
            ``raise ("...")``, which itself crashes with TypeError because
            a str is not an exception.)
    """
    if dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=dataset)
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            # Add the reversed relation so message passing goes both ways.
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        paper_labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

        # get target category id
        category_id = len(hg.ntypes)
        for i, ntype in enumerate(hg.ntypes):
            if ntype == category:
                category_id = i

        train_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        train_mask[train_idx] = True
        val_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        val_mask[val_idx] = True
        test_mask = th.zeros((hg.number_of_nodes('paper'), ), dtype=th.bool)
        test_mask[test_idx] = True
        hg.nodes['paper'].data['train_mask'] = train_mask
        hg.nodes['paper'].data['val_mask'] = val_mask
        hg.nodes['paper'].data['test_mask'] = test_mask

        hg.nodes['paper'].data['labels'] = paper_labels
        return hg
    else:
        raise ValueError("Do not support other ogbn datasets.")
示例#24
0
def load_ogb_data(dataset, device):
    """Load ogbn-<dataset> as a bidirected, self-looped graph on ``device``.

    Returns (graph, features, labels, idx_train, idx_val, idx_test).
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    data = DglNodePropPredDataset(name="ogbn-" + dataset, root='data')
    split = data.get_idx_split()
    graph, labels = data[0]
    labels = labels.squeeze()
    # Add reverse edges, then normalize self-loops.
    srcs, dsts = graph.all_edges()
    graph.add_edges(dsts, srcs)
    graph = graph.remove_self_loop().add_self_loop()
    features = graph.ndata['feat']
    graph = graph.to(device)
    return (graph, features.to(device), labels.to(device),
            split["train"].to(device), split["valid"].to(device),
            split["test"].to(device))
示例#25
0
文件: utils.py 项目: yifeim/dgl
def load_ogbn_mag(root: str = None) -> OGBDataset:
    """Load ogbn-mag, add reverse edge types, and attach paper data/masks.

    Returns an OGBDataset whose heterograph stores paper features, labels
    and boolean train/valid/test masks.
    """
    dataset = DglNodePropPredDataset(name='ogbn-mag', root=root)

    split_idx = dataset.get_idx_split()
    paper_splits = {
        part: split_idx[part]['paper']
        for part in ('train', 'valid', 'test')
    }

    hg_original, labels = dataset[0]

    labels = labels['paper'].squeeze()
    num_labels = dataset.num_classes

    # Duplicate every relation with a reversed counterpart.
    subgraphs = {}
    for etype in hg_original.canonical_etypes:
        src, dst = hg_original.all_edges(etype=etype)
        subgraphs[etype] = (src, dst)
        subgraphs[(etype[2], f'rev-{etype[1]}', etype[0])] = (dst, src)

    hg = dgl.heterograph(subgraphs)

    hg.nodes['paper'].data['feat'] = hg_original.nodes['paper'].data['feat']
    hg.nodes['paper'].data['labels'] = labels

    num_papers = hg.num_nodes('paper')
    for part, idx in paper_splits.items():
        mask = torch.zeros((num_papers, ), dtype=torch.bool)
        mask[idx] = True
        hg.nodes['paper'].data[f'{part}_mask'] = mask

    return OGBDataset(hg, num_labels, 'paper')
示例#26
0
def load_ogbn_mag(device, add_reverse_edge, reverse_self):
    """Load the ogbn-mag dataset.

    :param device: torch.device to move the graph and tensors to
    :param add_reverse_edge: bool whether to add reverse edges
    :param reverse_self: bool whether to also reverse edges whose source
        and destination node types are the same
    :return: (data, g, features, labels, predict_ntype, train_idx, val_idx,
        test_idx, evaluator)
    """
    data = DglNodePropPredDataset('ogbn-mag', DATA_DIR)
    g, labels = data[0]
    if add_reverse_edge:
        g = add_reverse_edges(g, reverse_self)
    g = g.to(device)
    features = g.nodes['paper'].data['feat']
    labels = labels['paper'].squeeze(dim=1).to(device)
    split = data.get_idx_split()
    train_idx, val_idx, test_idx = (
        split[part]['paper'].to(device)
        for part in ('train', 'valid', 'test'))
    return (data, g, features, labels, 'paper', train_idx, val_idx,
            test_idx, Evaluator(data.name))
示例#27
0
def load_dataset(name, device):
    """
    Load dataset and move graph and features to device
    """
    if name not in ("ogbn-products", "ogbn-arxiv", "ogbn-mag"):
        raise RuntimeError("Dataset {} is not supported".format(name))
    dataset = DglNodePropPredDataset(name=name)
    split = dataset.get_idx_split()
    train_nid, val_nid, test_nid = split["train"], split["valid"], split["test"]
    g, labels = dataset[0]
    g = g.to(device)
    if name == "ogbn-arxiv":
        # Arxiv is directed: add reverse edges and self-loops.
        g = dgl.add_reverse_edges(g, copy_ndata=True)
        g = dgl.add_self_loop(g)
        g.ndata['feat'] = g.ndata['feat'].float()
    elif name == "ogbn-mag":
        # MAG is a heterogeneous graph. The task is to make prediction for
        # paper nodes
        labels = labels["paper"]
        train_nid, val_nid, test_nid = (
            train_nid["paper"], val_nid["paper"], test_nid["paper"])
        g = convert_mag_to_homograph(g, device)
    else:
        g.ndata['feat'] = g.ndata['feat'].float()
    n_classes = dataset.num_classes
    labels = labels.squeeze()
    evaluator = get_ogb_evaluator(name)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
示例#28
0
def load_data(dataset, ogb_root):
    """Load cora/reddit (mask-based split) or ogbn-products (index split).

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx).
    """
    if dataset in ('cora', 'reddit'):
        data = CoraGraphDataset() if dataset == 'cora' else RedditDataset(self_loop=True)
        g = data[0]
        # Convert the boolean masks stored on the graph into index tensors.
        train_idx, val_idx, test_idx = (
            g.ndata[key].nonzero(as_tuple=True)[0]
            for key in ('train_mask', 'val_mask', 'test_mask'))
        return g, g.ndata['label'], data.num_classes, train_idx, val_idx, test_idx
    data = DglNodePropPredDataset('ogbn-products', ogb_root)
    g, labels = data[0]
    split = data.get_idx_split()
    return (g, labels.squeeze(dim=-1), data.num_classes,
            split['train'], split['valid'], split['test'])
示例#29
0
def load_mag(device, args):
    """Load ogbn-mag with precomputed embeddings for non-paper node types.

    Author/field-of-study/institution features are read from
    ``args.use_emb``; if their width differs from the paper features, the
    paper features are randomly projected down to match.

    Returns (g, labels, n_classes, train_nid, val_nid, test_nid); the split
    ids are converted to numpy arrays.
    """
    from ogb.nodeproppred import DglNodePropPredDataset
    path = args.use_emb
    home_dir = os.getenv("HOME")
    dataset = DglNodePropPredDataset(name="ogbn-mag",
                                     root=os.path.join(home_dir, ".ogb",
                                                       "dataset"))
    g, labels = dataset[0]
    splitted_idx = dataset.get_idx_split()
    train_nid = splitted_idx["train"]['paper']
    val_nid = splitted_idx["valid"]['paper']
    test_nid = splitted_idx["test"]['paper']
    features = g.nodes['paper'].data['feat']
    # Pretrained embeddings for the node types without raw input features.
    author_emb = torch.load(os.path.join(path, "author.pt")).float()
    topic_emb = torch.load(os.path.join(path, "field_of_study.pt")).float()
    institution_emb = torch.load(os.path.join(path, "institution.pt")).float()

    g.nodes["author"].data["feat"] = author_emb.to(device)
    g.nodes["institution"].data["feat"] = institution_emb.to(device)
    g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
    g.nodes["paper"].data["feat"] = features.to(device)
    paper_dim = g.nodes["paper"].data["feat"].shape[1]
    author_dim = g.nodes["author"].data["feat"].shape[1]
    # Random projection brings all feature matrices to the same width.
    if paper_dim != author_dim:
        paper_feat = g.nodes["paper"].data.pop("feat")
        rand_weight = torch.Tensor(paper_dim, author_dim).uniform_(-0.5, 0.5)
        g.nodes["paper"].data["feat"] = torch.matmul(paper_feat,
                                                     rand_weight.to(device))
        print(
            f"Randomly project paper feature from dimension {paper_dim} to {author_dim}"
        )

    labels = labels['paper'].to(device).squeeze()
    # Infer the class count from the label value range.
    n_classes = int(labels.max() - labels.min()) + 1
    train_nid, val_nid, test_nid = np.array(train_nid), np.array(
        val_nid), np.array(test_nid)
    return g, labels, n_classes, train_nid, val_nid, test_nid
示例#30
0
def load_data(name, ogb_root, device):
    """Load an OGB or citation node dataset on ``device``.

    Returns (g, labels, num_classes, train_idx, val_idx, test_idx).
    """
    if name in ('ogbn-products', 'ogbn-arxiv'):
        data = DglNodePropPredDataset(name, ogb_root)
        g, labels = data[0]
        if name == 'ogbn-arxiv':
            # Arxiv is directed; make it bidirected and standardize features.
            g = dgl.to_bidirected(g, copy_ndata=True)
            feat = g.ndata['feat']
            g.ndata['feat'] = (feat - feat.mean(dim=0)) / feat.std(dim=0)
        g = g.to(device)
        labels = labels.squeeze(dim=1).to(device)
        split = data.get_idx_split()
        return (g, labels, data.num_classes, split['train'].to(device),
                split['valid'].to(device), split['test'].to(device))
    data = load_citation_dataset(name)
    g = data[0].to(device)
    train_idx, val_idx, test_idx = (
        g.ndata[key].nonzero(as_tuple=True)[0]
        for key in ('train_mask', 'val_mask', 'test_mask'))
    return g, g.ndata['label'], data.num_classes, train_idx, val_idx, test_idx