Example #1
def convert_mag_to_homograph(g, device):
    """
    Featurize node types that don't have input features (i.e. author,
    institution, field_of_study) by averaging their neighbor features.
    Then convert the graph to an undirected homogeneous graph.
    """
    src_writes, dst_writes = g.all_edges(etype="writes")
    src_topic, dst_topic = g.all_edges(etype="has_topic")
    src_aff, dst_aff = g.all_edges(etype="affiliated_with")
    new_g = dgl.heterograph({
        ("paper", "written", "author"): (dst_writes, src_writes),
        ("paper", "has_topic", "field"): (src_topic, dst_topic),
        ("author", "aff", "inst"): (src_aff, dst_aff)
    })
    new_g = new_g.to(device)
    new_g.nodes["paper"].data["feat"] = g.nodes["paper"].data["feat"]
    new_g["written"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["has_topic"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["aff"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    g.nodes["author"].data["feat"] = new_g.nodes["author"].data["feat"]
    g.nodes["institution"].data["feat"] = new_g.nodes["inst"].data["feat"]
    g.nodes["field_of_study"].data["feat"] = new_g.nodes["field"].data["feat"]

    # Convert to homogeneous graph
    # Get DGL type id for paper type
    target_type_id = g.get_ntype_id("paper")
    g = dgl.to_homogeneous(g, ndata=["feat"])
    g = dgl.add_reverse_edges(g, copy_ndata=True)
    # Mask for paper nodes
    g.ndata["target_mask"] = g.ndata[dgl.NTYPE] == target_type_id
    return g
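A minimal usage sketch (assuming an ogbn-mag style heterograph g and a device, as in the function above): after the conversion, the paper rows can be recovered through the boolean target_mask.

homo_g = convert_mag_to_homograph(g, device)
paper_feat = homo_g.ndata["feat"][homo_g.ndata["target_mask"]]
print(paper_feat.shape)  # (number of paper nodes, feature dim)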
Example #2
    def __init__(self):
        g = OAGCoreDataset()[0]
        author_rank = load_author_rank()
        rating = pd.DataFrame(
            [[i, a] for i, (f, r) in enumerate(author_rank.items()) for a in r],
            columns=['user_id', 'item_id']
        )
        user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (rating['user_id'], rating['item_id'])},
            num_nodes_dict={'user': len(author_rank), 'item': g.num_nodes('author')}
        )

        # Negative sampling
        neg_sampler = Uniform(1)
        nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
        u, v = user_item_graph.edges()
        self.user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))},
            num_nodes_dict={ntype: user_item_graph.num_nodes(ntype) for ntype in user_item_graph.ntypes}
        )
        self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

        knowledge_graph = dgl.to_homogeneous(dgl.node_type_subgraph(g, ['author', 'institution', 'paper']))
        knowledge_graph.edata['relation'] = knowledge_graph.edata[dgl.ETYPE]
        self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
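A hedged sketch of what the Uniform(1) negative sampler used above returns on a toy graph: one corrupted edge per input edge, keeping the source node and drawing the destination uniformly at random.

import dgl
import torch
from dgl.dataloading.negative_sampler import Uniform

toy = dgl.heterograph(
    {('user', 'rate', 'item'): (torch.tensor([0, 1]), torch.tensor([0, 1]))},
    num_nodes_dict={'user': 2, 'item': 5}
)
neg_u, neg_v = Uniform(1)(toy, torch.arange(toy.num_edges()))
print(neg_u)  # tensor([0, 1]) -- source nodes are kept
print(neg_v)  # random destinations in [0, 5)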
Example #3
File: tu.py  Project: xnuohz/ARMA-dgl
def add_clustering_coefficients_feature(dataset):
    for g, _ in dataset:
        nx_g = dgl.to_networkx(dgl.to_homogeneous(g))
        # MultiDiGraph -> Graph
        nx_g = nx.Graph(nx_g)
        cc = torch.tensor(list(nx.clustering(nx_g).values())).view([-1, 1])
        g.ndata['feat'] = torch.cat([g.ndata['feat'], cc], dim=1)
    return dataset
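A minimal usage sketch with a toy (graph, label) list standing in for the dataset: on a triangle every node has clustering coefficient 1, so one extra feature column is appended.

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))
g.ndata['feat'] = torch.ones(3, 4)
toy_dataset = [(g, torch.tensor(0))]
toy_dataset = add_clustering_coefficients_feature(toy_dataset)
print(toy_dataset[0][0].ndata['feat'].shape)  # torch.Size([3, 5])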
Example #4
    def forward(self, graph):
        """Apply the model for prediction.

        Parameters
        ----------
        graph : DGLHeteroGraph
            DGLHeteroGraph consisting of the ligand graph, the protein graph
            and the complex graph, along with preprocessed features. For a batch of
            protein-ligand pairs, we assume zero padding is performed so that the
            number of ligand and protein atoms is the same in all pairs.

        Returns
        -------
        Float32 tensor of shape (B, O)
            Predicted protein-ligand binding affinity. B for the number
            of protein-ligand pairs in the batch and O for the number of tasks.
        """
        ligand_graph = graph[('ligand_atom', 'ligand', 'ligand_atom')]
        ligand_graph_node_feats = ligand_graph.ndata['atomic_number']
        assert ligand_graph_node_feats.shape[-1] == 1
        ligand_graph_distances = ligand_graph.edata['distance']
        ligand_conv_out = self.ligand_conv(ligand_graph,
                                           ligand_graph_node_feats,
                                           ligand_graph_distances)

        protein_graph = graph[('protein_atom', 'protein', 'protein_atom')]
        protein_graph_node_feats = protein_graph.ndata['atomic_number']
        assert protein_graph_node_feats.shape[-1] == 1
        protein_graph_distances = protein_graph.edata['distance']
        protein_conv_out = self.protein_conv(protein_graph,
                                             protein_graph_node_feats,
                                             protein_graph_distances)

        complex_graph = dgl.edge_type_subgraph(
            graph, [('ligand_atom', 'complex', 'ligand_atom'),
                    ('ligand_atom', 'complex', 'protein_atom'),
                    ('protein_atom', 'complex', 'ligand_atom'),
                    ('protein_atom', 'complex', 'protein_atom')])
        complex_graph = dgl.to_homogeneous(complex_graph,
                                           ndata=['atomic_number'],
                                           edata=['distance'])
        complex_graph_node_feats = complex_graph.ndata['atomic_number']
        assert complex_graph_node_feats.shape[-1] == 1
        complex_graph_distances = complex_graph.edata['distance']
        complex_conv_out = self.complex_conv(complex_graph,
                                             complex_graph_node_feats,
                                             complex_graph_distances)

        frag1_node_indices_in_complex = torch.where(
            complex_graph.ndata['_TYPE'] == 0)[0]
        frag2_node_indices_in_complex = list(
            set(range(complex_graph.num_nodes())) -
            set(frag1_node_indices_in_complex.tolist()))

        return self.predictor(graph.batch_size, frag1_node_indices_in_complex,
                              frag2_node_indices_in_complex, ligand_conv_out,
                              protein_conv_out, complex_conv_out)
Example #5
    def build_cl_graph(self, hg):
        if not hg.is_homogeneous:
            self.num_edge_type = len(hg.etypes)
            g = dgl.to_homogeneous(hg).to('cpu')
        else:
            g = hg.to('cpu')

        traces = self.random_walks(g)
        edge_batch = self.rw_map_edge_type(g, traces)
        cl_graph = self.edge2graph(edge_batch)
        return cl_graph
Example #6
    def __init__(self, hg):
        self.hg = hg
        self.g = dgl.to_homogeneous(hg).to('cpu')
        self.NID = self.g.ndata[dgl.NID]
        self.NTYPE = self.g.ndata[dgl.NTYPE]
        num_nodes = {}
        for i in range(th.max(self.NTYPE) + 1):
            num_nodes[self.hg.ntypes[i]] = int((self.NTYPE == i).sum())
        self.num_nodes = num_nodes
        self.weight_column = 'w'
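A hedged sanity check on a toy heterograph: the per-type node counts recovered from dgl.NTYPE, as in the constructor above, match what the heterogeneous graph reports directly.

import dgl
import torch as th

hg = dgl.heterograph({
    ('user', 'plays', 'game'): (th.tensor([0, 1, 2]), th.tensor([0, 0, 1]))
})
g = dgl.to_homogeneous(hg)
ntype_ids = g.ndata[dgl.NTYPE]
for i, ntype in enumerate(hg.ntypes):
    assert int((ntype_ids == i).sum()) == hg.num_nodes(ntype)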
Example #7
def load_data(data_name, get_norm=False, inv_target=False):
    if data_name == 'aifb':
        dataset = AIFBDataset()
    elif data_name == 'mutag':
        dataset = MUTAGDataset()
    elif data_name == 'bgs':
        dataset = BGSDataset()
    else:
        dataset = AMDataset()

    # Load hetero-graph
    hg = dataset[0]

    num_rels = len(hg.canonical_etypes)
    category = dataset.predict_category
    num_classes = dataset.num_classes
    labels = hg.nodes[category].data.pop('labels')
    train_mask = hg.nodes[category].data.pop('train_mask')
    test_mask = hg.nodes[category].data.pop('test_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

    if get_norm:
        # Calculate normalization weight for each edge,
        # 1. / d, d is the degree of the destination node
        for cetype in hg.canonical_etypes:
            hg.edges[cetype].data['norm'] = dgl.norm_by_dst(
                hg, cetype).unsqueeze(1)
        edata = ['norm']
    else:
        edata = None

    # get target category id
    category_id = hg.ntypes.index(category)

    g = dgl.to_homogeneous(hg, edata=edata)
    # Rename the fields as they can be changed by, e.g., NodeDataLoader
    g.ndata['ntype'] = g.ndata.pop(dgl.NTYPE)
    g.ndata['type_id'] = g.ndata.pop(dgl.NID)
    node_ids = th.arange(g.num_nodes())

    # find out the target node ids in g
    loc = (g.ndata['ntype'] == category_id)
    target_idx = node_ids[loc]

    if inv_target:
        # Map global node IDs to type-specific node IDs. This is required for
        # looking up type-specific labels in a minibatch
        inv_target = th.empty((g.num_nodes(), ), dtype=th.int64)
        inv_target[target_idx] = th.arange(0,
                                           target_idx.shape[0],
                                           dtype=inv_target.dtype)
        return g, num_rels, num_classes, labels, train_idx, test_idx, target_idx, inv_target
    else:
        return g, num_rels, num_classes, labels, train_idx, test_idx, target_idx
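As a hedged illustration of the normalization above on a toy graph, dgl.norm_by_dst assigns each edge 1 / in-degree of its destination node.

import dgl
import torch as th

toy = dgl.heterograph({
    ('user', 'follows', 'user'): (th.tensor([0, 1, 2]), th.tensor([1, 1, 2]))
})
norm = dgl.norm_by_dst(toy, ('user', 'follows', 'user'))
print(norm)  # tensor([0.5000, 0.5000, 1.0000]); node 1 has in-degree 2, node 2 has in-degree 1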
Example #8
def gen_neg_edges(g, num_neg, device):
    if not g.is_homogeneous:
        g_homo = dgl.to_homogeneous(g)
    else:
        g_homo = g
    node_degrees = g_homo.out_degrees().to('cpu').numpy()
    node_weights = np.power(node_degrees, 0.75)
    node_probs = node_weights / np.sum(node_weights)

    # neg_sampler = dgl.dataloading.negative_sampler.Uniform(num_neg)
    # neg_edges = neg_sampler(g_homo, th.arange(0, g_homo.num_edges(), dtype=th.int64, device=device))
    neg_sampler = pro_sampler(num_neg, node_probs)
    neg_edges = neg_sampler(
        g_homo, th.arange(0, g_homo.num_edges(), dtype=th.int64,
                          device=device))
    # tensors used as indices must be long, byte or bool tensors, so th.int64 is used here
    return neg_edges
Example #9
    def giveGraphs(self, batch_size, voxel_pos):
        p2v = np.load("data/p2v_spec.npy", allow_pickle=True).tolist()
        p2v = [item for sublist in p2v for item in sublist]
        p2p = np.load("data/p2p.npy", allow_pickle=True).tolist()
        p2p = [item for sublist in p2p for item in sublist]
        v2v = np.load("data/v2v.npy", allow_pickle=True).tolist()
        v2v = [item for sublist in v2v for item in sublist]
        v2v_6 = np.load("data/v2v_6.npy", allow_pickle=True).tolist()
        v2v_6 = [item for sublist in v2v_6 for item in sublist]
        G_vox = dgl.graph(v2v)
        G_vox = dgl.add_self_loop(G_vox)

        graph_data = {('PMT', 'p2v', 'vox'): p2v, ('vox', 'v2v', 'vox'): v2v}
        g = dgl.heterograph(graph_data)
        g = dgl.to_homogeneous(g)
        g = dgl.add_self_loop(g)
        G = dgl.batch([g for i in range(batch_size)])
        return G, G_vox
Example #10
    def forward(self, hg, h):
        with hg.local_scope():
            # * =============== Encode heterogeneous feature ================
            h_dict = self.feature_proj(h)
            hg.ndata['h_proj'] = h_dict
            g_homo = dgl.to_homogeneous(hg, ndata=['h_proj'])
            # * =============== Node Embedding Generation ===================
            h = g_homo.ndata['h_proj']
            #h = self.gnn1(g_homo, h)
            h = self.gnn2(g_homo, h)
            if self.norm_emb:
                # L2-normalize each node embedding
                h = F.normalize(h, p=2, dim=1)
            # Context embedding generation
            # g_homo.ndata['h'] = h
            emd = self.h2dict(h, h_dict)
            hg.ndata['h'] = emd

        return emd
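h2dict is not shown in this snippet; a plausible sketch, under the assumptions that dgl.to_homogeneous groups nodes by type and that h_dict is keyed in the same type order, would split the flat embedding back per node type.

def h2dict_sketch(h, h_dict):
    # hypothetical helper: split the flat embedding h back into a per-ntype dict,
    # using the per-type node counts carried by h_dict (assumes matching type order)
    out, offset = {}, 0
    for ntype, feat in h_dict.items():
        out[ntype] = h[offset: offset + feat.shape[0]]
        offset += feat.shape[0]
    return out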
Example #11
    def __init__(self, hg, batch_size, window_size):
        self.hg = hg
        self.g = dgl.to_homogeneous(hg).to('cpu')
        self.NID = self.g.ndata[dgl.NID]
        self.NTYPE = self.g.ndata[dgl.NTYPE]
        num_nodes = {}
        for i in range(th.max(self.NTYPE) + 1):
            num_nodes[self.hg.ntypes[i]] = int((self.NTYPE == i).sum())
        self.num_nodes = num_nodes
        self.num_ntypes = len(self.num_nodes)
        # self.weights = {
        #     etype: hg.in_degrees(etype=etype).float() ** 0.75
        #     for _, etype, _ in hg.canonical_etypes
        # }
        self.batch_size = batch_size
        self.window_size = window_size
        self.neg_hetero = True
        self.edge_dict = {}
        self.ntypes = hg.ntypes
Example #12
def track_time(data):
    dataset = utils.process_data(data)
    device = utils.get_bench_device()

    if data == 'am':
        batch_size = 64
        n_bases = 40
        l2norm = 5e-4
    elif data == 'ogbn-mag':
        batch_size = 1024
        n_bases = 2
        l2norm = 0
    else:
        raise ValueError()

    fanouts = [25, 15]
    n_layers = 2
    n_hidden = 64
    dropout = 0.5
    use_self_loop = True
    lr = 0.01
    num_workers = 4
    iter_start = 3
    iter_count = 10

    hg = dataset[0]
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = hg.nodes[category].data.pop('train_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    labels = hg.nodes[category].data.pop('labels').to(device)
    num_of_ntype = len(hg.ntypes)
    num_rels = len(hg.canonical_etypes)

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data
               ) == 0 or 'feat' not in hg.nodes[ntype].data:
            node_feats.append(None)
        else:
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
    g = dgl.to_homogeneous(hg)
    u, v, eid = g.all_edges(form='all')

    # global norm
    _, inverse_index, count = th.unique(v,
                                        return_inverse=True,
                                        return_counts=True)
    degrees = count[inverse_index]
    norm = th.ones(eid.shape[0]) / degrees
    norm = norm.unsqueeze(1)
    g.edata['norm'] = norm
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]

    node_ids = th.arange(g.number_of_nodes())
    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_nids = node_ids[loc]
    train_nids = target_nids[train_idx]

    g = g.formats('csc')
    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    loader = dgl.dataloading.NodeDataLoader(g,
                                            target_nids[train_idx],
                                            sampler,
                                            batch_size=batch_size,
                                            shuffle=True,
                                            drop_last=False,
                                            num_workers=num_workers)

    # node features
    # None for one-hot feature, if not none, it should be the feature tensor.
    #
    embed_layer = RelGraphEmbedLayer(device,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     n_hidden,
                                     sparse_emb=True)

    # create model
    # all model params are in device.
    model = EntityClassify(device,
                           g.number_of_nodes(),
                           n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=n_bases,
                           num_hidden_layers=n_layers - 2,
                           dropout=dropout,
                           use_self_loop=use_self_loop,
                           layer_norm=False)

    embed_layer = embed_layer.to(device)
    model = model.to(device)

    all_params = itertools.chain(model.parameters(),
                                 embed_layer.embeds.parameters())
    optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
    emb_optimizer = th.optim.SparseAdam(list(
        embed_layer.node_embeds.parameters()),
                                        lr=lr)

    print("start training...")
    model.train()
    embed_layer.train()

    for step, sample_data in enumerate(loader):
        input_nodes, output_nodes, blocks = sample_data
        feats = embed_layer(input_nodes, blocks[0].srcdata['ntype'],
                            blocks[0].srcdata['type_id'], node_feats)
        logits = model(blocks, feats)
        seed_idx = blocks[-1].dstdata['type_id']
        loss = F.cross_entropy(logits, labels[seed_idx])
        optimizer.zero_grad()
        emb_optimizer.zero_grad()

        loss.backward()
        optimizer.step()
        emb_optimizer.step()

        # start the timer right before iteration iter_start
        if step == iter_start - 1:
            t0 = time.time()
        elif step == iter_count + iter_start - 1:  # time iter_count iterations
            break

    t1 = time.time()

    return (t1 - t0) / iter_count
Example #13
def main(args):
    # load graph data
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    else:
        raise ValueError()

    # Load from hetero-graph
    hg = dataset[0]

    num_rels = len(hg.canonical_etypes)
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = hg.nodes[category].data.pop('train_mask')
    test_mask = hg.nodes[category].data.pop('test_mask')
    train_idx = mx.nd.array(np.nonzero(train_mask.asnumpy())[0], dtype='int64')
    test_idx = mx.nd.array(np.nonzero(test_mask.asnumpy())[0], dtype='int64')
    labels = mx.nd.array(hg.nodes[category].data.pop('labels'), dtype='int64')

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # calculate norm for each edge type and store in edge
    for canonical_etype in hg.canonical_etypes:
        u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
        v = v.asnumpy()
        _, inverse_index, count = np.unique(v,
                                            return_inverse=True,
                                            return_counts=True)
        degrees = count[inverse_index]
        norm = np.ones(eid.shape[0]) / degrees
        hg.edges[canonical_etype].data['norm'] = mx.nd.expand_dims(
            mx.nd.array(norm), axis=1)

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i

    g = dgl.to_homogeneous(hg, edata=['norm'])
    num_nodes = g.number_of_nodes()
    node_ids = mx.nd.arange(num_nodes)
    edge_norm = g.edata['norm']
    edge_type = g.edata[dgl.ETYPE]

    # find out the target node ids in g
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    loc = mx.nd.array(np.nonzero(loc.asnumpy())[0], dtype='int64')
    target_idx = node_ids[loc]

    # since the nodes are featureless, the input feature is then the node id.
    feats = mx.nd.arange(num_nodes, dtype='int32')

    # check cuda
    use_cuda = args.gpu >= 0
    if use_cuda:
        ctx = mx.gpu(args.gpu)
        feats = feats.as_in_context(ctx)
        edge_type = edge_type.as_in_context(ctx)
        edge_norm = edge_norm.as_in_context(ctx)
        labels = labels.as_in_context(ctx)
        train_idx = train_idx.as_in_context(ctx)
        g = g.to(ctx)
    else:
        ctx = mx.cpu(0)

    # create model
    model = EntityClassify(num_nodes,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           gpu_id=args.gpu)
    model.initialize(ctx=ctx)

    # optimizer
    trainer = gluon.Trainer(model.collect_params(), 'adam', {
        'learning_rate': args.lr,
        'wd': args.l2norm
    })
    loss_fcn = gluon.loss.SoftmaxCELoss(from_logits=False)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    for epoch in range(args.n_epochs):
        t0 = time.time()
        with mx.autograd.record():
            pred = model(g, feats, edge_type, edge_norm)
            pred = pred[target_idx]
            loss = loss_fcn(pred[train_idx], labels[train_idx])
        t1 = time.time()
        loss.backward()
        trainer.step(len(train_idx))
        t2 = time.time()

        forward_time.append(t1 - t0)
        backward_time.append(t2 - t1)
        print(
            "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
            .format(epoch, forward_time[-1], backward_time[-1]))

        train_acc = F.sum(
            mx.nd.cast(pred[train_idx].argmax(axis=1), 'int64') ==
            labels[train_idx]).asscalar() / train_idx.shape[0]
        val_acc = F.sum(
            mx.nd.cast(pred[val_idx].argmax(
                axis=1), 'int64') == labels[val_idx]).asscalar() / len(val_idx)
        print("Train Accuracy: {:.4f} | Validation Accuracy: {:.4f}".format(
            train_acc, val_acc))
    print()

    logits = model.forward(g, feats, edge_type, edge_norm)
    logits = logits[target_idx]
    test_acc = F.sum(
        mx.nd.cast(logits[test_idx].argmax(
            axis=1), 'int64') == labels[test_idx]).asscalar() / len(test_idx)
    print("Test Accuracy: {:.4f}".format(test_acc))
    print()

    print("Mean forward time: {:4f}".format(
        np.mean(forward_time[len(forward_time) // 4:])))
    print("Mean backward time: {:4f}".format(
        np.mean(backward_time[len(backward_time) // 4:])))
Example #14
def run_on_samples(model,
                   optimizer,
                   loss_fn,
                   acc_fn,
                   samples,
                   max_num_nodes,
                   classes,
                   edge_classes,
                   verbosity=0,
                   class_weights: Optional[np.ndarray] = None,
                   span_matching_modes=None,
                   explain=False,
                   none_class_id=None,
                   training=False,
                   use_manual_weight_decay=True) -> Epoch:
    if span_matching_modes is None:
        span_matching_modes = ['exact']
    epoch = Epoch()
    metrics = defaultdict(list)
    for i, (sample, texts) in enumerate(samples):
        start = time.time()
        with tf.device("/cpu:0"):
            with tf.GradientTape() as tape:
                g: dgl.DGLHeteroGraph = sample

                # load inputs and ensure correct types
                # the DGL library is very specific about types (int32 vs int64)
                target_classes = g.ndata['class_one_hot']
                target_classes = tf.cast(target_classes, tf.float32)
                node_features = g.ndata['feature']
                if node_features.dtype.is_integer:
                    # this happens if we pass ordinal node ids as features (== featureless mode)
                    # dgl needs them in int64, so let's cast them here
                    node_features = tf.one_hot(
                        node_features, depth=max_num_nodes
                    )  # tf.cast(node_features, tf.int64)
                else:
                    # this happens in every other case, like e.g. embeddings
                    # here dgl wants float32
                    node_features = tf.cast(node_features, tf.float32)
                edge_types = g.edata['class_ordinal']
                edge_types = tf.cast(edge_types, tf.int64)

                target_mask = g.ndata['is_target']
                target_mask = tf.squeeze(tf.where(target_mask))

                g = dgl.to_homogeneous(g)

                predicted_classes = model(g,
                                          node_features,
                                          edge_types,
                                          training=training)

                predicted_classes = tf.gather(predicted_classes, target_mask)
                target_classes = tf.gather(target_classes, target_mask)

                predicted_classes_ord = np.atleast_1d(
                    np.argmax(predicted_classes.numpy(), axis=-1))
                target_classes_ord = np.atleast_1d(
                    np.argmax(target_classes.numpy(), axis=-1))

                loss = loss_fn(target_classes, predicted_classes)
                acc = acc_fn(target_classes, predicted_classes)

                if class_weights is not None:
                    idxs = target_classes_ord
                    class_weighting = class_weights[idxs]
                    loss = loss * class_weighting

                if training:
                    if use_manual_weight_decay:
                        # Manual weight decay.
                        # TensorFlow implements weight decay in its Adam(W) optimizer
                        # differently from PyTorch, which led to worse results here.
                        # Manually adding an L2 penalty on the weights to the loss works around this.
                        for weight in model.trainable_weights:
                            loss += 1e-4 * tf.nn.l2_loss(weight)

                    grads = tape.gradient(loss, model.trainable_weights)
                    optimizer.apply_gradients(
                        zip(grads, model.trainable_weights))

                epoch.update_with_sample(time.time() - start,
                                         loss.numpy().mean(),
                                         acc.numpy().mean(),
                                         target_classes_ord,
                                         predicted_classes_ord)

                for span_matching_mode in span_matching_modes:
                    span_metric = span_matcher(target_classes_ord,
                                               predicted_classes_ord,
                                               mode=span_matching_mode,
                                               none_class_id=none_class_id)
                    epoch.metrics[f'span_{span_matching_mode}'].append(
                        span_metric)

        if verbosity > 0:
            print_progress(epoch, len(samples))

    if verbosity > 0:
        # finish printing by adding new line after the carriage return (stopping continuous output)
        print()
    return epoch
Example #15
                    BLOCK_COLS] = g.nodes['author'].data['x'].numpy().astype(
                        'float16')
        tq.set_postfix_str('Writing institution features...')
        inst_feat[:, start:start +
                  BLOCK_COLS] = g.nodes['institution'].data['x'].numpy(
                  ).astype('float16')
        del g.nodes['paper'].data['x']
        del g.nodes['author'].data['x']
        del g.nodes['institution'].data['x']
author_feat.flush()
inst_feat.flush()

# Convert to homogeneous if needed. (The RGAT baseline needs a homogeneous graph.)
if args.graph_as_homogeneous:
    # Process graph
    g = dgl.to_homogeneous(g)
    # DGL ensures that nodes with the same type are put together with the order preserved.
    # DGL also ensures that the node types are sorted in ascending order.
    assert torch.equal(
        g.ndata[dgl.NTYPE],
        torch.cat([
            torch.full((dataset.num_authors, ), 0),
            torch.full((dataset.num_institutions, ), 1),
            torch.full((dataset.num_papers, ), 2)
        ]))
    assert torch.equal(
        g.ndata[dgl.NID],
        torch.cat([
            torch.arange(dataset.num_authors),
            torch.arange(dataset.num_institutions),
            torch.arange(dataset.num_papers)
Example #16
def main(args, devices):
    # load graph data
    ogb_dataset = False
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    elif args.dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=args.dataset)
        ogb_dataset = True
    else:
        raise ValueError()

    if ogb_dataset is True:
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-'+etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
        test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

        # AIFB, MUTAG, BGS and AM datasets do not provide validation set split.
        # Split train set into train and validation if args.validation is set
        # otherwise use train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data) == 0 or args.node_feats is False:
            node_feats.append(hg.number_of_nodes(ntype))
        else:
            assert len(hg.nodes[ntype].data) == 1
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
        print('{}:{}'.format(i, ntype))

    g = dgl.to_homogeneous(hg)
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['ntype'].share_memory_()
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.edata['etype'].share_memory_()
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['type_id'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()
    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()

    n_gpus = len(devices)
    n_cpus = mp.cpu_count()
    # cpu
    if devices[0] == -1:
        run(0, 0, n_cpus, args, ['cpu'],
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             train_idx, val_idx, test_idx, labels), None, None)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, n_cpus, args, devices,
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
            train_idx, val_idx, test_idx, labels), None, None)
    # multi gpu
    else:
        queue = mp.Queue(n_gpus)
        procs = []
        num_train_seeds = train_idx.shape[0]
        num_valid_seeds = val_idx.shape[0]
        num_test_seeds = test_idx.shape[0]
        train_seeds = th.randperm(num_train_seeds)
        valid_seeds = th.randperm(num_valid_seeds)
        test_seeds = th.randperm(num_test_seeds)
        tseeds_per_proc = num_train_seeds // n_gpus
        vseeds_per_proc = num_valid_seeds // n_gpus
        tstseeds_per_proc = num_test_seeds // n_gpus
        for proc_id in range(n_gpus):
            # we have multi-gpu for training, evaluation and testing
            # so split the train set, valid set and test set into num-of-gpu parts.
            proc_train_seeds = train_seeds[proc_id * tseeds_per_proc :
                                           (proc_id + 1) * tseeds_per_proc \
                                           if (proc_id + 1) * tseeds_per_proc < num_train_seeds \
                                           else num_train_seeds]
            proc_valid_seeds = valid_seeds[proc_id * vseeds_per_proc :
                                           (proc_id + 1) * vseeds_per_proc \
                                           if (proc_id + 1) * vseeds_per_proc < num_valid_seeds \
                                           else num_valid_seeds]
            proc_test_seeds = test_seeds[proc_id * tstseeds_per_proc :
                                         (proc_id + 1) * tstseeds_per_proc \
                                         if (proc_id + 1) * tstseeds_per_proc < num_test_seeds \
                                         else num_test_seeds]
            p = mp.Process(target=run, args=(proc_id, n_gpus, n_cpus // n_gpus, args, devices,
                                             (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
                                             train_idx, val_idx, test_idx, labels),
                                             (proc_train_seeds, proc_valid_seeds, proc_test_seeds),
                                             queue))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
Example #17
def main(args):
    # load graph data
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    else:
        raise ValueError()

    # Load from hetero-graph
    hg = dataset[0]

    num_rels = len(hg.canonical_etypes)
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = hg.nodes[category].data.pop('train_mask')
    test_mask = hg.nodes[category].data.pop('test_mask')
    train_idx = torch.nonzero(train_mask).squeeze()
    test_idx = torch.nonzero(test_mask).squeeze()
    labels = hg.nodes[category].data.pop('labels')

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # calculate norm for each edge type and store in edge
    for canonical_etype in hg.canonical_etypes:
        u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
        _, inverse_index, count = torch.unique(v,
                                               return_inverse=True,
                                               return_counts=True)
        degrees = count[inverse_index]
        norm = torch.ones(eid.shape[0]).float() / degrees.float()
        norm = norm.unsqueeze(1)
        hg.edges[canonical_etype].data['norm'] = norm

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i

    g = dgl.to_homogeneous(hg, edata=['norm'])
    num_nodes = g.number_of_nodes()
    node_ids = torch.arange(num_nodes)
    edge_norm = g.edata['norm']
    edge_type = g.edata[dgl.ETYPE].long()

    # find out the target node ids in g
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes)

    # check cuda
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(args.gpu)
        feats = feats.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        labels = labels.cuda()

    # create model
    model = EntityClassify(num_nodes,
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           use_cuda=use_cuda)

    if use_cuda:
        model.cuda()
        g = g.to('cuda:%d' % args.gpu)

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    model.train()
    for epoch in range(args.n_epochs):
        optimizer.zero_grad()
        t0 = time.time()
        logits = model(g, feats, edge_type, edge_norm)
        logits = logits[target_idx]
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        t1 = time.time()
        loss.backward()
        optimizer.step()
        t2 = time.time()

        forward_time.append(t1 - t0)
        backward_time.append(t2 - t1)
        print(
            "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
            .format(epoch, forward_time[-1], backward_time[-1]))
        train_acc = torch.sum(logits[train_idx].argmax(
            dim=1) == labels[train_idx]).item() / len(train_idx)
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_acc = torch.sum(logits[val_idx].argmax(
            dim=1) == labels[val_idx]).item() / len(val_idx)
        print(
            "Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}"
            .format(train_acc, loss.item(), val_acc, val_loss.item()))
    print()

    model.eval()
    logits = model.forward(g, feats, edge_type, edge_norm)
    logits = logits[target_idx]
    test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    test_acc = torch.sum(logits[test_idx].argmax(
        dim=1) == labels[test_idx]).item() / len(test_idx)
    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(
        test_acc, test_loss.item()))
    print()

    print("Mean forward time: {:4f}".format(
        np.mean(forward_time[len(forward_time) // 4:])))
    print("Mean backward time: {:4f}".format(
        np.mean(backward_time[len(backward_time) // 4:])))
Example #18
def test_edge_softmax(g, norm_by, idtype):
    print("params", norm_by, idtype)

    g = create_test_heterograph(idtype)

    x1 = F.randn((g.num_edges('plays'), feat_size))
    x2 = F.randn((g.num_edges('follows'), feat_size))
    x3 = F.randn((g.num_edges('develops'), feat_size))
    x4 = F.randn((g.num_edges('wishes'), feat_size))

    F.attach_grad(F.clone(x1))
    F.attach_grad(F.clone(x2))
    F.attach_grad(F.clone(x3))
    F.attach_grad(F.clone(x4))

    g['plays'].edata['eid'] = x1
    g['follows'].edata['eid'] = x2
    g['develops'].edata['eid'] = x3
    g['wishes'].edata['eid'] = x4

    #################################################################
    #  edge_softmax() on homogeneous graph
    #################################################################

    with F.record_grad():
        hm_g = dgl.to_homogeneous(g)
        hm_x = F.cat((x3, x2, x1, x4), 0)
        hm_e = F.attach_grad(F.clone(hm_x))
        score_hm = edge_softmax(hm_g, hm_e, norm_by=norm_by)
        hm_g.edata['score'] = score_hm
        ht_g = dgl.to_heterogeneous(hm_g, g.ntypes, g.etypes)
        r1 = ht_g.edata['score'][('user', 'plays', 'game')]
        r2 = ht_g.edata['score'][('user', 'follows', 'user')]
        r3 = ht_g.edata['score'][('developer', 'develops', 'game')]
        r4 = ht_g.edata['score'][('user', 'wishes', 'game')]
        F.backward(F.reduce_sum(r1) + F.reduce_sum(r2))
        grad_edata_hm = F.grad(hm_e)

    #################################################################
    #  edge_softmax() on heterogeneous graph
    #################################################################

    e1 = F.attach_grad(F.clone(x1))
    e2 = F.attach_grad(F.clone(x2))
    e3 = F.attach_grad(F.clone(x3))
    e4 = F.attach_grad(F.clone(x4))
    e = {
        ('user', 'follows', 'user'): e2,
        ('user', 'plays', 'game'): e1,
        ('user', 'wishes', 'game'): e4,
        ('developer', 'develops', 'game'): e3
    }
    with F.record_grad():
        score = edge_softmax(g, e, norm_by=norm_by)
        r5 = score[('user', 'plays', 'game')]
        r6 = score[('user', 'follows', 'user')]
        r7 = score[('developer', 'develops', 'game')]
        r8 = score[('user', 'wishes', 'game')]
        F.backward(F.reduce_sum(r5) + F.reduce_sum(r6))
        grad_edata_ht = F.cat((F.grad(e3), F.grad(e2), F.grad(e1), F.grad(e4)),
                              0)
        # correctness check
        assert F.allclose(r1, r5)
        assert F.allclose(r2, r6)
        assert F.allclose(r3, r7)
        assert F.allclose(r4, r8)
        assert F.allclose(grad_edata_hm, grad_edata_ht)
Example #19
def main(args, devices):
    # load graph data
    ogb_dataset = False
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    elif args.dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=args.dataset)
        ogb_dataset = True
    else:
        raise ValueError()

    if ogb_dataset is True:
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
        test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

        # AIFB, MUTAG, BGS and AM datasets do not provide validation set split.
        # Split train set into train and validation if args.validation is set
        # otherwise use train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data) == 0 or args.node_feats is False:
            node_feats.append(hg.number_of_nodes(ntype))
        else:
            assert len(hg.nodes[ntype].data) == 1
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
        print('{}:{}'.format(i, ntype))

    g = dgl.to_homogeneous(hg)
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['ntype'].share_memory_()
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.edata['etype'].share_memory_()
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['type_id'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()

    # This is a graph with multiple node types, so we want a way to map
    # our target nodes from their global node numbering back to their
    # numbering within their type. This is used when taking the nodes in a
    # mini-batch and looking up their type-specific labels.
    inv_target = th.empty(node_ids.shape, dtype=node_ids.dtype)
    inv_target.share_memory_()
    inv_target[target_idx] = th.arange(0,
                                       target_idx.shape[0],
                                       dtype=inv_target.dtype)

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()

    n_gpus = len(devices)
    n_cpus = mp.cpu_count()
    # cpu
    if devices[0] == -1:
        run(0, 0, n_cpus, args, ['cpu'],
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             inv_target, train_idx, val_idx, test_idx, labels), None)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, n_cpus, args, devices,
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             inv_target, train_idx, val_idx, test_idx, labels), None)
    # multi gpu
    else:
        queue = mp.Queue(n_gpus)
        procs = []
        for proc_id in range(n_gpus):
            # We use distributed data parallel dataloader to handle the data
            # splitting
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, n_cpus // n_gpus, args,
                                 devices,
                                 (g, node_feats, num_of_ntype, num_classes,
                                  num_rels, target_idx, inv_target, train_idx,
                                  val_idx, test_idx, labels), queue))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
Example #20
def track_time(data):
    # args
    if data == 'aifb':
        num_bases = -1
        l2norm = 0.
    elif data == 'am':
        num_bases = 40
        l2norm = 5e-4
    else:
        raise ValueError()

    data = utils.process_data(data)
    device = utils.get_bench_device()
    num_epochs = 30

    g = data[0]

    num_rels = len(g.canonical_etypes)
    category = data.predict_category
    num_classes = data.num_classes
    train_mask = g.nodes[category].data.pop('train_mask').bool().to(device)
    test_mask = g.nodes[category].data.pop('test_mask').bool().to(device)
    labels = g.nodes[category].data.pop('labels').to(device)

    # calculate norm for each edge type and store in edge
    for canonical_etype in g.canonical_etypes:
        u, v, eid = g.all_edges(form='all', etype=canonical_etype)
        _, inverse_index, count = torch.unique(v,
                                               return_inverse=True,
                                               return_counts=True)
        degrees = count[inverse_index]
        norm = 1. / degrees.float()
        norm = norm.unsqueeze(1)
        g.edges[canonical_etype].data['norm'] = norm

    # get target category id
    category_id = len(g.ntypes)
    for i, ntype in enumerate(g.ntypes):
        if ntype == category:
            category_id = i

    g = dgl.to_homogeneous(g, edata=['norm']).to(device)
    num_nodes = g.number_of_nodes()
    edge_norm = g.edata['norm']
    edge_type = g.edata[dgl.ETYPE].long()

    # find out the target node ids in g
    target_idx = torch.where(g.ndata[dgl.NTYPE] == category_id)[0]
    train_idx = target_idx[train_mask]
    test_idx = target_idx[test_mask]
    train_labels = labels[train_mask]
    test_labels = labels[test_mask]

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes, device=device)

    # create model
    model = RGCN(num_nodes, 16, num_classes, num_rels, num_bases, 0,
                 0).to(device)

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-2,
                                 weight_decay=l2norm)

    model.train()
    t0 = time.time()
    for epoch in range(num_epochs):
        logits = model(g, feats, edge_type, edge_norm)
        loss = F.cross_entropy(logits[train_idx], train_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    t1 = time.time()

    return (t1 - t0) / num_epochs
Example #21
# Edge-type subgraph
eg = dgl.edge_type_subgraph(g, [('drug', 'interacts', 'drug'),
                                ('drug', 'treats', 'disease')])
print(eg, eg.nodes['drug'].data['hv'])

print('-----------------------------')
g = dgl.heterograph({
    ('drug', 'interacts', 'drug'): ([0, 1], [1, 2]),
    ('drug', 'treats', 'disease'): ([1], [2])
})
g.nodes['drug'].data['hv'] = nd.ones((3, 1))
g.nodes['disease'].data['hv'] = nd.ones((3, 1))
g.edges['interacts'].data['he'] = nd.zeros((2, 1))
g.edges['treats'].data['he'] = nd.zeros((1, 2))

hg = dgl.to_homogeneous(g)
print('hv' in hg.ndata)
# hg = dgl.to_homogeneous(g, edata=['he'])   # copying a feature requires that edges/nodes of every type share the same shape and dtype
hg = dgl.to_homogeneous(g, ndata=['hv'])
print(hg.ndata['hv'], g.ntypes)
print(hg.ndata[dgl.NTYPE], hg.ndata[dgl.NID])
print(g.etypes, hg.edata[dgl.ETYPE], hg.edata[dgl.EID])
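A hedged follow-up to the commented-out edata line above: once every edge type stores 'he' with a matching shape and dtype, the feature can be carried over to the homogeneous graph.

g.edges['treats'].data['he'] = nd.zeros((1, 1))  # match the (num_edges, 1) layout used by 'interacts'
hg = dgl.to_homogeneous(g, edata=['he'])
print('he' in hg.edata)  # True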

g0 = dgl.heterograph({
    ('drug', 'interacts', 'drug'): ([0, 1], [1, 2]),
    ('drug', 'interacts', 'gene'): ([0, 1], [2, 3]),
    ('drug', 'treats', 'disease'): ([1], [2])
})
sub_g = dgl.edge_type_subgraph(g0, [('drug', 'interacts', 'drug'),
                                    ('drug', 'interacts', 'gene')])
h_sub_g = dgl.to_homogeneous(sub_g)
Example #22
def main(args, devices):
    # load graph data
    ogb_dataset = False
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    elif args.dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=args.dataset)
        ogb_dataset = True
    else:
        raise ValueError()

    if ogb_dataset is True:
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of class: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

        if args.node_feats:
            node_feats = []
            for ntype in hg.ntypes:
                if len(hg.nodes[ntype].data) == 0:
                    node_feats.append(None)
                else:
                    assert len(hg.nodes[ntype].data) == 1
                    feat = hg.nodes[ntype].data.pop('feat')
                    node_feats.append(feat.share_memory_())
        else:
            node_feats = [None] * num_of_ntype
    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask).squeeze()
        test_idx = th.nonzero(test_mask).squeeze()
        node_feats = [None] * num_of_ntype

        # AIFB, MUTAG, BGS and AM datasets do not provide validation set split.
        # Split train set into train and validation if args.validation is set
        # otherwise use train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    # calculate norm for each edge type and store in edge
    if args.global_norm is False:
        for canonical_etype in hg.canonical_etypes:
            u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
            _, inverse_index, count = th.unique(v,
                                                return_inverse=True,
                                                return_counts=True)
            degrees = count[inverse_index]
            norm = th.ones(eid.shape[0]) / degrees
            norm = norm.unsqueeze(1)
            hg.edges[canonical_etype].data['norm'] = norm

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i

    g = dgl.to_homogeneous(hg, edata=['norm'])
    if args.global_norm:
        u, v, eid = g.all_edges(form='all')
        _, inverse_index, count = th.unique(v,
                                            return_inverse=True,
                                            return_counts=True)
        degrees = count[inverse_index]
        norm = th.ones(eid.shape[0]) / degrees
        norm = norm.unsqueeze(1)
        g.edata['norm'] = norm

    g.ndata[dgl.NTYPE].share_memory_()
    g.edata[dgl.ETYPE].share_memory_()
    g.edata['norm'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()

    n_gpus = len(devices)
    # cpu
    if devices[0] == -1:
        run(0, 0, args, ['cpu'],
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             train_idx, val_idx, test_idx, labels), None, None)
    # gpu
    elif n_gpus == 1:
        run(0, n_gpus, args, devices,
            (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
             train_idx, val_idx, test_idx, labels), None, None)
    # multi gpu
    else:
        queue = mp.Queue(n_gpus)
        procs = []
        num_train_seeds = train_idx.shape[0]
        num_valid_seeds = val_idx.shape[0]
        num_test_seeds = test_idx.shape[0]
        train_seeds = th.randperm(num_train_seeds)
        valid_seeds = th.randperm(num_valid_seeds)
        test_seeds = th.randperm(num_test_seeds)
        tseeds_per_proc = num_train_seeds // n_gpus
        vseeds_per_proc = num_valid_seeds // n_gpus
        tstseeds_per_proc = num_test_seeds // n_gpus
        for proc_id in range(n_gpus):
            # We use multiple GPUs for training, evaluation and testing,
            # so split the train, valid and test sets into one part per GPU.
            proc_train_seeds = train_seeds[proc_id * tseeds_per_proc :
                                           (proc_id + 1) * tseeds_per_proc \
                                           if (proc_id + 1) * tseeds_per_proc < num_train_seeds \
                                           else num_train_seeds]
            proc_valid_seeds = valid_seeds[proc_id * vseeds_per_proc :
                                           (proc_id + 1) * vseeds_per_proc \
                                           if (proc_id + 1) * vseeds_per_proc < num_valid_seeds \
                                           else num_valid_seeds]
            proc_test_seeds = test_seeds[proc_id * tstseeds_per_proc :
                                         (proc_id + 1) * tstseeds_per_proc \
                                         if (proc_id + 1) * tstseeds_per_proc < num_test_seeds \
                                         else num_test_seeds]
            p = mp.Process(target=run,
                           args=(proc_id, n_gpus, args, devices,
                                 (g, node_feats, num_of_ntype, num_classes,
                                  num_rels, target_idx, train_idx, val_idx,
                                  test_idx, labels), (proc_train_seeds,
                                                      proc_valid_seeds,
                                                      proc_test_seeds), queue))
            p.start()
            procs.append(p)
        for p in procs:
            p.join()
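
The pattern above (compute a per-edge-type norm, carry it through dgl.to_homogeneous(hg, edata=['norm']), then locate the target nodes via dgl.NTYPE) can be reproduced on a toy heterograph. The sketch below is illustrative only; the node and edge types are made up and are not part of the example above.

import dgl
import torch as th

toy_hg = dgl.heterograph({
    ('user', 'follows', 'user'): ([0, 1], [1, 2]),
    ('user', 'plays', 'game'): ([0, 1, 2], [0, 0, 1]),
})
# Per-edge-type norm: 1 / in-degree of the destination node within this edge type.
for canonical_etype in toy_hg.canonical_etypes:
    _, v, eid = toy_hg.all_edges(form='all', etype=canonical_etype)
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    toy_hg.edges[canonical_etype].data['norm'] = (1.0 / count[inverse_index]).unsqueeze(1)

toy_g = dgl.to_homogeneous(toy_hg, edata=['norm'])     # 'norm' is carried over per edge
target_id = toy_hg.get_ntype_id('game')
target_idx = th.nonzero(toy_g.ndata[dgl.NTYPE] == target_id, as_tuple=False).squeeze()
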
    def forward(self, **kwargs):
        """
        :param input_ids: shape: [batch_size, max_seq_length (,1)]. e.g. [101 16068 1551 131 11253 10785 7637 3348 113 1286 114 1105 19734 1123 1493 113 1268 114 1112 1131 4927 1123 1159 1113 1103 2037 1437 1114 1123 3235 137 1282 14507 2636 102 1650 3696 9255 153 2591 13360 6258 3048 10069 131 5187 131 3927 142 9272 117 1367 1347 1381 197 19753 11392 12880 2137 131 1367 131 1512 142 9272 117 1367 1347 1381 11253 10785 7637 1144 3090 1131 1110 7805 1123 1148 2027 1114 20497 1389 27891 1667 11247 119 1109 3081 118 1214 118 1385 2851 117 1150 1640 1144 1300 1482 1121 2166 6085 117 1163 1107 1126 3669 1113 1109 4258 157 18963 7317 2737 3237 1115 1131 1110 17278 1106 1129 20028 1330 1901 1106 1123 9304 13465 119 1153 1163 131 112 1284 787 1396 1198 1276 1149 1195 787 1231 1515 170 2963 118 146 787 182 1210 1808 6391 119 146 1138 2094 1105 170 2963 1107 1139 7413 117 1103 1436 2053 1107 1103 1362 117 170 1632 2261 1105 146 787 182 170 1304 6918 1873 119 146 787 182 1304 9473 119 112 137 13426 11253 117 3081 117 1110 1210 1808 6391 1114 1123 3049 2963 137 13426 18662 18284 5208 2483 1163 1131 5115 1176 112 170 1304 6918 1873 112 137 13426 11253 1105 1393 4896 1591 1667 1508 1147 4655 1113 2080 1165 1131 1108 3332 19004 1111 170 1248 1159 1171 1107 1351 102]
        :param attention_mask: [batch_size, max_seq_length(, 1)]. e.g. [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
        :param kwargs (optional input):
            start_positions: [batch_size(,1)]
            end_positions: [batch_size (,1)]
            token_type_ids: [batch_size, max_seq_length(, 1)]. e.g. [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
            wordnet_concept_ids: [batch_size, max_seq_length, max_wn_length]. e.g. [[0,0,0,0,0],[0,1,0,0,0],[92,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0],[0,0,0,0,0]]
            nell_concept_ids: [batch_size, max_seq_length, max_nell_length]. e.g. 0:[] 1:[] 2:[] 3:[] 4:[19, 92, 255] 5:[19, 92, 255] 6:[19, 92, 255] 7:[] 8:[] 9:[] 10:[] 11:[] 12:[] 13:[] 14:[] 15:[] 16:[] 17:[] 18:[] 19:[] 20:[] 21:[] 22:[] 23:[] 24:[] 25:[] 26:[] 27:[] 28:[] 29:[] 30:[] 31:[] 32:[] 33:[] 34:[] 35:[] 36:[] 37:[] 38:[] 39:[] 40:[] 41:[] 42:[] 43:[] 44:[] 45:[] 46:[] 47:[] 48:[] 49:[] 50:[] 51:[] 52:[] 53:[] 54:[] 55:[] 56:[] 57:[] 58:[] 59:[] 60:[] 61:[] 62:[] 63:[] 64:[] 65:[] 66:[] 67:[] 68:[] 69:[19, 92, 255] 70:[19, 92, 255] 71:[19, 92, 255] 72:[] 73:[] 74:[] 75:[] 76:[] 77:[] 78:[] 79:[] 80:[] 81:[] 82:[] 83:[] 84:[] 85:[] 86:[] 87:[] 88:[] 89:[] 90:[] 91:[] 92:[] 93:[] 94:[] 95:[] 96:[] 97:[] 98:[] 99:[] 100:[] 101:[] 102:[] 103:[] 104:[] 105:[] 106:[] 107:[] 108:[] 109:[] 110:[] 111:[] 112:[] 113:[] 114:[] 115:[] 116:[] 117:[] 118:[] 119:[] 120:[] 121:[] 122:[] 123:[] 124:[] 125:[] 126:[] 127:[] 128:[] 129:[] 130:[] 131:[] 132:[] 133:[] 134:[] 135:[] 136:[] 137:[] 138:[] 139:[] 140:[] 141:[] 142:[] 143:[] 144:[] 145:[] 146:[] 147:[] 148:[] 149:[] 150:[] 151:[] 152:[] 153:[] 154:[] 155:[] 156:[] 157:[] 158:[] 159:[] 160:[] 161:[] 162:[] 163:[] 164:[] 165:[] 166:[] 167:[] 168:[] 169:[] 170:[] 171:[] 172:[] 173:[] 174:[] 175:[] 176:[] 177:[] 178:[] 179:[] 180:[] 181:[] 182:[] 183:[] 184:[] 185:[] 186:[] 187:[] 188:[] 189:[] 190:[] 191:[] 192:[50, 239] 193:[] 194:[] 195:[] 196:[] 197:[] 198:[] 199:[] 200:[] 201:[] 202:[] 203:[] 204:[] 205:[] 206:[] 207:[] 208:[] 209:[] 210:[] 211:[] 212:[] 213:[] 214:[] 215:[] 216:[] 217:[] 218:[] 219:[] 220:[] 221:[] 222:[50, 239] 223:[] 224:[] 225:[] 226:[] 227:[138, 91] 228:[] 229:[] 230:[] 231:[] 232:[] 233:[] 234:[] 235:[] 236:[] 237:[] 238:[] 239:[] 240:[] 241:[] 242:[] 243:[] 244:[] 245:[]
        :return:
        """
        # start_forward_time = time()
        label_ids_list = kwargs.get("label_ids")
        input_ids_list = kwargs.get("input_ids")
        # logger.info("rank:{}".format(input_ids.device))
        attention_mask_list = kwargs.get("attention_mask")
        token_type_ids_list = kwargs.get("token_type_ids")
        batch_synset_graphs_id_list = kwargs.get("batch_synset_graphs")
        wn_synset_graphs_list = kwargs.get("wn_synset_graphs")
        choice_score_list = []

        for num in range(2):
            label_ids = label_ids_list
            input_ids = input_ids_list[:, num, :]
            # logger.info("rank:{}".format(input_ids.device))
            attention_mask = attention_mask_list[:, num, :]
            if self.config.text_embed_model == "bert":
                token_type_ids = token_type_ids_list[:, num, :]
            elif self.config.text_embed_model == "roberta" or self.config.text_embed_model == "roberta_base":
                token_type_ids = None
            batch_synset_graphs_id = batch_synset_graphs_id_list
            wn_synset_graphs = wn_synset_graphs_list[num]

            batch_synset_graphs = [wn_synset_graphs[i] for i in batch_synset_graphs_id]
            batch_context_graphs_list = []
            batch_wn_graphs_list = []
            batch_entity2token_graphs_list = []
            batch_entity2token_graphs_nell_list = []

            token_length_list = []

            if self.config.text_embed_model == "bert":
                text_output = self.text_embed_model(
                    input_ids,
                    attention_mask=attention_mask,
                    token_type_ids=token_type_ids,
                    output_attentions=self.config.output_attentions,
                    output_hidden_states=self.config.output_hidden_states
                )[0]
            elif self.config.text_embed_model == "roberta" or self.config.text_embed_model == "roberta_base":
                text_output = self.text_embed_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )[0]
            relation_list = self.config.relation_list

            inverse_relation_list = []
            # node_type in origin graph
            id_type_list = []
            context_type_list = []
            for i, relation_type in enumerate(relation_list):
                inverse_relation_list.append("{}_".format(relation_type))

                id_type = "wn{}_id".format(relation_type)
                id_type_list.append(id_type)

                context_type = "wn{}_context".format(relation_type)
                context_type_list.append(context_type)

            # start_time = time()
            for i, g in enumerate(batch_synset_graphs):
                assert (len(g.nodes("token_id")) == torch.sum(attention_mask[i, :]))
                token_length_list.append(len(g.nodes("token_id")))

                # reconstruct context graph
                context_g, wn_g = self.reconstruct_dgl_graph(g, relation_list, inverse_relation_list,
                                                             id_type_list, context_type_list,
                                                             text_output[i, :, :], input_ids.device)

                entity2token_graph, entity2token_graph_nell = self.construct_entity2token_graph(i, g, text_output, input_ids.device)

                batch_entity2token_graphs_list.append(entity2token_graph)
                batch_entity2token_graphs_nell_list.append(entity2token_graph_nell)
                batch_context_graphs_list.append(context_g)
                batch_wn_graphs_list.append(wn_g)

            batch_context_graphs_dgl = dgl.batch(batch_context_graphs_list)
            graph_context_embedding = self.rgcn_context(batch_context_graphs_dgl, batch_context_graphs_dgl.ndata['feature'])
            batch_context_graphs_dgl.nodes["wn_concept_context"].data["feature"] = graph_context_embedding[
                "wn_concept_context"]
            # batch_context_graphs_dgl.nodes["wn_concept_context"].data["feature_project"] = self.bert_projected_token_ids(
            #     graph_context_embedding["wn_concept_context"])
            batch_context_graphs_list = dgl.unbatch(batch_context_graphs_dgl)

            batch_wn_graphs_dgl = dgl.batch(batch_wn_graphs_list)
            graph_wn_embedding = self.rgcn_wn(batch_wn_graphs_dgl, batch_wn_graphs_dgl.ndata['feature'])
            batch_wn_graphs_dgl.nodes["wn_concept_id"].data["feature"] = graph_wn_embedding["wn_concept_id"]
            batch_wn_graphs_list = dgl.unbatch(batch_wn_graphs_dgl)

            memory_output_new = text_output
            # batch_entity2token_graphs_list_homo_s = []
            context_embed_new = torch.zeros(
                (memory_output_new.shape[0], memory_output_new.shape[1], self.concept_embed_size),
                dtype=torch.float32, device=input_ids.device)
            concept_embed_new = torch.zeros(
                (memory_output_new.shape[0], memory_output_new.shape[1], self.concept_embed_size),
                dtype=torch.float32, device=input_ids.device)

            nell_embed_new = torch.zeros(
                (memory_output_new.shape[0], memory_output_new.shape[1], self.concept_embed_size),
                dtype=torch.float32, device=input_ids.device)

            # start_time = time()
            for idx, g_e2t in enumerate(batch_entity2token_graphs_list):
                g_e2t.nodes["wn_concept_id"].data["context_feature"] = batch_context_graphs_list[idx].nodes["wn_concept_context"].data["feature"]
                # logger.info("idx {}: {}".format(idx, g_e2t.nodes["wn_concept_id"].data["context_feature"]))
                g_e2t.nodes["wn_concept_id"].data["id_feature"] = batch_wn_graphs_list[idx].nodes["wn_concept_id"].data["feature"]
                g_e2t.nodes["token_id"].data["id_feature"] = self.projected_token_text(g_e2t.nodes["token_id"].data["context_feature"])
                g_e2t.nodes["sentinel_id"].data["id_feature"] = torch.zeros_like(g_e2t.nodes["token_id"].data["id_feature"], device=input_ids.device)
                g_e2t.nodes["sentinel_id"].data["context_feature"] = torch.zeros_like(g_e2t.nodes["token_id"].data["context_feature"], device=input_ids.device)

                g_e2t_homo = dgl.to_homogeneous(g_e2t, ndata=['id_feature', 'context_feature'])
                g_e2t_homo.ndata['context_feature'] = self.gat_context(g_e2t_homo,
                                 g_e2t_homo.ndata['context_feature'])
                g_e2t_homo.ndata['id_feature'] = self.gat_wn(g_e2t_homo, g_e2t_homo.ndata['id_feature'])
                tmp_graph = dgl.to_heterogeneous(g_e2t_homo, g_e2t.ntypes, g_e2t.etypes)
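                # dgl.NID (added by the to_homogeneous / to_heterogeneous round trip above) stores
                # each node's ID in the homogeneous graph; argsorting the token nodes by it restores
                # the original token sequence order before the features are written back into the
                # [batch, seq, dim] tensors below.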

                tmp_argsort = torch.argsort(tmp_graph.ndata[dgl.NID]["token_id"] - tmp_graph.num_nodes("sentinel_id"))
                concept_embed_new[idx, :tmp_graph.num_nodes("token_id"), :] = tmp_graph.nodes["token_id"].data[
                    "id_feature"].index_select(0, tmp_argsort)
                context_embed_new[idx, :tmp_graph.num_nodes("token_id"), :] = tmp_graph.nodes["token_id"].data[
                    "context_feature"].index_select(0, tmp_argsort)

                g_e2t_nell = batch_entity2token_graphs_nell_list[idx]
                g_e2t_nell_homo = dgl.to_homogeneous(g_e2t_nell, ndata=['id_feature'])
                g_e2t_nell_homo.ndata['id_feature'] = self.gat_nell(g_e2t_nell_homo, g_e2t_nell_homo.ndata['id_feature'])
                tmp_graph_nell = dgl.to_heterogeneous(g_e2t_nell_homo, g_e2t_nell.ntypes, g_e2t_nell.etypes)
                nell_tmp_argsort = torch.argsort(tmp_graph_nell.ndata[dgl.NID]["token_id"] - tmp_graph_nell.num_nodes("sentinel_id") - tmp_graph_nell.num_nodes("nell_concept_id"))
                nell_embed_new[idx, :tmp_graph_nell.num_nodes("token_id"), :] = tmp_graph_nell.nodes["token_id"].data["id_feature"].index_select(0, nell_tmp_argsort)
            # # logger.info("time for one by one: {}".format(time() - start_time))

            if self.use_nell:
                memory_output_new = torch.cat((memory_output_new, nell_embed_new), 2)
            if self.use_context_graph and self.use_wn:
                k_memory = torch.cat((concept_embed_new, context_embed_new), 2)
            elif self.use_wn:
                k_memory = concept_embed_new
            elif self.use_context_graph:
                k_memory = context_embed_new
            if self.use_context_graph or self.use_wn:
                memory_output_new = torch.cat((memory_output_new, k_memory), 2)


            att_output = self.self_matching(memory_output_new,
                                            attention_mask.unsqueeze(2))  # [batch_size, max_seq_length, memory_output_size]
            # 4th layer: output layer
            choice_score = self.qa_kt_outputs(att_output[:,0,:])
            choice_score_list.append(choice_score)

        logits = torch.cat(
            [choice_score.unsqueeze(1).squeeze(-1) for choice_score in choice_score_list], dim=1
        )

        if label_ids[0] != -1:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, 2), label_ids.view(-1))
        else:
            loss = None

        # logger.info("time for forward: {}".format(time()-start_forward_time))
        return loss, logits, label_ids, kwargs.get("qas_ids")
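
A minimal sketch of the homograph round trip used in the loop above: convert a heterograph with per-type features to a homogeneous graph, run a homogeneous module on it, then convert back and read the updated features per node type. The toy graph, feature sizes, and the GraphConv module below are illustrative placeholders, not the modules used in this model.

import dgl
import torch

toy = dgl.heterograph({('token_id', 'links', 'wn_concept_id'): ([0, 1, 2], [0, 0, 1])})
toy.nodes['token_id'].data['id_feature'] = torch.randn(3, 8)
toy.nodes['wn_concept_id'].data['id_feature'] = torch.randn(2, 8)

homo = dgl.to_homogeneous(toy, ndata=['id_feature'])
conv = dgl.nn.GraphConv(8, 8, allow_zero_in_degree=True)   # stand-in for gat_wn / gat_context
homo.ndata['id_feature'] = conv(homo, homo.ndata['id_feature'])

back = dgl.to_heterogeneous(homo, toy.ntypes, toy.etypes)
token_feat = back.nodes['token_id'].data['id_feature']     # updated per-type features
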
Example #24
def test_sample_neighbors_etype_homogeneous(format_, direction, replace):
    num_nodes = 100
    rare_cnt = 4
    g = create_etype_test_graph(100, 30, rare_cnt)
    h_g = dgl.to_homogeneous(g)
    seed_ntype = g.get_ntype_id("u")
    seeds = F.nonzero_1d(h_g.ndata[dgl.NTYPE] == seed_ntype)
    fanouts = F.tensor([6, 5, 4, 3, 2], dtype=F.int64)

    def check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction):
        src, dst = subg.edges()
        num_etypes = F.asnumpy(h_g.edata[dgl.ETYPE]).max()
        etype_array = F.asnumpy(subg.edata[dgl.ETYPE])
        src = F.asnumpy(src)
        dst = F.asnumpy(dst)
        fanouts = F.asnumpy(fanouts)

        all_etype_array = F.asnumpy(h_g.edata[dgl.ETYPE])
        all_src = F.asnumpy(all_src)
        all_dst = F.asnumpy(all_dst)

        src_per_etype = []
        dst_per_etype = []
        for etype in range(num_etypes):
            src_per_etype.append(src[etype_array == etype])
            dst_per_etype.append(dst[etype_array == etype])

        if replace:
            if direction == 'in':
                in_degree_per_etype = [np.bincount(d) for d in dst_per_etype]
                for in_degree, fanout in zip(in_degree_per_etype, fanouts):
                    assert np.all(in_degree == fanout)
            else:
                out_degree_per_etype = [np.bincount(s) for s in src_per_etype]
                for out_degree, fanout in zip(out_degree_per_etype, fanouts):
                    assert np.all(out_degree == fanout)
        else:
            if direction == 'in':
                for v in set(dst):
                    u = src[dst == v]
                    et = etype_array[dst == v]
                    all_u = all_src[all_dst == v]
                    all_et = all_etype_array[all_dst == v]
                    for etype in set(et):
                        u_etype = set(u[et == etype])
                        all_u_etype = set(all_u[all_et == etype])
                        assert (len(u_etype) == fanouts[etype]) or (u_etype == all_u_etype)
            else:
                for u in set(src):
                    v = dst[src == u]
                    et = etype_array[src == u]
                    all_v = all_dst[all_src == u]
                    all_et = all_etype_array[all_src == u]
                    for etype in set(et):
                        v_etype = set(v[et == etype])
                        all_v_etype = set(all_v[all_et == etype])
                        assert (len(v_etype) == fanouts[etype]) or (v_etype == all_v_etype)

    all_src, all_dst = h_g.edges()
    h_g = h_g.formats(format_)
    if (direction, format_) in [('in', 'csr'), ('out', 'csc')]:
        h_g = h_g.formats(['csc', 'csr', 'coo'])
    for _ in range(5):
        subg = dgl.sampling.sample_etype_neighbors(
            h_g, seeds, dgl.ETYPE, fanouts, replace=replace, edge_dir=direction)
        check_num(h_g, all_src, all_dst, subg, replace, fanouts, direction)
Example #25
def track_time(data, low_mem, dgl_sparse):
    # load graph data
    dataset = utils.process_data(data)
    args = config()
    devices = [0, 1, 2, 3]
    args.low_mem = low_mem
    args.dgl_sparse = dgl_sparse
    args.dataset = dataset
    ogb_dataset = False

    if data == 'am':
        args.n_bases = 40
        args.l2norm = 5e-4
    elif data == 'ogbn-mag':
        args.n_bases = 2
        args.l2norm = 0
    else:
        raise ValueError()

    if ogb_dataset is True:
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-'+etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        if args.dataset == 'ogbn-mag':
            category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of classes: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

    else:
        # Load from hetero-graph
        hg = dataset[0]

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        category = dataset.predict_category
        num_classes = dataset.num_classes
        train_mask = hg.nodes[category].data.pop('train_mask')
        test_mask = hg.nodes[category].data.pop('test_mask')
        labels = hg.nodes[category].data.pop('labels')
        train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
        test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

        # The AIFB, MUTAG, BGS and AM datasets do not provide a validation split.
        # Split the train set into train and validation sets if args.validation is set;
        # otherwise use the train set as the validation set.
        if args.validation:
            val_idx = train_idx[:len(train_idx) // 5]
            train_idx = train_idx[len(train_idx) // 5:]
        else:
            val_idx = train_idx

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data) == 0 or args.node_feats is False:
            node_feats.append(hg.number_of_nodes(ntype))
        else:
            assert len(hg.nodes[ntype].data) == 1
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
        print('{}:{}'.format(i, ntype))

    g = dgl.to_homogeneous(hg)
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]
    g.ndata['ntype'].share_memory_()
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.edata['etype'].share_memory_()
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['type_id'].share_memory_()
    node_ids = th.arange(g.number_of_nodes())

    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_idx = node_ids[loc]
    target_idx.share_memory_()
    train_idx.share_memory_()
    val_idx.share_memory_()
    test_idx.share_memory_()
    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()

    n_gpus = len(devices)
    n_cpus = mp.cpu_count()

    ctx = mp.get_context('fork')
    queue = ctx.Queue()
    procs = []
    num_train_seeds = train_idx.shape[0]
    num_valid_seeds = val_idx.shape[0]
    num_test_seeds = test_idx.shape[0]
    train_seeds = th.randperm(num_train_seeds)
    valid_seeds = th.randperm(num_valid_seeds)
    test_seeds = th.randperm(num_test_seeds)
    tseeds_per_proc = num_train_seeds // n_gpus
    vseeds_per_proc = num_valid_seeds // n_gpus
    tstseeds_per_proc = num_test_seeds // n_gpus

    for proc_id in range(n_gpus):
        # We use multiple GPUs for training, evaluation and testing,
        # so split the train, valid and test sets into one part per GPU.
        proc_train_seeds = train_seeds[proc_id * tseeds_per_proc:
                                       (proc_id + 1) * tseeds_per_proc
                                       if (proc_id + 1) * tseeds_per_proc < num_train_seeds
                                       else num_train_seeds]
        proc_valid_seeds = valid_seeds[proc_id * vseeds_per_proc:
                                       (proc_id + 1) * vseeds_per_proc
                                       if (proc_id + 1) * vseeds_per_proc < num_valid_seeds
                                       else num_valid_seeds]
        proc_test_seeds = test_seeds[proc_id * tstseeds_per_proc:
                                     (proc_id + 1) * tstseeds_per_proc
                                     if (proc_id + 1) * tstseeds_per_proc < num_test_seeds
                                     else num_test_seeds]

        p = ctx.Process(target=run, args=(proc_id, n_gpus, n_cpus // n_gpus, args, devices,
                                          (g, node_feats, num_of_ntype, num_classes, num_rels, target_idx,
                                           train_idx, val_idx, test_idx, labels),
                                          (proc_train_seeds,
                                           proc_valid_seeds, proc_test_seeds),
                                          queue))
        p.start()

        procs.append(p)
    for p in procs:
        p.join()
    time_records = queue.get(block=False)
    num_exclude = 10 # exclude first 10 iterations
    if len(time_records) < 15:
        # exclude less if less records
        num_exclude = int(len(time_records)*0.3)
    return np.mean(time_records[num_exclude:])
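
The per-process slicing of train/valid/test seeds in the example above can also be written with torch.chunk; the sketch below is only a rough equivalent (torch.chunk uses ceil-sized chunks, so the split differs slightly from the manual slicing when the number of seeds is not divisible by the number of GPUs).

import torch as th

n_gpus = 4
train_seeds = th.randperm(1000)
# One chunk per process, e.g. per_proc_seeds[proc_id] for process proc_id.
per_proc_seeds = list(th.chunk(train_seeds, n_gpus))
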
Example #26
hg_orig, labels = dataset[0]
subgs = {}
for etype in hg_orig.canonical_etypes:
    u, v = hg_orig.all_edges(etype=etype)
    subgs[etype] = (u, v)
    subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
hg = dgl.heterograph(subgs)
hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
print(hg)
#subg_nodes = {}
#for ntype in hg.ntypes:
#    subg_nodes[ntype] = np.random.choice(hg.number_of_nodes(ntype), int(hg.number_of_nodes(ntype) / 5), replace=False)
#hg = dgl.compact_graphs(dgl.node_subgraph(hg, subg_nodes))

# OGB-MAG is stored as a heterogeneous graph. We need to convert it into a homogeneous graph.
g = dgl.to_homogeneous(hg)
g.ndata['orig_id'] = g.ndata[dgl.NID]
g.edata['orig_id'] = g.edata[dgl.EID]
print('|V|=' + str(g.number_of_nodes()))
print('|E|=' + str(g.number_of_edges()))
print('|NTYPE|=' + str(len(th.unique(g.ndata[dgl.NTYPE]))))

# Store the metadata of nodes.
num_node_weights = 0
node_data = [g.ndata[dgl.NTYPE].numpy()]
for ntype_id in th.unique(g.ndata[dgl.NTYPE]):
    node_data.append((g.ndata[dgl.NTYPE] == ntype_id).numpy())
    num_node_weights += 1
node_data.append(g.ndata['orig_id'].numpy())
node_data = np.stack(node_data, 1)
np.savetxt('mag_nodes.txt', node_data, fmt='%d', delimiter=' ')
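
Because 'orig_id' stores each node's ID within its own type (copied from dgl.NID above), a homogeneous node can be mapped back to its original heterogeneous node. A small illustrative check using the variables defined above:

# Homogeneous node i of type t corresponds to node g.ndata['orig_id'][i]
# of type hg.ntypes[t] in the original heterograph.
i = 0
ntype_name = hg.ntypes[int(g.ndata[dgl.NTYPE][i])]
orig_id = int(g.ndata['orig_id'][i])
print('node 0 maps to', ntype_name, orig_id)
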
Example #27
def track_acc(data):
    dataset = utils.process_data(data)
    device = utils.get_bench_device()

    if data == 'am':
        n_bases = 40
        l2norm = 5e-4
    elif data == 'ogbn-mag':
        n_bases = 2
        l2norm = 0
    else:
        raise ValueError()

    fanouts = [25,15]
    n_layers = 2
    batch_size = 1024
    n_hidden = 64
    dropout = 0.5
    use_self_loop = True
    lr = 0.01
    n_epochs = 20
    low_mem = True
    num_workers = 4

    hg = dataset[0]
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = hg.nodes[category].data.pop('train_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    test_mask = hg.nodes[category].data.pop('test_mask')
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()
    labels = hg.nodes[category].data.pop('labels').to(device)
    num_of_ntype = len(hg.ntypes)
    num_rels = len(hg.canonical_etypes)

    node_feats = []
    for ntype in hg.ntypes:
        if len(hg.nodes[ntype].data) == 0 or 'feat' not in hg.nodes[ntype].data:
            node_feats.append(None)
        else:
            feat = hg.nodes[ntype].data.pop('feat')
            node_feats.append(feat.share_memory_())

    # get target category id
    category_id = len(hg.ntypes)
    for i, ntype in enumerate(hg.ntypes):
        if ntype == category:
            category_id = i
    g = dgl.to_homogeneous(hg)
    u, v, eid = g.all_edges(form='all')

    # global norm
    _, inverse_index, count = th.unique(v, return_inverse=True, return_counts=True)
    degrees = count[inverse_index]
    norm = th.ones(eid.shape[0]) / degrees
    norm = norm.unsqueeze(1)
    g.edata['norm'] = norm
    g.edata['etype'] = g.edata[dgl.ETYPE]
    g.ndata['type_id'] = g.ndata[dgl.NID]
    g.ndata['ntype'] = g.ndata[dgl.NTYPE]

    node_ids = th.arange(g.number_of_nodes())
    # find out the target node ids
    node_tids = g.ndata[dgl.NTYPE]
    loc = (node_tids == category_id)
    target_nids = node_ids[loc]
    train_nids = target_nids[train_idx]

    # Create csr/coo/csc formats before launching training processes with multi-gpu.
    # This avoids creating certain formats in each sub-process, which saves memory and CPU.
    g.create_formats_()
    sampler = dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    collator = dgl.dataloading.NodeCollator(g, train_nids, sampler, return_indices=True)
    loader = dgl.dataloading.DataLoader(
        collator.dataset, collate_fn=collator.collate,
        batch_size=batch_size, shuffle=True, num_workers=4)
    # test_sampler =  dgl.dataloading.MultiLayerNeighborSampler(fanouts)
    test_loader = DataLoader(dataset=test_idx.numpy(),
                             batch_size=batch_size,
                             collate_fn=collator.collate,
                             shuffle=False,
                             num_workers=4)

    # Node features:
    # None means the node type uses a one-hot / learnable embedding; otherwise the
    # entry should be the feature tensor for that node type.
    embed_layer = RelGraphEmbedLayer(device,
                                     g.number_of_nodes(),
                                     node_tids,
                                     num_of_ntype,
                                     node_feats,
                                     n_hidden,
                                     sparse_emb=True)

    # create model
    # all model params are in device.
    model = EntityClassify(device,
                           g.number_of_nodes(),
                           n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=n_bases,
                           num_hidden_layers=n_layers - 2,
                           dropout=dropout,
                           use_self_loop=use_self_loop,
                           low_mem=low_mem,
                           layer_norm=False)

    embed_layer = embed_layer.to(device)
    model = model.to(device)

    all_params = itertools.chain(model.parameters(), embed_layer.embeds.parameters())
    optimizer = th.optim.Adam(all_params, lr=lr, weight_decay=l2norm)
    emb_optimizer = th.optim.SparseAdam(list(embed_layer.node_embeds.parameters()), lr=lr)

    print("start training...")
    t0 = time.time()
    for epoch in range(n_epochs):
        model.train()
        embed_layer.train()

        for i, sample_data in enumerate(loader):
            input_nodes, output_nodes, seed_idx, blocks = sample_data
            feats = embed_layer(input_nodes,
                                blocks[0].srcdata['ntype'],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits = model(blocks, feats)
            loss = F.cross_entropy(logits, labels[train_idx][seed_idx])
            optimizer.zero_grad()
            emb_optimizer.zero_grad()

            loss.backward()
            optimizer.step()
            emb_optimizer.step()

    test_logits, test_seeds = evaluate(model, embed_layer, test_loader, node_feats)
    test_loss = F.cross_entropy(test_logits, labels[test_seeds].cpu()).item()
    test_acc = th.sum(test_logits.argmax(dim=1) == labels[test_seeds].cpu()).item() / len(test_seeds)
    t1 = time.time()
    return test_acc
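
The evaluate helper used above is not shown in this example. A plausible minimal sketch, assuming it simply mirrors the training loop in eval mode and returns the stacked logits together with the seed indices it saw:

def evaluate(model, embed_layer, test_loader, node_feats):
    # Sketch only; the real helper may differ.
    model.eval()
    embed_layer.eval()
    logits_list, seeds_list = [], []
    with th.no_grad():
        for input_nodes, output_nodes, seed_idx, blocks in test_loader:
            feats = embed_layer(input_nodes,
                                blocks[0].srcdata['ntype'],
                                blocks[0].srcdata['type_id'],
                                node_feats)
            logits_list.append(model(blocks, feats).cpu())
            seeds_list.append(seed_idx)
    return th.cat(logits_list), th.cat(seeds_list)
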
Example #28
def load_ogb(dataset, global_norm):
    if dataset == 'ogbn-mag':
        dataset = DglNodePropPredDataset(name=dataset)
        split_idx = dataset.get_idx_split()
        train_idx = split_idx["train"]['paper']
        val_idx = split_idx["valid"]['paper']
        test_idx = split_idx["test"]['paper']
        hg_orig, labels = dataset[0]
        subgs = {}
        for etype in hg_orig.canonical_etypes:
            u, v = hg_orig.all_edges(etype=etype)
            subgs[etype] = (u, v)
            subgs[(etype[2], 'rev-' + etype[1], etype[0])] = (v, u)
        hg = dgl.heterograph(subgs)
        hg.nodes['paper'].data['feat'] = hg_orig.nodes['paper'].data['feat']
        paper_labels = labels['paper'].squeeze()

        num_rels = len(hg.canonical_etypes)
        num_of_ntype = len(hg.ntypes)
        num_classes = dataset.num_classes
        category = 'paper'
        print('Number of relations: {}'.format(num_rels))
        print('Number of classes: {}'.format(num_classes))
        print('Number of train: {}'.format(len(train_idx)))
        print('Number of valid: {}'.format(len(val_idx)))
        print('Number of test: {}'.format(len(test_idx)))

        # Currently we do not support node features in the MAG dataset.
        # calculate norm for each edge type and store in edge
        if global_norm is False:
            for canonical_etype in hg.canonical_etypes:
                u, v, eid = hg.all_edges(form='all', etype=canonical_etype)
                _, inverse_index, count = th.unique(v,
                                                    return_inverse=True,
                                                    return_counts=True)
                degrees = count[inverse_index]
                norm = th.ones(eid.shape[0]) / degrees
                norm = norm.unsqueeze(1)
                hg.edges[canonical_etype].data['norm'] = norm

        # get target category id
        category_id = len(hg.ntypes)
        for i, ntype in enumerate(hg.ntypes):
            if ntype == category:
                category_id = i

        g = dgl.to_homogeneous(hg, edata=['norm'])
        if global_norm:
            u, v, eid = g.all_edges(form='all')
            _, inverse_index, count = th.unique(v,
                                                return_inverse=True,
                                                return_counts=True)
            degrees = count[inverse_index]
            norm = th.ones(eid.shape[0]) / degrees
            norm = norm.unsqueeze(1)
            g.edata['norm'] = norm

        node_ids = th.arange(g.number_of_nodes())
        # find out the target node ids
        node_tids = g.ndata[dgl.NTYPE]
        loc = (node_tids == category_id)
        target_idx = node_ids[loc]
        train_idx = target_idx[train_idx]
        val_idx = target_idx[val_idx]
        test_idx = target_idx[test_idx]
        train_mask = th.zeros((g.number_of_nodes(), ), dtype=th.bool)
        train_mask[train_idx] = True
        val_mask = th.zeros((g.number_of_nodes(), ), dtype=th.bool)
        val_mask[val_idx] = True
        test_mask = th.zeros((g.number_of_nodes(), ), dtype=th.bool)
        test_mask[test_idx] = True
        g.ndata['train_mask'] = train_mask
        g.ndata['val_mask'] = val_mask
        g.ndata['test_mask'] = test_mask

        labels = th.full((g.number_of_nodes(), ), -1, dtype=paper_labels.dtype)
        labels[target_idx] = paper_labels
        g.ndata['labels'] = labels
        return g
    else:
        raise ValueError("Only the ogbn-mag dataset is supported.")
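
A minimal usage sketch of the loader above (the variable names are illustrative):

g = load_ogb('ogbn-mag', global_norm=True)
train_idx = th.nonzero(g.ndata['train_mask'], as_tuple=False).squeeze()
val_idx = th.nonzero(g.ndata['val_mask'], as_tuple=False).squeeze()
labels = g.ndata['labels']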