def __getitem__(self, i):
    """Produce one link-prediction training sample.

    Draws a batch of positive edges, compacts their endpoints into
    consecutive node IDs, generates negative triplets, keeps a random
    half of the positive edges (plus inverse edges with offset relation
    types), and returns the resulting message-passing subgraph.

    Returns
    -------
    (sub_g, orig_ids, samples, labels)
        sub_g    : DGL graph over the compacted node IDs, with ETYPE and
                   'norm' edge features attached.
        orig_ids : (num_nodes, 1) long tensor mapping compact IDs back to
                   IDs in ``self.g``.
        samples, labels : positive/negative triplets and their labels
                   from the negative sampler.
    """
    pos_eids = self.pos_sampler.sample()
    u, v = self.g.find_edges(pos_eids)
    u, v = u.numpy(), v.numpy()
    etypes = self.g.edata[dgl.ETYPE][pos_eids].numpy()

    # Relabel endpoints to consecutive IDs: `flat` is src followed by
    # dst, re-expressed as indices into `orig_ids`.
    orig_ids, flat = np.unique((u, v), return_inverse=True)
    num_nodes = len(orig_ids)
    u, v = np.reshape(flat, (2, -1))

    # Negative sampling operates on the relabeled (src, rel, dst) rows.
    triplets = np.stack((u, etypes, v)).transpose()
    samples, labels = self.neg_sampler.sample(triplets, num_nodes)

    # Keep only a random half of the positive edges in the subgraph.
    keep = np.random.choice(np.arange(self.sample_size),
                            size=int(self.sample_size / 2),
                            replace=False)
    u, v, etypes = u[keep], v[keep], etypes[keep]

    # Add the inverse of every kept edge; inverse relations are offset
    # by num_rels so they remain distinct from the forward relations.
    u, v = np.concatenate((u, v)), np.concatenate((v, u))
    etypes = np.concatenate((etypes, etypes + self.num_rels))

    sub_g = dgl.graph((u, v), num_nodes=num_nodes)
    sub_g.edata[dgl.ETYPE] = th.from_numpy(etypes)
    sub_g.edata['norm'] = dgl.norm_by_dst(sub_g).unsqueeze(-1)
    orig_ids = th.from_numpy(orig_ids).view(-1, 1).long()
    return sub_g, orig_ids, samples, labels
def preprocess(g, num_rels):
    """Split ``g`` into a training graph and an evaluation graph.

    NOTE(review): both graphs are built from the *train* mask — this
    appears deliberate (evaluation scores triplets against messages
    passed over training edges only, with the test graph made
    bidirected), but confirm against ``get_subset_g``'s callers.
    """
    train_mask = g.edata['train_mask']
    train_g = get_subset_g(g, train_mask, num_rels)
    test_g = get_subset_g(g, train_mask, num_rels, bidirected=True)
    # Per-edge normalization weight (1 / destination in-degree).
    test_g.edata['norm'] = dgl.norm_by_dst(test_g).unsqueeze(-1)
    return train_g, test_g
def process_batch(inv_target, batch):
    """Prepare one dataloader minibatch for the model.

    Maps the seed (output) node IDs back to their type-specific IDs via
    ``inv_target`` so the target labels can be looked up, and attaches
    the 'norm' edge feature to every message-passing block.
    """
    _, output_nodes, blocks = batch
    # Translate homogeneous-graph seed IDs into type-specific indices.
    target_ids = inv_target[output_nodes]
    for block in blocks:
        block.edata['norm'] = dgl.norm_by_dst(block).unsqueeze(1)
    return target_ids, blocks
def load_data(data_name, get_norm=False, inv_target=False):
    """Load an RDF node-classification dataset as a homogeneous graph.

    Parameters
    ----------
    data_name : str
        One of 'aifb', 'mutag', 'bgs', 'am'.
    get_norm : bool
        If True, attach a per-edge 'norm' feature (1/d, where d is the
        in-degree of the destination node) before homogenizing.
    inv_target : bool
        If True, also return a tensor mapping global node IDs back to
        type-specific IDs of the target category (needed for label
        lookups in minibatch training).

    Returns
    -------
    (g, num_rels, num_classes, labels, train_idx, test_idx, target_idx)
    with ``inv_target`` appended when requested.

    Raises
    ------
    ValueError
        If ``data_name`` is not a recognized dataset name.
    """
    # Explicit dispatch: the original code silently fell back to
    # AMDataset for ANY unrecognized name, hiding typos — fail loudly.
    datasets = {
        'aifb': AIFBDataset,
        'mutag': MUTAGDataset,
        'bgs': BGSDataset,
        'am': AMDataset,
    }
    if data_name not in datasets:
        raise ValueError(
            f"Unknown dataset {data_name!r}; expected one of {sorted(datasets)}")
    dataset = datasets[data_name]()

    # Load hetero-graph
    hg = dataset[0]
    num_rels = len(hg.canonical_etypes)
    category = dataset.predict_category
    num_classes = dataset.num_classes
    labels = hg.nodes[category].data.pop('labels')
    train_mask = hg.nodes[category].data.pop('train_mask')
    test_mask = hg.nodes[category].data.pop('test_mask')
    train_idx = th.nonzero(train_mask, as_tuple=False).squeeze()
    test_idx = th.nonzero(test_mask, as_tuple=False).squeeze()

    if get_norm:
        # Calculate normalization weight for each edge:
        # 1. / d, d is the in-degree of the destination node
        for cetype in hg.canonical_etypes:
            hg.edges[cetype].data['norm'] = dgl.norm_by_dst(
                hg, cetype).unsqueeze(1)
        edata = ['norm']
    else:
        edata = None

    # get target category id
    category_id = hg.ntypes.index(category)

    g = dgl.to_homogeneous(hg, edata=edata)
    # Rename the fields as they can be changed by for example NodeDataLoader
    g.ndata['ntype'] = g.ndata.pop(dgl.NTYPE)
    g.ndata['type_id'] = g.ndata.pop(dgl.NID)
    node_ids = th.arange(g.num_nodes())

    # find out the target node ids in g
    loc = (g.ndata['ntype'] == category_id)
    target_idx = node_ids[loc]

    if inv_target:
        # Map global node IDs to type-specific node IDs. This is required
        # for looking up type-specific labels in a minibatch.
        inv_target = th.empty((g.num_nodes(),), dtype=th.int64)
        inv_target[target_idx] = th.arange(0, target_idx.shape[0],
                                           dtype=inv_target.dtype)
        return (g, num_rels, num_classes, labels, train_idx, test_idx,
                target_idx, inv_target)
    return g, num_rels, num_classes, labels, train_idx, test_idx, target_idx