def sample_frontier(self, block_id, g, seed_nodes):
    # Number of neighbors to sample for this GNN layer (None means full
    # neighborhood); self.fanouts lists one fanout per layer, starting from
    # the first layer.
    fanout = self.fanouts[block_id] if self.fanouts is not None else None
    g = dgl.in_subgraph(g, seed_nodes)
    # Drop edges that happen after the current timestamp.
    g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
    if self.args.valid_path:
        if block_id != self.args.n_layer - 1:
            g.dstdata['sample_time'] = self.frontiers[block_id + 1].srcdata['sample_time']
        g.apply_edges(self.sample_prob)
        g.remove_edges(torch.where(g.edata['timespan'] < 0)[0])
        g_re = dgl.reverse(g, copy_edata=True, copy_ndata=True)
        g_re.update_all(self.sample_time, fn.max('st', 'sample_time'))
        g = dgl.reverse(g_re, copy_edata=True, copy_ndata=True)
    if fanout is None:
        frontier = g
    else:
        if block_id == self.args.n_layer - 1:
            if self.args.bandit:
                frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout, prob='q_ij')
            else:
                frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
        else:
            frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
    self.frontiers[block_id] = frontier
    return frontier
def inference(self, g, x, batch_size, device):
    """
    Inference with the GraphSAGE model on full neighbors (i.e. without neighbor sampling).

    g : the entire graph.
    x : the input features of the entire node set.

    The inference code is written so that it can handle any number of nodes and layers.
    """
    # During inference with sampling, multi-layer blocks are very inefficient because
    # lots of computations in the first few layers are repeated.
    # Therefore, we compute the representation of all nodes layer by layer.  The nodes
    # on each layer are of course split into batches.
    # TODO: can we standardize this?
    nodes = th.arange(g.number_of_nodes())
    for l, layer in enumerate(self.layers):
        y = th.zeros(g.number_of_nodes(),
                     self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
        for start in tqdm.trange(0, len(nodes), batch_size):
            end = start + batch_size
            batch_nodes = nodes[start:end]
            block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
            input_nodes = block.srcdata[dgl.NID]
            h = x[input_nodes].to(device)
            h_dst = h[:block.number_of_dst_nodes()]
            h = layer(block, (h, h_dst))
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)
            y[start:end] = h.cpu()
        x = y
    return y
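# A minimal usage sketch for the layer-wise inference above (hypothetical
# names: `model` and `features`, which are not from the snippet): the graph
# and features stay on CPU and each batch is moved to `device` inside
# ``inference``.
model.eval()
with th.no_grad():
    pred = model.inference(g, features, batch_size=1024, device='cuda')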
def sample_frontier(self, block_id, g, seed_nodes):
    # Take the full in-neighborhood of the seed nodes, then drop edges that
    # happen after the current timestamp.
    g = dgl.in_subgraph(g, seed_nodes)
    g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
    frontier = g
    self.frontiers[block_id] = frontier
    return frontier
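# A minimal, self-contained sketch of the temporal-filtering idiom used above
# on a toy graph (all names here are illustrative, not from the snippets):
import dgl
import torch

g = dgl.graph(([0, 1, 2, 3], [1, 2, 0, 1]))
g.edata['timestamp'] = torch.tensor([1.0, 2.0, 3.0, 4.0])

sub = dgl.in_subgraph(g, [1])  # keeps only in-edges of node 1: 0->1 (t=1), 3->1 (t=4)
sub.remove_edges(torch.where(sub.edata['timestamp'] > 2.0)[0])
print(sub.edges())  # only 0->1 survives the timestamp filter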
def sampler_frontier(self, block_id, g, seed_nodes, timestamp):
    full_neighbor_subgraph = dgl.in_subgraph(g, seed_nodes)
    # Add self-loops on the seed nodes so that isolated seeds are kept.
    full_neighbor_subgraph = dgl.add_edges(full_neighbor_subgraph,
                                           seed_nodes, seed_nodes)
    # Keep edges that happen strictly before `timestamp`; the added self-loop
    # edges have no timestamp (<= 0) and are therefore preserved as well.
    temporal_edge_mask = (full_neighbor_subgraph.edata['timestamp'] < timestamp) + (
        full_neighbor_subgraph.edata['timestamp'] <= 0)
    temporal_subgraph = dgl.edge_subgraph(full_neighbor_subgraph, temporal_edge_mask)

    # Map the relabeled node IDs back to the original IDs.
    temp2origin = temporal_subgraph.ndata[dgl.NID]
    root2sub_dict = dict(zip(temp2origin.tolist(), temporal_subgraph.nodes().tolist()))
    temporal_subgraph.ndata[dgl.NID] = g.ndata[dgl.NID][temp2origin]
    seed_nodes = [root2sub_dict[int(n)] for n in seed_nodes]

    final_subgraph = self.sampler(g=temporal_subgraph, nodes=seed_nodes)
    # remove_self_loop returns a new graph, so the result must be assigned.
    final_subgraph = final_subgraph.remove_self_loop()
    return final_subgraph
def test_in_subgraph():
    g1 = dgl.graph([(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)],
                   'user', 'follow')
    g2 = dgl.bipartite([(0, 0), (0, 1), (1, 2), (3, 2)], 'user', 'play', 'game')
    g3 = dgl.bipartite([(2, 0), (2, 1), (2, 2), (1, 0), (1, 3), (0, 0)],
                       'game', 'liked-by', 'user')
    g4 = dgl.bipartite([(0, 0), (1, 0), (2, 0), (3, 0)], 'user', 'flips', 'coin')
    hg = dgl.hetero_from_relations([g1, g2, g3, g4])
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    u, v = subg['follow'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['follow'].edge_ids(u, v), subg['follow'].edata[dgl.EID])
    assert edge_set == {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}

    u, v = subg['play'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['play'].edge_ids(u, v), subg['play'].edata[dgl.EID])
    assert edge_set == {(0, 0)}

    u, v = subg['liked-by'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['liked-by'].edge_ids(u, v), subg['liked-by'].edata[dgl.EID])
    assert edge_set == {(2, 0), (2, 1), (1, 0), (0, 0)}

    assert subg['flips'].number_of_edges() == 0
def build_ground_truth_train(args):
    """Build an author ranking per field from the citation-weighted sum of the
    field's papers, to serve as the ground-truth training set."""
    data = OAGCSDataset() if args.use_original_id else OAGCoreDataset()
    g = data[0]
    g.nodes['paper'].data['citation'] = g.nodes['paper'].data['citation'].float()
    if args.log_citation:
        g.nodes['paper'].data['citation'] = g.nodes['paper'].data['citation'].log1p()
    g.edges['writes'].data['order'] = g.edges['writes'].data['order'].float()
    apg = g['author', 'writes', 'paper']

    # 1. Select the fields with at least `num_papers` papers.
    field_in_degree, fid = g.in_degrees(g.nodes('field'), etype='has_field').sort(descending=True)
    fid = fid[field_in_degree >= args.num_papers].tolist()

    # 2. For the papers of each field, build an author-paper subgraph and rank
    #    authors by the sum of their papers' citation counts.
    author_rank = {}
    for i in tqdm(fid):
        pid, _ = g.in_edges(i, etype='has_field')
        sg = add_reverse_edges(dgl.in_subgraph(apg, {'paper': pid}, relabel_nodes=True))
        author_citation = calc_author_citation(sg)
        _, idx = author_citation.topk(args.num_authors)
        aid = sg.nodes['author'].data[dgl.NID][idx]
        author_rank[i] = aid.tolist()

    suffix = '_original' if args.use_original_id else ''
    with open(DATA_DIR / f'rank/author_rank_train{suffix}.json', 'w') as f:
        json.dump(author_rank, f)
    print('Results saved to', f.name)
def construct_blocks(self, seeds, user_item_pairs_to_remove):
    blocks = []
    users, items = user_item_pairs_to_remove
    # Sample one block per convolution layer; the graph is bidirectional, so
    # both edge directions have to be handled.
    for i in range(self.num_layers):
        sampled_graph = dgl.in_subgraph(self.graph, seeds)
        sampled_eids = sampled_graph.edges[('user', 'watched', 'item')].data[dgl.EID]
        sampled_eids_rev = sampled_graph.edges[('item', 'watchedby', 'user')].data[dgl.EID]

        # During training, remove the edges between the user-item pairs being
        # predicted, in both directions.
        _, _, edges_to_remove = sampled_graph.edge_ids(
            users, items, etype=('user', 'watched', 'item'), return_uv=True)
        _, _, edges_to_remove_rev = sampled_graph.edge_ids(
            items, users, etype=('item', 'watchedby', 'user'), return_uv=True)
        # sampled_with_edges_removed = dgl.remove_edges(
        #     sampled_graph,
        #     {('user', 'watched', 'item'): edges_to_remove, ('item', 'watchedby', 'user'): edges_to_remove_rev}
        # )
        sampled_with_edges_removed = dgl.remove_edges(
            sampled_graph, edges_to_remove, ('user', 'watched', 'item'))
        sampled_with_edges_removed = dgl.remove_edges(
            sampled_with_edges_removed, edges_to_remove_rev, ('item', 'watchedby', 'user'))
        sampled_eids = sampled_eids[
            sampled_with_edges_removed.edges[('user', 'watched', 'item')].data[dgl.EID]]
        sampled_eids_rev = sampled_eids_rev[
            sampled_with_edges_removed.edges[('item', 'watchedby', 'user')].data[dgl.EID]]

        # Create the block for this layer.
        block = dgl.to_block(sampled_with_edges_removed, seeds)
        blocks.insert(0, block)
        seeds = {
            'user': block.srcnodes['user'].data[dgl.NID],
            'item': block.srcnodes['item'].data[dgl.NID]
        }

        # Copy the ratings over to the block.
        block.edges[('user', 'watched', 'item')].data['rating'] = \
            self.graph.edges[('user', 'watched', 'item')].data['rating'][sampled_eids]
        block.edges[('item', 'watchedby', 'user')].data['rating'] = \
            self.graph.edges[('item', 'watchedby', 'user')].data['rating'][sampled_eids_rev]
    return blocks
def check_rpc_in_subgraph_shuffle(tmpdir, num_server):
    ip_config = open("rpc_ip_config.txt", "w")
    for _ in range(num_server):
        ip_config.write('{}\n'.format(get_local_usable_addr()))
    ip_config.close()

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server
    partition_graph(g, 'test_in_subgraph', num_parts, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_in_subgraph'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    nodes = [0, 10, 99, 66, 1024, 2008]
    time.sleep(3)
    sampled_graph = start_in_subgraph_client(0, tmpdir, num_server > 1, nodes)
    for p in pserver_list:
        p.join()

    orig_nid = F.zeros((g.number_of_nodes(),), dtype=F.int64, ctx=F.cpu())
    orig_eid = F.zeros((g.number_of_edges(),), dtype=F.int64, ctx=F.cpu())
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(tmpdir / 'test_in_subgraph.json', i)
        orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    src, dst = sampled_graph.edges()
    src = orig_nid[src]
    dst = orig_nid[dst]
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    assert np.all(F.asnumpy(g.has_edges_between(src, dst)))

    subg1 = dgl.in_subgraph(g, orig_nid[nodes])
    src1, dst1 = subg1.edges()
    assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
    assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
    eids = g.edge_ids(src, dst)
    eids1 = orig_eid[sampled_graph.edata[dgl.EID]]
    assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids))
def sample_blocks(self, seeds):
    blocks = []
    seeds = {self.category: th.tensor(seeds).long()}
    cur = seeds
    for fanout in self.fanouts:
        if fanout is None:
            frontier = dgl.in_subgraph(self.g, cur)
        else:
            frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
        block = dgl.to_block(frontier, cur)
        cur = {}
        for ntype in block.srctypes:
            cur[ntype] = block.srcnodes[ntype].data[dgl.NID]
        blocks.insert(0, block)
    return seeds, blocks
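# A minimal sketch of how a sampler like the one above is typically driven
# (hypothetical names: `sampler`, `train_nids`, which are not from the
# snippet): ``sample_blocks`` doubles as the collate function, so each batch
# arrives as ``(seeds, blocks)`` with the outermost block first.
from torch.utils.data import DataLoader

loader = DataLoader(train_nids, batch_size=1024, shuffle=True,
                    collate_fn=sampler.sample_blocks)
for seeds, blocks in loader:
    pass  # feed `blocks` through the model layer by layer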
def sample_frontier(self, block_id, g, seed_nodes):
    '''
    Delete the edges that happen after the current timestamp, then use a
    simple top-k edge sampling by timestamp.
    '''
    # Number of neighbors to sample for this GNN layer (None means full neighborhood).
    fanout = self.fanouts[block_id]
    g = dgl.in_subgraph(g, seed_nodes)
    # Delete the edges that happen after the current timestamp.
    g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
    if fanout is None:
        # Full neighborhood sampling.
        frontier = g
    else:
        # Keep the `fanout` most recent edges per seed node.
        frontier = dgl.sampling.select_topk(g, fanout, 'timestamp', seed_nodes)
    self.frontiers[block_id] = frontier  # Save the frontier.
    return frontier
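# A toy, self-contained sketch (illustrative names only) of what
# ``dgl.sampling.select_topk`` does above: for each seed node, keep the `k`
# in-edges with the largest 'timestamp'.
import dgl
import torch

g = dgl.graph(([0, 1, 2, 3], [3, 3, 3, 0]))
g.edata['timestamp'] = torch.tensor([5.0, 1.0, 9.0, 2.0])
frontier = dgl.sampling.select_topk(g, 2, 'timestamp', [3])
print(frontier.edges())  # the two most recent in-edges of node 3: 2->3 and 0->3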
def sample_blocks(self, seeds):
    seeds = th.LongTensor(np.asarray(seeds))
    blocks = []
    for fanout in self.fanouts:
        # For each seed node, sample ``fanout`` neighbors (0 means full neighborhood).
        if fanout == 0:
            frontier = dgl.in_subgraph(self.g, seeds)
        else:
            frontier = dgl.dataloading.sample_neighbors(self.g, seeds, fanout, replace=True)
        # Then we compact the frontier into a bipartite graph for message passing.
        block = dgl.to_block(frontier, seeds)
        # Obtain the seed nodes for the next layer.
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return blocks
def sample_blocks(self, seeds):
    seeds = th.LongTensor(seeds)
    blocks = []
    hist_blocks = []
    for fanout in self.fanouts:
        # For each seed node, sample ``fanout`` neighbors; the history
        # frontier keeps the full in-neighborhood.
        frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout)
        hist_frontier = dgl.in_subgraph(self.g, seeds)
        # Then we compact the frontiers into bipartite graphs for message passing.
        block = dgl.to_block(frontier, seeds)
        hist_block = dgl.to_block(hist_frontier, seeds)
        # Obtain the seed nodes for the next layer.
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
        hist_blocks.insert(0, hist_block)
    return blocks, hist_blocks
def sample_blocks(self, seeds):
    blocks = []
    seeds = th.tensor(seeds).long()
    cur = self.target_idx[seeds]
    for fanout in self.fanouts:
        if fanout is None or fanout == -1:
            frontier = dgl.in_subgraph(self.g, cur)
        else:
            frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
        block = dgl.to_block(frontier, cur)
        gen_norm(block)
        cur = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return seeds, blocks
def sample_frontier(self, block_id, g, seed_nodes):
    fanout = self.fanouts[block_id]
    g = dgl.in_subgraph(g, seed_nodes)
    # Drop edges that happen after the current timestamp.
    g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
    if fanout is None:
        frontier = g
    else:
        if self.args.uniform:
            frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
        else:
            frontier = dgl.sampling.select_topk(g, fanout, 'timestamp', seed_nodes)
    self.frontiers[block_id] = frontier
    return frontier
def sample_block(self, seeds):
    blocks = []
    for fanout in self.fanouts:
        # For each seed node, sample ``fanout`` neighbors (None means full neighborhood).
        if fanout is None:
            frontier = dgl.in_subgraph(self.g, seeds)
        else:
            frontier = dgl.sampling.sample_neighbors(self.g, seeds, fanout, replace=False)
        # Then we compact the frontier into a bipartite graph for message passing.
        block = dgl.to_block(frontier, seeds)
        # Obtain the seed nodes for the next layer.
        seeds = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return blocks, blocks[0].srcdata[dgl.NID]
def rank(ctx, query, k=100):
    """Compute an author ranking on the oag-cs dataset for the given query term.

    :param ctx: Context object
    :param query: str, query term
    :param k: int, optional, number of top authors to return (default 100)
    :return: List[float], List[int], author scores and ids, sorted by score
        in descending order
    """
    if query in ctx.field2id:
        pid, _ = ctx.g.in_edges(ctx.field2id[query], etype='has_field')
    else:
        _, pid = recall.recall(ctx.recall_ctx, query, 200)
    sg = add_reverse_edges(dgl.in_subgraph(ctx.apg, {'paper': pid}, relabel_nodes=True))
    author_citation = calc_author_citation(sg)
    citation, idx = author_citation.topk(k)
    aid = sg.nodes['author'].data[dgl.NID][idx]
    return citation.tolist(), aid.tolist()
def check_rpc_in_subgraph(tmpdir, num_server):
    ip_config = open("rpc_ip_config.txt", "w")
    for _ in range(num_server):
        ip_config.write('{} 1\n'.format(get_local_usable_addr()))
    ip_config.close()

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server
    partition_graph(g, 'test_in_subgraph', num_parts, tmpdir,
                    num_hops=1, part_method='metis', reshuffle=False)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_in_subgraph'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    nodes = [0, 10, 99, 66, 1024, 2008]
    time.sleep(3)
    sampled_graph = start_in_subgraph_client(0, tmpdir, num_server > 1, nodes)
    for p in pserver_list:
        p.join()

    src, dst = sampled_graph.edges()
    g = dgl.as_heterograph(g)
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    subg1 = dgl.in_subgraph(g, nodes)
    src1, dst1 = subg1.edges()
    assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
    assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
    eids = g.edge_ids(src, dst)
    assert np.array_equal(F.asnumpy(sampled_graph.edata[dgl.EID]), F.asnumpy(eids))
def inference(self, g, x, batch_size, device):
    # Layer-wise inference on full neighborhoods: compute the representations
    # of all nodes one layer at a time, splitting the nodes into batches.
    nodes = torch.arange(g.number_of_nodes())
    for l, layer in enumerate(self.layers):
        y = torch.zeros(g.number_of_nodes(),
                        self.n_hidden if l != len(self.layers) - 1 else self.n_classes)
        for start in tqdm.trange(0, len(nodes), batch_size):
            end = start + batch_size
            batch_nodes = nodes[start:end]
            block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
            input_nodes = block.srcdata[dgl.NID]
            h = x[input_nodes].to(device)
            h_dst = h[:block.number_of_dst_nodes()]
            h = layer(block, (h, h_dst))
            if l != len(self.layers) - 1:
                h = self.activation(h)
                h = self.dropout(h)
            y[start:end] = h.cpu()
        x = y
    return y
def sample_blocks(self, seeds):
    blocks = []
    seeds = th.tensor(seeds).long()
    cur = self.target_idx[seeds]
    for fanout in self.fanouts:
        if fanout is None or fanout == -1:
            frontier = dgl.in_subgraph(self.g, cur)
        else:
            frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
        etypes = self.g.edata[dgl.ETYPE][frontier.edata[dgl.EID]]
        block = dgl.to_block(frontier, cur)
        block.srcdata[dgl.NTYPE] = self.g.ndata[dgl.NTYPE][block.srcdata[dgl.NID]]
        block.srcdata['type_id'] = self.g.ndata[dgl.NID][block.srcdata[dgl.NID]]
        block.edata['etype'] = etypes
        cur = block.srcdata[dgl.NID]
        blocks.insert(0, block)
    return seeds, blocks
def sample_blocks(g, uniq_uids, uniq_iids, fanouts, steps):
    seeds = {
        'user': th.LongTensor(uniq_uids),
        'item': th.LongTensor(uniq_iids)
    }
    blocks = []
    for fanout in fanouts:
        if fanout <= 0:
            frontier = dgl.in_subgraph(g, seeds)
        else:
            frontier = dgl.sampling.sample_neighbors(g, seeds, fanout,
                                                     copy_ndata=False, copy_edata=True)
        block = dgl.to_block(frontier, seeds)
        seeds = {
            ntype: block.srcnodes[ntype].data[dgl.NID]
            for ntype in block.srctypes
        }
        blocks.insert(0, block)
    return blocks, seeds
def test_in_subgraph(idtype):
    hg = dgl.heterograph({
        ('user', 'follow', 'user'): ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2]),
        ('user', 'play', 'game'): ([0, 0, 1, 3], [0, 1, 2, 2]),
        ('game', 'liked-by', 'user'): ([2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
        ('user', 'flips', 'coin'): ([0, 1, 2, 3], [0, 0, 0, 0])
    }, idtype=idtype)
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    u, v = subg['follow'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['follow'].edge_ids(u, v), subg['follow'].edata[dgl.EID])
    assert edge_set == {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}

    u, v = subg['play'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['play'].edge_ids(u, v), subg['play'].edata[dgl.EID])
    assert edge_set == {(0, 0)}

    u, v = subg['liked-by'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['liked-by'].edge_ids(u, v), subg['liked-by'].edata[dgl.EID])
    assert edge_set == {(2, 0), (2, 1), (1, 0), (0, 0)}

    assert subg['flips'].number_of_edges() == 0
def test_in_subgraph(idtype):
    hg = dgl.heterograph({
        ('user', 'follow', 'user'): ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2]),
        ('user', 'play', 'game'): ([0, 0, 1, 3], [0, 1, 2, 2]),
        ('game', 'liked-by', 'user'): ([2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
        ('user', 'flips', 'coin'): ([0, 1, 2, 3], [0, 0, 0, 0])
    }, idtype=idtype, num_nodes_dict={'user': 5, 'game': 10, 'coin': 8})
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    u, v = subg['follow'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['follow'].edge_ids(u, v), subg['follow'].edata[dgl.EID])
    assert edge_set == {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}

    u, v = subg['play'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['play'].edge_ids(u, v), subg['play'].edata[dgl.EID])
    assert edge_set == {(0, 0)}

    u, v = subg['liked-by'].edges()
    edge_set = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
    assert F.array_equal(hg['liked-by'].edge_ids(u, v), subg['liked-by'].edata[dgl.EID])
    assert edge_set == {(2, 0), (2, 1), (1, 0), (0, 0)}

    assert subg['flips'].number_of_edges() == 0
    for ntype in subg.ntypes:
        assert dgl.NID not in subg.nodes[ntype].data

    # Test store_ids
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0}, store_ids=False)
    for etype in ['follow', 'play', 'liked-by']:
        assert dgl.EID not in subg.edges[etype].data
    for ntype in subg.ntypes:
        assert dgl.NID not in subg.nodes[ntype].data

    # Test relabel nodes
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0}, relabel_nodes=True)
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    u, v = subg['follow'].edges()
    old_u = F.gather_row(subg.nodes['user'].data[dgl.NID], u)
    old_v = F.gather_row(subg.nodes['user'].data[dgl.NID], v)
    assert F.array_equal(hg['follow'].edge_ids(old_u, old_v), subg['follow'].edata[dgl.EID])
    edge_set = set(zip(list(F.asnumpy(old_u)), list(F.asnumpy(old_v))))
    assert edge_set == {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}

    u, v = subg['play'].edges()
    old_u = F.gather_row(subg.nodes['user'].data[dgl.NID], u)
    old_v = F.gather_row(subg.nodes['game'].data[dgl.NID], v)
    assert F.array_equal(hg['play'].edge_ids(old_u, old_v), subg['play'].edata[dgl.EID])
    edge_set = set(zip(list(F.asnumpy(old_u)), list(F.asnumpy(old_v))))
    assert edge_set == {(0, 0)}

    u, v = subg['liked-by'].edges()
    old_u = F.gather_row(subg.nodes['game'].data[dgl.NID], u)
    old_v = F.gather_row(subg.nodes['user'].data[dgl.NID], v)
    assert F.array_equal(hg['liked-by'].edge_ids(old_u, old_v), subg['liked-by'].edata[dgl.EID])
    edge_set = set(zip(list(F.asnumpy(old_u)), list(F.asnumpy(old_v))))
    assert edge_set == {(2, 0), (2, 1), (1, 0), (0, 0)}

    assert subg.num_nodes('user') == 4
    assert subg.num_nodes('game') == 3
    assert subg.num_nodes('coin') == 0
    assert subg.num_edges('flips') == 0
def collate_fn(seeds):
    # `g` and `ntype` are captured from the enclosing scope.
    batch_nodes = {ntype: th.LongTensor(np.asarray(seeds))}
    block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
    return seeds, block
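# A minimal sketch of how such a collate function is usually wired up
# (hypothetical setup; `g` and `ntype` are the closed-over names above):
dataloader = th.utils.data.DataLoader(
    np.arange(g.number_of_nodes(ntype)),
    batch_size=32,
    collate_fn=collate_fn,
    shuffle=False)
for seeds, block in dataloader:
    pass  # run one full-neighborhood message-passing step on the block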
def full(self, g, batch_size):
    nodes = torch.arange(g.number_of_nodes())
    x = g.ndata['features']
    x = self.dropout(x)
    for i in range(self.K):
        if i != self.K - 1:
            y = torch.zeros(g.number_of_nodes(), self.hid_feat)
        else:
            y = torch.zeros(g.number_of_nodes(), self.out_feat)
        for start in tqdm.trange(0, g.number_of_nodes(), batch_size):
            end = start + batch_size
            batch_nodes = nodes[start:end]
            block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
            input_nodes = block.srcdata[dgl.NID]
            h = x[input_nodes]
            if self.use_cuda:
                h = h.cuda()  # The next layer consumes the previous layer's y, which is on CPU by default.
            if self.aggregator == 'pool':
                if i == 0:
                    h = torch.matmul(h, self.weight_pool_in)
                    if self.bias:
                        h = h + self.bias_in
                else:
                    h = torch.matmul(h, self.weight_pool_hid)
                    if self.bias:
                        h = h + self.bias_hid
            if self.aggregator == 'gcn':
                if i == 0:
                    block.srcdata['h'] = torch.matmul(h, self.weight_gcn_in)
                else:
                    block.srcdata['h'] = torch.matmul(h, self.weight_gcn_hid)
            else:
                block.srcdata['h'] = h
            block.dstdata['h'] = h[:block.number_of_dst_nodes()]
            if self.aggregator == 'gcn':
                block.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'neigh'))
            elif self.aggregator == 'mean':
                block.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'neigh'))
            elif self.aggregator == 'lstm':
                block.update_all(fn.copy_src('h', 'm'),
                                 self.lstm_reducer_in if i == 0 else self.lstm_reducer_hid)
            else:
                block.update_all(fn.copy_src('h', 'm'), fn.max('m', 'neigh'))
            h_neigh = block.dstdata['neigh']
            if i == 0:
                h = torch.matmul(block.dstdata['h'], self.weight_in[0, :, :]) \
                    + (torch.matmul(h_neigh, self.weight_in[1, :, :]) if self.aggregator != 'gcn' else 0)
                if self.bias:
                    h = h + self.bias_in_k[0, :] + (self.bias_in_k[1, :] if self.aggregator != 'gcn' else 0)
            elif i == self.K - 1:
                h = torch.matmul(block.dstdata['h'], self.weight_out[0, :, :]) \
                    + (torch.matmul(h_neigh, self.weight_out[1, :, :]) if self.aggregator != 'gcn' else 0)
                if self.bias:
                    h = h + self.bias_out_k[0, :] + (self.bias_out_k[1, :] if self.aggregator != 'gcn' else 0)
            else:
                h = torch.matmul(block.dstdata['h'], self.weight_hid[i - 1, 0, :, :]) \
                    + (torch.matmul(h_neigh, self.weight_hid[i - 1, 1, :, :]) if self.aggregator != 'gcn' else 0)
                if self.bias:
                    h = h + self.bias_hid_k[0, :] + (self.bias_hid_k[1, :] if self.aggregator != 'gcn' else 0)
            if self.activation and i != self.K - 1:
                h = self.activation(h, inplace=False)
            if i != self.K - 1:
                h = self.dropout(h)
            if self.norm:
                norm = torch.norm(h, dim=1)
                norm = norm + (norm == 0).long()
                h = h / norm.unsqueeze(-1)
            y[start:end] = h
        x = y
        if self.use_cuda:
            x = x.cuda()
    g.ndata['z'] = x
    return g
def sample_blocks(self, seeds): """Sample subgraphs from the entire graph. The input ``seeds`` represents the edges to compute prediction for. The sampling algorithm works as follows: 1. Get the head and tail nodes of the provided seed edges. 2. For each head and tail node, extract the entire in-coming neighborhood. 3. Copy the node features/embeddings from the full graph to the sampled subgraphs. """ dataset = self.dataset enc_graph = self.enc_graph dec_graph = self.dec_graph edge_ids = th.stack(seeds) # generate frontiers for user and item possible_rating_values = dataset.possible_rating_values true_relation_ratings = self.truths[edge_ids] true_relation_labels = None if self.labels is None else self.labels[ edge_ids] # 1. Get the head and tail nodes from both the decoder and encoder graphs. head_id, tail_id = dec_graph.find_edges(edge_ids) utype, _, vtype = enc_graph.canonical_etypes[0] subg = [] true_rel_ratings = [] true_rel_labels = [] for possible_rating_value in possible_rating_values: idx_loc = (true_relation_ratings == possible_rating_value) head = head_id[idx_loc] tail = tail_id[idx_loc] true_rel_ratings.append(true_relation_ratings[idx_loc]) if self.labels is not None: true_rel_labels.append(true_relation_labels[idx_loc]) subg.append( dgl.bipartite((head, tail), utype=utype, etype=str(possible_rating_value), vtype=vtype, num_nodes=(enc_graph.number_of_nodes(utype), enc_graph.number_of_nodes(vtype)))) # Convert the encoder subgraph to a more compact one by removing nodes that covered # by the seed edges. g = dgl.hetero_from_relations(subg) g = dgl.compact_graphs(g) # 2. For each head and tail node, extract the entire in-coming neighborhood. seed_nodes = {} for ntype in g.ntypes: seed_nodes[ntype] = g.nodes[ntype].data[dgl.NID] frontier = dgl.in_subgraph(enc_graph, seed_nodes) frontier = dgl.to_block(frontier, seed_nodes) # 3. Copy the node features/embeddings from the full graph to the sampled subgraphs. frontier.dstnodes['user'].data['ci'] = \ enc_graph.nodes['user'].data['ci'][frontier.dstnodes['user'].data[dgl.NID]] frontier.srcnodes['movie'].data['cj'] = \ enc_graph.nodes['movie'].data['cj'][frontier.srcnodes['movie'].data[dgl.NID]] frontier.srcnodes['user'].data['cj'] = \ enc_graph.nodes['user'].data['cj'][frontier.srcnodes['user'].data[dgl.NID]] frontier.dstnodes['movie'].data['ci'] = \ enc_graph.nodes['movie'].data['ci'][frontier.dstnodes['movie'].data[dgl.NID]] # handle features head_feat = frontier.srcnodes['user'].data[dgl.NID].long() \ if dataset.user_feature is None else \ dataset.user_feature[frontier.srcnodes['user'].data[dgl.NID]] tail_feat = frontier.srcnodes['movie'].data[dgl.NID].long()\ if dataset.movie_feature is None else \ dataset.movie_feature[frontier.srcnodes['movie'].data[dgl.NID]] true_rel_labels = None if self.labels is None else th.cat( true_rel_labels, dim=0) true_rel_ratings = th.cat(true_rel_ratings, dim=0) return (g, frontier, head_feat, tail_feat, true_rel_labels, true_rel_ratings)
def sample_blocks(self, seeds):
    log = open('log.txt', 'w')
    import datetime
    print('Datetime:', datetime.datetime.now(), file=log)
    blocks = []
    seeds = {self.category: th.tensor(seeds).long()}
    cur = seeds
    print('Seed input', file=log)
    for ntype, nid in cur.items():
        print(ntype + ':', file=log)
        np.savetxt(log, [nid.numpy()], fmt='%ld')
    for fanout in self.fanouts:
        if fanout is None:
            frontier = dgl.in_subgraph(self.g, cur)
        else:
            frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
        print('Frontier edges', file=log)
        frontier_edges = {
            etype: frontier.all_edges(order='eid', etype=etype)
            for etype in frontier.canonical_etypes
        }
        for etype, (u, v) in frontier_edges.items():
            print(str(etype) + ':', file=log)
            np.savetxt(log, [u.numpy()], fmt='%ld')
            np.savetxt(log, [v.numpy()], fmt='%ld')
        block = dgl.to_block(frontier, cur)
        cur = {}
        for ntype in block.srctypes:
            cur[ntype] = block.srcnodes[ntype].data[dgl.NID]
        blocks.insert(0, block)
        print('Block edges', file=log)
        block_edges = {
            etype: block.all_edges(order='eid', etype=etype)
            for etype in block.canonical_etypes
        }
        for etype, (u, v) in block_edges.items():
            print(str(etype) + ':', file=log)
            np.savetxt(log, [u.numpy()], fmt='%ld')
            np.savetxt(log, [v.numpy()], fmt='%ld')
            np.savetxt(log, [block.edges[etype].data[dgl.EID].numpy()], fmt='%ld')
        print('Block src nodes', file=log)
        for ntype in block.srctypes:
            print(ntype + ':', file=log)
            np.savetxt(log, [block.srcnodes[ntype].data[dgl.NID].numpy()], fmt='%ld')
        print('Block dst nodes', file=log)
        for ntype in block.dsttypes:
            print(ntype + ':', file=log)
            np.savetxt(log, [block.dstnodes[ntype].data[dgl.NID].numpy()], fmt='%ld')
    log.close()
    return seeds, blocks
num_nodes = data.num_nodes()
num_edges = data.num_edges()
trainval_div = int(VALID_SPLIT * num_edges)

# Select new nodes from the test set and remove them from the entire graph.
test_split_ts = data.edata['timestamp'][trainval_div]
test_nodes = torch.cat([data.edges()[0][trainval_div:],
                        data.edges()[1][trainval_div:]]).unique().numpy()
test_new_nodes = np.random.choice(test_nodes, int(0.1 * len(test_nodes)), replace=False)

in_subg = dgl.in_subgraph(data, test_new_nodes)
out_subg = dgl.out_subgraph(data, test_new_nodes)
# Remove edges that happen before the test split, to prevent the model from
# learning the new nodes' connectivity.
new_node_in_eid_delete = in_subg.edata[dgl.EID][in_subg.edata['timestamp'] < test_split_ts]
new_node_out_eid_delete = out_subg.edata[dgl.EID][out_subg.edata['timestamp'] < test_split_ts]
new_node_eid_delete = torch.cat([new_node_in_eid_delete, new_node_out_eid_delete]).unique()

graph_new_node = copy.deepcopy(data)
# Relative edge order is preserved by remove_edges.
graph_new_node.remove_edges(new_node_eid_delete)

# Now for the no-new-node graph, all edge ids incident to the new nodes need to be removed.
in_eid_delete = in_subg.edata[dgl.EID]