Exemplo n.º 1
0
    def sample_frontier(self, block_id, g, seed_nodes):
        """Build, cache and return the frontier graph for layer ``block_id``.

        The frontier starts as the in-edge subgraph of ``seed_nodes`` with
        every edge whose ``timestamp`` is greater than ``self.ts`` removed.
        When ``self.args.valid_path`` is set, edges with a negative
        ``timespan`` are dropped for every layer but the last one, and
        ``sample_time`` is recomputed by a max-reduction on the reversed
        graph.  Finally neighbors are sampled according to the layer fanout.

        The result is stored in ``self.frontiers[block_id]``.
        """
        # Number of neighbors to sample for this GNN layer; ``None`` keeps
        # the full (time-filtered) neighborhood.
        fanout = None if self.fanouts is None else self.fanouts[block_id]

        g = dgl.in_subgraph(g, seed_nodes)
        future_eids = torch.where(g.edata['timestamp'] > self.ts)[0]
        g.remove_edges(future_eids)

        if self.args.valid_path:
            if block_id != self.args.n_layer - 1:
                # Destination sample times come from the source side of the
                # frontier built for the next block.
                next_frontier = self.frontiers[block_id + 1]
                g.dstdata['sample_time'] = next_frontier.srcdata['sample_time']
                # presumably self.sample_prob writes edata['timespan'] -- verify
                g.apply_edges(self.sample_prob)
                invalid_eids = torch.where(g.edata['timespan'] < 0)[0]
                g.remove_edges(invalid_eids)
            # Reduce over out-edges by running the update on the reversed
            # graph, then flip the graph back to its original orientation.
            reversed_g = dgl.reverse(g, copy_edata=True, copy_ndata=True)
            reversed_g.update_all(self.sample_time, fn.max('st', 'sample_time'))
            g = dgl.reverse(reversed_g, copy_edata=True, copy_ndata=True)

        if fanout is None:
            frontier = g
        elif block_id == self.args.n_layer - 1 and self.args.bandit:
            # Only the last layer uses the edge weights stored under 'q_ij'.
            frontier = dgl.sampling.sample_neighbors(
                g, seed_nodes, fanout, prob='q_ij')
        else:
            frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)

        self.frontiers[block_id] = frontier
        return frontier
Exemplo n.º 2
0
    def inference(self, g, x, batch_size, device):
        """
        Layer-wise inference using the full in-neighborhood of every node
        (i.e. without neighbor sampling).

        g : the entire graph.
        x : the input features of the entire node set.
        batch_size : number of destination nodes processed per step.
        device : device each batch is moved to before it is fed to a layer.

        Returns the output of the last layer for all nodes; every batch
        result is moved to CPU before being stored.
        """
        # Running multi-layer blocks per batch would recompute the early
        # layers many times, so instead each layer is evaluated for all
        # nodes (in batches) before moving on to the next layer.
        # TODO: can we standardize this?
        total = g.number_of_nodes()
        nodes = th.arange(total)
        last = len(self.layers) - 1
        for l, layer in enumerate(self.layers):
            width = self.n_classes if l == last else self.n_hidden
            y = th.zeros(total, width)

            for start in tqdm.trange(0, len(nodes), batch_size):
                stop = start + batch_size
                batch_nodes = nodes[start:stop]
                frontier = dgl.in_subgraph(g, batch_nodes)
                block = dgl.to_block(frontier, batch_nodes)

                h = x[block.srcdata[dgl.NID]].to(device)
                # The leading rows of the source features are passed as the
                # destination features.
                h_dst = h[:block.number_of_dst_nodes()]
                h = layer(block, (h, h_dst))
                if l != last:
                    h = self.dropout(self.activation(h))

                y[start:stop] = h.cpu()

            # The output of this layer is the input of the next one.
            x = y
        return y
Exemplo n.º 3
0
 def sample_frontier(self, block_id, g, seed_nodes):
     """Return the in-edge subgraph of ``seed_nodes`` without future edges.

     Edges whose ``timestamp`` is greater than ``self.ts`` are removed and
     the result is cached in ``self.frontiers[block_id]``.
     """
     subgraph = dgl.in_subgraph(g, seed_nodes)
     future_eids = torch.where(subgraph.edata['timestamp'] > self.ts)[0]
     subgraph.remove_edges(future_eids)
     self.frontiers[block_id] = subgraph
     return subgraph
Exemplo n.º 4
0
    def sampler_frontier(self,
                         block_id,
                         g,
                         seed_nodes,
                         timestamp):
        """Sample a temporal frontier for ``seed_nodes``.

        Parameters
        ----------
        block_id
            Not used by this method.
        g
            Graph to sample from; it must carry ``edata['timestamp']`` and
            ``ndata[dgl.NID]``.
        seed_nodes
            IDs (in ``g``) of the nodes whose in-neighborhood is sampled.
        timestamp
            Only edges with a strictly smaller timestamp are kept.

        Returns
        -------
        The graph produced by ``self.sampler`` on the time-filtered
        subgraph, with self-loops removed.
        """
        full_neighbor_subgraph = dgl.in_subgraph(g, seed_nodes)
        # Temporary self-loops on the seeds (removed again before returning).
        full_neighbor_subgraph = dgl.add_edges(full_neighbor_subgraph,
                                               seed_nodes, seed_nodes)

        # Keep edges older than ``timestamp``; the ``<= 0`` clause keeps the
        # self-loops added above (presumably their timestamp is zero-filled
        # by dgl.add_edges -- confirm against the DGL version in use).
        edge_time = full_neighbor_subgraph.edata['timestamp']
        temporal_edge_mask = (edge_time < timestamp) | (edge_time <= 0)
        temporal_subgraph = dgl.edge_subgraph(
            full_neighbor_subgraph, temporal_edge_mask)

        # Node IDs of the parent subgraph, indexed by temporal-subgraph ID.
        temp2origin = temporal_subgraph.ndata[dgl.NID]

        # Translate the seed IDs into temporal-subgraph IDs.
        root2sub_dict = dict(
            zip(temp2origin.tolist(), temporal_subgraph.nodes().tolist()))
        temporal_subgraph.ndata[dgl.NID] = g.ndata[dgl.NID][temp2origin]
        seed_nodes = [root2sub_dict[int(n)] for n in seed_nodes]
        final_subgraph = self.sampler(g=temporal_subgraph, nodes=seed_nodes)
        # ``remove_self_loop`` returns a new graph rather than mutating in
        # place, so its result has to be the value handed back.
        return final_subgraph.remove_self_loop()
Exemplo n.º 5
0
def test_in_subgraph():
    """Check ``dgl.in_subgraph`` on a heterograph built from four relations."""
    follow = dgl.graph(
        [(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1), (0, 2)],
        'user', 'follow')
    play = dgl.bipartite(
        [(0, 0), (0, 1), (1, 2), (3, 2)], 'user', 'play', 'game')
    liked_by = dgl.bipartite(
        [(2, 0), (2, 1), (2, 2), (1, 0), (1, 3), (0, 0)],
        'game', 'liked-by', 'user')
    flips = dgl.bipartite(
        [(0, 0), (1, 0), (2, 0), (3, 0)], 'user', 'flips', 'coin')
    hg = dgl.hetero_from_relations([follow, play, liked_by, flips])

    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    # Expected in-edges of the seed nodes, per relation.
    expected_edges = (
        ('follow', {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}),
        ('play', {(0, 0)}),
        ('liked-by', {(2, 0), (2, 1), (1, 0), (0, 0)}),
    )
    for etype, expected in expected_edges:
        u, v = subg[etype].edges()
        found = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
        # The stored edge IDs must point back at the same edges in ``hg``.
        assert F.array_equal(hg[etype].edge_ids(u, v),
                             subg[etype].edata[dgl.EID])
        assert found == expected

    # No 'coin' node was seeded, so no 'flips' edge may be selected.
    assert subg['flips'].number_of_edges() == 0
Exemplo n.º 6
0
def build_ground_truth_train(args):
    """Build per-field author rankings used as the ground-truth training set.

    Authors are ranked from the weighted sum of the citation counts of the
    papers in a field; the resulting mapping of field id to the list of
    top author ids is written to a JSON file under ``DATA_DIR / 'rank'``.
    """
    dataset = OAGCSDataset() if args.use_original_id else OAGCoreDataset()
    g = dataset[0]

    citation = g.nodes['paper'].data['citation'].float()
    if args.log_citation:
        citation = citation.log1p()
    g.nodes['paper'].data['citation'] = citation
    g.edges['writes'].data['order'] = g.edges['writes'].data['order'].float()
    apg = g['author', 'writes', 'paper']

    # 1. Keep only the fields that have at least num_papers papers.
    paper_counts = g.in_degrees(g.nodes('field'), etype='has_field')
    field_in_degree, fid = paper_counts.sort(descending=True)
    fid = fid[field_in_degree >= args.num_papers].tolist()

    # 2. For the papers of each field, build the author-paper subgraph and
    #    rank the authors by their aggregated paper citations.
    author_rank = {}
    for i in tqdm(fid):
        pid, _ = g.in_edges(i, etype='has_field')
        field_subgraph = dgl.in_subgraph(
            apg, {'paper': pid}, relabel_nodes=True)
        sg = add_reverse_edges(field_subgraph)
        _, idx = calc_author_citation(sg).topk(args.num_authors)
        # Map the relabelled author nodes back to their IDs in ``apg``.
        author_rank[i] = sg.nodes['author'].data[dgl.NID][idx].tolist()

    suffix = '_original' if args.use_original_id else ''
    with open(DATA_DIR / f'rank/author_rank_train{suffix}.json', 'w') as f:
        json.dump(author_rank, f)
        print('结果已保存到', f.name)
Exemplo n.º 7
0
    def construct_blocks(self, seeds, user_item_pairs_to_remove):
        """Build one block per layer around ``seeds`` on the user/item graph.

        For each of the ``self.num_layers`` layers the full in-neighborhood
        of the current seeds is taken, the edges between the given
        (user, item) pairs are removed in both directions, and the result
        is converted to a block whose edges receive the ``rating`` feature
        of ``self.graph``.  Blocks are returned input layer first.
        """
        watched = ('user', 'watched', 'item')
        watched_by = ('item', 'watchedby', 'user')

        users, items = user_item_pairs_to_remove
        blocks = []
        # One hop of neighbors is collected per convolution layer; both
        # directions of the bidirectional graph are handled side by side.
        for _ in range(self.num_layers):
            sampled_graph = dgl.in_subgraph(self.graph, seeds)
            sampled_eids = sampled_graph.edges[watched].data[dgl.EID]
            sampled_eids_rev = sampled_graph.edges[watched_by].data[dgl.EID]

            # During training the links between the target users and items
            # must not be visible to the model.
            _, _, edges_to_remove = sampled_graph.edge_ids(
                users, items, etype=watched, return_uv=True)
            _, _, edges_to_remove_rev = sampled_graph.edge_ids(
                items, users, etype=watched_by, return_uv=True)

            # Remove the two directions one edge type at a time.
            pruned = dgl.remove_edges(sampled_graph, edges_to_remove, watched)
            pruned = dgl.remove_edges(pruned, edges_to_remove_rev, watched_by)

            # Compose the ID mappings so the surviving edges index into
            # ``self.graph``.
            # NOTE(review): relies on dgl.remove_edges storing dgl.EID on
            # its result -- confirm for the DGL version in use.
            sampled_eids = sampled_eids[pruned.edges[watched].data[dgl.EID]]
            sampled_eids_rev = sampled_eids_rev[
                pruned.edges[watched_by].data[dgl.EID]]

            # Create the block and make its source nodes the next seeds.
            block = dgl.to_block(pruned, seeds)
            blocks.insert(0, block)
            seeds = {
                'user': block.srcnodes['user'].data[dgl.NID],
                'item': block.srcnodes['item'].data[dgl.NID]
            }

            # Copy the ratings over from the full graph.
            full_edges = self.graph.edges
            block.edges[watched].data['rating'] = \
                full_edges[watched].data['rating'][sampled_eids]
            block.edges[watched_by].data['rating'] = \
                full_edges[watched_by].data['rating'][sampled_eids_rev]

        return blocks
Exemplo n.º 8
0
def check_rpc_in_subgraph_shuffle(tmpdir, num_server):
    """Check distributed in-subgraph extraction on a reshuffled partition.

    Partitions the cora graph into ``num_server`` parts with
    ``reshuffle=True``, starts one server process per part, runs the
    in-subgraph client, and compares the returned graph (mapped back to
    the original node/edge IDs) against ``dgl.in_subgraph`` on the
    unpartitioned graph.

    :param tmpdir: directory the partitions are written to and loaded from.
    :param num_server: number of server processes / graph partitions.
    """
    # The context manager closes the config file even if an address lookup
    # or a write fails part-way through.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{}\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server

    partition_graph(g,
                    'test_in_subgraph',
                    num_parts,
                    tmpdir,
                    num_hops=1,
                    part_method='metis',
                    reshuffle=True)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_in_subgraph'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    nodes = [0, 10, 99, 66, 1024, 2008]
    time.sleep(3)
    sampled_graph = start_in_subgraph_client(0, tmpdir, num_server > 1, nodes)
    for p in pserver_list:
        p.join()

    # Rebuild the shuffled-ID -> original-ID maps from every partition.
    orig_nid = F.zeros((g.number_of_nodes(), ), dtype=F.int64, ctx=F.cpu())
    orig_eid = F.zeros((g.number_of_edges(), ), dtype=F.int64, ctx=F.cpu())
    for i in range(num_server):
        part, _, _, _, _, _, _ = load_partition(
            tmpdir / 'test_in_subgraph.json', i)
        orig_nid[part.ndata[dgl.NID]] = part.ndata['orig_id']
        orig_eid[part.edata[dgl.EID]] = part.edata['orig_id']

    src, dst = sampled_graph.edges()
    src = orig_nid[src]
    dst = orig_nid[dst]
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    assert np.all(F.asnumpy(g.has_edges_between(src, dst)))

    # Reference result computed locally on the original graph.
    subg1 = dgl.in_subgraph(g, orig_nid[nodes])
    src1, dst1 = subg1.edges()
    assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
    assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
    eids = g.edge_ids(src, dst)
    eids1 = orig_eid[sampled_graph.edata[dgl.EID]]
    assert np.array_equal(F.asnumpy(eids1), F.asnumpy(eids))
Exemplo n.º 9
0
 def sample_blocks(self, seeds):
     """Sample one block per fanout for seeds of type ``self.category``.

     A fanout of ``None`` takes the full in-neighborhood of the current
     nodes.  Returns the seed dictionary together with the list of blocks,
     ordered input layer first.
     """
     seeds = {self.category: th.tensor(seeds).long()}
     blocks = []
     cur = seeds
     for fanout in self.fanouts:
         if fanout is not None:
             frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
         else:
             frontier = dgl.in_subgraph(self.g, cur)
         block = dgl.to_block(frontier, cur)
         # The source nodes of this block are the seeds of the next one.
         cur = {
             ntype: block.srcnodes[ntype].data[dgl.NID]
             for ntype in block.srctypes
         }
         blocks.insert(0, block)
     return seeds, blocks
Exemplo n.º 10
0
    def sample_frontier(self, block_id, g, seed_nodes):
        '''
        Drop the edges that happen after the current timestamp, then keep the
        top-k in-edges of every seed node ranked by timestamp.
        '''
        # Number of neighbors to keep for this GNN layer (None = keep all).
        fanout = self.fanouts[block_id]

        g = dgl.in_subgraph(g, seed_nodes)
        # Edges newer than the current timestamp must not be visible.
        future_eids = torch.where(g.edata['timestamp'] > self.ts)[0]
        g.remove_edges(future_eids)

        if fanout is not None:
            # Sample by recency: top-k edges by 'timestamp'.
            frontier = dgl.sampling.select_topk(
                g, fanout, 'timestamp', seed_nodes)
        else:
            # Full neighborhood.
            frontier = g

        # Cache the frontier for later use.
        self.frontiers[block_id] = frontier
        return frontier
Exemplo n.º 11
0
    def sample_blocks(self, seeds):
        """Sample one message-passing block per entry of ``self.fanouts``.

        A fanout of ``0`` selects the full in-neighborhood; any other value
        samples that many neighbors per seed with replacement.  Blocks are
        returned input layer first.
        """
        seeds = th.LongTensor(np.asarray(seeds))
        blocks = []
        for fanout in self.fanouts:
            if fanout != 0:
                # NOTE(review): DGL documents this function as
                # dgl.sampling.sample_neighbors -- confirm that the
                # dgl.dataloading namespace exposes it in the pinned version.
                frontier = dgl.dataloading.sample_neighbors(
                    self.g, seeds, fanout, replace=True)
            else:
                frontier = dgl.in_subgraph(self.g, seeds)
            # Compact the frontier into a bipartite block for message passing.
            block = dgl.to_block(frontier, seeds)
            blocks.insert(0, block)
            # The inputs of this block are the seeds of the next layer.
            seeds = block.srcdata[dgl.NID]
        return blocks
Exemplo n.º 12
0
    def sample_blocks(self, seeds):
        """Sample blocks plus matching full-neighborhood ("history") blocks.

        For every fanout two blocks are built over the same seeds: one from
        sampled neighbors and one from the complete in-neighborhood.  The
        next layer is seeded from the *sampled* block only.  Both lists are
        returned input layer first.
        """
        seeds = th.LongTensor(seeds)
        blocks, hist_blocks = [], []
        for fanout in self.fanouts:
            sampled = dgl.sampling.sample_neighbors(self.g, seeds, fanout)
            full = dgl.in_subgraph(self.g, seeds)
            # Compact both frontiers into bipartite blocks.
            block = dgl.to_block(sampled, seeds)
            hist_block = dgl.to_block(full, seeds)

            blocks.insert(0, block)
            hist_blocks.insert(0, hist_block)
            # Seeds for the next layer.
            seeds = block.srcdata[dgl.NID]
        return blocks, hist_blocks
Exemplo n.º 13
0
 def sample_blocks(self, seeds):
     """Sample one normalised block per entry of ``self.fanouts``.

     ``seeds`` index into ``self.target_idx`` to obtain the graph node IDs.
     A fanout of ``None`` or ``-1`` takes the full in-neighborhood.
     ``gen_norm`` is applied to every block.

     Returns the seed tensor and the blocks, ordered input layer first.
     """
     blocks = []
     seeds = th.tensor(seeds).long()
     cur = self.target_idx[seeds]
     for fanout in self.fanouts:
         if fanout is None or fanout == -1:
             frontier = dgl.in_subgraph(self.g, cur)
         else:
             frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
         block = dgl.to_block(frontier, cur)
         gen_norm(block)
         # Source nodes of this block seed the next layer.
         cur = block.srcdata[dgl.NID]
         blocks.insert(0, block)
     return seeds, blocks
Exemplo n.º 14
0
    def sample_frontier(self, block_id, g, seed_nodes):
        """Build, cache and return the frontier for layer ``block_id``.

        Edges with a ``timestamp`` greater than ``self.ts`` are removed from
        the in-edge subgraph of ``seed_nodes``.  With a fanout, neighbors are
        then sampled via ``sample_neighbors`` when ``self.args.uniform`` is
        set, or chosen as the top-k edges by ``timestamp`` otherwise.
        """
        fanout = self.fanouts[block_id]

        g = dgl.in_subgraph(g, seed_nodes)
        future_eids = torch.where(g.edata['timestamp'] > self.ts)[0]
        g.remove_edges(future_eids)

        if fanout is None:
            # Full (time-filtered) neighborhood.
            frontier = g
        elif self.args.uniform:
            frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
        else:
            frontier = dgl.sampling.select_topk(
                g, fanout, 'timestamp', seed_nodes)

        self.frontiers[block_id] = frontier
        return frontier
Exemplo n.º 15
0
    def sample_block(self, seeds):
        """Sample one block per fanout and report the input node IDs.

        A fanout of ``None`` takes the full in-neighborhood; otherwise that
        many neighbors are sampled without replacement.

        Returns ``(blocks, input_nodes)`` where ``blocks`` is ordered input
        layer first and ``input_nodes`` are the source node IDs of the first
        block.
        """
        blocks = []
        for fanout in self.fanouts:
            if fanout is not None:
                frontier = dgl.sampling.sample_neighbors(
                    self.g, seeds, fanout, replace=False)
            else:
                frontier = dgl.in_subgraph(self.g, seeds)
            # Compact the frontier into a bipartite block.
            block = dgl.to_block(frontier, seeds)
            blocks.insert(0, block)
            # Its source nodes become the seeds of the next layer.
            seeds = block.srcdata[dgl.NID]
        input_nodes = blocks[0].srcdata[dgl.NID]
        return blocks, input_nodes
Exemplo n.º 16
0
def rank(ctx, query, k=100):
    """Rank authors of the oag-cs dataset for the given query term.

    :param ctx: Context, the context object
    :param query: str, the query term
    :param k: int, optional, number of top authors to return (default 100)
    :return: List[float], List[int] author scores and ids, sorted by score
        in descending order
    """
    if query not in ctx.field2id:
        # Unknown field name: fall back to recalling papers for the query.
        _, pid = recall.recall(ctx.recall_ctx, query, 200)
    else:
        pid, _ = ctx.g.in_edges(ctx.field2id[query], etype='has_field')

    paper_subgraph = dgl.in_subgraph(
        ctx.apg, {'paper': pid}, relabel_nodes=True)
    sg = add_reverse_edges(paper_subgraph)
    citation, idx = calc_author_citation(sg).topk(k)
    # Translate the relabelled author nodes back to their original IDs.
    aid = sg.nodes['author'].data[dgl.NID][idx]
    return citation.tolist(), aid.tolist()
Exemplo n.º 17
0
def check_rpc_in_subgraph(tmpdir, num_server):
    """Check distributed in-subgraph extraction without ID reshuffling.

    Partitions the cora graph into ``num_server`` parts with
    ``reshuffle=False``, starts one server process per part, runs the
    in-subgraph client and compares its result against
    ``dgl.in_subgraph`` on the unpartitioned graph.

    :param tmpdir: directory the partitions are written to.
    :param num_server: number of server processes / graph partitions.
    """
    # The context manager closes the config file even if an address lookup
    # or a write fails part-way through.
    with open("rpc_ip_config.txt", "w") as ip_config:
        for _ in range(num_server):
            ip_config.write('{} 1\n'.format(get_local_usable_addr()))

    g = CitationGraphDataset("cora")[0]
    g.readonly()
    num_parts = num_server

    partition_graph(g,
                    'test_in_subgraph',
                    num_parts,
                    tmpdir,
                    num_hops=1,
                    part_method='metis',
                    reshuffle=False)

    pserver_list = []
    ctx = mp.get_context('spawn')
    for i in range(num_server):
        p = ctx.Process(target=start_server,
                        args=(i, tmpdir, num_server > 1, 'test_in_subgraph'))
        p.start()
        time.sleep(1)
        pserver_list.append(p)

    nodes = [0, 10, 99, 66, 1024, 2008]
    time.sleep(3)
    sampled_graph = start_in_subgraph_client(0, tmpdir, num_server > 1, nodes)
    for p in pserver_list:
        p.join()

    src, dst = sampled_graph.edges()
    g = dgl.as_heterograph(g)
    assert sampled_graph.number_of_nodes() == g.number_of_nodes()
    # Reference result computed locally on the original graph.
    subg1 = dgl.in_subgraph(g, nodes)
    src1, dst1 = subg1.edges()
    assert np.all(np.sort(F.asnumpy(src)) == np.sort(F.asnumpy(src1)))
    assert np.all(np.sort(F.asnumpy(dst)) == np.sort(F.asnumpy(dst1)))
    eids = g.edge_ids(src, dst)
    assert np.array_equal(F.asnumpy(sampled_graph.edata[dgl.EID]),
                          F.asnumpy(eids))
Exemplo n.º 18
0
 def inference(self, g, x, batch_size, device):
     """Layer-wise inference over all nodes with full in-neighborhoods.

     Each layer is evaluated for every node (in batches of ``batch_size``
     destination nodes, computed on ``device``) before the next layer
     starts.  Activation and dropout are skipped for the last layer.
     Returns the last layer's output; batch results are moved to CPU
     before being stored.
     """
     total = g.number_of_nodes()
     nodes = torch.arange(total)
     last = len(self.layers) - 1
     for l, layer in enumerate(self.layers):
         width = self.n_classes if l == last else self.n_hidden
         y = torch.zeros(total, width)
         for start in tqdm.trange(0, len(nodes), batch_size):
             stop = start + batch_size
             batch_nodes = nodes[start:stop]
             frontier = dgl.in_subgraph(g, batch_nodes)
             block = dgl.to_block(frontier, batch_nodes)
             h = x[block.srcdata[dgl.NID]].to(device)
             # The leading rows of the source features are passed as the
             # destination features.
             h_dst = h[:block.number_of_dst_nodes()]
             h = layer(block, (h, h_dst))
             if l != last:
                 h = self.dropout(self.activation(h))
             y[start:stop] = h.cpu()
         # This layer's output feeds the next layer.
         x = y
     return y
Exemplo n.º 19
0
 def sample_blocks(self, seeds):
     """Sample one block per fanout and annotate it with type information.

     ``seeds`` index into ``self.target_idx`` to obtain the graph node IDs.
     A fanout of ``None`` or ``-1`` takes the full in-neighborhood.  Each
     block receives ``srcdata[dgl.NTYPE]``, ``srcdata['type_id']`` and
     ``edata['etype']`` looked up from ``self.g``.

     Returns the seed tensor and the blocks, ordered input layer first.
     """
     blocks = []
     seeds = th.tensor(seeds).long()
     cur = self.target_idx[seeds]
     for fanout in self.fanouts:
         if fanout is None or fanout == -1:
             frontier = dgl.in_subgraph(self.g, cur)
         else:
             frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)
         # Edge types of the frontier's edges, looked up by parent edge ID.
         # NOTE(review): assumes to_block keeps the frontier's edge order
         # -- confirm.
         etypes = self.g.edata[dgl.ETYPE][frontier.edata[dgl.EID]]
         block = dgl.to_block(frontier, cur)
         block.srcdata[dgl.NTYPE] = self.g.ndata[dgl.NTYPE][block.srcdata[dgl.NID]]
         block.srcdata['type_id'] = self.g.ndata[dgl.NID][block.srcdata[dgl.NID]]
         block.edata['etype'] = etypes
         # Source nodes of this block seed the next layer.
         cur = block.srcdata[dgl.NID]
         blocks.insert(0, block)
     return seeds, blocks
Exemplo n.º 20
0
def sample_blocks(g, uniq_uids, uniq_iids, fanouts, steps):
    """Sample one block per fanout around the given users and items.

    A non-positive fanout takes the full in-neighborhood; otherwise that
    many neighbors are sampled, copying edge data but not node data.
    ``steps`` is accepted but not used.

    Returns ``(blocks, seeds)``: the blocks ordered input layer first and
    the per-type source node IDs of the outermost block.
    """
    seeds = {
        'user': th.LongTensor(uniq_uids),
        'item': th.LongTensor(uniq_iids)
    }
    blocks = []
    for fanout in fanouts:
        if fanout > 0:
            frontier = dgl.sampling.sample_neighbors(
                g, seeds, fanout, copy_ndata=False, copy_edata=True)
        else:
            frontier = dgl.in_subgraph(g, seeds)
        block = dgl.to_block(frontier, seeds)
        blocks.insert(0, block)
        # The source nodes of this block seed the next layer.
        seeds = {}
        for ntype in block.srctypes:
            seeds[ntype] = block.srcnodes[ntype].data[dgl.NID]
    return blocks, seeds
Exemplo n.º 21
0
def test_in_subgraph(idtype):
    """Check ``dgl.in_subgraph`` on a four-relation heterograph."""
    relations = {
        ('user', 'follow', 'user'): ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2]),
        ('user', 'play', 'game'): ([0, 0, 1, 3], [0, 1, 2, 2]),
        ('game', 'liked-by', 'user'): ([2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
        ('user', 'flips', 'coin'): ([0, 1, 2, 3], [0, 0, 0, 0])
    }
    hg = dgl.heterograph(relations, idtype=idtype)
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    # Expected in-edges of the seed nodes, per relation.
    expected_edges = (
        ('follow', {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}),
        ('play', {(0, 0)}),
        ('liked-by', {(2, 0), (2, 1), (1, 0), (0, 0)}),
    )
    for etype, expected in expected_edges:
        u, v = subg[etype].edges()
        found = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
        # The stored edge IDs must point back at the same edges in ``hg``.
        assert F.array_equal(hg[etype].edge_ids(u, v), subg[etype].edata[dgl.EID])
        assert found == expected

    # No 'coin' node was seeded, so no 'flips' edge may be selected.
    assert subg['flips'].number_of_edges() == 0
Exemplo n.º 22
0
def test_in_subgraph(idtype):
    """Check ``dgl.in_subgraph`` including ``store_ids`` and ``relabel_nodes``."""
    relations = {
        ('user', 'follow', 'user'):
        ([1, 2, 3, 0, 2, 3, 0], [0, 0, 0, 1, 1, 1, 2]),
        ('user', 'play', 'game'): ([0, 0, 1, 3], [0, 1, 2, 2]),
        ('game', 'liked-by', 'user'):
        ([2, 2, 2, 1, 1, 0], [0, 1, 2, 0, 3, 0]),
        ('user', 'flips', 'coin'): ([0, 1, 2, 3], [0, 0, 0, 0])
    }
    node_counts = {'user': 5, 'game': 10, 'coin': 8}
    hg = dgl.heterograph(relations, idtype=idtype, num_nodes_dict=node_counts)

    # (edge type, source node type, destination node type, expected edges)
    cases = (
        ('follow', 'user', 'user',
         {(1, 0), (2, 0), (3, 0), (0, 1), (2, 1), (3, 1)}),
        ('play', 'user', 'game', {(0, 0)}),
        ('liked-by', 'game', 'user', {(2, 0), (2, 1), (1, 0), (0, 0)}),
    )

    # Default call: node IDs are kept, edge IDs are stored.
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0})
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4
    for etype, _, _, expected in cases:
        u, v = subg[etype].edges()
        found = set(zip(list(F.asnumpy(u)), list(F.asnumpy(v))))
        assert F.array_equal(hg[etype].edge_ids(u, v),
                             subg[etype].edata[dgl.EID])
        assert found == expected
    assert subg['flips'].number_of_edges() == 0
    for ntype in subg.ntypes:
        assert dgl.NID not in subg.nodes[ntype].data

    # Test store_ids
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0}, store_ids=False)
    for etype in ['follow', 'play', 'liked-by']:
        assert dgl.EID not in subg.edges[etype].data
    for ntype in subg.ntypes:
        assert dgl.NID not in subg.nodes[ntype].data

    # Test relabel nodes
    subg = dgl.in_subgraph(hg, {'user': [0, 1], 'game': 0}, relabel_nodes=True)
    assert subg.idtype == idtype
    assert len(subg.ntypes) == 3
    assert len(subg.etypes) == 4

    for etype, utype, vtype, expected in cases:
        u, v = subg[etype].edges()
        # Translate relabelled endpoints back to their IDs in ``hg``.
        old_u = F.gather_row(subg.nodes[utype].data[dgl.NID], u)
        old_v = F.gather_row(subg.nodes[vtype].data[dgl.NID], v)
        assert F.array_equal(hg[etype].edge_ids(old_u, old_v),
                             subg[etype].edata[dgl.EID])
        found = set(zip(list(F.asnumpy(old_u)), list(F.asnumpy(old_v))))
        assert found == expected

    assert subg.num_nodes('user') == 4
    assert subg.num_nodes('game') == 3
    assert subg.num_nodes('coin') == 0
    assert subg.num_edges('flips') == 0
 def collate_fn(seeds):
     """Turn a batch of seed IDs into a full-neighborhood block.

     ``ntype`` and ``g`` are taken from the enclosing scope.  Returns the
     seeds unchanged together with the block built from their in-edges.
     """
     batch_nodes = {ntype: th.LongTensor(np.asarray(seeds))}
     frontier = dgl.in_subgraph(g, batch_nodes)
     block = dgl.to_block(frontier, batch_nodes)
     return seeds, block
Exemplo n.º 24
0
 def full(self, g, batch_size):
     """Compute embeddings for all nodes layer by layer with full neighborhoods.

     Starts from ``g.ndata['features']`` (after dropout) and evaluates each
     of the ``self.K`` layers for every node, ``batch_size`` destination
     nodes at a time, using the complete in-neighborhood of each batch.
     The final embeddings are stored in ``g.ndata['z']`` and ``g`` is
     returned.
     """
     nodes = torch.arange(g.number_of_nodes())
     x = g.ndata['features']
     x = self.dropout(x)
     for i in range(self.K):
         # Output buffer for this layer; only the last layer uses out_feat.
         if i != self.K - 1:
             y = torch.zeros(g.number_of_nodes(), self.hid_feat)
         else:
             y = torch.zeros(g.number_of_nodes(), self.out_feat)
         for start in tqdm.trange(0, g.number_of_nodes(), batch_size):
             end = start + batch_size
             batch_nodes = nodes[start:end]
             # Block over the full in-neighborhood of this batch.
             block = dgl.to_block(dgl.in_subgraph(g, batch_nodes), batch_nodes)
             input_nodes = block.srcdata[dgl.NID]
             h = x[input_nodes]
             if self.use_cuda:
                 h = h.cuda()              # the next layer consumes the previous layer's y, which is created by torch.zeros without a device argument
             # Pool aggregator: linear transform (plus optional bias) of the
             # input features before aggregation.
             if self.aggregator == 'pool':
                 if i == 0:
                     h = torch.matmul(h, self.weight_pool_in)
                     if self.bias:
                         h = h + self.bias_in
                 else:
                     h = torch.matmul(h, self.weight_pool_hid)
                     if self.bias:
                         h = h + self.bias_hid
             # GCN aggregator: source features are transformed before being
             # aggregated; every other aggregator sends h as-is.
             if self.aggregator == 'gcn':
                 if i == 0:
                     block.srcdata['h'] = torch.matmul(h, self.weight_gcn_in)
                 else:
                     block.srcdata['h'] = torch.matmul(h, self.weight_gcn_hid)
             else:
                 block.srcdata['h'] = h
             
             # The leading rows of h are used as the destination features.
             block.dstdata['h'] = h[:block.number_of_dst_nodes()]
             # Aggregate neighbor messages into dstdata['neigh']; any
             # aggregator other than gcn/mean/lstm falls back to max.
             if self.aggregator == 'gcn':
                 block.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'neigh'))
             elif self.aggregator == 'mean':
                 block.update_all(fn.copy_src('h', 'm'), fn.mean('m', 'neigh'))
             elif self.aggregator == 'lstm':
                 block.update_all(fn.copy_src('h', 'm'), self.lstm_reducer_in if i == 0 else self.lstm_reducer_hid)
             else:
                 block.update_all(fn.copy_src('h', 'm'), fn.max('m', 'neigh'))
             h_neigh = block.dstdata['neigh']
             # Combine self and neighbor terms with the weights of the
             # first / last / hidden layer.  For 'gcn' the neighbor term
             # (and its bias) is replaced by 0.
             if i == 0:
                 h = torch.matmul(block.dstdata['h'], self.weight_in[0, :, :]) \
                     + (torch.matmul(h_neigh, self.weight_in[1, :, :]) if self.aggregator != 'gcn' else 0)
                 if self.bias:
                     h = h + self.bias_in_k[0, :] + (self.bias_in_k[1, :] if self.aggregator != 'gcn' else 0)
             elif i == self.K - 1:
                 h = torch.matmul(block.dstdata['h'], self.weight_out[0, :, :]) \
                     + (torch.matmul(h_neigh, self.weight_out[1, :, :]) if self.aggregator != 'gcn' else 0)
                 if self.bias:
                     h = h + self.bias_out_k[0, :] + (self.bias_out_k[1, :] if self.aggregator != 'gcn' else 0)
             else:
                 # NOTE(review): weight_hid is indexed by layer (i - 1) but
                 # bias_hid_k is not -- confirm this is intended.
                 h = torch.matmul(block.dstdata['h'], self.weight_hid[i - 1, 0, :, :]) \
                     + (torch.matmul(h_neigh, self.weight_hid[i - 1, 1, :, :])  if self.aggregator != 'gcn' else 0)
                 if self.bias:
                     h = h + self.bias_hid_k[0, :] + (self.bias_hid_k[1, :] if self.aggregator != 'gcn' else 0)
             # Activation and dropout are skipped for the last layer.
             if self.activation and i != self.K - 1:
                 h = self.activation(h, inplace=False)
             if i != self.K - 1:
                 h = self.dropout(h)
             if self.norm:
                 # Row-wise L2 normalisation; zero norms are bumped to 1 to
                 # avoid dividing by zero.
                 norm = torch.norm(h, dim=1)
                 norm = norm + (norm == 0).long()
                 h = h / norm.unsqueeze(-1)
             y[start:end] = h
         # This layer's output is the next layer's input.
         x = y
     if self.use_cuda:
         x = x.cuda()
     g.ndata['z'] = x
     return g
Exemplo n.º 25
0
    def sample_blocks(self, seeds):
        """Build the per-rating seed graph and the encoder block for a batch.

        ``seeds`` is a sequence of edge-ID tensors; they are stacked into one
        tensor and looked up in the decoder graph. The procedure is:

          1. Find the head and tail nodes of the seed edges in the decoder graph.
          2. Group the seed edges by rating value, build one bipartite relation
             per rating, merge the relations into a heterograph and compact it.
          3. Take the in-subgraph of the encoder graph on the compacted graph's
             nodes and convert it to a block.
          4. Copy the 'ci'/'cj' node data from the encoder graph into the block
             and select the input features of the block's source nodes.

        Returns the tuple ``(g, frontier, head_feat, tail_feat,
        true_rel_labels, true_rel_ratings)``; ``true_rel_labels`` is ``None``
        when ``self.labels`` is ``None``.
        """
        dataset = self.dataset
        enc_graph = self.enc_graph
        dec_graph = self.dec_graph
        has_labels = self.labels is not None

        edge_ids = th.stack(seeds)
        rating_values = dataset.possible_rating_values
        batch_ratings = self.truths[edge_ids]
        batch_labels = self.labels[edge_ids] if has_labels else None

        # Step 1: endpoints of the seed edges, plus the node type names taken
        # from the first canonical edge type of the encoder graph.
        heads, tails = dec_graph.find_edges(edge_ids)
        utype, _, vtype = enc_graph.canonical_etypes[0]

        # Step 2: one bipartite relation per rating value. Ratings and labels
        # are collected in the same per-rating order as the relations.
        relations = []
        ratings_per_rel = []
        labels_per_rel = []
        for rating in rating_values:
            mask = batch_ratings == rating
            rel_heads = heads[mask]
            rel_tails = tails[mask]
            ratings_per_rel.append(batch_ratings[mask])
            if has_labels:
                labels_per_rel.append(batch_labels[mask])
            rel_graph = dgl.bipartite(
                (rel_heads, rel_tails),
                utype=utype,
                etype=str(rating),
                vtype=vtype,
                num_nodes=(enc_graph.number_of_nodes(utype),
                           enc_graph.number_of_nodes(vtype)))
            relations.append(rel_graph)

        # Compacting drops the nodes that no seed edge touches.
        g = dgl.compact_graphs(dgl.hetero_from_relations(relations))

        # Step 3: full in-coming neighborhood of every remaining node.
        seed_nodes = {ntype: g.nodes[ntype].data[dgl.NID] for ntype in g.ntypes}
        frontier = dgl.to_block(dgl.in_subgraph(enc_graph, seed_nodes),
                                seed_nodes)

        # Step 4: copy 'ci' onto destination nodes and 'cj' onto source nodes,
        # indexed by each block node's ID in the encoder graph.
        copy_plan = (
            (frontier.dstnodes, 'user', 'ci'),
            (frontier.srcnodes, 'movie', 'cj'),
            (frontier.srcnodes, 'user', 'cj'),
            (frontier.dstnodes, 'movie', 'ci'),
        )
        for node_view, ntype, key in copy_plan:
            block_data = node_view[ntype].data
            block_data[key] = enc_graph.nodes[ntype].data[key][block_data[dgl.NID]]

        # Input features: fall back to the node IDs themselves when the
        # dataset provides no feature matrix for that node type.
        src_user_ids = frontier.srcnodes['user'].data[dgl.NID]
        if dataset.user_feature is None:
            head_feat = src_user_ids.long()
        else:
            head_feat = dataset.user_feature[src_user_ids]

        src_movie_ids = frontier.srcnodes['movie'].data[dgl.NID]
        if dataset.movie_feature is None:
            tail_feat = src_movie_ids.long()
        else:
            tail_feat = dataset.movie_feature[src_movie_ids]

        if has_labels:
            true_rel_labels = th.cat(labels_per_rel, dim=0)
        else:
            true_rel_labels = None
        true_rel_ratings = th.cat(ratings_per_rel, dim=0)
        return (g, frontier, head_feat, tail_feat, true_rel_labels,
                true_rel_ratings)
Exemplo n.º 26
0
    def sample_blocks(self, seeds):
        """Sample one block per fanout and trace every step to ``log.txt``.

        Starting from ``seeds`` (node IDs of type ``self.category``), each
        entry of ``self.fanouts`` produces a frontier -- the full in-subgraph
        when the fanout is ``None``, otherwise a neighbor sample -- which is
        converted to a block. The block's source nodes become the seeds of the
        next iteration, and blocks are prepended so the returned list has the
        block built last at index 0.

        ``log.txt`` is opened in write mode on every call, so it only ever
        holds the trace of the most recent call.

        Returns ``(seeds, blocks)`` where ``seeds`` is the dict
        ``{self.category: <int64 tensor of the input seeds>}``.
        """
        import datetime

        def dump(tensor):
            # Each tensor is written as a single row of integers.
            np.savetxt(log, [tensor.numpy()], fmt='%ld')

        # The context manager guarantees the trace file is flushed and closed
        # even when sampling or block construction raises.
        with open('log.txt', 'w') as log:
            print('Datetime:', datetime.datetime.now(), file=log)

            blocks = []
            seeds = {self.category: th.tensor(seeds).long()}
            cur = seeds

            print('Seed input', file=log)
            for ntype, nid in cur.items():
                print(ntype + ':', file=log)
                dump(nid)

            for fanout in self.fanouts:
                if fanout is None:
                    frontier = dgl.in_subgraph(self.g, cur)
                else:
                    frontier = dgl.sampling.sample_neighbors(self.g, cur, fanout)

                print('Frontier edges', file=log)
                frontier_edges = {
                    etype: frontier.all_edges(order='eid', etype=etype)
                    for etype in frontier.canonical_etypes
                }
                for etype, (u, v) in frontier_edges.items():
                    print(str(etype) + ':', file=log)
                    dump(u)
                    dump(v)

                block = dgl.to_block(frontier, cur)
                # The next iteration is seeded with this block's source nodes.
                cur = {
                    ntype: block.srcnodes[ntype].data[dgl.NID]
                    for ntype in block.srctypes
                }
                blocks.insert(0, block)

                print('Block edges', file=log)
                block_edges = {
                    etype: block.all_edges(order='eid', etype=etype)
                    for etype in block.canonical_etypes
                }
                for etype, (u, v) in block_edges.items():
                    print(str(etype) + ':', file=log)
                    dump(u)
                    dump(v)
                    dump(block.edges[etype].data[dgl.EID])

                print('Block src nodes', file=log)
                for ntype in block.srctypes:
                    print(ntype + ':', file=log)
                    dump(block.srcnodes[ntype].data[dgl.NID])

                print('Block dst nodes', file=log)
                for ntype in block.dsttypes:
                    print(ntype + ':', file=log)
                    dump(block.dstnodes[ntype].data[dgl.NID])

        return seeds, blocks
Exemplo n.º 27
0
    num_nodes = data.num_nodes()
    num_edges = data.num_edges()

    num_edges = data.num_edges()
    trainval_div = int(VALID_SPLIT * num_edges)

    # Select new node from test set and remove them from entire graph
    test_split_ts = data.edata['timestamp'][trainval_div]
    test_nodes = torch.cat(
        [data.edges()[0][trainval_div:],
         data.edges()[1][trainval_div:]]).unique().numpy()
    test_new_nodes = np.random.choice(test_nodes,
                                      int(0.1 * len(test_nodes)),
                                      replace=False)

    in_subg = dgl.in_subgraph(data, test_new_nodes)
    out_subg = dgl.out_subgraph(data, test_new_nodes)
    # Remove edges that happened before the test set to prevent learning the connection info
    new_node_in_eid_delete = in_subg.edata[dgl.EID][
        in_subg.edata['timestamp'] < test_split_ts]
    new_node_out_eid_delete = out_subg.edata[dgl.EID][
        out_subg.edata['timestamp'] < test_split_ts]
    new_node_eid_delete = torch.cat(
        [new_node_in_eid_delete, new_node_out_eid_delete]).unique()

    graph_new_node = copy.deepcopy(data)
    # relative order preserved
    graph_new_node.remove_edges(new_node_eid_delete)

    # Now for no new node graph, all edge id need to be removed
    in_eid_delete = in_subg.edata[dgl.EID]