예제 #1
0
    def forward(self, g, node_feats, edge_feats):
        """Aggregate neighbour messages and update node representations.

        Parameters
        ----------
        g : DGLGraph
            Graph to run message passing on. It is only modified inside a
            local scope, so the caller's graph data is left untouched.
        node_feats : torch.Tensor
            Input node features, stored as ``g.ndata['h']``.
        edge_feats : torch.Tensor
            Raw edge features; they go through ``self.edge_encoder`` before
            being added to the source node features.

        Returns
        -------
        torch.Tensor
            Output of ``self.mlp`` applied to ``node_feats`` plus the
            aggregated (and optionally normalised) messages.

        Raises
        ------
        NotImplementedError
            If ``self.aggr`` is neither ``'softmax'`` nor ``'power'``.
        """
        with g.local_scope():
            # Node and edge feature dimension need to match.
            g.ndata['h'] = node_feats
            g.edata['h'] = self.edge_encoder(edge_feats)
            g.apply_edges(fn.u_add_e('h', 'h', 'm'))

            if self.aggr == 'softmax':
                # Keep messages strictly positive before the softmax weighting.
                g.edata['m'] = F.relu(g.edata['m']) + self.eps
                g.edata['a'] = edge_softmax(g, g.edata['m'] * self.beta)
                g.update_all(
                    lambda edge: {'x': edge.data['m'] * edge.data['a']},
                    fn.sum('x', 'm'))

            elif self.aggr == 'power':
                # Clamp so that pow() stays finite for fractional exponents.
                minv, maxv = 1e-7, 1e1
                torch.clamp_(g.edata['m'], minv, maxv)
                g.update_all(
                    lambda edge: {'x': torch.pow(edge.data['m'], self.p)},
                    fn.mean('x', 'm'))
                torch.clamp_(g.ndata['m'], minv, maxv)
                # Power mean is (mean(x ** p)) ** (1 / p): the outer exponent
                # must be the reciprocal of p, not p itself.
                g.ndata['m'] = torch.pow(g.ndata['m'], 1 / self.p)

            else:
                raise NotImplementedError(
                    f'Aggregator {self.aggr} is not supported.')

            if self.msg_norm is not None:
                g.ndata['m'] = self.msg_norm(node_feats, g.ndata['m'])

            # Residual-style combination of the input and the aggregated message.
            feats = node_feats + g.ndata['m']

            return self.mlp(feats)
예제 #2
0
 def forward(self, graph, feat):
     """Run multi-head dot-product attention over incoming edges.

     Queries are projected from ``feat``; keys and values are projected
     from a detached copy of ``feat``, so no gradient reaches ``feat``
     through them. The attention output is added back to ``feat`` and,
     when ``self._trans`` is set, passed through layer norm and ``self.FFN``.
     """
     graph = graph.local_var()
     head_shape = (-1, self._num_heads, self._out_feats)

     # Keys/values see a copy of the input that is cut off from autograd.
     frozen = feat.clone().detach().requires_grad_(False)
     queries = self.q_proj(feat)
     keys = self.k_proj(frozen)
     values = self.v_proj(frozen)

     # Keys live on the source side ('el') and queries on the destination
     # side ('er') because edge_softmax normalises over incoming edges.
     graph.ndata['ft'] = values.view(*head_shape)
     graph.ndata['el'] = keys.view(*head_shape)
     graph.ndata['er'] = queries.view(*head_shape)

     # Scaled dot-product score per edge, then softmax over incoming edges.
     graph.apply_edges(fn.u_dot_v('el', 'er', 'e'))
     scale = math.sqrt(self._out_feats * self._num_heads)
     graph.edata['a'] = edge_softmax(graph, graph.edata.pop('e') / scale)

     # Attention-weighted sum of the neighbours' values.
     graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft2'))

     # Residual connection back to the input features.
     out = graph.ndata['ft2'].view(feat.shape) + feat
     if not self._trans:
         return out

     # The same layer norm is applied twice on purpose (the original code
     # notes that this follows the author's implementation).
     out = self.ln1(out)
     return self.ln1(out + self.FFN(out))
예제 #3
0
파일: hgt.py 프로젝트: zhengkangjie/OpenKS
    def forward(self, G, h):
        """Apply one heterogeneous attention layer.

        Parameters
        ----------
        G : DGLGraph
            Heterogeneous graph; only modified inside a local scope.
        h : dict
            Maps each node type name to that type's feature tensor.

        Returns
        -------
        dict
            Maps each node type name to its updated feature tensor.
        """
        with G.local_scope():
            node_dict, edge_dict = self.node_dict, self.edge_dict
            head_shape = (-1, self.n_heads, self.d_k)

            # Per relation: compute attention weights and stage the messages.
            for srctype, etype, dsttype in G.canonical_etypes:
                rel_graph = G[srctype, etype, dsttype]

                src_idx = node_dict[srctype]
                key_proj = self.k_linears[src_idx]
                val_proj = self.v_linears[src_idx]
                qry_proj = self.q_linears[node_dict[dsttype]]

                keys = key_proj(h[srctype]).view(*head_shape)
                vals = val_proj(h[srctype]).view(*head_shape)
                qrys = qry_proj(h[dsttype]).view(*head_shape)

                e_id = self.edge_dict[etype]
                rel_att = self.relation_att[e_id]
                rel_pri = self.relation_pri[e_id]
                rel_msg = self.relation_msg[e_id]

                # Relation-specific transforms of keys and values, per head.
                keys = torch.einsum("bij,ijk->bik", keys, rel_att)
                vals = torch.einsum("bij,ijk->bik", vals, rel_msg)

                rel_graph.srcdata['k'] = keys
                rel_graph.dstdata['q'] = qrys
                rel_graph.srcdata['v_%d' % e_id] = vals

                rel_graph.apply_edges(fn.v_dot_u('q', 'k', 't'))
                raw_score = rel_graph.edata.pop('t').sum(-1)
                scaled_score = raw_score * rel_pri / self.sqrt_dk
                weights = edge_softmax(rel_graph, scaled_score, norm_by='dst')

                rel_graph.edata['t'] = weights.unsqueeze(-1)

            # One (message, reduce) pair per relation; results that land on
            # the same destination type are averaged.
            funcs = {}
            for etype, e_id in edge_dict.items():
                funcs[etype] = (fn.u_mul_e('v_%d' % e_id, 't', 'm'),
                                fn.sum('m', 't'))
            G.multi_update_all(funcs, cross_reducer='mean')

            # Step 3: Target-specific Aggregation
            # x = norm( W[node_type] * gelu( Agg(x) ) + x )
            new_h = {}
            for ntype in G.ntypes:
                n_id = node_dict[ntype]
                gate = torch.sigmoid(self.skip[n_id])
                agg = G.nodes[ntype].data['t'].view(-1, self.out_dim)
                mixed = self.drop(self.a_linears[n_id](agg))
                # Learnable gate between the new message and the old feature.
                mixed = mixed * gate + h[ntype] * (1 - gate)
                new_h[ntype] = self.norms[n_id](mixed) if self.use_norm else mixed
            return new_h
예제 #4
0
    def forward(self, graph, feat, edge_weight):
        """Propagate L2-normalised features with softmax-normalised edge weights.

        Parameters
        ----------
        graph : DGLGraph
            Graph (or block) to propagate on; only modified in a local scope.
        feat : torch.Tensor or pair of torch.Tensor
            Node features. A pair, or a single tensor on a block, is split
            into source and destination features by ``expand_as_pair``.
        edge_weight : torch.Tensor
            Per-edge weights; scaled by ``self.beta`` before the softmax.

        Returns
        -------
        torch.Tensor
            ``(1 + self.eps) * feat_dst`` plus the aggregated neighbour
            features, one row per destination node.
        """
        with graph.local_scope():
            feat_src, feat_dst = expand_as_pair(feat, graph)

            graph.srcdata['h'] = feat_src
            graph.srcdata['norm_h'] = F.normalize(feat_src, p=2, dim=-1)
            if isinstance(feat, tuple) or graph.is_block:
                graph.dstdata['norm_h'] = F.normalize(feat_dst, p=2, dim=-1)

            e = self.beta * edge_weight

            # Edge weights are normalised over each source node's edges.
            graph.edata['p'] = edge_softmax(graph, e, norm_by='src')
            graph.update_all(fn.u_mul_e('norm_h', 'p', 'm'), fn.sum('m', 'h'))
            rst = graph.dstdata.pop('h')
            # The self term must use the destination-side features: `feat`
            # itself may be a tuple, or may have more rows than there are
            # destination nodes when `graph` is a block.
            rst = (1 + self.eps) * feat_dst + rst
            return rst
예제 #5
0
    def forward(self, g, node_feats, edge_feats):
        """Compute attention-weighted edge messages and update the nodes.

        Parameters
        ----------
        g : DGLGraph
            DGLGraph for a batch of graphs.
        node_feats : float32 tensor of shape (V, node_in_feats) or (V, n_head, node_in_feats)
            Input node features, V being the number of nodes in the batch.
        edge_feats : float32 tensor of shape (E, edge_in_feats)
            Input edge features, E being the number of edges in the batch.

        Returns
        -------
        float32 tensor of shape (V, node_out_feats) or (V, n_head, node_out_feats)
            Updated node features.
        """
        g = g.local_var()

        # Attention logits. Rather than concatenating source, destination
        # and edge features and applying one matrix, three separate
        # projections are summed.
        g.ndata['src'] = self.dropout(self.attn_src(node_feats))
        g.ndata['dst'] = self.dropout(self.attn_dst(node_feats))
        edge_logit = self.dropout(self.attn_edg(edge_feats)).unsqueeze(-2)
        g.apply_edges(fn.u_add_v('src', 'dst', 'e'))
        pair_logit = g.edata.pop('e')
        logits = self.attn_dot(self.act(pair_logit + edge_logit))
        weights = self.dropout(edge_softmax(g, logits))

        # Message content, built the same way with the message projections.
        g.ndata['src'] = self.msg_src(node_feats)
        g.ndata['dst'] = self.msg_dst(node_feats)
        g.apply_edges(fn.u_add_v('src', 'dst', 'e'))
        pair_msg = g.edata.pop('e')
        edge_msg = self.msg_edg(edge_feats).unsqueeze(-2)
        g.edata['msg'] = weights * self.act(pair_msg + edge_msg)

        # Sum weighted messages on each node and add the node's own projection.
        g.update_all(fn.copy_e('msg', 'm'), fn.sum('m', 'feat'))
        aggregated = g.ndata.pop('feat')
        return self.act(aggregated + self.wgt_n(node_feats))
예제 #6
0
    def forward(self, graph, feat):
        """Compute a graph-attention layer with optional degree normalisation.

        Parameters
        ----------
        graph : DGLGraph
            Graph (or block) to operate on; only modified in a local scope.
        feat : torch.Tensor or pair of torch.Tensor
            Node features. A pair supplies source and destination features
            separately.

        Returns
        -------
        torch.Tensor
            Output features, one ``(num_heads, out_feats)`` slice per
            destination node.

        Raises
        ------
        AssertionError
            If zero in-degree nodes are present and
            ``self._allow_zero_in_degree`` is false.
        """
        with graph.local_scope():
            if not self._allow_zero_in_degree:
                if (graph.in_degrees() == 0).any():
                    # Raised explicitly (same exception type as before) so the
                    # check survives `python -O` and carries a message.
                    raise AssertionError(
                        "Graph contains 0-in-degree nodes, which this layer "
                        "is configured not to allow.")

            head_shape = (-1, self._num_heads, self._out_feats)
            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0])
                h_dst = self.feat_drop(feat[1])
                # Pick the projections locally instead of assigning them on
                # `self`: setting module attributes inside forward() would
                # register extra submodules as a side effect of the call.
                if hasattr(self, "fc_src"):
                    fc_src, fc_dst = self.fc_src, self.fc_dst
                else:
                    fc_src = fc_dst = self.fc
                feat_src = fc_src(h_src).view(*head_shape)
                feat_dst = fc_dst(h_dst).view(*head_shape)
            else:
                h_src = h_dst = self.feat_drop(feat)
                feat_src = feat_dst = self.fc(h_src).view(*head_shape)
                if graph.is_block:
                    # In a block the destination nodes are the leading rows.
                    feat_dst = feat_src[:graph.number_of_dst_nodes()]

            if self._norm == "both":
                # Scale source features by out_degree ** -0.5.
                degs = graph.out_degrees().float().clamp(min=1)
                norm = torch.pow(degs, -0.5)
                shp = norm.shape + (1, ) * (feat_src.dim() - 1)
                norm = torch.reshape(norm, shp)
                feat_src = feat_src * norm

            # NOTE: GAT paper uses "first concatenation then linear projection"
            # to compute attention scores, while ours is "first projection then
            # addition", the two approaches are mathematically equivalent:
            # We decompose the weight vector a mentioned in the paper into
            # [a_l || a_r], then
            # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
            # This avoids storing [Wh_i || Wh_j] on every edge, and the
            # addition maps onto DGL's built-in u_add_v.
            el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
            graph.srcdata.update({"ft": feat_src, "el": el})
            graph.dstdata.update({"er": er})
            # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
            graph.apply_edges(fn.u_add_v("el", "er", "e"))
            e = self.leaky_relu(graph.edata.pop("e"))
            # compute softmax
            graph.edata["a"] = self.attn_drop(edge_softmax(graph, e))
            # message passing
            graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
            rst = graph.dstdata["ft"]

            if self._norm == "both":
                # Scale the aggregated output by in_degree ** 0.5.
                degs = graph.in_degrees().float().clamp(min=1)
                norm = torch.pow(degs, 0.5)
                shp = norm.shape + (1, ) * (feat_dst.dim() - 1)
                norm = torch.reshape(norm, shp)
                rst = rst * norm

            # residual
            if self.res_fc is not None:
                resval = self.res_fc(h_dst).view(h_dst.shape[0], -1,
                                                 self._out_feats)
                rst = rst + resval
            # activation
            if self._activation is not None:
                rst = self._activation(rst)
            return rst
예제 #7
0
파일: model_sampling.py 프로젝트: yuk12/dgl
    def forward(self, g, f_feat, b_feat, u_feat, v_feat):
        """Update edge and node representations on a 'u'/'v' heterograph block.

        Features are written straight onto ``g``'s node and edge frames (no
        local scope is used), so ``g`` keeps the intermediate fields after
        this call returns.

        Parameters
        ----------
        g : DGLGraph
            Graph with node types ``'u'`` and ``'v'`` and edge types
            ``'forward'`` and ``'backward'``.
        f_feat, b_feat : torch.Tensor
            Features of the ``'forward'`` and ``'backward'`` edges.
        u_feat, v_feat : torch.Tensor
            Features of the source ``'u'`` and ``'v'`` nodes; the
            destination nodes reuse their leading rows.

        Returns
        -------
        tuple
            ``(hf, hb, hu, hv)``: updated forward-edge, backward-edge,
            destination ``'u'`` node and destination ``'v'`` node features.
        """
        g.srcnodes['u'].data['h'] = u_feat
        g.srcnodes['v'].data['h'] = v_feat
        # Destination features are the first number_of_dst_nodes rows of the
        # corresponding source features.
        g.dstnodes['u'].data['h'] = u_feat[:g.number_of_dst_nodes(ntype='u')]
        g.dstnodes['v'].data['h'] = v_feat[:g.number_of_dst_nodes(ntype='v')]
        g.edges['forward'].data['h'] = f_feat
        g.edges['backward'].data['h'] = b_feat

        # formula 3 and 4 (optimized implementation to save memory):
        # project nodes and edges separately, then sum the projections per edge.
        g.srcnodes["u"].data.update(
            {'he_u': self.u_linear(g.srcnodes['u'].data['h'])})
        g.srcnodes["v"].data.update(
            {'he_v': self.v_linear(g.srcnodes['v'].data['h'])})
        g.dstnodes["u"].data.update(
            {'he_u': self.u_linear(g.dstnodes['u'].data['h'])})
        g.dstnodes["v"].data.update(
            {'he_v': self.v_linear(g.dstnodes['v'].data['h'])})
        g.edges["forward"].data.update({'he_e': self.e_linear(f_feat)})
        g.edges["backward"].data.update({'he_e': self.e_linear(b_feat)})
        # 'backward' edges read 'he_v' from their source and 'he_u' from
        # their destination; 'forward' edges do the opposite.
        g.apply_edges(
            lambda edges:
            {'he': edges.data['he_e'] + edges.dst['he_u'] + edges.src['he_v']},
            etype='backward')
        g.apply_edges(
            lambda edges:
            {'he': edges.data['he_e'] + edges.src['he_u'] + edges.dst['he_v']},
            etype='forward')
        hf = g.edges["forward"].data['he']
        hb = g.edges["backward"].data['he']
        if self.activation is not None:
            hf = self.activation(hf)
            hb = self.activation(hb)

        # formula 6: concatenate the source node feature with the edge feature
        g.apply_edges(lambda edges:
                      {'h_ve': th.cat([edges.src['h'], edges.data['h']], -1)},
                      etype='backward')
        g.apply_edges(lambda edges:
                      {'h_ue': th.cat([edges.src['h'], edges.data['h']], -1)},
                      etype='forward')

        # formula 7, self-attention
        g.srcnodes['u'].data['h_att_u'] = self.W_ATTN_u(
            g.srcnodes['u'].data['h'])
        g.srcnodes['v'].data['h_att_v'] = self.W_ATTN_v(
            g.srcnodes['v'].data['h'])
        g.dstnodes['u'].data['h_att_u'] = self.W_ATTN_u(
            g.dstnodes['u'].data['h'])
        g.dstnodes['v'].data['h_att_v'] = self.W_ATTN_v(
            g.dstnodes['v'].data['h'])

        # Step 1: dot product between the edge vector and the destination
        # node's attention projection
        g.apply_edges(fn.e_dot_v('h_ve', 'h_att_u', 'edotv'), etype='backward')
        g.apply_edges(fn.e_dot_v('h_ue', 'h_att_v', 'edotv'), etype='forward')

        # Step 2. softmax, computed separately on each edge-type subgraph
        g.edges['backward'].data['sfm'] = edge_softmax(
            g['backward'], g.edges['backward'].data['edotv'])
        g.edges['forward'].data['sfm'] = edge_softmax(
            g['forward'], g.edges['forward'].data['edotv'])

        # Step 3. Broadcast softmax value to each edge, and then attention is done
        g.apply_edges(
            lambda edges: {'attn': edges.data['h_ve'] * edges.data['sfm']},
            etype='backward')
        g.apply_edges(
            lambda edges: {'attn': edges.data['h_ue'] * edges.data['sfm']},
            etype='forward')

        # Step 4. Aggregate attention to dst,user nodes, so formula 7 is done
        g.update_all(fn.copy_e('attn', 'm'),
                     fn.sum('m', 'agg_u'),
                     etype='backward')
        g.update_all(fn.copy_e('attn', 'm'),
                     fn.sum('m', 'agg_v'),
                     etype='forward')

        # formula 5
        h_nu = self.W_u(g.dstnodes['u'].data['agg_u'])
        h_nv = self.W_v(g.dstnodes['v'].data['agg_v'])
        if self.activation is not None:
            h_nu = self.activation(h_nu)
            h_nv = self.activation(h_nv)

        # Dropout
        hf = self.dropout(hf)
        hb = self.dropout(hb)
        h_nu = self.dropout(h_nu)
        h_nv = self.dropout(h_nv)

        # formula 8: concatenate the projected self feature with the
        # aggregated neighbourhood feature
        hu = th.cat([self.Vu(g.dstnodes['u'].data['h']), h_nu], -1)
        hv = th.cat([self.Vv(g.dstnodes['v'].data['h']), h_nv], -1)

        return hf, hb, hu, hv
예제 #8
0
    def forward(self, graph, feat):
        """Run the layer in 'acmgat' mode or in the plain aggregation mode.

        In 'acmgat' mode a low-pass channel (self + attended neighbours), a
        high-pass channel (self - attended neighbours) and an identity
        channel are computed and mixed with weights from ``self.attention``.
        Otherwise the projected features are reduced with
        ``self._elementwise_product``.

        Returns a tensor viewed as ``(-1, num_heads, out_feats)``, passed
        through ``self.activation`` when that is set.
        """
        def _head_score(per_head, attn_vec):
            # Per-head attention logit contribution, last axis kept as size 1.
            return (per_head * attn_vec).sum(dim=-1).unsqueeze(-1)

        with graph.local_scope():
            head_shape = (-1, self._num_heads, self._out_feats)
            feat = self.feat_drop(feat)

            if self.model_type == 'acmgat':
                feat_low = self.fc_self(feat)
                self_low = feat_low.view(*head_shape)
                feat_high = self.fc_self_high(feat)
                self_high = feat_high.view(*head_shape)

                el_low = _head_score(self_low, self.attn_l_low)
                er_low = _head_score(self_low, self.attn_r_low)
                el_high = _head_score(self_high, self.attn_l_high)
                er_high = _head_score(self_high, self.attn_r_high)

                graph.srcdata['ft_low'] = self_low
                graph.srcdata['el_low'] = el_low
                graph.srcdata['ft_high'] = self_high
                graph.srcdata['el_high'] = el_high
                graph.dstdata['er_low'] = er_low
                graph.dstdata['er_high'] = er_high

                # Edge logits for both channels, then softmax + dropout.
                graph.apply_edges(fn.u_add_v('el_low', 'er_low', 'e_low'))
                graph.apply_edges(fn.u_add_v('el_high', 'er_high', 'e_high'))
                logit_low = self.leaky_relu(graph.edata.pop('e_low'))
                logit_high = self.leaky_relu(graph.edata.pop('e_high'))
                graph.edata['a_low'] = self.attn_drop(
                    edge_softmax(graph, logit_low))
                graph.edata['a_high'] = self.attn_drop(
                    edge_softmax(graph, logit_high))

                # Message passing for each channel.
                graph.update_all(fn.u_mul_e('ft_high', 'a_high', 'm_high'),
                                 fn.sum('m_high', 'ft_high'))
                graph.update_all(fn.u_mul_e('ft_low', 'a_low', 'm_low'),
                                 fn.sum('m_low', 'ft_low'))

                flat_shape = feat_low.shape
                low = F.relu(self_low + graph.dstdata['ft_low']).view(flat_shape)
                high = F.relu(self_high - graph.dstdata['ft_high']).view(
                    flat_shape)
                identity = F.relu(self.fc_identity(feat))

                att_low, att_high, att_mlp = self.attention(low, high, identity)
                mixed = att_low * low + att_high * high + att_mlp * identity
                rst = mixed.view(*head_shape)
            else:
                graph.srcdata['h'] = self.fc_self(feat)
                graph.update_all(fn.copy_src('h', 'm_prod'),
                                 self._elementwise_product)
                rst = graph.dstdata['h_prod'].view(*head_shape)

            if self.activation:
                rst = self.activation(rst)
            return rst
예제 #9
0
    def forward(self, graph, feat, get_attention=False):
        """Compute a graph-attention layer whose attention is scaled by edge weights.

        Parameters
        ----------
        graph : DGLGraph
            Graph (or block) carrying a ``'weight'`` edge field; only
            modified inside a local scope.
        feat : torch.Tensor or pair of torch.Tensor
            Node features, or a (source, destination) pair of them.
        get_attention : bool, optional
            When true, the weighted attention values are returned as well.

        Returns
        -------
        torch.Tensor
            Output features per destination node.
        torch.Tensor, optional
            The weighted attention values, only when ``get_attention`` is set.

        Raises
        ------
        DGLError
            If zero in-degree nodes exist and the check is not disabled.
        """
        with graph.local_scope():
            if not self._allow_zero_in_degree and (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

            head_shape = (-1, self._num_heads, self._out_feats)
            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0])
                h_dst = self.feat_drop(feat[1])
                # Use dedicated source/destination projections when the
                # module has them; otherwise share the single projection.
                if hasattr(self, 'fc_src'):
                    proj_src, proj_dst = self.fc_src, self.fc_dst
                else:
                    proj_src = proj_dst = self.fc
                feat_src = proj_src(h_src).view(*head_shape)
                feat_dst = proj_dst(h_dst).view(*head_shape)
            else:
                h_src = h_dst = self.feat_drop(feat)
                feat_src = feat_dst = self.fc(h_src).view(*head_shape)
                if graph.is_block:
                    feat_dst = feat_src[:graph.number_of_dst_nodes()]

            # The attention vector is split into a left and a right half, so
            # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j and the concatenation
            # never has to be stored on the edges.
            el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
            graph.srcdata['ft'] = feat_src
            graph.srcdata['el'] = el
            graph.dstdata['er'] = er

            graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
            logits = self.leaky_relu(graph.edata.pop('e'))
            graph.edata['a'] = self.attn_drop(edge_softmax(graph, logits))

            # Move the edge axis last so the edge weights broadcast against
            # it, multiply, then restore the original axis order.
            edge_last = graph.edata['a'].permute(1, 2, 0)
            weighted = edge_last * graph.edata['weight']
            graph.edata['a'] = weighted.permute(2, 0, 1)

            graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
                             fn.sum('m', 'ft'))
            rst = graph.dstdata['ft']

            if self.res_fc is not None:
                residual = self.res_fc(h_dst).view(
                    h_dst.shape[0], -1, self._out_feats)
                rst = rst + residual

            if self.activation:
                rst = self.activation(rst)

            if get_attention:
                return rst, graph.edata['a']
            return rst
    def _forward(self, graph, feat, get_attention=False):
        r"""Compute a graph-attention layer whose weights are mixed from bases.

        Projection and attention parameters are softmax-weighted sums of
        ``self._basis`` and ``self._attn_basis``, with the mixing
        coefficients taken from ``self._basis_coef``.

        Parameters
        ----------
        graph : DGLGraph
            The graph; only modified inside a local scope.
        feat : torch.Tensor or pair of torch.Tensor
            A single tensor of shape :math:`(N, D_{in})`, or a pair of
            tensors of shape :math:`(N_{in}, D_{in_{src}})` and
            :math:`(N_{out}, D_{in_{dst}})`.
        get_attention : bool, optional
            Whether to return the attention values. Default to False.

        Returns
        -------
        torch.Tensor
            The output feature of shape :math:`(N, H, D_{out})` where
            :math:`H` is the number of heads.
        torch.Tensor, optional
            The attention values of shape :math:`(E, H, 1)`; returned only
            when :attr:`get_attention` is ``True``.

        Raises
        ------
        DGLError
            If there are 0-in-degree nodes in the input graph and
            ``self._allow_zero_in_degree`` is false.
        """
        with graph.local_scope():
            if not self._allow_zero_in_degree and (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

            head_shape = (-1, self._num_heads, self._out_feats)
            # Mixing coefficients, shaped to broadcast over the basis stack.
            basis_coef = softmax(self._basis_coef, dim=-1).reshape(-1, 1, 1)

            def _mix(basis):
                # Collapse a stack of bases into one parameter tensor.
                return (basis * basis_coef).sum(dim=0)

            # NOTE(review): with h of shape (N, D_in), `params @ h.T` puts
            # the node axis last, and the following view() reinterprets
            # memory rather than transposing. Confirm this really yields the
            # intended (N, H, D_out) layout; the shape of self._basis is not
            # visible here.
            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0])
                h_dst = self.feat_drop(feat[1])
                params_src = _mix(self._basis[0])
                params_dst = _mix(self._basis[1])
                feat_src = (params_src @ h_src.T).view(*head_shape)
                feat_dst = (params_dst @ h_dst.T).view(*head_shape)
            else:
                h_src = h_dst = self.feat_drop(feat)
                params = _mix(self._basis)
                feat_src = feat_dst = (params @ h_src.T).view(*head_shape)
                if graph.is_block:
                    feat_dst = feat_src[:graph.number_of_dst_nodes()]

            # The attention vector is split into a left and a right half, so
            # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j and the concatenation
            # never has to be stored on the edges.
            attn_l_param = _mix(self._attn_basis[0])
            attn_r_param = _mix(self._attn_basis[1])
            el = (feat_src * attn_l_param).sum(dim=-1).unsqueeze(-1)
            er = (feat_dst * attn_r_param).sum(dim=-1).unsqueeze(-1)
            graph.srcdata['ft'] = feat_src
            graph.srcdata['el'] = el
            graph.dstdata['er'] = er

            graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
            logits = self.leaky_relu(graph.edata.pop('e'))
            graph.edata['a'] = self.attn_drop(edge_softmax(graph, logits))

            graph.update_all(fn.u_mul_e('ft', 'a', 'm'),
                             fn.sum('m', 'ft'))
            rst = graph.dstdata['ft']

            if self.res_fc is not None:
                residual = self.res_fc(h_dst).view(
                    h_dst.shape[0], self._num_heads, self._out_feats)
                rst = rst + residual

            if self.bias is not None:
                rst = rst + self.bias.view(1, self._num_heads, self._out_feats)

            if self.activation:
                rst = self.activation(rst)

            if get_attention:
                return rst, graph.edata['a']
            return rst
예제 #11
0
 def forward(self, block):
     """Return softmax-normalised attention values for the edges of ``block``.

     ``self.edge_attention`` is applied to every edge first and is expected
     to write the ``'attn'`` edge field that the softmax then reads.
     """
     block.apply_edges(self.edge_attention)
     return edge_softmax(block, block.edata['attn'])