def forward(self, user_feat, item_feat, rating_feat):
    r_emb = rating_feat[self.rating]
    i_emb = item_feat[self.row_idxs]
    u_emb = user_feat[self.col_idxs]
    # original paper formula (2)
    x = t.cat([i_emb, r_emb], dim=1)
    x_ia = self.gv(x)
    weight = self.att(x_ia, u_emb).view(-1, 1)
    value = edge_softmax(self.vu_g, weight)
    self.vu_g.edata['h'] = x_ia * value
    self.vu_g.update_all(message_func=fn.copy_edge(edge='h', out='m'),
                         reduce_func=fn.sum(msg='m', out='n_f'))
    h = self.vu_g.ndata['n_f'][:self.userNum]
    if self.act is None:
        hi = self.w(h)
    else:
        hi = self.act(self.w(h))
    return hi
def forward(self, user_feat, item_feat, rating_feat):
    r_emb = rating_feat[self.rating]
    u_emb = user_feat[self.row_idxs]
    i_emb = item_feat[self.col_idxs]
    # original paper formula (15)
    x = t.cat([u_emb, r_emb], dim=1)
    f_jt = self.gu(x)
    # f_jt = F.relu(self.w_r1(t.cat([u_emb, r_emb], dim=1)))
    weight = self.att(f_jt, i_emb).view(-1, 1)
    value = edge_softmax(self.uv_g, weight)
    self.uv_g.edata['h'] = f_jt * value
    self.uv_g.update_all(message_func=fn.copy_edge(edge='h', out='m'),
                         reduce_func=fn.sum(msg='m', out='n_f'))
    z = self.uv_g.ndata['n_f'][self.userNum:]
    if self.act is None:
        z = self.w(z)
    else:
        z = self.act(self.w(z))
    return z
def forward(self, graph, node_feat, edge_feat):
    with graph.local_scope():
        h_src = h_dst = node_feat
        feat_src = feat_dst = self.fc(h_src).view(-1, self._edata_channels, self._out_feats)
        e_feat = self.edge_fc(edge_feat).view(-1, self._edata_channels, 1)
        graph.edata.update({'feat': e_feat})
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'feat': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = graph.edata.pop('e') * e_feat
        e = self.leaky_relu(e)
        # compute softmax
        graph.edata['a'] = edge_softmax(graph, e)

        # message passing
        def message_func(edges):
            feat_with_e = th.cat([edges.src['feat'], edges.data['feat']], 2)
            # apply an FC layer to project the node feature concatenated with E_p
            # back to out_feat_dim
            feat_with_e = self.nfeat_with_e_fc(feat_with_e)
            return {'m': edges.data['a'] * feat_with_e}

        graph.update_all(message_func, fn.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        rst = th.sigmoid(rst)
        return rst
def forward(self, g, feat):
    """
    :param g: DGLGraph, bipartite graph containing a single relation
    :param feat: tensor(N_src, d_in) or (tensor(N_src, d_in), tensor(N_dst, d_in)), input features
    :return: tensor(N_dst, K*d_out), representation of the destination nodes for this relation
    """
    with g.local_scope():
        feat_src, feat_dst = expand_as_pair(feat, g)
        feat_src = self.fc_src(self.feat_drop(feat_src)).view(-1, self.num_heads, self.out_dim)
        feat_dst = self.fc_dst(self.feat_drop(feat_dst)).view(-1, self.num_heads, self.out_dim)

        # a^T (z_u || z_v) = (a_l^T || a_r^T) (z_u || z_v) = a_l^T z_u + a_r^T z_v = el + er
        el = (feat_src * self.attn_src[:, :self.out_dim]).sum(dim=-1, keepdim=True)  # (N_src, K, 1)
        er = (feat_dst * self.attn_src[:, self.out_dim:]).sum(dim=-1, keepdim=True)  # (N_dst, K, 1)
        g.srcdata.update({'ft': feat_src, 'el': el})
        g.dstdata['er'] = er
        g.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(g.edata.pop('e'))
        g.edata['a'] = edge_softmax(g, e)  # (E, K, 1)

        # message passing
        g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        ret = g.dstdata['ft'].view(-1, self.num_heads * self.out_dim)
        if self.activation:
            ret = self.activation(ret)
        return ret
def calc_author_citation(g):
    """Compute author citation counts as a weighted sum of paper citation counts.

    :param g: DGLGraph, author-paper bipartite graph
    :return: tensor(N_author), author citation counts
    """
    import dgl.function as fn
    from dgl.ops import edge_softmax
    with g.local_scope():
        # The k-th author gets weight 1/k; the last author is treated as the
        # corresponding author and gets weight 1/2.
        g.edges['writes'].data['w'] = 1.0 / g.edges['writes'].data['order']
        g.update_all(fn.copy_e('w', 'w'), fn.min('w', 'mw'), etype='writes')
        g.apply_edges(fn.copy_u('mw', 'mw'), etype='writes_rev')
        w, mw = g.edges['writes'].data.pop('w'), g.edges['writes_rev'].data.pop('mw')
        w[w == mw] = 0.5

        # Normalize the weights of all authors within each paper, then compute each
        # author's weighted sum of paper citations.
        p = edge_softmax(g['author', 'writes', 'paper'], torch.log(w).unsqueeze(dim=1))
        g.edges['writes_rev'].data['p'] = p.squeeze(dim=1)
        g.update_all(fn.u_mul_e('citation', 'p', 'c'), fn.sum('c', 'c'), etype='writes_rev')
        return g.nodes['author'].data['c']
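# --- Illustrative usage sketch (not part of the original function) ---
# A minimal call of calc_author_citation on a toy graph, assuming the 'writes' and
# 'writes_rev' edges are stored in the same order (which the direct assignment of
# p onto 'writes_rev' relies on). The graph and numbers below are made up.
import dgl
import torch

# 3 authors, 2 papers: paper 0 by authors 0 (order 1) and 1 (order 2);
# paper 1 by authors 1 (order 1) and 2 (order 2).
src_author = torch.tensor([0, 1, 1, 2])
dst_paper = torch.tensor([0, 0, 1, 1])
toy_g = dgl.heterograph({
    ('author', 'writes', 'paper'): (src_author, dst_paper),
    ('paper', 'writes_rev', 'author'): (dst_paper, src_author),  # same edge order as 'writes'
})
toy_g.edges['writes'].data['order'] = torch.tensor([1.0, 2.0, 1.0, 2.0])
toy_g.nodes['paper'].data['citation'] = torch.tensor([10.0, 4.0])

# Per-paper weights become [2/3, 1/3] (first author 1, last author 1/2, normalized),
# so the expected output is roughly [6.67, 6.00, 1.33].
print(calc_author_citation(toy_g))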
def forward(self, g, feat_src, feat_dst):
    """
    :param g: DGLGraph, neighbor-to-target bipartite graph
    :param feat_src: tensor(N_src, d), input features of the neighbor nodes
    :param feat_dst: tensor(N_dst, d), input features of the target nodes
    :return: tensor(N_dst, d), output features of the target nodes
    """
    with g.local_scope():
        # The HeCo authors' code applies attn_drop differently from the original GAT;
        # this is arguably incorrect, yet it improves node-clustering performance...
        attn_l = self.attn_drop(self.attn_l)
        attn_r = self.attn_drop(self.attn_r)
        el = (feat_src * attn_l).sum(dim=-1).unsqueeze(dim=-1)  # (N_src, 1)
        er = (feat_dst * attn_r).sum(dim=-1).unsqueeze(dim=-1)  # (N_dst, 1)
        g.srcdata.update({'ft': feat_src, 'el': el})
        g.dstdata['er'] = er
        g.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(g.edata.pop('e'))
        g.edata['a'] = edge_softmax(g, e)  # (E, 1)

        # message passing
        g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        ret = g.dstdata['ft']
        if self.activation:
            ret = self.activation(ret)
        return ret
def forward(self, graph, memory, ts):
    graph = graph.local_var()  # use a local scope for the graph
    if not self._allow_zero_in_degree:
        if (graph.in_degrees() == 0).any():
            raise DGLError('There are 0-in-degree nodes in the graph, '
                           'output for those nodes will be invalid. '
                           'This is harmful for some applications, '
                           'causing silent performance regression. '
                           'Adding self-loop on the input graph by '
                           'calling `g = dgl.add_self_loop(g)` will resolve '
                           'the issue. Setting ``allow_zero_in_degree`` '
                           'to be `True` when constructing this module will '
                           'suppress the check and let the code run.')

    # print("Shape: ", memory.shape, ts.shape)
    graph.srcdata.update({'s': memory, 'timestamp': ts})
    graph.dstdata.update({'s': memory, 'timestamp': ts})

    # Dot product to compute the attention weight
    graph.apply_edges(self.weight_fn)

    # Edge softmax
    graph.edata['sa'] = edge_softmax(graph, graph.edata['a']) / (self._out_feats ** 0.5)

    # Update dst nodes; here msg_fn includes the edge feature
    graph.update_all(self.msg_fn, fn.sum('attn', 'agg_u'))
    rst = graph.dstdata['agg_u']

    # Implement skip connection
    rst = self.merge(rst.view(-1, self._num_heads * self._out_feats), graph.dstdata['s'])
    return rst
def forward(self, graph, feat):
    '''
    :param graph: DGLGraph
    :param feat: <N, b, F>
    :return:
    '''
    with graph.local_scope():
        N, b, _ = feat.size()
        graph = graph.local_var()
        graph = graph.to(feat.device)
        feat = torch.cat([self.fc1(feat[:get_Parameter('taxi_size')]),
                          self.fc2(feat[get_Parameter('taxi_size'):])], dim=0)
        feat_src = feat_dst = feat.view(N, b, self._num_heads, self._out_feats)
        # feat_src = feat_dst = self.fc(feat).view(N, b, self._num_heads, self._out_feats)
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_l).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        # graph.apply_edges(fn.u_mul_e('e', 'w', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
        # print(graph.edata['a'].size())
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        rst = rst.reshape(N, -1, self._num_heads * self._out_feats)
        return rst, graph.edata['a']
def test_edge_softmax(g, norm_by, shp, idtype):
    g = g.astype(idtype).to(F.ctx())
    edata = F.tensor(np.random.rand(g.number_of_edges(), *shp))
    e1 = F.attach_grad(F.clone(edata))
    with F.record_grad():
        score1 = edge_softmax(g, e1, norm_by=norm_by)
        F.backward(F.reduce_sum(score1))
        grad_edata = F.grad(e1)
    with F.record_grad():
        e2 = F.attach_grad(F.clone(edata))
        e2_2d = F.reshape(
            e2, (g.number_of_src_nodes(), g.number_of_dst_nodes(), *e2.shape[1:]))
        if norm_by == 'src':
            score2 = F.softmax(e2_2d, 1)
            score2 = F.reshape(score2, (-1, *e2.shape[1:]))
        if norm_by == 'dst':
            score2 = F.softmax(e2_2d, 0)
            score2 = F.reshape(score2, (-1, *e2.shape[1:]))
        assert F.allclose(score1, score2)
        print('forward passed')
        F.backward(F.reduce_sum(score2))
        assert F.allclose(F.grad(e2), grad_edata)
        print('backward passed')
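# --- Illustrative sketch (not part of the backend-agnostic test above) ---
# What edge_softmax computes on a toy graph: with the default norm_by='dst', the
# scores of all edges that share a destination node are normalized together.
# The graph and scores below are made up for illustration.
import dgl
import torch
from dgl.ops import edge_softmax as toy_edge_softmax

toy_g = dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2])))  # edges 0->1, 0->2, 1->2
scores = torch.tensor([[1.0], [2.0], [3.0]])  # one unnormalized score per edge

a = toy_edge_softmax(toy_g, scores)
# Node 1 has a single incoming edge -> weight 1.0.
# Node 2 has two incoming edges with scores 2 and 3 -> weights ~[0.269, 0.731].
print(a)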
def forward(self, g, feat):
    with g.local_scope():
        if self.aggre_type == 'attention':
            if isinstance(feat, tuple):
                h_src = self.feat_drop(feat[0]).view(-1, self.num_heads, self.in_size)
                h_dst = self.feat_drop(feat[1]).view(-1, self.num_heads, self.in_size)
            el = (h_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
            g.srcdata.update({'ft': h_src, 'el': el})
            g.apply_edges(fn.copy_u('el', 'e'))
            e = self.leaky_relu(g.edata.pop('e'))
            g.edata['a'] = self.attn_drop(edge_softmax(g, e))
            g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
            rst = g.dstdata['ft'].flatten(1)
            if self.residual:
                rst = rst + h_dst
            if self.activation:
                rst = self.activation(rst)
        elif self.aggre_type == 'mean':
            h_src = self.feat_drop(feat[0]).view(-1, self.in_size * self.num_heads)
            g.srcdata['ft'] = h_src
            g.update_all(fn.copy_u('ft', 'm'), fn.mean('m', 'ft'))
            rst = g.dstdata['ft']
        elif self.aggre_type == 'pool':
            h_src = self.feat_drop(feat[0]).view(-1, self.in_size * self.num_heads)
            g.srcdata['ft'] = F.relu(self.fc_pool(h_src))
            g.update_all(fn.copy_u('ft', 'm'), fn.mean('m', 'ft'))
            rst = g.dstdata['ft']
        return rst
def forward(self, g, feat):
    """
    :param g: DGLGraph, bipartite graph containing a single relation
    :param feat: tensor(N_src, d_in) or (tensor(N_src, d_in), tensor(N_dst, d_in)), input features
    :return: tensor(N_dst, d_out), representation of the destination nodes for this relation
    """
    with g.local_scope():
        feat_src, feat_dst = expand_as_pair(feat, g)
        # (N_src, d_in) -> (N_src, d_out) -> (N_src, K, d_out/K)
        k = self.k_linear(feat_src).view(-1, self.num_heads, self.d_k)
        v = self.v_linear(feat_src).view(-1, self.num_heads, self.d_k)
        q = self.q_linear(feat_dst).view(-1, self.num_heads, self.d_k)

        # k[:, h] @= w_att[h] => k[n, h, j] = ∑(i) k[n, h, i] * w_att[h, i, j]
        k = torch.einsum('nhi,hij->nhj', k, self.w_att)
        v = torch.einsum('nhi,hij->nhj', v, self.w_msg)

        g.srcdata.update({'k': k, 'v': v})
        g.dstdata['q'] = q
        g.apply_edges(fn.v_dot_u('q', 'k', 't'))  # g.edata['t']: (E, K, 1)
        attn = g.edata.pop('t').squeeze(dim=-1) * self.mu / math.sqrt(self.d_k)
        attn = edge_softmax(g, attn)  # (E, K)
        self.attn = attn.detach()
        g.edata['t'] = attn.unsqueeze(dim=-1)  # (E, K, 1)
        g.update_all(fn.u_mul_e('v', 't', 'm'), fn.sum('m', 'h'))
        out = g.dstdata['h'].view(-1, self.out_dim)  # (N_dst, d_out)
        return out
def forward(self, g, node_feat, edge_feat):
    """
    :param g: DGLGraph, graph formed by the neighbors under the given metapath;
        each edge corresponds to one metapath instance
    :param node_feat: tensor(N, d_in), input node features; N is the number of destination nodes of g
    :param edge_feat: tensor(E, L, d_in), metapath-instance features (composed of the features of the
        intermediate nodes); E is the number of edges of g, L is the metapath length
    :return: tensor(N, K, d_out), output node features; K is the number of attention heads
    """
    # Difference from node-level attention in GAT/HAN: attention is computed over metapath
    # instances instead of metapath-based neighbors, so the intermediate nodes of each
    # instance are taken into account.
    with g.local_scope():
        edge_feat = self.encoder(edge_feat)  # (E, L, d_in) -> (E, K*d_out)
        edge_feat = edge_feat.view(-1, self.num_heads, self.out_dim)  # (E, K, d_out)

        # a^T (h_p || h_v) = (a_l^T || a_r^T) (h_p || h_v) = a_l^T h_p + a_r^T h_v = el + er
        el = (edge_feat * self.attn_l).sum(dim=-1).unsqueeze(dim=-1)  # (E, K, 1)
        er = self.attn_r(node_feat).unsqueeze(dim=-1)  # (N, K, 1)
        g.edata.update({'ft': edge_feat, 'el': el})
        g.dstdata['er'] = er
        g.apply_edges(fn.e_add_v('el', 'er', 'e'))
        e = self.leaky_relu(g.edata.pop('e'))
        g.edata['a'] = self.attn_drop(edge_softmax(g, e))  # (E, K, 1)

        # message passing
        g.update_all(lambda edges: {'m': edges.data['ft'] * edges.data['a']},
                     fn.sum('m', 'ft'))
        ret = g.dstdata['ft']
        if self.activation:
            ret = self.activation(ret)
        return ret
def forward(self, g, feat):
    """
    :param g: DGLGraph, homogeneous graph
    :param feat: tensor(N_src, d_in), input node features
    :return: tensor(N_dst, K, d_out), output node features
    """
    with g.local_scope():
        feat_src = self.fc(self.feat_drop(feat)).view(-1, self.num_heads, self.out_dim)
        feat_dst = feat_src[:g.num_dst_nodes()] if g.is_block else feat_src
        e = self.leaky_relu(self.attn(g, feat_src, feat_dst))  # (E, K, 1)
        g.edata['a'] = self.attn_drop(edge_softmax(g, e))  # (E, K, 1)
        g.srcdata['ft'] = feat_src

        # message passing
        g.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        out = g.dstdata['ft']  # (N_dst, K, d_out)

        if self.training:
            # negative sampling
            neg_g = dgl.graph(
                self.neg_sampler(g, list(range(g.num_edges()))),
                num_nodes=g.num_nodes(), device=g.device)
            neg_e = self.attn(neg_g, feat_src, feat_src)  # (E', K, 1)
            self.attn_x = torch.cat([e, neg_e]).squeeze(dim=-1).mean(dim=1)  # (E+E',)
            self.attn_y = torch.cat([torch.ones(e.shape[0]), torch.zeros(neg_e.shape[0])]) \
                .to(self.attn_x.device)

        if self.activation:
            out = self.activation(out)
        return out
def forward(
        self,
        value: Union[Tensor, Dict[str, Tensor]],  # edge features (may be fused)
        key: Union[Tensor, Dict[str, Tensor]],    # edge features (may be fused)
        query: Dict[str, Tensor],                 # node features
        graph: DGLGraph):
    with nvtx_range('AttentionSE3'):
        with nvtx_range('reshape keys and queries'):
            if isinstance(key, Tensor):
                # case where features of all types are fused
                key = key.reshape(key.shape[0], self.num_heads, -1)
                # need to reshape queries that way to keep the same layout as keys
                out = torch.cat([query[str(d)] for d in self.key_fiber.degrees], dim=-1)
                query = out.reshape(list(query.values())[0].shape[0], self.num_heads, -1)
            else:
                # features are not fused, need to fuse and reshape them
                key = self.key_fiber.to_attention_heads(key, self.num_heads)
                query = self.key_fiber.to_attention_heads(query, self.num_heads)

        with nvtx_range('attention dot product + softmax'):
            # Compute attention weights (softmax of inner product between key and query)
            edge_weights = dgl.ops.e_dot_v(graph, key, query).squeeze(-1)
            edge_weights = edge_weights / np.sqrt(self.key_fiber.num_features)
            edge_weights = edge_softmax(graph, edge_weights)
            edge_weights = edge_weights[..., None, None]

        with nvtx_range('weighted sum'):
            if isinstance(value, Tensor):
                # features of all types are fused
                v = value.view(value.shape[0], self.num_heads, -1, value.shape[-1])
                weights = edge_weights * v
                feat_out = dgl.ops.copy_e_sum(graph, weights)
                feat_out = feat_out.view(feat_out.shape[0], -1, feat_out.shape[-1])  # merge heads
                out = unfuse_features(feat_out, self.value_fiber.degrees)
            else:
                out = {}
                for degree, channels in self.value_fiber:
                    v = value[str(degree)].view(-1, self.num_heads,
                                                channels // self.num_heads,
                                                degree_to_dim(degree))
                    weights = edge_weights * v
                    res = dgl.ops.copy_e_sum(graph, weights)
                    out[str(degree)] = res.view(-1, channels, degree_to_dim(degree))  # merge heads

        return out
def forward(self, G, h):
    with G.local_scope():
        node_dict, edge_dict = self.node_dict, self.edge_dict
        for srctype, etype, dsttype in G.canonical_etypes:
            sub_graph = G[srctype, etype, dsttype]

            k_linear = self.k_linears[node_dict[srctype]]
            v_linear = self.v_linears[node_dict[srctype]]
            q_linear = self.q_linears[node_dict[dsttype]]

            k = k_linear(h[srctype]).view(-1, self.n_heads, self.d_k)
            v = v_linear(h[srctype]).view(-1, self.n_heads, self.d_k)
            q = q_linear(h[dsttype]).view(-1, self.n_heads, self.d_k)

            e_id = self.edge_dict[etype]
            relation_att = self.relation_att[e_id]
            relation_pri = self.relation_pri[e_id]
            relation_msg = self.relation_msg[e_id]

            # apply the per-relation attention and message transformations
            k = torch.einsum("bij,ijk->bik", k, relation_att)
            v = torch.einsum("bij,ijk->bik", v, relation_msg)

            sub_graph.srcdata['k'] = k
            sub_graph.dstdata['q'] = q
            sub_graph.srcdata['v'] = v

            sub_graph.apply_edges(fn.v_dot_u('q', 'k', 't'))
            attn_score = sub_graph.edata.pop('t').sum(-1) * relation_pri / self.sqrt_dk
            attn_score = edge_softmax(sub_graph, attn_score, norm_by='dst')

            sub_graph.edata['t'] = attn_score.unsqueeze(-1)

        G.multi_update_all({etype: (fn.u_mul_e('v', 't', 'm'), fn.sum('m', 't'))
                            for etype in edge_dict},
                           cross_reducer='mean')

        new_h = {}
        for ntype in G.ntypes:
            '''
                Step 3: Target-specific Aggregation
                x = norm( W[node_type] * gelu( Agg(x) ) + x )
            '''
            n_id = node_dict[ntype]
            alpha = torch.sigmoid(self.skip[n_id])
            t = G.nodes[ntype].data['t'].view(-1, self.out_dim)
            trans_out = self.drop(self.a_linears[n_id](t))
            trans_out = trans_out * alpha + h[ntype] * (1 - alpha)
            if self.use_norm:
                new_h[ntype] = self.norms[n_id](trans_out)
            else:
                new_h[ntype] = trans_out
        return new_h
def forward(self, g, feat_src, feat_dst):
    if self.batch_norm_q is not None:
        feat_src = self.batch_norm_q(feat_src)
        feat_dst = self.batch_norm_k(feat_dst)
    if self.feat_drop is not None:
        feat_src = self.feat_drop(feat_src)
        feat_dst = self.feat_drop(feat_dst)
    score = F.u_dot_v(g, feat_src, feat_dst)  # (num_edges, 1)
    weight = F.edge_softmax(g, score)
    rst = F.u_mul_e_sum(g, feat_src, weight)
    rst = th.relu(self.fc(rst))
    return rst
def forward(self, g, feats):
    """
    :param g: DGLGraph, heterogeneous graph
    :param feats: Dict[str, tensor(N_i, d_in)], mapping from node type to input node features
    :return: Dict[str, tensor(N_i, d_out)], mapping from node type to output features
    """
    with g.local_scope():
        for stype, etype, dtype in g.canonical_etypes:
            sg = g[stype, etype, dtype]
            feat_src, feat_dst = feats[stype], feats[dtype]

            # (N_i, d_in) -> (N_i, d_out) -> (N_i, K, d_out/K)
            k = self.k_linears[stype](feat_src).view(-1, self.num_heads, self.d_k)
            v = self.v_linears[stype](feat_src).view(-1, self.num_heads, self.d_k)
            q = self.q_linears[dtype](feat_dst).view(-1, self.num_heads, self.d_k)

            # k[:, h] @= w_att[h] => k[n, h, j] = ∑(i) k[n, h, i] * w_att[h, i, j]
            k = torch.einsum('nhi,hij->nhj', k, self.w_att[etype])
            v = torch.einsum('nhi,hij->nhj', v, self.w_msg[etype])

            sg.srcdata.update({'k': k, f'v_{etype}': v})
            sg.dstdata['q'] = q

            # Step 1: heterogeneous mutual attention
            sg.apply_edges(fn.v_dot_u('q', 'k', 't'))  # sg.edata['t']: (E, K, 1)
            attn = sg.edata.pop('t').squeeze(dim=-1) * self.mu[etype] / math.sqrt(self.d_k)
            attn = edge_softmax(sg, attn)  # (E, K)
            sg.edata['t'] = attn.unsqueeze(dim=-1)

        # Step 2: heterogeneous message passing + target-specific aggregation
        g.multi_update_all(
            {etype: (fn.u_mul_e(f'v_{etype}', 't', 'm'), fn.sum('m', 'h')) for etype in g.etypes},
            'mean')

        # Step 3: residual connection
        out_feats = {}
        for ntype in g.ntypes:
            alpha = torch.sigmoid(self.skip[ntype])
            h = g.nodes[ntype].data['h'].view(-1, self.out_dim)
            trans_out = self.drop(self.a_linears[ntype](h))
            out = alpha * trans_out + (1 - alpha) * feats[ntype]
            out_feats[ntype] = self.norms[ntype](out) if self.use_norm else out
        return out_feats
def forward(self, sg, feat):
    if self.batch_norm is not None:
        feat = self.batch_norm(feat)
    feat = self.feat_drop(feat)
    q = self.fc_q(feat)
    k = self.fc_k(feat)
    v = self.fc_v(feat)
    e = F.u_add_v(sg, q, k)
    e = self.fc_e(th.sigmoid(e))
    a = F.edge_softmax(sg, e)
    rst = F.u_mul_e_sum(sg, v, a)
    if self.activation is not None:
        rst = self.activation(rst)
    return rst
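# --- Illustrative sketch (not part of the original snippets) ---
# The two layers above call DGL's functional operators directly (the `F` there presumably
# aliases `dgl.ops`). A minimal, self-contained version of that score -> edge_softmax ->
# weighted-sum pattern on a toy graph with made-up feature sizes:
import dgl
import dgl.ops as ops
import torch

toy_g = dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2])))
q = torch.randn(toy_g.num_nodes(), 8)  # "query" features read on destination nodes
k = torch.randn(toy_g.num_nodes(), 8)  # "key" features read on source nodes
v = torch.randn(toy_g.num_nodes(), 8)  # "value" features read on source nodes

score = ops.u_dot_v(toy_g, k, q)        # (num_edges, 1): unnormalized per-edge score
alpha = ops.edge_softmax(toy_g, score)  # normalize over each destination's incoming edges
out = ops.u_mul_e_sum(toy_g, v, alpha)  # (num_nodes, 8): attention-weighted neighbor sum
print(out.shape)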
def forward(self, graph, feat, soft_label):
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        if self.ptype == 'ind':
            feat_src = h_dst = self.feat_drop(feat)
            el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
            er = th.zeros(graph.num_nodes(), device=graph.device)
        elif self.ptype == 'tra':
            feat_src = self.feat_drop(self.fc_emb)
            feat_dst = h_dst = th.zeros(graph.num_nodes(), device=graph.device)
            el = feat_src
            er = feat_dst
        cog_label = soft_label
        graph.srcdata.update({'ft': cog_label, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        # graph.edata['e'] = th.ones(graph.num_edges(), device=graph.device)  # non-parameterized PLP
        e = graph.edata.pop('e')
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
        att = graph.edata['a'].squeeze()
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        if self.mlp_layers > 0:
            rst = th.sigmoid(self.lr_alpha) * graph.dstdata['ft'] + \
                  th.sigmoid(-self.lr_alpha) * self.mlp(feat)
        else:
            rst = graph.dstdata['ft']
        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h_dst)
            rst = rst + resval
        # activation
        if self.activation:
            rst = self.activation(rst)
        return rst, att, th.sigmoid(self.lr_alpha).squeeze(), el.squeeze(), er.squeeze()
def forward(self, user_feat, hi):
    trust_emb = user_feat[self.row_idxs]
    trustee_emb = hi[self.col_idxs]
    weight = self.att(trust_emb, trustee_emb).view(-1, 1)
    # value = edge_softmax(self.uu_g, weight, norm_by='src').view(-1)
    value = edge_softmax(self.uu_g, weight).view(-1)
    A = t.sparse.FloatTensor(self.idxs, value, self.shape).detach()
    A = A.transpose(0, 1)
    if self.act is None:
        hs = self.w(t.spmm(A, hi))
    else:
        hs = self.act(self.w(t.spmm(A, hi)))
    return hs
def forward(self, graph, feat, soft_label):
    graph = graph.local_var()
    if not self._allow_zero_in_degree:
        if (graph.in_degrees() == 0).any():
            raise DGLError('There are 0-in-degree nodes in the graph, '
                           'output for those nodes will be invalid. '
                           'This is harmful for some applications, '
                           'causing silent performance regression. '
                           'Adding self-loop on the input graph by '
                           'calling `g = dgl.add_self_loop(g)` will resolve '
                           'the issue. Setting ``allow_zero_in_degree`` '
                           'to be `True` when constructing this module will '
                           'suppress the check and let the code run.')

    h_src = feat
    feat_src = feat_dst = self.fc(h_src)
    if graph.is_block:
        feat_dst = feat_src[:graph.number_of_dst_nodes()]

    # Assign features to nodes
    graph.srcdata.update({'ft': feat_src})
    graph.dstdata.update({'ft': feat_dst})
    # Step 1. dot product
    graph.apply_edges(fn.u_dot_v('ft', 'ft', 'a'))
    # graph.edata['a'] = th.ones(graph.num_edges(), device=graph.device)

    # Step 2. edge softmax to compute attention scores
    graph.edata['sa'] = edge_softmax(graph, graph.edata['a'])
    att = graph.edata['sa'].squeeze()

    cog_label = soft_label
    # cog_label = self.fc2(feat)
    # cog_label = th.sigmoid(self.lr_alpha) * soft_label + th.sigmoid(-self.lr_alpha) * self.fc2(feat)
    graph.srcdata.update({'ft': cog_label})
    graph.dstdata.update({'ft': cog_label})

    # Step 3. Broadcast softmax value to each edge, and aggregate dst node
    graph.update_all(fn.u_mul_e('ft', 'sa', 'attn'), fn.sum('attn', 'agg_u'))

    # output results to the destination nodes
    rst = graph.dstdata['agg_u']
    return rst, att, th.sigmoid(self.lr_alpha).squeeze()
def forward(self, graph, feat, attn_feat):
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        h_src = self.feat_drop(feat)
        attn_h_src = self.feat_drop(attn_feat)
        feat_src = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
        attn_feat_src = attn_feat_dst = self.fc_attn(attn_h_src).view(
            -1, self._num_heads, self._out_feats)
        if graph.is_block:
            attn_feat_dst = attn_feat_src[:graph.number_of_dst_nodes()]
        el = (attn_feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (attn_feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(dgl.function.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
        # message passing
        graph.update_all(dgl.function.u_mul_e('ft', 'a', 'm'),
                         dgl.function.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        return rst
def forward(self, graph, feat):
    with graph.local_scope():
        h_src = h_dst = self.feat_drop(feat).view(-1, self._num_heads, self.in_size)
        el = (h_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (h_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': h_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        # residual
        if self.residual:
            rst = rst + h_dst
        # activation
        if self.activation:
            rst = self.activation(rst)
        return rst
def forward(self, graph, feat):
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                assert False
        if isinstance(feat, tuple):
            h_src = self.feat_drop(feat[0])
            h_dst = self.feat_drop(feat[1])
            if not hasattr(self, "fc_src"):
                self.fc_src, self.fc_dst = self.fc, self.fc
            feat_src, feat_dst = h_src, h_dst
            feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
            feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
        else:
            h_src = h_dst = self.feat_drop(feat)
            feat_src, feat_dst = h_src, h_dst
            feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
            if graph.is_block:
                feat_dst = feat_src[:graph.number_of_dst_nodes()]

        if self._norm == "both":
            degs = graph.out_degrees().float().clamp(min=1)
            norm = torch.pow(degs, -0.5)
            shp = norm.shape + (1,) * (feat_src.dim() - 1)
            norm = torch.reshape(norm, shp)
            feat_src = feat_src * norm

        # NOTE: GAT paper uses "first concatenation then linear projection"
        # to compute attention scores, while ours is "first projection then
        # addition"; the two approaches are mathematically equivalent:
        # We decompose the weight vector a mentioned in the paper into
        # [a_l || a_r], then
        # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
        # Our implementation is much more efficient because we do not need to
        # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus,
        # the addition can be optimized with DGL's built-in function u_add_v,
        # which further speeds up computation and saves memory footprint.
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({"ft": feat_src, "el": el})
        graph.dstdata.update({"er": er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v("el", "er", "e"))
        e = self.leaky_relu(graph.edata.pop("e"))
        # compute softmax
        graph.edata["a"] = self.attn_drop(edge_softmax(graph, e))
        # message passing
        graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
        rst = graph.dstdata["ft"]

        if self._norm == "both":
            degs = graph.in_degrees().float().clamp(min=1)
            norm = torch.pow(degs, 0.5)
            shp = norm.shape + (1,) * (feat_dst.dim() - 1)
            norm = torch.reshape(norm, shp)
            rst = rst * norm

        # residual
        if self.res_fc is not None:
            resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats)
            rst = rst + resval
        # activation
        if self._activation is not None:
            rst = self._activation(rst)
        return rst
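# --- Illustrative check (not part of the original snippets) ---
# The NOTE above claims a^T [Wh_i || Wh_j] = a_l^T Wh_i + a_r^T Wh_j once the attention
# vector a is split as [a_l || a_r]. A quick numerical sanity check with made-up dimensions:
import torch

torch.manual_seed(0)
d = 8                                        # illustrative feature dimension
a = torch.randn(2 * d)                       # full attention vector a = [a_l || a_r]
a_l, a_r = a[:d], a[d:]
wh_i, wh_j = torch.randn(d), torch.randn(d)  # projected features Wh_i, Wh_j

lhs = a @ torch.cat([wh_i, wh_j])            # a^T [Wh_i || Wh_j]
rhs = a_l @ wh_i + a_r @ wh_j                # a_l^T Wh_i + a_r^T Wh_j
assert torch.allclose(lhs, rhs)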
def forward(self, graph, feat):
    r"""Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor or pair of torch.Tensor
        If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where
        :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
        If a pair of torch.Tensor is given, the pair must contain two tensors of shape
        :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.
    """
    graph = graph.local_var()
    if isinstance(feat, tuple):
        h_src = self.feat_drop(feat[0])
        h_dst = self.feat_drop(feat[1])
        feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
        feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
    else:
        h_src = h_dst = self.feat_drop(feat)
        feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats)

    if self.opt['att_type'] == "GAT":
        # NOTE: GAT paper uses "first concatenation then linear projection"
        # to compute attention scores, while ours is "first projection then
        # addition"; the two approaches are mathematically equivalent:
        # We decompose the weight vector a mentioned in the paper into
        # [a_l || a_r], then
        # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
        # Our implementation is much more efficient because we do not need to
        # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus,
        # the addition can be optimized with DGL's built-in function u_add_v,
        # which further speeds up computation and saves memory footprint.
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
    elif self.opt['att_type'] == "cosine":
        el = feat_src * self.attn_l
        er = feat_dst * self.attn_r
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        graph.srcdata['norm_h'] = F.normalize(el, p=2, dim=-1)
        graph.dstdata['norm_h'] = F.normalize(er, p=2, dim=-1)
        # compute cosine similarity
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        e = graph.edata.pop('cos')
    elif self.opt['att_type'] == "scaled_dot":
        el = feat_src * self.attn_l
        er = feat_dst * self.attn_r / th.sqrt(
            th.tensor(self.opt['num_hidden'] / self.opt['num_heads']))
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute dot product
        graph.apply_edges(fn.u_dot_v('el', 'er', 'dot'))
        e = graph.edata.pop('dot')
    elif self.opt['att_type'] == "pearson":
        el = feat_src * self.attn_l
        er = feat_dst * self.attn_r
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        src_mu = th.mean(el, dim=1, keepdim=True)
        graph.srcdata['norm_h'] = F.normalize(el - src_mu, p=2, dim=-1)
        dst_mu = th.mean(er, dim=1, keepdim=True)
        graph.dstdata['norm_h'] = F.normalize(er - dst_mu, p=2, dim=-1)
        # compute cosine similarity of the centered features
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        e = graph.edata.pop('cos')
    elif self.opt['att_type'] == "spearman":
        # TODO: check all these operations
        el = feat_src * self.attn_l
        er = feat_dst * self.attn_r
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        el = el.view(-1, self._out_feats)
        er = er.view(-1, self._out_feats)
        el = soft_rank(el, regularization_strength=1.0)
        er = soft_rank(er, regularization_strength=1.0)
        ranked_src = soft_rank(1000 * F.normalize(el, p=2, dim=-1))  # , regularization_strength=0.1)
        ranked_dst = soft_rank(1000 * F.normalize(er, p=2, dim=-1), regularization_strength=0.1)
        src_mu = th.mean(ranked_src, dim=1, keepdim=True)
        dst_mu = th.mean(ranked_dst, dim=1, keepdim=True)
        el = F.normalize(ranked_src - src_mu, p=2, dim=-1)
        er = F.normalize(ranked_dst - dst_mu, p=2, dim=-1)
        el = el.view(-1, self._num_heads, self._out_feats)
        er = er.view(-1, self._num_heads, self._out_feats)
        graph.srcdata['norm_h'] = F.normalize(el, p=2, dim=-1)
        graph.dstdata['norm_h'] = F.normalize(er, p=2, dim=-1)
        # compute cosine similarity of the normalized ranks
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        e = graph.edata.pop('cos')

    # compute softmax
    graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
    # message passing
    graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
    rst = graph.dstdata['ft']
    # residual
    if self.res_fc is not None:
        resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats)
        rst = rst + resval
    # activation
    if self.activation:
        rst = self.activation(rst)
    return rst
def forward(self, graph, feat):
    r"""
    Description
    -----------
    Compute graph attention network layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor or pair of torch.Tensor
        If a torch.Tensor is given, the input feature of shape :math:`(N, D_{in})` where
        :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
        If a pair of torch.Tensor is given, the pair must contain two tensors of shape
        :math:`(N_{in}, D_{in_{src}})` and :math:`(N_{out}, D_{in_{dst}})`.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, H, D_{out})` where :math:`H`
        is the number of heads, and :math:`D_{out}` is size of output feature.

    Raises
    ------
    DGLError
        If there are 0-in-degree nodes in the input graph, it will raise DGLError
        since no message will be passed to those nodes. This will cause invalid output.
        The error can be ignored by setting ``allow_zero_in_degree`` parameter to ``True``.
    """
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                raise DGLError('There are 0-in-degree nodes in the graph, '
                               'output for those nodes will be invalid. '
                               'This is harmful for some applications, '
                               'causing silent performance regression. '
                               'Adding self-loop on the input graph by '
                               'calling `g = dgl.add_self_loop(g)` will resolve '
                               'the issue. Setting ``allow_zero_in_degree`` '
                               'to be `True` when constructing this module will '
                               'suppress the check and let the code run.')

        if isinstance(feat, tuple):
            h_src = self.feat_drop(feat[0])
            h_dst = self.feat_drop(feat[1])
            if not hasattr(self, 'fc_src'):
                self.fc_src, self.fc_dst = self.fc, self.fc
            feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
            feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
        else:
            h_src = h_dst = self.feat_drop(feat)
            feat_src = feat_dst = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
            if graph.is_block:
                feat_dst = feat_src[:graph.number_of_dst_nodes()]

        # NOTE: GAT paper uses "first concatenation then linear projection"
        # to compute attention scores, while ours is "first projection then
        # addition"; the two approaches are mathematically equivalent:
        # We decompose the weight vector a mentioned in the paper into
        # [a_l || a_r], then
        # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j
        # Our implementation is much more efficient because we do not need to
        # save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus,
        # the addition can be optimized with DGL's built-in function u_add_v,
        # which further speeds up computation and saves memory footprint.
        el = (feat_src * self.attn_l).sum(dim=-1).unsqueeze(-1)
        er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
        graph.srcdata.update({'ft': feat_src, 'el': el})
        graph.dstdata.update({'er': er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        graph.apply_edges(fn.u_add_v('el', 'er', 'e'))
        e = self.leaky_relu(graph.edata.pop('e'))
        # compute softmax
        graph.edata['a'] = self.attn_drop(edge_softmax(graph, e))
        # message passing
        graph.update_all(fn.u_mul_e('ft', 'a', 'm'), fn.sum('m', 'ft'))
        rst = graph.dstdata['ft']
        # residual
        rst = rst.flatten(1)
        rst_norm = self.layer_norm(rst)
        if self.res_fc is not None:
            resval = self.res_fc(h_dst).view(h_dst.shape[0], -1, self._out_feats).flatten(1)
            rst_norm = self.feat_drop(rst_norm) + resval
        # activation
        rst = self.activation(rst_norm)
        rst = rst_norm + self.feat_drop(rst)
        rst = self.ff_layer_norm(rst)
        return rst
def forward(self, g, ft_src):
    if self.batch_norm is not None:
        ft_src = {ntype: self.batch_norm[ntype](ft) for ntype, ft in ft_src.items()}
    if self.feat_drop is not None:
        ft_src = {ntype: self.feat_drop(ft) for ntype, ft in ft_src.items()}
    device = next(iter(ft_src.values())).device
    ft_dst = {vtype: ft_src[vtype][:g.number_of_dst_nodes(vtype)] for vtype in g.dsttypes}

    feats = {}
    for vtype, eutypes in self.vtype2eutypes.items():
        src_nid = []
        dst_nid = []
        num_utypes_nodes = 0
        src_val = []
        attn_score = []
        for etype, utype in eutypes:
            sg = g[etype]
            ft_e = (self.edge_embedding[etype](sg.edata['cnt'].to(device))
                    if etype in self.edge_embedding else None)
            e, v = self.edge_aggregate[etype](
                sg,
                ft_src[utype],
                ft_dst[vtype],
                ft_e=ft_e,
                return_ev=True,
            )
            uid, vid = sg.all_edges(form='uv', order='eid')
            src_nid.append(uid + num_utypes_nodes)
            dst_nid.append(vid)
            num_utypes_nodes += sg.number_of_src_nodes()
            src_val.append(v)
            attn_score.append(e)
        src_nid = th.cat(src_nid, dim=0)
        dst_nid = th.cat(dst_nid, dim=0)
        edge_softmax_g = dgl.heterograph(
            data_dict={('utypes', 'etypes', 'vtype'): (src_nid, dst_nid)},
            num_nodes_dict={
                'utypes': num_utypes_nodes,
                'vtype': g.number_of_dst_nodes(vtype)
            },
            device=device)
        src_val = th.cat(src_val, dim=0)        # (num_utypes_nodes, num_heads, num_feats)
        attn_score = th.cat(attn_score, dim=0)  # (num_edges, num_heads, 1)
        attn_weight = F.edge_softmax(edge_softmax_g, attn_score)
        agg = F.u_mul_e_sum(edge_softmax_g, src_val, attn_weight)
        agg = agg.view(g.number_of_dst_nodes(vtype), -1)
        feats[vtype] = self.activation[vtype](
            self.linear_agg[vtype](agg) + self.linear_self[vtype](ft_dst[vtype]))
    return feats
def forward(self, graph, feat):
    r"""Compute AGNN layer.

    Parameters
    ----------
    graph : DGLGraph
        The graph.
    feat : torch.Tensor
        The input feature of shape :math:`(N, *)`, where :math:`N` is the number of
        nodes and :math:`*` could be of any shape.
        If a pair of torch.Tensor is given, the pair must contain two tensors of shape
        :math:`(N_{in}, *)` and :math:`(N_{out}, *)`, and the :math:`*` in the latter
        tensor must equal that of the former one.

    Returns
    -------
    torch.Tensor
        The output feature of shape :math:`(N, *)` where :math:`*`
        should be the same as the input shape.
    """
    graph = graph.local_var()
    feat_src, feat_dst = expand_as_pair(feat)
    graph.srcdata['h'] = feat_src

    if self.opt['att_type'] == "AGNN":
        graph.srcdata['norm_h'] = F.normalize(feat_src, p=2, dim=-1)
        if isinstance(feat, tuple):
            graph.dstdata['norm_h'] = F.normalize(feat_dst, p=2, dim=-1)
        # compute cosine similarity
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        cos = graph.edata.pop('cos')
        e = self.beta * cos
    elif self.opt['att_type'] == "cosine":  # same as AGNN
        graph.srcdata['norm_h'] = F.normalize(feat_src, p=2, dim=-1)
        if isinstance(feat, tuple):
            graph.dstdata['norm_h'] = F.normalize(feat_dst, p=2, dim=-1)
        # compute cosine similarity
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        cos = graph.edata.pop('cos')
        e = self.beta * cos
    elif self.opt['att_type'] == "scaled_dot":
        if isinstance(feat, tuple):
            graph.dstdata['h'] = feat_dst / th.sqrt(th.tensor(self.opt['num_hidden']))
        # compute dot product
        graph.apply_edges(fn.u_dot_v('h', 'h', 'dot'))
        dot = graph.edata.pop('dot')
        e = self.beta * dot
    elif self.opt['att_type'] == "pearson":
        src_mu = th.mean(feat_src, dim=1, keepdim=True)
        graph.srcdata['norm_h'] = F.normalize(feat_src - src_mu, p=2, dim=-1)
        if isinstance(feat, tuple):
            dst_mu = th.mean(feat_dst, dim=1, keepdim=True)
            graph.dstdata['norm_h'] = F.normalize(feat_dst - dst_mu, p=2, dim=-1)
        # compute cosine similarity of the centered features
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        cos = graph.edata.pop('cos')
        e = self.beta * cos
    elif self.opt['att_type'] == "spearman":
        # F.normalize(feat_src, p=2, dim=1).detach().numpy()
        ranked_src = soft_rank(1000 * F.normalize(feat_src, p=2, dim=-1))  # , regularization_strength=0.1)
        src_mu = th.mean(ranked_src, dim=1, keepdim=True)
        graph.srcdata['norm_h'] = F.normalize(ranked_src - src_mu, p=2, dim=-1)
        if isinstance(feat, tuple):
            ranked_dst = soft_rank(1000 * F.normalize(feat_dst, p=2, dim=-1),
                                   regularization_strength=0.1)
            dst_mu = th.mean(ranked_dst, dim=1, keepdim=True)
            graph.dstdata['norm_h'] = F.normalize(ranked_dst - dst_mu, p=2, dim=-1)
        # compute cosine similarity of the normalized ranks
        graph.apply_edges(fn.u_dot_v('norm_h', 'norm_h', 'cos'))
        cos = graph.edata.pop('cos')
        e = self.beta * cos

    graph.edata['p'] = edge_softmax(graph, e)
    graph.update_all(fn.u_mul_e('h', 'p', 'm'), fn.sum('m', 'h'))
    return graph.dstdata.pop('h')
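# --- Illustrative check (not part of the original snippets) ---
# The cosine-style branches above compute fn.u_dot_v on L2-normalized features. The toy
# sketch below (made-up graph and features) confirms this equals the cosine similarity
# of the raw features along each edge.
import dgl
import dgl.function as toy_fn
import torch
import torch.nn.functional as toy_F

toy_g = dgl.graph((torch.tensor([0, 1]), torch.tensor([2, 2])))  # edges 0->2, 1->2
h = torch.randn(3, 16)
toy_g.ndata['norm_h'] = toy_F.normalize(h, p=2, dim=-1)
toy_g.apply_edges(toy_fn.u_dot_v('norm_h', 'norm_h', 'cos'))

u, v = toy_g.edges()
expected = toy_F.cosine_similarity(h[u], h[v], dim=-1).unsqueeze(-1)
assert torch.allclose(toy_g.edata['cos'], expected, atol=1e-6)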
def forward(self, graph, feat):
    with graph.local_scope():
        if not self._allow_zero_in_degree:
            if (graph.in_degrees() == 0).any():
                assert False
        if isinstance(feat, tuple):
            h_src = self.feat_drop(feat[0])
            h_dst = self.feat_drop(feat[1])
            if not hasattr(self, "fc_src"):
                self.fc_src, self.fc_dst = self.fc, self.fc
            feat_src, feat_dst = h_src, h_dst
            feat_src = self.fc_src(h_src).view(-1, self._num_heads, self._out_feats)
            feat_dst = self.fc_dst(h_dst).view(-1, self._num_heads, self._out_feats)
        else:
            h_src = self.feat_drop(feat)
            feat_src = h_src
            feat_src = self.fc(h_src).view(-1, self._num_heads, self._out_feats)
            if graph.is_block:
                h_dst = h_src[:graph.number_of_dst_nodes()]
                feat_dst = feat_src[:graph.number_of_dst_nodes()]
            else:
                h_dst = h_src
                feat_dst = feat_src

        if self.training and self.edge_drop > 0:
            perm = torch.randperm(graph.number_of_edges(), device=graph.device)
            bound = int(graph.number_of_edges() * self.edge_drop)
            eids = perm[bound:]
        else:
            eids = torch.arange(graph.number_of_edges(), device=graph.device)

        el = (feat_src * self.attn_l).sum(-1).unsqueeze(-1)
        graph.srcdata.update({"ft": feat_src, "el": el})
        # graph.dstdata.update({"er": er})
        # compute edge attention, el and er are a_l Wh_i and a_r Wh_j respectively.
        if self.attn_r is not None:
            er = (feat_dst * self.attn_r).sum(dim=-1).unsqueeze(-1)
            graph.dstdata.update({"er": er})
            graph.apply_edges(fn.u_add_v("el", "er", "e"))
        else:
            graph.apply_edges(fn.copy_u("el", "e"))
        e = self.leaky_relu(graph.edata.pop("e"))
        # compute softmax
        graph.edata["a"] = torch.zeros_like(e)
        graph.edata["a"][eids] = self.attn_drop(edge_softmax(graph, e[eids], eids=eids))
        shp = graph.edata["gcn_norm"].shape + (1,) * (feat_dst.dim() - 1)
        if self._norm == "sym":
            graph.edata["a"][eids] = graph.edata["a"][eids] * torch.reshape(
                graph.edata["gcn_norm_adjust"], shp)[eids]
        if self._norm == "avg":
            graph.edata["a"][eids] = (graph.edata["a"][eids] +
                                      torch.reshape(graph.edata["gcn_norm"], shp)[eids]) / 2

        hstack = [graph.dstdata["ft"]]
        for _ in range(self._K):
            # message passing
            graph.update_all(fn.u_mul_e("ft", "a", "m"), fn.sum("m", "ft"))
            hstack.append(graph.dstdata["ft"])
        hstack = [h + self.position_emb[[k], :, :] for k, h in enumerate(hstack)]
        a_l = (hstack[0] * self.hop_attn_l).sum(dim=-1).unsqueeze(-1)
        astack_r = [(feat_dst * self.hop_attn_r).sum(dim=-1).unsqueeze(-1) for feat_dst in hstack]
        a = torch.cat([(a_r + a_l) for a_r in astack_r], dim=-1)
        # a = torch.sigmoid(a)
        a = self.leaky_relu(a)
        a = F.softmax(a, dim=-1)
        a = self.attn_drop(a)
        # a = F.dropout(a, p=0.5, training=self.training)

        rst = 0
        for i in range(a.shape[-1]):
            rst += hstack[i] * a[:, :, [i]]

        # residual
        if self.res_fc is not None:
            resval = self.res_fc(feat).view(h_dst.shape[0], -1, self._out_feats)
            rst = rst + resval
        # activation
        if self._activation is not None:
            rst = self._activation(rst)
        return rst
def forward(self, graph: dgl.DGLHeteroGraph, feat: tuple,
            dst_node_transformation_weight: nn.Parameter,
            src_node_transformation_weight: nn.Parameter,
            relation_embedding: torch.Tensor,
            relation_transformation_weight: nn.Parameter):
    r"""
    Parameters
    ----------
    graph : DGLHeteroGraph for a specific relation
    feat : pair of torch.Tensor
        The pair contains two tensors of shape (N_{in}, D_{in_{src}}) and (N_{out}, D_{in_{dst}}).
    dst_node_transformation_weight : Parameter (input_dst_dim, n_heads * hidden_dim)
    src_node_transformation_weight : Parameter (input_src_dim, n_heads * hidden_dim)
    relation_embedding : torch.Tensor, (relation_input_dim)
    relation_transformation_weight : Parameter (relation_input_dim, n_heads * 2 * hidden_dim)

    Returns
    -------
    torch.Tensor of shape (N, H, D_out), where H is the number of heads and D_out is the
    size of the output feature.
    """
    graph = graph.local_var()
    # Tensor, (N_src, input_src_dim)
    feat_src = self.dropout(feat[0])
    # Tensor, (N_dst, input_dst_dim)
    feat_dst = self.dropout(feat[1])
    # Tensor, (N_src, n_heads, hidden_dim) <- (N_src, input_src_dim) * (input_src_dim, n_heads * hidden_dim)
    feat_src = torch.matmul(feat_src, src_node_transformation_weight).view(
        -1, self._num_heads, self._out_feats)
    # Tensor, (N_dst, n_heads, hidden_dim) <- (N_dst, input_dst_dim) * (input_dst_dim, n_heads * hidden_dim)
    feat_dst = torch.matmul(feat_dst, dst_node_transformation_weight).view(
        -1, self._num_heads, self._out_feats)
    # Tensor, (n_heads, 2 * hidden_dim) <- (1, relation_input_dim) * (relation_input_dim, n_heads * 2 * hidden_dim)
    relation_attention_weight = torch.matmul(
        relation_embedding.unsqueeze(dim=0),
        relation_transformation_weight).view(self._num_heads, 2 * self._out_feats)

    # First decompose the weight vector into [a_l || a_r], then
    # a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j. This implementation is much more efficient.
    # Tensor, (N_dst, n_heads, 1) <- (N_dst, n_heads, hidden_dim) * (n_heads, hidden_dim)
    e_dst = (feat_dst * relation_attention_weight[:, :self._out_feats]).sum(dim=-1, keepdim=True)
    # Tensor, (N_src, n_heads, 1) <- (N_src, n_heads, hidden_dim) * (n_heads, hidden_dim)
    e_src = (feat_src * relation_attention_weight[:, self._out_feats:]).sum(dim=-1, keepdim=True)
    # (N_src, n_heads, hidden_dim), (N_src, n_heads, 1)
    graph.srcdata.update({'ft': feat_src, 'e_src': e_src})
    # (N_dst, n_heads, 1)
    graph.dstdata.update({'e_dst': e_dst})

    # compute edge attention; e_src and e_dst are a_src * Wh_src and a_dst * Wh_dst respectively.
    graph.apply_edges(fn.u_add_v('e_src', 'e_dst', 'e'))
    # shape (edges_num, heads, 1)
    e = self.leaky_relu(graph.edata.pop('e'))
    # compute softmax
    graph.edata['a'] = edge_softmax(graph, e)
    graph.update_all(fn.u_mul_e('ft', 'a', 'msg'), fn.sum('msg', 'feat'))
    # (N_dst, n_heads * hidden_dim), reshaped from (N_dst, n_heads, hidden_dim)
    dst_features = graph.dstdata.pop('feat').reshape(-1, self._num_heads * self._out_feats)
    dst_features = self.relu(dst_features)
    return dst_features