def init_learning(self):
    # Get window function
    self.feature_window = TensorList([dcf.hann2d(sz) for sz in self.feature_sz])

    # Filter regularization
    self.filter_reg = self.fparams.attribute('filter_reg')

    # Activation function after the projection matrix (phi_1 in the paper)
    projection_activation = getattr(self.params, 'projection_activation', 'none')
    if isinstance(projection_activation, tuple):
        projection_activation, act_param = projection_activation

    if projection_activation == 'none':
        self.projection_activation = lambda x: x
    elif projection_activation == 'relu':
        self.projection_activation = layers.relu
    elif projection_activation == 'elu':
        self.projection_activation = layers.elu
    elif projection_activation == 'mlu':
        self.projection_activation = lambda x: layers.elu(
            layers.leaky_relu(x, 1 / act_param), act_param)
    else:
        raise ValueError('Unknown activation')

    # Activation function after the output scores (phi_2 in the paper)
    response_activation = getattr(self.params, 'response_activation', 'none')
    if isinstance(response_activation, tuple):
        response_activation, act_param = response_activation

    if response_activation == 'none':
        self.response_activation = lambda x: x
    elif response_activation == 'relu':
        self.response_activation = layers.relu
    elif response_activation == 'elu':
        self.response_activation = layers.elu
    elif response_activation == 'mlu':
        self.response_activation = lambda x: layers.elu(
            layers.leaky_relu(x, 1 / act_param), act_param)
    else:
        raise ValueError('Unknown activation')
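# Illustrative sketch (not part of the tracker code above): the 'mlu' branch
# composes a leaky ReLU with an ELU, i.e. mlu(x) = elu(leaky_relu(x, 1/a), a).
# The numpy helpers below are hypothetical names used only for this sketch.
import numpy as np

def np_leaky_relu(x, negative_slope):
    return np.where(x >= 0, x, negative_slope * x)

def np_elu(x, alpha):
    return np.where(x >= 0, x, alpha * (np.exp(x) - 1))

def np_mlu(x, act_param):
    # Same composition as the 'mlu' case above, expressed in numpy.
    return np_elu(np_leaky_relu(x, 1.0 / act_param), act_param)

if __name__ == '__main__':
    x = np.linspace(-3, 3, 7)
    print(np_mlu(x, act_param=0.05))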
def func(self, place):
    shape = [2, 3, 7, 9]
    eps = 1e-6
    alpha = 1.1
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.elu(x, alpha=alpha)
    x_arr = np.random.uniform(-1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check(
        [x], y, x_init=x_arr, place=place, eps=eps)
def func(self, place):
    shape = [2, 4, 4, 4]
    eps = 1e-6
    alpha = 0.2
    dtype = np.float64
    SEED = 0

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.elu(x, alpha=alpha)
    np.random.seed(SEED)  # seed numpy so x_arr is reproducible
    x_arr = np.random.uniform(-1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check(
        [x], y, x_init=x_arr, place=place, eps=eps)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
    gradient_checker.double_grad_check_for_dygraph(
        self.elu_wrapper, [x], y, x_init=x_arr, place=place)
    fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
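# Illustrative sketch (an assumption, not part of the Paddle test suite): what
# the gradient checks above verify numerically. For ELU, the derivative is
# alpha * exp(x) for x < 0 and 1 for x > 0; the second derivative is
# alpha * exp(x) for x < 0 and 0 for x > 0. `elu_np` and `finite_diff` are
# hypothetical helper names.
import numpy as np

def elu_np(x, alpha):
    return np.where(x >= 0, x, alpha * (np.exp(x) - 1))

def finite_diff(f, x, eps=1e-6):
    # Central finite difference, mirroring the eps used by gradient_checker.
    return (f(x + eps) - f(x - eps)) / (2 * eps)

alpha = 0.2
x = np.random.uniform(-1, 1, size=100)
x = x[np.abs(x) > 1e-3]  # stay away from x = 0, where alpha != 1 makes f' discontinuous

grad_numeric = finite_diff(lambda t: elu_np(t, alpha), x)
grad_analytic = np.where(x >= 0, 1.0, alpha * np.exp(x))
assert np.allclose(grad_numeric, grad_analytic, atol=1e-4)

hess_numeric = finite_diff(lambda t: np.where(t >= 0, 1.0, alpha * np.exp(t)), x)
hess_analytic = np.where(x >= 0, 0.0, alpha * np.exp(x))
assert np.allclose(hess_numeric, hess_analytic, atol=1e-4)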
def forward(self, input, adj):
    """Forward network"""
    h = layers.fc(input, size=self.out_features, num_flatten_dims=2)
    _, N, _ = h.shape

    # Attention logits: e_ij = leaky_relu(a1·h_i + a2·h_j), built by broadcasting.
    middle_result1 = layers.expand(
        layers.matmul(h, self.a1), expand_times=(1, 1, N))
    middle_result2 = layers.transpose(
        layers.expand(layers.matmul(h, self.a2), expand_times=(1, 1, N)),
        perm=[0, 2, 1])
    e = layers.leaky_relu(middle_result1 + middle_result2, self.alpha)

    # Mask out non-edges before the softmax.
    adj = layers.cast(adj, dtype='int32')
    attention = nn.mask_fill(e, adj == 0.0, -1e9)
    attention = layers.softmax(attention, axis=2)
    attention = layers.dropout(attention, self.dropout)

    h_prime = layers.matmul(attention, h)
    if self.concat:
        return layers.elu(h_prime)
    else:
        return h_prime
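# Illustrative sketch (an assumption, not the implementation above): the same
# masked-attention pattern in numpy for a single graph without a batch
# dimension. `gat_attention` and its arguments are hypothetical names.
import numpy as np

def softmax(z, axis=-1):
    z = z - z.max(axis=axis, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)

def gat_attention(h, a1, a2, adj, alpha=0.2):
    # h: [N, F] projected node features, a1/a2: [F, 1], adj: [N, N] 0/1 matrix.
    e = h @ a1 + (h @ a2).T            # e[i, j] = a1·h_i + a2·h_j
    e = np.where(e > 0, e, alpha * e)  # leaky ReLU
    e = np.where(adj == 0, -1e9, e)    # mask non-edges so softmax ignores them
    att = softmax(e, axis=1)
    return att @ h                     # aggregate neighbour features

N, F = 5, 8
h = np.random.randn(N, F)
a1, a2 = np.random.randn(F, 1), np.random.randn(F, 1)
adj = (np.random.rand(N, N) < 0.4).astype(np.float64)
np.fill_diagonal(adj, 1)
print(gat_attention(h, a1, a2, adj).shape)  # (5, 8)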
def graph_linformer(gw,
                    feature,
                    edge_feature,
                    hidden_size,
                    name,
                    num_heads=4,
                    attn_drop=False,
                    concat=True,
                    skip_feat=True,
                    gate=False,
                    norm=True,
                    relu=True,
                    k_hop=2,
                    is_test=False):
    """Implementation of the graph transformer from UniMP.

    This is an implementation of the paper Unified Message Passing Model for
    Semi-Supervised Classification (https://arxiv.org/abs/2009.03509).

    Args:
        gw: Graph wrapper object (:code:`StaticGraphWrapper` or :code:`GraphWrapper`)
        feature: A tensor with shape (num_nodes, feature_size).
        edge_feature: A tensor with shape (num_edges, feature_size).
        hidden_size: The hidden size for graph transformer.
        name: Graph transformer layer name.
        num_heads: The head number in graph transformer.
        attn_drop: Dropout rate for attention.
        concat: Reshape the output (num_nodes, num_heads, hidden_size) by concat
            (num_nodes, hidden_size * num_heads) or mean (num_nodes, hidden_size).
        skip_feat: Whether to use a skip connection.
        gate: Whether to combine skip_feat and the output with a gate weight.
        norm: Whether to apply layer_norm to the output.
        relu: Whether to apply relu activation to the output.
        k_hop: Number of propagation hops for the aggregated keys and values.
        is_test: Whether in test phase.

    Return:
        A tensor with shape (num_nodes, hidden_size * num_heads)
        or (num_nodes, hidden_size).
    """

    def send_attention(src_feat, dst_feat, edge_feat):
        if edge_feat is None or not edge_feat:
            k_h = L.elu(
                L.reshape(src_feat["k_h"],
                          [-1, num_heads, hidden_size, 1])) + 1
            v_h = dst_feat["v_h"]
        else:
            edge_feat = edge_feat["edge"]
            edge_feat = L.reshape(edge_feat, [-1, num_heads, hidden_size])
            k_h = L.elu(src_feat["k_h"] + edge_feat) + 1
            v_h = dst_feat["v_h"] + edge_feat
            k_h = L.reshape(k_h, [-1, num_heads, hidden_size, 1])
        v_h = L.reshape(v_h, [-1, num_heads, hidden_size, 1])
        sum_kTv = L.matmul(k_h, v_h, transpose_y=True)
        sum_k = L.reshape(k_h, [-1, num_heads * hidden_size])
        sum_kTv = L.reshape(sum_kTv,
                            [-1, num_heads * hidden_size * hidden_size])
        return {"sum_k": sum_k, "sum_kTv": sum_kTv}

    def send_copy(src_feat, dst_feat, edge_feat):
        return src_feat

    def reduce_sum(msg):
        return L.sequence_pool(msg, "sum")

    q = L.elu(
        linear(feature, hidden_size * num_heads,
               name=name + '_q_weight', init_type='gcn')) + 1
    k = linear(feature, hidden_size * num_heads,
               name=name + '_k_weight', init_type='gcn')
    v = linear(feature, hidden_size * num_heads,
               name=name + '_v_weight', init_type='gcn')

    reshape_q = L.reshape(q, [-1, num_heads, 1, hidden_size])
    reshape_k = L.reshape(k, [-1, num_heads, hidden_size])
    reshape_v = L.reshape(v, [-1, num_heads, hidden_size])

    msg = gw.send(send_attention,
                  nfeat_list=[("k_h", reshape_k), ("v_h", reshape_v)],
                  efeat_list=[('edge', edge_feature)])
    sum_k = gw.recv(msg["sum_k"], reduce_sum)
    sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum)

    for i in range(1, k_hop):
        msg = gw.send(send_copy,
                      nfeat_list=[("sum_k", sum_k), ("sum_kTv", sum_kTv)])
        sum_k = gw.recv(msg["sum_k"], reduce_sum)
        sum_kTv = gw.recv(msg["sum_kTv"], reduce_sum)

    # sum_k: [-1, num_heads * hidden_size]
    # sum_kTv: [-1, num_heads * hidden_size * hidden_size]
    sum_k = L.reshape(sum_k, [-1, num_heads, 1, hidden_size])
    sum_kTv = L.reshape(sum_kTv, [-1, num_heads, hidden_size, hidden_size])
    out_feat = L.reshape(L.matmul(reshape_q, sum_kTv),
                         [-1, num_heads, hidden_size]) / L.reduce_sum(
                             reshape_q * sum_k, -1)

    if concat:
        out_feat = L.reshape(out_feat, [-1, num_heads * hidden_size])
    else:
        out_feat = L.reduce_mean(out_feat, dim=1)

    if skip_feat:
        if concat:
            skip_feature = linear(feature, hidden_size * num_heads,
                                  name=name + '_skip_weight', init_type='lin')
        else:
            skip_feature = linear(feature, hidden_size,
                                  name=name + '_skip_weight', init_type='lin')

        if gate:
            temp_output = L.concat(
                [skip_feature, out_feat, out_feat - skip_feature], axis=-1)
            gate_f = L.sigmoid(
                linear(temp_output, 1,
                       name=name + '_gate_weight', init_type='lin'))
            out_feat = skip_feature * gate_f + out_feat * (1 - gate_f)
        else:
            out_feat = skip_feature + out_feat

    if norm:
        out_feat = layer_norm(out_feat, name="ln_%s" % name)
    if relu:
        out_feat = L.relu(out_feat)

    return out_feat
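# Illustrative sketch (an assumption, not the PGL API): the kernel-based linear
# attention pattern that the elu(x) + 1 transform and the sum_k / sum_kTv
# messages above rely on. With phi(x) = elu(x) + 1, each destination node only
# needs Σ_j phi(k_j) and Σ_j phi(k_j) v_j^T, so the cost is linear in the number
# of edges. `linear_attention` and `quadratic_attention` are hypothetical names.
import numpy as np

def phi(x):
    # elu(x) + 1, a strictly positive feature map
    return np.where(x >= 0, x, np.exp(x) - 1) + 1

def linear_attention(q, k, v):
    # q, k, v: [N, d] single-head projections for one neighbourhood.
    qp, kp = phi(q), phi(k)
    sum_k = kp.sum(axis=0)               # [d], Σ_j phi(k_j)
    sum_kTv = kp.T @ v                   # [d, d], Σ_j phi(k_j) v_j^T
    return (qp @ sum_kTv) / (qp @ sum_k)[:, None]

def quadratic_attention(q, k, v):
    # Reference O(N^2) attention with the same kernel, for comparison.
    w = phi(q) @ phi(k).T
    w = w / w.sum(axis=1, keepdims=True)
    return w @ v

N, d = 6, 4
q, k, v = (np.random.randn(N, d) for _ in range(3))
assert np.allclose(linear_attention(q, k, v), quadratic_attention(q, k, v))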