def call(self, x):
    def hw_flatten(x):
        return K.reshape(x, shape=[K.shape(x)[0],
                                   K.shape(x)[1] * K.shape(x)[2],
                                   K.shape(x)[3]])

    f = K.conv2d(x, kernel=self.kernel_f, strides=(1, 1), padding='same')  # [bs, h, w, c']
    g = K.conv2d(x, kernel=self.kernel_g, strides=(1, 1), padding='same')  # [bs, h, w, c']
    h = K.conv2d(x, kernel=self.kernel_h, strides=(1, 1), padding='same')  # [bs, h, w, c]

    s = K.batch_dot(hw_flatten(g),
                    K.permute_dimensions(hw_flatten(f), (0, 2, 1)))  # [bs, N, N]
    beta = K.softmax(s, axis=-1)  # attention map
    o = K.batch_dot(beta, hw_flatten(h))  # [bs, N, C]
    o = K.reshape(o, shape=K.shape(x))  # [bs, h, w, C]
    x = self.gamma * o + x
    return x

def MultiHeadsAttModel(l=8 * 8, d=512, dv=64, dout=512, nv=8):
    v1 = tf.keras.layers.Input(shape=(l, d))
    q1 = tf.keras.layers.Input(shape=(l, d))
    k1 = tf.keras.layers.Input(shape=(l, d))

    v2 = tf.keras.layers.Dense(d, activation="relu")(v1)
    q2 = tf.keras.layers.Dense(d, activation="relu")(q1)
    k2 = tf.keras.layers.Dense(d, activation="relu")(k1)

    v = tf.keras.layers.Reshape([l, nv, dv])(v2)
    q = tf.keras.layers.Reshape([l, nv, dv])(q2)
    k = tf.keras.layers.Reshape([l, nv, dv])(k2)

    att = tf.keras.layers.Lambda(
        lambda x: K.batch_dot(x[0], x[1], axes=[-1, -1]) / np.sqrt(dv),
        output_shape=(l, nv, nv))([q, k])
    att = tf.keras.layers.Lambda(
        lambda x: K.softmax(x), output_shape=(l, nv, nv))(att)

    out = tf.keras.layers.Lambda(
        lambda x: K.batch_dot(x[0], x[1], axes=[4, 3]),
        output_shape=(l, nv, dv))([att, v])
    out = tf.keras.layers.Reshape([l, d])(out)
    out = tf.keras.layers.Add()([out, q1])
    out = tf.keras.layers.Dense(dout, activation="relu")(out)
    return tf.keras.models.Model(inputs=[q1, k1, v1], outputs=out)

def call(self, x):
    f = K.conv2d(x, kernel=self.kernel_f, strides=(1, 1), padding='same')  # [bs, h, w, c']
    g = K.conv2d(x, kernel=self.kernel_g, strides=(1, 1), padding='same')  # [bs, h, w, c']
    h = K.conv2d(x, kernel=self.kernel_h, strides=(1, 1), padding='same')  # [bs, h, w, c']

    f_ = K.permute_dimensions(self._hw_flatten(f), (0, 2, 1))  # [bs, 3c', N]
    s = K.batch_dot(self._hw_flatten(g), f_)  # [bs, N, N]
    beta = K.softmax(s, axis=-1)  # attention map

    double_attn = K.batch_dot(f_, self._hw_flatten(x))  # [bs, 3c', 3c]
    double_attn = K.softmax(double_attn, axis=1)

    h_tmp, shape_tmp = self._hw_flatten(h, return_shape=True)  # [bs, N, 3c']
    o_tmp = K.batch_dot(beta, h_tmp)  # [bs, N, 3c']
    o = K.batch_dot(o_tmp, double_attn)  # [bs, N, 3c]
    o = self._hw_recover(o, shape_tmp)  # [bs, h, w, C]

    x = self.gamma * o + x
    return x

def call(self, inputs, mask=None, **kwargs):
    if len(inputs) == 4:
        query, key, value, prev = inputs
        mask = mask[1]
    else:
        query = key = value = inputs[0]
        prev = inputs[1]
        mask = mask[0]

    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    new_prev = e = e + prev
    if self.history_only:
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        indices = K.expand_dims(K.arange(0, key_len), axis=0)
        upper = K.expand_dims(K.arange(0, query_len), axis=-1)
        e -= 10000.0 * K.expand_dims(K.cast(indices > upper, K.floatx()), axis=0)
    if mask is not None:
        e -= 10000.0 * (1.0 - K.cast(K.expand_dims(mask, axis=-2), K.floatx()))

    self.intensity = e
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    self.attention = e / K.sum(e, axis=-1, keepdims=True)
    v = K.batch_dot(self.attention, value)

    output = [v, new_prev]
    if self.return_attention:
        output.append(self.attention)
    return output

def call(self,
         inputs: tensorflow.Tensor,
         mask: Optional[tensorflow.Tensor] = None,
         **kwargs) -> tensorflow.Tensor:
    if isinstance(inputs, list):
        query, key, value = inputs
    else:
        query = key = value = inputs
    if isinstance(mask, list):
        mask = mask[1]

    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if self.history_only:
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        indices = K.tile(K.expand_dims(K.arange(key_len), axis=0), [query_len, 1])
        upper = K.expand_dims(K.arange(key_len), axis=-1)
        e *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)
    if mask is not None:
        e *= K.cast(K.expand_dims(mask, axis=-2), K.floatx())

    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    if self.return_attention:
        return [v, a]
    return v

def call(self, inputs, masks, n_head):
    q, k, v = inputs
    q = self.reshape_to_attention_shape(q, n_head)
    k = self.reshape_to_attention_shape(k, n_head)
    v = self.reshape_to_attention_shape(v, n_head)

    # every mask is the same
    mask = masks[0]
    emb_dim = K.shape(q)[-1]

    # [N * n_head, max_len, max_len]
    scores = K.batch_dot(q, k, axes=2) / K.sqrt(K.cast(emb_dim, K.floatx()))

    # softmax, step 1
    scores = K.exp(scores - K.max(scores, axis=-1, keepdims=True))
    if mask is not None:
        mask = self.reshape_mask(mask, n_head)
        # [N * n_head, max_len, max_len] * [N * n_head, 1, max_len]
        scores *= mask
    # softmax, step 2
    scores /= (K.sum(scores, axis=-1, keepdims=True) + K.epsilon())

    # [N * n_head, max_len, emb_dim]
    y = K.batch_dot(scores, v)
    return y

def symmetric_cross_entropy(y_actual, y_pred, A=-6, alpha=0.1, beta=1):
    '''Define the symmetric cross entropy that will be used for training.'''
    q = K.one_hot(K.cast(y_actual, 'uint8'), 10)  # 200 or 10 classes
    custom_loss = (-alpha * K.mean(K.batch_dot(q, K.maximum(K.log(y_pred + 1e-15), A)))
                   - beta * K.mean(K.batch_dot(K.maximum(K.log(q + 1e-15), A), y_pred)))
    return custom_loss

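# A minimal eager-mode smoke test for symmetric_cross_entropy (illustrative values only;
# assumes TensorFlow 2.x with `from tensorflow.keras import backend as K` in scope):
import numpy as np
import tensorflow as tf

y_true = tf.constant([3, 1, 7, 0], dtype=tf.float32)  # integer class labels
y_pred = tf.constant(np.random.dirichlet(np.ones(10), size=4),
                     dtype=tf.float32)                 # rows sum to 1
loss_value = symmetric_cross_entropy(y_true, y_pred)
print(float(loss_value))
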
def pam(x):
    gamma = K.variable(np.array([0]), dtype='float32', name='gamma')
    # channel = 2048
    # spatial_size = height = width = 7
    batch, height, width, channel = x.get_shape().as_list()
    assert height == width, "height and width not equal."

    proj_query = Conv2D(height, 1, padding='same', strides=1)(x)
    proj_query = Reshape((height * width, height))(proj_query)
    proj_query = K.permute_dimensions(proj_query, (0, 2, 1))

    proj_key = Conv2D(height, 1, padding='same', strides=1)(x)
    proj_key = Reshape((height * width, height))(proj_key)

    proj_value = Conv2D(channel, 1, padding='same', strides=1)(x)
    proj_value = Reshape((height * width, channel))(proj_value)

    energy = K.batch_dot(proj_key, proj_query)
    attention = K.softmax(energy)
    attention = K.permute_dimensions(attention, (0, 2, 1))

    out = K.batch_dot(attention, proj_value)
    out = Reshape((height, width, channel))(out)
    # out = Add()([Multiply()([gamma, out]), x])
    out = x + gamma * out
    return out

def __call__(self, q, k, v, mask, idx):
    """Applies scaled dot-product attention.

    Args:
        q: Queries
        k: Keys
        v: Values
        mask: Optional mask -- masked positions are pushed to a very large
            negative value before the softmax
        idx: Index used to make the layer names unique

    Returns:
        Tuple of (layer outputs, attention weights)
    """
    temper = tf.sqrt(tf.cast(tf.shape(k)[-1], dtype='float32'))
    attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / temper,
                  name=f"ScaledDotProdAttenLambda{idx}")([q, k])  # shape=(batch, q, k)
    if mask is not None:
        mmask = Lambda(lambda x: (-1e+9) * (1. - K.cast(x, 'float32')),
                       name=f"ScaledDotProdAttenLambdaMask{idx}")(mask)  # push masked scores towards -inf
        attn = Add(name=f'SDPA_ADD_{idx}')([attn, mmask])
    attn = self.activation(attn)
    attn = self.dropout(attn)
    output = Lambda(lambda x: K.batch_dot(x[0], x[1]),
                    name=f"ScaledDotProdAttenOutput{idx}")([attn, v])
    return output, attn

def call(self, inputs):
    if self.share_weights:
        u_hat_vectors = K.conv1d(inputs, self.W)
    else:
        u_hat_vectors = K.local_conv1d(inputs, self.W, [1], [1])

    # u_hat_vectors: the spatially transformed input vectors (via local_conv1d)
    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    u_hat_vectors = K.reshape(u_hat_vectors,
                              (batch_size, input_num_capsule,
                               self.num_capsule, self.dim_capsule))
    u_hat_vectors = K.permute_dimensions(u_hat_vectors, (0, 2, 1, 3))
    routing_weights = K.zeros_like(u_hat_vectors[:, :, :, 0])

    for i in range(self.routings):
        capsule_weights = K.softmax(routing_weights, 1)
        outputs = K.batch_dot(capsule_weights, u_hat_vectors, [2, 2])
        if K.ndim(outputs) == 4:
            outputs = K.sum(outputs, axis=1)
        if i < self.routings - 1:
            outputs = K.l2_normalize(outputs, -1)
            routing_weights = K.batch_dot(outputs, u_hat_vectors, [2, 3])
            if K.ndim(routing_weights) == 4:
                routing_weights = K.sum(routing_weights, axis=1)

    return self.activation(outputs)

def call(self, x, training=False):
    fea_map, fea_vec = self.backbone(x, training=training)
    if self.region_attn:
        cls_fea_map_ori = call_layers(self.conv_bn_relu_list[1], fea_vec, training)
        cls_fea_map, HxW = flatten_hw(cls_fea_map_ori)

        attr_fea_map_i = call_layers(self.conv_bn_relu_list[0], fea_vec, training)
        attr_pool_i = call_layers(self.pool_bn_relu_dropout, attr_fea_map_i, training)
        attr_pool_i = tf.expand_dims(attr_pool_i, -1)  # (n, hidden_dim, 1)

        # TODO: `fea_map` -> `cls_fea_map`
        fea_map_ = K.permute_dimensions(cls_fea_map, (0, 2, 1))  # (n, hidden_dim, HxW)
        # fea_map_, HxW = flatten_hw(fea_vec)  # (n, HxW, fea_dim)
        # fea_map_ = K.permute_dimensions(fea_map_, (0, 2, 1))  # (n, fea_dim, HxW)

        attn_cls = K.batch_dot(cls_fea_map, attr_pool_i)  # (n, HxW, 1)
        region_attn_map = self.sigmoid(attn_cls)
        region_attn_map /= tf.cast(HxW, tf.float32)

        region_fea = K.batch_dot(fea_map_, region_attn_map)  # (n, hidden_dim, 1)
        fea_vec = tf.squeeze(region_fea, -1)  # (n, hidden_dim)
        fea_vec = self.region_bn(fea_vec, training=training)
    else:
        if self.add_linear:
            fea_vec = self.fc(fea_vec)  # (n, embedding_dim)
            fea_vec = tf.nn.relu(fea_vec)
        fea_vec = self.pool(fea_vec)
    return fea_map, fea_vec

def attention_k(q_w_q, k_w_k, v_w_v, mask=None, dropout=None):
    """
    Parameters
    ----------
    q_w_q: projected queries, (batch size, num heads, num tokens in sentence, d_model / d_k), e.g. (5, 2, 4, 6)
    k_w_k: projected keys, same layout as q_w_q
    v_w_v: projected values, same layout as q_w_q
    mask: (5, 1, 1, 4)
    dropout: dropout layer, not a dropout rate

    Returns
    -------
    Tuple of (attention output, attention weights)
    """
    def masked_fill(x, mask, target_mask_val, filled_value=-1e9):
        return x * (x != target_mask_val) + (mask == target_mask_val) * filled_value

    d_k = q_w_q.shape.as_list()[-1]
    scores = K.batch_dot(q_w_q, k_w_k, axes=[3, 3]) / math.sqrt(d_k)  # (5, 2, 4, 4)
    if mask is not None:
        scores = masked_fill(scores, mask, 0, -1e9)
    p_attn = K.softmax(scores)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return K.batch_dot(p_attn, v_w_v, axes=[3, 2]), p_attn

def call(self, x):
    # print("in call!")
    # print("x =", x)
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x

    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    # print("Q_seq1 =", Q_seq)
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    # print("Q_seq2 =", Q_seq)
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    # print("\n\n\n\n", O_seq)
    return O_seq

def call(self, u_vecs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        c = K.softmax(b)
        c = K.permute_dimensions(c, (0, 2, 1))
        b = K.permute_dimensions(b, (0, 2, 1))
        outputs = self.activation(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])

    return outputs

def _control_circuit(self, psi, action):
    """
    Args:
        psi (Tensor([batch_size, N], c64)): batch of states
        action (dict,
            'alpha' : Tensor([batch_size, 2], tf.float32),
            'beta'  : Tensor([batch_size, 2], tf.float32),
            'phi'   : Tensor([batch_size, 1], tf.float32),
            'theta' : Tensor([batch_size, 1], tf.float32))

    Returns:
        see parent class docs
    """
    # extract parameters
    alpha = hf.vec_to_complex(action['alpha'])
    beta = hf.vec_to_complex(action['beta'])
    phi = action['phi']
    Rotation = self.rotate(action['theta'])

    Kraus = {}
    T = {'a': self.translate(alpha), 'b': self.translate(beta / 2.0)}
    Kraus[0] = 1 / 2 * (tf.linalg.adjoint(T['b']) + self.phase(phi) * T['b'])
    Kraus[1] = 1 / 2 * (tf.linalg.adjoint(T['b']) - self.phase(phi) * T['b'])

    psi = self.simulate(psi, self.t_feedback)
    psi = batch_dot(T['a'], psi)
    psi_cached = batch_dot(Rotation, psi)
    psi = self.simulate(psi_cached, self.t_round + self.t_idle)
    psi_final, msmt = measurement(psi, Kraus)

    return psi_final, psi_cached, msmt

def call(self, inputs, training=None):
    input_shape = K.int_shape(inputs)
    reduction_axes = list(range(0, len(input_shape)))
    if self.axis is not None:
        del reduction_axes[self.axis]
    del reduction_axes[0]

    # Put axis last
    inputs = K.permute_dimensions(inputs, tuple([0] + reduction_axes + [self.axis]))

    # Collapse all other dims into dim 1
    cinp = K.reshape(inputs, (K.shape(inputs)[0], -1, input_shape[self.axis]))
    n_reduced = K.shape(cinp)[1]

    # Calculate dot product (scaled Gram matrix)
    pure_gram = K.batch_dot(cinp, cinp, 1)
    scaled_gram = pure_gram / K.cast(2 * n_reduced * input_shape[self.axis], 'float32')
    return scaled_gram
    # return K.sqrt(scaled_gram)

    # Calculate covariance (note: unreachable after the early return above)
    means = K.mean(cinp, [1], keepdims=True)
    mean_mat = K.batch_dot(means, means, 1)
    cov = scaled_gram - mean_mat
    return cov

def scaled_dot_product_attention(inputs,
                                 mask=None,
                                 return_attention=False,
                                 history_only=False):
    query, key, value, query_group_ids, key_group_ids = inputs
    if isinstance(mask, list):
        mask = mask[1]
    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))

    # Only allow attention within the same group.
    group_mask = tf.equal(query_group_ids[:, :, None], key_group_ids[:, None, :])
    e -= (1.0 - tf.cast(group_mask, tf.float32)) * 1e9

    if history_only:
        query_len, key_len = K.shape(query)[1], K.shape(key)[1]
        ones = tf.ones((query_len, key_len))
        e -= (ones - tf.linalg.band_part(ones, -1, 0)) * 1e9
    if mask is not None:
        e -= (1.0 - K.cast(K.expand_dims(mask, axis=-2), K.floatx())) * 1e9

    a = tf.keras.activations.softmax(e)
    v = K.batch_dot(a, value, axes=[2, 1])
    if return_attention:
        return [v, a]
    return v

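# A minimal eager-mode smoke test for scaled_dot_product_attention above (shapes and group
# ids are made up for illustration; assumes TF 2.x and the usual `K` backend alias):
import tensorflow as tf

query = tf.random.normal((2, 5, 8))
key = tf.random.normal((2, 5, 8))
value = tf.random.normal((2, 5, 8))
# Positions may only attend to positions that share their group id.
query_group_ids = tf.constant([[0, 0, 1, 1, 1], [0, 1, 1, 2, 2]])
key_group_ids = query_group_ids

out, attn = scaled_dot_product_attention(
    [query, key, value, query_group_ids, key_group_ids], return_attention=True)
print(out.shape, attn.shape)  # expected (2, 5, 8) and (2, 5, 5)
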
def call(self, inputs):
    if self._masking:
        assert len(inputs) == 4, "inputs should be a list [queries, keys, values, masks]."
        queries, keys, values, masks = inputs
    else:
        assert len(inputs) == 3, "inputs should be a list [queries, keys, values]."
        queries, keys, values = inputs

    if K.dtype(queries) != 'float32':
        queries = K.cast(queries, 'float32')
    if K.dtype(keys) != 'float32':
        keys = K.cast(keys, 'float32')
    if K.dtype(values) != 'float32':
        values = K.cast(values, 'float32')

    matmul = K.batch_dot(queries, tf.transpose(keys, [0, 2, 1]))  # MatMul
    scaled_matmul = matmul / int(queries.shape[-1]) ** 0.5        # Scale
    if self._masking:
        scaled_matmul = self.mask(scaled_matmul, masks)           # Mask (opt.)
    if self._future:
        scaled_matmul = self.future_mask(scaled_matmul)

    softmax_out = K.softmax(scaled_matmul)                        # SoftMax
    out = K.dropout(softmax_out, self._dropout_rate)              # Dropout
    outputs = K.batch_dot(out, values)
    return outputs

def attention(x_inner, x_outer, n_factor, dropout):
    x_Q = L.Conv1D(n_factor, 1,
                   activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_inner)
    x_K = L.Conv1D(n_factor, 1,
                   activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_outer)
    x_V = L.Conv1D(n_factor, 1,
                   activation='linear',
                   kernel_initializer='glorot_uniform',
                   bias_initializer='glorot_uniform')(x_outer)
    x_KT = L.Permute((2, 1))(x_K)
    res = L.Lambda(lambda c: K.batch_dot(c[0], c[1]) / np.sqrt(n_factor))([x_Q, x_KT])
    # res = tf.expand_dims(res, axis=3)
    # res = L.Conv2D(16, 3, 1, padding="same", activation="relu")(res)
    # res = L.Conv2D(1, 3, 1, padding="same", activation="relu")(res)
    # res = tf.squeeze(res, axis=3)
    att = L.Lambda(lambda c: K.softmax(c, axis=-1))(res)
    att = L.Lambda(lambda c: K.batch_dot(c[0], c[1]))([att, x_V])
    return att

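# A minimal usage sketch for the attention() block above (hypothetical sequence length and
# feature size; assumes the aliases used in the snippet: `import tensorflow as tf`,
# `from tensorflow.keras import layers as L`, `from tensorflow.keras import backend as K`
# and `import numpy as np`). Note that `dropout` is accepted but not used by attention().
inp = L.Input(shape=(107, 128))
self_att = attention(inp, inp, n_factor=64, dropout=0.1)  # (None, 107, 64)
model = tf.keras.Model(inp, self_att)
model.summary()
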
def call(self, u_ves):
    print(self.W_kernel.shape)
    print("*****", u_ves.shape)
    u_ves = tf.transpose(u_ves, perm=[0, 2, 1])
    print("*****", u_ves.shape)
    u_hat_vecs = K.conv1d(u_ves, self.W_kernel)
    print("*****", u_hat_vecs.shape)

    batch_size = tf.shape(u_ves)[0]
    input_num_capsule = tf.shape(u_ves)[1]
    u_hat_vecs = tf.reshape(u_hat_vecs,
                            (batch_size, input_num_capsule,
                             self.out_num_capsule, self.out_dim_capusle))
    # final shape = [None, out_num_capsule, input_num_capsule, out_dim_capsule]
    u_hat_vecs = tf.transpose(u_hat_vecs, perm=[0, 2, 1, 3])

    # Dynamic routing
    b = tf.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, out_num_capsule, input_num_capsule]
    for i in range(self.routings):
        c = softmax(b, 1)
        output = K.batch_dot(c, u_hat_vecs, [2, 2])
        output = self.activation(output)
        if i < self.routings - 1:
            # o = tf.nn.l2_normalize(o, -1)
            b = b + K.batch_dot(output, u_hat_vecs, [2, 3])

    pose = output
    print("pose is:", pose.shape)
    return pose

def call(self, inputs, **kwargs):
    """Following the routing algorithm from Hinton's paper, but replace
    b = b + <u, v> with b = <u, v>.

    This change can improve the feature representation of the capsule.

    However, you can replace
        b = K.batch_dot(outputs, hat_inputs, [2, 3])
    with
        b += K.batch_dot(outputs, hat_inputs, [2, 3])
    to get standard routing.
    """
    if self.share_weights:
        hat_inputs = K.conv1d(inputs, self.kernel)
    else:
        hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

    batch_size = K.shape(inputs)[0]
    input_num_capsule = K.shape(inputs)[1]
    hat_inputs = K.reshape(hat_inputs,
                           (batch_size, input_num_capsule,
                            self.num_capsule, self.dim_capsule))
    hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

    b = K.zeros_like(hat_inputs[:, :, :, 0])
    print(self.routings)
    for i in range(self.routings):
        c = K.softmax(b, 1)
        o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
        if i < self.routings - 1:
            b = K.batch_dot(o, hat_inputs, [2, 3])
            if K.backend() == 'theano':
                o = K.sum(o, axis=1)

    return o

def cfam_module(input, classes=6, channel=128, channel1=64):
    input_shape = input.get_shape().as_list()
    _, H, W, _ = input_shape
    N = classes
    C = channel
    C1 = channel1

    x = Conv2D(C, 3, padding='same', use_bias=False)(input)

    x1 = Conv2D(C1, 1, padding='same', use_bias=False)(x)
    x1 = tf.transpose(K.reshape(x1, (-1, H * W, C1)), (0, 2, 1))

    p = Conv2D(N, 1, padding='same', use_bias=False)(x)
    p1 = Activation('softmax')(p)
    p1 = K.reshape(p1, (-1, H * W, N))

    A = K.batch_dot(x1, p1)
    A = Activation('softmax')(A)

    p1 = tf.transpose(p1, (0, 2, 1))
    x2 = K.batch_dot(A, p1)
    x2 = K.reshape(tf.transpose(x2, (0, 2, 1)), (-1, H, W, C1))
    x2 = Conv2D(C, (1, 1), padding='same', use_bias=False)(x2)
    x2 = BatchNormalization(epsilon=1e-3)(x2)
    x2 = Activation('relu')(x2)

    x3 = Concatenate()([x2, x])
    y = Conv2D(C, (1, 1), padding='same', use_bias=False)(x3)
    y = BatchNormalization(epsilon=1e-3)(y)
    y = Activation('relu')(y)
    return y

def call(self, x, **kwargs):
    assert isinstance(x, list)
    inp_a, inp_b = x
    outp_a = K.l2_normalize(inp_a, -1)
    outp_b = K.l2_normalize(inp_b, -1)

    alpha = K.batch_dot(outp_b, outp_a, axes=[2, 2])
    alpha = K.l2_normalize(alpha, 1)
    alpha = K.one_hot(K.argmax(alpha, 1), K.int_shape(inp_a)[1])
    hmax = K.batch_dot(alpha, outp_b, axes=[1, 1])
    kcon = K.eye(K.int_shape(inp_a)[1], dtype='float32')

    m = []
    for i in range(self.output_dim):
        outp_a = inp_a * self.W[i]
        outp_hmax = hmax * self.W[i]
        outp_a = K.l2_normalize(outp_a, -1)
        outp_hmax = K.l2_normalize(outp_hmax, -1)
        outp = K.batch_dot(outp_hmax, outp_a, axes=[2, 2])
        outp = K.sum(outp * kcon, -1, keepdims=True)
        m.append(outp)

    if self.output_dim > 1:
        persp = K.concatenate(m, 2)
    else:
        persp = m[0]
    return [persp, persp]

def call(self, x):
    # If only Q_seq, K_seq and V_seq are passed in, no mask is applied.
    # If Q_seq, K_seq, V_seq, Q_len and V_len are all passed in, the padded part is masked.
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x

    # Linear projections of Q, K and V
    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.head_dim))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.head_dim))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.head_dim))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Scaled dot product, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.head_dim ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Compute the output, then mask it
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq

def mpgm_loss(target, prediction, l_A=1., l_E=1., l_F=1.):
    """
    Loss function using max-pooling graph matching as described in the GraphVAE paper.
    Let's see if backprop works.
    Args: obviously the same as above!
    """
    A, E, F = target
    A_hat, E_hat, F_hat = prediction
    n = A.shape[1]
    k = A_hat.shape[1]
    mpgm = MPGM()
    X = tf.cast(mpgm.call(A, A_hat, E, E_hat, F, F_hat), dtype=tf.float64)

    # now comes the loss part from the paper:
    A_t = tf.transpose(X, perm=[0, 2, 1]) @ A @ X  # shape (bs, k, n)
    E_hat_t = tf.transpose(batch_dot(batch_dot(X, E_hat, axes=(-1, 1)), X, axes=(-2, 1)),
                           perm=[0, 1, 3, 2])
    F_hat_t = tf.matmul(X, F_hat)

    # To avoid inf or nan errors we add the smallest possible value to all elements.
    A_hat_4log = add_e7(A_hat)

    term_1 = (1 / k) * tf.math.reduce_sum(
        diag_part(A_t) * tf.math.log(diag_part(A_hat_4log)), [1], keepdims=True)
    term_2 = tf.reduce_sum(
        (tf.ones_like(diag_part(A_t)) - diag_part(A_t)) *
        (tf.ones_like(diag_part(A_hat)) - tf.math.log(diag_part(A_hat_4log))),
        [1], keepdims=True)

    # TODO: unsure whether the factor is (1/(k*(1-k))) or ((1-k)/k). The second sum in the
    # paper is also ambiguous; it is interpreted here as a matrix multiplication followed by
    # a sum over all elements.
    b = diag_part(A_t)
    term_31 = set_diag(A_t, tf.zeros_like(diag_part(A_t))) * set_diag(
        tf.math.log(A_hat_4log), tf.zeros_like(diag_part(A_hat)))
    term_31 = replace_nan(term_31)  # You know why!
    term_32 = tf.ones_like(A_t) - set_diag(A_t, tf.zeros_like(diag_part(A_t))) * tf.math.log(
        tf.ones_like(A_t) - set_diag(A_hat_4log, tf.zeros_like(diag_part(A_hat))))
    term_32 = replace_nan(term_32)
    term_3 = (1 / k * (1 - k)) * tf.expand_dims(
        tf.math.reduce_sum(term_31 + term_32, [1, 2]), -1)
    log_p_A = term_1 + term_2 + term_3

    # Still unclear from the paper: is the log over one or both Fs?
    F = tf.cast(F, dtype=tf.float64)
    A = tf.cast(A, dtype=tf.float64)
    E = tf.cast(E, dtype=tf.float64)
    log_p_F = (1 / n) * tf.math.log(
        tf.expand_dims(tf.math.reduce_sum(add_e7(F * F_hat_t), [1, 2]), -1))
    log_p_E = tf.math.log(
        tf.expand_dims((1 / (tf.norm(A, ord='fro', axis=[-2, -1]) - n)) *
                       tf.math.reduce_sum(add_e7(E * E_hat_t), [1, 2, 3]), -1))

    log_p = -l_A * log_p_A - l_F * log_p_F - l_E * log_p_E
    return log_p

def acf_module(coarse_input, feature_map):
    input_shape = coarse_input.get_shape().as_list()
    _, H, W, N = input_shape
    coarse = tf.transpose(K.reshape(coarse_input, (-1, H * W, N)), (0, 2, 1))

    C = 64
    x = Conv2D(C, (1, 1), padding='same', use_bias=False, activation=None,
               name='feature_map_conv1')(feature_map)
    x = BatchNormalization(name='feature_map_conv1_BN')(x)
    x = Activation(tf.nn.relu)(x)
    x = Dropout(0.1)(x)

    x = K.reshape(x, (-1, H * W, C))
    x = K.batch_dot(coarse, x)
    x = tf.subtract(K.max(x, axis=-1, keepdims=True), x)
    x = tf.nn.softmax(x, axis=-1)
    x = tf.transpose(x, (0, 2, 1))
    x = K.batch_dot(x, coarse)
    x = tf.transpose(x, (0, 2, 1))
    x = K.reshape(x, (-1, H, W, C))

    x = Conv2D(C, (1, 1), padding='same', use_bias=False, activation=None,
               name='feature_map_conv2')(x)
    return x

def call(self, x, mask=None):
    q, k, v = x
    d_k = q.shape.as_list()[2]

    # in pure tensorflow:
    # weights = tf.matmul(x_batch, tf.transpose(y_batch, perm=[0, 2, 1]))
    # normalized_weights = tf.nn.softmax(weights / scaling)
    # output = tf.matmul(normalized_weights, x_batch)

    weights = K.batch_dot(q, k, axes=[2, 2])
    if mask is not None:
        # add mask weights
        if isinstance(mask, (list, tuple)):
            if len(mask) > 1:
                raise ValueError(
                    "mask can only be a Tensor or a list of length 1 containing a tensor.")
            mask = mask[0]
        weights += -1e10 * (1 - mask)
    normalized_weights = K.softmax(weights / np.sqrt(d_k))
    output = K.batch_dot(normalized_weights, v)

    if self._return_attention:
        return [output, normalized_weights]
    else:
        return output

def call(self, x):
    if len(x) == 3:  # unpack Q_seq, K_seq, V_seq
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:  # Q_len and V_len are the lengths used for masking
        Q_seq, K_seq, V_seq, Q_len, V_len = x
    print("Q_seq------------------", Q_seq)

    # Linear projections of Q, K and V: nb_head projections, each to size_per_head dimensions
    Q_seq = K.dot(Q_seq, self.WQ)  # queries
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))  # like a transpose that reorders the axes; shape=(4,)
    K_seq = K.dot(K_seq, self.WK)  # keys
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))  # shape=(4,)
    V_seq = K.dot(V_seq, self.WV)  # values
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    # Compute the dot products, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
    # A is "attention_11/Shape_12:0", shape=(5,) here.
    # The line above is where the error is raised:
    #   ValueError: Dimension must be 5 but is 4 for 'attention_11/transpose_7'
    # In TF1, A had rank 4 (shape=(4,)); in TF2, A comes out with rank 5 (shape=(5,)).
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    # Compute the output and mask it
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq

def call(self, x):
    Q_seq, K_seq, V_seq = x
    Q_len, V_len = None, None
    print("build attention")

    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))

    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, "add")
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)

    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, "mul")
    return O_seq

def attention(self, q, k, v, training=None) -> KTensor:
    ndim = K.cast(K.shape(q)[-1], dtype=K.floatx())
    product = K.batch_dot(q, k, axes=(2, 2))
    weights = K.softmax(product / K.sqrt(ndim))
    if self.regularise:
        self.add_regularisation(weights)
    weights_dropout = ops.apply_dropout(self.dropout, weights, training)
    return K.batch_dot(weights_dropout, v)

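# For reference, a standalone eager-mode sketch of the same scaled dot-product attention on
# 3-D tensors (batch, length, dim), without the layer's regularisation/dropout hooks
# (illustration only, not part of the original class):
import tensorflow as tf
from tensorflow.keras import backend as K

def scaled_dot_attention(q, k, v):
    ndim = K.cast(K.shape(q)[-1], dtype=K.floatx())
    product = K.batch_dot(q, k, axes=(2, 2))      # (batch, len_q, len_k)
    weights = K.softmax(product / K.sqrt(ndim))   # attention weights
    return K.batch_dot(weights, v)                # (batch, len_q, dim)

q = tf.random.normal((2, 5, 8))
k = tf.random.normal((2, 7, 8))
v = tf.random.normal((2, 7, 8))
print(scaled_dot_attention(q, k, v).shape)        # (2, 5, 8)
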