def simple_context(X, mask): """ Simple context calculation layer logic X = (batch_size, time_steps, units) time_steps are nothing but number of words in our case. """ # segregrate heading and desc desc, head = X[:, :parameters.max_len_desc, :], X[:, parameters.max_len_desc:, :] # segregrate activation and context part head_activations, head_words = head[:, :, :parameters.activation_rnn_size], head[:, :, parameters.activation_rnn_size:] desc_activations, desc_words = desc[:, :, :parameters.activation_rnn_size], desc[:, :, parameters.activation_rnn_size:] # p=(bacth_size, length_desc_words, rnn_units) # q=(bacth_size, length_headline_words, rnn_units) # K.dot(p,q) = (bacth_size, length_desc_words,length_headline_words) activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2)) # make sure we dont use description words that are masked out activation_energies = activation_energies + -1e20 * K.expand_dims(1. - K.cast(mask[:, :parameters.max_len_desc], 'float32'), 1) # for every head word compute weights for every desc word activation_energies = K.reshape(activation_energies, (-1, parameters.max_len_desc)) activation_weights = K.softmax(activation_energies) activation_weights = K.reshape(activation_weights, (-1, parameters.max_len_head, parameters.max_len_desc)) # for every head word compute weighted average of desc words desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1)) return K.concatenate((desc_avg_word, head_words))
def pairwise_attention_dot(x1, x2, x1_mask=None, x2_mask=None, return_score=False,
                           scope_name='pairwise_attention_dot', reuse=tf.AUTO_REUSE):
    '''
    :param x1: [N, S1, d]
    :param x2: [N, S2, d]
    :param x1_mask: [N, S1]
    :param x2_mask: [N, S2], 1 as valid position
    :return:
    '''
    with tf.variable_scope(scope_name, reuse=reuse):
        alpha = tf.matmul(x1, tf.transpose(x2, perm=[0, 2, 1]))  # [N, S1, S2]
        alpha1 = alpha
        if x2_mask is not None:
            alpha1 = add_mask(alpha, x2_mask, expand_axis=(1,))
        alpha2 = alpha
        if x1_mask is not None:
            alpha2 = add_mask(alpha, x1_mask, expand_axis=(2,))
        alpha1 = tf.nn.softmax(alpha1, axis=2)
        alpha2 = tf.nn.softmax(alpha2, axis=1)
        x1_att = K.batch_dot(alpha1, x2, axes=[2, 1])
        x2_att = K.batch_dot(alpha2, x1, axes=[1, 1])
        if return_score:
            return x1_att, x2_att, alpha1, alpha2, alpha
        return x1_att, x2_att
def call(self, inputs, mask=None):
    # output = softmax(score)
    k, q = inputs
    if len(q.shape) == 2:
        q = K.expand_dims(q, axis=1)
    # k: (?, K_LEN, EMBED_DIM,)
    # q: (?, Q_LEN, EMBED_DIM,)
    # score: (?, Q_LEN, K_LEN,)
    if self.score_function == 'scaled_dot_product':
        kt = K.permute_dimensions(k, (0, 2, 1))
        qkt = K.batch_dot(q, kt)
        score = qkt / self.EMBED_DIM
    elif self.score_function == 'mlp':
        kq = K.concatenate([k, q], axis=1)
        kqw2 = K.tanh(K.dot(kq, self.W2))
        score = K.permute_dimensions(K.dot(self.W1, kqw2), (1, 0, 2))
    elif self.score_function == 'bi_linear':
        qw = K.dot(q, self.W)
        kt = K.permute_dimensions(k, (0, 2, 1))
        score = K.batch_dot(qw, kt)
    else:
        raise RuntimeError('invalid score_function')
    score = K.softmax(score)
    # if mask is not None:
    #     score *= K.cast(mask[0], K.floatx())
    # output: (?, Q_LEN, EMBED_DIM,)
    output = K.batch_dot(score, k)
    return output
def call(self, u_vecs, **kwargs):
    if self.share_weights:
        u_hat_vecs = K.conv1d(u_vecs, self.W)
    else:
        u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    for i in range(self.routings):
        c = softmax(b, 1)
        o = K.batch_dot(c, u_hat_vecs, [2, 2])
        if K.backend() == 'theano':
            o = K.sum(o, axis=1)
        if i < self.routings - 1:
            o = K.l2_normalize(o, -1)
            b = K.batch_dot(o, u_hat_vecs, [2, 3])
            if K.backend() == 'theano':
                b = K.sum(b, axis=1)

    return self.activation(o)
def call(self, x, mask=None):
    features_dim = self.features_dim
    step_dim = self.step_dim
    # xw = K.reshape(K.dot(x[0], K.reshape(self.W, (features_dim, features_dim))), (-1, features_dim))
    # yavg = K.reshape(K.mean(K.mean(x[1], axis=1, keepdims=True), axis=0, keepdims=True), (features_dim, -1))
    xw1 = K.dot(x[0], K.reshape(self.W1, (features_dim, features_dim)))
    xw2 = K.dot(x[1], K.reshape(self.W2, (features_dim, features_dim)))
    xw1t = K.permute_dimensions(xw1, [0, 2, 1])
    xw2t = K.permute_dimensions(xw2, [0, 2, 1])
    xw11 = K.batch_dot(xw1, xw1t) / (step_dim ** 0.5)
    xw12 = K.batch_dot(xw1, xw2t) / (step_dim ** 0.5)
    s11 = self.ll * K.softmax(xw11)
    s12 = (1 - self.ll) * K.softmax(xw12)
    eij = s11 + s12
    print(eij.get_shape())
    V = x[0] * K.mean(eij, axis=2, keepdims=True)
    if self.get_alpha:
        return eij
    else:
        if self.get_sequence:
            return V
        else:
            return K.sum(V, axis=1)
def call(self, inputs, mask=None, training=None):
    inputs, relatives, memories, bias_context, bias_relative = inputs
    full = K.concatenate([memories, inputs], axis=1)  # (batch, prev_len + seq_len, units)
    w_q = K.dot(inputs, self.kernel_q)                # (batch, seq_len, units)
    w_kv = K.dot(full, self.kernel_kv)                # (batch, prev_len + seq_len, units * 2)
    w_r = K.dot(relatives, self.kernel_r)             # (batch, prev_len + seq_len, units)
    if self.use_bias:
        w_q = K.bias_add(w_q, self.bias_q)
        w_kv = K.bias_add(w_kv, self.bias_kv)
        w_r = K.bias_add(w_r, self.bias_r)
    if self.activation is not None:
        w_q = self.activation(w_q)
        w_kv = self.activation(w_kv)
        w_r = self.activation(w_r)

    w_k = w_kv[:, :, :self.units]                     # (batch, prev_len + seq_len, units)
    w_v = w_kv[:, :, self.units:]                     # (batch, prev_len + seq_len, units)

    w_qc = K.bias_add(w_q, bias_context)
    w_qc = self._reshape_to_batches(w_qc)             # (batch * n_head, seq_len, units_head)
    w_k = self._reshape_to_batches(w_k)               # (batch * n_head, prev_len + seq_len, units_head)
    a_context = K.batch_dot(w_qc, w_k, axes=2)        # (batch * n_head, seq_len, prev_len + seq_len)

    w_qr = K.bias_add(w_q, bias_relative)
    w_qr = self._reshape_to_batches(w_qr)             # (batch * n_head, seq_len, units_head)
    w_r = self._reshape_to_batches(w_r)               # (batch * n_head, prev_len + seq_len, units_head)
    a_relative = K.batch_dot(w_qr, w_r, axes=2)       # (batch * n_head, seq_len, prev_len + seq_len)
    a_relative = self._relative_shift(a_relative)     # (batch * n_head, seq_len, prev_len + seq_len)

    att = (a_context + a_relative) / K.sqrt(K.constant(self.units_head, dtype=K.floatx()))
    exp = K.exp(att - K.max(att, axis=-1, keepdims=True))

    q_len, k_len = K.shape(w_q)[1], K.shape(w_k)[1]
    indices = K.expand_dims(K.arange(0, k_len), axis=0)
    upper = K.expand_dims(K.arange(k_len - q_len, k_len), axis=-1)
    exp *= K.expand_dims(K.cast(indices <= upper, K.floatx()), axis=0)
    if mask is not None and mask[0] is not None:
        mask = K.cast(mask[0], K.floatx())
        mask = K.concatenate([K.ones_like(memories[:, :, 0]), mask], axis=1)
        exp *= K.expand_dims(self._reshape_mask(mask), axis=1)

    att = exp / K.sum(exp, axis=-1, keepdims=True)
    if self.att_drop_layer is not None:
        att = self.att_drop_layer(att, training=training)
    w_v = self._reshape_to_batches(w_v)               # (batch * n_head, prev_len + seq_len, units_head)
    w_o = K.batch_dot(att, w_v)                       # (batch * n_head, seq_len, units_head)

    w_o = self._reshape_from_batches(w_o)             # (batch, seq_len, units)
    w_o = K.dot(w_o, self.kernel_o)                   # (batch, seq_len, units)
    if self.use_bias:
        w_o = K.bias_add(w_o, self.bias_o)
    if self.activation is not None:
        w_o = self.activation(w_o)

    # Add shape information to tensor when using `tf.keras`
    input_shape = K.int_shape(inputs)
    if input_shape[1] is not None:
        w_o = K.reshape(w_o, (-1,) + input_shape[1:])
    return w_o
def __call__(self, q, k, v, mask):
    attn = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 2]) / self.temper)([q, k])
    if mask is not None:
        mmask = Lambda(lambda x: (-1e+10) * (1 - x))(mask)
        attn = Add()([attn, mmask])
    attn = Activation('softmax')(attn)
    attn = self.dropout(attn)
    output = Lambda(lambda x: K.batch_dot(x[0], x[1]))([attn, v])
    return output, attn
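# A minimal standalone sketch (illustrative, not from the source above) of the
# two K.batch_dot calls in scaled dot-product attention: the first contracts
# the feature axes of q and k, the second contracts the scores with v.
# Assumes TensorFlow 2.x; temper and the shapes below are made-up values.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

temper = np.sqrt(16.0)
q = K.random_uniform((2, 7, 16))   # (batch, query_len, d_model)
k = K.random_uniform((2, 9, 16))   # (batch, key_len, d_model)
v = K.random_uniform((2, 9, 16))   # (batch, key_len, d_model)

attn = K.softmax(K.batch_dot(q, k, axes=[2, 2]) / temper)  # (2, 7, 9)
output = K.batch_dot(attn, v)                              # (2, 7, 16)
print(K.int_shape(attn), K.int_shape(output))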
def call(self, u_vecs, scores=None):
    # if self.share_weights:
    #     u_hat_vecs = K.conv1d(u_vecs, self.W)
    # else:
    #     u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])
    u_hat_vecs = u_vecs
    batch_size = K.shape(u_vecs)[0]
    input_num_capsule = K.shape(u_vecs)[1]
    if scores is not None:
        scores = K.permute_dimensions(scores, (0, 2, 1))
        u_hat_vecs = u_hat_vecs * scores
    u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                        self.num_capsule, self.dim_capsule))
    u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
    # biases = self.add_weight(name='capsule_kernel',
    #                          shape=(batch_size1, self.num_capsule, self.dim_capsule),
    #                          # shape=self.kernel_size,
    #                          dtype=tf.float32,
    #                          initializer='glorot_uniform',
    #                          trainable=True)
    # biases = tf.get_variable(name='bias',
    #                          shape=(self.num_capsule, self.dim_capsule), initializer='glorot_uniform')
    for i in range(self.routings):
        # b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
        # c = K.softmax(b)
        leak = tf.zeros_like(b, optimize=True)
        leak = tf.reduce_sum(leak, axis=1, keep_dims=True)
        leaky_logits = tf.concat([leak, b], axis=1)
        leaky_routing = tf.nn.softmax(leaky_logits, dim=1)
        c = tf.split(leaky_routing, [1, self.num_capsule], axis=1)[1]
        # c = K.permute_dimensions(c, (0, 2, 1))
        # b = K.permute_dimensions(b, (0, 2, 1))
        o = K.batch_dot(c, u_hat_vecs, [2, 2])  # + self.biases
        outputs = self.activation(o)
        if i < self.routings - 1:
            b = K.batch_dot(outputs, u_hat_vecs, [2, 3])
    # self.c = scores
    return outputs
def selfattoptions(args):
    q = args[0]
    k = args[1]
    v = args[2]
    q = tf.expand_dims(q, -1)
    k = tf.expand_dims(k, -1)
    v = tf.expand_dims(v, -1)
    QK = K.batch_dot(q, K.permute_dimensions(k, [0, 2, 1]))
    QK = QK / (20 ** 0.5)
    QK = K.softmax(QK)
    MV = K.batch_dot(QK, v)
    MV = tf.squeeze(MV, -1)
    return MV
def gram_matrix(x, norm_by_channels=False):
    '''
    Returns the Gram matrix of the tensor x.
    '''
    if K.ndim(x) == 3:
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
        shape = K.shape(x)
        C, H, W = shape[0], shape[1], shape[2]
        gram = K.dot(features, K.transpose(features))
    elif K.ndim(x) == 4:
        # Swap from (H, W, C) to (B, C, H, W)
        x = K.permute_dimensions(x, (0, 3, 1, 2))
        shape = K.shape(x)
        B, C, H, W = shape[0], shape[1], shape[2], shape[3]
        # Reshape as a batch of 2D matrices with vectorized channels
        features = K.reshape(x, K.stack([B, C, H * W]))
        # This is a batch of Gram matrices (B, C, C).
        gram = K.batch_dot(features, features, axes=2)
    else:
        raise ValueError(
            'The input tensor should be either a 3d (H, W, C) or 4d (B, H, W, C) tensor.')
    # Normalize the Gram matrix
    if norm_by_channels:
        denominator = C * H * W  # Normalization from Johnson
    else:
        denominator = H * W  # Normalization from Google
    gram = gram / K.cast(denominator, x.dtype)
    return gram
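# A small standalone check (assumed sizes, not from the original code) showing
# how the 4-D branch above builds one Gram matrix per sample with K.batch_dot.
import tensorflow as tf
from tensorflow.keras import backend as K

B, H, W, C = 4, 32, 32, 64
x = K.random_uniform((B, H, W, C))             # feature maps in (B, H, W, C)
feats = K.permute_dimensions(x, (0, 3, 1, 2))  # (B, C, H, W)
feats = K.reshape(feats, (B, C, H * W))        # vectorize each channel
gram = K.batch_dot(feats, feats, axes=2)       # (B, C, C) Gram matrices
print(K.int_shape(gram))  # (4, 64, 64)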
def _merge_function(self, inputs):
    base_layer_utils.no_ragged_support(inputs, self.name)
    if len(inputs) != 2:
        raise ValueError('A `Dot` layer should be called on exactly 2 inputs')
    x1 = inputs[0]
    x2 = inputs[1]
    if isinstance(self.axes, int):
        if self.axes < 0:
            axes = [self.axes % backend.ndim(x1), self.axes % backend.ndim(x2)]
        else:
            axes = [self.axes] * 2
    else:
        axes = []
        for i in range(len(self.axes)):
            if self.axes[i] < 0:
                axes.append(self.axes[i] % backend.ndim(inputs[i]))
            else:
                axes.append(self.axes[i])
    if self.normalize:
        x1 = nn.l2_normalize(x1, axis=axes[0])
        x2 = nn.l2_normalize(x2, axis=axes[1])
    output = backend.batch_dot(x1, x2, axes)
    return output
def call(self, x, mask=None):
    '''
    shape = (batch_size, new_time_step, filters)
    x_cont = Tensor("layer_dropout_5/cond/Identity:0", shape=(None, None, 128), dtype=float32)
    x_ques = Tensor("layer_dropout_11/cond/Identity:0", shape=(None, None, 128), dtype=float32)
    c_mask = Tensor("batch_slice_4/Slice:0", shape=(None, None), dtype=bool)
    q_mask = Tensor("batch_slice_5/Slice:0", shape=(None, None), dtype=bool)
    '''
    x_cont, x_ques, c_mask, q_mask = x
    # get similarity matrix S
    # K.dot(x_cont, self.W0) shape change: [batch_size, time_step, dim] * [dim, 1] = [batch_size, time_step, 1]
    subres0 = K.tile(K.dot(x_cont, self.W0), [1, 1, self.q_maxlen])
    subres1 = K.tile(K.permute_dimensions(K.dot(x_ques, self.W1), pattern=(0, 2, 1)),
                     [1, self.c_maxlen, 1])
    subres2 = K.batch_dot(x_cont * self.W2,
                          K.permute_dimensions(x_ques, pattern=(0, 2, 1)))
    S = subres0 + subres1 + subres2
    S += self.bias
    q_mask = tf.expand_dims(q_mask, 1)
    # softmax is applied over the last dimension by default, i.e. axis=-1
    S_ = tf.nn.softmax(self.mask_logits(S, q_mask))
    c_mask = tf.expand_dims(c_mask, 2)
    S_T = K.permute_dimensions(tf.nn.softmax(self.mask_logits(S, c_mask), axis=1), (0, 2, 1))
    c2q = tf.matmul(S_, x_ques)
    q2c = tf.matmul(tf.matmul(S_, S_T), x_cont)
    result = K.concatenate([x_cont, c2q, x_cont * c2q, x_cont * q2c], axis=-1)
    return result
def call(self, x, mask=None):
    energy = self.activation(K.dot(x, self.W0) + self.b0)
    # energy = self.activation(K.dot(energy, self.W) + self.b)
    energy = K.dot(energy, self.W) + self.b
    energy = K.reshape(energy, (-1, self.input_length))
    energy = K.softmax(energy)
    xx = K.batch_dot(energy, x, axes=(1, 1))
    all = K.concatenate([xx, energy])
    return all
def call(self, inputs):
    X = inputs[0]
    A = inputs[1]
    A_t = A + self.I
    D_t = tf.linalg.diag(tf.pow(K.sum(A_t, axis=2), -0.5))
    A_t = K.batch_dot(K.batch_dot(D_t, A_t), D_t)
    X_p = tf.tensordot(K.batch_dot(A_t, X), self.W, axes=[[-1], [0]])
    if self.activation is not None:
        X_p = self.activation(X_p)
    if self.output_adjacency:
        outputs = [X_p, A]
    else:
        outputs = X_p
    return outputs
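# A standalone sketch (illustrative values, not from the original layer) of the
# symmetric normalization D^-1/2 (A + I) D^-1/2 that the call above applies to
# every graph in the batch via K.batch_dot. Assumes TensorFlow 2.x.
import tensorflow as tf
from tensorflow.keras import backend as K

A = tf.constant([[[0., 1., 0.],
                  [1., 0., 1.],
                  [0., 1., 0.]]])                        # (batch=1, N, N) adjacency
A_t = A + tf.eye(3)                                      # add self-loops
D_t = tf.linalg.diag(tf.pow(K.sum(A_t, axis=2), -0.5))   # (1, N, N) degree^-1/2
A_norm = K.batch_dot(K.batch_dot(D_t, A_t), D_t)         # (1, N, N) normalized adjacency
print(A_norm.shape)  # (1, 3, 3)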
def fallback_metric(self, y_true, y_pred):
    # grab the most confident prediction
    predictions = K.max(y_pred, axis=-1)
    # fill a tensor with our threshold value
    threshold_tensor = tf.fill(tf.shape(predictions), self.threshold)
    # Are we confident in our prediction?
    threshold_high = predictions > threshold_tensor
    threshold_high = tf.cast(threshold_high, tf.int32)
    # Do we have low confidence in our prediction?
    threshold_low = predictions <= threshold_tensor
    threshold_low = tf.cast(threshold_low, tf.int32)
    idx_true = K.argmax(y_true, -1)
    idx_pred = K.argmax(y_pred, -1)
    # For our confident predictions, compare the top prediction to the label of the true value
    high_correct = math_ops.equal(idx_true, idx_pred)
    high_correct = tf.cast(high_correct, tf.int32)
    # For our less confident predictions, grab the top 2 most confident predictions
    _, max_pred = tf.math.top_k(y_pred, k=2)
    # Gather the lineages of those top 2 predictions using the transpose of the hierarchy's
    # adjacency matrix, because the adjacency only points from ancestor to descendant
    lineages = tf.gather(K.transpose(self.hierarchy.A), max_pred)
    lineages = K.cast(lineages, tf.int32)
    # Grab the first two columns of this matrix
    fallback = tf.bitwise.bitwise_and(lineages[:, 0], lineages[:, 1])
    # Gather the lineage of the true value
    actual = tf.gather(K.transpose(self.hierarchy.A), K.argmax(y_true))
    actual = K.cast(actual, tf.int32)
    # Multiply the two together
    overlap_score = K.batch_dot(fallback, actual)
    # Are either of the top 2 predictions in the lineage of the true value?
    # If so, overlap_score should be >1 and we count the result as correct
    low_correct = overlap_score > 1
    low_correct = tf.cast(low_correct, tf.int32)
    low_correct = tf.squeeze(low_correct)
    # results for the high confidence predictions
    high_accuracy = tf.math.multiply(threshold_high, high_correct)
    # results for the low confidence predictions
    low_accuracy = tf.math.multiply(threshold_low, low_correct)
    # total accuracy vector
    correct = high_accuracy + low_accuracy
    # return batch accuracy value
    return K.mean(K.cast(correct, tf.float32))
def _outer_product(x):
    '''Calculate outer-products of two tensors.

    Args:
        x: a list of two tensors.
           Assume that each tensor has shape = (size_minibatch, total_pixels, size_filter)

    Returns:
        Outer-products of the two tensors.
    '''
    return keras_backend.batch_dot(x[0], x[1], axes=[1, 1]) / x[0].get_shape().as_list()[1]
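# An illustrative shape check (assumed sizes) for the outer-product pooling
# above: contracting the pixel axis of two (batch, pixels, filters) tensors
# gives one bilinear (filters_a, filters_b) matrix per sample.
import tensorflow as tf
from tensorflow.keras import backend as K

fa = K.random_uniform((2, 49, 64))                  # (batch, pixels, filters_a)
fb = K.random_uniform((2, 49, 128))                 # (batch, pixels, filters_b)
bilinear = K.batch_dot(fa, fb, axes=[1, 1]) / 49.0  # average over the 49 pixels
print(K.int_shape(bilinear))  # (2, 64, 128)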
def call(self, x, mask=None):
    features_dim = self.features_dim
    step_dim = self.step_dim
    t1 = x[:, 0, :]
    t1 = K.expand_dims(t1, 1)
    # t1 = K.tile(t1, [1, step_dim, 1])
    print(t1)
    eij = K.batch_dot(x, t1, (2, 2))  # (?, 500, 1)
    # eij = K.tile(eij, [1, 1, features_dim])
    print(eij)
    a = K.exp(eij)
    a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
    print(a)
    weighted_input = x * a
    temp = K.sum(weighted_input, axis=1)
    temp = K.expand_dims(temp, 1)
    temp = K.tile(temp, [1, 1, features_dim])
    print(temp)
    alltemp = temp
    for i in range(1, step_dim):
        t1 = x[:, i, :]
        t1 = K.expand_dims(t1, 1)
        # t1 = K.tile(t1, [1, 2, 1])
        eij = K.batch_dot(x, t1, (2, 2))
        # eij = K.tile(eij, [1, 1, features_dim])
        a = K.exp(eij)
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_input = x * a
        temp = K.sum(weighted_input, axis=1)
        temp = K.expand_dims(temp, 1)
        temp = K.tile(temp, [1, 1, features_dim])
        alltemp = keras.layers.concatenate([alltemp, temp], 1)
    temp = keras.layers.concatenate([x, alltemp])
    return temp
def call(self, inputs, **kwargs):
    batch_size, input_len, _ = inputs.shape
    q = K.expand_dims(K.dot(inputs, self.Wq), 2)
    k = K.expand_dims(K.dot(inputs, self.Wk), 1)
    h = tf.tanh(q + k + self.bh)
    e = K.dot(h, self.Wv) + self.ba
    # e = K.reshape(e, shape=(batch_size, input_len, input_len))
    e = tf.reshape(e, shape=(batch_size, input_len, input_len))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    s = K.sum(e, axis=-1, keepdims=True)
    a = e / (s + K.epsilon())
    v = K.batch_dot(a, inputs)
    return v
def loss_function_rate(y_true, y_pred):
    # y_true is actually the concatenation of the perfect CSI, h_real, and the SNR, sigma^{-2}.
    # y_pred is the phases of the obtained analog precoder v_RF.' (the transpose is due to the NN).
    h_real = tf.slice(y_true, [0, 0], [-1, Nt])            # the real part of the complex-valued h_real
    h_imag = tf.slice(y_true, [0, Nt], [-1, Nt])           # the imaginary part of the complex-valued h_real
    signal_power = tf.slice(y_true, [0, Nt * 2], [-1, 1])  # sigma^{-2}
    phase_vrf = tf.transpose(y_pred)                       # the NN output is 1*Nt, but the actual v_RF is Nt*1

    # transfer y_pred (the phases) into the exact complex v_RF (the lambda layer in the letter)
    v_real = tf.cos(phase_vrf)
    v_imag = tf.sin(phase_vrf)

    # compute the value of norm(h * v_RF)^2
    # backend.batch_dot only computes the diagonal elements that are actually required, which reduces complexity
    hvrf_2 = tf.pow(backend.batch_dot(h_real, v_real) - backend.batch_dot(h_imag, v_imag), 2) + \
             tf.pow(backend.batch_dot(h_real, v_imag) + backend.batch_dot(h_imag, v_real), 2)

    # compute the spectral efficiency with real CSI
    rate = tf.log(1 + hvrf_2 / Nt * signal_power) / tf.log(2.0)

    # since the NN is trained to minimize the objective, we aim at minimizing the negative rate
    return -rate
def gram_matrix(self, X):
    """Compute the Gram matrix."""
    X_sw = K.permute_dimensions(X, (0, 3, 2, 1))  # swap the axes
    s = K.shape(X_sw)
    new_shape = (s[0], s[1], s[2] * s[3])
    X_rs = K.reshape(X_sw, new_shape)
    X_rs_t = K.permute_dimensions(X_rs, (0, 2, 1))  # transpose of the matrix
    dot = K.batch_dot(X_rs, X_rs_t)  # compute the inner products
    norm = K.prod(K.cast(s[1:], 'float32'))
    return dot / norm
def call(self, x, **kwargs):
    # If only Q_seq, K_seq, V_seq are passed in, no mask is applied.
    # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padded positions are masked.
    if len(x) == 3:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    elif len(x) == 5:
        Q_seq, K_seq, V_seq, Q_len, V_len = x
    else:
        Q_seq, K_seq, V_seq = x
        Q_len, V_len = None, None
    # apply the linear transformations to Q, K and V
    Q_seq = K.dot(Q_seq, self.WQ)
    Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
    Q_seq = K.permute_dimensions(Q_seq, (0, 2, 1, 3))
    K_seq = K.dot(K_seq, self.WK)
    K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
    K_seq = K.permute_dimensions(K_seq, (0, 2, 1, 3))
    V_seq = K.dot(V_seq, self.WV)
    V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
    V_seq = K.permute_dimensions(V_seq, (0, 2, 1, 3))
    # compute the scaled dot products, then mask, then softmax
    A = K.batch_dot(Q_seq, K_seq, axes=[3, 3]) / self.size_per_head ** 0.5
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = self.Mask(A, V_len, 'add')
    A = K.permute_dimensions(A, (0, 3, 2, 1))
    A = K.softmax(A)
    # compute the output and mask it
    O_seq = K.batch_dot(A, V_seq, axes=[3, 2])
    O_seq = K.permute_dimensions(O_seq, (0, 2, 1, 3))
    O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
    O_seq = self.Mask(O_seq, Q_len, 'mul')
    return O_seq
def gram_matrix(X):
    # swap the axes => batch, channel, height, width
    axis_replaced_X = K.permute_dimensions(X, (0, 3, 2, 1))
    replaced_shape = K.shape(axis_replaced_X)
    # shape for taking the inner product of the feature maps (height and width flattened into one axis)
    dot_shape = (replaced_shape[0], replaced_shape[1], replaced_shape[2] * replaced_shape[3])
    # the matrix the inner product is actually computed on
    dot_X = K.reshape(axis_replaced_X, dot_shape)
    # its transpose
    dot_X_t = K.permute_dimensions(dot_X, (0, 2, 1))
    # inner product of the matrices
    dot = K.batch_dot(dot_X, dot_X_t)
    norm = K.prod(K.cast(replaced_shape[1:], 'float32'))
    return dot / norm
def call(self, inputs):
    # inputs_trans = (batch_size, the number of filters, sentence_length)
    inputs_trans = tf.transpose(inputs, [0, 2, 1])
    # at = (batch_size, the number of classes, sentence_length)
    at = tf.matmul(self.Wa, inputs_trans)
    # softmax
    at = K.exp(at - K.max(at, axis=-1, keepdims=True))
    at = at / K.sum(at, axis=-1, keepdims=True)
    # weighted sum
    # v = (batch_size, the number of classes, the number of filters)
    v = K.batch_dot(at, inputs)
    return v
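# A minimal shape check (assumed sizes, not from the original layer) for the
# class-wise attention above: K.batch_dot contracts the sentence-length axis
# of the attention weights with the matching axis of the inputs.
import tensorflow as tf
from tensorflow.keras import backend as K

batch, length, filters, classes = 2, 10, 32, 5
inputs = K.random_uniform((batch, length, filters))
at = K.softmax(K.random_uniform((batch, classes, length)))  # per-class weights
v = K.batch_dot(at, inputs)                                 # (batch, classes, filters)
print(K.int_shape(v))  # (2, 5, 32)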
def loss(y_true, y_pred):
    y_true = K.squeeze(y_true, axis=-1)
    # Squeeze y_pred, i.e. remove the last dimension, which is of size 1
    y_pred_squeezed = K.squeeze(y_pred, axis=-1)

    """ Reconstructing x_org """
    # reversed_wnorm = Lambda(lambda x: )
    # reversed_wnorm = dict(map(reversed, wnorm.items()))
    # x_org = Lambda(lambda x: [tf.reshape(tf.where(tf.equal(wnorm, word)), [-1])[0] for sent in x for word in sent])(y_true)
    # x_org =
    # x_org = [reversed_wnorm.get(word) for sent in y_true for word in sent]
    x_org = raw_input
    print(raw_input.shape)
    x_temp = Lambda(lambda x: tf.cast(tf.reshape(x, [-1, ]), dtype=tf.int32))(x_org)

    K.print_tensor(K.shape(y_pred_squeezed), message='y_pred_squeezed are ')
    print(f'Inside decoder....After reshape of x_norm is {y_pred_squeezed.shape}')

    # Calc prob logits
    print(type(y_pred_squeezed))
    print(type(wnorm))
    print(f'wnorm shape is {wnorm.shape}')
    prob_logits = K.batch_dot(y_pred_squeezed, wnorm, axes=[2, 1])
    prob = Lambda(lambda x: tf.nn.log_softmax(x * 100, axis=-1, name='prob_lambda'))(prob_logits)
    print(f'Prob shape is {prob.shape}')
    prob = Lambda(lambda x: tf.reshape(x, [-1, n_words]))(prob)
    # prob = K.reshape(prob, [-1, wnorm.shape[0]])
    print(f'Prob reshaped is {prob.shape}')

    """ Get prob of all the words """
    idx = Lambda(lambda x: tf.range(K.shape(x)[0], K.shape(x)[1]))(y_pred_squeezed)
    all_idx = K.transpose(K.stack([idx, x_temp]))
    all_prob = Lambda(lambda prob_idx_list: tf.gather_nd(prob_idx_list[0], prob_idx_list[1]))([prob, all_idx])
    K.print_tensor(K.shape(all_prob), message='all_prob shape is: ')

    recons_loss = Lambda(lambda x: -tf.reduce_mean(x))(all_prob)
    # K.print_tensor(loss, message='Loss is: ')
    # weighted_recons_loss = loss_weight * recons_loss
    return recons_loss
def compute_win(self, y_true, y_pred, to_numpy=False):
    if self.N > self.num_leaves:
        # if there are more total nodes in the hierarchy than leaf nodes (should always be the case,
        # but allowed to work either way) then pad with a zero for each non-leaf node in the taxonomy
        y_true = self._pad(y_true)
        y_pred = self._pad(y_pred)
    # propagate the probabilities (algo 1)
    propagated_probabilities = K.dot(self.A, K.transpose(y_pred))
    # find the index from the actual label
    win_idx = self.select_correct_idx(y_true)
    # find the mask associated with that label
    win_mask = tf.gather(self.W, win_idx)
    # win is q . w (algo 2)
    win = K.batch_dot(win_mask, K.transpose(propagated_probabilities))
    # win is in [0.5, 1], remap to [0, 1]
    remapped = 2 * (win - 0.5)
    if to_numpy:
        remapped = K.reshape(remapped, []).numpy()
    return remapped
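# A simple sketch (made-up numbers) of the per-sample dot product used for
# `win` above: with 2-D inputs, K.batch_dot reduces each row pair to a scalar.
import tensorflow as tf
from tensorflow.keras import backend as K

w = tf.constant([[1., 0., 1.],
                 [0., 1., 0.]])         # (batch, N) masks
q = tf.constant([[0.9, 0.2, 0.6],
                 [0.1, 0.8, 0.3]])      # (batch, N) propagated probabilities
win = K.batch_dot(w, q)                 # (batch, 1) row-wise dot products
print(win.numpy())  # [[1.5], [0.8]]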
def call(self, inputs, mask=None):
    x, u = inputs
    if u is None:
        u = self.add_weight(name="u_{:s}".format(self.name),
                            shape=(self.ATTENTION_SIZE,),
                            initializer="glorot_normal",
                            trainable=True)
    # u: (?, ATTENTION_SIZE,)
    # x: (?, MAX_TIMESTEPS, EMBED_SIZE)
    # ut: (?, MAX_TIMESTEPS, ATTENTION_SIZE)
    ut = K.tanh(K.dot(x, self.W) + self.b)
    # at: (?, MAX_TIMESTEPS,)
    at = K.batch_dot(ut, u)
    at = K.softmax(at)
    if mask is not None:
        at *= K.cast(mask, K.floatx())
    # ot: (?, MAX_TIMESTEPS, EMBED_SIZE,)
    atx = K.expand_dims(at, axis=-1)
    ot = atx * x
    # output: (?, EMBED_SIZE,)
    output = K.sum(ot, axis=1)
    return output
def call(self, x):
    Wx_b = K.dot(x, self.w) + self.b
    a = tf.nn.softmax(Wx_b)
    rows = []
    for k in range(self.k_centers):
        error = x - self.c[:, k]
        row = K.batch_dot(a[:, :, k], error)
        row = tf.nn.l2_normalize(row, dim=1)
        rows.append(row)
    output = tf.stack(rows)
    output = tf.transpose(output, perm=[1, 0, 2])
    output = tf.reshape(output,
                        [tf.shape(output)[0], tf.shape(output)[1] * tf.shape(output)[2]])
    return output
def _merge_function(self, inputs):
    if len(inputs) != 2:
        raise ValueError('A `Dot` layer should be called on exactly 2 inputs')
    x1 = inputs[0]
    x2 = inputs[1]
    if isinstance(self.axes, int):
        if self.axes < 0:
            axes = [self.axes % K.ndim(x1), self.axes % K.ndim(x2)]
        else:
            axes = [self.axes] * 2
    else:
        axes = []
        for i in range(len(self.axes)):
            if self.axes[i] < 0:
                axes.append(self.axes[i] % K.ndim(inputs[i]))
            else:
                axes.append(self.axes[i])
    if self.normalize:
        x1 = nn.l2_normalize(x1, axis=axes[0])
        x2 = nn.l2_normalize(x2, axis=axes[1])
    output = K.batch_dot(x1, x2, axes)
    return output
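# A minimal usage sketch (assumed shapes) of the Dot merge layer whose core is
# the _merge_function above, using the public tf.keras API.
import tensorflow as tf

a = tf.random.uniform((2, 5, 8))
b = tf.random.uniform((2, 7, 8))
sim = tf.keras.layers.Dot(axes=(2, 2))([a, b])  # batch_dot along the feature axis
print(sim.shape)  # (2, 5, 7)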
def call(self, inputs):
    batch_size = K.shape(inputs)[0]
    num_rows = K.int_shape(inputs)[1]
    num_cols = K.int_shape(inputs)[2]
    num_channels = K.int_shape(inputs)[3]
    n = num_rows * num_cols
    X = K.reshape(inputs, (batch_size, num_channels, n))
    factor = K.cast(1 / n, K.floatx())
    I_hat = factor * (K.eye(n) - factor * K.ones((n, n)))
    I_hat = K.tile(K.expand_dims(I_hat, axis=0), (batch_size, 1, 1))  # One identity matrix per sample in batch
    Sigma = K.batch_dot(K.batch_dot(X, I_hat), K.permute_dimensions(X, (0, 2, 1)))

    # Pre-normalization
    trace = K.sum(K.sum(K.eye(num_channels) * Sigma, axis=1, keepdims=True), axis=2, keepdims=True)
    A = Sigma / trace

    # Newton-Schulz Iteration
    Y = A
    Z = K.eye(num_channels)
    Z = K.tile(K.expand_dims(Z, axis=0), (batch_size, 1, 1))
    I3 = 3 * K.eye(num_channels)
    I3 = K.tile(K.expand_dims(I3, axis=0), (batch_size, 1, 1))
    for i in range(self.num_iter):
        Y = 0.5 * K.batch_dot(Y, I3 - K.batch_dot(Z, Y))
        Z = 0.5 * K.batch_dot(I3 - K.batch_dot(Z, Y), Z)

    # Post-compensation
    C = K.sqrt(trace) * Y

    # Extract upper triangular matrix as vector
    ones = K.ones((num_channels, num_channels))
    mask = tf.matrix_band_part(ones, 0, -1)  # Upper triangular matrix of 0s and 1s
    mask = K.cast(mask, 'bool')  # Convert integer mask to boolean mask
    triuvec = tf.boolean_mask(C, mask, axis=1)  # Apply mask to 2nd and 3rd dimensions
    triuvec.set_shape((None, num_channels * (num_channels + 1) // 2))  # Set correct shape manually

    return triuvec
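# A compact sketch (assumed sizes, not the original layer) of the batched
# covariance step above: Sigma = X . I_hat . X^T yields one (C, C) covariance
# matrix per sample before the Newton-Schulz iteration.
import tensorflow as tf
from tensorflow.keras import backend as K

B, C, n = 2, 16, 49
X = K.random_uniform((B, C, n))                     # vectorized feature maps
factor = 1.0 / n
I_hat = factor * (tf.eye(n) - factor * tf.ones((n, n)))
I_hat = K.tile(K.expand_dims(I_hat, 0), (B, 1, 1))  # one copy per sample
Sigma = K.batch_dot(K.batch_dot(X, I_hat), K.permute_dimensions(X, (0, 2, 1)))
print(Sigma.shape)  # (2, 16, 16)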
def _attention_layer(self, memory_plus_inputs, ws):
    from_length = self.num_memory_slots + 1
    to_length = self.num_memory_slots + 1
    q_bias, k_bias, v_bias = array_ops.split(ws["attention_bias"], 3, axis=0)

    # [B, F, N, H]
    query_layer = K.dot(memory_plus_inputs, ws["attention_kernel"][:, :self.units])
    query_layer = K.bias_add(query_layer, q_bias)
    query_layer = array_ops.reshape(
        query_layer,
        [-1, from_length, self.num_attention_heads, self.size_per_head])
    # [B, N, F, H]
    query_layer1 = array_ops.transpose(query_layer, perm=[0, 2, 1, 3])
    # [B*N, F, H]
    query_layer = array_ops.reshape(query_layer1, shape=[-1, from_length, self.size_per_head])

    # [B, T, N, H]
    key_layer = K.dot(memory_plus_inputs, ws["attention_kernel"][:, self.units:self.units * 2])
    key_layer = K.bias_add(key_layer, k_bias)
    key_layer = array_ops.reshape(
        key_layer,
        [-1, to_length, self.num_attention_heads, self.size_per_head])
    # [B, N, T, H]
    key_layer = array_ops.transpose(key_layer, perm=[0, 2, 1, 3])
    # [B*N, T, H]
    key_layer = array_ops.reshape(key_layer, shape=[-1, to_length, self.size_per_head])

    # [B, T, N, H]
    value_layer = K.dot(memory_plus_inputs, ws["attention_kernel"][:, self.units * 2:self.units * 3])
    value_layer = K.bias_add(value_layer, v_bias)
    value_layer = array_ops.reshape(
        value_layer,
        [-1, to_length, self.num_attention_heads, self.size_per_head])
    # [B, N, T, H]
    value_layer = array_ops.transpose(value_layer, perm=[0, 2, 1, 3])
    # [B*N, T, H]
    value_layer = array_ops.reshape(value_layer, shape=[-1, to_length, self.size_per_head])

    # [B*N, F, T]
    attention_scores = K.batch_dot(query_layer, key_layer, axes=[2, 2])

    if self.use_relative_position:
        # [F+T-1, N*H]
        r = K.dot(self.rel_table, ws["rel_kernel"])
        # [F+T-1, N, H]
        r = array_ops.reshape(r, [-1, self.num_attention_heads, self.size_per_head])
        # [B, N, F, F+T-1]
        bd = tf.einsum("bnfh,lnh->bnfl", query_layer1, r)
        # [B*N, F, F+T-1]
        bd = array_ops.reshape(bd, [-1, from_length, from_length + to_length - 1])
        # [B*N, F, T]
        bd = tf.einsum("bfl,ftl->bft", bd, self.pos_table)
        # [B*N, F, T]
        attention_scores += bd

    # [B*N, F, T]
    attention_scores = attention_scores / K.cast(self.size_per_head, tf.float32)
    # [B*N, F, T]
    attention_probs = K.softmax(attention_scores)
    # [B*N, F, H]
    context_layer = K.batch_dot(attention_probs, value_layer, axes=[2, 1])
    # [B, N, F, H]
    context_layer = array_ops.reshape(
        context_layer,
        [-1, self.num_attention_heads, from_length, self.size_per_head])
    # [B, F, N, H]
    context_layer = array_ops.transpose(context_layer, perm=[0, 2, 1, 3])
    # [B, F, N*H]
    context_layer = array_ops.reshape(
        context_layer,
        [-1, from_length, self.num_attention_heads * self.size_per_head])
    return context_layer