def _aggregateBlock(self, v_1, v_2, scope):
    """
    :param v_1: compare the aligned phrases, output of feed forward layer (G),
                tensor with shape (batch_size, seq_length, hidden_size)
    :param v_2: compare the aligned phrases, output of feed forward layer (G),
                tensor with shape (batch_size, seq_length, hidden_size)
    :param scope: scope name

    v1_sum, v2_sum: sum of the compared phrases (axis = seq_length),
                    tensor with shape (batch_size, hidden_size)
    v: concat of v1_sum, v2_sum, tensor with shape (batch_size, 2 * hidden_size)
    ff_outputs: output of feed forward layer (H), tensor with shape (batch_size, hidden_size)

    :return: y_hat: output of a linear layer, tensor with shape (batch_size, n_classes)
    """
    with tf.variable_scope(scope):
        # v1 = \sum_{i=1}^{l_a} v_{1,i}
        # v2 = \sum_{j=1}^{l_b} v_{2,j}    (4)
        v1_sum = tf.reduce_sum(v_1, axis=1)
        v2_sum = tf.reduce_sum(v_2, axis=1)
        print_shape('v1_sum', v1_sum)
        print_shape('v2_sum', v2_sum)

        # y_hat = H([v1, v2])    (5)
        v = tf.concat([v1_sum, v2_sum], axis=1)
        print_shape('v', v)

        ff_outputs = self._feedForwardBlock(v, self.hidden_size, 'H')
        print_shape('ff_outputs', ff_outputs)

        y_hat = tf.layers.dense(ff_outputs, self.n_classes)
        print_shape('y_hat', y_hat)
        return y_hat
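# The aggregate, attend and compare blocks rely on a _feedForwardBlock helper with
# the signature (inputs, num_units, scope, isReuse=False). A minimal sketch of such
# a helper is given below, assuming two ReLU dense layers with dropout; the dropout
# placeholder (self.dropout_keep_prob) and the initializer are assumptions, not
# taken from the original code.
def _feedForwardBlock(self, inputs, num_units, scope, isReuse=False):
    with tf.variable_scope(scope, reuse=isReuse):
        with tf.variable_scope('feed_forward_layer1'):
            inputs = tf.nn.dropout(inputs, self.dropout_keep_prob)
            outputs = tf.layers.dense(
                inputs, num_units, activation=tf.nn.relu,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))
        with tf.variable_scope('feed_forward_layer2'):
            outputs = tf.nn.dropout(outputs, self.dropout_keep_prob)
            results = tf.layers.dense(
                outputs, num_units, activation=tf.nn.relu,
                kernel_initializer=tf.truncated_normal_initializer(stddev=0.1))
        return results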
def _compositionBlock(self, v_p, v_h, scope):
    """
    :param v_p: concat of [m_p, a_p, sub_p, mul_p],
                tensor with shape (batch_size, self_attention_r, 4 * 2 * rnn_size)
    :param v_h: concat of [m_h, a_h, sub_h, mul_h],
                tensor with shape (batch_size, self_attention_r, 4 * 2 * rnn_size)
    :param scope: scope name

    v_mean_p, v_mean_h: average of v_p, v_h over the attention hops (axis = self_attention_r),
                        tensor with shape (batch_size, 4 * 2 * rnn_size)
    v_max_p, v_max_h: max of v_p, v_h over the attention hops (axis = self_attention_r),
                      tensor with shape (batch_size, 4 * 2 * rnn_size)
    v: concat of [v_mean_p, v_mean_h, v_max_p, v_max_h],
       tensor with shape (batch_size, 4 * 4 * 2 * rnn_size)
    ff_outputs: output of feed forward layer, tensor with shape (batch_size, hidden_size)

    :return: y_hat: output of a linear layer, tensor with shape (batch_size, n_classes)
    """
    with tf.variable_scope(scope):
        v_mean_p = tf.reduce_mean(v_p, axis=1)
        v_mean_h = tf.reduce_mean(v_h, axis=1)
        v_max_p = tf.reduce_max(v_p, axis=1)
        v_max_h = tf.reduce_max(v_h, axis=1)
        print_shape('v_mean_p', v_mean_p)
        print_shape('v_max_p', v_max_p)

        v = tf.concat([v_mean_p, v_mean_h, v_max_p, v_max_h], axis=1)
        print_shape('v', v)

        ff_outputs = self._feedForwardBlock(v, self.hidden_size, 'H')
        print_shape('ff_outputs', ff_outputs)

        y_hat = tf.layers.dense(ff_outputs, self.n_classes)
        print_shape('y_hat', y_hat)
        return y_hat
def _add_variables(self):
    """
    Embedding: variable holding the word embeddings. Not trainable.
    """
    self.Embedding = tf.Variable(
        tf.truncated_normal([self.n_vocab, self.embedding_size]),
        dtype=tf.float32, name='Embedding', trainable=False)
    self.init_embedding = self.Embedding.assign(self.embed_matrix)
    self.Embedding = self._projectionBlock(self.Embedding, self.hidden_size, 'Projection')
    print_shape('projected embeddings', self.Embedding)
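# _add_variables projects the frozen embedding matrix down to hidden_size through a
# _projectionBlock helper. A minimal sketch of that helper, assuming it is a single
# linear (no activation) dense layer; the initializer is an assumption.
def _projectionBlock(self, inputs, num_units, scope, isReuse=False):
    with tf.variable_scope(scope, reuse=isReuse):
        # (n_vocab, embedding_size) -> (n_vocab, num_units)
        return tf.layers.dense(
            inputs, num_units, activation=None,
            kernel_initializer=tf.contrib.layers.xavier_initializer())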
def _loss_op(self, l2_lambda=0.0001):
    """
    :param l2_lambda: L2 regularization constant

    AAt_p, AAt_h: product of the self attention weights (A * At),
                  tensor with shape (batch_size, self_attention_r, self_attention_r)
    batch_I: batched identity matrix, tensor with shape (batch_size, self_attention_r, self_attention_r)
    penalty_p, penalty_h: penalization terms for the premise's / hypothesis's self attention weights,
                          tensor with shape (batch_size)
    lambda_penalty: penalization constant
    penalty: penalization term of the self attention weights, a scalar

    :return: loss: training loss
    """
    with tf.name_scope('cost'):
        AAt_p = tf.matmul(self.A_p, tf.transpose(self.A_p, [0, 2, 1]))
        AAt_h = tf.matmul(self.A_h, tf.transpose(self.A_h, [0, 2, 1]))
        print_shape('AAt_p', AAt_p)

        I = tf.eye(self.self_attention_r)
        batch_I = tf.reshape(tf.tile(I, [tf.shape(self.A_p)[0], 1]),
                             [-1, self.self_attention_r, self.self_attention_r])
        print_shape('batch_I', batch_I)

        # penalization term: ||A * At - I||_F^2
        penalty_p = tf.square(tf.norm(AAt_p - batch_I, axis=[-2, -1], ord='fro'))
        penalty_h = tf.square(tf.norm(AAt_h - batch_I, axis=[-2, -1], ord='fro'))
        print_shape('penalty_p', penalty_p)

        penalty = tf.reduce_mean((penalty_p + penalty_h) * self.lambda_penalty)
        print_shape('penalty', penalty)

        losses = tf.nn.softmax_cross_entropy_with_logits(labels=self.y, logits=self.logits)
        label_loss = tf.reduce_mean(losses, name='loss_val')
        weights = [v for v in tf.trainable_variables() if 'kernel' in v.name]
        l2_loss = tf.add_n([tf.nn.l2_loss(w) for w in weights]) * l2_lambda
        loss = label_loss + l2_loss + penalty
        return loss
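# For intuition (illustration only, not part of the model code): because each row of
# A is a softmax distribution, ||A * At - I||_F^2 is zero exactly when the
# self_attention_r rows are one-hot and attend to distinct positions, and it grows
# as the rows overlap. A quick numpy check:
import numpy as np

r, n = 3, 5
A_sharp = np.eye(r, n)                         # rows attend to distinct tokens
A_flat = np.zeros((r, n)); A_flat[:, 0] = 1.0  # all rows attend to the same token

def frob_penalty(A):
    diff = A @ A.T - np.eye(r)
    return np.sum(diff ** 2)

print(frob_penalty(A_sharp))  # 0.0
print(frob_penalty(A_flat))   # 6.0 -- the six off-diagonal entries are all 1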
def _attentionBlock(self, m_p, m_h, scope):
    """
    :param m_p: output of self-attention layer, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
    :param m_h: output of self-attention layer, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
    :param scope: scope name

    a_p, a_h: output of attention layer, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
    sub_p, sub_h: difference of m_p and a_p, m_h and a_h, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
    mul_p, mul_h: hadamard product of m_p and a_p, m_h and a_h, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)

    :return: v_p: concat of [m_p, a_p, sub_p, mul_p], tensor with shape (batch_size, self_attention_r, 4 * 2 * rnn_size)
             v_h: concat of [m_h, a_h, sub_h, mul_h], tensor with shape (batch_size, self_attention_r, 4 * 2 * rnn_size)
    """
    with tf.variable_scope(scope):
        attention_layer = AttentionLayer()
        a_p, a_h = attention_layer(m_p, m_h)
        print_shape('a_p', a_p)

        sub_p = tf.subtract(m_p, a_p)
        sub_h = tf.subtract(m_h, a_h)
        mul_p = tf.multiply(m_p, a_p)
        mul_h = tf.multiply(m_h, a_h)
        print_shape('sub_p', sub_p)
        print_shape('mul_p', mul_p)

        v_p = tf.concat([m_p, a_p, sub_p, mul_p], axis=2)
        v_h = tf.concat([m_h, a_h, sub_h, mul_h], axis=2)
        print_shape('v_p', v_p)
        return v_p, v_h
def _compareBlock(self, alpha, beta, scope):
    """
    :param alpha: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
    :param beta: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
    :param scope: scope name

    a_beta, b_alpha: concat of [embeded_premise, beta], [embeded_hypothesis, alpha],
                     tensor with shape (batch_size, seq_length, 2 * embedding_size)

    :return: v_1: compare the aligned phrases, output of feed forward layer (G),
                  tensor with shape (batch_size, seq_length, hidden_size)
             v_2: compare the aligned phrases, output of feed forward layer (G),
                  tensor with shape (batch_size, seq_length, hidden_size)
    """
    with tf.variable_scope(scope):
        a_beta = tf.concat([self.embeded_left, beta], axis=2)
        b_alpha = tf.concat([self.embeded_right, alpha], axis=2)
        print_shape('a_beta', a_beta)
        print_shape('b_alpha', b_alpha)

        # v_1,i = G([a_bar_i, beta_i])
        # v_2,j = G([b_bar_j, alpha_j])    (3)
        v_1 = self._feedForwardBlock(a_beta, self.hidden_size, 'G')
        v_2 = self._feedForwardBlock(b_alpha, self.hidden_size, 'G', isReuse=True)
        print_shape('v_1', v_1)
        print_shape('v_2', v_2)
        return v_1, v_2
def _compositionBlock(self, m_a, m_b, hiddenSize, scope):
    """
    :param m_a: concat of [a_bar, a_hat, a_diff, a_mul], tensor with shape (batch_size, seq_length, 4 * 2 * hidden_size)
    :param m_b: concat of [b_bar, b_hat, b_diff, b_mul], tensor with shape (batch_size, seq_length, 4 * 2 * hidden_size)
    :param hiddenSize: biLSTM cell's hidden state size
    :param scope: scope name

    outputV_a, outputV_b: hidden states of the biLSTM, tuple (forward LSTM outputs, backward LSTM outputs)
    v_a, v_b: concat of the biLSTM hidden states, tensor with shape (batch_size, seq_length, 2 * hidden_size)
    v_a_avg, v_b_avg: timestep (axis = seq_length) average of v_a, v_b, tensor with shape (batch_size, 2 * hidden_size)
    v_a_max, v_b_max: timestep (axis = seq_length) max of v_a, v_b, tensor with shape (batch_size, 2 * hidden_size)
    v: concat of [v_a_avg, v_a_max, v_b_avg, v_b_max], tensor with shape (batch_size, 4 * 2 * hidden_size)

    :return: y_hat: output of feed forward layer, tensor with shape (batch_size, n_classes)
    """
    with tf.variable_scope(scope):
        outputV_a, finalStateV_a = self._biLSTMBlock(m_a, hiddenSize, 'biLSTM')
        outputV_b, finalStateV_b = self._biLSTMBlock(m_b, hiddenSize, 'biLSTM', isReuse=True)
        v_a = tf.concat(outputV_a, axis=2)
        v_b = tf.concat(outputV_b, axis=2)
        print_shape('v_a', v_a)
        print_shape('v_b', v_b)

        # v_{a,avg} = \sum_{i=1}^{l_a} \frac{v_{a,i}}{l_a},  v_{a,max} = \max_{i=1}^{l_a} v_{a,i}    (18)
        # v_{b,avg} = \sum_{j=1}^{l_b} \frac{v_{b,j}}{l_b},  v_{b,max} = \max_{j=1}^{l_b} v_{b,j}    (19)
        v_a_avg = tf.reduce_mean(v_a, axis=1)
        v_b_avg = tf.reduce_mean(v_b, axis=1)
        v_a_max = tf.reduce_max(v_a, axis=1)
        v_b_max = tf.reduce_max(v_b, axis=1)
        print_shape('v_a_avg', v_a_avg)
        print_shape('v_a_max', v_a_max)

        # v = [v_{a,avg}; v_{a,max}; v_{b,avg}; v_{b,max}]    (20)
        v = tf.concat([v_a_avg, v_a_max, v_b_avg, v_b_max], axis=1)
        print_shape('v', v)

        y_hat = self._feedForwardBlock(v, self.hidden_size, self.n_classes, 'feed_forward')
        return y_hat
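# The ESIM composition block above calls a _feedForwardBlock variant that also takes
# an output size: _feedForwardBlock(v, hidden_size, n_classes, scope). A minimal
# sketch of that variant, assuming one tanh hidden layer followed by a linear output
# layer; dropout placement, activation and initializers are assumptions.
def _feedForwardBlock(self, inputs, num_units, n_classes, scope, isReuse=False):
    with tf.variable_scope(scope, reuse=isReuse):
        with tf.variable_scope('feed_forward_layer1'):
            inputs = tf.nn.dropout(inputs, self.dropout_keep_prob)
            outputs = tf.layers.dense(inputs, num_units, activation=tf.nn.tanh)
        with tf.variable_scope('feed_forward_layer2'):
            outputs = tf.nn.dropout(outputs, self.dropout_keep_prob)
            results = tf.layers.dense(outputs, n_classes)
        return results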
def _aggregateBlock(self, v_1, v_2, scope, left_mask, right_mask, dense_features):
    """
    :param v_1: compare the aligned phrases, output of feed forward layer (G),
                tensor with shape (batch_size, seq_length, hidden_size)
    :param v_2: compare the aligned phrases, output of feed forward layer (G),
                tensor with shape (batch_size, seq_length, hidden_size)
    :param scope: scope name
    :param left_mask: premise padding mask (1 for real tokens, 0 for padding),
                      tensor with shape (batch_size, seq_length)
    :param right_mask: hypothesis padding mask, tensor with shape (batch_size, seq_length)
    :param dense_features: additional dense features appended before the feed forward layer,
                           tensor with shape (batch_size, n_features)

    v1_sum, v2_sum: sum of the compared phrases over the unpadded positions (axis = seq_length),
                    tensor with shape (batch_size, hidden_size)
    v: concat of v1_sum, v2_sum and dense_features
    ff_outputs: output of feed forward layer (H), tensor with shape (batch_size, hidden_size)

    :return: y_hat: output of a linear layer, tensor with shape (batch_size, n_classes)
    """
    with tf.variable_scope(scope):
        # zero out padded positions before summing
        left_mask = tf.to_float(tf.expand_dims(left_mask, axis=2))
        right_mask = tf.to_float(tf.expand_dims(right_mask, axis=2))
        v_1 = v_1 * left_mask
        v_2 = v_2 * right_mask

        # v1 = \sum_{i=1}^{l_a} v_{1,i}
        # v2 = \sum_{j=1}^{l_b} v_{2,j}    (4)
        v1_sum = tf.reduce_sum(v_1, axis=1)
        v2_sum = tf.reduce_sum(v_2, axis=1)
        print_shape('v1_sum', v1_sum)
        print_shape('v2_sum', v2_sum)

        # y_hat = H([v1, v2])    (5)
        v = tf.concat([v1_sum, v2_sum, dense_features], axis=1)
        print_shape('v', v)

        ff_outputs = self._feedForwardBlock(v, self.hidden_size, 'H')
        print_shape('ff_outputs', ff_outputs)

        # compute the logits
        y_hat = tf.layers.dense(ff_outputs, self.n_classes,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                bias_initializer=tf.contrib.layers.xavier_initializer())
        print_shape('y_hat', y_hat)
        return y_hat
def __call__(self, H):
    """
    :param H: hidden states of the biLSTM encoder, tensor with shape (batch_size, seq_length, 2 * rnn_size)

    :return: M: self-attentive sentence embedding, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
             A: self attention weights, tensor with shape (batch_size, self_attention_r, seq_length)
    """
    # A = softmax(W_s2 * tanh(W_s1 * H.T))    (7)
    Ws1Ht = tf.map_fn(fn=lambda x: tf.matmul(self.W_s1, tf.transpose(x)), elems=H)
    print_shape('Ws1Ht', Ws1Ht)
    e = tf.map_fn(fn=lambda x: tf.matmul(self.W_s2, tf.tanh(x)), elems=Ws1Ht)
    print_shape('e', e)
    A = tf.nn.softmax(e)
    print_shape('A', A)

    # M = A * H    (8)
    M = tf.matmul(A, H)
    print_shape('M', M)
    return M, A
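# The encoder builds this layer with SelfAttentionLayer(rnn_size, attention_size,
# self_attention_r, scope). A minimal sketch of the constructor, consistent with the
# matmuls in __call__ above (W_s1 is d_a x 2u, W_s2 is r x d_a); the initializer is
# an assumption.
class SelfAttentionLayer(object):
    def __init__(self, rnn_size, attention_size, self_attention_r, scope):
        with tf.variable_scope(scope):
            # W_s1: (attention_size, 2 * rnn_size), applied to H.T
            self.W_s1 = tf.get_variable(
                'W_s1', [attention_size, 2 * rnn_size],
                initializer=tf.contrib.layers.xavier_initializer())
            # W_s2: (self_attention_r, attention_size)
            self.W_s2 = tf.get_variable(
                'W_s2', [self_attention_r, attention_size],
                initializer=tf.contrib.layers.xavier_initializer())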
def _attendBlock(self, scope, left_mask, right_mask):
    """
    :param scope: scope name
    :param left_mask: premise padding mask (1 for real tokens, 0 for padding),
                      tensor with shape (batch_size, seq_length)
    :param right_mask: hypothesis padding mask, tensor with shape (batch_size, seq_length)

    embeded_left, embeded_right: tensor with shape (batch_size, seq_length, embedding_size)
    F_a_bar, F_b_bar: output of feed forward layer (F), tensor with shape (batch_size, seq_length, hidden_size)
    beta_attend, alpha_attend: Softmax applied along the two directions,
                               tensor with shape (batch_size, seq_length, seq_length)
    e: attention matrix with mask applied, tensor with shape (batch_size, seq_length, seq_length)

    :return: alpha: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
             beta: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
    """
    with tf.variable_scope(scope):
        F_a_bar = self._feedForwardBlock(self.embeded_left, self.hidden_size, 'F')
        F_b_bar = self._feedForwardBlock(self.embeded_right, self.hidden_size, 'F', isReuse=True)
        print_shape('F_a_bar', F_a_bar)
        print_shape('F_b_bar', F_b_bar)

        # e_i,j = F'(a_hat, b_hat) = F(a_hat).T * F(b_hat)    (1)
        e_raw = tf.matmul(F_a_bar, tf.transpose(F_b_bar, [0, 2, 1]))

        # alternative masking (both sides at once), kept for reference:
        # mask = tf.multiply(tf.expand_dims(left_mask, 2), tf.expand_dims(right_mask, 1))
        # e = tf.multiply(e_raw, mask) + (1.0 - mask) * (-1e9)
        # print_shape('e', e)

        # mask padded hypothesis positions, then attend premise -> hypothesis
        right_mask = tf.to_float(tf.expand_dims(right_mask, axis=1))
        e = e_raw + (1.0 - right_mask) * (-1e9)
        beta_attend = tf.nn.softmax(e, dim=-1)
        beta = tf.matmul(beta_attend, self.embeded_right)

        # mask padded premise positions, then attend hypothesis -> premise
        e_raw = tf.transpose(e_raw, [0, 2, 1])
        left_mask = tf.to_float(tf.expand_dims(left_mask, axis=1))
        e = e_raw + (1.0 - left_mask) * (-1e9)
        alpha_attend = tf.nn.softmax(e, dim=-1)
        alpha = tf.matmul(alpha_attend, self.embeded_left)

        # beta_i  = \sum_{j=1}^{l_b} \frac{\exp(e_{i,j})}{\sum_{k=1}^{l_b} \exp(e_{i,k})} * b_hat_j
        # alpha_j = \sum_{i=1}^{l_a} \frac{\exp(e_{i,j})}{\sum_{k=1}^{l_a} \exp(e_{k,j})} * a_hat_i    (2)
        print_shape('alpha', alpha)
        print_shape('beta', beta)
        return alpha, beta
def __call__(self, p, h):
    """
    :param p: premise representation, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
    :param h: hypothesis representation, tensor with shape (batch_size, self_attention_r, 2 * rnn_size)

    :return: p_hat, h_hat: soft-aligned counterparts of p and h, same shapes as the inputs
    """
    w_att = tf.matmul(p, tf.transpose(h, [0, 2, 1]))
    print_shape('w_att', w_att)

    # normalize over h's rows for p_hat and over p's rows for h_hat;
    # keep the batch dimension in place, hence perm=[0, 2, 1]
    softmax_p = tf.nn.softmax(w_att)
    softmax_h = tf.nn.softmax(tf.transpose(w_att, [0, 2, 1]))
    print_shape('softmax_p', softmax_p)
    print_shape('softmax_h', softmax_h)

    p_hat = tf.matmul(softmax_p, h)
    h_hat = tf.matmul(softmax_h, p)
    return p_hat, h_hat
def _logits_op(self):
    # [batch_size, seq_length, embedding_dim]
    self.embeded_left = tf.nn.embedding_lookup(self.Embedding, self.premise)
    self.embeded_right = tf.nn.embedding_lookup(self.Embedding, self.hypothesis)
    print_shape('embeded_left', self.embeded_left)
    print_shape('embeded_right', self.embeded_right)

    # [batch_size, seq_length]
    left_mask = tf.sequence_mask(self.premise_mask, self.seq_length, tf.float32)
    right_mask = tf.sequence_mask(self.hypothesis_mask, self.seq_length, tf.float32)
    print_shape('left_mask', left_mask)
    print_shape('right_mask', right_mask)

    alpha, beta = self._attendBlock('Attend', left_mask, right_mask)
    v_1, v_2 = self._compareBlock(alpha, beta, 'Compare')
    logits = self._aggregateBlock(v_1, v_2, 'Aggregate', left_mask, right_mask, self.features)
    return logits
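# _logits_op assumes a set of input placeholders defined elsewhere in the model.
# A minimal sketch of what they could look like; the method name, the feature width
# (self.n_features) and the exact dtypes are assumptions. Note that premise_mask and
# hypothesis_mask hold the actual (unpadded) lengths, which tf.sequence_mask expands
# into binary masks above.
def _add_placeholders(self):
    self.premise = tf.placeholder(tf.int32, [None, self.seq_length], name='premise')
    self.hypothesis = tf.placeholder(tf.int32, [None, self.seq_length], name='hypothesis')
    self.premise_mask = tf.placeholder(tf.int32, [None], name='premise_actual_length')
    self.hypothesis_mask = tf.placeholder(tf.int32, [None], name='hypothesis_actual_length')
    self.features = tf.placeholder(tf.float32, [None, self.n_features], name='dense_features')
    self.y = tf.placeholder(tf.float32, [None, self.n_classes], name='labels')
    self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')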
def _inputEncodingBlock(self, scope):
    """
    :param scope: scope name

    embeded_left, embeded_right: tensor with shape (batch_size, seq_length, embedding_size)

    :return: a_bar: tensor with shape (batch_size, seq_length, 2 * hidden_size)
             b_bar: tensor with shape (batch_size, seq_length, 2 * hidden_size)
    """
    with tf.device('/cpu:0'):
        self.Embedding = tf.get_variable('Embedding', [self.n_vocab, self.embedding_size], tf.float32)
        self.embeded_left = tf.nn.embedding_lookup(self.Embedding, self.premise)
        self.embeded_right = tf.nn.embedding_lookup(self.Embedding, self.hypothesis)
        print_shape('embeded_left', self.embeded_left)
        print_shape('embeded_right', self.embeded_right)

    with tf.variable_scope(scope):
        # a_bar = BiLSTM(a, i)    (1)
        # b_bar = BiLSTM(b, i)    (2)
        outputsPremise, finalStatePremise = self._biLSTMBlock(
            self.embeded_left, self.hidden_size, 'biLSTM', self.premise_mask)
        outputsHypothesis, finalStateHypothesis = self._biLSTMBlock(
            self.embeded_right, self.hidden_size, 'biLSTM', self.hypothesis_mask, isReuse=True)

        a_bar = tf.concat(outputsPremise, axis=2)
        b_bar = tf.concat(outputsHypothesis, axis=2)
        print_shape('a_bar', a_bar)
        print_shape('b_bar', b_bar)
        return a_bar, b_bar
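# The encoding and composition blocks rely on a _biLSTMBlock helper. A minimal
# sketch of it, assuming the fourth argument holds the actual sequence lengths
# (passed as sequence_length) and that dropout is driven by self.dropout_keep_prob;
# both are assumptions. It returns the raw (outputs, final_states) pair of
# tf.nn.bidirectional_dynamic_rnn, so that tf.concat(outputs, axis=2) yields a
# (batch_size, seq_length, 2 * hidden_size) tensor.
def _biLSTMBlock(self, inputs, num_units, scope, seq_len=None, isReuse=False):
    with tf.variable_scope(scope, reuse=isReuse):
        fwCell = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(num_units), output_keep_prob=self.dropout_keep_prob)
        bwCell = tf.nn.rnn_cell.DropoutWrapper(
            tf.nn.rnn_cell.LSTMCell(num_units), output_keep_prob=self.dropout_keep_prob)
        return tf.nn.bidirectional_dynamic_rnn(
            fwCell, bwCell, inputs, sequence_length=seq_len, dtype=tf.float32)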
def _attendBlock(self, scope):
    """
    :param scope: scope name

    embeded_left, embeded_right: tensor with shape (batch_size, seq_length, embedding_size)
    F_a_bar, F_b_bar: output of feed forward layer (F), tensor with shape (batch_size, seq_length, hidden_size)
    attentionSoft_a, attentionSoft_b: Softmax applied along the two directions,
                                      tensor with shape (batch_size, seq_length, seq_length)
    e: attention matrix with mask applied, tensor with shape (batch_size, seq_length, seq_length)

    :return: alpha: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
             beta: context vectors, tensor with shape (batch_size, seq_length, embedding_size)
    """
    with tf.device('/cpu:0'):
        self.Embedding = tf.get_variable('Embedding', [self.n_vocab, self.embedding_size], tf.float32)
        self.embeded_left = tf.nn.embedding_lookup(self.Embedding, self.premise)
        self.embeded_right = tf.nn.embedding_lookup(self.Embedding, self.hypothesis)
        print_shape('embeded_left', self.embeded_left)
        print_shape('embeded_right', self.embeded_right)

    with tf.variable_scope(scope):
        F_a_bar = self._feedForwardBlock(self.embeded_left, self.hidden_size, 'F')
        F_b_bar = self._feedForwardBlock(self.embeded_right, self.hidden_size, 'F', isReuse=True)
        print_shape('F_a_bar', F_a_bar)
        print_shape('F_b_bar', F_b_bar)

        # e_i,j = F'(a_hat, b_hat) = F(a_hat).T * F(b_hat)    (1)
        e_raw = tf.matmul(F_a_bar, tf.transpose(F_b_bar, [0, 2, 1]))

        # mask padding sequence
        mask = tf.multiply(tf.expand_dims(self.premise_mask, 2), tf.expand_dims(self.hypothesis_mask, 1))
        e = tf.multiply(e_raw, mask)
        print_shape('e', e)

        attentionSoft_a = tf.exp(e - tf.reduce_max(e, axis=2, keepdims=True))
        attentionSoft_b = tf.exp(e - tf.reduce_max(e, axis=1, keepdims=True))
        # mask attention weights
        attentionSoft_a = tf.multiply(attentionSoft_a, tf.expand_dims(self.hypothesis_mask, 1))
        attentionSoft_b = tf.multiply(attentionSoft_b, tf.expand_dims(self.premise_mask, 2))
        attentionSoft_a = tf.divide(attentionSoft_a, tf.reduce_sum(attentionSoft_a, axis=2, keepdims=True))
        attentionSoft_b = tf.divide(attentionSoft_b, tf.reduce_sum(attentionSoft_b, axis=1, keepdims=True))
        attentionSoft_a = tf.multiply(attentionSoft_a, mask)
        attentionSoft_b = tf.transpose(tf.multiply(attentionSoft_b, mask), [0, 2, 1])
        print_shape('att_soft_a', attentionSoft_a)
        print_shape('att_soft_b', attentionSoft_b)

        # beta_i  = \sum_{j=1}^{l_b} \frac{\exp(e_{i,j})}{\sum_{k=1}^{l_b} \exp(e_{i,k})} * b_hat_j
        # alpha_j = \sum_{i=1}^{l_a} \frac{\exp(e_{i,j})}{\sum_{k=1}^{l_a} \exp(e_{k,j})} * a_hat_i    (2)
        # attentionSoft_a is normalized over the hypothesis axis, so it aligns the
        # hypothesis to the premise (beta); attentionSoft_b works the other way round (alpha)
        beta = tf.matmul(attentionSoft_a, self.embeded_right)
        alpha = tf.matmul(attentionSoft_b, self.embeded_left)
        print_shape('alpha', alpha)
        print_shape('beta', beta)
        return alpha, beta
def _selfAttentiveEncodingBlock(self, scope):
    """
    :param scope: scope name

    embeded_left, embeded_right: tensor with shape (batch_size, seq_length, embedding_size)
    rnn_p, rnn_h: output of biLSTM layer, tensor with shape (batch_size, seq_length, 2 * rnn_size)

    :return: m_premise, m_hypothesis: output of self-attention layer,
                 tensor with shape (batch_size, self_attention_r, 2 * rnn_size)
             A_premise, A_hypothesis: self attention weight matrices,
                 tensor with shape (batch_size, self_attention_r, seq_length)
    """
    with tf.device('/cpu:0'):
        self.Embedding = tf.get_variable('Embedding', [self.n_vocab, self.embedding_size], tf.float32)
        self.embeded_left = tf.nn.embedding_lookup(self.Embedding, self.premise)
        self.embeded_right = tf.nn.embedding_lookup(self.Embedding, self.hypothesis)
        print_shape('embeded_left', self.embeded_left)
        print_shape('embeded_right', self.embeded_right)

    with tf.variable_scope(scope):
        rnn_outputs_premise, final_state_premise = self._biLSTMBlock(
            self.embeded_left, self.rnn_size, 'R', self.premise_mask)
        rnn_outputs_hypothesis, final_state_hypothesis = self._biLSTMBlock(
            self.embeded_right, self.rnn_size, 'R', self.hypothesis_mask, isReuse=True)
        rnn_p = tf.concat(rnn_outputs_premise, axis=2)
        rnn_h = tf.concat(rnn_outputs_hypothesis, axis=2)
        print_shape('rnn_p', rnn_p)
        print_shape('rnn_h', rnn_h)

        self_attention_layer1 = SelfAttentionLayer(self.rnn_size, self.attention_size,
                                                   self.self_attention_r, 'premise')
        self_attention_layer2 = SelfAttentionLayer(self.rnn_size, self.attention_size,
                                                   self.self_attention_r, 'hypothesis')
        m_premise, A_premise = self_attention_layer1(rnn_p)
        m_hypothesis, A_hypothesis = self_attention_layer2(rnn_h)
        print_shape('m_premise', m_premise)
        print_shape('m_hypothesis', m_hypothesis)
        print_shape('A_premise', A_premise)
        print_shape('A_hypothesis', A_hypothesis)
        return m_premise, A_premise, m_hypothesis, A_hypothesis
def _localInferenceBlock(self, a_bar, b_bar, scope):
    """
    :param a_bar: tensor with shape (batch_size, seq_length, 2 * hidden_size)
    :param b_bar: tensor with shape (batch_size, seq_length, 2 * hidden_size)
    :param scope: scope name

    attentionWeights: attention matrix, tensor with shape (batch_size, seq_length, seq_length)
    attentionSoft_a, attentionSoft_b: Softmax applied along the two directions,
                                      tensor with shape (batch_size, seq_length, seq_length)
    a_hat, b_hat: context vectors, tensor with shape (batch_size, seq_length, 2 * hidden_size)
    a_diff, b_diff: difference of a_bar and a_hat, b_bar and b_hat, tensor with shape (batch_size, seq_length, 2 * hidden_size)
    a_mul, b_mul: hadamard product of a_bar and a_hat, b_bar and b_hat, tensor with shape (batch_size, seq_length, 2 * hidden_size)

    :return: m_a: concat of [a_bar, a_hat, a_diff, a_mul], tensor with shape (batch_size, seq_length, 4 * 2 * hidden_size)
             m_b: concat of [b_bar, b_hat, b_diff, b_mul], tensor with shape (batch_size, seq_length, 4 * 2 * hidden_size)
    """
    with tf.variable_scope(scope):
        # e = a_bar.T * b_bar    (11)
        attentionWeights = tf.matmul(a_bar, tf.transpose(b_bar, [0, 2, 1]))
        print_shape('att_wei', attentionWeights)

        # a_hat = softmax(e) * b_bar    (12)
        # b_hat = softmax(e) * a_bar    (13)
        # softmax over the hypothesis axis for a_hat and over the premise axis for b_hat;
        # perm=[0, 2, 1] keeps the batch dimension in place
        attentionSoft_a = tf.nn.softmax(attentionWeights)
        attentionSoft_b = tf.nn.softmax(tf.transpose(attentionWeights, [0, 2, 1]))
        print_shape('att_soft_a', attentionSoft_a)
        print_shape('att_soft_b', attentionSoft_b)

        a_hat = tf.matmul(attentionSoft_a, b_bar)
        b_hat = tf.matmul(attentionSoft_b, a_bar)
        print_shape('a_hat', a_hat)
        print_shape('b_hat', b_hat)

        a_diff = tf.subtract(a_bar, a_hat)
        a_mul = tf.multiply(a_bar, a_hat)
        print_shape('a_diff', a_diff)
        print_shape('a_mul', a_mul)

        b_diff = tf.subtract(b_bar, b_hat)
        b_mul = tf.multiply(b_bar, b_hat)

        # m_a = [a_bar, a_hat, a_bar - a_hat, a_bar 'dot' a_hat]    (14)
        # m_b = [b_bar, b_hat, b_bar - b_hat, b_bar 'dot' b_hat]    (15)
        m_a = tf.concat([a_bar, a_hat, a_diff, a_mul], axis=2)
        m_b = tf.concat([b_bar, b_hat, b_diff, b_mul], axis=2)
        print_shape('m_a', m_a)
        print_shape('m_b', m_b)
        return m_a, m_b