import tensorflow as tf

import utils


def matrix_attention_layer(input_ts, context_ts, att_dim, scope_name,
                           weights_only=False):
    """Additive (tanh) attention of a context vector over a sequence."""
    output_dim = int(input_ts.shape[1])  # time_step, L
    input_dim = int(input_ts.shape[2])  # video_dims, k
    context_dim = int(context_ts.shape[1])  # question_dims, c
    with tf.variable_scope(scope_name):
        # Repeat the context vector across every time step: (batch, L, c).
        tiled_context = tf.tile(tf.expand_dims(context_ts, 1),
                                tf.stack([1, output_dim, 1]))
        w_c = tf.get_variable(
            'w_c', shape=[context_dim, att_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        w_i = tf.get_variable(
            'w_i', shape=[input_dim, att_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        b_i = tf.get_variable(
            'b_i', shape=[att_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        # (batch_size, time_step, att_dim)
        attention_input = tf.tanh(
            utils.tensor_matmul(input_ts, w_i) +
            utils.tensor_matmul(tiled_context, w_c) + b_i)
        w_a = tf.get_variable(
            'w_a', shape=[att_dim, 1], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        # (batch_size, time_step)
        attention_score = tf.nn.softmax(
            tf.squeeze(utils.tensor_matmul(attention_input, w_a), axis=[2]))
        if weights_only:
            return attention_score
        # Weighted sum over time: (batch_size, input_dim).
        attention_output = tf.reduce_sum(
            tf.multiply(input_ts, tf.expand_dims(attention_score, 2)), 1)
        return attention_output, attention_score
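# `utils.tensor_matmul` is not defined in this file. The helper below is a
# minimal sketch of what it is assumed to do, inferred from the shape
# comments above: multiply a rank-3 tensor (batch, time, d_in) by a rank-2
# matrix (d_in, d_out) along the last axis. The real utils module may differ.
def _tensor_matmul_sketch(tensor, matrix):
    in_dim = int(matrix.shape[0])
    out_dim = int(matrix.shape[1])
    time_step = int(tensor.shape[1])
    flat = tf.reshape(tensor, [-1, in_dim])  # (batch * time, d_in)
    flat_out = tf.matmul(flat, matrix)       # (batch * time, d_out)
    return tf.reshape(flat_out, [-1, time_step, out_dim])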
def bilinear_attention_layer(input_ts, context_ts, att_dim, scope_name):
    """Bilinear attention: score each time step by input_t' P context."""
    # att_dim is unused here; the argument is kept so the signature matches
    # matrix_attention_layer.
    input_dim = int(input_ts.shape[2])  # video_dims, k
    context_dim = int(context_ts.shape[1])  # question_dims, c
    with tf.variable_scope(scope_name):
        reshaped_context = tf.reshape(context_ts, [-1, context_dim, 1])
        p = tf.get_variable(
            'p', shape=[input_dim, context_dim], dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer())
        # (batch, time_step, video_dims) x (video_dims, context_dim)
        #   x (batch, context_dim, 1) -> (batch, time_step, 1)
        attention_input = tf.matmul(utils.tensor_matmul(input_ts, p),
                                    reshaped_context)
        # (batch_size, time_step)
        attention_score = tf.nn.softmax(tf.squeeze(attention_input, axis=[2]))
        # Weighted sum over time: (batch_size, input_dim).
        attention_output = tf.reduce_sum(
            tf.multiply(input_ts, tf.expand_dims(attention_score, 2)), 1)
        return attention_output
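# Usage sketch for the two attention layers. The shapes are illustrative
# assumptions (L=20 frames, k=512 video dims, c=256 question dims), not
# values taken from this repo:
#
#   video_feat = tf.placeholder(tf.float32, [32, 20, 512])
#   question = tf.placeholder(tf.float32, [32, 256])
#   att_vec, att_w = matrix_attention_layer(video_feat, question, 128, 'm_att')
#   bi_att_vec = bilinear_attention_layer(video_feat, question, 128, 'b_att')
#   # att_vec, bi_att_vec: (32, 512); att_w: (32, 20)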
def generate_answer_on_testing(self):
    """Greedy decoding at test time; also accumulates the masked loss."""
    with tf.variable_scope("decoder"):
        answer_test = []
        decoder_state = self.decoder_cell.zero_state(
            self.batch_size, tf.float32)
        loss = 0.0
        with tf.variable_scope("lstm") as scope:
            for i in range(self.max_n_a_words):
                # Reuse variables created elsewhere (e.g. by the training graph).
                scope.reuse_variables()
                if i == 0:
                    current_emb = self.decoder_input
                else:
                    # Feed back the most probable word of the previous step.
                    next_word_vec = tf.nn.embedding_lookup(
                        self.Wemb, max_prob_word)
                    current_emb = tf.nn.xw_plus_b(next_word_vec,
                                                  self.word_to_lstm_w,
                                                  self.word_to_lstm_b)

                # Temporal attention over the first-layer video LSTM outputs,
                # conditioned on the question encoding and the decoder state.
                tiled_decoder_state_h = tf.tile(
                    tf.expand_dims(decoder_state, 1),
                    tf.stack([1, self.input_n_frames - 1, 1]))
                tiled_q_last_state = tf.tile(
                    tf.expand_dims(self.q_last_state, 1),
                    tf.stack([1, self.input_n_frames - 1, 1]))
                attention_input = tf.tanh(
                    utils.tensor_matmul(self.v_first_lstm_output,
                                        self.attention_w_x) +
                    utils.tensor_matmul(tiled_q_last_state,
                                        self.attention_w_q) +
                    utils.tensor_matmul(tiled_decoder_state_h,
                                        self.attention_w_h) +
                    self.attention_b)
                attention_score = tf.nn.softmax(
                    tf.squeeze(utils.tensor_matmul(attention_input,
                                                   self.attention_a),
                               axis=[2]))
                attention_output = tf.reduce_sum(
                    tf.multiply(self.v_first_lstm_output,
                                tf.expand_dims(attention_score, 2)), 1)
                attention_decoder = tf.matmul(attention_output,
                                              self.attention_to_decoder)

                # Decoder: a hand-rolled GRU with attention.
                #   r_t = sigmoid([s_{t-1}; a_t; x_t] W_r)
                #   z_t = sigmoid([s_{t-1}; a_t; x_t] W_z)
                #   s~_t = tanh([r_t * s_{t-1}; r_t * a_t; x_t] W)
                #   s_t = (1 - z_t) * s_{t-1} + z_t * s~_t
                decoder_input = tf.concat(
                    [decoder_state, attention_decoder, current_emb], axis=1)
                decoder_r_t = tf.nn.sigmoid(
                    tf.matmul(decoder_input, self.decoder_r))
                decoder_z_t = tf.nn.sigmoid(
                    tf.matmul(decoder_input, self.decoder_z))
                decoder_middle = tf.concat([
                    tf.multiply(decoder_r_t, decoder_state),
                    tf.multiply(decoder_r_t, attention_decoder),
                    current_emb
                ], axis=1)
                decoder_state_ = tf.tanh(
                    tf.matmul(decoder_middle, self.decoder_w))
                decoder_state = tf.multiply(
                    (1 - decoder_z_t), decoder_state) + tf.multiply(
                        decoder_z_t, decoder_state_)
                output = decoder_state

                # Ground-truth one-hot labels for step i.
                labels = tf.expand_dims(self.y[:, i], 1)
                indices = tf.expand_dims(tf.range(0, self.batch_size, 1), 1)
                concated = tf.concat([indices, labels], 1)
                onehot_labels = tf.sparse_to_dense(
                    concated, tf.stack([self.batch_size, self.n_words]),
                    1.0, 0.0)

                logit_words = tf.nn.xw_plus_b(output, self.embed_word_W,
                                              self.embed_word_b)
                max_prob_word = tf.argmax(logit_words, 1)
                answer_test.append(max_prob_word)

                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                    labels=onehot_labels, logits=logit_words)
                # cross_entropy = cross_entropy * self.reward
                cross_entropy = cross_entropy * self.y_mask[:, i]
                loss = loss + tf.reduce_sum(cross_entropy)

        # Normalize by the number of valid target words (mask position 0
        # is excluded from the count).
        loss = loss / tf.reduce_sum(self.y_mask[:, 1:])
        return answer_test, loss
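# Note: the labels/indices/sparse_to_dense sequence above builds a dense
# one-hot target matrix. An equivalent, more direct formulation (a sketch,
# not the author's code) would be:
#
#   onehot_labels = tf.one_hot(self.y[:, i], depth=self.n_words)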