import tensorflow as tf
import tensorflow.contrib.slim as slim

# `mlb`, `concat_fusion`, and the attention-pooling helpers are defined
# elsewhere in this repo.


def _soft_attention(im, ctx, embed_dim, keep_prob=1.0, scope=""):
    """Soft attention: pool image features with a context-conditioned map."""
    scope = scope or "Att"
    with tf.variable_scope(scope):
        im_ctx = mlb(im, ctx, embed_dim, keep_prob)
        fv = _soft_attention_pool(im, im_ctx)
    # fv = tf.expand_dims(fv, 1)
    return fv
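# Hedged usage sketch for `_soft_attention` (illustration only, not part of
# the model). Shapes and the 1200-d MLB embedding are assumptions: `im` is a
# conv feature map [batch, h, w, c], `ctx` a context vector [batch, d], and
# the pooled output is [batch, c].
def _soft_attention_usage_sketch():
    im = tf.placeholder(tf.float32, [None, 14, 14, 2048], name='im')
    ctx = tf.placeholder(tf.float32, [None, 512], name='ctx')
    return _soft_attention(im, ctx, embed_dim=1200, keep_prob=0.5,
                           scope='AttSketch')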
def build_attention_vaq_model(im, ans_embed, quest, quest_len, embed_dim,
                              vocab_size, keep_prob, pad_token, num_dec_cells,
                              phase='train'):
    # answer-conditioned attention pooling over image regions
    fv = _soft_attention(im, ans_embed, embed_dim, keep_prob=keep_prob,
                         scope='AnsAttention')
    # fuse the pooled visual feature with the answer embedding
    in_embed = mlb(fv, ans_embed, embed_dim, keep_prob, scope='VAEmbed')
    with tf.variable_scope('vaq'):
        if phase == 'train':
            inputs, targets, length = _build_caption_inputs_and_targets(
                quest, quest_len)
            return build_lstm_decoder(in_embed, inputs, length, targets,
                                      vocab_size, num_dec_cells, keep_prob,
                                      pad_token)
        else:
            return build_lstm_predictor(in_embed, quest, vocab_size,
                                        num_dec_cells, pad_token)
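# Hedged sketch of driving `build_attention_vaq_model` (placeholder shapes,
# vocabulary size, and decoder width below are assumptions). With
# phase='train' the LSTM decoder is built over the question targets; any
# other phase builds the prediction graph instead.
def _vaq_model_usage_sketch():
    im = tf.placeholder(tf.float32, [None, 14, 14, 2048])
    ans_embed = tf.placeholder(tf.float32, [None, 512])
    quest = tf.placeholder(tf.int32, [None, 20])
    quest_len = tf.placeholder(tf.int32, [None])
    return build_attention_vaq_model(im, ans_embed, quest, quest_len,
                                     embed_dim=1200, vocab_size=10000,
                                     keep_prob=0.7, pad_token=0,
                                     num_dec_cells=512, phase='train')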
def compute_gates(g_im, ctx, embed_dim, num_outputs, keep_prob):
    """Compute a softmax gate distribution over `num_outputs` branches."""
    # is_training = keep_prob != 1.0
    g_h = mlb(g_im, ctx, embed_dim, keep_prob=keep_prob, scope='gate')
    g_logits = slim.fully_connected(g_h, num_outputs, activation_fn=None,
                                    scope='g_logits')
    return tf.nn.softmax(g_logits)
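# Hedged sketch of using `compute_gates` as a mixture-of-experts weighting
# (the branch tensor layout below is an assumption for illustration; it
# matches the [batch, 1, num_ans] logits that `_scale_specific_vq_prediction`
# produces when expand_dim=True, concatenated over axis 1). The softmax gates
# sum to 1 across branches, so the result is a convex combination of the
# per-branch logits.
def _gating_usage_sketch(g_im, ctx, branch_logits, embed_dim, keep_prob):
    # branch_logits: [batch, num_branches, num_ans]
    num_branches = branch_logits.get_shape().as_list()[1]
    gates = compute_gates(g_im, ctx, embed_dim, num_branches, keep_prob)
    # weight each branch's logits by its gate and sum over branches
    return tf.reduce_sum(branch_logits * tf.expand_dims(gates, 2), axis=1)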
def low_rank_attention(im, ctx, embed_dim, num_rank, keep_prob, scope='LR_att'):
    scope = scope or "LR_att"
    with tf.variable_scope(scope):
        im_ctx = mlb(im, ctx, embed_dim, keep_prob)
        fv = _low_rank_attention_pool(im, im_ctx, num_rank)
    return fv
def semantic_attention(attr, quest_embed, embed_dim, keep_prob, scope='SemAtt'):
    with tf.variable_scope(scope):
        aq_embed = mlb(attr, quest_embed, embed_dim, keep_prob, scope='embed')
        gates = slim.fully_connected(aq_embed, 1, activation_fn=tf.nn.sigmoid,
                                     scope='gates')
        # apply the scalar gate to the attribute vector
        gated_attr = tf.multiply(attr, gates)
    return gated_attr
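# Hedged sketch for `semantic_attention` (shapes are assumptions): the
# sigmoid gate has shape [batch, 1], so each example's whole attribute
# vector is scaled by a single scalar before downstream fusion.
def _semantic_attention_usage_sketch():
    attr = tf.placeholder(tf.float32, [None, 1000])        # attribute scores
    quest_embed = tf.placeholder(tf.float32, [None, 512])  # question vector
    return semantic_attention(attr, quest_embed, embed_dim=1200,
                              keep_prob=0.5)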
def conditional_attention_cell_helper(im, a, part_q, embed_dim, keep_prob=1.0,
                                      scope=""):
    scope = scope or "ConditionalAttentionCell"
    _, h, w, c = im.get_shape().as_list()
    with tf.variable_scope(scope):
        # QA joint embedding
        ctx = concat_fusion(part_q, a, embed_dim)
        # soft attention over image regions
        im_ctx = mlb(im, ctx, embed_dim, keep_prob, scope='Matching')
        v, am = _soft_attention_pool_with_map(im, im_ctx)
    am = tf.reshape(am, shape=[-1, h * w])
    return v, ctx, am
def __call__(self, inputs, state, scope=None):
    """Attention cell with answer context."""
    with tf.variable_scope(scope or type(self).__name__):
        with tf.variable_scope('Attention'):
            v, ctx, am = conditional_attention_cell_helper(
                self._context, self._answer_context, inputs,
                self._embed_dim, keep_prob=self._keep_prob)
        h = mlb(v, ctx, self._embed_dim, self._keep_prob, scope='OutputMLB')
        # residual connection
        h = inputs + h
    return h, h
def _scale_specific_vq_prediction(net, ctx, embed_dim, num_ans, keep_prob,
                                  scope, expand_dim=True):
    with tf.variable_scope(scope):
        v = _soft_attention(net, ctx, embed_dim, keep_prob=keep_prob)
        pre_logits = mlb(v, ctx, embed_dim, keep_prob, scope='pre_logits')
        logits = slim.fully_connected(pre_logits, num_ans, activation_fn=None,
                                      scope='logits')
    if expand_dim:
        return tf.expand_dims(logits, 1)
    else:
        return logits
def __call__(self, inputs, state, scope='MultiModalAttentionCell'):
    """Attention cell with answer context."""
    with tf.variable_scope(scope):
        with tf.variable_scope('Attention'):
            v, ctx, am = conditional_attention_cell_helper(
                self._context, self._answer_context, inputs,
                self._embed_dim, keep_prob=self._keep_prob)
        h = mlb(v, ctx, self._embed_dim, self._keep_prob, scope='OutputMLB')
        # residual connection
        new_h = inputs + h
    # output is the residual hidden vector; the pre-residual activation is
    # returned as the new state
    return new_h, h
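# Hedged sketch of unrolling the attention cell over a question with
# tf.nn.dynamic_rnn (constructing `MultiModalAttentionCell` is an assumption;
# only its __call__ is shown above, and dynamic_rnn additionally requires the
# cell's state_size/output_size properties).
def _attention_cell_usage_sketch(cell, word_embeddings, quest_len):
    # word_embeddings: [batch, time, embed_dim]
    outputs, final_state = tf.nn.dynamic_rnn(cell, word_embeddings,
                                             sequence_length=quest_len,
                                             dtype=tf.float32)
    return outputs, final_state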