def visual_semantic_infer(self, visual_feature_train_pos, visual_feature_train_neg, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train_mix = fc('v2s_lt', tf.concat([visual_feature_train_pos, visual_feature_train_neg], 0), output_dim=self.semantic_size)
            transformed_clip_train_norm_mix = tf.nn.l2_normalize(transformed_clip_train_mix, dim=1)

            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
            cross_modal_vec_train_mix = self.cross_modal_comb(transformed_clip_train_norm_mix,
                                                              tf.tile(transformed_sentence_train_norm, [2,1]),
                                                              self.batch_size)

            sim_score_mat_train_mix = vs_multilayer.vs_multilayer(cross_modal_vec_train_mix, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train_mix = tf.reshape(sim_score_mat_train_mix, [self.batch_size*2, 3])

            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train_mix, sim_score_mat_test
Example #2
def localization_module(vis_feat, spatial_feat, lang_feat,
    scope="localization_module", reuse=None):
    # Input:
    #   vis_feat: [N, D_vis]
    #   spatial_feat: [N, D_spatial]
    #   lang_feat: [N, D_lang]
    # Output:
    #   localization_scores: [N, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.

    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        D_lang = lang_feat.get_shape().as_list()[-1]
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=1)
        vis_spatial_embed = fc('vis_spatial_embed', vis_spatial_feat, output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 1)

        # Localization scores as linear classification over the l2-normalized feature
        localization_scores = fc('localization_scores', eltwise_mult, output_dim=1)

    return localization_scores
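The snippets on this page call an fc(...) helper that is not defined here. Below is a minimal sketch of what such a helper plausibly looks like; the exact signature and initializers are assumptions, and fc_relu / fc_elu are presumably the same layer followed by the corresponding activation.

import tensorflow as tf

def fc(name, bottom, output_dim, reuse=None,
       weights_initializer=None, biases_initializer=None):
    # Linear layer y = xW + b inside its own variable scope, so a second call
    # with the same name and reuse=True shares the weights.
    input_dim = bottom.get_shape().as_list()[-1]
    if weights_initializer is None:
        weights_initializer = tf.contrib.layers.xavier_initializer()
    if biases_initializer is None:
        biases_initializer = tf.zeros_initializer()
    with tf.variable_scope(name, reuse=reuse):
        weights = tf.get_variable('weights', [input_dim, output_dim],
                                  initializer=weights_initializer)
        biases = tf.get_variable('biases', [output_dim],
                                 initializer=biases_initializer)
        return tf.nn.xw_plus_b(bottom, weights, biases)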
Example #3
def attbilstm(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
    apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        T = tf.shape(text_seq_batch)[0]
        N = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence into a vector representation, using the final
        # hidden states in a two-layer bidirectional LSTM network
        seq_length = tf.ones(to_T([N]), dtype=tf.int32)*T
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        outputs1_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            embedded_seq, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm1")
        outputs1 = tf.concat(outputs1_raw, axis=2)
        outputs2_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            outputs1, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm2")
        outputs2 = tf.concat(outputs2_raw, axis=2)
        # q_reshape has shape [T, N, lstm_dim*4]
        q_reshape = tf.concat([outputs1, outputs2], axis=2)
        if apply_dropout:
            q_reshape = drop(q_reshape, 0.5)

        # 2. three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            q_reshape_flat = tf.reshape(q_reshape, to_T([T*N, lstm_dim*4]))

            score_shape = to_T([T, N, 1])
            scores_obj1 = tf.reshape(fc('fc_scores_obj1', q_reshape_flat, output_dim=1), score_shape)
            scores_obj2 = tf.reshape(fc('fc_scores_obj2', q_reshape_flat, output_dim=1), score_shape)
            scores_rel = tf.reshape(fc('fc_scores_rel', q_reshape_flat, output_dim=1), score_shape)

            # 2.4 Compute probability and average BoW representation
            # probs_obj1, probs_obj2 and probs_rel have shape [T, N, 1]
            # Remove the probability over <pad> (<pad> is 0)
            is_not_pad = tf.cast(tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
            probs_obj1 = tf.nn.softmax(scores_obj1, dim=0)*is_not_pad
            probs_obj2 = tf.nn.softmax(scores_obj2, dim=0)*is_not_pad
            probs_rel = tf.nn.softmax(scores_rel, dim=0)*is_not_pad
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 0, keep_dims=True)
            probs_obj2 = probs_obj2 / tf.reduce_sum(probs_obj2, 0, keep_dims=True)
            probs_rel = probs_rel / tf.reduce_sum(probs_rel, 0, keep_dims=True)

            tf.add_to_collection("attention_probs", (probs_obj1, probs_obj2, probs_rel))

            # BoW_obj1, BoW_obj2 and BoW_rel have shape [N, embed_dim]
            BoW_obj1 = tf.reduce_sum(probs_obj1*embedded_seq, reduction_indices=0)
            BoW_obj2 = tf.reduce_sum(probs_obj2*embedded_seq, reduction_indices=0)
            BoW_rel = tf.reduce_sum(probs_rel*embedded_seq, reduction_indices=0)
            BoW_obj1.set_shape([None, embed_dim])
            BoW_obj2.set_shape([None, embed_dim])
            BoW_rel.set_shape([None, embed_dim])

    return (BoW_obj1, BoW_obj2, BoW_rel)
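The to_T(...) helper used above is also not shown. It is assumed to pack a Python list that may mix ints and scalar tensors into a 1-D tensor, so dynamic dimensions such as tf.shape(x)[0] can appear in reshape targets; a one-line sketch under that assumption:

import tensorflow as tf

def to_T(shape_list):
    # e.g. to_T([T * N, lstm_dim * 4]) with T, N scalar tensors and
    # lstm_dim * 4 a Python int
    return tf.stack(shape_list)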
Example #4
def relationship_module_spatial_only(spatial_feat1,
                                     scores1,
                                     spatial_feat2,
                                     scores2,
                                     lang_feat,
                                     scope="relationship_module_spatial_only",
                                     reuse=None):
    # Input shape:
    #   spatial_feat1, spatial_feat2 : [N1, D_spatial], [N2, D_spatial]
    #   scores1, scores2: [N1, 1], [N2, 1]
    #   lang_feat: [1, D_lang]
    # Output shape:
    #   relationship_scores: [N1, N2, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.

    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        D_lang = lang_feat.get_shape().as_list()[-1]

        N1 = tf.shape(spatial_feat1)[0]
        N2 = tf.shape(spatial_feat2)[0]

        D_spatial = spatial_feat1.get_shape().as_list()[-1]

        # Tiled spatial features of size [N1, N2, D_spatial*2], such that
        # spatial_feat_tiled[i, j] = [ spatial_feat1[i], spatial_feat2[j] ]
        spatial_feat_tiled = tf.reshape(
            tf.concat([
                tf.tile(tf.reshape(spatial_feat1, [-1, 1, D_spatial]),
                        to_T([1, N2, 1])),
                tf.tile(tf.reshape(spatial_feat2, [1, -1, D_spatial]),
                        to_T([N1, 1, 1]))
            ],
                      axis=2), [-1, D_spatial * 2])

        spatial_embed = fc('spatial_embed',
                           spatial_feat_tiled,
                           output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(spatial_embed * lang_feat, 1)

        # Relationship scores as linear classification over the l2-normalized feature
        relationship_scores = fc('relationship_scores',
                                 eltwise_mult,
                                 output_dim=1)
        relationship_scores = tf.reshape(relationship_scores, to_T([N1, N2,
                                                                    1]))

        final_scores = tf.add(
            tf.add(tf.reshape(scores1, [-1, 1, 1]),
                   tf.reshape(scores2, [1, -1, 1])), relationship_scores)
        final_scores.set_shape([None, None, 1])

    return final_scores
Example #5
def build_output_unit_loc(q_encoding, kb_batch, att_last,
                          scope='output_unit_loc', reuse=None):
    """
    Apply a 1-layer convolution network to predict localization scores.
    Apply dropout if specified.

    Input:
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """

    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv(
                'conv_loc', att_last, kernel_size=3, stride=1, output_dim=1)
        loc_scores = tf.reshape(
            loc_scores, [-1, cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT])
        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(
                    q_mapped[:, ax, ax, :] * kb_batch, axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv(
                'conv_bbox_offset', bbox_offset_input, 1, 1, output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N*B, 4]))
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc(
                    'fc_bbox_offset_with_q', elt_prod, output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
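_extract_softmax_avg(...) is used here and in the Transform / DescribeTwo modules below but is not defined in these snippets, and ax is assumed to be an alias for tf.newaxis. A hedged sketch of the helper's likely behavior (spatial softmax over the attention logits, then an attention-weighted average of the feature map):

import tensorflow as tf

ax = tf.newaxis  # assumed alias, used as q_mapped[:, ax, ax, :] above

def _extract_softmax_avg(kb_batch, att_raw):
    # kb_batch: [N, H, W, d], att_raw: [N, H, W, 1]
    att_shape = tf.shape(att_raw)
    N = att_shape[0]
    # softmax over the H*W spatial positions
    att_softmax = tf.nn.softmax(tf.reshape(att_raw, [N, -1]), axis=-1)
    att_softmax = tf.reshape(att_softmax, att_shape)
    # attention-weighted average of the feature map -> [N, d]
    return tf.reduce_sum(kb_batch * att_softmax, axis=[1, 2])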
Example #6
    def DescribeModule(self,
                       input_0,
                       time_idx,
                       batch_idx,
                       map_dim=250,
                       scope='DescribeModule',
                       reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. Extract visual features using the input attention map, and
        #      linear transform to map_dim
        #   2. linear transform language features to map_dim
        #   3. Element-wise multiplication of the two, l2_normalize, linear transform.
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):

                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                text_param_mapped = fc('fc_text',
                                       text_param,
                                       output_dim=map_dim)

                att_softmax = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                    to_T([N, H, W, 1]))

                # att_feat has shape [N, D_vis]
                att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                         axis=[1, 2])
                att_feat_mapped = tf.reshape(
                    fc('fc_att', att_feat, output_dim=map_dim),
                    to_T([N, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(
                    text_param_mapped * att_feat_mapped, 1)
                scores = fc('fc_eltwise',
                            eltwise_mult,
                            output_dim=self.num_choices)

        return scores
Example #7
  def _build_encoder(self, input_seq_batch, seq_len_batch, scope='encoder',
    reuse=None):
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.encoder_dropout

    with tf.variable_scope(scope, reuse=reuse):
      #T = tf.shape(input_seq_batch)[0]
      T = input_seq_batch.shape.as_list()[0]
      N = tf.shape(input_seq_batch)[1]
      self.T_encoder = T
      self.N = N
      with tf.variable_scope(self.embed_scope, reuse=True):
        embedding_mat = tf.get_variable('embed_mat', [self.encoder_num_vocab,
                                                      self.encoder_embed_dim])
      # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
      embedded_seq = tf.nn.embedding_lookup(embedding_mat, input_seq_batch)
      self.embedded_input_seq = embedded_seq

      # The RNN
      cell = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)

      # encoder_outputs has shape [T, N, lstm_dim]
      encoder_outputs, encoder_states = tf.nn.dynamic_rnn(cell, embedded_seq,
                                                          seq_len_batch,
                                                          dtype=tf.float32,
                                                          time_major=True,
                                                          scope='lstm')
      self.encoder_outputs = encoder_outputs
      self.encoder_states = encoder_states

      # check if wv flag is set
      if self.params['use_word_vectors']:
        # transform the encoder outputs for further attention alignments
        # encoder_outputs_flat has shape [T, N, lstm_dim]
        encoder_h_transformed = fc('encoder_h_transform',
          tf.reshape(embedded_seq, [-1, self.encoder_embed_dim]),
                            output_dim=lstm_dim)
      else:
        # transform the encoder outputs for further attention alignments
        # encoder_outputs_flat has shape [T, N, lstm_dim]
        encoder_h_transformed = fc('encoder_h_transform',
          tf.reshape(encoder_outputs, [-1, lstm_dim]), output_dim=lstm_dim)

      encoder_h_transformed = tf.reshape(encoder_h_transformed,
                       to_T([T, N, lstm_dim]))
      self.encoder_h_transformed = encoder_h_transformed

      # seq_not_finished has shape [T, N, 1], where seq_not_finished[t, n]
      # is 1 iff sequence n is not finished at time t, and 0 otherwise
      seq_not_finished = tf.less(tf.range(T)[:, tf.newaxis, tf.newaxis],
                   seq_len_batch[:, tf.newaxis])
      seq_not_finished = tf.cast(seq_not_finished, tf.float32)
      self.seq_not_finished = seq_not_finished
Example #8
File: vc_model.py, Project: AmmieQi/vc
    def build_regulizer(self):
        """ context regularization score
        """
        # text and region features
        text_bilstm_feat = self.text_bilstm_feat
        text_word_embed_feat = self.text_word_embed_feat
        word_is_not_pad = self.word_is_not_pad
        region_visual_feat = self.region_visual_feat
        region_spatial_feat = self.region_spatial_feat
        reg_dim = self.config.reg_dim

        # Tensor dimensionality
        L = tf.shape(text_bilstm_feat)[0]
        N1 = tf.shape(text_bilstm_feat)[1]
        N2 = tf.shape(region_spatial_feat)[0]
        D1 = text_bilstm_feat.get_shape().as_list()[-1]  # lstm_dim*4
        D2 = text_word_embed_feat.get_shape().as_list()[-1]  # embed_dim
        D3 = region_spatial_feat.get_shape().as_list()[-1]  # spatial_dim
        D4 = region_visual_feat.get_shape().as_list()[-1]  # visual_dim

        region_feat = tf.concat([region_visual_feat, region_spatial_feat],
                                axis=1)  # shape: [N2, D3+D4]

        with tf.variable_scope('regularizer'):
            # 1. language-vision association between single RoI and the expression, represented by y^{g} in the paper
            word_obj_attention_score = fc('word_attention_obj',
                                          tf.reshape(text_bilstm_feat,
                                                     [-1, D1]),
                                          output_dim=1)  # shape: [L*N1, 1]
            word_obj_attention_score = tf.reshape(word_obj_attention_score,
                                                  [L, N1, 1])
            word_prob = tf.nn.softmax(
                word_obj_attention_score,
                dim=0) * word_is_not_pad  #shape: [L, N1, 1]
            word_prob = word_prob / tf.reduce_sum(
                word_prob, 0, keep_dims=True)  #shape: [L, N1, 1]
            word_obj_feat = tf.reduce_sum(word_prob * text_word_embed_feat,
                                          axis=0)  #shape: [N1, D2]

            # 2. single score for subject
            region_embed = fc('region_obj_embed', region_feat,
                              output_dim=D2)  #shape: [N2, D2]
            mm_feat = tf.nn.l2_normalize(
                region_embed[tf.newaxis, ...] *
                tf.reshape(word_obj_feat, [N1, 1, D2]),
                dim=2)  #shape: [N1, N2, D2]
            score = fc('single_score',
                       tf.reshape(mm_feat, [-1, D2]),
                       output_dim=1)  # shape: [N1*N2, 1]
            score = tf.reshape(score, [N1, N2])  #shape[N1, N2]

            self.prior_score = score
Example #9
    def FindSamePropertyModule(self, input_0, time_idx, batch_idx, map_dim=250,
        scope='FindSamePropertyModule', reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x image_feat_grid x text_param -> att_grid
        # Input:
        #   input_0: [N, H, W, 1]
        #   image_feat_grid: [N, H, W, D_im]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   1. Extract visual features using the input attention map, and
        #      linear transform to map_dim
        #   2. linear transform language features to map_dim
        #   3. Convolve image features to map_dim
        #   4. Element-wise multiplication of the three, l2_normalize, linear transform.
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                # image_feat_mapped has shape [N, H, W, map_dim]
                image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                              output_dim=map_dim)

                text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

                att_softmax = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                    to_T([N, H, W, 1]))
                # att_feat has shape [N, D_vis]
                att_feat = tf.reduce_sum(image_feat_grid * att_softmax, axis=[1, 2])
                att_feat_mapped = tf.reshape(
                    fc('fc_att', att_feat, output_dim=map_dim), to_T([N, 1, 1, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(
                    image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
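_1x1_conv(...) (and the _1x1conv variant in the Find / Transform modules below) is assumed to be a thin wrapper around a 1x1 convolution, i.e. a per-location linear map over channels, created in its own variable scope. A minimal sketch under that assumption:

import tensorflow as tf

def _1x1_conv(name, bottom, output_dim):
    # bottom: [N, H, W, input_dim] -> [N, H, W, output_dim]
    input_dim = bottom.get_shape().as_list()[-1]
    with tf.variable_scope(name):
        weights = tf.get_variable('weights', [1, 1, input_dim, output_dim])
        biases = tf.get_variable('biases', [output_dim],
                                 initializer=tf.zeros_initializer())
        conv_out = tf.nn.conv2d(bottom, weights,
                                strides=[1, 1, 1, 1], padding='SAME')
        return tf.nn.bias_add(conv_out, biases)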
Example #10
def localization_module_batch_score(vis_feat,
                                    spatial_feat,
                                    lang_feat,
                                    scope="localization_module",
                                    reuse=None):
    # Input:
    #   vis_feat: [N_batch, N_vis, D_vis]
    #   spatial_feat: [N_batch, N_vis, D_spatial]
    #   lang_feat: [N_batch, D_lang]
    # Output:
    #   localization_scores: [N_batch, N_vis, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.

    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        N_batch = tf.shape(vis_feat)[0]
        N_vis = tf.shape(vis_feat)[1]
        D_vis = vis_feat.get_shape().as_list()[-1]
        D_spatial = spatial_feat.get_shape().as_list()[-1]
        D_lang = lang_feat.get_shape().as_list()[-1]

        # flatten the visual and spatial features and embed them to the same
        # dimension as the language feature
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=2)
        vis_spatial_feat = tf.reshape(vis_spatial_feat,
                                      [-1, D_vis + D_spatial])
        vis_spatial_embed = fc('vis_spatial_embed',
                               vis_spatial_feat,
                               output_dim=D_lang)

        # Reshape visual feature and language feature for broadcast multiplication
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        vis_spatial_embed = tf.reshape(vis_spatial_embed,
                                       to_T([N_batch, -1, D_lang]))

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 2)
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Localization scores as linear classification over the l2-normalized feature
        localization_scores = fc('localization_scores',
                                 eltwise_mult,
                                 output_dim=1)
        localization_scores = tf.reshape(localization_scores,
                                         to_T([N_batch, N_vis, 1]))

    return localization_scores
Example #11
    def instantiate_batch(self, inputs):
        """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
        vis_att, img_feat, text_att = inputs

        # text feature dimension, intermediate mapping dimension
        # batch size, image feature height and width
        text_dim = text_att.shape.as_list()[-1]
        map_dim = self._params['map_dim']
        encode_size = self._params['encode_size']
        N = tf.shape(img_feat)[0]
        H, W = img_feat.shape.as_list()[1:3]

        with tf.variable_scope(self._module_scope):
            with tf.variable_scope(self._scope, reuse=self._reuse):
                # image_feat_mapped has shape [N, H, W, map_dim]
                img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
                # nonlinearity
                img_map = tf.nn.relu(img_map)

                text_map = fc('fc_text', text_att, output_dim=map_dim)
                text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
                # nonlinearity
                text_map = tf.nn.relu(text_map)

                att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
                att_map = tf.reshape(
                    fc('fc_att', att_feats, output_dim=map_dim),
                    [N, 1, 1, map_dim])

                # interact via element wise map
                eltwise_mult = tf.nn.l2_normalize(img_map * text_map * att_map,
                                                  3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

                # softmax
                att_grid_soft = tf.nn.softmax(tf.reshape(
                    att_grid, [-1, H * W]))
                att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

        return [att_grid]
Example #12
def build_output_unit_vqa(q_encoding, m_last, num_choices, apply_dropout,
                          scope='output_unit', reuse=None):
    """
    Apply a 2-layer fully-connected network to predict answers. Apply dropout
    if specified.

    Input:
        q_encoding: [N, d], tf.float32
        m_last: [N, d], tf.float32
    Return:
        vqa_scores: [N, num_choices], tf.float32
    """

    output_dim = cfg.MODEL.VQA_OUTPUT_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.VQA_OUTPUT_USE_QUESTION:
            fc1 = fc_elu(
                'fc1', tf.concat([q_encoding, m_last], axis=1),
                output_dim=output_dim)
        else:
            fc1 = fc_elu('fc1_wo_q', m_last, output_dim=output_dim)
        if apply_dropout:
            fc1 = tf.nn.dropout(fc1, cfg.TRAIN.DROPOUT_KEEP_PROB)
        fc2 = fc('fc2', fc1, output_dim=num_choices,
                 biases_initializer=tf.constant_initializer(
                    cfg.TRAIN.VQA_SCORE_INIT))

        vqa_scores = fc2
    vqa_scores = tf.nn.softmax(vqa_scores)                             ##TODO edit_vedika to get softmax prob
    return vqa_scores
Example #13
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout,
                       mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch,
                               'vgg_local',
                               apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 1),
                             tf.nn.l2_normalize(feat_vis, 1), spatial_batch
                         ])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #14
    def LessNumModule(self, input_0, input_1, time_idx, batch_idx,
        scope='LessNumModule', reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # Mapping: att_grid x att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        #   input_1: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. linear transform of the attention map (also including max and min)
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                att_shape = tf.shape(input_0)

                H, W = self.att_shape[1:3]
                att_all_0 = tf.reshape(input_0, to_T([-1, H*W]))
                att_min_0 = tf.reduce_min(input_0, axis=[1, 2])
                att_max_0 = tf.reduce_max(input_0, axis=[1, 2])
                att_all_1 = tf.reshape(input_1, to_T([-1, H*W]))
                att_min_1 = tf.reduce_min(input_1, axis=[1, 2])
                att_max_1 = tf.reduce_max(input_1, axis=[1, 2])
                # att_concat has shape [N, 2*H*W + 4]
                att_concat = tf.concat([att_all_0, att_min_0, att_max_0,
                                        att_all_1, att_min_1, att_max_1],
                                       axis=1)
                scores = fc('fc_scores', att_concat, output_dim=self.num_choices)

        return scores
Example #15
    def AnswerModule(self,
                     input_0,
                     time_idx,
                     batch_idx,
                     scope='AnswerModule',
                     reuse=None):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        att_grid = input_0
        # Mapping: att_grid -> answer probs
        # Input:
        #   att_grid: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. Max-pool over att_grid
        #   2. a linear mapping layer (without ReLU)
        with tf.variable_scope(scope, reuse=reuse):
            att_shape = tf.shape(att_grid)
            N = att_shape[0]
            H = att_shape[1]
            W = att_shape[2]

            att_min = tf.reduce_min(att_grid, axis=[1, 2])
            att_avg = tf.reduce_mean(att_grid, axis=[1, 2])
            att_max = tf.reduce_max(att_grid, axis=[1, 2])
            # att_reduced has shape [N, 3]
            att_reduced = tf.concat([att_min, att_avg, att_max], axis=1)
            scores = fc('fc_scores', att_reduced, output_dim=self.num_choices)

        return scores
Example #16
def my_fc_layer(input_batch, name, output_dim, apply_dropout=False):
    with tf.variable_scope(name):
        print("input_batch: ", input_batch)
        fc7 = fc('fc', input_batch, output_dim=output_dim)
        print("fc7: ", fc7)
    if apply_dropout: fc7 = drop(fc7, 0.5)
    return fc7
Example #17
def build_output_unit_vqa(q_encoding,
                          m_last,
                          num_choices,
                          apply_dropout,
                          scope='output_unit',
                          reuse=None):
    """
    Apply a 2-layer fully-connected network to predict answers. Apply dropout
    if specified.

    Input:
        q_encoding: [N, d], tf.float32
        m_last: [N, d], tf.float32
    Return:
        vqa_scores: [N, num_choices], tf.float32
    """

    output_dim = cfg.MODEL.VQA_OUTPUT_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.VQA_OUTPUT_USE_QUESTION:
            fc1 = fc_elu('fc1',
                         tf.concat([q_encoding, m_last], axis=1),
                         output_dim=output_dim)
        else:
            fc1 = fc_elu('fc1_wo_q', m_last, output_dim=output_dim)
        if apply_dropout:
            fc1 = tf.nn.dropout(fc1, cfg.TRAIN.DROPOUT_KEEP_PROB)
            print(cfg.TRAIN.DROPOUT_KEEP_PROB)
            ipdb.set_trace()
        fc2 = fc('fc2', fc1, output_dim=num_choices)
        vqa_scores = fc2
    return vqa_scores
Example #18
    def Transform(self,
                  att_stack,
                  stack_ptr,
                  mem_in,
                  c_i,
                  scope='Transform',
                  reuse=None):
        """
        Transforms the previous attention, and updates memory vector.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Get attention:
            #   1) linearly map the controller vectors to the KB dimension
            #   2) extract attended features from the input attention
            #   3) elementwise product with KB
            #   4) 1x1 convolution to get attention logits

            # Pop from stack
            att_in = _read_from_stack(att_stack, stack_ptr)
            # stack_ptr = _move_ptr_bw(stack_ptr)  # cancel-out below

            c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
            kb_att_in = _extract_softmax_avg(self.kb_batch, att_in)
            elt_prod = tf.nn.l2_normalize(self.kb_batch *
                                          c_mapped[:, ax, ax, :] *
                                          kb_att_in[:, ax, ax, :],
                                          axis=-1)
            att_out = _1x1conv('conv_att_out', elt_prod, output_dim=1)

            # Push to stack
            # stack_ptr = _move_ptr_fw(stack_ptr)  # cancel-out above
            att_stack = _write_to_stack(att_stack, stack_ptr, att_out)

        return att_stack, stack_ptr, self.mem_zero
Example #19
    def Find(self,
             att_stack,
             stack_ptr,
             mem_in,
             c_i,
             scope='Find',
             reuse=None):
        """
        Performs localization, and updates memory vector.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Get attention
            #   1) linearly map the controller vectors to the KB dimension
            #   2) elementwise product with KB
            #   3) 1x1 convolution to get attention logits
            c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
            elt_prod = tf.nn.l2_normalize(self.kb_batch *
                                          c_mapped[:, ax, ax, :],
                                          axis=-1)
            att_out = _1x1conv('conv_att_out', elt_prod, output_dim=1)

            # Push to stack
            stack_ptr = _move_ptr_fw(stack_ptr)
            att_stack = _write_to_stack(att_stack, stack_ptr, att_out)

        return att_stack, stack_ptr, self.mem_zero
Example #20
    def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
        map_dim=250, scope='TransformModule', reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x text_param -> att_grid
        # Input:
        #   input_0: [N, H, W, 1]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   Convolutional layer that also involves text_param:
        #   A 'soft' convolutional kernel that is modulated by text_param
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                att_shape = tf.shape(input_0)
                N = att_shape[0]
                H = att_shape[1]
                W = att_shape[2]
                att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
                    stride=1, output_dim=map_dim)

                text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(att_maps * text_param_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Example #21
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, deeplab_dropout,
                       mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch,
                                             'deeplab',
                                             apply_dropout=deeplab_dropout)
    input_dim = 1
    for d in feat_vis.get_shape().as_list()[1:]:
        input_dim *= d
    feat_vis_flatten = tf.reshape(feat_vis, [-1, input_dim])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1,
                         values=[
                             tf.nn.l2_normalize(feat_lang, 1),
                             tf.nn.l2_normalize(feat_vis_flatten, 1),
                             spatial_batch
                         ])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #22
    def __init__(self,
                 images,
                 q_encoding,
                 image_valid_batch,
                 num_choices,
                 scope='single_hop',
                 reuse=None):

        x_loc = self.loc_init(images, reuse=reuse)

        with tf.variable_scope(scope, reuse=reuse):
            x_loc_shape = tf.shape(x_loc)
            B, H, W = x_loc_shape[0], x_loc_shape[1], x_loc_shape[2]
            dim = x_loc.get_shape().as_list()[-1]  # static shape

            # attention over x_loc
            proj_q = fc('fc_q_map1', q_encoding, output_dim=dim)[:, ax, ax, :]
            interactions = tf.nn.l2_normalize(x_loc * proj_q, axis=-1)
            raw_att = conv('conv_att_score',
                           interactions,
                           kernel_size=1,
                           stride=1,
                           output_dim=1)
            raw_att = tf.reshape(raw_att, to_T([B, H * W]))  # (N, H*W)
            valid_mask = tf.reshape(image_valid_batch, tf.shape(raw_att))
            raw_att = raw_att * valid_mask - 1e18 * (1 - valid_mask)
            att = tf.nn.softmax(raw_att, axis=-1)  # (N, H*W)

            # collect attended image feature
            x_att = tf.matmul(tf.reshape(att, to_T([B, 1, H * W])),
                              tf.reshape(x_loc, to_T([B, H * W,
                                                      dim])))  # (N, 1, D_kb)
            x_att = tf.reshape(x_att, to_T([B, dim]))  # (N, D_kb)

            # VQA classification
            eQ = fc('fc_q_map2', q_encoding, output_dim=dim)
            if cfg.OUT_QUESTION_MUL:
                features = tf.concat([x_att, eQ, x_att * eQ], axis=-1)
            else:
                features = tf.concat([x_att, eQ], axis=-1)

            fc1 = fc_relu('fc_hidden',
                          features,
                          output_dim=cfg.OUT_CLASSIFIER_DIM)
            logits = fc('fc_scores', fc1, output_dim=num_choices)
            self.logits = logits
Example #23
def build_output_unit_rec(rec_inputs,
                          input_seq_batch,
                          embed_seq,
                          seq_length_batch,
                          num_vocab,
                          scope='output_unit_rec',
                          reuse=None):
    """
    Try to reconstruct the input sequence from the controller outputs with a
    seq-to-seq LSTM.

    Input:
        rec_inputs: [T, N, ?], tf.float32
        input_seq_batch: [S, N], tf.int32
        embed_seq: [S, N, e], tf.float32
        seq_length_batch: [N], tf.int32
    Return:
        loss_rec: [], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        S = tf.shape(input_seq_batch)[0]
        N = tf.shape(input_seq_batch)[1]

        lstm_dim = cfg.MODEL.LSTM_DIM
        # encoder
        cell_encoder = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim, name='c_encoder')
        _, states_encoder = tf.nn.dynamic_rnn(cell_encoder,
                                              rec_inputs,
                                              dtype=tf.float32,
                                              time_major=True)
        # decoder
        cell_decoder = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim, name='c_decoder')
        embed_seq_shifted = tf.concat(
            [tf.zeros_like(embed_seq[:1]), embed_seq[:-1]], axis=0)
        outputs_decoder, _ = tf.nn.dynamic_rnn(
            cell_decoder,
            embed_seq_shifted,
            sequence_length=seq_length_batch,
            initial_state=states_encoder,
            time_major=True)

        # word prediction
        outputs_flat = tf.reshape(outputs_decoder, to_T([S * N, lstm_dim]))
        word_scores_flat = fc('fc_word_scores',
                              outputs_flat,
                              output_dim=num_vocab)
        word_scores = tf.reshape(word_scores_flat, to_T([S, N, num_vocab]))

        # cross-entropy loss over the actual sequence words
        # att_mask: [S, N]
        att_mask = tf.less(tf.range(S)[:, ax], seq_length_batch)
        att_mask = tf.cast(att_mask, tf.float32)
        loss_rec = tf.reduce_sum(
            att_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=word_scores,
                labels=input_seq_batch)) / tf.reduce_sum(att_mask)

    return loss_rec
Example #24
def deeplab_fc8(input_batch, name, apply_dropout=False):
    pool5a = deeplab_pool5(input_batch, name)
    with tf.variable_scope(name):
        fc6 = fc_relu('fc6', pool5a, output_dim=1024)
        if apply_dropout: fc6 = drop(fc6, 0.5)

        fc7 = fc_relu('fc7', fc6, output_dim=1024)
        if apply_dropout: fc7 = drop(fc7, 0.5)
        fc8 = fc('fc8', fc7, output_dim=1000)
        return fc8
Example #25
    def FindModule(self,
                   time_idx,
                   batch_idx,
                   map_dim=500,
                   scope='FindModule',
                   reuse=None):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: image_feat_grid x text_param -> att_grid
        # Input:
        #   image_feat_grid: [N, H, W, D_im]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   1. Elementwise multiplication between image_feat_grid and text_param
        #   2. L2-normalization
        #   3. Linear classification
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image',
                                          image_feat_grid,
                                          output_dim=map_dim)

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))
            # The block below inlines
            # tf.nn.l2_normalize(image_feat_mapped * text_param_mapped, 3)
            # using the underlying math_ops calls.
            x = image_feat_mapped * text_param_mapped
            square_sum = math_ops.reduce_sum(math_ops.square(x),
                                             3,
                                             keep_dims=True)
            x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, 1e-12))
            eltwise_mult = math_ops.multiply(x, x_inv_norm, name=None)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

            # TODO
            # Do we need to take exponential over the scores?
            # No.
            # Does the attention needs to be normalized? (sum up to 1)
            # No, since non-existence should be 0 everywhere

        return att_grid
Example #26
	def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
		if dropout == None:
			dropout = self.dropout

		text_seq_batch = tf.transpose(sen_data, [1, 0])	# input data is [num_steps, batch_size]
		with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
			if self.embed_w is None:
				initializer = tf.contrib.layers.xavier_initializer(uniform=True)
			else:
				initializer = tf.constant_initializer(self.embed_w)
			embedding_mat = tf.get_variable("embedding", [self.vocab_size, self.lstm_dim], tf.float32,
				initializer=initializer)
			# text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
			embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

		# encode phrase based on the last step of hidden states
		outputs, _, _ = bi_lstm('lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
						num_layers=1, forget_bias=1.0, apply_dropout=False,concat_output=False,
						initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

		sen_raw = outputs[-1]
		vis_raw = tf.reshape(vis_data, [self.batch_size*self.num_prop, self.img_feat_size])

		sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
		vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

		sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, 2*self.lstm_dim])	# bi-directional lstm: hidden_size double
		vis_output = tf.reshape(vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

		sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
		feat_concat = tf.concat(3, [sen_tile, vis_output])

		feat_proj_init = msr_init([1, 1, 2*self.lstm_dim+self.img_feat_size, self.hidden_size])
		feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size, weights_initializer=feat_proj_init)
		feat_relu = tf.nn.relu(feat_proj)

		att_conv_init = msr_init([1, 1, self.hidden_size, 5])
		att_conv = conv("att_conv", feat_relu, 1, 1, 5, weights_initializer=att_conv_init)
		att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop, 5])

		att_logits = tf.reshape(att_scores[:, :, 0], [self.batch_size, self.num_prop])
		_, pred_ind = tf.nn.top_k(att_logits, self.top_k)
		pred_ind = tf.reshape(pred_ind, [self.batch_size*self.top_k, 1])
		row_ind = tf.reshape(tf.range(0, self.batch_size), [-1, 1])
		row_ind = tf.reshape(tf.tile(row_ind, [1, self.top_k]), [self.top_k*self.batch_size, 1])
		pred_ind = tf.concat(1, [row_ind, pred_ind])	
		
		# (batch_size*top_k) x img_feat_size
		vis_top = tf.gather_nd(tf.reshape(vis_output, [self.batch_size, self.num_prop, self.img_feat_size]), pred_ind)
		vis_ref = tf.reduce_mean(tf.reshape(vis_top, [self.batch_size, self.top_k, self.img_feat_size]), 1)
		ref_feat = tf.concat(1, [vis_ref, sen_bn])
		# ref_feat = vis_ref
		reward_pred = tf.reshape(tf.sigmoid(fc('reward_pred', ref_feat, 1)),[self.batch_size])

		return att_scores, reward_pred
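Note that this example (and some of the later text_objseg snippets) uses the pre-TF-1.0 argument order tf.concat(axis, values), while the other examples use the TF >= 1.0 order tf.concat(values, axis). A minimal illustration of the newer form:

import tensorflow as tf

a = tf.zeros([2, 3])
b = tf.ones([2, 3])
# pre-1.0 order, as used above:  tf.concat(1, [a, b])
# TF >= 1.0 equivalent:
c = tf.concat([a, b], axis=1)    # shape [2, 6]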
Example #27
    def DescribeTwo(self,
                    att_stack,
                    stack_ptr,
                    mem_in,
                    c_i,
                    scope='DescribeTwo',
                    reuse=None):
        """
        Describe using two input attentions. Outputs zero attention.
        """
        with tf.variable_scope(scope, reuse=reuse):
            # Update memory:
            #   1) linearly map the controller vectors to the KB dimension
            #   2) extract attended features from the two input attentions
            #   3) elementwise multiplication
            #   4) linearly merge with the controller vector and previous
            #      memory vector to get the new memory vector

            att_stack_old, stack_ptr_old = att_stack, stack_ptr  # make a copy
            # Pop from stack
            att_in_2 = _read_from_stack(att_stack, stack_ptr)
            stack_ptr = _move_ptr_bw(stack_ptr)
            att_in_1 = _read_from_stack(att_stack, stack_ptr)
            # stack_ptr = _move_ptr_bw(stack_ptr)  # cancel-out below

            c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
            kb_att_in_1 = _extract_softmax_avg(self.kb_batch, att_in_1)
            kb_att_in_2 = _extract_softmax_avg(self.kb_batch, att_in_2)
            elt_prod = tf.nn.l2_normalize(c_mapped * kb_att_in_1 * kb_att_in_2,
                                          axis=-1)
            mem_out = fc('fc_mem_out',
                         tf.concat([c_i, mem_in, elt_prod], axis=1),
                         output_dim=self.mem_dim)

            # Push to stack
            # stack_ptr = _move_ptr_fw(stack_ptr)  # cancel-out above
            att_stack = _write_to_stack(att_stack, stack_ptr, self.att_zero)

            if cfg.MODEL.NMN.DESCRIBE_TWO.KEEP_STACK:
                att_stack, stack_ptr = att_stack_old, stack_ptr_old

        return att_stack, stack_ptr, mem_out
Example #28
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
Example #29
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=2)
    return outputs
Example #30
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
        name="CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

            return sim_score_mat_train, sim_score_mat_test
Example #31
    def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test,
                              sentence_ph_train_len, sentence_ph_test_len):

        name="CTRL_Model"
        with tf.variable_scope(name):
            print("Building training network...............................\n")
            transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size) 
            transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)

            if self.useLSTM:
                sentence_embed_train = self.lstm_embed(sentence_embed_train, sentence_ph_train_len)

            transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
            transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)  
            cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
            sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
            sim_score_mat_train = tf.reshape(sim_score_mat_train,[self.batch_size, self.batch_size, 3])

            tf.get_variable_scope().reuse_variables()
            print("Building test network...............................\n")
            transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
            transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)

            if self.useLSTM:
                sentence_embed_test = self.lstm_embed(sentence_embed_test, sentence_ph_test_len)
            transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
            transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)

            cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
            sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test = tf.reshape(sim_score_mat_test, [self.test_batch_size, self.test_batch_size, 3])

            cross_modal_vec_test_1 = self.cross_modal_comb(tf.reshape(transformed_clip_test_norm[1], shape=(1,1024)),
                                                           tf.reshape(transformed_sentence_test_norm[1], shape=(1,1024)), 1)
            sim_score_mat_test_1 = vs_multilayer.vs_multilayer(cross_modal_vec_test_1, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
            sim_score_mat_test_1 = tf.reshape(sim_score_mat_test_1, [3])
            return sim_score_mat_train, sim_score_mat_test, sim_score_mat_test_1
Example #32
    def instantiate_batch(self, inputs):
        """
    Inputs:
      output from the previous modules
      image feature for the example
      text attention for all modules for the example
      time id for current module
    """
        vis_att, img_feat, text_att = inputs

        # text feature dimension, intermediate mapping dimension
        # batch size, image feature height and width
        text_dim = text_att.shape.as_list()[-1]
        map_dim = self._params['map_dim']
        encode_size = self._params['encode_size']
        N = tf.shape(img_feat)[0]
        H, W = img_feat.shape.as_list()[1:3]

        with tf.variable_scope(self._module_scope):
            with tf.variable_scope(self._scope, reuse=self._reuse):
                text_map = fc('fc_text', text_att, output_dim=map_dim)
                # nonlinearity
                text_map = tf.nn.relu(text_map)

                # att_feats has shape [N, D_vis]
                att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
                img_map = tf.reshape(
                    fc('fc_att', att_feats, output_dim=map_dim), [N, map_dim])
                # nonlinearity
                img_map = tf.nn.relu(img_map)

                eltwise_mult = tf.nn.l2_normalize(img_map * text_map, 1)
                context = fc('fc_eltwise',
                             eltwise_mult,
                             output_dim=encode_size)

        return [context]
Example #33
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
    embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):

    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(1, [tf.nn.l2_normalize(feat_lang, 1),
                             tf.nn.l2_normalize(feat_vis, 1),
                             spatial_batch])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
Example #34
lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

# Local image feature
fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

# L2-normalize the features (except for spatial_batch)
# and concatenate them along axis 1 (feature dimension)
feat_all = tf.concat(1, [tf.nn.l2_normalize(lstm_top_batch, 1),
                         tf.nn.l2_normalize(fc8_crop_batch, 1),
                         spatial_batch])

# Outputs
# MLP Classifier over concatenate feature
with tf.variable_scope('classifier'):
    mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
    mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
scores = mlp_l2

# Load pretrained model
snapshot_saver = tf.train.Saver()
sess = tf.Session()
snapshot_saver.restore(sess, pretrained_model)

################################################################################
# Load annotations and bounding box proposals
################################################################################

query_dict = json.load(open(query_file))
bbox_dict = json.load(open(bbox_file))
imcrop_dict = json.load(open(imcrop_file))
imsize_dict = json.load(open(imsize_file))
Example #35
def vgg_fc8(input_batch, name, apply_dropout, output_dim=1000):
    fc7 = vgg_fc7(input_batch, name, apply_dropout)
    with tf.variable_scope(name):
        # layer 8 (no ReLU after fc8)
        fc8 = fc('fc8', fc7, output_dim=output_dim)
        return fc8