Example #1
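Note: the snippets in this collection are graph-mode TensorFlow 1.x code, and they rely on a handful of project helpers that are not reproduced here (`to_T`, `fc`, `fc_relu`, `_1x1_conv`, `_conv`, `_get_lstm_cell`, ...). For orientation only, a minimal sketch of the assumed imports and of an `fc` fully connected helper consistent with how it is called below (the real definitions may differ):

import numpy as np
import tensorflow as tf
# to_T is most likely just an alias that turns (possibly dynamic) shape lists
# into tensors, e.g.:
from tensorflow import convert_to_tensor as to_T

def fc(name, bottom, output_dim, reuse=None):
    # assumed fully connected helper: bottom is already flattened to
    # [batch, input_dim] by the caller
    input_dim = bottom.get_shape().as_list()[-1]
    with tf.variable_scope(name, reuse=reuse):
        weights = tf.get_variable(
            'weights', [input_dim, output_dim],
            initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable(
            'biases', output_dim, initializer=tf.constant_initializer(0.))
        return tf.nn.xw_plus_b(bottom, weights, biases)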
  def _build_encoder(self, input_seq_batch, seq_len_batch, scope='encoder',
    reuse=None):
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.encoder_dropout

    with tf.variable_scope(scope, reuse=reuse):
      #T = tf.shape(input_seq_batch)[0]
      T = input_seq_batch.shape.as_list()[0]
      N = tf.shape(input_seq_batch)[1]
      self.T_encoder = T
      self.N = N
      with tf.variable_scope(self.embed_scope, reuse=True):
        embedding_mat = tf.get_variable('embed_mat', [self.encoder_num_vocab,
                                                      self.encoder_embed_dim])
      # input_seq_batch has shape [T, N] and embedded_seq has shape [T, N, D].
      embedded_seq = tf.nn.embedding_lookup(embedding_mat, input_seq_batch)
      self.embedded_input_seq = embedded_seq

      # The RNN
      cell = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)

      # encoder_outputs has shape [T, N, lstm_dim]
      encoder_outputs, encoder_states = tf.nn.dynamic_rnn(cell, embedded_seq,
                                                          seq_len_batch,
                                                          dtype=tf.float32,
                                                          time_major=True,
                                                          scope='lstm')
      self.encoder_outputs = encoder_outputs
      self.encoder_states = encoder_states

      # check if the word-vector ('wv') flag is set
      if self.params['use_word_vectors']:
        # use the word embeddings (instead of the LSTM outputs) for the later
        # attention alignments; the fc input is flattened to [T*N, embed_dim]
        encoder_h_transformed = fc(
            'encoder_h_transform',
            tf.reshape(embedded_seq, [-1, self.encoder_embed_dim]),
            output_dim=lstm_dim)
      else:
        # transform the LSTM outputs for the later attention alignments;
        # the fc input is flattened to [T*N, lstm_dim]
        encoder_h_transformed = fc(
            'encoder_h_transform',
            tf.reshape(encoder_outputs, [-1, lstm_dim]),
            output_dim=lstm_dim)

      # reshape back to [T, N, lstm_dim]
      encoder_h_transformed = tf.reshape(encoder_h_transformed,
                                         to_T([T, N, lstm_dim]))
      self.encoder_h_transformed = encoder_h_transformed

      # seq_not_finished has shape [T, N, 1], where seq_not_finished[t, n]
      # is 1 iff sequence n is not finished at time t, and 0 otherwise
      seq_not_finished = tf.less(tf.range(T)[:, tf.newaxis, tf.newaxis],
                                 seq_len_batch[:, tf.newaxis])
      seq_not_finished = tf.cast(seq_not_finished, tf.float32)
      self.seq_not_finished = seq_not_finished
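The encoder above also depends on a `_get_lstm_cell` helper that is not shown in this example. A plausible sketch, assuming it simply stacks `num_layers` LSTM cells and optionally wraps them with output dropout (the real helper may differ in wrapper details and keep probability):

def _get_lstm_cell(num_layers, lstm_dim, apply_dropout, keep_prob=0.8):
    # assumed helper: build a (possibly dropout-wrapped) multi-layer LSTM cell
    cells = []
    for _ in range(num_layers):
        cell = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        if apply_dropout:
            cell = tf.nn.rnn_cell.DropoutWrapper(cell,
                                                 output_keep_prob=keep_prob)
        cells.append(cell)
    return tf.nn.rnn_cell.MultiRNNCell(cells, state_is_tuple=True)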
Example #2
    def EqualNumModule(self,
                       input_0,
                       input_1,
                       time_idx,
                       batch_idx,
                       scope='EqualNumModule',
                       reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # Mapping: att_grid x att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        #   input_1: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. linear transform of the attention map (also including max and min)
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                att_shape = tf.shape(input_0)

                H, W = self.att_shape[1:3]
                att_all_0 = tf.reshape(input_0, to_T([-1, H * W]))
                att_min_0 = tf.reduce_min(input_0, axis=[1, 2])
                att_max_0 = tf.reduce_max(input_0, axis=[1, 2])
                att_all_1 = tf.reshape(input_1, to_T([-1, H * W]))
                att_min_1 = tf.reduce_min(input_1, axis=[1, 2])
                att_max_1 = tf.reduce_max(input_1, axis=[1, 2])
                # att_concat has shape [N, 2*H*W + 4] (both full attention
                # maps plus the min and max of each)
                att_concat = tf.concat([
                    att_all_0, att_min_0, att_max_0, att_all_1, att_min_1,
                    att_max_1
                ], axis=1)
                #scores = fc_relu('fc_scores', att_concat, output_dim=self.num_choices)
                scores = fc('fc_scores',
                            att_concat,
                            output_dim=self.num_choices)

        return scores
Example #3
def localization_module_batch_score(vis_feat, spatial_feat, lang_feat,
    scope="localization_module", reuse=None):
    # Input:
    #   vis_feat: [N_batch, N_vis, D_vis]
    #   spatial_feat: [N_batch, N_vis, D_spatial]
    #   lang_feat: [N_batch, D_lang]
    # Output:
    #   localization_scores: [N_batch, N_vis, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.

    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        N_batch = tf.shape(vis_feat)[0]
        N_vis = tf.shape(vis_feat)[1]
        D_vis = vis_feat.get_shape().as_list()[-1]
        D_spatial = spatial_feat.get_shape().as_list()[-1]
        D_lang = lang_feat.get_shape().as_list()[-1]

        # flatten the visual and spatial features and embed them to the same
        # dimension as the language feature
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=2)
        vis_spatial_feat = tf.reshape(vis_spatial_feat, [-1, D_vis+D_spatial])
        vis_spatial_embed = fc('vis_spatial_embed', vis_spatial_feat, output_dim=D_lang)

        # Reshape visual feature and language feature for broadcast multiplication
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        vis_spatial_embed = tf.reshape(vis_spatial_embed, to_T([N_batch, -1, D_lang]))

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 2)
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Localization scores as a linear classification over the l2-normalized feature
        localization_scores = fc('localization_scores', eltwise_mult, output_dim=1)
        localization_scores = tf.reshape(localization_scores, to_T([N_batch, N_vis, 1]))

    return localization_scores
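A hypothetical call, just to make the expected shapes concrete (the placeholder dimensions below are illustrative and not taken from the original code):

vis_feat = tf.placeholder(tf.float32, [None, None, 4096])    # [N_batch, N_vis, D_vis]
spatial_feat = tf.placeholder(tf.float32, [None, None, 8])   # [N_batch, N_vis, D_spatial]
lang_feat = tf.placeholder(tf.float32, [None, 1000])         # [N_batch, D_lang]
localization_scores = localization_module_batch_score(vis_feat, spatial_feat,
                                                      lang_feat)
# localization_scores: [N_batch, N_vis, 1]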
Example #4
    def SceneModule(self, time_idx, batch_idx, pos_val=3, scope='SceneModule',
        reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # Mapping: None -> att_grid
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   1. Just output a positive attention everywhere
        N = tf.shape(time_idx)[0]
        att_grid = pos_val*tf.ones(to_T([N]+self.att_shape[1:]))
        return att_grid
Example #5
def build_input_unit(input_seq_batch,
                     seq_length_batch,
                     num_vocab,
                     scope='input_unit',
                     reuse=None):
    """
    Preprocess the input sequence with a (single-layer) bidirectional LSTM.

    Input:
        input_seq_batch: [S, N], tf.int32
        seq_length_batch: [N], tf.int32
    Return:
        lstm_seq: [S, N, d], tf.float32
        q_encoding: [N, d], tf.float32
        embed_seq: [S, N, e], tf.float32
    """

    with tf.variable_scope(scope, reuse=reuse):
        # word embedding
        embed_dim = cfg.MODEL.EMBED_DIM
        if cfg.USE_FIXED_WORD_EMBED:
            embed_mat = to_T(np.load(cfg.FIXED_WORD_EMBED_FILE))
        else:
            embed_mat = tf.get_variable(
                'embed_mat', [num_vocab, embed_dim],
                initializer=tf.initializers.random_normal(
                    stddev=np.sqrt(1. / embed_dim)))
        embed_seq = tf.nn.embedding_lookup(embed_mat, input_seq_batch)

        # bidirectional LSTM
        lstm_dim = cfg.MODEL.LSTM_DIM
        assert lstm_dim % 2 == 0, \
            'lstm_dim is the dimension of [fw, bw] and must be a multiple of 2'
        cell_fw = tf.nn.rnn_cell.LSTMCell(lstm_dim // 2,
                                          name='basic_lstm_cell')
        cell_bw = tf.nn.rnn_cell.LSTMCell(lstm_dim // 2,
                                          name='basic_lstm_cell')
        outputs, states = tf.nn.bidirectional_dynamic_rnn(
            cell_fw,
            cell_bw,
            embed_seq,
            dtype=tf.float32,
            sequence_length=seq_length_batch,
            time_major=True)
        # concatenate the hidden state from forward and backward LSTM
        lstm_seq = tf.concat(outputs, axis=2)
        # concatenate the final hidden state of the forward and backward LSTM
        # for question representation
        q_encoding = tf.concat([states[0].h, states[1].h], axis=1)

    return lstm_seq, q_encoding, embed_seq
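A minimal usage sketch, assuming the config values (`cfg.MODEL.EMBED_DIM`, `cfg.MODEL.LSTM_DIM`, ...) are already loaded; the vocabulary size is an arbitrary example:

input_seq_batch = tf.placeholder(tf.int32, [None, None])   # [S, N], time-major
seq_length_batch = tf.placeholder(tf.int32, [None])        # [N]
lstm_seq, q_encoding, embed_seq = build_input_unit(
    input_seq_batch, seq_length_batch, num_vocab=5000)
# lstm_seq: [S, N, lstm_dim], q_encoding: [N, lstm_dim], embed_seq: [S, N, embed_dim]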
Example #6
    def _build_encoder(self,
                       input_seq_batch,
                       seq_length_batch,
                       scope='encoder',
                       reuse=None):
        lstm_dim = self.lstm_dim
        num_layers = self.num_layers
        apply_dropout = self.encoder_dropout

        with tf.variable_scope(scope, reuse=reuse):
            self.T_encoder = tf.shape(input_seq_batch)[0]
            self.N = tf.shape(input_seq_batch)[1]

            # Step 1: Embedding the input seq
            embedding_mat = tf.get_variable(
                'embedding_mat',
                [self.encoder_num_vocab, self.encoder_embed_dim])
            # input_seq_batch has shape [T, N] and embedded_input_seq has shape [T, N, D].
            # now apply the embedding to input seq batch
            self.embedded_input_seq = tf.nn.embedding_lookup(
                embedding_mat, input_seq_batch)

            # Step 2: Build the RNN(LSTM)
            cell_layers = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)
            # encoder_outputs has shape [T, N, lstm_dim]
            encoder_outputs, self.encoder_states = tf.nn.dynamic_rnn(
                cell_layers,
                self.embedded_input_seq,
                seq_length_batch,
                dtype=tf.float32,
                time_major=True,
                scope='lstm')
            self.encoder_outputs = encoder_outputs

            # Step 3: Flatten the outputs
            # adjust the encoder outputs size to batch-like data for decoder usage
            encoder_h_transformed = fc('encoder_h_transform',
                                       tf.reshape(encoder_outputs,
                                                  [-1, lstm_dim]),
                                       output_dim=lstm_dim)
            # reshape the flattened encoder to [T, N, lstm_dim]
            self.encoder_h_transformed = tf.reshape(
                encoder_h_transformed, to_T([self.T_encoder, self.N,
                                             lstm_dim]))

            # seq_not_finished is a shape [T, N, 1] tensor, where seq_not_finished[t, n]
            # is 1 iff sequence n is not finished at time t, and 0 otherwise
            seq_not_finished = tf.less(
                tf.range(self.T_encoder)[:, tf.newaxis, tf.newaxis],
                seq_length_batch[:, tf.newaxis])
            self.seq_not_finished = tf.cast(seq_not_finished, tf.float32)
Example #7
    def __init__(self,
                 input_seq_batch,
                 seq_length_batch,
                 T_decoder,
                 num_vocab_txt,
                 embed_dim_txt,
                 num_vocab_nmn,
                 embed_dim_nmn,
                 lstm_dim,
                 num_layers,
                 assembler,
                 encoder_dropout,
                 decoder_dropout,
                 decoder_sampling,
                 use_gt_layout=None,
                 gt_layout_batch=None,
                 scope='encoder_decoder',
                 reuse=None):
        self.T_decoder = T_decoder
        self.encoder_num_vocab = num_vocab_txt
        self.encoder_embed_dim = embed_dim_txt
        self.decoder_num_vocab = num_vocab_nmn
        self.decoder_embed_dim = embed_dim_nmn
        self.lstm_dim = lstm_dim
        self.num_layers = num_layers
        self.EOS_token = assembler.EOS_idx
        # decoding transition variables
        self.P = to_T(assembler.P, dtype=tf.int32)
        self.W = to_T(assembler.W, dtype=tf.int32)
        self.b = to_T(assembler.b, dtype=tf.int32)

        self.encoder_dropout = encoder_dropout
        self.decoder_dropout = decoder_dropout
        self.decoder_sampling = decoder_sampling

        with tf.variable_scope(scope, reuse=reuse):
            self._build_encoder(input_seq_batch, seq_length_batch)
            self._build_decoder(use_gt_layout, gt_layout_batch)
Example #8
def add_spatial_coord_map(image_feat_grid):
    image_feat_shape = tf.shape(image_feat_grid)
    N = image_feat_shape[0]
    # static dimensions
    #H = image_feat_shape[1]
    #W = image_feat_shape[2]
    H, W = image_feat_grid.shape.as_list()[1:3]
    x_map = tf.tile(tf.reshape(tf.linspace(-1., 1., W), [1, 1, -1, 1]),
                    to_T([N, H, 1, 1]))
    y_map = tf.tile(tf.reshape(tf.linspace(-1., 1., H), [1, -1, 1, 1]),
                    to_T([N, 1, W, 1]))

    # stop gradient on coords_map (needed to fix the tile grad error on TF 1.0.0)
    coords_map = tf.stop_gradient(tf.concat([x_map, y_map], axis=3))
    image_feat_with_coords = tf.concat([image_feat_grid, coords_map], axis=3)
    # set shapes of the new feature maps
    image_feat_static_shape = image_feat_grid.get_shape().as_list()
    image_feat_static_shape[3] += 2
    image_feat_with_coords.set_shape(image_feat_static_shape)
    image_feat_static_shape[3] = 2
    coords_map.set_shape(image_feat_static_shape)

    return image_feat_with_coords, coords_map
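For context, a hypothetical call on a 14x14 conv-feature grid (the spatial size and channel count are only examples): the function appends a 2-channel normalized (x, y) coordinate map, so the channel dimension grows by two.

image_feat_grid = tf.placeholder(tf.float32, [None, 14, 14, 512])
feat_with_coords, coords_map = add_spatial_coord_map(image_feat_grid)
# feat_with_coords: [N, 14, 14, 514], coords_map: [N, 14, 14, 2]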
Example #9
  def __init__(self, holders, use_gt_prog, assembler, params, reuse=None):
    self.T_decoder = params['max_dec_len']
    self.encoder_num_vocab = params['text_vocab_size']
    self.encoder_embed_dim = params['text_embed_size']
    self.decoder_num_vocab = params['prog_vocab_size']
    self.decoder_embed_dim = params['prog_embed_size']
    self.lstm_dim = params['lstm_size']
    self.num_layers = params['num_layers']
    self.EOS_token = assembler.EOS_idx
    self.embed_scope = params['embed_scope']
    self.temperature = params.get('temperature', 1)

    # whether to use word vectors (rather than LSTM outputs) for attention
    params['use_word_vectors'] = 'wv-att' in params['model']
    params['generator'] = params.get('generator', 'ques')
    self.params = params

    # decoding transition variables
    self.P = to_T(assembler.P, dtype=tf.int32)
    self.W = to_T(assembler.W, dtype=tf.int32)
    self.b = to_T(assembler.b, dtype=tf.int32)

    self.encoder_dropout = params['enc_dropout']
    self.decoder_dropout = params['dec_dropout']
    self.decoder_sampling = params['dec_sampling']

    # detect fake inputs
    if 'fake' in holders: scope = 'enc_dec_cap'
    else: scope = 'enc_dec'
    with tf.variable_scope(scope, reuse=reuse):
      # build a special encoder, if needed
      if 'fake' not in holders and params['generator'] == 'mem':
        self._build_memory_encoder(holders)
      else:
        # build a normal encoder
        self._build_encoder(holders['ques'], holders['ques_len'])
      self._build_decoder(use_gt_prog, holders['prog_gt'])
Example #10
    def FindModule(self,
                   time_idx,
                   batch_idx,
                   map_dim=500,
                   scope='FindModule',
                   reuse=None):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: image_feat_grid x text_param -> att_grid
        # Input:
        #   image_feat_grid: [N, H, W, D_im]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   1. Elementwise multiplication between image_feat_grid and text_param
        #   2. L2-normalization
        #   3. Linear classification
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image',
                                          image_feat_grid,
                                          output_dim=map_dim)

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

            # TODO
            # Do we need to take exponential over the scores?
            # No.
            # Does the attention needs to be normalized? (sum up to 1)
            # No, since non-existence should be 0 everywhere

        return att_grid
Example #11
    def FindModule(self,
                   time_idx,
                   batch_idx,
                   map_dim=1024,
                   scope='FindModule',
                   reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: image_feat_grid x text_param -> att_grid
        # Input:
        #   image_feat_grid: [N, H, W, D_im]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   1. Elementwise multiplication between image_feat_grid and text_param
        #   2. L2-normalization
        #   3. Linear classification
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                # image_feat_mapped has shape [N, H, W, map_dim]
                image_feat_mapped = _1x1_conv('conv_image',
                                              image_feat_grid,
                                              output_dim=map_dim)

                text_param_mapped = fc('fc_text',
                                       text_param,
                                       output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped,
                                               to_T([N, 1, 1, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(
                    image_feat_mapped * text_param_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Example #12
    def bbox_offset_loss(self, bbox_ind_batch, bbox_offset_batch):
        if cfg.MODEL.BBOX_REG_AS_FCN:
            N = tf.shape(self.bbox_offset_fcn)[0]
            B = tf.shape(self.bbox_offset_fcn)[1]  # B = H*W
            bbox_offset_flat = tf.reshape(self.bbox_offset_fcn,
                                          to_T([N * B, 4]))
            slice_inds = tf.range(N) * B + bbox_ind_batch
            bbox_offset_sliced = tf.gather(bbox_offset_flat, slice_inds)
            loss_bbox_offset = tf.reduce_mean(
                tf.squared_difference(bbox_offset_sliced, bbox_offset_batch))
        else:
            loss_bbox_offset = tf.reduce_mean(
                tf.squared_difference(self.bbox_offset, bbox_offset_batch))

        return loss_bbox_offset
Example #13
    def TransformModule(self,
                        input_0,
                        time_idx,
                        batch_idx,
                        kernel_size=3,
                        map_dim=500,
                        scope='TransformModule',
                        reuse=None):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        att_grid = input_0
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x text_param -> att_grid
        # Input:
        #   att_grid: [N, H, W, 1]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid_transformed: [N, H, W, 1]
        #
        # Implementation:
        #   Convolutional layer that also involve text_param
        #   A 'soft' convolutional kernel that is modulated by text_param
        with tf.variable_scope(scope, reuse=reuse):
            att_shape = tf.shape(att_grid)
            N = att_shape[0]
            H = att_shape[1]
            W = att_shape[2]
            att_maps = _conv('conv_maps',
                             att_grid,
                             kernel_size=kernel_size,
                             stride=1,
                             output_dim=map_dim)

            text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            # manual l2-normalization along the channel axis, equivalent to
            # tf.nn.l2_normalize(att_maps * text_param_mapped, 3)
            x = att_maps * text_param_mapped
            square_sum = tf.reduce_sum(tf.square(x), 3, keep_dims=True)
            x_inv_norm = tf.rsqrt(tf.maximum(square_sum, 1e-12))
            eltwise_mult = tf.multiply(x, x_inv_norm)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        return att_grid
Example #14
    def TransformModule(self,
                        input_0,
                        time_idx,
                        batch_idx,
                        kernel_size=5,
                        map_dim=250,
                        scope='TransformModule',
                        reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x text_param -> att_grid
        # Input:
        #   input_0: [N, H, W, 1]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation:
        #   Convolutional layer that also involve text_param
        #   A 'soft' convolutional kernel that is modulated by text_param
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                att_shape = tf.shape(input_0)
                N = att_shape[0]
                H = att_shape[1]
                W = att_shape[2]
                att_maps = _conv('conv_maps',
                                 input_0,
                                 kernel_size=kernel_size,
                                 stride=1,
                                 output_dim=map_dim)

                text_param_mapped = fc('text_fc',
                                       text_param,
                                       output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped,
                                               to_T([N, 1, 1, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(att_maps * text_param_mapped,
                                                  3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Example #15
def _move_ptr_bw(stack_ptr):
    """
    Move the stack pointer backward (i.e. to pop from stack).
    """
    # Note: in TF, conv1d is implemented as cross-correlation (instead of
    # mathematical convolution), so no flipping of the filter is needed.
    filter_bw = to_T(np.array([0, 0, 1], np.float32).reshape((3, 1, 1)))
    new_stack_ptr = tf.squeeze(tf.nn.conv1d(stack_ptr[..., ax], filter_bw, 1,
                                            'SAME'),
                               axis=[2])
    # when the stack pointer is already at the stack bottom, keep
    # the pointer in the same location (otherwise the pointer will be all zero)
    if cfg.MODEL.NMN.STACK.GUARD_STACK_PTR:
        stack_len = cfg.MODEL.NMN.STACK.LENGTH
        stack_bottom_mask = tf.one_hot(0, stack_len)
        new_stack_ptr += stack_bottom_mask * stack_ptr
    return new_stack_ptr
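The matching forward move (the push direction) is not part of this example. A sketch under the same conventions (reusing the `to_T`, `ax`, and `cfg` names from the code above), assumed rather than copied from the source, flips the filter and guards the stack top instead of the bottom:

def _move_ptr_fw(stack_ptr):
    """
    Move the (soft, one-hot) stack pointer forward (i.e. to push onto the stack).
    """
    # the filter [1, 0, 0] shifts a one-hot pointer from position p to p + 1
    filter_fw = to_T(np.array([1, 0, 0], np.float32).reshape((3, 1, 1)))
    new_stack_ptr = tf.squeeze(
        tf.nn.conv1d(stack_ptr[..., ax], filter_fw, 1, 'SAME'), axis=[2])
    # when the pointer is already at the stack top, keep it there
    if cfg.MODEL.NMN.STACK.GUARD_STACK_PTR:
        stack_len = cfg.MODEL.NMN.STACK.LENGTH
        stack_top_mask = tf.one_hot(stack_len - 1, stack_len)
        new_stack_ptr += stack_top_mask * stack_ptr
    return new_stack_ptr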
Example #16
    def SamePropertyModule(self, input_0, input_1, time_idx, batch_idx,
        map_dim=250, scope='SamePropertyModule', reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        #   input_1: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. Extract visual features using the input attention map, and
        #      linear transform to map_dim
        #   2. linear transform language features to map_dim
        #   3. Convolve image features to map_dim
        #   4. Element-wise multiplication of the three, l2_normalize, linear transform.
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)

                att_softmax_0 = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                    to_T([N, H, W, 1]))
                att_softmax_1 = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_1, to_T([N, H*W]))),
                    to_T([N, H, W, 1]))
                # att_feat_0, att_feat_1 has shape [N, D_vis]
                att_feat_0 = tf.reduce_sum(image_feat_grid * att_softmax_0, axis=[1, 2])
                att_feat_1 = tf.reduce_sum(image_feat_grid * att_softmax_1, axis=[1, 2])
                att_feat_mapped_0 = tf.reshape(
                    fc('fc_att_0', att_feat_0, output_dim=map_dim),
                    to_T([N, map_dim]))
                att_feat_mapped_1 = tf.reshape(
                    fc('fc_att_1', att_feat_1, output_dim=map_dim),
                    to_T([N, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(
                    att_feat_mapped_0 * text_param_mapped * att_feat_mapped_1, 1)
                scores = fc('fc_eltwise', eltwise_mult, output_dim=self.num_choices)

        return scores
Example #17
def empty_safe_1x1_conv(name, bottom, output_dim, reuse=None):
    # a 1x1 convolution implemented with reshape + matmul; use this in modules
    # to avoid crashes that tf.nn.conv2d can hit on empty (zero-size) batches.
    bottom_shape = tf.shape(bottom)
    input_dim = bottom.get_shape().as_list()[-1]

    # weights and biases variables
    with tf.variable_scope(name, reuse=reuse):
        # initialize the variables
        weights_initializer = tf.contrib.layers.xavier_initializer()
        biases_initializer = tf.constant_initializer(0.)
        weights = tf.get_variable('weights', [input_dim, output_dim],
            initializer=weights_initializer)
        biases = tf.get_variable('biases', output_dim,
            initializer=biases_initializer)

        conv_flat = tf.matmul(tf.reshape(bottom, [-1, input_dim]), weights) + biases
        conv = tf.reshape(conv_flat, to_T([bottom_shape[0], bottom_shape[1], bottom_shape[2], output_dim]))

    return conv
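A hypothetical call on a feature grid, to show that the helper behaves like a 1x1 convolution over a [N, H, W, C] input while using only reshape and matmul ops:

feat = tf.placeholder(tf.float32, [None, 14, 14, 512])
out = empty_safe_1x1_conv('conv_1x1', feat, output_dim=128)
# out: [N, 14, 14, 128]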
Example #18
def refgoog_attbilstm_net(input_batch, bbox_batch, spatial_batch, expr_obj,
    num_vocab, embed_dim, lstm_dim, vgg_dropout, lstm_dropout):
    #   bbox_batch has shape [N_box, 5] and
    #   spatial_batch has shape [N_box, D_spatial] and
    #   expr_obj has shape [T, N_batch]

    N_batch = tf.shape(expr_obj)[1]
    N_box = tf.shape(spatial_batch)[0]

    # Extract visual features
    vis_feat = fastrcnn_vgg_net.vgg_roi_fc7(input_batch, bbox_batch,
        "vgg_local", apply_dropout=vgg_dropout)
    D_vis = vis_feat.get_shape().as_list()[-1]

    # Extract representation using attention
    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel  = lstm_net.attbilstm(
        expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim,
        lstm_dim=lstm_dim, apply_dropout=lstm_dropout)

    # Score for each bounding box matching the first object
    # scores_obj1 has shape [N_batch, N_box, 1]
    scores_obj1 = modules.localization_module_grid_score(vis_feat,
        spatial_batch, lang_obj1)
    # Score for each bounding box matching the second object
    # scores_obj2 has shape [N_batch, N_box, 1]
    scores_obj2 = modules.localization_module_grid_score(vis_feat,
        spatial_batch, lang_obj2, reuse=True)

    # Scores for each pair of bounding box matching the relationship
    # Tile the scores by broadcasting add
    # scores_rel has shape [N_batch, N_box, N_box, 1]
    scores_rel = modules.relationship_module_spatial_only_grid_score(
        spatial_batch, scores_obj1, spatial_batch, scores_obj2, lang_relation,
        rescale_scores=True)
    tf.add_to_collection("s_pair", scores_rel)

    # marginal_scores has shape [N_batch, N_box, 1]
    marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))

    return final_scores, probs_obj1, probs_obj2, probs_rel
Example #19
    def instantiate_batch(self, inputs):
        """
      Inputs:
        image feature for the example
        text attention for all modules for the example
        time id for current module
    """
        vis_att, img_feat, _ = inputs
        encode_size = self._params['encode_size']

        with tf.variable_scope(self._module_scope):
            with tf.variable_scope(self._scope, reuse=self._reuse):

                H, W = img_feat.shape.as_list()[1:3]
                att_all = tf.reshape(vis_att, to_T([-1, H * W]))
                att_min = tf.reduce_min(vis_att, axis=[1, 2])
                att_max = tf.reduce_max(vis_att, axis=[1, 2])
                # att_concat has shape [N, H*W + 2]
                att_concat = tf.concat([att_all, att_min, att_max], axis=1)
                context = fc('fc_scores', att_concat, output_dim=encode_size)

        return [context]
Example #20
def refgoog_retrieval_baseline(vis_feat, spatial_batch, expr_obj,
    num_vocab, embed_dim, lstm_dim):
    
    N_batch = tf.shape(expr_obj)[1]
    N_box = tf.shape(spatial_batch)[0]

    D_vis = vis_feat.get_shape().as_list()[-1]

    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = lstm_net.attbilstm(
        expr_obj, "lstm", num_vocab=num_vocab, embed_dim=embed_dim, lstm_dim=lstm_dim,
        apply_dropout=False)

    scores_obj1 = modules.localization_module_grid_score(vis_feat, spatial_batch, lang_obj1)
    scores_obj2 = modules.localization_module_grid_score(vis_feat, spatial_batch, lang_obj2, reuse=True)
    scores_rel = modules.relationship_module_spatial_only_grid_score(
        spatial_batch, scores_obj1, spatial_batch, scores_obj2, lang_relation,
        rescale_scores=True)

    marginal_scores = tf.reduce_max(scores_rel, reduction_indices=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))

    return final_scores
Example #21
    def CountModule(self,
                    input_0,
                    time_idx,
                    batch_idx,
                    scope='CountModule',
                    kernel_size=5,
                    map_dim=300,
                    reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        # Mapping: att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. linear transform of the attention map (also including max and min)
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                H, W = self.att_shape[1:3]
                att_all = tf.reshape(input_0, to_T([-1, H * W]))
                att_min = tf.reduce_min(input_0, axis=[1, 2])
                att_max = tf.reduce_max(input_0, axis=[1, 2])
                # att_concat has shape [N, H*W + 2]
                att_concat = tf.concat([att_all, att_min, att_max], axis=1)
                scores = fc('fc_scores',
                            att_concat,
                            output_dim=self.num_choices)
                # att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,stride=1, output_dim=map_dim)
                # att_grid = _1x1_conv("conv_eltwise",att_maps,output_dim=1)
                # att_grid.set_shape(self.att_shape)
                # att_shape = tf.shape(att_grid)
                # H, W = self.att_shape[1:3]
                # att_all = tf.reshape(att_grid, to_T([-1, H*W]))
                # scores = fc('fc_scores', att_all, output_dim=self.num_choices)

        return scores
Example #22
    def __init__(self, image_feat_grid, word_vecs, num_choices):
        self.image_feat_grid = image_feat_grid
        self.word_vecs = word_vecs
        self.num_choices = num_choices
        # Capture the variable scope for creating all variables
        with tf.variable_scope('module_variables') as module_variable_scope:
            self.module_variable_scope = module_variable_scope
        # Flatten word vecs for efficient slicing
        # word_vecs has shape [T_decoder, N, D]
        word_vecs_shape = tf.shape(word_vecs)
        T_full = word_vecs_shape[0]
        self.N_full = word_vecs_shape[1]
        D_word = word_vecs.get_shape().as_list()[-1]
        self.word_vecs_flat = tf.reshape(
            word_vecs, to_T([T_full*self.N_full, D_word]))

        # create each dummy module here so that the weights won't get initialized again
        att_shape = image_feat_grid.get_shape().as_list()[:-1] + [1]
        self.att_shape = att_shape
        input_att = tf.placeholder(tf.float32, att_shape)
        time_idx = tf.placeholder(tf.int32, [None])
        batch_idx = tf.placeholder(tf.int32, [None])
        self.SceneModule(time_idx, batch_idx, reuse=False)
        self.FindModule(time_idx, batch_idx, reuse=False)
        self.FindSamePropertyModule(input_att, time_idx, batch_idx, reuse=False)
        self.TransformModule(input_att, time_idx, batch_idx, reuse=False)
        self.AndModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.FilterModule(input_att, time_idx, batch_idx, reuse=False)
        self.OrModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.ExistModule(input_att, time_idx, batch_idx, reuse=False)
        self.CountModule(input_att, time_idx, batch_idx, reuse=False)
        self.EqualNumModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.MoreNumModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.LessNumModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.SamePropertyModule(input_att, input_att, time_idx, batch_idx, reuse=False)
        self.DescribeModule(input_att, time_idx, batch_idx, reuse=False)
Example #23
    def TransformModule(self,
                        input_0,
                        time_idx,
                        batch_idx,
                        kernel_size=5,
                        map_dim=1024,
                        scope='TransformModule',
                        reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        # Mapping: att_grid x text_param -> att_grid
        # Input:
        #   input_0: [N, H, W, 1]
        #   text_param: [N, D_txt]
        # Output:
        #   att_grid: [N, H, W, 1]
        #
        # Implementation (Same as FindSamePropertyModule):
        #   1. Extract visual features using the input attention map, and
        #      linear transform to map_dim
        #   2. linear transform language features to map_dim
        #   3. Convolve image features to map_dim
        #   4. Element-wise multiplication of the three, l2_normalize, linear transform.
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                # image_feat_mapped has shape [N, H, W, map_dim]
                image_feat_mapped = _1x1_conv('conv_image',
                                              image_feat_grid,
                                              output_dim=map_dim)

                text_param_mapped = fc('fc_text',
                                       text_param,
                                       output_dim=map_dim)
                text_param_mapped = tf.reshape(text_param_mapped,
                                               to_T([N, 1, 1, map_dim]))

                att_softmax = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                    to_T([N, H, W, 1]))
                # att_feat has shape [N, D_vis]
                att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                         axis=[1, 2])
                att_feat_mapped = tf.reshape(
                    fc('fc_att', att_feat, output_dim=map_dim),
                    to_T([N, 1, 1, map_dim]))

                eltwise_mult = tf.nn.l2_normalize(
                    image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
                att_grid = _1x1_conv('conv_eltwise',
                                     eltwise_mult,
                                     output_dim=1)

        att_grid.set_shape(self.att_shape)
        return att_grid
Example #24
            def loop_fn(time, cell_output, cell_state, loop_state):
                if cell_output is None:  # time == 0
                    next_cell_state = encoder_states
                    next_input = tf.tile(go_embedding, to_T([N, 1]))
                else:  # time > 0
                    next_cell_state = cell_state

                    # compute the attention map over the input sequence
                    # att_raw has shape [T, N, 1]
                    att_raw = tf.reduce_sum(tf.tanh(
                        tf.nn.xw_plus_b(cell_output, W_a, b_a) +
                        self.encoder_h_transformed) * v,
                                            axis=2,
                                            keep_dims=True)
                    # softmax along the first dimension (T) over not finished examples
                    # att has shape [T, N, 1]
                    att = tf.nn.softmax(att_raw, dim=0) * self.seq_not_finished
                    att = att / tf.reduce_sum(att, axis=0, keep_dims=True)
                    # d2 has shape [N, lstm_dim]
                    d2 = tf.reduce_sum(att * self.encoder_outputs, axis=0)

                    # token_scores has shape [N, num_vocab]
                    token_scores = tf.nn.xw_plus_b(
                        tf.concat([cell_output, d2], axis=1), W_y, b_y)
                    # predict the next token (behavior depending on parameters)
                    if sampling:
                        # predicted_token has shape [N]
                        logits = token_scores
                        predicted_token = tf.cast(
                            tf.reshape(tf.multinomial(token_scores, 1), [-1]),
                            tf.int32)
                    else:
                        # predicted_token has shape [N]
                        predicted_token = tf.cast(tf.argmax(token_scores, 1),
                                                  tf.int32)
                    # NOTE: teacher forcing -- the sampled/argmax prediction above
                    # is overridden with the ground-truth layout token
                    predicted_token = gt_layout_batch[time - 1]

                    # token_prob has shape [N], the probability of the predicted token
                    # although token_prob is not needed for predicting the next token
                    # it is needed in output (for policy gradient training)
                    # [N, num_vocab]
                    # mask has shape [N, num_vocab]
                    mask = tf.equal(mask_range,
                                    tf.reshape(predicted_token, [-1, 1]))
                    all_token_probs = tl.activation.pixel_wise_softmax(
                        token_scores)
                    token_prob = tf.reduce_sum(all_token_probs *
                                               tf.cast(mask, tf.float32),
                                               axis=1)
                    neg_entropy = tf.reduce_sum(
                        all_token_probs *
                        tf.log(tf.maximum(1e-5, all_token_probs)),
                        axis=1)

                    # is_eos_predicted is a [N] bool tensor, indicating whether
                    # <eos> has already been predicted previously in each sequence
                    is_eos_predicted = loop_state[2]
                    predicted_token_old = predicted_token
                    # if <eos> has already been predicted, now predict <eos> with
                    # prob 1
                    predicted_token = tf.where(is_eos_predicted, all_eos_pred,
                                               predicted_token)
                    token_prob = tf.where(is_eos_predicted, all_one_prob,
                                          token_prob)
                    neg_entropy = tf.where(is_eos_predicted, all_zero_entropy,
                                           neg_entropy)
                    is_eos_predicted = tf.logical_or(
                        is_eos_predicted,
                        tf.equal(predicted_token_old, EOS_token))

                    # the prediction is from the cell output of the last step
                    # timestep (t-1), feed it as input into timestep t
                    next_input = tf.nn.embedding_lookup(
                        embedding_mat, predicted_token)

                elements_finished = tf.greater_equal(time, T_max)

                # loop_state is a 5-tuple, representing
                #   1) the predicted_tokens
                #   2) the prob of predicted_tokens
                #   3) whether <eos> has already been predicted
                #   4) the negative entropy of policy (accumulated across timesteps)
                #   5) the attention
                if loop_state is None:  # time == 0
                    # Write the predicted token into the output
                    predicted_token_array = tf.TensorArray(dtype=tf.int32,
                                                           size=T_max,
                                                           infer_shape=False)
                    token_prob_array = tf.TensorArray(dtype=tf.float32,
                                                      size=T_max,
                                                      infer_shape=False)
                    att_array = tf.TensorArray(dtype=tf.float32,
                                               size=T_max,
                                               infer_shape=False)
                    next_loop_state = (predicted_token_array, token_prob_array,
                                       tf.zeros(to_T([N]), dtype=tf.bool),
                                       tf.zeros(to_T([N]),
                                                dtype=tf.float32), att_array)
                else:  # time > 0
                    t_write = time - 1
                    next_loop_state = (loop_state[0].write(
                        t_write, predicted_token), loop_state[1].write(
                            t_write, token_prob), is_eos_predicted,
                                       loop_state[3] + neg_entropy,
                                       loop_state[4].write(t_write, att))
                return (elements_finished, next_input, next_cell_state,
                        cell_output, next_loop_state)
Example #25
    def _build_decoder(self, gt_layout_batch, scope='decoder', reuse=None):

        # This function is for decoding only. It performs greedy search or sampling.
        # The first input is <go> (its embedding vector), and the subsequent inputs
        # are the outputs from the previous time step (implementing attention).
        #
        # T_max is the maximum length of decoded sequence (including <eos>)
        # num_vocab does not include <go>

        N = self.N
        encoder_states = self.encoder_states
        T_max = self.T_decoder
        lstm_dim = self.lstm_dim
        num_layers = self.num_layers
        apply_dropout = self.decoder_dropout
        EOS_token = self.EOS_token
        sampling = self.decoder_sampling

        with tf.variable_scope(scope, reuse=reuse):
            embedding_mat = tf.get_variable(
                'embedding_mat',
                [self.decoder_num_vocab, self.decoder_embed_dim])
            # Special embedding for <go>, which denotes the sequence start
            go_embedding = tf.get_variable('go_embedding',
                                           [1, self.decoder_embed_dim])

            with tf.variable_scope('att_prediction'):
                v = tf.get_variable('v', [lstm_dim])
                W_a = tf.get_variable(
                    'weights', [lstm_dim, lstm_dim],
                    initializer=tf.contrib.layers.xavier_initializer())
                b_a = tf.get_variable('biases',
                                      lstm_dim,
                                      initializer=tf.constant_initializer(0.))

            # The parameters to predict the next token
            with tf.variable_scope('token_prediction'):
                W_y = tf.get_variable(
                    'weights', [lstm_dim * 2, self.decoder_num_vocab],
                    initializer=tf.contrib.layers.xavier_initializer())
                b_y = tf.get_variable('biases',
                                      self.decoder_num_vocab,
                                      initializer=tf.constant_initializer(0.))

            mask_range = tf.reshape(
                tf.range(self.decoder_num_vocab, dtype=tf.int32), [1, -1])
            all_eos_pred = EOS_token * tf.ones(to_T([N]), tf.int32)
            all_one_prob = tf.ones(to_T([N]), tf.float32)
            all_zero_entropy = tf.zeros(to_T([N]), tf.float32)

            def loop_fn(time, cell_output, cell_state, loop_state):
                if cell_output is None:  # time == 0
                    next_cell_state = encoder_states
                    next_input = tf.tile(go_embedding, to_T([N, 1]))
                else:  # time > 0
                    next_cell_state = cell_state

                    # compute the attention map over the input sequence
                    # att_raw has shape [T, N, 1]
                    att_raw = tf.reduce_sum(tf.tanh(
                        tf.nn.xw_plus_b(cell_output, W_a, b_a) +
                        self.encoder_h_transformed) * v,
                                            axis=2,
                                            keep_dims=True)
                    # softmax along the first dimension (T) over not finished examples
                    # att has shape [T, N, 1]
                    att = tf.nn.softmax(att_raw, dim=0) * self.seq_not_finished
                    att = att / tf.reduce_sum(att, axis=0, keep_dims=True)
                    # d2 has shape [N, lstm_dim]
                    d2 = tf.reduce_sum(att * self.encoder_outputs, axis=0)

                    # token_scores has shape [N, num_vocab]
                    token_scores = tf.nn.xw_plus_b(
                        tf.concat([cell_output, d2], axis=1), W_y, b_y)
                    # predict the next token (behavior depending on parameters)
                    if sampling:
                        # predicted_token has shape [N]
                        logits = token_scores
                        predicted_token = tf.cast(
                            tf.reshape(tf.multinomial(token_scores, 1), [-1]),
                            tf.int32)
                    else:
                        # predicted_token has shape [N]
                        predicted_token = tf.cast(tf.argmax(token_scores, 1),
                                                  tf.int32)
                    # NOTE: teacher forcing -- the sampled/argmax prediction above
                    # is overridden with the ground-truth layout token
                    predicted_token = gt_layout_batch[time - 1]

                    # token_prob has shape [N], the probability of the predicted token
                    # although token_prob is not needed for predicting the next token
                    # it is needed in output (for policy gradient training)
                    # [N, num_vocab]
                    # mask has shape [N, num_vocab]
                    mask = tf.equal(mask_range,
                                    tf.reshape(predicted_token, [-1, 1]))
                    all_token_probs = tl.activation.pixel_wise_softmax(
                        token_scores)
                    token_prob = tf.reduce_sum(all_token_probs *
                                               tf.cast(mask, tf.float32),
                                               axis=1)
                    neg_entropy = tf.reduce_sum(
                        all_token_probs *
                        tf.log(tf.maximum(1e-5, all_token_probs)),
                        axis=1)

                    # is_eos_predicted is a [N] bool tensor, indicating whether
                    # <eos> has already been predicted previously in each sequence
                    is_eos_predicted = loop_state[2]
                    predicted_token_old = predicted_token
                    # if <eos> has already been predicted, now predict <eos> with
                    # prob 1
                    predicted_token = tf.where(is_eos_predicted, all_eos_pred,
                                               predicted_token)
                    token_prob = tf.where(is_eos_predicted, all_one_prob,
                                          token_prob)
                    neg_entropy = tf.where(is_eos_predicted, all_zero_entropy,
                                           neg_entropy)
                    is_eos_predicted = tf.logical_or(
                        is_eos_predicted,
                        tf.equal(predicted_token_old, EOS_token))

                    # the prediction is from the cell output of the last step
                    # timestep (t-1), feed it as input into timestep t
                    next_input = tf.nn.embedding_lookup(
                        embedding_mat, predicted_token)

                elements_finished = tf.greater_equal(time, T_max)

                # loop_state is a 5-tuple, representing
                #   1) the predicted_tokens
                #   2) the prob of predicted_tokens
                #   3) whether <eos> has already been predicted
                #   4) the negative entropy of policy (accumulated across timesteps)
                #   5) the attention
                if loop_state is None:  # time == 0
                    # Write the predicted token into the output
                    predicted_token_array = tf.TensorArray(dtype=tf.int32,
                                                           size=T_max,
                                                           infer_shape=False)
                    token_prob_array = tf.TensorArray(dtype=tf.float32,
                                                      size=T_max,
                                                      infer_shape=False)
                    att_array = tf.TensorArray(dtype=tf.float32,
                                               size=T_max,
                                               infer_shape=False)
                    next_loop_state = (predicted_token_array, token_prob_array,
                                       tf.zeros(to_T([N]), dtype=tf.bool),
                                       tf.zeros(to_T([N]),
                                                dtype=tf.float32), att_array)
                else:  # time > 0
                    t_write = time - 1
                    next_loop_state = (loop_state[0].write(
                        t_write, predicted_token), loop_state[1].write(
                            t_write, token_prob), is_eos_predicted,
                                       loop_state[3] + neg_entropy,
                                       loop_state[4].write(t_write, att))
                return (elements_finished, next_input, next_cell_state,
                        cell_output, next_loop_state)

            # The RNN
            cell_layers = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)
            _, _, decodes_ta = tf.nn.raw_rnn(cell_layers,
                                             loop_fn,
                                             scope='lstm')
            predicted_tokens = decodes_ta[0].stack()
            token_probs = decodes_ta[1].stack()
            neg_entropy = decodes_ta[3]
            # atts has shape [T_decoder, T_encoder, N, 1]
            self.atts = decodes_ta[4].stack()
            # word_vecs has shape [T_decoder, N, encoder_embed_dim]
            word_vecs = tf.reduce_sum(self.atts * self.embedded_input_seq,
                                      axis=1)

            predicted_tokens.set_shape([None, None])
            token_probs.set_shape([None, None])
            neg_entropy.set_shape([None])
            word_vecs.set_shape([None, None, self.encoder_embed_dim])

            self.predicted_tokens = predicted_tokens
            self.token_probs = token_probs
            self.neg_entropy = neg_entropy
            self.word_vecs = word_vecs
Example #26
def visual7w_attbilstm_net(input_batch, bbox_batch1, spatial_batch1,
                           bbox_batch2, spatial_batch2, expr_obj, num_vocab,
                           embed_dim, lstm_dim, vgg_dropout, lstm_dropout):
    # a sentence is parsed into [expr_obj1, expr_relation, expr_obj2]
    #   bbox_batch1 has shape [N_batch*N1, 5] and
    #   spatial_batch1 has shape [N_batch, N1, D_spatial] and
    #   bbox_batch2 has shape [N2, 5] and
    #   spatial_batch2 has shape [1, N2, D_spatial] and
    #   expr_obj has shape [T, N_batch]
    # where N1 is the number of choices (= 4 in Visual 7W) and
    # N2 is the number of proposals (~ 300 for RPN in Faster RCNN)

    N_batch = tf.shape(spatial_batch1)[0]
    N1 = tf.shape(spatial_batch1)[1]
    N2 = tf.shape(spatial_batch2)[1]

    # Extract visual features
    vis_feat1 = fastrcnn_vgg_net.vgg_roi_fc7(input_batch,
                                             tf.reshape(bbox_batch1, [-1, 5]),
                                             "vgg_local",
                                             apply_dropout=vgg_dropout)
    D_vis = vis_feat1.get_shape().as_list()[-1]
    vis_feat1 = tf.reshape(vis_feat1, to_T([N_batch, N1, D_vis]))
    vis_feat1.set_shape([None, None, D_vis])

    # Reshape and tile vis_feat2 and spatial_batch2
    vis_feat2 = fastrcnn_vgg_net.vgg_roi_fc7(input_batch,
                                             tf.reshape(bbox_batch2, [-1, 5]),
                                             "vgg_local",
                                             apply_dropout=vgg_dropout,
                                             reuse=True)
    vis_feat2 = tf.reshape(vis_feat2, to_T([1, N2, D_vis]))
    vis_feat2 = tf.tile(vis_feat2, to_T([N_batch, 1, 1]))
    vis_feat2.set_shape([None, None, D_vis])
    spatial_batch2 = tf.tile(spatial_batch2, to_T([N_batch, 1, 1]))

    # Extract representation using attention
    lang_obj1, lang_obj2, lang_relation, probs_obj1, probs_obj2, probs_rel = lstm_net.attbilstm(
        expr_obj,
        "lstm",
        num_vocab=num_vocab,
        embed_dim=embed_dim,
        lstm_dim=lstm_dim,
        apply_dropout=lstm_dropout)

    # Score for each bounding box matching the first object
    # scores_obj1 has shape [N_batch, N1, 1]
    scores_obj1 = modules.localization_module_batch_score(
        vis_feat1, spatial_batch1, lang_obj1)
    # Score for each bounding box matching the second object
    # scores_obj2 has shape [N_batch, N2, 1]
    scores_obj2 = modules.localization_module_batch_score(vis_feat2,
                                                          spatial_batch2,
                                                          lang_obj2,
                                                          reuse=True)

    # Scores for each pair of bounding box matching the relationship
    # Tile the scores by broadcasting add
    # scores_rel has shape [N_batch, N1, N2, 1]
    scores_rel = modules.relationship_module_spatial_only_batch_score(
        spatial_batch1,
        scores_obj1,
        spatial_batch2,
        scores_obj2,
        lang_relation,
        rescale_scores=True)
    tf.add_to_collection("s_pair", scores_rel)

    # marginal_scores has shape [N_batch, N1, 1]
    marginal_scores = tf.reduce_max(scores_rel, axis=2)
    final_scores = tf.reshape(marginal_scores, to_T([N_batch, -1]))

    return final_scores
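
The last two lines above max-marginalize over the proposal axis: each of the N1 answer choices keeps the score of its best-supporting proposal among the N2 candidates. A toy NumPy sketch of that reduction (shapes invented for illustration):

import numpy as np

N_batch, N1, N2 = 2, 4, 6                     # images, answer choices, proposals
scores_rel = np.random.randn(N_batch, N1, N2, 1)

marginal = scores_rel.max(axis=2)             # [N_batch, N1, 1]
final = marginal.reshape(N_batch, -1)         # [N_batch, N1]
assert final.shape == (N_batch, N1)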
Example #27
    def DescribeModule(self,
                       input_0,
                       time_idx,
                       batch_idx,
                       map_dim=1024,
                       scope='DescribeModule',
                       reuse=True):
        # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors

        image_feat_grid = self._slice_image_feat_grid(batch_idx)
        text_param = self._slice_word_vecs(time_idx, batch_idx)
        encoder_states = self._slice_encoder_states(batch_idx)
        # Mapping: att_grid -> answer probs
        # Input:
        #   input_0: [N, H, W, 1]
        # Output:
        #   answer_scores: [N, self.num_choices]
        #
        # Implementation:
        #   1. Extract visual features using the input attention map, and
        #      linear transform to map_dim
        #   2. linear transform language features to map_dim
        #   3. Element-wise multiplication of the two, l2_normalize, linear transform.
        with tf.variable_scope(self.module_variable_scope):
            with tf.variable_scope(scope, reuse=reuse):
                image_shape = tf.shape(image_feat_grid)
                N = tf.shape(time_idx)[0]
                H = image_shape[1]
                W = image_shape[2]
                D_im = image_feat_grid.get_shape().as_list()[-1]
                D_txt = text_param.get_shape().as_list()[-1]

                text_param_mapped = fc('fc_text',
                                       text_param,
                                       output_dim=map_dim)

                att_softmax = tf.reshape(
                    tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                    to_T([N, H, W, 1]))

                # att_feat has shape [N, D_im]
                att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                         axis=[1, 2])
                att_feat_mapped = tf.reshape(
                    fc('fc_att', att_feat, output_dim=map_dim),
                    to_T([N, map_dim]))

                if encoder_states is not None:
                    # Add in encoder states in the elementwise multiplication
                    encoder_states_mapped = fc('fc_encoder_states',
                                               encoder_states,
                                               output_dim=map_dim)
                    eltwise_mult = tf.nn.l2_normalize(
                        text_param_mapped * att_feat_mapped *
                        encoder_states_mapped, 1)
                else:
                    eltwise_mult = tf.nn.l2_normalize(
                        text_param_mapped * att_feat_mapped, 1)
                scores = fc('fc_eltwise',
                            eltwise_mult,
                            output_dim=self.num_choices)

        return scores
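
Steps 1-3 in the comments above amount to a softmax-weighted average of the image feature grid under the input attention, followed by the fused linear classifier. A standalone NumPy sketch of the pooling step only (the function name and sizes are hypothetical):

import numpy as np

def soft_attention_pool(image_feat_grid, att_raw):
    # image_feat_grid: [N, H, W, D]; att_raw: [N, H, W, 1] unnormalized attention.
    N, H, W, D = image_feat_grid.shape
    logits = att_raw.reshape(N, H * W)
    att = np.exp(logits - logits.max(axis=1, keepdims=True))
    att /= att.sum(axis=1, keepdims=True)             # softmax over all H*W cells
    att = att.reshape(N, H, W, 1)
    return (image_feat_grid * att).sum(axis=(1, 2))   # [N, D]

pooled = soft_attention_pool(np.random.rand(2, 3, 3, 8), np.random.rand(2, 3, 3, 1))
assert pooled.shape == (2, 8)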
Example #28
def build_output_unit_loc(q_encoding,
                          kb_batch,
                          att_last,
                          scope='output_unit_loc',
                          reuse=None):
    """
    Apply a 1-layer convolution network to predict localization scores.
    Apply dropout
    if specified.

    Input:
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """

    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv('conv_loc',
                              att_last,
                              kernel_size=3,
                              stride=1,
                              output_dim=1)
        loc_scores = tf.reshape(loc_scores,
                                [-1, cfg.MODEL.H_FEAT * cfg.MODEL.W_FEAT])
        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc('fc_q_mapped',
                              q_encoding,
                              output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(q_mapped[:, ax, ax, :] *
                                                       kb_batch,
                                                       axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv('conv_bbox_offset',
                                   bbox_offset_input,
                                   1,
                                   1,
                                   output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT * cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N * B, 4]))
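            # The argmax below is per image; adding n*B turns it into a flat row
            # index into bbox_offset_flat, so image n gathers the offset predicted
            # at its highest-scoring grid cell.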
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc('fc_q_mapped',
                              q_encoding,
                              output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc('fc_bbox_offset_with_q',
                                 elt_prod,
                                 output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
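
The gather at the end of the BBOX_REG_AS_FCN branch picks, for every image, the offset predicted at the argmax location of loc_scores. A small NumPy check of that flat-index trick, with toy sizes:

import numpy as np

N, B = 3, 5                                   # images, grid cells (H*W)
bbox_offset_fcn = np.random.randn(N, B, 4)
loc_scores = np.random.randn(N, B)

flat = bbox_offset_fcn.reshape(N * B, 4)
best = loc_scores.argmax(axis=-1)
slice_inds = np.arange(N) * B + best
picked = flat[slice_inds]                     # [N, 4]
assert np.allclose(picked, bbox_offset_fcn[np.arange(N), best])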
Example #29
def _spatial_softmax(att_raw):
    att_shape = tf.shape(att_raw)
    N = att_shape[0]
    att_softmax = tf.nn.softmax(tf.reshape(att_raw, to_T([N, -1])), axis=1)
    att_softmax = tf.reshape(att_softmax, att_shape)
    return att_softmax
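
For reference, a NumPy sketch of what _spatial_softmax computes: one softmax per example taken jointly over the whole H x W grid, with the result reshaped back to the input's shape (sizes are illustrative):

import numpy as np

att_raw = np.random.randn(2, 4, 4, 1)
flat = att_raw.reshape(2, -1)
flat = np.exp(flat - flat.max(axis=1, keepdims=True))
flat /= flat.sum(axis=1, keepdims=True)
att_softmax = flat.reshape(att_raw.shape)
assert np.allclose(att_softmax.reshape(2, -1).sum(axis=1), 1.0)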
Example #30
def relationship_module_spatial_only_batch_score(spatial_feat1, scores1,
                                     spatial_feat2, scores2, lang_feat,
                                     scope="relationship_module_spatial_only",
                                     rescale_scores=False, reuse=None):
    # Input shape:
    #   spatial_feat1, spatial_feat2 : [N_batch, N1, D_spatial], [N_batch, N2, D_spatial]
    #   scores1, scores2: [N_batch, N1, 1], [N_batch, N2, 1]
    #   lang_feat: [N_batch, D_lang]
    # Output shape:
    #   relationship_scores: [N_batch, N1, N2, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.

    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        N_batch = tf.shape(lang_feat)[0]
        D_lang = lang_feat.get_shape().as_list()[-1]

        N1 = tf.shape(spatial_feat1)[1]
        N2 = tf.shape(spatial_feat2)[1]

        D_spatial = spatial_feat1.get_shape().as_list()[-1]

        # Tiled spatial features of size [N_batch, N1, N2, D_spatial*2], such that
        # spatial_feat_tiled[k, i, j] = [ spatial_feat1[k, i], spatial_feat2[k, j] ]
        spatial_feat_tiled = tf.reshape(tf.concat([
            tf.tile(tf.reshape(spatial_feat1, to_T([N_batch, -1, 1, D_spatial])),
                    to_T([1, 1, N2, 1])),
            tf.tile(tf.reshape(spatial_feat2, to_T([N_batch, 1, -1, D_spatial])),
                    to_T([1, N1, 1, 1]))
        ], axis=3), [-1, D_spatial*2])

        # Embedded spatial feature of size [N_batchxN1xN2, D_lang]
        spatial_embed = fc('spatial_embed', spatial_feat_tiled, output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        spatial_embed = tf.reshape(spatial_embed, to_T([N_batch, -1, D_lang]))
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        eltwise_mult = tf.nn.l2_normalize(spatial_embed * lang_feat, 2)
        # eltwise_mult has shape [N_batchxN1xN2, D_lang]
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Relationship scores as a linear classification over the l2-normalized feature
        relationship_scores = fc('relationship_scores', eltwise_mult, output_dim=1)
        relationship_scores = tf.reshape(relationship_scores, to_T([N_batch, N1, N2, 1]))
        # Rescale the scores, if specified
        if rescale_scores:
            alpha_obj1 = tf.get_variable("alpha_obj1", shape=[], dtype=tf.float32,
                                         initializer=tf.constant_initializer(1))
            alpha_obj2 = tf.get_variable("alpha_obj2", shape=[], dtype=tf.float32,
                                         initializer=tf.constant_initializer(1))
            alpha_rel = tf.get_variable("alpha_rel", shape=[], dtype=tf.float32,
                                         initializer=tf.constant_initializer(1))
            scores1 = tf.multiply(scores1, alpha_obj1)
            scores2 = tf.multiply(scores2, alpha_obj2)
            relationship_scores = tf.multiply(relationship_scores, alpha_rel)

        final_scores = tf.add(tf.add(tf.reshape(scores1, to_T([N_batch, N1, 1, 1])),
                                     tf.reshape(scores2, to_T([N_batch, 1, N2, 1]))),
                              relationship_scores)
        final_scores.set_shape([None, None, None, 1])

    return final_scores
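
The closing tf.add above scores every (box_i, box_j) pair by broadcasting: the per-object score tensors are reshaped to [N_batch, N1, 1, 1] and [N_batch, 1, N2, 1] so that adding them tiles each score across the other object's axis before the relationship term is added. A NumPy sketch with toy sizes:

import numpy as np

N, N1, N2 = 2, 3, 4
scores1 = np.random.randn(N, N1, 1)
scores2 = np.random.randn(N, N2, 1)
relationship_scores = np.random.randn(N, N1, N2, 1)

pair_scores = (scores1.reshape(N, N1, 1, 1) +
               scores2.reshape(N, 1, N2, 1) +
               relationship_scores)
assert pair_scores.shape == (N, N1, N2, 1)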