def FindSamePropertyModule(self, input_0, time_idx, batch_idx, map_dim=250,
                           scope='FindSamePropertyModule', reuse=True):
    """Build the graph that maps an attention grid to a new attention grid.

    Mapping: att_grid x image_feat_grid x text_param -> att_grid

    Implementation:
      1. Extract visual features using the input attention map (softmax
         over the H*W positions, then attention-weighted sum of the image
         features) and linearly transform them to map_dim.
      2. Linearly transform the language features to map_dim.
      3. Convolve (1x1) the image features to map_dim.
      4. Element-wise multiplication of the three, l2-normalization over
         the channel axis, and a final 1x1 convolution to one channel.

    Args:
        input_0: input attention grid, [N, H, W, 1].
        time_idx: index forwarded to self._slice_word_vecs; its first
            dimension is used as the dynamic batch size N. In TF Fold,
            batch_idx and time_idx are both [N_batch, 1] tensors.
        batch_idx: index forwarded to self._slice_image_feat_grid and
            self._slice_word_vecs.
        map_dim: channel size of the common space the image, text and
            attended features are mapped to.
        scope: variable scope name, nested in self.module_variable_scope.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        att_grid: [N, H, W, 1]; its static shape is set to self.att_shape.
    """
    # image_feat_grid: [N, H, W, D_im], text_param: [N, D_txt]
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            # Dynamic (graph-time) sizes used for the reshapes below.
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                          output_dim=map_dim)

            # Text features are reshaped to [N, 1, 1, map_dim] so they
            # broadcast over the spatial grid.
            text_param_mapped = fc('fc_text', text_param,
                                   output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            # Normalize the incoming attention over all H*W positions.
            att_softmax = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                to_T([N, H, W, 1]))
            # att_feat has shape [N, D_vis]
            att_feat = tf.reduce_sum(image_feat_grid * att_softmax,
                                     axis=[1, 2])
            att_feat_mapped = tf.reshape(
                fc('fc_att', att_feat, output_dim=map_dim),
                to_T([N, 1, 1, map_dim]))

            # Fuse the three streams and l2-normalize along channels.
            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

    att_grid.set_shape(self.att_shape)
    return att_grid
def FindModule(self, time_idx, batch_idx, map_dim=1024, scope='FindModule',
               reuse=True):
    """Build the graph that produces an attention grid from image and text.

    Mapping: image_feat_grid x text_param -> att_grid

    Implementation:
      1. Map the image features (1x1 convolution) and the text features
         (fully-connected layer) to map_dim and multiply them element-wise.
      2. L2-normalization over the channel axis.
      3. Linear classification (1x1 convolution to a single channel).

    Args:
        time_idx: index forwarded to self._slice_word_vecs; its first
            dimension is used as the dynamic batch size N. In TF Fold,
            batch_idx and time_idx are both [N_batch, 1] tensors.
        batch_idx: index forwarded to self._slice_image_feat_grid and
            self._slice_word_vecs.
        map_dim: channel size of the common space for image and text.
        scope: variable scope name, nested in self.module_variable_scope.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        att_grid: [N, H, W, 1]; its static shape is set to self.att_shape.
    """
    # image_feat_grid: [N, H, W, D_im], text_param: [N, D_txt]
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            # Dynamic batch size, needed for the broadcast reshape below.
            N = tf.shape(time_idx)[0]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                          output_dim=map_dim)

            # Text features are reshaped to [N, 1, 1, map_dim] so they
            # broadcast over the spatial grid.
            text_param_mapped = fc('fc_text', text_param,
                                   output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

    att_grid.set_shape(self.att_shape)
    return att_grid
def instantiate_batch(self, inputs):
    """Build the attention-to-attention graph for one batch.

    Args:
        inputs: a sequence of three tensors, unpacked in this order:
            vis_att: output (attention) from the previous modules;
                multiplied with img_feat, so it must broadcast against
                it (presumably [N, H, W, 1] -- confirm with the caller).
            img_feat: image feature for the example; H and W are read
                from its static shape (axes 1 and 2).
            text_att: text attention feature fed to the 'fc_text' layer.

    Returns:
        A one-element list holding the new attention grid of shape
        [N, H, W, 1], softmax-normalized over the H*W positions.
    """
    vis_att, img_feat, text_att = inputs

    # intermediate mapping dimension, batch size (dynamic) and the static
    # image feature height and width
    map_dim = self._params['map_dim']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
        with tf.variable_scope(self._scope, reuse=self._reuse):
            # img_map has shape [N, H, W, map_dim]
            img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
            # nonlinearity
            img_map = tf.nn.relu(img_map)

            # text branch, reshaped so it broadcasts over the grid
            text_map = fc('fc_text', text_att, output_dim=map_dim)
            text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
            # nonlinearity
            text_map = tf.nn.relu(text_map)

            # image features pooled with the incoming attention
            att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
            att_map = tf.reshape(
                fc('fc_att', att_feats, output_dim=map_dim),
                [N, 1, 1, map_dim])

            # interact via element wise map
            eltwise_mult = tf.nn.l2_normalize(
                img_map * text_map * att_map, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

            # softmax over all spatial positions, then back to a grid
            att_grid_soft = tf.nn.softmax(
                tf.reshape(att_grid, [-1, H * W]))
            att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

    return [att_grid]
def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
                    map_dim=250, scope='TransformModule', reuse=True):
    """Build the graph that transforms an attention grid using text.

    Mapping: att_grid x text_param -> att_grid

    Implementation:
        Convolutional layer that also involves text_param: a 'soft'
        convolutional kernel that is modulated by text_param. The input
        attention is convolved to map_dim channels, multiplied by the
        mapped text features, l2-normalized over the channel axis and
        reduced to one channel by a 1x1 convolution.

    Args:
        input_0: input attention grid, [N, H, W, 1]; its first dynamic
            dimension is used as the batch size N.
        time_idx: index forwarded to self._slice_word_vecs. In TF Fold,
            batch_idx and time_idx are both [N_batch, 1] tensors.
        batch_idx: index forwarded to self._slice_word_vecs.
        kernel_size: kernel size of the convolution applied to input_0.
        map_dim: number of channels of the convolved attention maps and
            of the mapped text features.
        scope: variable scope name, nested in self.module_variable_scope.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        att_grid: [N, H, W, 1]; its static shape is set to self.att_shape.
    """
    # text_param: [N, D_txt]
    text_param = self._slice_word_vecs(time_idx, batch_idx)

    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            # Dynamic batch size, needed for the broadcast reshape below.
            N = tf.shape(input_0)[0]

            att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
                             stride=1, output_dim=map_dim)

            # Text features are reshaped to [N, 1, 1, map_dim] so they
            # modulate every spatial position of att_maps.
            text_param_mapped = fc('text_fc', text_param,
                                   output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped,
                                           to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                att_maps * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

    att_grid.set_shape(self.att_shape)
    return att_grid
def CountModule(self, input_0, time_idx, batch_idx, map_dim=1024,
                scope='CountModule', reuse=True):
    """Build the graph that maps an attention grid to answer scores.

    Mapping: att_grid -> answer probs

    Implementation: two paths whose scores are added.
      * First path (same as Describe): attention-pooled image features
        and text features (and encoder states, when available) are mapped
        to map_dim, multiplied element-wise, l2-normalized and classified.
      * Second path: the raw attention values with their min and max
        (text agnostic), concatenated with the same statistics of a
        text-modulated attention grid (text aware, similar to Find), then
        classified.

    Args:
        input_0: input attention grid, [N, H, W, 1].
        time_idx: index forwarded to self._slice_word_vecs; its first
            dimension is used as the dynamic batch size N. In TF Fold,
            batch_idx and time_idx are both [N_batch, 1] tensors.
        batch_idx: index forwarded to self._slice_image_feat_grid,
            self._slice_word_vecs and self._slice_encoder_states.
        map_dim: channel size of the intermediate feature space.
        scope: variable scope name, nested in self.module_variable_scope.
        reuse: reuse flag forwarded to tf.variable_scope.

    Returns:
        answer_scores: [N, self.num_choices]
    """
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    encoder_states = self._slice_encoder_states(batch_idx)

    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            N = tf.shape(time_idx)[0]
            # Spatial size comes from the configured attention shape.
            H, W = self.att_shape[1:3]

            # ---- The first path, same as Describe ----
            text_param_mapped_0 = fc('fc_text_0', text_param,
                                     output_dim=map_dim)
            # Normalize the incoming attention over all H*W positions.
            att_softmax_0 = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                to_T([N, H, W, 1]))
            # att_feat_0 has shape [N, D_vis]
            att_feat_0 = tf.reduce_sum(image_feat_grid * att_softmax_0,
                                       axis=[1, 2])
            att_feat_mapped_0 = tf.reshape(
                fc('fc_att_0', att_feat_0, output_dim=map_dim),
                to_T([N, map_dim]))

            if encoder_states is not None:
                # Add in encoder states in the elementwise multiplication
                encoder_states_mapped = fc('fc_encoder_states',
                                           encoder_states,
                                           output_dim=map_dim)
                eltwise_mult_0 = tf.nn.l2_normalize(
                    text_param_mapped_0 * att_feat_mapped_0
                    * encoder_states_mapped, 1)
            else:
                eltwise_mult_0 = tf.nn.l2_normalize(
                    text_param_mapped_0 * att_feat_mapped_0, 1)
            scores_0 = fc('fc_eltwise_0', eltwise_mult_0,
                          output_dim=self.num_choices)

            # ---- The second path ----
            # text agnostic counting, same as Count in CLEVR v0 modules
            att_all_1 = tf.reshape(input_0, to_T([-1, H * W]))
            att_min_1 = tf.reduce_min(input_0, axis=[1, 2])
            att_max_1 = tf.reduce_max(input_0, axis=[1, 2])

            # text aware counting, similar to Find
            att_mapped_2 = _conv('conv_att_2', input_0, kernel_size=3,
                                 stride=1, output_dim=map_dim)
            text_param_mapped_2 = fc('fc_text_2', text_param,
                                     output_dim=map_dim)
            text_param_mapped_2 = tf.reshape(text_param_mapped_2,
                                             to_T([N, 1, 1, map_dim]))
            eltwise_mult_2 = tf.nn.l2_normalize(
                att_mapped_2 * text_param_mapped_2, 3)
            att_grid_2 = _1x1_conv('conv_eltwise_2', eltwise_mult_2,
                                   output_dim=1)
            # Restore the static shape of the input attention grid.
            att_grid_2.set_shape(input_0.get_shape())

            att_all_2 = tf.reshape(att_grid_2, to_T([-1, H * W]))
            att_min_2 = tf.reduce_min(att_grid_2, axis=[1, 2])
            att_max_2 = tf.reduce_max(att_grid_2, axis=[1, 2])

            # All values plus min/max of both attention grids, side by
            # side along the feature axis.
            att_concat_2 = tf.concat([
                att_all_1, att_min_1, att_max_1,
                att_all_2, att_min_2, att_max_2,
            ], axis=1)
            scores_2 = fc('fc_scores_2', att_concat_2,
                          output_dim=self.num_choices)

            # Fuse the score from both paths
            scores = scores_0 + scores_2

    return scores