def visual_semantic_infer(self, visual_feature_train_pos, visual_feature_train_neg, sentence_embed_train, visual_feature_test, sentence_embed_test):
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print("Building training network...............................\n")
        transformed_clip_train_mix = fc('v2s_lt', tf.concat([visual_feature_train_pos, visual_feature_train_neg], 0), output_dim=self.semantic_size)
        transformed_clip_train_norm_mix = tf.nn.l2_normalize(transformed_clip_train_mix, dim=1)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        cross_modal_vec_train_mix = self.cross_modal_comb(transformed_clip_train_norm_mix, tf.tile(transformed_sentence_train_norm, [2, 1]), self.batch_size)
        sim_score_mat_train_mix = vs_multilayer.vs_multilayer(cross_modal_vec_train_mix, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train_mix = tf.reshape(sim_score_mat_train_mix, [self.batch_size*2, 3])

        tf.get_variable_scope().reuse_variables()
        print("Building test network...............................\n")
        transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

        return sim_score_mat_train_mix, sim_score_mat_test
def localization_module(vis_feat, spatial_feat, lang_feat, scope="localization_module", reuse=None):
    # Input:
    #   vis_feat: [N, D_vis]
    #   spatial_feat: [N, D_spatial]
    #   lang_feat: [N, D_lang]
    # Output:
    #   localization_scores: [N, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.
    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        D_lang = lang_feat.get_shape().as_list()[-1]
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=1)
        vis_spatial_embed = fc('vis_spatial_embed', vis_spatial_feat, output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 1)

        # Localization scores as linear classification over the l2-normalized
        # elementwise product
        localization_scores = fc('localization_scores', eltwise_mult, output_dim=1)

    return localization_scores
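# NOTE: the snippets in this section call fc / fc_relu / drop helpers that are
# defined elsewhere in their respective codebases. Below is a minimal sketch of
# what they are assumed to look like, following the TF 1.x fc(name, bottom,
# output_dim) convention used above; the actual implementations (initializers,
# bias options, reuse handling) may differ.
import tensorflow as tf

def fc(name, bottom, output_dim, reuse=None):
    # Fully-connected layer: bottom is [N, input_dim] -> output [N, output_dim].
    input_dim = bottom.get_shape().as_list()[-1]
    with tf.variable_scope(name, reuse=reuse):
        weights = tf.get_variable('weights', [input_dim, output_dim],
                                  initializer=tf.contrib.layers.xavier_initializer())
        biases = tf.get_variable('biases', output_dim,
                                 initializer=tf.constant_initializer(0.))
        return tf.nn.xw_plus_b(bottom, weights, biases)

def fc_relu(name, bottom, output_dim, reuse=None):
    # Fully-connected layer followed by ReLU.
    return tf.nn.relu(fc(name, bottom, output_dim, reuse=reuse))

def drop(bottom, keep_prob):
    # Dropout wrapper; keep_prob is the probability of keeping a unit (TF 1.x API).
    return tf.nn.dropout(bottom, keep_prob)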
def attbilstm(text_seq_batch, name, num_vocab, embed_dim, lstm_dim, apply_dropout, reuse=None):
    with tf.variable_scope(name, reuse=reuse):
        T = tf.shape(text_seq_batch)[0]
        N = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence into a vector representation, using the final
        # hidden states in a two-layer bidirectional LSTM network
        seq_length = tf.ones(to_T([N]), dtype=tf.int32)*T
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        outputs1_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            embedded_seq, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm1")
        outputs1 = tf.concat(outputs1_raw, axis=2)
        outputs2_raw, _ = tf.nn.bidirectional_dynamic_rnn(lstm_cell, lstm_cell,
            outputs1, seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm2")
        outputs2 = tf.concat(outputs2_raw, axis=2)
        # q_reshape has shape [T, N, lstm_dim*4]
        q_reshape = tf.concat([outputs1, outputs2], axis=2)
        if apply_dropout:
            q_reshape = drop(q_reshape, 0.5)

        # 2. three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            q_reshape_flat = tf.reshape(q_reshape, to_T([T*N, lstm_dim*4]))
            score_shape = to_T([T, N, 1])
            scores_obj1 = tf.reshape(fc('fc_scores_obj1', q_reshape_flat, output_dim=1), score_shape)
            scores_obj2 = tf.reshape(fc('fc_scores_obj2', q_reshape_flat, output_dim=1), score_shape)
            scores_rel = tf.reshape(fc('fc_scores_rel', q_reshape_flat, output_dim=1), score_shape)

            # 2.4 Compute probability and average BoW representation
            # probs_obj1, probs_obj2 and probs_rel have shape [T, N, 1]
            # Remove the probability over <pad> (<pad> is 0)
            is_not_pad = tf.cast(tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
            probs_obj1 = tf.nn.softmax(scores_obj1, dim=0)*is_not_pad
            probs_obj2 = tf.nn.softmax(scores_obj2, dim=0)*is_not_pad
            probs_rel = tf.nn.softmax(scores_rel, dim=0)*is_not_pad
            probs_obj1 = probs_obj1 / tf.reduce_sum(probs_obj1, 0, keep_dims=True)
            probs_obj2 = probs_obj2 / tf.reduce_sum(probs_obj2, 0, keep_dims=True)
            probs_rel = probs_rel / tf.reduce_sum(probs_rel, 0, keep_dims=True)
            tf.add_to_collection("attention_probs", (probs_obj1, probs_obj2, probs_rel))

            # BoW_obj1, BoW_obj2 and BoW_rel have shape [N, embed_dim]
            BoW_obj1 = tf.reduce_sum(probs_obj1*embedded_seq, reduction_indices=0)
            BoW_obj2 = tf.reduce_sum(probs_obj2*embedded_seq, reduction_indices=0)
            BoW_rel = tf.reduce_sum(probs_rel*embedded_seq, reduction_indices=0)
            BoW_obj1.set_shape([None, embed_dim])
            BoW_obj2.set_shape([None, embed_dim])
            BoW_rel.set_shape([None, embed_dim])

    return (BoW_obj1, BoW_obj2, BoW_rel)
def relationship_module_spatial_only(spatial_feat1, scores1, spatial_feat2, scores2, lang_feat,
                                     scope="relationship_module_spatial_only", reuse=None):
    # Input shape:
    #   spatial_feat1, spatial_feat2: [N1, D_spatial], [N2, D_spatial]
    #   scores1, scores2: [N1, 1], [N2, 1]
    #   lang_feat: [1, D_lang]
    # Output shape:
    #   relationship_scores: [N1, N2, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.
    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        D_lang = lang_feat.get_shape().as_list()[-1]
        N1 = tf.shape(spatial_feat1)[0]
        N2 = tf.shape(spatial_feat2)[0]
        D_spatial = spatial_feat1.get_shape().as_list()[-1]

        # Tiled spatial features of size [N1, N2, 5*2], such that
        # spatial_feat_tiled[i, j] = [ spatial_feat1[i], spatial_feat2[j] ]
        spatial_feat_tiled = tf.reshape(
            tf.concat([
                tf.tile(tf.reshape(spatial_feat1, [-1, 1, D_spatial]), to_T([1, N2, 1])),
                tf.tile(tf.reshape(spatial_feat2, [1, -1, D_spatial]), to_T([N1, 1, 1]))
            ], axis=2),
            [-1, D_spatial * 2])
        spatial_embed = fc('spatial_embed', spatial_feat_tiled, output_dim=D_lang)

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(spatial_embed * lang_feat, 1)

        # Relationship scores as linear classification over the l2-normalized
        # elementwise product
        relationship_scores = fc('relationship_scores', eltwise_mult, output_dim=1)
        relationship_scores = tf.reshape(relationship_scores, to_T([N1, N2, 1]))

        final_scores = tf.add(
            tf.add(tf.reshape(scores1, [-1, 1, 1]), tf.reshape(scores2, [1, -1, 1])),
            relationship_scores)
        final_scores.set_shape([None, None, 1])

    return final_scores
def build_output_unit_loc(q_encoding, kb_batch, att_last, scope='output_unit_loc', reuse=None):
    """
    Apply a 1-layer convolution network to predict localization scores.
    Apply dropout if specified.

    Input:
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv(
                'conv_loc', att_last, kernel_size=3, stride=1, output_dim=1)
        loc_scores = tf.reshape(
            loc_scores, [-1, cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT])

        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(
                    q_mapped[:, ax, ax, :] * kb_batch, axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv(
                'conv_bbox_offset', bbox_offset_input, 1, 1, output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N*B, 4]))
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc(
                    'fc_bbox_offset_with_q', elt_prod, output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
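# NOTE: the module-network-style snippets also rely on conv / _1x1_conv (or
# _1x1conv) and the to_T / ax shorthands. A minimal sketch of the assumed
# versions follows; the signatures are inferred from the call sites here, not
# taken from the original code.
import tensorflow as tf

ax = tf.newaxis                  # used e.g. as q_mapped[:, ax, ax, :]
to_T = tf.convert_to_tensor      # wraps lists of (possibly dynamic) dims into a tensor

def conv(name, bottom, kernel_size, stride, output_dim,
         weights_initializer=None, reuse=None):
    # 2-D convolution over a [N, H, W, C] tensor with 'SAME' padding.
    input_dim = bottom.get_shape().as_list()[-1]
    if weights_initializer is None:
        weights_initializer = tf.contrib.layers.xavier_initializer_conv2d()
    with tf.variable_scope(name, reuse=reuse):
        weights = tf.get_variable(
            'weights', [kernel_size, kernel_size, input_dim, output_dim],
            initializer=weights_initializer)
        biases = tf.get_variable('biases', output_dim,
                                 initializer=tf.constant_initializer(0.))
        return tf.nn.conv2d(bottom, weights, [1, stride, stride, 1], 'SAME') + biases

def _1x1_conv(name, bottom, output_dim, reuse=None):
    # 1x1 convolution, i.e. a per-location fully-connected layer.
    return conv(name, bottom, kernel_size=1, stride=1, output_dim=output_dim, reuse=reuse)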
def DescribeModule(self, input_0, time_idx, batch_idx, map_dim=250,
                   scope='DescribeModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    # Mapping: att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. Extract visual features using the input attention map, and
    #      linear transform to map_dim
    #   2. linear transform language features to map_dim
    #   3. Element-wise multiplication of the two, l2_normalize, linear transform.
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)

            att_softmax = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H * W]))),
                to_T([N, H, W, 1]))
            # att_feat, att_feat_1 has shape [N, D_vis]
            att_feat = tf.reduce_sum(image_feat_grid * att_softmax, axis=[1, 2])
            att_feat_mapped = tf.reshape(
                fc('fc_att', att_feat, output_dim=map_dim), to_T([N, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                text_param_mapped * att_feat_mapped, 1)
            scores = fc('fc_eltwise', eltwise_mult, output_dim=self.num_choices)

    return scores
def _build_encoder(self, input_seq_batch, seq_len_batch, scope='encoder', reuse=None):
    lstm_dim = self.lstm_dim
    num_layers = self.num_layers
    apply_dropout = self.encoder_dropout

    with tf.variable_scope(scope, reuse=reuse):
        # T = tf.shape(input_seq_batch)[0]
        T = input_seq_batch.shape.as_list()[0]
        N = tf.shape(input_seq_batch)[1]
        self.T_encoder = T
        self.N = N
        with tf.variable_scope(self.embed_scope, reuse=True):
            embedding_mat = tf.get_variable('embed_mat',
                [self.encoder_num_vocab, self.encoder_embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, input_seq_batch)
        self.embedded_input_seq = embedded_seq

        # The RNN
        cell = _get_lstm_cell(num_layers, lstm_dim, apply_dropout)

        # encoder_outputs has shape [T, N, lstm_dim]
        encoder_outputs, encoder_states = tf.nn.dynamic_rnn(cell,
            embedded_seq, seq_len_batch, dtype=tf.float32, time_major=True,
            scope='lstm')
        self.encoder_outputs = encoder_outputs
        self.encoder_states = encoder_states

        # check if wv flag is set
        if self.params['use_word_vectors']:
            # transform the encoder outputs for further attention alignments
            # encoder_outputs_flat has shape [T, N, lstm_dim]
            encoder_h_transformed = fc('encoder_h_transform',
                tf.reshape(embedded_seq, [-1, self.encoder_embed_dim]),
                output_dim=lstm_dim)
        else:
            # transform the encoder outputs for further attention alignments
            # encoder_outputs_flat has shape [T, N, lstm_dim]
            encoder_h_transformed = fc('encoder_h_transform',
                tf.reshape(encoder_outputs, [-1, lstm_dim]),
                output_dim=lstm_dim)
        encoder_h_transformed = tf.reshape(encoder_h_transformed,
                                           to_T([T, N, lstm_dim]))
        self.encoder_h_transformed = encoder_h_transformed

        # seq_not_finished has shape [T, N, 1], where seq_not_finished[t, n]
        # is 1 iff sequence n is not finished at time t, and 0 otherwise
        seq_not_finished = tf.less(tf.range(T)[:, tf.newaxis, tf.newaxis],
                                   seq_len_batch[:, tf.newaxis])
        seq_not_finished = tf.cast(seq_not_finished, tf.float32)
        self.seq_not_finished = seq_not_finished
def build_regulizer(self):
    """ context regularization score """
    # text and region features
    text_bilstm_feat = self.text_bilstm_feat
    text_word_embed_feat = self.text_word_embed_feat
    word_is_not_pad = self.word_is_not_pad
    region_visual_feat = self.region_visual_feat
    region_spatial_feat = self.region_spatial_feat
    reg_dim = self.config.reg_dim

    # Tensor dimensionality
    L = tf.shape(text_bilstm_feat)[0]
    N1 = tf.shape(text_bilstm_feat)[1]
    N2 = tf.shape(region_spatial_feat)[0]
    D1 = text_bilstm_feat.get_shape().as_list()[-1]       # lstm_dim*4
    D2 = text_word_embed_feat.get_shape().as_list()[-1]   # embed_dim
    D3 = region_spatial_feat.get_shape().as_list()[-1]    # spatial_dim
    D4 = region_visual_feat.get_shape().as_list()[-1]     # visual_dim

    region_feat = tf.concat([region_visual_feat, region_spatial_feat], axis=1)  # shape: [N2, D3+D4]

    with tf.variable_scope('regularizer'):
        # 1. language-vision association between single RoI and the expression,
        #    represented by y^{g} in the paper
        word_obj_attention_score = fc('word_attention_obj',
            tf.reshape(text_bilstm_feat, [-1, D1]), output_dim=1)  # shape: [L*N1, 1]
        word_obj_attention_score = tf.reshape(word_obj_attention_score, [L, N1, 1])
        word_prob = tf.nn.softmax(
            word_obj_attention_score, dim=0) * word_is_not_pad  # shape: [L, N1, 1]
        word_prob = word_prob / tf.reduce_sum(
            word_prob, 0, keep_dims=True)  # shape: [L, N1, 1]
        word_obj_feat = tf.reduce_sum(word_prob * text_word_embed_feat, axis=0)  # shape: [N1, D2]

        # 2. single score for subject
        region_embed = fc('region_obj_embed', region_feat, output_dim=D2)  # shape: [N2, D2]
        mm_feat = tf.nn.l2_normalize(
            region_embed[tf.newaxis, ...] * tf.reshape(word_obj_feat, [N1, 1, D2]),
            dim=2)  # shape: [N1, N2, D2]
        score = fc('single_score', tf.reshape(mm_feat, [-1, D2]), output_dim=1)  # shape: [N1*N2, 1]
        score = tf.reshape(score, [N1, N2])  # shape: [N1, N2]
        self.prior_score = score
def FindSamePropertyModule(self, input_0, time_idx, batch_idx, map_dim=250,
                           scope='FindSamePropertyModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    # Mapping: att_grid x image_feat_grid x text_param -> att_grid
    # Input:
    #   input_0: [N, H, W, 1]
    #   image_feat_grid: [N, H, W, D_im]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   1. Extract visual features using the input attention map, and
    #      linear transform to map_dim
    #   2. linear transform language features to map_dim
    #   3. Convolve image features to map_dim
    #   4. Element-wise multiplication of the three, l2_normalize, linear transform.
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            image_shape = tf.shape(image_feat_grid)
            N = tf.shape(time_idx)[0]
            H = image_shape[1]
            W = image_shape[2]
            D_im = image_feat_grid.get_shape().as_list()[-1]
            D_txt = text_param.get_shape().as_list()[-1]

            # image_feat_mapped has shape [N, H, W, map_dim]
            image_feat_mapped = _1x1_conv('conv_image', image_feat_grid,
                                          output_dim=map_dim)

            text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

            att_softmax = tf.reshape(
                tf.nn.softmax(tf.reshape(input_0, to_T([N, H*W]))),
                to_T([N, H, W, 1]))
            # att_feat has shape [N, D_vis]
            att_feat = tf.reduce_sum(image_feat_grid * att_softmax, axis=[1, 2])
            att_feat_mapped = tf.reshape(
                fc('fc_att', att_feat, output_dim=map_dim), to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(
                image_feat_mapped * text_param_mapped * att_feat_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

    att_grid.set_shape(self.att_shape)
    return att_grid
def localization_module_batch_score(vis_feat, spatial_feat, lang_feat,
                                    scope="localization_module", reuse=None):
    # Input:
    #   vis_feat: [N_batch, N_vis, D_vis]
    #   spatial_feat: [N_batch, N_vis, D_spatial]
    #   lang_feat: [N_batch, D_lang]
    # Output:
    #   localization_scores: [N_batch, N_vis, 1]
    #
    # This function is not responsible for initializing the variables. Please
    # handle variable initialization outside.
    with tf.variable_scope(scope, reuse=reuse):
        # An embedding module that maps the visual feature plus the spatial feature
        # linearly to the same dimension as the language feature
        N_batch = tf.shape(vis_feat)[0]
        N_vis = tf.shape(vis_feat)[1]
        D_vis = vis_feat.get_shape().as_list()[-1]
        D_spatial = spatial_feat.get_shape().as_list()[-1]
        D_lang = lang_feat.get_shape().as_list()[-1]

        # flatten the visual and spatial features and embed them to the same
        # dimension as the language feature
        vis_spatial_feat = tf.concat([vis_feat, spatial_feat], axis=2)
        vis_spatial_feat = tf.reshape(vis_spatial_feat, [-1, D_vis + D_spatial])
        vis_spatial_embed = fc('vis_spatial_embed', vis_spatial_feat, output_dim=D_lang)

        # Reshape visual feature and language feature for broadcast multiplication
        lang_feat = tf.reshape(lang_feat, [-1, 1, D_lang])
        vis_spatial_embed = tf.reshape(vis_spatial_embed, to_T([N_batch, -1, D_lang]))

        # Elementwise multiplication with language feature and l2-normalization
        eltwise_mult = tf.nn.l2_normalize(vis_spatial_embed * lang_feat, 2)
        eltwise_mult = tf.reshape(eltwise_mult, [-1, D_lang])

        # Localization scores as linear classification over the l2-normalized
        # elementwise product
        localization_scores = fc('localization_scores', eltwise_mult, output_dim=1)
        localization_scores = tf.reshape(localization_scores, to_T([N_batch, N_vis, 1]))

    return localization_scores
def instantiate_batch(self, inputs):
    """
    Inputs:
        output from the previous modules
        image feature for the example
        text attention for all modules for the example
        time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
        with tf.variable_scope(self._scope, reuse=self._reuse):
            # image_feat_mapped has shape [N, H, W, map_dim]
            img_map = _1x1_conv('conv_image', img_feat, output_dim=map_dim)
            # nonlinearity
            img_map = tf.nn.relu(img_map)

            text_map = fc('fc_text', text_att, output_dim=map_dim)
            text_map = tf.reshape(text_map, [-1, 1, 1, map_dim])
            # nonlinearity
            text_map = tf.nn.relu(text_map)

            att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
            att_map = tf.reshape(
                fc('fc_att', att_feats, output_dim=map_dim), [N, 1, 1, map_dim])

            # interact via element wise map
            eltwise_mult = tf.nn.l2_normalize(img_map * text_map * att_map, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

            # softmax
            att_grid_soft = tf.nn.softmax(tf.reshape(att_grid, [-1, H * W]))
            att_grid = tf.reshape(att_grid_soft, [-1, H, W, 1])

    return [att_grid]
def build_output_unit_vqa(q_encoding, m_last, num_choices, apply_dropout,
                          scope='output_unit', reuse=None):
    """
    Apply a 2-layer fully-connected network to predict answers.
    Apply dropout if specified.

    Input:
        q_encoding: [N, d], tf.float32
        m_last: [N, d], tf.float32
    Return:
        vqa_scores: [N, num_choices], tf.float32
    """
    output_dim = cfg.MODEL.VQA_OUTPUT_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.VQA_OUTPUT_USE_QUESTION:
            fc1 = fc_elu(
                'fc1', tf.concat([q_encoding, m_last], axis=1),
                output_dim=output_dim)
        else:
            fc1 = fc_elu('fc1_wo_q', m_last, output_dim=output_dim)
        if apply_dropout:
            fc1 = tf.nn.dropout(fc1, cfg.TRAIN.DROPOUT_KEEP_PROB)
        fc2 = fc('fc2', fc1, output_dim=num_choices,
                 biases_initializer=tf.constant_initializer(
                     cfg.TRAIN.VQA_SCORE_INIT))
        vqa_scores = fc2
        vqa_scores = tf.nn.softmax(vqa_scores)  # TODO edit_vedika to get softmax prob
    return vqa_scores
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)[0]

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1, values=[
        tf.nn.l2_normalize(feat_lang, 1),
        tf.nn.l2_normalize(feat_vis, 1),
        spatial_batch
    ])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
def LessNumModule(self, input_0, input_1, time_idx, batch_idx,
                  scope='LessNumModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    # Mapping: att_grid x att_grid -> answer probs
    # Input:
    #   input_0: [N, H, W, 1]
    #   input_1: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. linear transform of the attention map (also including max and min)
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            att_shape = tf.shape(input_0)
            H, W = self.att_shape[1:3]
            att_all_0 = tf.reshape(input_0, to_T([-1, H*W]))
            att_min_0 = tf.reduce_min(input_0, axis=[1, 2])
            att_max_0 = tf.reduce_max(input_0, axis=[1, 2])
            att_all_1 = tf.reshape(input_1, to_T([-1, H*W]))
            att_min_1 = tf.reduce_min(input_1, axis=[1, 2])
            att_max_1 = tf.reduce_max(input_1, axis=[1, 2])
            # att_concat has shape [N, 2*(H*W + 2)]
            att_concat = tf.concat([att_all_0, att_min_0, att_max_0,
                                    att_all_1, att_min_1, att_max_1], axis=1)
            scores = fc('fc_scores', att_concat, output_dim=self.num_choices)

    return scores
def AnswerModule(self, input_0, time_idx, batch_idx,
                 scope='AnswerModule', reuse=None):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    att_grid = input_0
    # Mapping: att_grid -> answer probs
    # Input:
    #   att_grid: [N, H, W, 1]
    # Output:
    #   answer_scores: [N, self.num_choices]
    #
    # Implementation:
    #   1. Min, average and max pooling over att_grid
    #   2. a linear mapping layer (without ReLU)
    with tf.variable_scope(scope, reuse=reuse):
        att_shape = tf.shape(att_grid)
        N = att_shape[0]
        H = att_shape[1]
        W = att_shape[2]

        att_min = tf.reduce_min(att_grid, axis=[1, 2])
        att_avg = tf.reduce_mean(att_grid, axis=[1, 2])
        att_max = tf.reduce_max(att_grid, axis=[1, 2])
        # att_reduced has shape [N, 3]
        att_reduced = tf.concat([att_min, att_avg, att_max], axis=1)
        scores = fc('fc_scores', att_reduced, output_dim=self.num_choices)

    return scores
def my_fc_layer(input_batch, name, output_dim, apply_dropout=False):
    with tf.variable_scope(name):
        print("input_batch: ", input_batch)
        fc7 = fc('fc', input_batch, output_dim=output_dim)
        print("fc7: ", fc7)
        if apply_dropout:
            fc7 = drop(fc7, 0.5)
    return fc7
def build_output_unit_vqa(q_encoding, m_last, num_choices, apply_dropout,
                          scope='output_unit', reuse=None):
    """
    Apply a 2-layer fully-connected network to predict answers.
    Apply dropout if specified.

    Input:
        q_encoding: [N, d], tf.float32
        m_last: [N, d], tf.float32
    Return:
        vqa_scores: [N, num_choices], tf.float32
    """
    output_dim = cfg.MODEL.VQA_OUTPUT_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.VQA_OUTPUT_USE_QUESTION:
            fc1 = fc_elu('fc1', tf.concat([q_encoding, m_last], axis=1),
                         output_dim=output_dim)
        else:
            fc1 = fc_elu('fc1_wo_q', m_last, output_dim=output_dim)
        if apply_dropout:
            fc1 = tf.nn.dropout(fc1, cfg.TRAIN.DROPOUT_KEEP_PROB)
            print(cfg.TRAIN.DROPOUT_KEEP_PROB)
            ipdb.set_trace()
        fc2 = fc('fc2', fc1, output_dim=num_choices)
        vqa_scores = fc2
    return vqa_scores
def Transform(self, att_stack, stack_ptr, mem_in, c_i, scope='Transform', reuse=None):
    """
    Transforms the previous attention, and updates memory vector.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Get attention
        # 1) linearly map the controller vectors to the KB dimension
        # 2) extract attended features from the input attention
        # 3) elementwise product with KB
        # 4) 1x1 convolution to get attention logits

        # Pop from stack
        att_in = _read_from_stack(att_stack, stack_ptr)
        # stack_ptr = _move_ptr_bw(stack_ptr)  # cancel-out below
        c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
        kb_att_in = _extract_softmax_avg(self.kb_batch, att_in)
        elt_prod = tf.nn.l2_normalize(
            self.kb_batch * c_mapped[:, ax, ax, :] * kb_att_in[:, ax, ax, :],
            axis=-1)
        att_out = _1x1conv('conv_att_out', elt_prod, output_dim=1)

        # Push to stack
        # stack_ptr = _move_ptr_fw(stack_ptr)  # cancel-out above
        att_stack = _write_to_stack(att_stack, stack_ptr, att_out)

    return att_stack, stack_ptr, self.mem_zero
def Find(self, att_stack, stack_ptr, mem_in, c_i, scope='Find', reuse=None):
    """
    Performs localization, and updates memory vector.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Get attention
        # 1) linearly map the controller vectors to the KB dimension
        # 2) elementwise product with KB
        # 3) 1x1 convolution to get attention logits
        c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
        elt_prod = tf.nn.l2_normalize(
            self.kb_batch * c_mapped[:, ax, ax, :], axis=-1)
        att_out = _1x1conv('conv_att_out', elt_prod, output_dim=1)

        # Push to stack
        stack_ptr = _move_ptr_fw(stack_ptr)
        att_stack = _write_to_stack(att_stack, stack_ptr, att_out)

    return att_stack, stack_ptr, self.mem_zero
def TransformModule(self, input_0, time_idx, batch_idx, kernel_size=5,
                    map_dim=250, scope='TransformModule', reuse=True):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    # Mapping: att_grid x text_param -> att_grid
    # Input:
    #   input_0: [N, H, W, 1]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   Convolutional layer that also involves text_param
    #   A 'soft' convolutional kernel that is modulated by text_param
    with tf.variable_scope(self.module_variable_scope):
        with tf.variable_scope(scope, reuse=reuse):
            att_shape = tf.shape(input_0)
            N = att_shape[0]
            H = att_shape[1]
            W = att_shape[2]
            att_maps = _conv('conv_maps', input_0, kernel_size=kernel_size,
                             stride=1, output_dim=map_dim)

            text_param_mapped = fc('text_fc', text_param, output_dim=map_dim)
            text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

            eltwise_mult = tf.nn.l2_normalize(att_maps * text_param_mapped, 3)
            att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

    att_grid.set_shape(self.att_shape)
    return att_grid
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, deeplab_dropout, mlp_dropout):
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
                                             apply_dropout=deeplab_dropout)
    input_dim = 1
    for d in feat_vis.get_shape().as_list()[1:]:
        input_dim *= d
    feat_vis_flatten = tf.reshape(feat_vis, [-1, input_dim])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1, values=[
        tf.nn.l2_normalize(feat_lang, 1),
        tf.nn.l2_normalize(feat_vis_flatten, 1),
        spatial_batch
    ])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
def __init__(self, images, q_encoding, image_valid_batch, num_choices,
             scope='single_hop', reuse=None):
    x_loc = self.loc_init(images, reuse=reuse)
    with tf.variable_scope(scope, reuse=reuse):
        x_loc_shape = tf.shape(x_loc)
        B, H, W = x_loc_shape[0], x_loc_shape[1], x_loc_shape[2]
        dim = x_loc.get_shape().as_list()[-1]  # static shape

        # attention over x_loc
        proj_q = fc('fc_q_map1', q_encoding, output_dim=dim)[:, ax, ax, :]
        interactions = tf.nn.l2_normalize(x_loc * proj_q, axis=-1)
        raw_att = conv('conv_att_score', interactions, kernel_size=1, stride=1,
                       output_dim=1)
        raw_att = tf.reshape(raw_att, to_T([B, H * W]))  # (N, H*W)
        valid_mask = tf.reshape(image_valid_batch, tf.shape(raw_att))
        raw_att = raw_att * valid_mask - 1e18 * (1 - valid_mask)
        att = tf.nn.softmax(raw_att, axis=-1)  # (N, H*W)

        # collect attended image feature
        x_att = tf.matmul(tf.reshape(att, to_T([B, 1, H * W])),
                          tf.reshape(x_loc, to_T([B, H * W, dim])))  # (N, 1, D_kb)
        x_att = tf.reshape(x_att, to_T([B, dim]))  # (N, D_kb)

        # VQA classification
        eQ = fc('fc_q_map2', q_encoding, output_dim=dim)
        if cfg.OUT_QUESTION_MUL:
            features = tf.concat([x_att, eQ, x_att * eQ], axis=-1)
        else:
            features = tf.concat([x_att, eQ], axis=-1)

        fc1 = fc_relu('fc_hidden', features, output_dim=cfg.OUT_CLASSIFIER_DIM)
        logits = fc('fc_scores', fc1, output_dim=num_choices)
        self.logits = logits
def build_output_unit_rec(rec_inputs, input_seq_batch, embed_seq,
                          seq_length_batch, num_vocab, scope='output_unit_rec',
                          reuse=None):
    """
    Try to reconstruct the input sequence from the controller outputs with a
    seq-to-seq LSTM.

    Input:
        rec_inputs: [T, N, ?], tf.float32
        input_seq_batch: [S, N], tf.int32
        embed_seq: [S, N, e], tf.float32
        seq_length_batch: [N], tf.int32
    Return:
        loss_rec: [], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        S = tf.shape(input_seq_batch)[0]
        N = tf.shape(input_seq_batch)[1]
        lstm_dim = cfg.MODEL.LSTM_DIM

        # encoder
        cell_encoder = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim, name='c_encoder')
        _, states_encoder = tf.nn.dynamic_rnn(
            cell_encoder, rec_inputs, dtype=tf.float32, time_major=True)
        # decoder
        cell_decoder = tf.nn.rnn_cell.BasicLSTMCell(lstm_dim, name='c_decoder')
        embed_seq_shifted = tf.concat(
            [tf.zeros_like(embed_seq[:1]), embed_seq[:-1]], axis=0)
        outputs_decoder, _ = tf.nn.dynamic_rnn(
            cell_decoder, embed_seq_shifted, sequence_length=seq_length_batch,
            initial_state=states_encoder, time_major=True)

        # word prediction
        outputs_flat = tf.reshape(outputs_decoder, to_T([S * N, lstm_dim]))
        word_scores_flat = fc('fc_word_scores', outputs_flat, output_dim=num_vocab)
        word_scores = tf.reshape(word_scores_flat, to_T([S, N, num_vocab]))

        # cross-entropy loss over the actual sequence words
        # att_mask: [S, N]
        att_mask = tf.less(tf.range(S)[:, ax], seq_length_batch)
        att_mask = tf.cast(att_mask, tf.float32)
        loss_rec = tf.reduce_sum(
            att_mask * tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=word_scores, labels=input_seq_batch)) / tf.reduce_sum(att_mask)

    return loss_rec
def deeplab_fc8(input_batch, name, apply_dropout=False):
    pool5a = deeplab_pool5(input_batch, name)
    with tf.variable_scope(name):
        fc6 = fc_relu('fc6', pool5a, output_dim=1024)
        if apply_dropout:
            fc6 = drop(fc6, 0.5)
        fc7 = fc_relu('fc7', fc6, output_dim=1024)
        if apply_dropout:
            fc7 = drop(fc7, 0.5)
        fc8 = fc('fc8', fc7, output_dim=1000)
    return fc8
def FindModule(self, time_idx, batch_idx, map_dim=500, scope='FindModule', reuse=None):
    # In TF Fold, batch_idx and time_idx are both [N_batch, 1] tensors
    image_feat_grid = self._slice_image_feat_grid(batch_idx)
    text_param = self._slice_word_vecs(time_idx, batch_idx)
    # Mapping: image_feat_grid x text_param -> att_grid
    # Input:
    #   image_feat_grid: [N, H, W, D_im]
    #   text_param: [N, D_txt]
    # Output:
    #   att_grid: [N, H, W, 1]
    #
    # Implementation:
    #   1. Elementwise multiplication between image_feat_grid and text_param
    #   2. L2-normalization
    #   3. Linear classification
    with tf.variable_scope(scope, reuse=reuse):
        image_shape = tf.shape(image_feat_grid)
        N = tf.shape(time_idx)[0]
        H = image_shape[1]
        W = image_shape[2]
        D_im = image_feat_grid.get_shape().as_list()[-1]
        D_txt = text_param.get_shape().as_list()[-1]

        # image_feat_mapped has shape [N, H, W, map_dim]
        image_feat_mapped = _1x1_conv('conv_image', image_feat_grid, output_dim=map_dim)

        text_param_mapped = fc('fc_text', text_param, output_dim=map_dim)
        text_param_mapped = tf.reshape(text_param_mapped, to_T([N, 1, 1, map_dim]))

        #########################
        # Manual expansion of the commented-out l2_normalize call below
        # (assumes `from tensorflow.python.ops import math_ops`):
        # eltwise_mult = tf.nn.l2_normalize(image_feat_mapped * text_param_mapped, 3)
        # with ops.name_scope(name, "l2_normalize", [x]) as name:
        x = image_feat_mapped * text_param_mapped
        square_sum = math_ops.reduce_sum(math_ops.square(x), 3, keep_dims=True)
        x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, 1e-12))
        eltwise_mult = math_ops.multiply(x, x_inv_norm, name=None)

        att_grid = _1x1_conv('conv_eltwise', eltwise_mult, output_dim=1)

        # TODO
        # Do we need to take exponential over the scores?
        #   No.
        # Does the attention needs to be normalized? (sum up to 1)
        #   No, since non-existence should be 0 everywhere
    return att_grid
def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
    if dropout is None:
        dropout = self.dropout
    text_seq_batch = tf.transpose(sen_data, [1, 0])  # input data is [num_steps, batch_size]

    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        if self.embed_w is None:
            initializer = tf.contrib.layers.xavier_initializer(uniform=True)
        else:
            initializer = tf.constant_initializer(self.embed_w)
        embedding_mat = tf.get_variable("embedding", [self.vocab_size, self.lstm_dim],
                                        tf.float32, initializer=initializer)
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    # encode phrase based on the last step of hidden states
    outputs, _, _ = bi_lstm('lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
                            num_layers=1, forget_bias=1.0, apply_dropout=False,
                            concat_output=False,
                            initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    sen_raw = outputs[-1]
    vis_raw = tf.reshape(vis_data, [self.batch_size*self.num_prop, self.img_feat_size])

    sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
    vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

    sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, 2*self.lstm_dim])  # bi-directional lstm: hidden_size double
    vis_output = tf.reshape(vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

    sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
    feat_concat = tf.concat([sen_tile, vis_output], axis=3)

    feat_proj_init = msr_init([1, 1, 2*self.lstm_dim+self.img_feat_size, self.hidden_size])
    feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size,
                     weights_initializer=feat_proj_init)
    feat_relu = tf.nn.relu(feat_proj)

    att_conv_init = msr_init([1, 1, self.hidden_size, 5])
    att_conv = conv("att_conv", feat_relu, 1, 1, 5, weights_initializer=att_conv_init)
    att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop, 5])

    att_logits = tf.reshape(att_scores[:, :, 0], [self.batch_size, self.num_prop])
    _, pred_ind = tf.nn.top_k(att_logits, self.top_k)
    pred_ind = tf.reshape(pred_ind, [self.batch_size*self.top_k, 1])
    row_ind = tf.reshape(tf.range(0, self.batch_size), [-1, 1])
    row_ind = tf.reshape(tf.tile(row_ind, [1, self.top_k]), [self.top_k*self.batch_size, 1])
    pred_ind = tf.concat([row_ind, pred_ind], axis=1)

    # (batch_size*top_k) x img_feat_size
    vis_top = tf.gather_nd(
        tf.reshape(vis_output, [self.batch_size, self.num_prop, self.img_feat_size]),
        pred_ind)
    vis_ref = tf.reduce_mean(
        tf.reshape(vis_top, [self.batch_size, self.top_k, self.img_feat_size]), 1)

    ref_feat = tf.concat([vis_ref, sen_bn], axis=1)
    # ref_feat = vis_ref
    reward_pred = tf.reshape(tf.sigmoid(fc('reward_pred', ref_feat, 1)), [self.batch_size])

    return att_scores, reward_pred
def DescribeTwo(self, att_stack, stack_ptr, mem_in, c_i, scope='DescribeTwo', reuse=None):
    """
    Describe using two input attentions. Outputs zero attention.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Update memory:
        # 1) linearly map the controller vectors to the KB dimension
        # 2) extract attended features from the input attention
        # 3) elementwise multiplication
        # 4) linearly merge with previous memory vector, find memory
        #    vector and control state
        att_stack_old, stack_ptr_old = att_stack, stack_ptr  # make a copy

        # Pop from stack
        att_in_2 = _read_from_stack(att_stack, stack_ptr)
        stack_ptr = _move_ptr_bw(stack_ptr)
        att_in_1 = _read_from_stack(att_stack, stack_ptr)
        # stack_ptr = _move_ptr_bw(stack_ptr)  # cancel-out below
        c_mapped = fc('fc_c_mapped', c_i, output_dim=cfg.MODEL.KB_DIM)
        kb_att_in_1 = _extract_softmax_avg(self.kb_batch, att_in_1)
        kb_att_in_2 = _extract_softmax_avg(self.kb_batch, att_in_2)
        elt_prod = tf.nn.l2_normalize(c_mapped * kb_att_in_1 * kb_att_in_2, axis=-1)
        mem_out = fc('fc_mem_out', tf.concat([c_i, mem_in, elt_prod], axis=1),
                     output_dim=self.mem_dim)

        # Push to stack
        # stack_ptr = _move_ptr_fw(stack_ptr)  # cancel-out above
        att_stack = _write_to_stack(att_stack, stack_ptr, self.att_zero)
        if cfg.MODEL.NMN.DESCRIBE_TWO.KEEP_STACK:
            att_stack, stack_ptr = att_stack_old, stack_ptr_old

    return att_stack, stack_ptr, mem_out
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")
        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")
        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=2)
    return outputs
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test):
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print("Building training network...............................\n")
        transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size)
        transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
        sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size, self.batch_size, 3])

        tf.get_variable_scope().reuse_variables()
        print("Building test network...............................\n")
        transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(sim_score_mat_test, [3])

        return sim_score_mat_train, sim_score_mat_test
def visual_semantic_infer(self, visual_feature_train, sentence_embed_train, visual_feature_test, sentence_embed_test, sentence_ph_train_len, sentence_ph_test_len):
    name = "CTRL_Model"
    with tf.variable_scope(name):
        print("Building training network...............................\n")
        transformed_clip_train = fc('v2s_lt', visual_feature_train, output_dim=self.semantic_size)
        transformed_clip_train_norm = tf.nn.l2_normalize(transformed_clip_train, dim=1)
        if self.useLSTM:
            sentence_embed_train = self.lstm_embed(sentence_embed_train, sentence_ph_train_len)
        transformed_sentence_train = fc('s2s_lt', sentence_embed_train, output_dim=self.semantic_size)
        transformed_sentence_train_norm = tf.nn.l2_normalize(transformed_sentence_train, dim=1)
        cross_modal_vec_train = self.cross_modal_comb(transformed_clip_train_norm, transformed_sentence_train_norm, self.batch_size)
        sim_score_mat_train = vs_multilayer.vs_multilayer(cross_modal_vec_train, "vs_multilayer_lt", middle_layer_dim=1000)
        sim_score_mat_train = tf.reshape(sim_score_mat_train, [self.batch_size, self.batch_size, 3])

        tf.get_variable_scope().reuse_variables()
        print("Building test network...............................\n")
        transformed_clip_test = fc('v2s_lt', visual_feature_test, output_dim=self.semantic_size)
        transformed_clip_test_norm = tf.nn.l2_normalize(transformed_clip_test, dim=1)
        if self.useLSTM:
            sentence_embed_test = self.lstm_embed(sentence_embed_test, sentence_ph_test_len)
        transformed_sentence_test = fc('s2s_lt', sentence_embed_test, output_dim=self.semantic_size)
        transformed_sentence_test_norm = tf.nn.l2_normalize(transformed_sentence_test, dim=1)
        cross_modal_vec_test = self.cross_modal_comb(transformed_clip_test_norm, transformed_sentence_test_norm, self.test_batch_size)
        sim_score_mat_test = vs_multilayer.vs_multilayer(cross_modal_vec_test, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test = tf.reshape(sim_score_mat_test, [self.test_batch_size, self.test_batch_size, 3])

        cross_modal_vec_test_1 = self.cross_modal_comb(tf.reshape(transformed_clip_test_norm[1], shape=(1, 1024)), tf.reshape(transformed_sentence_test_norm[1], shape=(1, 1024)), 1)
        sim_score_mat_test_1 = vs_multilayer.vs_multilayer(cross_modal_vec_test_1, "vs_multilayer_lt", reuse=True, middle_layer_dim=1000)
        sim_score_mat_test_1 = tf.reshape(sim_score_mat_test_1, [3])

        return sim_score_mat_train, sim_score_mat_test, sim_score_mat_test_1
def instantiate_batch(self, inputs):
    """
    Inputs:
        output from the previous modules
        image feature for the example
        text attention for all modules for the example
        time id for current module
    """
    vis_att, img_feat, text_att = inputs

    # text feature dimension, intermediate mapping dimension
    # batch size, image feature height and width
    text_dim = text_att.shape.as_list()[-1]
    map_dim = self._params['map_dim']
    encode_size = self._params['encode_size']
    N = tf.shape(img_feat)[0]
    H, W = img_feat.shape.as_list()[1:3]

    with tf.variable_scope(self._module_scope):
        with tf.variable_scope(self._scope, reuse=self._reuse):
            text_map = fc('fc_text', text_att, output_dim=map_dim)
            # nonlinearity
            text_map = tf.nn.relu(text_map)

            # att_feat, att_feat_1 has shape [N, D_vis]
            att_feats = tf.reduce_sum(img_feat * vis_att, axis=[1, 2])
            img_map = tf.reshape(
                fc('fc_att', att_feats, output_dim=map_dim), [N, map_dim])
            # nonlinearity
            img_map = tf.nn.relu(img_map)

            eltwise_mult = tf.nn.l2_normalize(img_map * text_map, 1)
            context = fc('fc_eltwise', eltwise_mult, output_dim=encode_size)

    return [context]
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch, num_vocab,
                       embed_dim, lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them
    feat_all = tf.concat(axis=1, values=[tf.nn.l2_normalize(feat_lang, 1),
                                         tf.nn.l2_normalize(feat_vis, 1),
                                         spatial_batch])

    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)

    return mlp_l2
lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

# Local image feature
fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

# L2-normalize the features (except for spatial_batch)
# and concatenate them along axis 1 (feature dimension)
# (lstm_top_batch, fc8_crop_batch and spatial_batch are assumed to be defined
# earlier in the full script; see the sketch below)
feat_all = tf.concat(axis=1, values=[tf.nn.l2_normalize(lstm_top_batch, 1),
                                     tf.nn.l2_normalize(fc8_crop_batch, 1),
                                     spatial_batch])

# Outputs
# MLP Classifier over concatenate feature
with tf.variable_scope('classifier'):
    mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
    mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
scores = mlp_l2

# Load pretrained model
snapshot_saver = tf.train.Saver()
sess = tf.Session()
snapshot_saver.restore(sess, pretrained_model)

################################################################################
# Load annotations and bounding box proposals
################################################################################

query_dict = json.load(open(query_file))
bbox_dict = json.load(open(bbox_file))
imcrop_dict = json.load(open(imcrop_file))
imsize_dict = json.load(open(imsize_file))
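# NOTE: the fragment above references text_seq_batch, imcrop_batch, lstm_top_batch,
# fc8_crop_batch, spatial_batch and mlp_hidden_dims, which are presumably defined
# earlier in the full script. A hypothetical sketch of those definitions follows;
# the names match the fragment, but the sizes below are assumptions, not the
# original configuration.
T = 20           # max number of words per query
N = 100          # number of box proposals scored per forward pass
D_text = 1000    # lstm_dim, language feature size
D_im = 1000      # vgg_fc8 output size
D_spatial = 8    # spatial feature size
mlp_hidden_dims = 500

text_seq_batch = tf.placeholder(tf.int32, [T, 1])             # one query at a time
imcrop_batch = tf.placeholder(tf.float32, [N, 224, 224, 3])   # cropped box proposals
lstm_top_batch = tf.placeholder(tf.float32, [N, D_text])      # fed with the extracted lstm_top
fc8_crop_batch = tf.placeholder(tf.float32, [N, D_im])        # fed with the extracted fc8_crop
spatial_batch = tf.placeholder(tf.float32, [N, D_spatial])    # box spatial features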
def vgg_fc8(input_batch, name, apply_dropout, output_dim=1000):
    fc7 = vgg_fc7(input_batch, name, apply_dropout)
    with tf.variable_scope(name):
        # layer 8 (no ReLU after fc8)
        fc8 = fc('fc8', fc7, output_dim=output_dim)
    return fc8