def rpn_net(conv5, im_info, name, feat_stride=16, anchor_scales=(8, 16, 32),
            phase='TEST'):
    with tf.variable_scope(name):
        # rpn_conv/3x3
        rpn_conv = conv_relu('rpn_conv/3x3', conv5, kernel_size=3, stride=1,
                             output_dim=512)
        # rpn_cls_score
        # Note that we've already subtracted the bg weights from fg weights
        # and do sigmoid instead of softmax (actually sigmoid is not needed
        # for ranking)
        rpn_cls_score = conv('rpn_cls_score', rpn_conv, kernel_size=1,
                             stride=1, output_dim=len(anchor_scales) * 3)
        # rpn_bbox_pred
        rpn_bbox_pred = conv('rpn_bbox_pred', rpn_conv, kernel_size=1,
                             stride=1, output_dim=len(anchor_scales) * 3 * 4)
        rois = tf.py_func(ProposalLayer(feat_stride, anchor_scales, phase),
                          [rpn_cls_score, rpn_bbox_pred, im_info],
                          [tf.float32], stateful=False)[0]
        rois.set_shape([None, 5])
    return rois
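# Sketch (assumption): ProposalLayer above is a stateless callable wrapped by
# tf.py_func. It receives numpy arrays (class scores, bbox deltas, im_info)
# and must return one float32 array of shape [R, 5] to match
# rois.set_shape([None, 5]). The skeleton below only shows that interface;
# the real layer would generate anchors, apply the deltas, clip the boxes,
# and run NMS.
import numpy as np

class ProposalLayerSketch:
    def __init__(self, feat_stride, anchor_scales, phase):
        self.feat_stride = feat_stride
        self.anchor_scales = anchor_scales
        self.phase = phase

    def __call__(self, rpn_cls_score, rpn_bbox_pred, im_info):
        # placeholder: return an empty [R, 5] proposal array
        # (batch index + 4 box coordinates per row)
        return np.zeros((0, 5), dtype=np.float32)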
def build_output_unit_loc(q_encoding, kb_batch, att_last,
                          scope='output_unit_loc', reuse=None):
    """
    Apply a 1-layer convolution network to predict localization scores.
    Apply dropout if specified.

    Input:
        kb_batch: [N, H, W, d], tf.float32
        att_last: [N, H, W, 1], tf.float32
    Return:
        loc_scores: [N, H*W], tf.float32
        bbox_offset: [N, 4], tf.float32
    """
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.LOC_SCORES_POS_AFFINE:
            # make sure att signs do not flip
            w = tf.abs(tf.get_variable('loc_scores_affine_raw_w', []))
            b = tf.get_variable('loc_scores_affine_b', [])
            loc_scores = w * att_last + b
        else:
            loc_scores = conv(
                'conv_loc', att_last, kernel_size=3, stride=1, output_dim=1)
        loc_scores = tf.reshape(
            loc_scores, [-1, cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT])
        # extract the attended features for bounding box regression
        if cfg.MODEL.BBOX_REG_AS_FCN:
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                bbox_offset_input = tf.nn.l2_normalize(
                    q_mapped[:, ax, ax, :] * kb_batch, axis=-1)
            else:
                bbox_offset_input = kb_batch
            bbox_offset_fcn = conv(
                'conv_bbox_offset', bbox_offset_input, 1, 1, output_dim=4)
            N = tf.shape(bbox_offset_fcn)[0]
            B = cfg.MODEL.H_FEAT*cfg.MODEL.W_FEAT  # B = H*W
            # bbox_offset_fcn [N, B, 4] is used for training
            bbox_offset_fcn = tf.reshape(bbox_offset_fcn, to_T([N, B, 4]))
            # bbox_offset [N, 4] is only used for prediction
            bbox_offset_flat = tf.reshape(bbox_offset_fcn, to_T([N*B, 4]))
            slice_inds = tf.range(N) * B + tf.argmax(
                loc_scores, axis=-1, output_type=tf.int32)
            bbox_offset = tf.gather(bbox_offset_flat, slice_inds)
        else:
            bbox_offset_fcn = None
            kb_loc = _extract_softmax_avg(kb_batch, att_last)
            if cfg.MODEL.BBOX_REG_USE_QUESTION:
                q_mapped = fc(
                    'fc_q_mapped', q_encoding, output_dim=cfg.MODEL.KB_DIM)
                elt_prod = tf.nn.l2_normalize(q_mapped * kb_loc, axis=-1)
                bbox_offset = fc(
                    'fc_bbox_offset_with_q', elt_prod, output_dim=4)
            else:
                bbox_offset = fc('fc_bbox_offset', kb_loc, output_dim=4)
    return loc_scores, bbox_offset, bbox_offset_fcn
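# Sketch (assumption): _extract_softmax_avg above is not defined in this
# collection. Given the shapes in the docstring (kb_batch [N, H, W, d],
# att_last [N, H, W, 1] -> kb_loc [N, d]), it plausibly softmaxes the raw
# attention over spatial positions and takes the attention-weighted average
# of the knowledge-base features; the original may differ.
def _extract_softmax_avg_sketch(kb_batch, att_raw):
    shape = tf.shape(kb_batch)
    N, H, W = shape[0], shape[1], shape[2]
    d = kb_batch.get_shape().as_list()[-1]  # static channel dim
    att = tf.nn.softmax(tf.reshape(att_raw, [N, H * W]), axis=-1)  # [N, H*W]
    kb_flat = tf.reshape(kb_batch, [N, H * W, d])                  # [N, H*W, d]
    # attention-weighted average over spatial positions -> [N, d]
    return tf.squeeze(tf.matmul(tf.expand_dims(att, 1), kb_flat), axis=1)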
def build_kb_batch(image_feat_batch, scope='kb_batch', reuse=None):
    """
    Concatenate the image feature batch and the position encoding batch,
    and apply a 2-layer CNN on top of it.

    Input:
        image_feat_batch: [N, H, W, C], tf.float32
    Return:
        kb_batch: [N, H, W, d], tf.float32
    """
    kb_dim = cfg.MODEL.KB_DIM
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.MODEL.INPUT.USE_L2_NORMALIZATION:
            norm_type = cfg.MODEL.INPUT.L2_NORMALIZATION_TYPE
            if norm_type == 'global':
                # normalize along H, W, C
                image_feat_batch = tf.nn.l2_normalize(
                    image_feat_batch, axis=[1, 2, 3])
            elif norm_type == 'local':
                # normalize along C
                image_feat_batch = tf.nn.l2_normalize(
                    image_feat_batch, axis=-1)
            else:
                raise ValueError('Invalid l2 normalization type: ' + norm_type)
        if cfg.MODEL.INPUT.USE_POSITION_ENCODING:
            # get positional encoding
            N = tf.shape(image_feat_batch)[0]
            _, H, W, _ = image_feat_batch.get_shape().as_list()
            position_encoding = to_T(
                get_positional_encoding(H, W), dtype=tf.float32)
            position_batch = tf.tile(position_encoding, to_T([N, 1, 1, 1]))

            # apply a two-layer convnet with ELU activation
            conv1 = conv_elu(
                'conv1', tf.concat([image_feat_batch, position_batch], axis=3),
                kernel_size=1, stride=1, output_dim=kb_dim)
            conv2 = conv(
                'conv2', conv1, kernel_size=1, stride=1, output_dim=kb_dim)
            kb_batch = conv2
        else:
            kb_batch = conv(
                'conv_no_pe', image_feat_batch,
                kernel_size=1, stride=1, output_dim=kb_dim)
    return kb_batch
def model_structure(self, sen_data, vis_data, batch_size, is_train,
                    dropout=None):
    if dropout is None:
        dropout = self.dropout
    # input data is [num_steps, batch_size]
    text_seq_batch = tf.transpose(sen_data, [1, 0])
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        if self.embed_w is None:
            initializer = tf.contrib.layers.xavier_initializer(uniform=True)
        else:
            initializer = tf.constant_initializer(self.embed_w)
        embedding_mat = tf.get_variable(
            "embedding", [self.vocab_size, self.lstm_dim], tf.float32,
            initializer=initializer)
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    # encode the phrase based on the last step of hidden states
    outputs, _, _ = bi_lstm(
        'lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
        num_layers=1, forget_bias=1.0, apply_dropout=False,
        concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    sen_raw = outputs[-1]
    vis_raw = tf.reshape(
        vis_data, [self.batch_size * self.num_prop, self.img_feat_size])

    sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
    vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

    # bi-directional lstm: hidden size is doubled
    sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, 2 * self.lstm_dim])
    vis_output = tf.reshape(
        vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

    sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
    feat_concat = tf.concat([sen_tile, vis_output], 3)

    feat_proj_init = msr_init(
        [1, 1, 2 * self.lstm_dim + self.img_feat_size, self.hidden_size])
    feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size,
                     weights_initializer=feat_proj_init)
    feat_relu = tf.nn.relu(feat_proj)

    att_conv_init = msr_init([1, 1, self.hidden_size, 5])
    att_conv = conv("att_conv", feat_relu, 1, 1, 5,
                    weights_initializer=att_conv_init)
    att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop, 5])

    att_logits = tf.reshape(att_scores[:, :, 0],
                            [self.batch_size, self.num_prop])
    _, pred_ind = tf.nn.top_k(att_logits, self.top_k)
    pred_ind = tf.reshape(pred_ind, [self.batch_size * self.top_k, 1])
    row_ind = tf.reshape(tf.range(0, self.batch_size), [-1, 1])
    row_ind = tf.reshape(tf.tile(row_ind, [1, self.top_k]),
                         [self.top_k * self.batch_size, 1])
    pred_ind = tf.concat([row_ind, pred_ind], 1)

    # (batch_size*top_k) x img_feat_size
    vis_top = tf.gather_nd(
        tf.reshape(vis_output,
                   [self.batch_size, self.num_prop, self.img_feat_size]),
        pred_ind)
    vis_ref = tf.reduce_mean(
        tf.reshape(vis_top,
                   [self.batch_size, self.top_k, self.img_feat_size]), 1)

    ref_feat = tf.concat([vis_ref, sen_bn], 1)
    # ref_feat = vis_ref
    reward_pred = tf.reshape(
        tf.sigmoid(fc('reward_pred', ref_feat, 1)), [self.batch_size])

    return att_scores, reward_pred
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, vgg_dropout, mlp_dropout):
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
                                         apply_dropout=vgg_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat([tf.nn.l2_normalize(feat_lang, 3),
                          tf.nn.l2_normalize(feat_vis, 3),
                          spatial_batch], axis=3)

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
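# Sketch (assumption): generate_spatial_batch above is expected to return a
# numpy array of shape [N, H, W, 8] holding normalized box coordinates for
# each feature-map cell, as commonly used in referring-expression
# segmentation models; the exact encoding in the original helper may differ.
import numpy as np

def generate_spatial_batch_sketch(N, featmap_H, featmap_W):
    spatial = np.zeros((N, featmap_H, featmap_W, 8), dtype=np.float32)
    for h in range(featmap_H):
        for w in range(featmap_W):
            xmin = w / featmap_W * 2 - 1
            xmax = (w + 1) / featmap_W * 2 - 1
            ymin = h / featmap_H * 2 - 1
            ymax = (h + 1) / featmap_H * 2 - 1
            spatial[:, h, w, :] = [
                xmin, ymin, xmax, ymax, (xmin + xmax) / 2, (ymin + ymax) / 2,
                1.0 / featmap_W, 1.0 / featmap_H]
    return spatial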
def build_kb_batch(image_feat_batch, scope='kb_batch', reuse=None):
    """
    Concatenate the image feature batch and the position encoding batch,
    and apply a 2-layer CNN on top of it.

    Input:
        image_feat_batch: [N, H, W, C], tf.float32
    Return:
        kb_batch: [N, H, W, d], tf.float32
    """
    kb_dim = cfg.MODEL.KB_DIM
    with tf.variable_scope(scope, reuse=reuse):
        # get positional encoding
        N = tf.shape(image_feat_batch)[0]
        _, H, W, _ = image_feat_batch.get_shape().as_list()
        position_encoding = to_T(
            get_positional_encoding(H, W), dtype=tf.float32)
        position_batch = tf.tile(position_encoding, to_T([N, 1, 1, 1]))

        # apply a two-layer convnet with ELU activation
        conv1 = conv_elu(
            'conv1', tf.concat([image_feat_batch, position_batch], axis=3),
            kernel_size=1, stride=1, output_dim=kb_dim)
        conv2 = conv(
            'conv2', conv1, kernel_size=1, stride=1, output_dim=kb_dim)
        kb_batch = conv2
    return kb_batch
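# Sketch (assumption): get_positional_encoding above must return a
# [1, H, W, pe_dim] array that is tiled across the batch and concatenated to
# the image features channel-wise. A minimal 2-channel version with x/y
# coordinates normalized to [-1, 1]; the original encoding (e.g. a
# higher-dimensional or sinusoidal one) may differ.
import numpy as np

def get_positional_encoding_sketch(H, W):
    x = np.linspace(-1, 1, W, dtype=np.float32)
    y = np.linspace(-1, 1, H, dtype=np.float32)
    xs, ys = np.meshgrid(x, y)                       # each [H, W]
    return np.stack([xs, ys], axis=-1)[np.newaxis]   # [1, H, W, 2]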
def _1x1conv(name, bottom, output_dim, reuse=None):
    return conv(name, bottom, kernel_size=1, stride=1,
                output_dim=output_dim, reuse=reuse)
def forward(self, imcrop_batch, text_seq_batch, is_training=True):
    num_vocab, embed_dim, lstm_dim, mlp_hidden_dims = (
        self.num_vocab, self.embed_dim, self.lstm_dim, self.mlp_hidden_dims)
    deeplab_dropout = self.kwargs.get('deeplab_dropout', False)
    mlp_dropout = self.kwargs.get('mlp_dropout', False)

    with tf.variable_scope(self.model_name):
        # Language feature (LSTM hidden state)
        feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                      lstm_dim)[0]

        # Local image feature
        feat_vis = deeplab.deeplab_fc8_full_conv(
            imcrop_batch, 'deeplab', apply_dropout=deeplab_dropout)

        # Reshape and tile LSTM top
        featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
        N, D_text = feat_lang.get_shape().as_list()
        feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                            [1, featmap_H, featmap_W, 1])

        # L2-normalize the features (except for spatial_batch)
        # and concatenate them along axis 3 (channel dimension)
        spatial_batch = tf.convert_to_tensor(
            generate_spatial_batch(N, featmap_H, featmap_W))
        feat_all = tf.concat(axis=3, values=[
            tf.nn.l2_normalize(feat_lang, 3),
            tf.nn.l2_normalize(feat_vis, 3),
            spatial_batch])

        # MLP classifier over the concatenated feature
        with tf.variable_scope('classifier'):
            mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                               output_dim=mlp_hidden_dims)
            if mlp_dropout:
                mlp_l1 = drop(mlp_l1, 0.5)
            mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1,
                          output_dim=1)

        upsample8s = deconv('upsample8s', mlp_l2, kernel_size=16, stride=8,
                            output_dim=1, bias_term=False)
    return upsample8s
def vgg_fc8_full_conv(input_batch, name, apply_dropout, output_dim=1000,
                      reuse=None):
    fc7 = vgg_fc7_full_conv(input_batch, name, apply_dropout, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # layer 8 (no ReLU after fc8)
        fc8 = conv('fc8', fc7, kernel_size=1, stride=1, output_dim=output_dim)
    return fc8
def conv_net_bn(input_batch, name, phase):
    with tf.variable_scope(name):
        # conv1: 2*2@4/2
        conv1 = conv_relu_bn('conv1', input_batch, phase, kernel_size=2,
                             stride=2, output_dim=4)
        print("conv1: ", conv1)
        # conv2: 2*2@4/1
        conv2 = conv_relu_bn('conv2', conv1, phase, kernel_size=2, stride=1,
                             output_dim=4)
        print("conv2: ", conv2)
        # conv3: 2*2@8/2
        conv3 = conv_relu_bn('conv3', conv2, phase, kernel_size=2, stride=2,
                             output_dim=8)
        print("conv3: ", conv3)
        # conv4: 2*2@8/1
        conv4 = conv_relu_bn('conv4', conv3, phase, kernel_size=2, stride=1,
                             output_dim=8)
        print("conv4: ", conv4)
        # conv5: 2*2@8/2
        conv5 = conv_relu_bn('conv5', conv4, phase, kernel_size=2, stride=2,
                             output_dim=8)
        print("conv5: ", conv5)
        # conv6: 2*2@8/1 tanh
        conv6 = conv('conv6', conv5, kernel_size=2, stride=1, output_dim=8)
        conv6 = tf.contrib.layers.batch_norm(
            conv6, center=True, scale=True, is_training=phase, scope='bn')
        print("conv6: ", conv6)
        tanh = tf.nn.tanh(conv6)
    return tanh
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    with tf.variable_scope(name):
        if reuse:
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")
        layer1 = conv_relu('layer1', input_batch, kernel_size=1, stride=1,
                           output_dim=middle_layer_dim)
        sim_score = conv('layer2', layer1, kernel_size=1, stride=1,
                         output_dim=3)
    return sim_score
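# Hypothetical usage sketch for vs_multilayer: call it once to create the
# variables and again with reuse=True to score a second batch with shared
# weights (the placeholder shapes and names below are illustrative
# assumptions, not from the original code).
clip_feat_a = tf.placeholder(tf.float32, [None, 1, 1, 4096])
clip_feat_b = tf.placeholder(tf.float32, [None, 1, 1, 4096])
scores_a = vs_multilayer(clip_feat_a, 'vs_multilayer', reuse=False)
scores_b = vs_multilayer(clip_feat_b, 'vs_multilayer', reuse=True)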
def model_structure(self, sen_data, vis_data, batch_size, is_train,
                    dropout=None):
    if dropout is None:
        dropout = self.dropout
    # input data is [num_steps, batch_size]
    text_seq_batch = tf.transpose(sen_data, [1, 0])
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable(
            "embedding", [self.vocab_size, self.lstm_dim], tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    # we encode phrase based on the last step of hidden states
    _, states = lstm(
        'lstm_lang', embedded_seq, None, output_dim=self.lstm_dim,
        num_layers=1, forget_bias=1.0, apply_dropout=False,
        concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    # batch normalization for visual and language part
    sen_raw = states[-1].h
    vis_raw = tf.reshape(
        vis_data, [self.batch_size * self.num_prop, self.img_feat_size])

    sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
    vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

    sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, self.lstm_dim])
    vis_output = tf.reshape(
        vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

    sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
    feat_concat = tf.concat([sen_tile, vis_output], 3)

    feat_proj_init = msr_init(
        [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
    feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size,
                     weights_initializer=feat_proj_init)
    feat_relu = tf.nn.relu(feat_proj)

    att_conv_init = msr_init([1, 1, self.hidden_size, 1])
    att_conv = conv("att_conv", feat_relu, 1, 1, 1,
                    weights_initializer=att_conv_init)
    att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop])

    return att_scores
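# Sketch (assumption): the bn helper used above, called as
# bn(x, is_train, scope, 0.9), is not defined in this collection. A minimal
# wrapper around tf.contrib.layers.batch_norm, treating the last argument as
# the moving-average decay; the original wrapper may differ.
def bn_sketch(x, is_train, scope, decay=0.9):
    return tf.contrib.layers.batch_norm(
        x, decay=decay, center=True, scale=True,
        is_training=is_train, scope=scope)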
def loc_init(self, images, scope='kb_batch', reuse=None):
    """
    Linearly transform the input features to a fixed dimension cfg.LOC_DIM
    """
    with tf.variable_scope(scope, reuse=reuse):
        if cfg.STEM_NORMALIZE:
            images = tf.nn.l2_normalize(images, axis=-1)
        # apply a single-layer convnet
        conv1 = conv('conv1', images, kernel_size=1, stride=1,
                     output_dim=cfg.LOC_DIM)
    return conv1
def __init__(self, images, q_encoding, image_valid_batch, num_choices,
             scope='single_hop', reuse=None):
    x_loc = self.loc_init(images, reuse=reuse)
    with tf.variable_scope(scope, reuse=reuse):
        x_loc_shape = tf.shape(x_loc)
        B, H, W = x_loc_shape[0], x_loc_shape[1], x_loc_shape[2]
        dim = x_loc.get_shape().as_list()[-1]  # static shape

        # attention over x_loc
        proj_q = fc('fc_q_map1', q_encoding, output_dim=dim)[:, ax, ax, :]
        interactions = tf.nn.l2_normalize(x_loc * proj_q, axis=-1)
        raw_att = conv('conv_att_score', interactions, kernel_size=1,
                       stride=1, output_dim=1)
        raw_att = tf.reshape(raw_att, to_T([B, H * W]))  # (N, H*W)
        # mask out invalid (padded) image positions before the softmax
        valid_mask = tf.reshape(image_valid_batch, tf.shape(raw_att))
        raw_att = raw_att * valid_mask - 1e18 * (1 - valid_mask)
        att = tf.nn.softmax(raw_att, axis=-1)  # (N, H*W)

        # collect attended image feature
        x_att = tf.matmul(
            tf.reshape(att, to_T([B, 1, H * W])),
            tf.reshape(x_loc, to_T([B, H * W, dim])))  # (N, 1, D_kb)
        x_att = tf.reshape(x_att, to_T([B, dim]))  # (N, D_kb)

        # VQA classification
        eQ = fc('fc_q_map2', q_encoding, output_dim=dim)
        if cfg.OUT_QUESTION_MUL:
            features = tf.concat([x_att, eQ, x_att * eQ], axis=-1)
        else:
            features = tf.concat([x_att, eQ], axis=-1)
        fc1 = fc_relu('fc_hidden', features, output_dim=cfg.OUT_CLASSIFIER_DIM)
        logits = fc('fc_scores', fc1, output_dim=num_choices)
        self.logits = logits
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, deeplab_dropout,
                          mlp_dropout, is_training):
    # Language feature (LSTM hidden state)
    feat_lang = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)[0]

    # deeplab101
    net = deeplab101.DeepLabResNetModel({'data': imcrop_batch},
                                        is_training=is_training)
    feat_vis = net.layers['fc1_voc12']
    # # Local image feature
    # feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
    #                                          apply_dropout=deeplab_dropout)

    # Reshape and tile LSTM top
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]
    N, D_text = feat_lang.get_shape().as_list()
    feat_lang = tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                        [1, featmap_H, featmap_W, 1])

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))
    feat_all = tf.concat(axis=3, values=[
        tf.nn.l2_normalize(feat_lang, 3),
        tf.nn.l2_normalize(feat_vis, 3),
        spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        mlp_l1 = conv_relu('mlp_l1', feat_all, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            mlp_l1 = drop(mlp_l1, 0.5)
        mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l2
def conv_net(input_batch, name):
    with tf.variable_scope(name):
        # conv1: 2*2@4/2
        conv1 = conv_relu('conv1', input_batch, kernel_size=2, stride=2,
                          output_dim=4)
        print("conv1: ", conv1)
        # conv2: 2*2@4/1
        conv2 = conv_relu('conv2', conv1, kernel_size=2, stride=1,
                          output_dim=4)
        print("conv2: ", conv2)
        # conv3: 2*2@8/2
        conv3 = conv_relu('conv3', conv2, kernel_size=2, stride=2,
                          output_dim=8)
        print("conv3: ", conv3)
        # conv4: 2*2@8/1
        conv4 = conv_relu('conv4', conv3, kernel_size=2, stride=1,
                          output_dim=8)
        print("conv4: ", conv4)
        # conv5: 2*2@8/2
        conv5 = conv_relu('conv5', conv4, kernel_size=2, stride=2,
                          output_dim=8)
        print("conv5: ", conv5)
        # conv6: 2*2@8/1 tanh
        conv6 = conv('conv6', conv5, kernel_size=2, stride=1, output_dim=8)
        print("conv6: ", conv6)
        tanh = tf.nn.tanh(conv6)
    return tanh
def _conv(name, bottom, kernel_size, stride, output_dim, padding='SAME',
          bias_term=True, weights_initializer=None, biases_initializer=None,
          reuse=None):
    # run the conv op with a gradient override that handles empty batches
    g = tf.get_default_graph()
    with g.gradient_override_map({'Conv2D': 'Conv2D_handle_empty_batch'}):
        return conv(name, bottom, kernel_size, stride, output_dim, padding,
                    bias_term, weights_initializer, biases_initializer,
                    reuse=reuse)
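# Sketch (assumption): the shared conv helper that all of these snippets call
# is not defined here. A minimal version matching the signature spelled out
# in _conv above; the actual helper and its default initializers may differ.
# The variable names 'weights'/'biases' also match the L2-regularization
# filter used in the neural_module_network constructor below.
def conv_sketch(name, bottom, kernel_size, stride, output_dim, padding='SAME',
                bias_term=True, weights_initializer=None,
                biases_initializer=None, reuse=None):
    input_dim = bottom.get_shape().as_list()[-1]
    with tf.variable_scope(name, reuse=reuse):
        if weights_initializer is None:
            weights_initializer = tf.contrib.layers.xavier_initializer_conv2d()
        weights = tf.get_variable(
            'weights', [kernel_size, kernel_size, input_dim, output_dim],
            initializer=weights_initializer)
        out = tf.nn.conv2d(bottom, weights, strides=[1, stride, stride, 1],
                           padding=padding)
        if bias_term:
            if biases_initializer is None:
                biases_initializer = tf.constant_initializer(0.)
            biases = tf.get_variable('biases', [output_dim],
                                     initializer=biases_initializer)
            out = tf.nn.bias_add(out, biases)
    return out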
def recurrent_multimodal(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                         lstm_dim, mlp_hidden_dims, feature_vis_dropout,
                         mlp_dropout):
    # Language features at every time step
    _, feat_langs, embedded_seq = lstm_net.lstm_net(
        text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    # feat_vis = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
    #                                      apply_dropout=vgg_dropout)
    feat_vis = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
                                             feature_vis_dropout,
                                             output_dim=1000)
    featmap_H, featmap_W = feat_vis.get_shape().as_list()[1:3]

    # Reshape and tile feat_langs, embedded_seq
    T, N, D_text = embedded_seq.get_shape().as_list()
    feat_langs = [
        tf.tile(tf.reshape(feat_lang, [N, 1, 1, D_text]),
                [1, featmap_H, featmap_W, 1]) for feat_lang in feat_langs
    ]
    embedded_seq = [
        tf.tile(tf.reshape(_embedded_seq, (N, 1, 1, embed_dim)),
                [1, featmap_H, featmap_W, 1])
        for _embedded_seq in tf.split(embedded_seq, T, 0)
    ]

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 3 (channel dimension)
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(N, featmap_H, featmap_W))

    # concat all features at every time step
    feat_alls = []
    for i in range(T):
        feat_alls.append(
            tf.concat([tf.nn.l2_normalize(feat_langs[i], 3),
                       tf.nn.l2_normalize(feat_vis, 3),
                       spatial_batch], 3))
        # feat_alls.append(tf.concat([feat_langs[i], feat_vis, spatial_batch], 3))
    feat_all = tf.stack(feat_alls, 3)
    feat_all = tf.transpose(feat_all, [0, 3, 1, 2, 4])  # [N, T, H, W, C]
    print(feat_all.shape)

    # mlstm
    print(tf.get_variable_scope().reuse)
    mlstm_top = rnn.mlstm_layer('mlstm', feat_all, None, 500)[0]
    print(tf.get_variable_scope().reuse)

    # MLP classifier
    with tf.variable_scope('classifier'):
        mlp_l1 = conv('mlp_l1', mlstm_top, kernel_size=1, stride=1,
                      output_dim=1)
        # if mlp_dropout: mlp_l1 = drop(mlp_l1, 0.5)
        # mlp_l2 = conv('mlp_l2', mlp_l1, kernel_size=1, stride=1, output_dim=1)

    return mlp_l1
def __init__(self, image_feat_grid, text_seq_batch, seq_length_batch,
             T_decoder, num_vocab_txt, embed_dim_txt, num_vocab_nmn,
             embed_dim_nmn, lstm_dim, num_layers, assembler,
             encoder_dropout, decoder_dropout, decoder_sampling,
             num_choices, use_qpn, qpn_dropout, reduce_visfeat_dim=False,
             new_visfeat_dim=256, use_gt_layout=None, gt_layout_batch=None,
             scope='neural_module_network', reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # Part 0: Visual feature from CNN
        self.reduce_visfeat_dim = reduce_visfeat_dim
        if reduce_visfeat_dim:
            # use an extra linear 1x1 conv layer (without ReLU)
            # to reduce the feature dimension
            with tf.variable_scope('reduce_visfeat_dim'):
                image_feat_grid = conv('conv_reduce_visfeat_dim',
                                       image_feat_grid, kernel_size=1,
                                       stride=1, output_dim=new_visfeat_dim)
            print('visual feature dimension reduced to %d' % new_visfeat_dim)
        self.image_feat_grid = image_feat_grid

        # Part 1: Seq2seq RNN to generate module layout tokens
        with tf.variable_scope('layout_generation'):
            att_seq2seq = AttentionSeq2Seq(
                text_seq_batch, seq_length_batch, T_decoder, num_vocab_txt,
                embed_dim_txt, num_vocab_nmn, embed_dim_nmn, lstm_dim,
                num_layers, assembler, encoder_dropout, decoder_dropout,
                decoder_sampling, use_gt_layout, gt_layout_batch)
            self.att_seq2seq = att_seq2seq
            predicted_tokens = att_seq2seq.predicted_tokens
            token_probs = att_seq2seq.token_probs
            word_vecs = att_seq2seq.word_vecs
            neg_entropy = att_seq2seq.neg_entropy
            self.atts = att_seq2seq.atts

            self.predicted_tokens = predicted_tokens
            self.token_probs = token_probs
            self.word_vecs = word_vecs
            self.neg_entropy = neg_entropy

            # log probability of each generated sequence
            self.log_seq_prob = tf.reduce_sum(tf.log(token_probs), axis=0)

        # Part 2: Neural Module Network
        with tf.variable_scope('layout_execution'):
            modules = Modules(image_feat_grid, word_vecs, None, num_choices)
            self.modules = modules
            # Recursion of modules
            att_shape = image_feat_grid.get_shape().as_list()[1:-1] + [1]
            # Forward declaration of module recursion
            att_expr_decl = td.ForwardDeclaration(td.PyObjectType(),
                                                  td.TensorType(att_shape))
            # _Scene
            case_scene = td.Record([('time_idx', td.Scalar(dtype='int32')),
                                    ('batch_idx', td.Scalar(dtype='int32'))])
            case_scene = case_scene >> td.Function(modules.SceneModule)
            # _Find
            case_find = td.Record([('time_idx', td.Scalar(dtype='int32')),
                                   ('batch_idx', td.Scalar(dtype='int32'))])
            case_find = case_find >> td.Function(modules.FindModule)
            # _Filter
            case_filter = td.Record([('input_0', att_expr_decl()),
                                     ('time_idx', td.Scalar(dtype='int32')),
                                     ('batch_idx', td.Scalar(dtype='int32'))])
            case_filter = case_filter >> td.Function(modules.FilterModule)
            # _FindSameProperty
            case_find_same_property = td.Record(
                [('input_0', att_expr_decl()),
                 ('time_idx', td.Scalar(dtype='int32')),
                 ('batch_idx', td.Scalar(dtype='int32'))])
            case_find_same_property = case_find_same_property >> \
                td.Function(modules.FindSamePropertyModule)
            # _Transform
            case_transform = td.Record([('input_0', att_expr_decl()),
                                        ('time_idx', td.Scalar('int32')),
                                        ('batch_idx', td.Scalar('int32'))])
            case_transform = case_transform >> \
                td.Function(modules.TransformModule)
            # _And
            case_and = td.Record([('input_0', att_expr_decl()),
                                  ('input_1', att_expr_decl()),
                                  ('time_idx', td.Scalar('int32')),
                                  ('batch_idx', td.Scalar('int32'))])
            case_and = case_and >> td.Function(modules.AndModule)
            # _Or
            case_or = td.Record([('input_0', att_expr_decl()),
                                 ('input_1', att_expr_decl()),
                                 ('time_idx', td.Scalar('int32')),
                                 ('batch_idx', td.Scalar('int32'))])
            case_or = case_or >> td.Function(modules.OrModule)
            # _Exist
            case_exist = td.Record([('input_0', att_expr_decl()),
                                    ('time_idx', td.Scalar('int32')),
                                    ('batch_idx', td.Scalar('int32'))])
            case_exist = case_exist >> td.Function(modules.ExistModule)
            # _Count
            case_count = td.Record([('input_0', att_expr_decl()),
                                    ('time_idx', td.Scalar('int32')),
                                    ('batch_idx', td.Scalar('int32'))])
            case_count = case_count >> td.Function(modules.CountModule)
            # _EqualNum
            case_equal_num = td.Record([('input_0', att_expr_decl()),
                                        ('input_1', att_expr_decl()),
                                        ('time_idx', td.Scalar('int32')),
                                        ('batch_idx', td.Scalar('int32'))])
            case_equal_num = case_equal_num >> \
                td.Function(modules.EqualNumModule)
            # _MoreNum
            case_more_num = td.Record([('input_0', att_expr_decl()),
                                       ('input_1', att_expr_decl()),
                                       ('time_idx', td.Scalar('int32')),
                                       ('batch_idx', td.Scalar('int32'))])
            case_more_num = case_more_num >> \
                td.Function(modules.MoreNumModule)
            # _LessNum
            case_less_num = td.Record([('input_0', att_expr_decl()),
                                       ('input_1', att_expr_decl()),
                                       ('time_idx', td.Scalar('int32')),
                                       ('batch_idx', td.Scalar('int32'))])
            case_less_num = case_less_num >> \
                td.Function(modules.LessNumModule)
            # _SameProperty
            case_same_property = td.Record(
                [('input_0', att_expr_decl()),
                 ('input_1', att_expr_decl()),
                 ('time_idx', td.Scalar('int32')),
                 ('batch_idx', td.Scalar('int32'))])
            case_same_property = case_same_property >> \
                td.Function(modules.SamePropertyModule)
            # _Describe
            case_describe = td.Record([('input_0', att_expr_decl()),
                                       ('time_idx', td.Scalar('int32')),
                                       ('batch_idx', td.Scalar('int32'))])
            case_describe = case_describe >> \
                td.Function(modules.DescribeModule)

            recursion_cases = td.OneOf(td.GetItem('module'), {
                '_Scene': case_scene,
                '_Find': case_find,
                '_Filter': case_filter,
                '_FindSameProperty': case_find_same_property,
                '_Transform': case_transform,
                '_And': case_and,
                '_Or': case_or})
            att_expr_decl.resolve_to(recursion_cases)

            # For invalid expressions, define a dummy answer
            # so that all answers have the same form
            dummy_scores = td.Void() >> td.FromTensor(
                np.zeros(num_choices, np.float32))
            output_scores = td.OneOf(td.GetItem('module'), {
                '_Exist': case_exist,
                '_Count': case_count,
                '_EqualNum': case_equal_num,
                '_MoreNum': case_more_num,
                '_LessNum': case_less_num,
                '_SameProperty': case_same_property,
                '_Describe': case_describe,
                INVALID_EXPR: dummy_scores})

            # compile and get the output scores
            self.compiler = td.Compiler.create(output_scores)
            self.scores_nmn = self.compiler.output_tensors[0]

        # Add a question prior network if specified
        self.use_qpn = use_qpn
        self.qpn_dropout = qpn_dropout
        if use_qpn:
            self.scores_qpn = question_prior_net(
                att_seq2seq.encoder_states, num_choices, qpn_dropout)
            self.scores = self.scores_nmn + self.scores_qpn
        else:
            self.scores = self.scores_nmn

        # Regularization: Entropy + L2
        self.entropy_reg = tf.reduce_mean(neg_entropy)
        module_weights = [
            v for v in tf.trainable_variables()
            if (scope in v.op.name and v.op.name.endswith('weights'))]
        self.l2_reg = tf.add_n([tf.nn.l2_loss(v) for v in module_weights])
def model_structure(self, sen_data, enc_data, dec_data, msk_data, vis_data,
                    batch_size, is_train, dropout=None):
    def set_drop_test():
        return tf.cast(1.0, tf.float32)

    def set_drop_train():
        return tf.cast(self.dropout, tf.float32)

    dropout = tf.cond(is_train, set_drop_train, set_drop_test)
    seq_length = tf.reduce_sum(msk_data, 1)
    text_seq_batch = sen_data

    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable(
            "embedding", [self.vocab_size, self.lstm_dim], tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    # we encode the phrase based on the last step of hidden states
    outputs, states = lstm(
        'enc_lstm', embedded_seq, None, seq_length,
        output_dim=self.lstm_dim, num_layers=1, forget_bias=1.0,
        apply_dropout=True, keep_prob=dropout, concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    sen_raw = states[-1].h
    sen_raw = tf.nn.l2_normalize(sen_raw, axis=1)

    vis_raw = tf.reshape(
        vis_data, [self.batch_size * self.num_prop, self.img_feat_size])

    sen_output = tf.reshape(sen_raw, [self.batch_size, 1, 1, self.lstm_dim])
    vis_output = tf.reshape(
        vis_raw, [self.batch_size, self.num_prop, 1, self.img_feat_size])

    sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
    feat_concat = tf.concat([sen_tile, vis_output], 3)

    feat_proj_init = msr_init(
        [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
    feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size,
                     weights_initializer=feat_proj_init)
    feat_relu = tf.nn.relu(feat_proj)

    att_conv_init = msr_init([1, 1, self.hidden_size, 1])
    att_conv = conv("att_conv", feat_relu, 1, 1, 1,
                    weights_initializer=att_conv_init)

    # Generate the visual attention feature
    att_scores_t = tf.reshape(att_conv, [self.batch_size, self.num_prop])
    # att_prob = tf.nn.softmax(att_scores_t)
    att_prob = tf.nn.relu(att_scores_t)
    att_scores = tf.reshape(att_prob, [self.batch_size, self.num_prop, 1])
    vis_att_feat = tf.reduce_sum(
        tf.multiply(vis_data,
                    tf.tile(att_scores, [1, 1, self.img_feat_size])), 1)
    vis_att_featFC = fc_relu(
        "vis_enc", vis_att_feat, self.lstm_dim,
        weights_initializer=tf.random_uniform_initializer(minval=-0.002,
                                                          maxval=0.002))
    vis_att_tile = tf.reshape(vis_att_featFC,
                              [self.batch_size, 1, self.lstm_dim])

    text_enc_batch = enc_data
    # embedded_enc: batch_size x phrase_len x lstm_dim
    with tf.variable_scope('enc_embedding'), tf.device("/cpu:0"):
        embedding_enc = tf.get_variable(
            "embedding", [self.vocab_size, self.lstm_dim], tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        embedded_enc = tf.nn.embedding_lookup(embedding_enc, text_enc_batch)

    # dec_vis_embed: batch_size x phrase_len x (2*lstm_dim)
    dec_vis_embed = tf.concat([
        embedded_enc,
        tf.concat([
            vis_att_tile,
            tf.zeros((self.batch_size, self.phrase_len - 1, self.lstm_dim))
        ], 1)
    ], 2)

    # dec_outs: batch_size x phrase_len x lstm_dim
    dec_outs, _ = lstm(
        'dec_lstm', dec_vis_embed, None, seq_length,
        output_dim=self.lstm_dim, num_layers=1, forget_bias=1.0,
        apply_dropout=True, keep_prob=dropout, concat_output=True,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    dec_outs = tf.reshape(
        dec_outs, [self.batch_size * self.phrase_len, self.lstm_dim])

    # dec_logits: (batch_size*phrase_len) x vocab_size
    dec_logits = fc(
        'dec_logits', dec_outs, self.vocab_size,
        weights_initializer=tf.contrib.layers.xavier_initializer(uniform=True))

    return att_scores_t, dec_logits, vis_data
def vgg_fc8_full_conv(input_batch, name, apply_dropout, output_dim=1000):
    fc7 = vgg_fc7_full_conv(input_batch, name, apply_dropout)
    with tf.variable_scope(name):
        # layer 8 (no ReLU after fc8)
        fc8 = conv('fc8', fc7, kernel_size=1, stride=1, output_dim=output_dim)
    return fc8