def vgg_fc7(input_batch, name, apply_dropout, reuse=None):
    """Stack the 'fc6' and 'fc7' layers on top of the pool5 features.

    Args:
        input_batch: forwarded unchanged to ``vgg_pool5``.
        name: variable scope name; also forwarded to ``vgg_pool5``.
        apply_dropout: when truthy, each layer output goes through
            ``drop(..., 0.5)``.
        reuse: forwarded to ``vgg_pool5`` and to ``tf.variable_scope``.

    Returns:
        The 'fc7' output (after ``drop`` when ``apply_dropout`` is set).
    """
    features = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        # Layers 6 and 7 share one recipe: fc_relu(4096), then optional drop.
        for layer_name in ('fc6', 'fc7'):
            features = fc_relu(layer_name, features, output_dim=4096)
            if apply_dropout:
                features = drop(features, 0.5)
    return features
def vgg_fc7(input_batch, name, apply_dropout):
    """Apply the 'fc6' and 'fc7' layers to the output of ``vgg_pool5``.

    Args:
        input_batch: forwarded unchanged to ``vgg_pool5``.
        name: variable scope name; also forwarded to ``vgg_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each layer.

    Returns:
        The 'fc7' output (after ``drop`` when ``apply_dropout`` is set).
    """
    hidden_dim = 4096
    activations = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # layer 6
        activations = fc_relu('fc6', activations, output_dim=hidden_dim)
        if apply_dropout:
            activations = drop(activations, 0.5)

        # layer 7
        activations = fc_relu('fc7', activations, output_dim=hidden_dim)
        if apply_dropout:
            activations = drop(activations, 0.5)
    return activations
def vgg_fc7_full_conv(input_batch, name, apply_dropout, reuse=None):
    """Build 'fc6' and 'fc7' as convolutions on top of ``vgg_pool5``.

    'fc6' uses kernel_size=7 and 'fc7' uses kernel_size=1; both use stride=1
    and output_dim=4096.

    Args:
        input_batch: forwarded unchanged to ``vgg_pool5``.
        name: variable scope name; also forwarded to ``vgg_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each layer.
        reuse: forwarded to ``vgg_pool5`` and to ``tf.variable_scope``.

    Returns:
        The 'fc7' output (after ``drop`` when ``apply_dropout`` is set).
    """
    # (layer name, kernel size) for layer 6 and layer 7, in build order.
    layer_specs = (('fc6', 7), ('fc7', 1))

    features = vgg_pool5(input_batch, name, reuse)
    with tf.variable_scope(name, reuse=reuse):
        for layer_name, ksize in layer_specs:
            features = conv_relu(layer_name, features, kernel_size=ksize,
                                 stride=1, output_dim=4096)
            if apply_dropout:
                features = drop(features, 0.5)
    return features
def deeplab_fc8(input_batch, name, apply_dropout=False):
    """Build 'fc6', 'fc7' and 'fc8' on top of ``deeplab_pool5``.

    Args:
        input_batch: forwarded unchanged to ``deeplab_pool5``.
        name: variable scope name; also forwarded to ``deeplab_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows 'fc6' and
            'fc7' (never 'fc8').

    Returns:
        The output of ``fc('fc8', ..., output_dim=1000)``.
    """
    hidden = deeplab_pool5(input_batch, name)
    with tf.variable_scope(name):
        # Two fc_relu layers of width 1024, each optionally followed by drop.
        for layer_name in ('fc6', 'fc7'):
            hidden = fc_relu(layer_name, hidden, output_dim=1024)
            if apply_dropout:
                hidden = drop(hidden, 0.5)
        # Final layer uses plain ``fc`` (not ``fc_relu``).
        logits = fc('fc8', hidden, output_dim=1000)
    return logits
def vgg_fc7_full_conv(input_batch, name, apply_dropout):
    """Apply convolutional 'fc6' and 'fc7' layers to ``vgg_pool5`` output.

    Args:
        input_batch: forwarded unchanged to ``vgg_pool5``.
        name: variable scope name; also forwarded to ``vgg_pool5``.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each layer.

    Returns:
        The 'fc7' output (after ``drop`` when ``apply_dropout`` is set).
    """
    out_channels = 4096
    activations = vgg_pool5(input_batch, name)
    with tf.variable_scope(name):
        # layer 6: 7x7 kernel
        activations = conv_relu('fc6', activations, kernel_size=7, stride=1,
                                output_dim=out_channels)
        if apply_dropout:
            activations = drop(activations, 0.5)

        # layer 7: 1x1 kernel
        activations = conv_relu('fc7', activations, kernel_size=1, stride=1,
                                output_dim=out_channels)
        if apply_dropout:
            activations = drop(activations, 0.5)
    return activations
def vgg_roi_fc7_from_conv5(conv5, roi_batch, name, apply_dropout, reuse=None):
    """ROI-pool ``conv5`` and run the pooled features through 'fc6'/'fc7'.

    Args:
        conv5: first argument passed to ``roi_pool``.
        roi_batch: second argument passed to ``roi_pool``.
        name: variable scope name.
        apply_dropout: when truthy, ``drop(..., 0.5)`` follows each fc layer.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The 'fc7' output (after ``drop`` when ``apply_dropout`` is set).
    """
    with tf.variable_scope(name, reuse=reuse):
        # ROI pooling to a 7x7 grid; the second return value is not used.
        pooled, _unused = roi_pool(conv5, roi_batch,
                                   pooled_height=7, pooled_width=7,
                                   spatial_scale=1. / 16, name='roi_pool5')

        features = pooled
        for layer_name in ('fc6', 'fc7'):
            features = fc_relu(layer_name, features, output_dim=4096)
            if apply_dropout:
                features = drop(features, 0.5)
    return features
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch,
                       num_vocab, embed_dim, lstm_dim, mlp_hidden_dims,
                       deeplab_dropout, mlp_dropout):
    """Score a region from language, flattened visual and spatial features.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``.
        imcrop_batch: forwarded to ``deeplab.deeplab_fc8_full_conv``.
        spatial_batch: concatenated as-is (not L2-normalized) along axis 1.
        mlp_hidden_dims: ``output_dim`` of the 'mlp_l1' layer.
        deeplab_dropout: passed as ``apply_dropout`` to the deeplab network.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows 'mlp_l1'.

    Returns:
        The output of ``fc('mlp_l2', ..., output_dim=1)``.
    """
    # Language feature (LSTM hidden state)
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    vis_feat = deeplab.deeplab_fc8_full_conv(imcrop_batch, 'deeplab',
                                             apply_dropout=deeplab_dropout)

    # Flatten every axis after the first into a single axis.
    flat_dim = 1
    trailing_dims = vis_feat.get_shape().as_list()[1:]
    for size in trailing_dims:
        flat_dim = flat_dim * size
    vis_feat_flat = tf.reshape(vis_feat, [-1, flat_dim])

    # L2-normalize the features (except for spatial_batch) and concatenate.
    lang_normed = tf.nn.l2_normalize(lang_feat, 1)
    vis_normed = tf.nn.l2_normalize(vis_feat_flat, 1)
    joint_feat = tf.concat(axis=1,
                           values=[lang_normed, vis_normed, spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint_feat, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        scores = fc('mlp_l2', hidden, output_dim=1)

    return scores
def my_fc_layer(input_batch, name, output_dim, apply_dropout=False):
    """Apply one ``fc`` layer named 'fc' inside variable scope ``name``.

    Prints the input and the layer output as it builds them.

    Args:
        input_batch: input passed to ``fc``.
        name: variable scope name.
        output_dim: forwarded to ``fc`` as ``output_dim``.
        apply_dropout: when truthy, the result goes through
            ``drop(..., 0.5)``.

    Returns:
        The ``fc`` output, after ``drop`` when ``apply_dropout`` is set.
    """
    with tf.variable_scope(name):
        print("input_batch: ", input_batch)
        layer_out = fc('fc', input_batch, output_dim=output_dim)
        print("fc7: ", layer_out)
        if not apply_dropout:
            return layer_out
        return drop(layer_out, 0.5)
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, vgg_dropout,
                          mlp_dropout):
    """Fully convolutional scoring from language, visual and spatial features.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``.
        imcrop_batch: forwarded to ``vgg_net.vgg_fc8_full_conv``.
        mlp_hidden_dims: ``output_dim`` of the 'mlp_l1' 1x1 convolution.
        vgg_dropout: passed as ``apply_dropout`` to the VGG network.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows 'mlp_l1'.

    Returns:
        The output of the 'mlp_l2' 1x1 convolution (``output_dim=1``).
    """
    # Language feature (LSTM hidden state)
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    vis_feat = vgg_net.vgg_fc8_full_conv(imcrop_batch, 'vgg_local',
                                         apply_dropout=vgg_dropout)

    # Reshape the language feature to [N, 1, 1, D_text] and tile it over the
    # visual feature map's height and width (static axes 1 and 2).
    map_h, map_w = vis_feat.get_shape().as_list()[1:3]
    batch_size, text_dim = lang_feat.get_shape().as_list()
    lang_feat_4d = tf.reshape(lang_feat, [batch_size, 1, 1, text_dim])
    lang_feat_tiled = tf.tile(lang_feat_4d, [1, map_h, map_w, 1])

    # L2-normalize the features (except for spatial_batch) and concatenate
    # them along axis 3 (channel dimension).
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(batch_size, map_h, map_w))
    lang_normed = tf.nn.l2_normalize(lang_feat_tiled, 3)
    vis_normed = tf.nn.l2_normalize(vis_feat, 3)
    # NOTE(review): positional (axis, values) order matches the pre-1.0
    # tf.concat signature -- confirm the TensorFlow version this targets.
    joint_feat = tf.concat(3, [lang_normed, vis_normed, spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = conv_relu('mlp_l1', joint_feat, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        scores = conv('mlp_l2', hidden, kernel_size=1, stride=1,
                      output_dim=1)

    return scores
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch,
                       num_vocab, embed_dim, lstm_dim, mlp_hidden_dims,
                       vgg_dropout, mlp_dropout):
    """Score a region from language, visual and spatial features.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``; element ``[0]`` of its result is used.
        imcrop_batch: forwarded to ``vgg_net.vgg_fc8``.
        spatial_batch: concatenated as-is (not L2-normalized) along axis 1.
        mlp_hidden_dims: ``output_dim`` of the 'mlp_l1' layer.
        vgg_dropout: passed as ``apply_dropout`` to the VGG network.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows 'mlp_l1'.

    Returns:
        The output of ``fc('mlp_l2', ..., output_dim=1)``.
    """
    # Language feature (LSTM hidden state): first element of the result.
    lstm_result = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                    lstm_dim)
    lang_feat = lstm_result[0]

    # Local image feature
    vis_feat = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local',
                               apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch) and concatenate.
    lang_normed = tf.nn.l2_normalize(lang_feat, 1)
    vis_normed = tf.nn.l2_normalize(vis_feat, 1)
    joint_feat = tf.concat(axis=1,
                           values=[lang_normed, vis_normed, spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint_feat, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        scores = fc('mlp_l2', hidden, output_dim=1)

    return scores
def attbilstm(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
              apply_dropout, reuse=None):
    """Three attention-weighted bag-of-words features from a 2-layer bi-LSTM.

    Args:
        text_seq_batch: word-index tensor of shape [T, N]; index 0 is treated
            as <pad>.
        name: outer variable scope name.
        num_vocab: number of rows of the embedding matrix.
        embed_dim: number of columns of the embedding matrix.
        lstm_dim: unit count passed to ``BasicLSTMCell``.
        apply_dropout: when truthy, the concatenated LSTM outputs go through
            ``drop(..., 0.5)``.
        reuse: forwarded to the outer ``tf.variable_scope``.

    Returns:
        Tuple ``(BoW_obj1, BoW_obj2, BoW_rel)``, each with static shape
        ``[None, embed_dim]``.

    Side effects:
        Adds the tuple of the three probability tensors to the
        "attention_probs" graph collection.
    """
    head_names = ('obj1', 'obj2', 'rel')

    with tf.variable_scope(name, reuse=reuse):
        num_steps = tf.shape(text_seq_batch)[0]
        batch_size = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable(name="embedding_mat",
                                        shape=[num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence with a two-layer bidirectional LSTM network.
        # Every sequence is given the full length T.
        seq_length = tf.ones(to_T([batch_size]), dtype=tf.int32) * num_steps
        # NOTE(review): one cell object serves both directions and both
        # layers; whether that shares weights depends on the TensorFlow
        # version -- confirm.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim,
                                                 state_is_tuple=True)
        layer1_raw, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell, cell_bw=lstm_cell, inputs=embedded_seq,
            sequence_length=seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm1")
        layer1_out = tf.concat(layer1_raw, axis=2)
        layer2_raw, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell, cell_bw=lstm_cell, inputs=layer1_out,
            sequence_length=seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_lstm2")
        layer2_out = tf.concat(layer2_raw, axis=2)
        # lstm_feat has shape [T, N, lstm_dim*4]
        lstm_feat = tf.concat([layer1_out, layer2_out], axis=2)
        if apply_dropout:
            lstm_feat = drop(lstm_feat, 0.5)

        # 2. Three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            lstm_feat_flat = tf.reshape(
                lstm_feat, to_T([num_steps * batch_size, lstm_dim * 4]))
            score_shape = to_T([num_steps, batch_size, 1])
            # One scalar score per word for each head, reshaped to [T, N, 1].
            scores = [
                tf.reshape(fc('fc_scores_' + head, lstm_feat_flat,
                              output_dim=1),
                           score_shape)
                for head in head_names
            ]

        # 2.4 Compute probability and average BoW representation.
        # Each probability tensor has shape [T, N, 1].
        # Remove the probability over <pad> (<pad> is 0), then renormalize
        # over the time axis.
        is_not_pad = tf.cast(
            tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
        masked = [tf.nn.softmax(score, dim=0) * is_not_pad
                  for score in scores]
        probs = [prob / tf.reduce_sum(prob, 0, keep_dims=True)
                 for prob in masked]

        tf.add_to_collection("attention_probs", tuple(probs))

        # Each BoW feature has shape [N, embed_dim].
        bows = [tf.reduce_sum(prob * embedded_seq, reduction_indices=0)
                for prob in probs]
        for bow in bows:
            bow.set_shape([None, embed_dim])

    return tuple(bows)
def build_text_feature(self):
    """Generate text features using a two-layer bidirectional LSTM.

    Reads ``self.config`` (``num_vocab``, ``embed_dim``, ``lstm_dim``,
    ``lstm_dropout``) and ``self.text_seqs``.

    Outputs:
        self.text_bilstm_feat: concatenation (axis 2) of both layers' outputs,
            passed through ``drop(..., 0.5)`` when ``config.lstm_dropout``.
        self.text_word_embed_feat: embedding lookup of the text sequence.
        self.word_is_not_pad: float mask, 1 where the word index is nonzero,
            with a trailing axis of size 1 added.
    """
    cfg = self.config
    num_vocab = cfg.num_vocab
    embed_dim = cfg.embed_dim
    lstm_dim = cfg.lstm_dim
    text_seq = self.text_seqs

    with tf.variable_scope('lstm'):
        seq_len = tf.shape(text_seq)[0]     # seq length
        batch_size = tf.shape(text_seq)[1]  # batch size

        # Word embedding
        embedding_mat = tf.get_variable(name="embedding_mat",
                                        shape=[num_vocab, embed_dim])
        # [L, N1, embed_dim]
        word_embed = tf.nn.embedding_lookup(embedding_mat, text_seq)

        # Encode the sentence with a two-layer bidirectional LSTM network.
        # Every sequence is given the full length L.
        seq_length = tf.ones(to_T([batch_size]), dtype=tf.int32) * seq_len

        # Layer 1: one cell object is passed for both directions.
        cell_1 = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        layer1_raw, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_1,
            cell_bw=cell_1,
            inputs=word_embed,
            sequence_length=seq_length,
            dtype=tf.float32,
            time_major=True,
            scope="bidirectional_lstm1")
        layer1_out = tf.concat(layer1_raw, axis=2)

        # Layer 2: a second cell object, fed with layer 1's outputs.
        cell_2 = tf.contrib.rnn.BasicLSTMCell(lstm_dim, state_is_tuple=True)
        layer2_raw, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=cell_2,
            cell_bw=cell_2,
            inputs=layer1_out,
            sequence_length=seq_length,
            dtype=tf.float32,
            time_major=True,
            scope="bidirectional_lstm2")
        layer2_out = tf.concat(layer2_raw, axis=2)

        bilstm_feat = tf.concat([layer1_out, layer2_out], axis=2)
        if self.config.lstm_dropout:
            bilstm_feat = drop(bilstm_feat, 0.5)

    self.text_bilstm_feat = bilstm_feat
    self.text_word_embed_feat = word_embed
    nonzero_words = tf.not_equal(text_seq, 0)
    self.word_is_not_pad = tf.cast(nonzero_words[..., tf.newaxis],
                                   tf.float32)
def forward(self, imcrop_batch, text_seq_batch, is_training=True):
    """Build the language + visual scoring graph and upsample the result.

    Reads ``self.num_vocab``, ``self.embed_dim``, ``self.lstm_dim``,
    ``self.mlp_hidden_dims``, ``self.model_name`` and the optional
    'deeplab_dropout' / 'mlp_dropout' entries of ``self.kwargs`` (both
    default to False).

    Args:
        imcrop_batch: forwarded to ``deeplab.deeplab_fc8_full_conv``.
        text_seq_batch: forwarded to ``lstm_net.lstm_net``.
        is_training: not referenced in this method body.

    Returns:
        The output of the 'upsample8s' deconvolution.
    """
    num_vocab = self.num_vocab
    embed_dim = self.embed_dim
    lstm_dim = self.lstm_dim
    mlp_hidden_dims = self.mlp_hidden_dims
    deeplab_dropout = self.kwargs.get('deeplab_dropout', False)
    mlp_dropout = self.kwargs.get('mlp_dropout', False)

    with tf.variable_scope(self.model_name):
        # Language feature (LSTM hidden state): first element of the result.
        lstm_result = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                        lstm_dim)
        lang_feat = lstm_result[0]

        # Local image feature
        vis_feat = deeplab.deeplab_fc8_full_conv(
            imcrop_batch, 'deeplab', apply_dropout=deeplab_dropout)

        # Reshape the language feature to [N, 1, 1, D_text] and tile it over
        # the visual feature map's height and width (static axes 1 and 2).
        map_h, map_w = vis_feat.get_shape().as_list()[1:3]
        batch_size, text_dim = lang_feat.get_shape().as_list()
        lang_feat_4d = tf.reshape(lang_feat, [batch_size, 1, 1, text_dim])
        lang_feat_tiled = tf.tile(lang_feat_4d, [1, map_h, map_w, 1])

        # L2-normalize the features (except for spatial_batch) and
        # concatenate them along axis 3 (channel dimension).
        spatial_batch = tf.convert_to_tensor(
            generate_spatial_batch(batch_size, map_h, map_w))
        lang_normed = tf.nn.l2_normalize(lang_feat_tiled, 3)
        vis_normed = tf.nn.l2_normalize(vis_feat, 3)
        joint_feat = tf.concat(
            axis=3, values=[lang_normed, vis_normed, spatial_batch])

        # MLP classifier over the concatenated feature
        with tf.variable_scope('classifier'):
            hidden = conv_relu('mlp_l1', joint_feat, kernel_size=1, stride=1,
                               output_dim=mlp_hidden_dims)
            if mlp_dropout:
                hidden = drop(hidden, 0.5)
            scores = conv('mlp_l2', hidden, kernel_size=1, stride=1,
                          output_dim=1)

        # NOTE(review): assumed to sit outside the 'classifier' scope but
        # inside the model scope -- confirm against existing variable names.
        upsampled = deconv('upsample8s', scores, kernel_size=16, stride=8,
                           output_dim=1, bias_term=False)

    return upsampled
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False,
                  test=False):
    """Two-layer network: fc_relu 'layer1', drop, then fc 'layer2' (4 outputs).

    Args:
        input_batch: input to 'layer1'.
        name: variable scope name.
        middle_layer_dim: ``output_dim`` of 'layer1'.
        reuse: when equal to True, the scope's variables are marked for reuse.
        test: when truthy, ``drop`` is called with 1 instead of 0.5.

    Returns:
        The output of ``fc('layer2', ..., output_dim=4)``.
    """
    with tf.variable_scope(name):
        if reuse == True:
            tf.get_variable_scope().reuse_variables()

        hidden = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        # ``drop`` is always applied; only its second argument changes.
        drop_arg = 1 if test else 0.5
        hidden = drop(hidden, drop_arg)
        outputs = fc('layer2', hidden, output_dim=4)
    return outputs
def vs_multilayer(input_batch, name, middle_layer_dim=1000, reuse=False):
    """Two-layer network: fc_relu 'layer1', drop(0.5), then fc 'layer2'.

    Prints whether the scope's variables are being reused.

    Args:
        input_batch: input to 'layer1'.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of 'layer1'.
        reuse: when equal to True, the scope's variables are marked for reuse.

    Returns:
        The output of ``fc('layer2', ..., output_dim=4)``.
    """
    with tf.variable_scope(name):
        # The ``== True`` comparison is kept so the set of values that enable
        # reuse is unchanged for existing callers.
        if reuse == True:
            # Parenthesised single-argument print emits the same text on
            # Python 2 and Python 3 (the bare statement is Python 2 only).
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        # ``drop`` is applied unconditionally in this variant.
        layer1 = drop(layer1, 0.5)
        outputs = fc('layer2', layer1, output_dim=4)
    return outputs
def question_prior_net(encoder_states, num_choices, qpn_dropout,
                       hidden_dim=500, scope='question_prior_net',
                       reuse=None):
    """Map concatenated LSTM hidden states to ``num_choices`` scores.

    Args:
        encoder_states: tuple of ``tf.contrib.rnn.LSTMStateTuple``; the ``h``
            field of each is concatenated along axis 1.
        num_choices: ``output_dim`` of the final 'fc2' layer.
        qpn_dropout: when truthy, ``drop(..., 0.5)`` is applied to the
            concatenated states and to the 'fc1' output.
        hidden_dim: ``output_dim`` of the 'fc1' layer.
        scope: variable scope name.
        reuse: forwarded to ``tf.variable_scope``.

    Returns:
        The output of ``fc('fc2', ..., output_dim=num_choices)``.

    Raises:
        AssertionError: if ``encoder_states`` is not a tuple or holds an
            element that is not an ``LSTMStateTuple``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Concatenate the LSTM states from all layers.
        assert isinstance(encoder_states, tuple)
        for state in encoder_states:
            assert isinstance(state, tf.contrib.rnn.LSTMStateTuple)
        hidden_states = [state.h for state in encoder_states]

        # h_concat has shape [N, D_lstm1 + ... + D_lstm_n]
        h_concat = tf.concat(hidden_states, axis=1)
        if qpn_dropout:
            h_concat = drop(h_concat, 0.5)

        hidden = fc_relu('fc1', h_concat, output_dim=hidden_dim)
        if qpn_dropout:
            hidden = drop(hidden, 0.5)
        logits = fc('fc2', hidden, output_dim=num_choices)
    return logits
def vs_multilayer(input_batch, name, middle_layer_dim=1000,
                  output_layer_dim=21 * 3, dropout=True, reuse=False):
    """Two-layer network: fc_relu 'layer1', optional drop, then fc 'layer2'.

    Prints whether the scope's variables are being reused.

    Args:
        input_batch: input to 'layer1'.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of 'layer1'.
        output_layer_dim: ``output_dim`` of 'layer2' (default ``21 * 3``).
        dropout: when truthy, ``drop(..., 0.5)`` follows 'layer1'.
        reuse: when equal to True, the scope's variables are marked for reuse.

    Returns:
        The output of ``fc('layer2', ..., output_dim=output_layer_dim)``.
    """
    with tf.variable_scope(name):
        # The ``== True`` comparison is kept so the set of values that enable
        # reuse is unchanged for existing callers.
        if reuse == True:
            # Parenthesised single-argument print emits the same text on
            # Python 2 and Python 3 (the bare statement is Python 2 only).
            print(name + " reuse variables")
            tf.get_variable_scope().reuse_variables()
        else:
            print(name + " doesn't reuse variables")

        layer1 = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            layer1 = drop(layer1, 0.5)
        sim_score = fc('layer2', layer1, output_dim=output_layer_dim)
    return sim_score
def text_objseg_full_conv(text_seq_batch, imcrop_batch, num_vocab, embed_dim,
                          lstm_dim, mlp_hidden_dims, deeplab_dropout,
                          mlp_dropout, is_training):
    """Fully convolutional scoring using DeepLabResNetModel visual features.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``; element ``[0]`` of its result is used.
        imcrop_batch: fed to ``deeplab101.DeepLabResNetModel`` as 'data'.
        mlp_hidden_dims: ``output_dim`` of the 'mlp_l1' 1x1 convolution.
        deeplab_dropout: not referenced in this function body.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows 'mlp_l1'.
        is_training: forwarded to ``deeplab101.DeepLabResNetModel``.

    Returns:
        The output of the 'mlp_l2' 1x1 convolution (``output_dim=1``).
    """
    # Language feature (LSTM hidden state): first element of the result.
    lstm_result = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                    lstm_dim)
    lang_feat = lstm_result[0]

    # Local image feature: the 'fc1_voc12' layer of the deeplab101 network.
    net = deeplab101.DeepLabResNetModel({'data': imcrop_batch},
                                        is_training=is_training)
    vis_feat = net.layers['fc1_voc12']

    # Reshape the language feature to [N, 1, 1, D_text] and tile it over the
    # visual feature map's height and width (static axes 1 and 2).
    map_h, map_w = vis_feat.get_shape().as_list()[1:3]
    batch_size, text_dim = lang_feat.get_shape().as_list()
    lang_feat_4d = tf.reshape(lang_feat, [batch_size, 1, 1, text_dim])
    lang_feat_tiled = tf.tile(lang_feat_4d, [1, map_h, map_w, 1])

    # L2-normalize the features (except for spatial_batch) and concatenate
    # them along axis 3 (channel dimension).
    spatial_batch = tf.convert_to_tensor(
        generate_spatial_batch(batch_size, map_h, map_w))
    lang_normed = tf.nn.l2_normalize(lang_feat_tiled, 3)
    vis_normed = tf.nn.l2_normalize(vis_feat, 3)
    joint_feat = tf.concat(
        axis=3, values=[lang_normed, vis_normed, spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = conv_relu('mlp_l1', joint_feat, kernel_size=1, stride=1,
                           output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        scores = conv('mlp_l2', hidden, kernel_size=1, stride=1,
                      output_dim=1)

    return scores
def text_objseg_region(text_seq_batch, imcrop_batch, spatial_batch,
                       num_vocab, embed_dim, lstm_dim, mlp_hidden_dims,
                       vgg_dropout, mlp_dropout):
    """Score a region from language, visual and spatial features.

    Args:
        text_seq_batch, num_vocab, embed_dim, lstm_dim: forwarded to
            ``lstm_net.lstm_net``.
        imcrop_batch: forwarded to ``vgg_net.vgg_fc8``.
        spatial_batch: concatenated as-is (not L2-normalized) along axis 1.
        mlp_hidden_dims: ``output_dim`` of the 'mlp_l1' layer.
        vgg_dropout: passed as ``apply_dropout`` to the VGG network.
        mlp_dropout: when truthy, ``drop(..., 0.5)`` follows 'mlp_l1'.

    Returns:
        The output of ``fc('mlp_l2', ..., output_dim=1)``.
    """
    # Language feature (LSTM hidden state)
    lang_feat = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                  lstm_dim)

    # Local image feature
    vis_feat = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local',
                               apply_dropout=vgg_dropout)

    # L2-normalize the features (except for spatial_batch) and concatenate.
    lang_normed = tf.nn.l2_normalize(lang_feat, 1)
    vis_normed = tf.nn.l2_normalize(vis_feat, 1)
    # NOTE(review): positional (axis, values) order matches the pre-1.0
    # tf.concat signature -- confirm the TensorFlow version this targets.
    joint_feat = tf.concat(1, [lang_normed, vis_normed, spatial_batch])

    # MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        hidden = fc_relu('mlp_l1', joint_feat, output_dim=mlp_hidden_dims)
        if mlp_dropout:
            hidden = drop(hidden, 0.5)
        scores = fc('mlp_l2', hidden, output_dim=1)

    return scores
def vs_multilayer(input_batch, name, middle_layer_dim=1000, class_num=20,
                  dropout=False, reuse=False):
    """Two-layer network: fc_relu 'layer1', optional drop, then fc 'layer2'.

    This function is inherited from CBR project
    (https://github.com/jiyanggao/CBR).

    Prints a banner line and whether the scope's variables are being reused.

    Args:
        input_batch: input to 'layer1'.
        name: variable scope name; also used as the prefix of the printed
            status message.
        middle_layer_dim: ``output_dim`` of 'layer1'.
        class_num: 'layer2' has ``(class_num + 1) * 3`` outputs.
        dropout: when truthy, ``drop(..., 0.5)`` follows 'layer1'.
        reuse: when equal to True, the scope's variables are marked for reuse.

    Returns:
        The output of ``fc('layer2', ..., output_dim=(class_num + 1) * 3)``.
    """
    print('--I am using vs_multilayer--')
    num_outputs = (class_num + 1) * 3

    with tf.variable_scope(name):
        should_reuse = (reuse == True)
        status = " reuse variables" if should_reuse \
            else " doesn't reuse variables"
        print(name + status)
        if should_reuse:
            tf.get_variable_scope().reuse_variables()

        hidden = fc_relu('layer1', input_batch, output_dim=middle_layer_dim)
        if dropout:
            hidden = drop(hidden, 0.5)
        sim_score = fc('layer2', hidden, output_dim=num_outputs)
    return sim_score
def attbilstm_simple(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
                     apply_dropout, reuse=None):
    """Three attention-weighted bag-of-words features from one bi-LSTM layer.

    Args:
        text_seq_batch: word-index tensor of shape [T, N]; index 0 is treated
            as <pad>.
        name: outer variable scope name.
        num_vocab: number of rows of the embedding matrix.
        embed_dim: number of columns of the embedding matrix.
        lstm_dim: unit count passed to ``BasicLSTMCell``.
        apply_dropout: when truthy, the concatenated LSTM outputs go through
            ``drop(..., 0.5)``.
        reuse: forwarded to the outer ``tf.variable_scope``.

    Returns:
        Tuple ``(BoW_obj1, BoW_obj2, BoW_rel)``, each with static shape
        ``[None, embed_dim]``.

    Side effects:
        Adds the tuple of the three probability tensors to the
        "attention_probs" graph collection.
    """
    head_names = ('obj1', 'obj2', 'rel')

    with tf.variable_scope(name, reuse=reuse):
        num_steps = tf.shape(text_seq_batch)[0]
        batch_size = tf.shape(text_seq_batch)[1]

        # 0. Word embedding
        embedding_mat = tf.get_variable(name="embedding_mat",
                                        shape=[num_vocab, embed_dim])
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

        # 1. Encode the sentence with a bidirectional LSTM network.
        # One cell object is passed for both directions, and every sequence
        # is given the full length T.
        lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstm_dim,
                                                 state_is_tuple=True)
        seq_length = tf.ones(to_T([batch_size]), dtype=tf.int32) * num_steps
        rnn_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
            cell_fw=lstm_cell, cell_bw=lstm_cell, inputs=embedded_seq,
            sequence_length=seq_length, dtype=tf.float32, time_major=True,
            scope="bidirectional_stm")
        lstm_feat = tf.concat(rnn_outputs, axis=2)
        if apply_dropout:
            lstm_feat = drop(lstm_feat, 0.5)

        # 2. Three attention units over the words in each sentence
        with tf.variable_scope("attention"):
            # 2.1 Map the word embedding vectors to the same dimension as
            # the LSTM feature.
            embedded_flat = tf.reshape(embedded_seq, [-1, embed_dim])
            word_embed = fc('attention_embed', embedded_flat,
                            output_dim=lstm_dim * 2)
            word_embed = tf.reshape(
                word_embed, to_T([num_steps, batch_size, lstm_dim * 2]))

            # 2.2 Elementwise multiply with the LSTM feature, then
            # L2-normalize along axis 2.
            joint = tf.nn.l2_normalize(word_embed * lstm_feat, 2)

            # 2.3 Classification for attention scores.
            joint_flat = tf.reshape(joint, [-1, lstm_dim * 2])
            # Each score tensor has shape [T, N, 1].
            score_shape = to_T([num_steps, batch_size, 1])
            scores = [
                tf.reshape(fc('fc_scores_' + head, joint_flat, output_dim=1),
                           score_shape)
                for head in head_names
            ]

        # 2.4 Compute probability and average BoW representation.
        # Each probability tensor has shape [T, N, 1].
        # Remove the probability over <pad> (<pad> is 0), then renormalize
        # over the time axis.
        is_not_pad = tf.cast(
            tf.not_equal(text_seq_batch, 0)[..., tf.newaxis], tf.float32)
        masked = [tf.nn.softmax(score, dim=0) * is_not_pad
                  for score in scores]
        probs = [prob / tf.reduce_sum(prob, 0, keep_dims=True)
                 for prob in masked]

        # Each BoW feature has shape [N, embed_dim].
        bows = [tf.reduce_sum(prob * embedded_seq, reduction_indices=0)
                for prob in probs]
        for bow in bows:
            bow.set_shape([None, embed_dim])

        tf.add_to_collection("attention_probs", tuple(probs))

    return tuple(bows)