def lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim):
    """Embed a batch of word-id sequences and run it through a 1-layer LSTM.

    Args:
        text_seq_batch: word indices used for the embedding lookup. The
            original notes describe it as time-major with shape [T, N].
        num_vocab: number of rows of the embedding matrix.
        embed_dim: number of columns of the embedding matrix.
        lstm_dim: forwarded to the `lstm` helper as `output_dim`.

    Returns:
        The last element (`[-1]`) of what the `lstm` helper returns.
        NOTE(review): presumably the output at the final timestep -- confirm
        against the helper's return contract.
    """
    # One embedding vector per row of the matrix. The variable and the lookup
    # are pinned to the CPU (the original notes say this is currently needed).
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        word_vectors = tf.get_variable("embedding", [num_vocab, embed_dim])
        # Per the original notes: ids are [T, N], embeddings are [T, N, D].
        embedded_words = tf.nn.embedding_lookup(word_vectors, text_seq_batch)

    # Single-layer LSTM, built with dropout disabled.
    lstm_result = lstm(
        'lstm_lang',
        embedded_words,
        None,
        output_dim=lstm_dim,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=False,
        concat_output=False,
    )
    return lstm_result[-1]
def lstm_encoder(text_seq_batch, name, num_vocab, embed_dim, lstm_dim,
                 apply_dropout, reuse=None):
    """Embed and LSTM-encode word ids inside a named, reusable scope.

    Args:
        text_seq_batch: word indices used for the embedding lookup. The
            original notes describe it as time-major with shape [T, N].
        name: variable scope under which the variables are created.
        num_vocab: number of rows of the embedding matrix.
        embed_dim: number of columns of the embedding matrix.
        lstm_dim: forwarded to the `lstm` helper as `output_dim`.
        apply_dropout: forwarded unchanged to the `lstm` helper.
        reuse: forwarded to `tf.variable_scope`.

    Returns:
        The last element (`[-1]`) of what the `lstm` helper returns; the
        original notes call this the output at the final LSTM timestep.
    """
    with tf.variable_scope(name, reuse=reuse):
        word_vectors = tf.get_variable("embedding_mat", [num_vocab, embed_dim])
        # Per the original notes: ids are [T, N], embeddings are [T, N, D].
        embedded_words = tf.nn.embedding_lookup(word_vectors, text_seq_batch)

        # The LSTM is built inside the same scope so that `reuse` also
        # applies to its variables.
        lstm_result = lstm(
            "lstm_lang",
            embedded_words,
            None,
            output_dim=lstm_dim,
            num_layers=1,
            forget_bias=1.0,
            apply_dropout=apply_dropout,
            concat_output=False,
        )
        final_output = lstm_result[-1]
    return final_output
def lstm_net_glove(text_seq_batch, embedding, lstm_dim):
    """Encode word ids with a 1-layer LSTM over a fixed, pre-built embedding.

    Args:
        text_seq_batch: word indices used for the embedding lookup. The
            original notes describe it as time-major with shape [T, N].
        embedding: initial value of the embedding variable; the variable is
            created with `trainable=False`.
            NOTE(review): presumably pretrained GloVe vectors, judging by the
            function name -- confirm with the caller.
        lstm_dim: forwarded to the `lstm` helper as `output_dim`.

    Returns:
        The last element (`[-1]`) of what the `lstm` helper returns.
        NOTE(review): presumably the output at the final timestep -- confirm
        against the helper's return contract.
    """
    # The embedding variable takes its value (and shape) from `embedding`
    # and is kept on the CPU.
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        frozen_vectors = tf.get_variable(
            "embedding", initializer=embedding, trainable=False)
        # Per the original notes: ids are [T, N], embeddings are [T, N, D].
        embedded_words = tf.nn.embedding_lookup(frozen_vectors, text_seq_batch)

    # Single-layer LSTM, built with dropout disabled.
    lstm_result = lstm(
        'lstm_lang',
        embedded_words,
        None,
        output_dim=lstm_dim,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=False,
        concat_output=False,
    )
    return lstm_result[-1]
def model_structure(self, sen_data, vis_data, batch_size, is_train, dropout=None):
    """Build the graph that scores each visual proposal against a phrase.

    The phrase is embedded and encoded by a 1-layer LSTM, both modalities
    are passed through `bn`, the sentence feature is tiled across the
    proposals and concatenated with the visual features, and two `conv`
    layers reduce the result to one score per proposal.

    Args:
        sen_data: word indices; transposed with perm [1, 0] before the
            lookup so that the LSTM input is [num_steps, batch_size].
        vis_data: visual features; reshaped to
            [self.batch_size * self.num_prop, self.img_feat_size].
        batch_size: accepted for interface compatibility only.
            NOTE(review): the body uses `self.batch_size` everywhere and
            never reads this argument.
        is_train: forwarded to both `bn` calls.
        dropout: falls back to `self.dropout` when None.
            NOTE(review): the resolved value is not consumed below; the LSTM
            is built with `apply_dropout=False`.

    Returns:
        Tensor reshaped to [self.batch_size, self.num_prop] holding the
        output of the final `conv` layer (one score per proposal).
    """
    # Identity test instead of `== None`: equality would be routed through
    # the argument's `__eq__`, which tensor-like objects may overload.
    if dropout is None:
        dropout = self.dropout

    text_seq_batch = tf.transpose(sen_data, [1, 0])  # input data is [num_steps, batch_size]

    # Embedding variable and lookup are pinned to the CPU.
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        embedding_mat = tf.get_variable(
            "embedding", [self.vocab_size, self.lstm_dim], tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        # text_seq has shape [T, N] and embedded_seq has shape [T, N, D].
        embedded_seq = tf.nn.embedding_lookup(embedding_mat, text_seq_batch)

    # we encode phrase based on the last step of hidden states
    _, states = lstm(
        'lstm_lang', embedded_seq, None,
        output_dim=self.lstm_dim, num_layers=1, forget_bias=1.0,
        apply_dropout=False, concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    # batch normalization for visual and language part
    sen_raw = states[-1].h
    vis_raw = tf.reshape(
        vis_data, [self.batch_size * self.num_prop, self.img_feat_size])
    # NOTE(review): 0.9 is presumably the moving-average decay of `bn` --
    # confirm against the helper.
    sen_bn = bn(sen_raw, is_train, "SEN_BN", 0.9)
    vis_bn = bn(vis_raw, is_train, "VIS_BN", 0.9)

    # Lay both modalities out as [batch, proposal, 1, channels] so they can
    # be concatenated along the last axis.
    sen_output = tf.reshape(sen_bn, [self.batch_size, 1, 1, self.lstm_dim])
    vis_output = tf.reshape(
        vis_bn, [self.batch_size, self.num_prop, 1, self.img_feat_size])

    # Repeat the sentence feature once per proposal.
    sen_tile = tf.tile(sen_output, [1, self.num_prop, 1, 1])
    feat_concat = tf.concat([sen_tile, vis_output], 3)

    # Project the fused feature to `hidden_size` channels, then to a single
    # channel. NOTE(review): the two `1, 1` arguments of `conv` are
    # presumably kernel size and stride -- confirm against the helper.
    feat_proj_init = msr_init(
        [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
    feat_proj = conv("feat_proj", feat_concat, 1, 1, self.hidden_size,
                     weights_initializer=feat_proj_init)
    feat_relu = tf.nn.relu(feat_proj)

    att_conv_init = msr_init([1, 1, self.hidden_size, 1])
    att_conv = conv("att_conv", feat_relu, 1, 1, 1,
                    weights_initializer=att_conv_init)
    att_scores = tf.reshape(att_conv, [self.batch_size, self.num_prop])

    return att_scores
def model_structure(self, sen_data, enc_data, dec_data, msk_data, vis_data,
                    batch_size, is_train, dropout=None):
    """Build the attention-scoring graph plus a phrase-reconstruction decoder.

    The phrase is encoded with an LSTM and scored against every visual
    proposal; the rectified scores weight the proposals into one attended
    visual feature, which is fed (at the first step only) into a decoder
    LSTM whose outputs are projected to vocabulary logits.

    Args:
        sen_data: word indices for the encoder embedding lookup (used as-is).
        enc_data: word indices for the decoder-side embedding lookup.
        dec_data: NOTE(review): never read in this function.
        msk_data: mask summed along axis 1 to obtain the sequence lengths
            handed to both LSTMs.
        vis_data: visual features; NOTE(review): presumably
            [batch_size, num_prop, img_feat_size], judging by the reshape and
            the element-wise product below -- confirm with the caller.
        batch_size: NOTE(review): never read; `self.batch_size` is used.
        is_train: boolean predicate for `tf.cond` selecting the keep
            probability.
        dropout: NOTE(review): the incoming value is ignored; the keep
            probability is always derived from `self.dropout` and `is_train`.

    Returns:
        Tuple `(att_scores_t, dec_logits, vis_data)`: the un-rectified
        attention scores reshaped to [batch_size, num_prop], the output of
        the `dec_logits` layer applied to the decoder outputs reshaped to
        [batch_size * phrase_len, lstm_dim], and `vis_data` unchanged.
    """
    # Keep probability: `self.dropout` when training, 1.0 otherwise.
    keep_prob = tf.cond(
        is_train,
        lambda: tf.cast(self.dropout, tf.float32),
        lambda: tf.cast(1.0, tf.float32))

    seq_length = tf.reduce_sum(msk_data, 1)

    # ---- phrase encoder --------------------------------------------------
    with tf.variable_scope('word_embedding'), tf.device("/cpu:0"):
        word_vectors = tf.get_variable(
            "embedding",
            [self.vocab_size, self.lstm_dim],
            tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        embedded_phrase = tf.nn.embedding_lookup(word_vectors, sen_data)

    # The phrase is represented by the hidden state of the last entry in
    # the returned states.
    _, enc_states = lstm(
        'enc_lstm', embedded_phrase, None, seq_length,
        output_dim=self.lstm_dim,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=True,
        keep_prob=keep_prob,
        concat_output=False,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))

    phrase_feat = tf.nn.l2_normalize(enc_states[-1].h, dim=1)

    # ---- fuse phrase and proposals ---------------------------------------
    flat_vis = tf.reshape(
        vis_data, [self.batch_size * self.num_prop, self.img_feat_size])
    phrase_4d = tf.reshape(
        phrase_feat, [self.batch_size, 1, 1, self.lstm_dim])
    vis_4d = tf.reshape(
        flat_vis,
        [self.batch_size, self.num_prop, 1, self.img_feat_size])

    # Repeat the phrase feature once per proposal, then join on channels.
    phrase_per_prop = tf.tile(phrase_4d, [1, self.num_prop, 1, 1])
    fused = tf.concat([phrase_per_prop, vis_4d], 3)

    proj_init = msr_init(
        [1, 1, self.lstm_dim + self.img_feat_size, self.hidden_size])
    projected = conv(
        "feat_proj", fused, 1, 1, self.hidden_size,
        weights_initializer=proj_init)
    activated = tf.nn.relu(projected)

    score_init = msr_init([1, 1, self.hidden_size, 1])
    raw_scores = conv(
        "att_conv", activated, 1, 1, 1, weights_initializer=score_init)

    # ---- attended visual feature -----------------------------------------
    att_scores_t = tf.reshape(raw_scores, [self.batch_size, self.num_prop])
    # The weights are the rectified scores; no softmax is applied.
    att_weights = tf.nn.relu(att_scores_t)
    att_weights_3d = tf.reshape(
        att_weights, [self.batch_size, self.num_prop, 1])

    # Weight every proposal feature and sum over the proposal axis.
    weights_per_channel = tf.tile(att_weights_3d, [1, 1, self.img_feat_size])
    weighted_vis = tf.multiply(vis_data, weights_per_channel)
    attended_vis = tf.reduce_sum(weighted_vis, 1)

    attended_fc = fc_relu(
        "vis_enc", attended_vis, self.lstm_dim,
        weights_initializer=tf.random_uniform_initializer(
            minval=-0.002, maxval=0.002))
    attended_step = tf.reshape(
        attended_fc, [self.batch_size, 1, self.lstm_dim])

    # ---- decoder ---------------------------------------------------------
    # embedded decoder input: batch_size x phrase_len x lstm_dim
    with tf.variable_scope('enc_embedding'), tf.device("/cpu:0"):
        dec_word_vectors = tf.get_variable(
            "embedding",
            [self.vocab_size, self.lstm_dim],
            tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(uniform=True))
        embedded_dec_in = tf.nn.embedding_lookup(dec_word_vectors, enc_data)

    # The attended visual feature occupies the first step only; the other
    # phrase_len - 1 steps are zero. Result: batch_size x phrase_len x
    # (2 * lstm_dim).
    zero_steps = tf.zeros(
        (self.batch_size, self.phrase_len - 1, self.lstm_dim))
    vis_steps = tf.concat([attended_step, zero_steps], 1)
    dec_input = tf.concat([embedded_dec_in, vis_steps], 2)

    # decoder outputs: batch_size x phrase_len x lstm_dim
    dec_seq, _ = lstm(
        'dec_lstm', dec_input, None, seq_length,
        output_dim=self.lstm_dim,
        num_layers=1,
        forget_bias=1.0,
        apply_dropout=True,
        keep_prob=keep_prob,
        concat_output=True,
        initializer=tf.random_uniform_initializer(minval=-0.08, maxval=0.08))
    dec_flat = tf.reshape(
        dec_seq, [self.batch_size * self.phrase_len, self.lstm_dim])

    # logits: (batch_size * phrase_len) x vocab_size
    dec_logits = fc(
        'dec_logits', dec_flat, self.vocab_size,
        weights_initializer=tf.contrib.layers.xavier_initializer(
            uniform=True))

    return att_scores_t, dec_logits, vis_data