# Caption/video decoder-only variant. Assumes module-level
#   import tensorflow as tf
#   import numpy as np
# plus a VAE class and the constants cpu_device (e.g. "/cpu:0") and
# dim_video_feat defined elsewhere in this file.
def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_caption_steps,
             n_video_steps, drop_out_rate, bias_init_vector=None):
    self.dim_image = dim_image
    self.n_words = n_words
    self.dim_hidden = dim_hidden
    self.batch_size = batch_size
    self.n_caption_steps = n_caption_steps
    self.drop_out_rate = drop_out_rate
    self.n_video_steps = n_video_steps

    # word embedding table, kept on the CPU
    with tf.device(cpu_device):
        self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')

    # decoding LSTM for sentence
    self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # decoding LSTM for video
    self.lstm4 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)

    self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm3, output_keep_prob=1 - self.drop_out_rate)
    self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm4, output_keep_prob=1 - self.drop_out_rate)

    # VAE whose input width is 2 * dim_hidden (presumably the two modalities' hidden states concatenated)
    self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

    # projections between raw frame features and the hidden space
    self.encode_image_W = tf.Variable(tf.random_uniform([dim_video_feat, dim_hidden], -0.026, 0.026), name='encode_image_W')
    self.encode_image_b = tf.Variable(tf.zeros([dim_hidden]), name='encode_image_b')
    self.decode_image_W = tf.Variable(tf.random_uniform([dim_hidden, dim_image], -0.028, 0.028), name='decode_image_W')
    self.decode_image_b = tf.Variable(tf.zeros([dim_image]), name='decode_image_b')

    # output word projection and sentence embedding table
    self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')
    self.sent_emb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='sent_emb')
    if bias_init_vector is not None:
        self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
    else:
        self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')
    self.loc_matrix = tf.Variable(np.identity(n_words), dtype=tf.float32, name='loc_matrix')
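# A minimal usage sketch (not from the original file) of the frame-feature
# projections declared above; `encode_frames`/`decode_frames` are hypothetical
# helper names, and inputs are assumed pre-flattened across time steps.
def encode_frames(self, video_flat):
    # video_flat: [batch * n_video_steps, dim_video_feat] raw CNN features -> hidden space
    return tf.nn.xw_plus_b(video_flat, self.encode_image_W, self.encode_image_b)

def decode_frames(self, hidden_flat):
    # hidden_flat: [batch * n_video_steps, dim_hidden] -> reconstructed frame features
    return tf.nn.xw_plus_b(hidden_flat, self.decode_image_W, self.decode_image_b)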
# Variant with a sentence encoder and temporal soft attention. Assumes the same
# module-level imports (tensorflow as tf, numpy as np) and VAE class as above.
def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_caption_steps,
             n_video_steps, drop_out_rate, bias_init_vector=None):
    self.dim_image = dim_image
    self.n_words = n_words
    self.dim_hidden = dim_hidden
    self.batch_size = batch_size
    self.n_caption_steps = n_caption_steps
    self.drop_out_rate = drop_out_rate
    self.n_video_steps = n_video_steps

    # word embedding table, kept on the CPU
    with tf.device("/cpu:0"):
        self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')

    # encoding LSTM for sentence
    self.lstm2 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # decoding LSTM for sentence
    self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # decoding LSTM for video
    self.lstm4 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)

    self.lstm2_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm2, output_keep_prob=1 - self.drop_out_rate)
    self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm3, output_keep_prob=1 - self.drop_out_rate)
    self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm4, output_keep_prob=1 - self.drop_out_rate)

    self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

    # projections between raw frame features and the hidden space
    self.encode_image_W = tf.Variable(tf.random_uniform([dim_image, dim_hidden], -0.1, 0.1), name='encode_image_W')
    self.encode_image_b = tf.Variable(tf.zeros([dim_hidden]), name='encode_image_b')
    self.decode_image_W = tf.Variable(tf.random_uniform([dim_hidden, dim_image], -0.1, 0.1), name='decode_image_W')
    self.decode_image_b = tf.Variable(tf.zeros([dim_image]), name='decode_image_b')

    # soft-attention parameters: scores e_i = w^T tanh(Wa h + Ua v_i + ba)
    self.embed_att_w = tf.Variable(tf.random_uniform([dim_hidden, 1], -0.1, 0.1), name='embed_att_w')
    self.embed_att_Wa = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden], -0.1, 0.1), name='embed_att_Wa')
    self.embed_att_Ua = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden], -0.1, 0.1), name='embed_att_Ua')
    self.embed_att_ba = tf.Variable(tf.zeros([dim_hidden]), name='embed_att_ba')

    # output word projection
    self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')
    if bias_init_vector is not None:
        self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
    else:
        self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')

    # MLP merging three dim_hidden-wide inputs (e.g. hidden state, attention context, word embedding)
    self.embed_nn_Wp = tf.Variable(tf.random_uniform([3 * dim_hidden, dim_hidden], -0.1, 0.1), name='embed_nn_Wp')
    self.embed_nn_bp = tf.Variable(tf.zeros([dim_hidden]), name='embed_nn_bp')
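# A minimal sketch (not from the original file) of the additive attention these
# parameters imply: e_i = w^T tanh(Wa h + Ua v_i + ba), softmax over video steps.
# `attend` is a hypothetical helper name; the full model may wire this differently.
def attend(self, h_prev, video_emb):
    # h_prev: [batch, dim_hidden]; video_emb: [batch, n_video_steps, dim_hidden]
    flat = tf.reshape(video_emb, [-1, self.dim_hidden])                    # (b*n) x h
    keys = tf.reshape(tf.matmul(flat, self.embed_att_Ua),
                      [-1, self.n_video_steps, self.dim_hidden])           # b x n x h
    query = tf.expand_dims(tf.matmul(h_prev, self.embed_att_Wa), 1)        # b x 1 x h
    e = tf.tanh(keys + query + self.embed_att_ba)                          # b x n x h
    scores = tf.reshape(tf.matmul(tf.reshape(e, [-1, self.dim_hidden]),
                                  self.embed_att_w),
                        [-1, self.n_video_steps])                          # b x n
    alphas = tf.nn.softmax(scores)                                         # weights over video steps
    context = tf.reduce_sum(video_emb * tf.expand_dims(alphas, 2), 1)      # b x h
    return context, alphas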
# Variant with video and sentence encoders plus an attribute embedding. Assumes
# the same module-level imports and VAE class, and the constants dim_video_feat
# and dim_att defined elsewhere in this file.
def __init__(self, dim_image, n_words, dim_hidden, batch_size, n_caption_steps,
             n_video_steps, drop_out_rate, bias_init_vector=None):
    self.dim_image = dim_image
    self.n_words = n_words
    self.dim_hidden = dim_hidden
    self.batch_size = batch_size
    self.n_caption_steps = n_caption_steps
    self.drop_out_rate = drop_out_rate
    self.n_video_steps = n_video_steps

    # word embedding table, kept on the CPU
    with tf.device("/cpu:0"):
        self.Wemb = tf.Variable(tf.random_uniform([n_words, dim_hidden], -0.1, 0.1), name='Wemb')

    # encoding LSTM for video
    self.lstm1 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # encoding LSTM for sentence
    self.lstm2 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # decoding LSTM for sentence
    self.lstm3 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)
    # decoding LSTM for video
    self.lstm4 = tf.contrib.rnn.LSTMCell(self.dim_hidden, use_peepholes=True, state_is_tuple=True)

    self.lstm1_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm1, output_keep_prob=1 - self.drop_out_rate)
    self.lstm2_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm2, output_keep_prob=1 - self.drop_out_rate)
    self.lstm3_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm3, output_keep_prob=1 - self.drop_out_rate)
    self.lstm4_dropout = tf.contrib.rnn.DropoutWrapper(self.lstm4, output_keep_prob=1 - self.drop_out_rate)

    self.vae = VAE(self.dim_hidden * 2, self.dim_hidden)

    # projections between raw frame features and the hidden space
    self.encode_image_W = tf.Variable(tf.random_uniform([dim_video_feat, dim_hidden], -0.1, 0.1), name='encode_image_W')
    self.encode_image_b = tf.Variable(tf.zeros([dim_hidden]), name='encode_image_b')
    self.decode_image_W = tf.Variable(tf.random_uniform([dim_hidden, dim_image], -0.1, 0.1), name='decode_image_W')
    self.decode_image_b = tf.Variable(tf.zeros([dim_image]), name='decode_image_b')

    # output word projection
    self.embed_word_W = tf.Variable(tf.random_uniform([dim_hidden, n_words], -0.1, 0.1), name='embed_word_W')
    if bias_init_vector is not None:
        self.embed_word_b = tf.Variable(bias_init_vector.astype(np.float32), name='embed_word_b')
    else:
        self.embed_word_b = tf.Variable(tf.zeros([n_words]), name='embed_word_b')

    # attribute embedding
    self.embed_att_w = tf.Variable(tf.random_uniform([dim_att, dim_hidden], -0.1, 0.1), name='embed_att_w')
    self.embed_att_b = tf.Variable(tf.zeros([dim_hidden]), name='embed_att_b')

    # learnable coefficients for normalized video and sentence features
    self.video_coeff = tf.Variable(tf.ones([1]), name='video_coeff')
    self.sent_coeff = tf.Variable(tf.ones([1]), name='sent_coeff')

    # hidden-to-hidden / hidden-to-cell projections (e.g. to seed a decoder LSTM state)
    self.h2h_w = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden], -0.1, 0.1), name='h2h_w')
    self.h2h_b = tf.Variable(tf.zeros([dim_hidden]), name='h2h_b')
    self.h2c_w = tf.Variable(tf.random_uniform([dim_hidden, dim_hidden], -0.1, 0.1), name='h2c_w')
    self.h2c_b = tf.Variable(tf.zeros([dim_hidden]), name='h2c_b')
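# A minimal sketch (not from the original file) of how the h2h/h2c projections
# and the feature coefficients could be used; `init_decoder_state` and
# `scale_features` are hypothetical helper names.
def init_decoder_state(self, z):
    # z: [batch, dim_hidden], e.g. a VAE sample, mapped to an initial LSTM state
    h0 = tf.tanh(tf.nn.xw_plus_b(z, self.h2h_w, self.h2h_b))
    c0 = tf.tanh(tf.nn.xw_plus_b(z, self.h2c_w, self.h2c_b))
    return tf.contrib.rnn.LSTMStateTuple(c0, h0)

def scale_features(self, video_feat, sent_feat):
    # learnable scaling of L2-normalized cross-modal features before matching
    v = self.video_coeff * tf.nn.l2_normalize(video_feat, dim=1)
    s = self.sent_coeff * tf.nn.l2_normalize(sent_feat, dim=1)
    return v, s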