def q_y_x(self, captions, lengths, n_classes):
    """Classifier q(y|x).
    Returns:
        y_logits: classifier unnormalized log probabilities
    """
    with tf.variable_scope("net"):
        with tf.device("/cpu:0"):
            embedding = tf.get_variable("dec_embeddings",
                                        [self.vocab_size, self.embed_size],
                                        dtype=tf.float32)
            vect_inputs = tf.nn.embedding_lookup(embedding, captions)
        keep_prob = tf.placeholder_with_default(1.0, (),
                                                name='classifier_drop')
        cell_0 = make_rnn_cell([self.lstm_hidden],
                               base_cell=tf.contrib.rnn.LSTMCell,
                               dropout_keep_prob=keep_prob)
        zero_state0 = cell_0.zero_state(
            batch_size=tf.shape(self.images_fv)[0],
            dtype=tf.float32)
        initial_state = zero_state0
        # _, initial_state = cell_0(self.images_fv, zero_state0)
        # captions LSTM
        outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                 inputs=vect_inputs,
                                                 sequence_length=lengths,
                                                 initial_state=initial_state,
                                                 swap_memory=True,
                                                 dtype=tf.float32)
        y_logits = tf.layers.dense(final_state[0][1], n_classes)
    return y_logits
def decoder(self, gen_mode=False):
    """
    Args:
        gen_mode: if True, the graph is built for caption generation
    Returns:
        x_logits: mapping to vocabulary, for training
        states: tuple (initial_state, final_state, sample), for generation
    """
    # encoder and decoder have different embeddings but the same image features
    with tf.variable_scope("net") as scope:
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "dec_embeddings",
                [self.params.vocab_size, self.params.embed_size],
                dtype=tf.float32)
            vect_inputs = tf.nn.embedding_lookup(embedding, self.captions)
        dec_lstm_drop = self.params.dec_lstm_drop
        if gen_mode:
            dec_lstm_drop = 1.0
        cell_0 = make_rnn_cell(
            [self.params.decoder_hidden for _ in range(
                self.params.decoder_rnn_layers)],
            base_cell=tf.contrib.rnn.LSTMCell,
            dropout_keep_prob=dec_lstm_drop)
        zero_state0 = cell_0.zero_state(
            batch_size=tf.shape(self.images_fv)[0],
            dtype=tf.float32)
        # run this cell once to get the image-conditioned initial state
        _, initial_state0 = cell_0(self.images_fv, zero_state0)
        if self.c_i is not None and self.params.use_c_v:
            _, initial_state0 = cell_0(self.c_i, initial_state0)
        initial_state = rnn_placeholders(initial_state0)
        # captions LSTM
        outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                 inputs=vect_inputs,
                                                 sequence_length=self.lengths,
                                                 initial_state=initial_state,
                                                 swap_memory=True,
                                                 dtype=tf.float32)
        # output shape [batch_size, seq_length, self.params.decoder_hidden]
        if gen_mode:
            # only interested in the last output
            outputs = outputs[:, -1, :]
        outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
        x_logits = tf.layers.dense(outputs_r,
                                   units=self.data_dict.vocab_size,
                                   name='rnn_logits')
        # for generation
        sample = None
        if gen_mode:
            if self.params.sample_gen == 'sample':
                sample = tf.multinomial(
                    x_logits / self.params.temperature, 1)[0][0]
            elif self.params.sample_gen == 'beam_search':
                sample = tf.nn.softmax(x_logits)
            else:
                sample = tf.nn.softmax(x_logits)
    return x_logits, (initial_state, final_state, sample)
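# --- illustrative sketch (not part of the original model code) -------------
# The (initial_state, final_state, sample) tuple returned in gen_mode is meant
# to be consumed by a token-by-token sampling loop that feeds the previous
# final_state back into the placeholders created by rnn_placeholders (on the
# first step the placeholders fall back to their image-conditioned defaults).
# Below is a minimal sketch of such a loop, assuming sample_gen == 'sample'.
# `captions_ph`, `lengths_ph`, `images_ph`, '<BOS>'/'<EOS>' and
# `data_dict.idx2word` are hypothetical names, not defined in this file.
import numpy as np
from tensorflow.python.util import nest  # flattens the nested LSTM state


def generate_caption_sketch(sess, sample, in_state, fin_state,
                            captions_ph, lengths_ph, images_ph,
                            image_fv, data_dict, max_len=20):
    state = None
    caption = [data_dict.word2idx['<BOS>']]  # assumed start token
    for _ in range(max_len):
        feed = {captions_ph: np.array(caption[-1]).reshape(1, 1),
                lengths_ph: [1],
                images_ph: image_fv}
        if state is not None:
            # feed the previous LSTM state into the state placeholders
            feed.update(dict(zip(nest.flatten(in_state),
                                 nest.flatten(state))))
        idx, state = sess.run([sample, fin_state], feed)
        caption.append(int(idx))  # 'sample' mode returns a single token id
        if caption[-1] == data_dict.word2idx['<EOS>']:  # assumed end token
            break
    words = caption[1:]
    if words and words[-1] == data_dict.word2idx['<EOS>']:
        words = words[:-1]
    return [data_dict.idx2word[i] for i in words]
# ---------------------------------------------------------------------------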
def q_z_xy(self, captions, labels, lengths, images=None):
    """Calculate approximate posterior q(z|x, y, f(I)).
    Returns:
        model: zhusuan model object, can be used for getting probabilities
        tm_list, tl_list: per-cluster means and log stds (unused here)
    """
    if images is not None:
        self.images_fv = images
    with zs.BayesianNet() as model:
        # encoder and decoder have different embeddings but the same image features
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "enc_embeddings",
                [self.vocab_size, self.embed_size],
                dtype=tf.float32)
            vect_inputs = tf.nn.embedding_lookup(embedding, captions)
        with tf.name_scope(name="net") as scope1:
            cell_0 = make_rnn_cell([self.lstm_hidden],
                                   base_cell=tf.contrib.rnn.LSTMCell)
            zero_state0 = cell_0.zero_state(
                batch_size=tf.shape(self.images_fv)[0],
                dtype=tf.float32)
            # run this cell once to get the image-conditioned initial state
            added_shape = self.embed_size + self.params.n_classes
            im_f = tf.layers.dense(self.images_fv, added_shape)
            _, initial_state0 = cell_0(im_f, zero_state0)
            # c = h = tf.layers.dense(self.images_fv,
            #                         self.params.decoder_hidden,
            #                         name='dec_init_map')
            # initial_state0 = (tf.nn.rnn_cell.LSTMStateTuple(c, h), )
            # concatenate word embeddings x with labels y
            y = tf.tile(tf.expand_dims(labels, 1),
                        [1, tf.shape(vect_inputs)[1], 1])
            vect_inputs = tf.concat([vect_inputs, tf.to_float(y)], 2)
            outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                     inputs=vect_inputs,
                                                     sequence_length=lengths,
                                                     initial_state=initial_state0,
                                                     swap_memory=True,
                                                     dtype=tf.float32,
                                                     scope=scope1)
        # [batch_size, 2 * lstm_hidden_size]
        # final_state = ((c, h), )
        final_state = final_state[0][1]
        lz_mean = layers.dense(inputs=final_state,
                               units=self.latent_size,
                               activation=None)
        lz_logstd = layers.dense(inputs=final_state,
                                 units=self.latent_size,
                                 activation=None)
        lz_std = tf.exp(lz_logstd)
        # define the latent variable's StochasticTensor
        # add mu_k, sigma_k for CVAE / AG-CVAE
        tm_list = []  # means
        tl_list = []  # log standard deviations
        z = zs.Normal('z',
                      mean=lz_mean,
                      std=lz_std,
                      group_ndims=1,
                      n_samples=self.z_samples)
    return model, tm_list, tl_list
def q_net(self):
    """Calculate approximate posterior q(z|x, f(I)).
    Returns:
        z: latent-code StochasticTensor
        tm_list, tl_list: per-cluster means and log stds (GMM/AG priors)
    """
    with zs.BayesianNet() as model:
        # encoder and decoder have different embeddings but the same image features
        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "enc_embeddings",
                [self.params.vocab_size, self.params.embed_size],
                dtype=tf.float32)
            vect_inputs = tf.nn.embedding_lookup(embedding, self.captions)
        with tf.name_scope(name="encoder0") as scope1:
            cell_0 = make_rnn_cell(
                [self.params.encoder_hidden
                 for _ in range(self.params.encoder_rnn_layers)],
                base_cell=tf.contrib.rnn.LSTMCell)
            zero_state0 = cell_0.zero_state(
                batch_size=tf.shape(self.images_fv)[0],
                dtype=tf.float32)
            # run this cell once to get the image-conditioned initial state
            _, initial_state0 = cell_0(self.images_fv, zero_state0)
            if self.c_i is not None and self.params.use_c_v:
                _, initial_state0 = cell_0(self.c_i, initial_state0)
            outputs, final_state = tf.nn.dynamic_rnn(
                cell_0,
                inputs=vect_inputs,
                sequence_length=self.lengths,
                initial_state=initial_state0,
                swap_memory=True,
                dtype=tf.float32,
                scope=scope1)
        # [batch_size, 2 * lstm_hidden_size]
        # final_state = ((c, h), )
        final_state = tf.concat(values=final_state[0],
                                axis=1,
                                name="encoder_hidden")
        if self.params.prior == 'Normal':
            lz_mean = layers.dense(inputs=final_state,
                                   units=self.params.latent_size,
                                   activation=None)
            lz_logstd = layers.dense(inputs=final_state,
                                     units=self.params.latent_size,
                                     activation=None)
        # define the latent variable's StochasticTensor
        # add mu_k, sigma_k for CVAE / AG-CVAE
        tm_list = []  # means
        tl_list = []  # log standard deviations
        if self.params.prior == 'GMM':
            cluster = tf.squeeze(tf.multinomial(self.c_i_ph, 1))
            indices = tf.squeeze(tf.range(tf.shape(self.c_i_ph)[0]))
            cluster = tf.stack([indices, tf.cast(cluster, tf.int32)], 1)
            for i in range(90):
                with tf.variable_scope("gmm_ll_{}".format(i)):
                    lz_mean = layers.dense(inputs=final_state,
                                           units=self.params.latent_size)
                    lz_logstd = layers.dense(inputs=final_state,
                                             units=self.params.latent_size)
                    tm_list.append(tf.expand_dims(lz_mean, 1))
                    tl_list.append(tf.expand_dims(lz_logstd, 1))
            # [batch_size, 90, z_dim]
            tm_list = tf.concat(tm_list, 1)
            tl_list = tf.concat(tl_list, 1)
            lz_mean = tf.gather_nd(tm_list, cluster)
            lz_logstd = tf.gather_nd(tl_list, cluster)
        if self.params.prior == 'AG':
            # clusters = tf.argmax(self.c_i_ph, 1)
            # [batch_size, 150]?
            # c_k * N(mu_k, sigma_k)
            for i in range(90):
                with tf.variable_scope("ag_ll_{}".format(i)):
                    lz_mean = layers.dense(inputs=final_state,
                                           units=self.params.latent_size)
                    lz_logstd = layers.dense(inputs=final_state,
                                             units=self.params.latent_size)
                    tm_list.append(tf.expand_dims(lz_mean, 1))
                    tl_list.append(tf.expand_dims(lz_logstd, 1))
            # [batch_size, 90, 150]
            # ob_vector [batch_size, 90]
            # need [batch_size, 150]
            tm_list = tf.concat(tm_list, 1)
            tl_list = tf.concat(tl_list, 1)
            c_i_exp = tf.expand_dims(self.c_i_ph, 1)
            lz_mean = tf.squeeze(tf.matmul(c_i_exp, tm_list), 1)
            lz_logstd = tf.squeeze(tf.matmul(c_i_exp, tl_list), 1)
        # debug
        # print(lz_mean)
        z = zs.Normal('z', lz_mean, lz_logstd,
                      group_event_ndims=1,
                      n_samples=self.params.gen_z_samples)
    return z, tm_list, tl_list
def px_z_fi(self, observed, gen_mode=False):
    """
    Args:
        observed: for q, parametrized by encoder, used during training
    Returns:
        model: zhusuan model object, can be used for getting probabilities
        x_logits, shpe, (initial_state, final_state, sample)
    """
    with zs.BayesianNet(observed) as model:
        z_mean = tf.zeros([tf.shape(self.images_fv)[0],
                           self.params.latent_size])
        z = zs.Normal('z',
                      mean=z_mean,
                      std=self.params.std,
                      group_event_ndims=1,
                      n_samples=self.params.gen_z_samples)
        # encoder and decoder have different embeddings but the same image features
        with tf.variable_scope("net") as scope:
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    "dec_embeddings",
                    [self.params.vocab_size, self.params.embed_size],
                    dtype=tf.float32)
                vect_inputs = tf.nn.embedding_lookup(embedding, self.captions)
            # captions dropout
            if self.params.dec_keep_rate < 1 and not gen_mode:
                vect_inputs = tf.nn.dropout(vect_inputs,
                                            self.params.dec_keep_rate)
            dec_lstm_drop = self.params.dec_lstm_drop
            if gen_mode:
                dec_lstm_drop = 1.0
            cell_0 = make_rnn_cell(
                [self.params.decoder_hidden for _ in range(
                    self.params.decoder_rnn_layers)],
                base_cell=tf.contrib.rnn.LSTMCell,
                dropout_keep_prob=dec_lstm_drop)
            zero_state0 = cell_0.zero_state(
                batch_size=tf.shape(self.images_fv)[0],
                dtype=tf.float32)
            # run this cell once to get the image-conditioned initial state
            _, initial_state0 = cell_0(self.images_fv, zero_state0)
            if self.c_i is not None and self.params.use_c_v:
                _, initial_state0 = cell_0(self.c_i, initial_state0)
            if self.params.no_encoder:
                if not gen_mode:
                    print("Not using q(z|x)")
                initial_state = rnn_placeholders(initial_state0)
            else:
                # vector z, mapped into embed_dim
                z = tf.reshape(z, [-1, self.params.latent_size
                                   * self.params.gen_z_samples])
                z_dec = layers.dense(z, self.params.embed_size, name='z_rnn')
                _, z_state = cell_0(z_dec, initial_state0)
                initial_state = rnn_placeholders(z_state)
            # captions LSTM
            # TODO: correct sequence_length implementation
            outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                     inputs=vect_inputs,
                                                     sequence_length=None,
                                                     initial_state=initial_state,
                                                     swap_memory=True,
                                                     dtype=tf.float32)
            # output shape [batch_size, seq_length, self.params.decoder_hidden]
            if gen_mode:
                # only interested in the last output
                outputs = outputs[:, -1, :]
            outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
            x_logits = tf.layers.dense(outputs_r,
                                       units=self.data_dict.vocab_size,
                                       name='rnn_logits')
            # for debugging
            shpe = (tf.shape(z), tf.shape(outputs_r), tf.shape(outputs))
            # for generation
            sample = None
            if gen_mode:
                if self.params.sample_gen == 'sample':
                    sample = tf.multinomial(
                        x_logits / self.params.temperature, 1)[0][0]
                elif self.params.sample_gen == 'beam_search':
                    sample = tf.nn.softmax(x_logits)
                else:
                    sample = tf.nn.softmax(x_logits)
    return model, x_logits, shpe, (initial_state, final_state, sample)
def px_z_fi(self, observed, gen_mode=False):
    """
    Args:
        observed: for q, parametrized by encoder, used during training
    Returns:
        model: zhusuan model object, can be used for getting probabilities
        x_logits, shpe, (initial_state, final_state, sample)
    """
    with zs.BayesianNet(observed) as model:
        if not gen_mode or self.params.prior != 'AG':
            z_mean = tf.zeros([tf.shape(self.images_fv)[0],
                               self.params.latent_size])
        elif self.params.prior == 'AG' and gen_mode:
            # choose clusters (currently doesn't support a batch of images)
            # mean of clusters for the concrete image
            c_indices = tf.where(self.c_i_ph[0] > 0)
            # [num_true, indices]
            pred = tf.equal(tf.shape(c_indices)[0], 0)

            def false():
                return tf.squeeze(tf.transpose(c_indices))

            def true():
                # cl_range = tf.range(
                #     tf.cast(tf.shape(self.cap_clusters)[0], tf.int64))
                # some classes are unused, don't condition on them:
                # 0, 66, 68, 69, 71, 12, 45, 83, 26, 29, 30
                un_clusters = {0, 66, 68, 69, 71, 12, 45, 83, 26, 29, 30}
                cl_num = [i for i in range(self.params.num_clusters + 1)
                          if i not in un_clusters]
                return tf.convert_to_tensor(cl_num, dtype=tf.int64)

            c_indices = tf.cond(pred, true, false)
            # cap_clusters: [num_clusters, num_z]
            means = tf.gather(self.cap_clusters, c_indices, axis=0)

            # if only one cluster (any better way?)
            def false():
                return means

            def true():
                return tf.expand_dims(means, 0)

            pred = tf.equal(tf.shape(means)[0],
                            tf.shape(self.cap_clusters)[1])
            means = tf.cond(pred, true, false)
            # find the mean cluster for the current picture
            z_mean = tf.reduce_mean(means, axis=0)
            z_mean = tf.reshape(z_mean, [1, self.params.latent_size])
        z = zs.Normal('z',
                      mean=z_mean,
                      std=self.params.std,
                      group_event_ndims=1,
                      n_samples=self.params.gen_z_samples)
        # encoder and decoder have different embeddings but the same image features
        with tf.variable_scope("net") as scope:
            with tf.device("/cpu:0"):
                embedding = tf.get_variable(
                    "dec_embeddings",
                    [self.params.vocab_size, self.params.embed_size],
                    dtype=tf.float32)
                vect_inputs = tf.nn.embedding_lookup(embedding, self.captions)
            # captions dropout
            if self.params.dec_keep_rate < 1 and not gen_mode:
                vect_inputs = tf.nn.dropout(vect_inputs,
                                            self.params.dec_keep_rate)
            dec_lstm_drop = self.params.dec_lstm_drop
            if gen_mode:
                dec_lstm_drop = 1.0
            cell_0 = make_rnn_cell(
                [self.params.decoder_hidden for _ in range(
                    self.params.decoder_rnn_layers)],
                base_cell=tf.contrib.rnn.LSTMCell,
                dropout_keep_prob=dec_lstm_drop)
            zero_state0 = cell_0.zero_state(
                batch_size=tf.shape(self.images_fv)[0],
                dtype=tf.float32)
            # run this cell once to get the image-conditioned initial state
            _, initial_state0 = cell_0(self.images_fv, zero_state0)
            if self.c_i is not None and self.params.use_c_v:
                _, initial_state0 = cell_0(self.c_i, initial_state0)
            if self.params.no_encoder:
                if not gen_mode:
                    print("Not using q(z|x)")
                initial_state = rnn_placeholders(initial_state0)
            else:
                # vector z, mapped into embed_dim
                z = tf.reshape(z, [-1, self.params.latent_size
                                   * self.params.gen_z_samples])
                z_dec = layers.dense(z, self.params.embed_size, name='z_rnn')
                _, z_state = cell_0(z_dec, initial_state0)
                initial_state = rnn_placeholders(z_state)
            # captions LSTM
            outputs, final_state = tf.nn.dynamic_rnn(cell_0,
                                                     inputs=vect_inputs,
                                                     sequence_length=self.lengths,
                                                     initial_state=initial_state,
                                                     swap_memory=True,
                                                     dtype=tf.float32)
            # output shape [batch_size, seq_length, self.params.decoder_hidden]
            if gen_mode:
                # only interested in the last output
                outputs = outputs[:, -1, :]
            outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
            x_logits = tf.layers.dense(outputs_r,
                                       units=self.data_dict.vocab_size,
                                       name='rnn_logits')
            # for debugging
            shpe = (tf.shape(z), tf.shape(outputs_r), tf.shape(outputs))
            # for generation
            sample = None
            if gen_mode:
                if self.params.sample_gen == 'sample':
                    sample = tf.multinomial(
                        x_logits / self.params.temperature, 1)[0][0]
                elif self.params.sample_gen == 'beam_search':
                    sample = tf.nn.softmax(x_logits)
                else:
                    sample = tf.nn.softmax(x_logits)
    return model, x_logits, shpe, (initial_state, final_state, sample)
def px_z_y(self, observed, captions=None, lengths=None, gen_mode=False,
           n_x=None):
    """
    Args:
        observed: for q, parametrized by encoder, used during training
    Returns:
        model: zhusuan model object, can be used for getting probabilities
        x_logits, (initial_state, final_state, sample)
    """
    if captions is not None and lengths is not None:
        self.captions = captions
        self.lengths = lengths
    if n_x is None:
        n_x = tf.shape(self.images_fv)[0]
    with zs.BayesianNet(observed) as model:
        z_mean = tf.zeros([n_x, self.params.latent_size])
        z = zs.Normal('z',
                      mean=z_mean,
                      std=self.params.std,
                      group_ndims=1,
                      n_samples=self.params.gen_z_samples)
        tf.summary.histogram("distributions/z", z)
        y_logits = tf.zeros([n_x, self.n_classes])
        y = zs.OnehotCategorical('y', y_logits,
                                 n_samples=self.params.gen_z_samples)
        with tf.variable_scope("net"):
            embedding = tf.get_variable(
                "dec_embeddings",
                [self.data_dict.vocab_size, self.params.embed_size],
                dtype=tf.float32)
            # word dropout: randomly replace input tokens with <UNK>
            word_drop_keep = self.params.word_dropout_keep
            if gen_mode:
                word_drop_keep = 1.0
            unk_idx = self.data_dict.word2idx['<UNK>']
            keep_mask = tf.cast(
                tf.random_uniform(tf.shape(self.captions)) < word_drop_keep,
                tf.int32)
            captions = self.captions * keep_mask + unk_idx * (1 - keep_mask)
            vect_inputs = tf.nn.embedding_lookup(embedding, captions)
            dec_lstm_drop = self.params.dec_lstm_drop
            if gen_mode:
                dec_lstm_drop = 1.0
            cell_0 = make_rnn_cell([self.params.decoder_hidden],
                                   base_cell=tf.contrib.rnn.LSTMCell,
                                   dropout_keep_prob=dec_lstm_drop)
            # zero_state0 = cell_0.zero_state(
            #     batch_size=tf.shape(self.images_fv)[0],
            #     dtype=tf.float32)
            # map image features to the initial LSTM state
            added_shape = (self.params.gen_z_samples * self.params.n_classes
                           + self.params.embed_size)
            # added_shape = self.params.embed_size
            # f_mapping = tf.layers.dense(self.images_fv, added_shape,
            #                             name='f_emb2')
            c = h = tf.layers.dense(self.images_fv,
                                    self.params.decoder_hidden,
                                    name='dec_init_map')
            initial_state0 = (tf.nn.rnn_cell.LSTMStateTuple(c, h), )
            # vector z, mapped into embed_dim
            z = tf.concat([z, tf.to_float(y)], 2)
            z = tf.reshape(z, [n_x,
                               (self.params.latent_size + self.n_classes)
                               * self.params.gen_z_samples])
            z_dec = layers.dense(z, added_shape, name='z_rnn')
            _, z_state = cell_0(z_dec, initial_state0)
            initial_state = rnn_placeholders(z_state)
            # concatenate y with the inputs
            y_re = tf.to_float(
                tf.reshape(y, [tf.shape(self.images_fv)[0],
                               self.params.gen_z_samples
                               * self.params.n_classes]))
            y = tf.tile(tf.expand_dims(y_re, 1),
                        [1, tf.shape(vect_inputs)[1], 1])
            vect_inputs = tf.concat([vect_inputs, y], 2)
            # vect_inputs = tf.Print(vect_inputs, [tf.shape(vect_inputs)],
            #                        first_n=1)
            # captions LSTM
            outputs, final_state = tf.nn.dynamic_rnn(
                cell_0,
                inputs=vect_inputs,
                sequence_length=self.lengths,
                initial_state=initial_state,
                swap_memory=True,
                dtype=tf.float32)
            # output shape [batch_size, seq_length, self.params.decoder_hidden]
            if gen_mode:
                # only interested in the last output
                outputs = outputs[:, -1, :]
            outputs_r = tf.reshape(outputs, [-1, cell_0.output_size])
            x_logits = tf.layers.dense(outputs_r,
                                       units=self.data_dict.vocab_size,
                                       name='rnn_logits')
            x_logits_r = tf.reshape(
                x_logits,
                [tf.shape(outputs)[0], tf.shape(outputs)[1], -1])
            x = zs.Categorical('x', x_logits_r, group_ndims=1)
        # for generation
        sample = None
        if gen_mode:
            if self.params.sample_gen == 'sample':
                sample = tf.multinomial(
                    x_logits / self.params.temperature, 1)[0][0]
            elif self.params.sample_gen == 'beam_search':
                sample = tf.nn.softmax(x_logits)
            else:
                sample = tf.nn.softmax(x_logits)
    return model, x_logits, (initial_state, final_state, sample)
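# --- illustrative sketch (not part of the original model code) -------------
# x_logits is returned flattened as [batch_size * seq_len, vocab_size].
# The training script presumably combines it with the zhusuan model objects
# to form an ELBO; the sketch below only shows one common way to turn such
# flattened logits into a length-masked cross-entropy reconstruction term.
# `labels` (target token ids) and `lengths` are assumed names.
def reconstruction_loss_sketch(x_logits, labels, lengths):
    batch_size = tf.shape(labels)[0]
    max_len = tf.shape(labels)[1]
    # restore the [batch_size, seq_len, vocab_size] layout
    logits = tf.reshape(x_logits, [batch_size, max_len, -1])
    ce = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits)
    # zero out the padded positions beyond each caption's true length
    mask = tf.sequence_mask(lengths, maxlen=max_len, dtype=tf.float32)
    # sum over time, average over the batch
    return tf.reduce_mean(tf.reduce_sum(ce * mask, axis=1))
# ---------------------------------------------------------------------------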