def training(self, neg_elbo):
    """Build the op that runs one optimization step on the given loss.

    The learning rate is a piecewise-constant function of the global step.
    With ``lr = self.config_train['lr']`` and ``N = self.config_train['num_iter']``
    the schedule is: lr/10 up to 2% of N, lr up to 60%, lr/3 up to 75%,
    lr/10 up to 95%, and lr/33 afterwards. The loss is minimized with Adam
    (epsilon=1e-3) over the optimizer's default variable list.

    Args:
        neg_elbo: loss tensor to minimize.

    Returns:
        train_op: the op that applies one update and increments the global step.
    """
    step_var = get_global_step_var()
    peak_lr = self.config_train['lr']

    # Learning rate used inside each interval of the schedule.
    rates = [
        peak_lr / 10,
        peak_lr,
        peak_lr / 3,
        peak_lr / 10,
        peak_lr / 33,
    ]
    # Interval edges, given as fractions of the run and converted to whole steps.
    edges = np.array([0.02, 0.6, 0.75, 0.95]) * self.config_train['num_iter']
    step_edges = [int(edge) for edge in edges]

    learning_rate = tf.train.piecewise_constant(step_var, step_edges, rates)
    tf.summary.scalar('learning_rate', learning_rate)

    adam = tf.train.AdamOptimizer(learning_rate, epsilon=1e-3)
    # Ops registered under UPDATE_OPS must run before each training step.
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        return adam.minimize(neg_elbo, global_step=step_var)
def kl_coeff_annealing(self, is_training):
    """Return the annealing coefficient applied to the KL term.

    For the test graph the coefficient is the constant 1. For the training
    graph it depends on the global step:

    * if ``self.is_struct_pred`` is set, it decays linearly from 1 over the
      first 50% of ``self.config_train['num_iter']`` iterations and is
      clipped from below at ``self.entropy_lower_bound``;
    * otherwise it grows linearly from 0 over the first 30% of the
      iterations and is clipped from above at 1.

    The training coefficient is also logged as the 'kl_coeff' scalar summary.

    Args:
        is_training: boolean flag, True when building the training graph.

    Returns:
        kl_coeff: the python float 1. for the test graph, otherwise a scalar
            tensor holding the current coefficient.
    """
    step_var = get_global_step_var()

    # The test graph always uses the full KL term.
    if not is_training:
        return 1.

    total_iters = self.config_train['num_iter']
    if self.is_struct_pred:
        # Decay the coefficient during the first half of training.
        anneal_steps = 0.5 * total_iters
        remaining = 1. - tf.to_float(step_var) / anneal_steps
        coeff = tf.maximum(remaining, self.entropy_lower_bound)
    else:
        # Ramp the coefficient up during the first 30% of training.
        anneal_steps = 0.3 * total_iters
        progress = tf.to_float(step_var) / anneal_steps
        coeff = tf.minimum(progress, 1.)

    tf.summary.scalar('kl_coeff', coeff)
    return coeff
def training(self, neg_elbo, wd_loss):
    """Build the op that runs one optimization step, including weight decay.

    The learning rate is a piecewise-constant function of the global step.
    With ``lr = self.config_train['lr']`` and ``N = self.config_train['num_iter']``
    the schedule is: lr/10 up to 2% of N, lr up to 60%, lr/3 up to 75%,
    lr/10 up to 95%, and lr/33 afterwards. Updates are made with Adam
    (epsilon=1e-3).

    When ``self.config_train['use_iw']`` is set and ``self.config_train['k'] > 1``,
    ``neg_elbo`` is unpacked as a pair ``(loss_p, loss_q)``. Gradients of
    ``loss_q + wd_loss`` are taken w.r.t. the 'q_collections' variables and
    gradients of ``loss_p + wd_loss`` w.r.t. the 'p_collections' variables,
    and both sets are applied in one step. Otherwise ``neg_elbo + wd_loss``
    is minimized directly.

    Args:
        neg_elbo: loss tensor, or a (loss_p, loss_q) pair in the
            importance-weighted case described above.
        wd_loss: weight decay loss added to every optimized objective.

    Returns:
        train_op: the op that applies one update and increments the global step.
    """
    step_var = get_global_step_var()
    peak_lr = self.config_train['lr']

    # Learning rate used inside each interval of the schedule.
    rates = [
        peak_lr / 10,
        peak_lr,
        peak_lr / 3,
        peak_lr / 10,
        peak_lr / 33,
    ]
    # Interval edges, given as fractions of the run and converted to whole steps.
    edges = np.array([0.02, 0.6, 0.75, 0.95]) * self.config_train['num_iter']
    step_edges = [int(edge) for edge in edges]

    learning_rate = tf.train.piecewise_constant(step_var, step_edges, rates)
    tf.summary.scalar('learning_rate', learning_rate)

    adam = tf.train.AdamOptimizer(learning_rate, epsilon=1e-3)
    # Ops registered under UPDATE_OPS must run before each training step.
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        split_objectives = (self.config_train['use_iw']
                            and self.config_train['k'] > 1)
        if not split_objectives:
            # Single objective: the loss plus weight decay.
            return adam.minimize(neg_elbo + wd_loss, global_step=step_var)

        # Separate objectives for the two variable collections.
        loss_p, loss_q = neg_elbo
        q_grads = adam.compute_gradients(
            loss_q + wd_loss, var_list=tf.get_collection('q_collections'))
        p_grads = adam.compute_gradients(
            loss_p + wd_loss, var_list=tf.get_collection('p_collections'))
        return adam.apply_gradients(p_grads + q_grads, global_step=step_var)
def run_training(vae, cont_train, config_train, log_dir):
    """Train a VAE, evaluate it periodically, and return the final test NLL.

    The function builds the training graph and a parameter-sharing evaluation
    graph, then runs the optimization loop. The model is evaluated on the
    validation and test sets every 20000 iterations when ``num_iter > 1e5``
    (every 10000 otherwise) and on the last iteration. A checkpoint is written
    whenever the validation negative log-likelihood improves. After the loop
    the latest checkpoint found in ``log_dir`` is restored and evaluated once
    more; that evaluation provides the return value.

    Args:
        vae: an object from the class VAE.
        cont_train: a boolean flag indicating whether training should continue
            from the checkpoint stored in log_dir.
        config_train: a dictionary containing the training configuration
            (hyperparameters). Keys read here: 'use_iw', 'batch_size',
            'data_dir', 'eval_batch_size', 'dataset', 'k', 'num_iter', 'k_iw'.
        log_dir: path to a directory used for storing both tensorboard files
            and checkpoints.

    Returns:
        test_neg_ll_value: the second value returned by ``evaluate`` on the
            test set in the final evaluation (the negative log-likelihood).

    Raises:
        IOError: if cont_train is set but no checkpoint is found in log_dir.
    """
    use_iw = config_train['use_iw']
    Print('Starting training.')
    batch_size = config_train['batch_size']
    # Get the train, val, test sets.
    data_dir = config_train['data_dir']
    eval_batch_size = config_train['eval_batch_size']
    data_sets = input_data.read_data_set(data_dir, dataset=config_train['dataset'])

    # place holder for input.
    input_placeholder = tf.placeholder(tf.float32, shape=(None, vae.num_input))

    # define training graph.
    if use_iw:
        Print('using IW obj. function')
        iw_loss, neg_elbo, sigmoid_output, wd_loss, _ = \
            vae.neg_elbo(input_placeholder, is_training=True, k=config_train['k'], use_iw=use_iw)
        loss = iw_loss + wd_loss
        # create scalar summary for training loss.
        tf.summary.scalar('train/neg_iw_loss', iw_loss)
        # keep only the first batch_size rows of the output.
        sigmoid_output = tf.slice(sigmoid_output, [0, 0], [batch_size, -1])
    else:
        Print('using VAE obj. function')
        _, neg_elbo, sigmoid_output, wd_loss, _ = \
            vae.neg_elbo(input_placeholder, is_training=True, k=config_train['k'], use_iw=use_iw)
        loss = neg_elbo + wd_loss
        # create scalar summary for training loss.
        tf.summary.scalar('train/neg_elbo', neg_elbo)

    train_op = vae.training(loss)

    # create images for reconstruction.
    image = create_reconstruction_image(input_placeholder, sigmoid_output[:batch_size], batch_size)
    tf.summary.image('recon', image, max_outputs=1)

    # define graph to generate random samples from model.
    num_samples = 100
    random_samples = vae.generate_samples(num_samples)
    tiled_samples = tile_image_tf(random_samples, n=int(np.sqrt(num_samples)),
                                  m=int(np.sqrt(num_samples)), width=28, height=28)
    tf.summary.image('generated_sample', tiled_samples, max_outputs=1)

    # merge all summary for training graph
    train_summary_op = tf.summary.merge_all()

    # define a parallel graph for evaluation. Enable parameter sharing by setting is_training to False.
    _, neg_elbo_eval, _, _, log_iw_eval = vae.neg_elbo(input_placeholder, is_training=False)

    # the following will create summaries that will be used in the evaluation graph.
    # The evaluation metrics are computed in python and fed back through these placeholders.
    val_neg_elbo, test_neg_elbo = tf.placeholder(tf.float32, shape=()), tf.placeholder(tf.float32, shape=())
    val_neg_ll, test_neg_ll = tf.placeholder(tf.float32, shape=()), tf.placeholder(tf.float32, shape=())
    val_summary = tf.summary.scalar('val/neg_elbo', val_neg_elbo)
    test_summary = tf.summary.scalar('test/neg_elbo', test_neg_elbo)
    val_ll_summary = tf.summary.scalar('val/neg_ll', val_neg_ll)
    test_ll_summary = tf.summary.scalar('test/neg_ll', test_neg_ll)
    eval_summary_op = tf.summary.merge([val_summary, test_summary, val_ll_summary, test_ll_summary])

    # start checkpoint saver.
    saver = tf.train.Saver(max_to_keep=1)
    sess = tf.Session()

    # Either restore the variables from a checkpoint or initialize them.
    if cont_train:
        ckpt = tf.train.get_checkpoint_state(log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            # checkpoints are saved as '<log_dir>/-<step>'; resume right after that step.
            init_step = int(ckpt.model_checkpoint_path.split('-')[-1]) + 1
            Print('Initializing model from %s from step %d' % (log_dir, init_step))
        else:
            # Raising a plain string is itself a TypeError; raise a real exception.
            raise IOError('No Checkpoint was found at %s' % log_dir)
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        init_step = 0

    # Instantiate a SummaryWriter to output summaries and the Graph.
    summary_writer = tf.summary.FileWriter(log_dir)

    # And then after everything is built, start the training loop.
    duration = 0.0
    best_val_neg_ll = np.finfo(float).max
    num_iter = config_train['num_iter']
    # Both periods are loop invariants.
    eval_iter = 20000 if num_iter > 1e5 else 10000
    report_iter = 1000
    for step in xrange(init_step, num_iter):
        start_time = time.time()
        # perform one iteration of training.
        feed_dict = fill_feed_dict(data_sets.train, input_placeholder, batch_size)
        _, neg_elbo_value = sess.run([train_op, neg_elbo], feed_dict=feed_dict)
        duration += time.time() - start_time

        # Save a checkpoint and evaluate the model periodically.
        if (step + 1) % eval_iter == 0 or (step + 1) == num_iter:
            # if vae has rbm in its prior we should update its log Z.
            if vae.should_compute_log_z():
                vae.prior.estimate_log_z(sess)

            # validate on the validation and test set
            val_neg_elbo_value, val_neg_ll_value = evaluate(
                sess, neg_elbo_eval, log_iw_eval, input_placeholder,
                data_sets.validation, batch_size=eval_batch_size, k_iw=100)
            test_neg_elbo_value, test_neg_ll_value = evaluate(
                sess, neg_elbo_eval, log_iw_eval, input_placeholder,
                data_sets.test, batch_size=eval_batch_size, k_iw=100)
            summary_str = sess.run(
                eval_summary_op,
                feed_dict={val_neg_elbo: val_neg_elbo_value, test_neg_elbo: test_neg_elbo_value,
                           val_neg_ll: val_neg_ll_value, test_neg_ll: test_neg_ll_value})
            summary_writer.add_summary(summary_str, step)
            Print('Step %d: val ELBO = %.2f test ELBO = %.2f, val NLL = %.2f, test NLL = %.2f' %
                  (step, val_neg_elbo_value, test_neg_elbo_value, val_neg_ll_value, test_neg_ll_value))
            # save model if it is better on validation set:
            if val_neg_ll_value < best_val_neg_ll:
                best_val_neg_ll = val_neg_ll_value
                saver.save(sess, log_dir + '/', global_step=step)

        # Write the summaries and print an overview fairly often.
        if step % report_iter == 0 and step > 500:
            # print status to stdout.
            Print('Step %d, %.3f sec per step' % (step, duration / report_iter))
            duration = 0.0
            # Update the events file.
            summary_str = sess.run(train_summary_op, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)

    # Final evaluation: load the best model based on the validation performance
    # and evaluate it on test. This runs after the loop (instead of inside its
    # last iteration) so that resuming an already finished run still produces a
    # return value instead of failing on an unbound variable.
    last_step = num_iter - 1
    Print('Final evaluation using the best saved model')
    # reload the best model this is good when a model overfits.
    ckpt = tf.train.get_checkpoint_state(log_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        Print('Done restoring the model at step: %d' % sess.run(get_global_step_var()))
    else:
        # No checkpoint was ever written (e.g. validation NLL never improved);
        # fall back to the weights currently held by the session.
        Print('No checkpoint was found at %s, evaluating the current model' % log_dir)

    if vae.should_compute_log_z():
        vae.prior.estimate_log_z(sess)

    val_neg_elbo_value, val_neg_ll_value = evaluate(
        sess, neg_elbo_eval, log_iw_eval, input_placeholder,
        data_sets.validation, eval_batch_size, k_iw=100)
    # the reported test number uses the configured number of importance samples.
    test_neg_elbo_value, test_neg_ll_value = evaluate(
        sess, neg_elbo_eval, log_iw_eval, input_placeholder,
        data_sets.test, eval_batch_size, k_iw=config_train['k_iw'])
    summary_str = sess.run(
        eval_summary_op,
        feed_dict={val_neg_elbo: val_neg_elbo_value, test_neg_elbo: test_neg_elbo_value,
                   val_neg_ll: val_neg_ll_value, test_neg_ll: test_neg_ll_value})
    Print('Step %d: val ELBO = %.2f test ELBO = %.2f, val NLL = %.2f, test NLL = %.2f' %
          (last_step, val_neg_elbo_value, test_neg_elbo_value, val_neg_ll_value, test_neg_ll_value))
    # written one step later so it does not collide with the periodic summary of the last step.
    summary_writer.add_summary(summary_str, last_step + 1)
    summary_writer.flush()

    sess.close()
    tf.reset_default_graph()
    return test_neg_ll_value