Example #1
    def training(self, neg_elbo):
        """Sets up the training Ops.
        Creates an optimizer and applies the gradients to all trainable variables.

        Args:
            neg_elbo: the negative ELBO loss tensor, as returned by neg_elbo().

        Returns:
            train_op: The Op for training.
        """
        global_step = get_global_step_var()
        base_lr = self.config_train['lr']
        # Piecewise-constant schedule: warm up at base_lr / 10 for the first 2% of
        # iterations, run at base_lr until 60%, then decay to base_lr / 3, base_lr / 10
        # and base_lr / 33 at 60%, 75% and 95% of training, respectively.
        lr_values = [
            base_lr / 10, base_lr, base_lr / 3, base_lr / 10, base_lr / 33
        ]
        boundaries = np.array(
            [0.02, 0.6, 0.75, 0.95]) * self.config_train['num_iter']
        boundaries = [int(b) for b in boundaries]
        lr = tf.train.piecewise_constant(global_step, boundaries, lr_values)

        tf.summary.scalar('learning_rate', lr)
        optimizer = tf.train.AdamOptimizer(lr, epsilon=1e-3)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(neg_elbo, global_step=global_step)
        return train_op
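For reference, the learning-rate schedule above depends only on fractions of config_train['num_iter']. The framework-free sketch below mirrors what tf.train.piecewise_constant computes for this schedule; the base_lr and num_iter defaults are made-up values for illustration, not taken from the original configuration.

import numpy as np

def piecewise_lr(step, base_lr=1e-3, num_iter=100000):
    """Mirrors the schedule in training(): warm-up, plateau, then three decays."""
    lr_values = [base_lr / 10, base_lr, base_lr / 3, base_lr / 10, base_lr / 33]
    boundaries = (np.array([0.02, 0.6, 0.75, 0.95]) * num_iter).astype(int)
    # tf.train.piecewise_constant uses lr_values[i] on the interval
    # boundaries[i - 1] < step <= boundaries[i], and the last value afterwards.
    idx = int(np.searchsorted(boundaries, step, side='left'))
    return lr_values[idx]

# e.g. 1e-4 during warm-up (first 2000 steps), 1e-3 until step 60000, and so on.
for step in (0, 1000, 2000, 50000, 70000, 90000, 99000):
    print(step, piecewise_lr(step))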
Example #2
    def kl_coeff_annealing(self, is_training):
        """ defines the coefficient used for annealing the KL term. It return 1 for the test graph but, a value
        between 0 and 1 for the training graph.
        
        Args:
            is_training: a boolean flag indicating whether the network is part of train or test graph. 

        Returns:
            kl_coeff: a scalar (non-trainable) tensor containing the KL coefficient.
        """
        global_step = get_global_step_var()
        if is_training:
            if self.is_struct_pred:
                # anneal the entropy coefficient linearly toward entropy_lower_bound
                # over roughly the first 50% of iterations.
                max_epochs = 0.5 * self.config_train['num_iter']
                kl_coeff = tf.maximum(
                    1. - tf.to_float(global_step) / max_epochs,
                    self.entropy_lower_bound)
            else:
                # anneal the KL coefficient linearly over the first 30% of iterations.
                max_epochs = 0.3 * self.config_train['num_iter']
                kl_coeff = tf.minimum(
                    tf.to_float(global_step) / max_epochs, 1.)

            tf.summary.scalar('kl_coeff', kl_coeff)
        else:
            kl_coeff = 1.

        return kl_coeff
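To make the two annealing modes concrete, here is a plain-Python sketch of the coefficient that kl_coeff_annealing produces on the training graph. The num_iter and entropy_lower_bound defaults are illustrative placeholders, not values from the original configuration.

def kl_coeff_at(step, num_iter=100000, is_struct_pred=False, entropy_lower_bound=0.0):
    """Plain-Python mirror of kl_coeff_annealing() for the training graph."""
    if is_struct_pred:
        # decay linearly from 1 toward entropy_lower_bound over the first half of training.
        max_steps = 0.5 * num_iter
        return max(1.0 - step / max_steps, entropy_lower_bound)
    # warm the KL term up linearly from 0 to 1 over the first 30% of training.
    max_steps = 0.3 * num_iter
    return min(step / max_steps, 1.0)

assert kl_coeff_at(0) == 0.0
assert kl_coeff_at(15000) == 0.5
assert kl_coeff_at(30000) == 1.0
assert kl_coeff_at(0, is_struct_pred=True) == 1.0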
Example #3
    def training(self, neg_elbo, wd_loss):
        """Sets up the training Ops.
        Creates an optimizer and applies the gradients to all trainable variables.

        Args:
            neg_elbo: the negative ELBO loss tensor from neg_elbo(), or a tuple
                (iw_loss_p, iw_loss_q) of importance-weighted losses when use_iw is
                enabled with k > 1.
            wd_loss: weight decay loss.

        Returns:
            train_op: The Op for training.
        """
        global_step = get_global_step_var()
        base_lr = self.config_train['lr']
        # Piecewise-constant schedule: warm up at base_lr / 10 for the first 2% of
        # iterations, run at base_lr until 60%, then decay to base_lr / 3, base_lr / 10
        # and base_lr / 33 at 60%, 75% and 95% of training, respectively.
        lr_values = [
            base_lr / 10, base_lr, base_lr / 3, base_lr / 10, base_lr / 33
        ]
        boundaries = np.array(
            [0.02, 0.6, 0.75, 0.95]) * self.config_train['num_iter']
        boundaries = [int(b) for b in boundaries]
        lr = tf.train.piecewise_constant(global_step, boundaries, lr_values)

        tf.summary.scalar('learning_rate', lr)
        optimizer = tf.train.AdamOptimizer(lr, epsilon=1e-3)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            if self.config_train['use_iw'] and self.config_train['k'] > 1:
                iw_loss_p, iw_loss_q = neg_elbo
                grads_vars_q = optimizer.compute_gradients(
                    iw_loss_q + wd_loss,
                    var_list=tf.get_collection('q_collections'))
                grads_vars_p = optimizer.compute_gradients(
                    iw_loss_p + wd_loss,
                    var_list=tf.get_collection('p_collections'))
                grads_vars = grads_vars_p + grads_vars_q
                train_op = optimizer.apply_gradients(grads_vars,
                                                     global_step=global_step)
            else:
                loss = neg_elbo + wd_loss
                train_op = optimizer.minimize(loss, global_step=global_step)

        return train_op
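Example #3 assumes the encoder and decoder variables were registered beforehand in custom graph collections named 'q_collections' and 'p_collections'. A minimal sketch of how that registration might look is given below; the 'encoder'/'decoder' variable scopes and shapes are hypothetical and not part of the original code.

import tensorflow as tf

# Hypothetical model variables; only the collection bookkeeping matters here.
with tf.variable_scope('encoder'):
    q_w = tf.get_variable('w', shape=[784, 200])
with tf.variable_scope('decoder'):
    p_w = tf.get_variable('w', shape=[200, 784])

# Register encoder (q) and decoder (p) variables in the collections that
# optimizer.compute_gradients() pulls from in training().
for var in tf.trainable_variables():
    if var.name.startswith('encoder/'):
        tf.add_to_collection('q_collections', var)
    elif var.name.startswith('decoder/'):
        tf.add_to_collection('p_collections', var)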
Example #4
def run_training(vae, cont_train, config_train, log_dir):
    """ The main function that will derive training of a vae.
    Args:
        vae: is an object from the class VAE. 
        cont_train: a boolean flag indicating whether train should continue from the checkpoint stored in the log_dir.
        config_train: a dictionary containing config. training (hyperparameters).
        log_dir: path to a directory that will used for storing both tensorboard files and checkpoints.

    Returns:
        test_neg_ll_value: the test negative log-likelihood of the best saved model.
    """
    use_iw = config_train['use_iw']
    Print('Starting training.')
    batch_size = config_train['batch_size']
    # Load the train, validation, and test splits of the dataset.
    data_dir = config_train['data_dir']
    eval_batch_size = config_train['eval_batch_size']
    data_sets = input_data.read_data_set(data_dir,
                                         dataset=config_train['dataset'])

    # placeholder for the input batch.
    input_placeholder = tf.placeholder(tf.float32, shape=(None, vae.num_input))
    # define training graph.
    if use_iw:
        Print('using IW obj. function')
        iw_loss, neg_elbo, sigmoid_output, wd_loss, _ = \
            vae.neg_elbo(input_placeholder, is_training=True, k=config_train['k'], use_iw=use_iw)
        loss = iw_loss + wd_loss
        # create scalar summary for training loss.
        tf.summary.scalar('train/neg_iw_loss', iw_loss)
        sigmoid_output = tf.slice(sigmoid_output, [0, 0], [batch_size, -1])
    else:
        Print('using VAE obj. function')
        _, neg_elbo, sigmoid_output, wd_loss, _ = \
            vae.neg_elbo(input_placeholder, is_training=True, k=config_train['k'], use_iw=use_iw)
        loss = neg_elbo + wd_loss
        # create scalar summary for training loss.
        tf.summary.scalar('train/neg_elbo', neg_elbo)

    train_op = vae.training(loss)

    # create images for reconstruction.
    image = create_reconstruction_image(input_placeholder,
                                        sigmoid_output[:batch_size],
                                        batch_size)
    tf.summary.image('recon', image, max_outputs=1)

    # define graph to generate random samples from model.
    num_samples = 100
    random_samples = vae.generate_samples(num_samples)
    tiled_samples = tile_image_tf(random_samples,
                                  n=int(np.sqrt(num_samples)),
                                  m=int(np.sqrt(num_samples)),
                                  width=28,
                                  height=28)
    tf.summary.image('generated_sample', tiled_samples, max_outputs=1)

    # merge all summary for training graph
    train_summary_op = tf.summary.merge_all()

    # define a parallel graph for evaluation. Enable parameter sharing by setting is_training to False.
    _, neg_elbo_eval, _, _, log_iw_eval = vae.neg_elbo(input_placeholder,
                                                       is_training=False)

    # the following will create summaries that will be used in the evaluation graph.
    val_neg_elbo, test_neg_elbo = tf.placeholder(
        tf.float32, shape=()), tf.placeholder(tf.float32, shape=())
    val_neg_ll, test_neg_ll = tf.placeholder(
        tf.float32, shape=()), tf.placeholder(tf.float32, shape=())
    val_summary = tf.summary.scalar('val/neg_elbo', val_neg_elbo)
    test_summary = tf.summary.scalar('test/neg_elbo', test_neg_elbo)
    val_ll_summary = tf.summary.scalar('val/neg_ll', val_neg_ll)
    test_ll_summary = tf.summary.scalar('test/neg_ll', test_neg_ll)
    eval_summary_op = tf.summary.merge(
        [val_summary, test_summary, val_ll_summary, test_ll_summary])

    # start checkpoint saver.
    saver = tf.train.Saver(max_to_keep=1)
    sess = tf.Session()

    # Run the Op to initialize the variables.
    if cont_train:
        ckpt = tf.train.get_checkpoint_state(log_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
            init_step = int(ckpt.model_checkpoint_path.split('-')[-1]) + 1
            Print('Initializing model from %s from step %d' %
                  (log_dir, init_step))
        else:
            raise ValueError('No checkpoint was found at %s' % log_dir)
    else:
        init = tf.global_variables_initializer()
        sess.run(init)
        init_step = 0

    # Instantiate a SummaryWriter to output summaries and the graph.
    # Train, validation, and test summaries are all written to the same log_dir.
    summary_writer = tf.summary.FileWriter(log_dir)

    # And then after everything is built, start the training loop.
    duration = 0.0
    best_val_neg_ll = np.finfo(float).max
    num_iter = config_train['num_iter']
    for step in xrange(init_step, num_iter):
        start_time = time.time()
        # perform one iteration of training.
        feed_dict = fill_feed_dict(data_sets.train, input_placeholder,
                                   batch_size)
        _, neg_elbo_value = sess.run([train_op, neg_elbo], feed_dict=feed_dict)
        duration += time.time() - start_time

        # Save a checkpoint and evaluate the model periodically.
        eval_iter = 20000 if num_iter > 1e5 else 10000
        if (step + 1) % eval_iter == 0 or (step + 1) == num_iter:
            # if the VAE has an RBM in its prior, we should update its log partition
            # function estimate (log Z).
            if vae.should_compute_log_z():
                vae.prior.estimate_log_z(sess)

            # validate on the validation and test set
            val_neg_elbo_value, val_neg_ll_value = evaluate(
                sess,
                neg_elbo_eval,
                log_iw_eval,
                input_placeholder,
                data_sets.validation,
                batch_size=eval_batch_size,
                k_iw=100)
            test_neg_elbo_value, test_neg_ll_value = evaluate(
                sess,
                neg_elbo_eval,
                log_iw_eval,
                input_placeholder,
                data_sets.test,
                batch_size=eval_batch_size,
                k_iw=100)
            summary_str = sess.run(eval_summary_op,
                                   feed_dict={
                                       val_neg_elbo: val_neg_elbo_value,
                                       test_neg_elbo: test_neg_elbo_value,
                                       val_neg_ll: val_neg_ll_value,
                                       test_neg_ll: test_neg_ll_value
                                   })
            summary_writer.add_summary(summary_str, step)

            Print(
                'Step %d: val neg ELBO = %.2f, test neg ELBO = %.2f, val NLL = %.2f, test NLL = %.2f'
                % (step, val_neg_elbo_value, test_neg_elbo_value,
                   val_neg_ll_value, test_neg_ll_value))
            # save model if it is better on validation set:
            if val_neg_ll_value < best_val_neg_ll:
                best_val_neg_ll = val_neg_ll_value
                saver.save(sess, log_dir + '/', global_step=step)

        # Write the summaries and print an overview fairly often.
        report_iter = 1000
        if step % report_iter == 0 and step > 500:
            # print status to stdout.
            Print('Step %d, %.3f sec per step' %
                  (step, duration / report_iter))
            duration = 0.0
            # Update the events file.
            summary_str = sess.run(train_summary_op, feed_dict=feed_dict)
            summary_writer.add_summary(summary_str, step)

        # in the last iteration, load the best model based on validation performance and evaluate it on the test set.
        if (step + 1) == num_iter:
            Print('Final evaluation using the best saved model')
            # reload the best model; this helps when the model overfits.
            ckpt = tf.train.get_checkpoint_state(log_dir)
            saver.restore(sess, ckpt.model_checkpoint_path)
            Print('Done restoring the model at step: %d' %
                  sess.run(get_global_step_var()))
            if vae.should_compute_log_z():
                vae.prior.estimate_log_z(sess)

            val_neg_elbo_value, val_neg_ll_value = evaluate(
                sess,
                neg_elbo_eval,
                log_iw_eval,
                input_placeholder,
                data_sets.validation,
                eval_batch_size,
                k_iw=100)
            test_neg_elbo_value, test_neg_ll_value = evaluate(
                sess,
                neg_elbo_eval,
                log_iw_eval,
                input_placeholder,
                data_sets.test,
                eval_batch_size,
                k_iw=config_train['k_iw'])
            summary_str = sess.run(eval_summary_op,
                                   feed_dict={
                                       val_neg_elbo: val_neg_elbo_value,
                                       test_neg_elbo: test_neg_elbo_value,
                                       val_neg_ll: val_neg_ll_value,
                                       test_neg_ll: test_neg_ll_value
                                   })
            Print(
                'Step %d: val neg ELBO = %.2f, test neg ELBO = %.2f, val NLL = %.2f, test NLL = %.2f'
                % (step, val_neg_elbo_value, test_neg_elbo_value,
                   val_neg_ll_value, test_neg_ll_value))
            summary_writer.add_summary(summary_str, step + 1)
            summary_writer.flush()

            sess.close()
            tf.reset_default_graph()
            return test_neg_ll_value
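run_training relies on a fill_feed_dict helper that is not shown in this listing. A plausible minimal implementation, assuming the data-set object exposes a next_batch() method like the classic TensorFlow MNIST DataSet, could look as follows; only the call signature is taken from the snippet above, the body is an assumption.

def fill_feed_dict(data_set, input_placeholder, batch_size):
    """Hypothetical helper: draw the next mini-batch and map it to the input placeholder."""
    images, _ = data_set.next_batch(batch_size)
    return {input_placeholder: images}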