Example #1
def evaluate():
  # Read vocabulary
  vocab, rev_vocab = _load_vocabulary(FLAGS.vocab_fname)

  with tf.Graph().as_default() as g:
    # Enqueue data for evaluation.
    num_examples_per_epoch, tower_img_embedding, tower_context_length, \
        tower_caption_length, tower_context_id, tower_caption_id, \
        tower_answer_id, tower_context_mask, \
        tower_caption_mask = enqueue(True)

    tower_argmax = []
    # Calculate the gradients for each model tower.
    with tf.variable_scope(tf.get_variable_scope()) as scope:
      for i in xrange(FLAGS.num_gpus):
        with tf.device('/gpu:%d' % i):
          with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
            inputs = [
                tower_img_embedding[i],
                tower_context_length[i],
                tower_caption_length[i],
                tower_context_id[i],
                tower_caption_id[i],
                tower_answer_id[i],
                tower_context_mask[i],
                tower_caption_mask[i]
            ]
            net = CSMN(inputs, ModelConfig(FLAGS, True), is_training=False)
            argmax = net.argmax
            # Reuse variables for the next tower.
            tf.get_variable_scope().reuse_variables()

            # Keep track of the argmax predictions across all towers.
            tower_argmax.append(argmax)
    argmaxs = tf.concat(tower_argmax, 0)
    answer_ids = tf.concat(tower_answer_id, 0)
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.summary.merge_all()

    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir, g)

    # Don't evaluate the same checkpoint twice: b_g_s holds the global step of
    # the last checkpoint that was evaluated ("0" before the first pass).
    b_g_s = "0"
    while True:
      c_g_s = _eval_once(
          saver, summary_writer, argmaxs, answer_ids, vocab,
          rev_vocab, num_examples_per_epoch, b_g_s
      )
      # _eval_once returns the global step it evaluated; remember it.
      b_g_s = c_g_s
      if FLAGS.run_once:
        break
      time.sleep(FLAGS.eval_interval_secs)
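Example #1 depends on a _load_vocabulary helper that is not shown here. Below is a minimal sketch of what such a helper could look like, assuming a plain-text vocabulary file with one token per line; the file format and index assignment are assumptions, not taken from the project itself.

# Hypothetical sketch of _load_vocabulary, assuming a plain-text vocab file
# with one token per line. The real project may use a different format.
def _load_vocabulary(vocab_fname):
  with open(vocab_fname, 'r') as f:
    tokens = [line.strip() for line in f if line.strip()]
  vocab = {token: idx for idx, token in enumerate(tokens)}      # token -> id
  rev_vocab = {idx: token for idx, token in enumerate(tokens)}  # id -> token
  return vocab, rev_vocab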
Example #2
def train():
    print('training...')
    colorlog.basicConfig(
        filename=None,
        level=logging.INFO,
        format="%(log_color)s[%(levelname)s:%(asctime)s]%(reset)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S")

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
    gpu_options.allow_growth = True
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                          log_device_placement=False,
                                          gpu_options=gpu_options)) as sess:
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        num_examples_per_epoch, tower_img_embedding, tower_context_length, \
            tower_caption_length, tower_context_id, tower_caption_id, \
            tower_answer_id, tower_context_mask, \
            tower_caption_mask = enqueue(False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (num_examples_per_epoch / FLAGS.batch_size /
                                 FLAGS.num_gpus)
        decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(FLAGS.init_lr,
                                        global_step,
                                        decay_steps,
                                        LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.AdamOptimizer(lr)

        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()) as scope:
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' % (TOWER_NAME, i)) as scope:
                        # Calculate the loss for one tower of the CSMN model.
                        # This constructs the entire model but shares the
                        # variables across all towers.
                        inputs = [
                            tower_img_embedding[i],
                            tower_context_length[i],
                            tower_caption_length[i],
                            tower_context_id[i],
                            tower_caption_id[i],
                            tower_answer_id[i],
                            tower_context_mask[i],
                            tower_caption_mask[i],
                        ]
                        loss = _tower_loss(inputs, scope)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Compute the gradients for this tower's batch.
                        # Returns a list of (gradient, variable) pairs; the
                        # variable is always present, but the gradient can be None.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = _average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))
        clipped_grads_and_vars = [(tf.clip_by_norm(g, FLAGS.max_grad_norm), v)
                                  for g, v in grads]
        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(clipped_grads_and_vars,
                                                global_step=global_step)
        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        print('initializing...')
        sess.run(init)

        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            print('Restoring from checkpoint...')
            saver.restore(sess, ckpt.model_checkpoint_path)
        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([apply_gradient_op, loss])
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            # Read the current global step once per iteration so the logging,
            # summary, and checkpoint blocks below all use an up-to-date value.
            c_g_step = int(global_step.eval(session=sess))

            if (step + 1) % 10 == 0:
                num_examples_per_step = FLAGS.batch_size * FLAGS.num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / FLAGS.num_gpus

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), c_g_step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if (step + 1) % 25 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, c_g_step)

            # Save the model checkpoint periodically.
            if (step + 1) % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=c_g_step)
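Example #2 calls an _average_gradients helper that is defined elsewhere in the project. For reference, here is a sketch of the conventional TF 1.x multi-GPU gradient averaging used with this tower pattern (it follows the TensorFlow CIFAR-10 multi-GPU tutorial style and assumes import tensorflow as tf, as in the examples above); the project's own implementation may differ, e.g. in how None gradients are handled.

# Sketch of the conventional multi-GPU gradient averaging (TF 1.x CIFAR-10
# tutorial style); shown for reference, not taken from the project source.
def _average_gradients(tower_grads):
    average_grads = []
    # tower_grads is a list (one entry per GPU) of lists of (gradient, variable)
    # pairs; zip(*tower_grads) groups the entries for the same variable.
    for grad_and_vars in zip(*tower_grads):
        # Add a leading "tower" dimension to each gradient and average over it.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(grads, 0), 0)
        # The variable is shared across towers, so the first tower's copy suffices.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads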