Exemplo n.º 1
0
def train(loss, init_fn, hparams):
  """Builds and returns the training op for `loss`.

  Unlike the other variants in this file, this one does not start a
  training loop; it only constructs the train op, so `init_fn` is unused
  here and kept for interface compatibility.

  Args:
    loss: a loss tensor to minimize.
    init_fn: unused; accepted so callers can share a signature with the
      loop-running variants.
    hparams: model hyper parameters used to build the optimizer.

  Returns:
    A `train_op` tensor that evaluates the loss and applies clipped
    gradients when run.
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    # Aggregate gradient updates across replicas before applying them.
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    # NOTE: the original also set sync_optimizer/startup_delay_steps here,
    # but they were never used since this function returns the train op
    # without calling slim.learning.train — removed as dead code.

  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      clip_gradient_norm=FLAGS.clip_gradient_norm)
  return train_op
Exemplo n.º 2
0
def train(loss, init_fn, hparams):
    """Wraps slim.learning.train to run a training loop.

    This variant builds the train op by hand (compute -> clip -> apply
    gradients) instead of using slim.learning.create_train_op, and pins
    the global step, optimizer creation, and the training loop to the CPU.

    Args:
      loss: a loss tensor to minimize.
      init_fn: A callable to be executed after all other initialization is
        done.
      hparams: model hyper parameters used to build the optimizer.
    """
    # Keep the global step variable on the CPU.
    with tf.device("/cpu:0"):
        global_step = slim.get_or_create_global_step()

    # Create the optimizer (and its slot variables) on the CPU as well.
    with tf.device("/cpu:0"):
        optimizer = create_optimizer(hparams)

    if FLAGS.sync_replicas:
        # Aggregate gradient updates across replicas before applying them.
        replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
        optimizer = tf.LegacySyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            replica_id=replica_id,
            total_num_replicas=FLAGS.total_num_replicas)
        sync_optimizer = optimizer
        startup_delay_steps = 0
    else:
        startup_delay_steps = 0
        sync_optimizer = None

    # Manual replacement for slim.learning.create_train_op (kept below for
    # reference): compute gradients, clip them by norm, then apply.
    #train_op = slim.learning.create_train_op(
    #    loss,
    #    optimizer,
    #    summarize_gradients=True,
    #    clip_gradient_norm=FLAGS.clip_gradient_norm)
    grad = optimizer.compute_gradients(loss)
    clipped_grad = tf.contrib.training.clip_gradient_norms(
        grad, FLAGS.clip_gradient_norm)
    update = optimizer.apply_gradients(clipped_grad, global_step=global_step)
    # train_op evaluates to the loss, but only after the gradient update ran.
    with tf.control_dependencies([update]):
        train_op = tf.identity(loss, name='train_op')

    # Allocate GPU memory on demand instead of grabbing it all up front.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True

    with tf.device("/cpu:0"):
        slim.learning.train(train_op=train_op,
                            logdir=FLAGS.train_log_dir,
                            graph=loss.graph,
                            master=FLAGS.master,
                            is_chief=(FLAGS.task == 0),
                            number_of_steps=FLAGS.max_number_of_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            save_interval_secs=FLAGS.save_interval_secs,
                            startup_delay_steps=startup_delay_steps,
                            sync_optimizer=sync_optimizer,
                            init_fn=init_fn,
                            session_config=session_config)
Exemplo n.º 3
0
def train(loss, init_fn, hparams):
  """Builds a train op for `loss` and runs the slim training loop.

  Args:
    loss: a loss tensor to minimize.
    init_fn: A callable to be executed after all other initialization is
      done.
    hparams: model hyper parameters used to build the optimizer.
  """
  optimizer = create_optimizer(hparams)

  if FLAGS.sync_replicas:
    # Aggregate gradient updates across replicas before applying them.
    replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
    optimizer = tf.LegacySyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        replica_id=replica_id,
        total_num_replicas=FLAGS.total_num_replicas)
    sync_optimizer, startup_delay_steps = optimizer, 0
  else:
    sync_optimizer, startup_delay_steps = None, 0

  train_op = slim.learning.create_train_op(
      loss,
      optimizer,
      summarize_gradients=True,
      aggregation_method=2,
      clip_gradient_norm=FLAGS.clip_gradient_norm)

  # Grow GPU memory usage on demand rather than pre-allocating all of it.
  session_config = tf.ConfigProto()
  session_config.gpu_options.allow_growth = True

  slim.learning.train(
      train_op=train_op,
      logdir=FLAGS.train_log_dir,
      graph=loss.graph,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      number_of_steps=FLAGS.max_number_of_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      startup_delay_steps=startup_delay_steps,
      sync_optimizer=sync_optimizer,
      init_fn=init_fn,
      session_config=session_config)
Exemplo n.º 4
0
def train(loss, init_fn, CharacterAccuracy, SequenceAccuracy, hparams):
    """Wraps slim.learning.train to run a training loop.

    This variant uses a custom `train_step` function and forwards the two
    accuracy tensors to it via `train_step_kwargs`.

    Args:
      loss: a loss tensor to minimize.
      init_fn: A callable to be executed after all other initialization is
        done.
      CharacterAccuracy: tensor handed to the custom `train_step` through
        `train_step_kwargs`.
      SequenceAccuracy: tensor handed to the custom `train_step` through
        `train_step_kwargs`.
      hparams: model hyper parameters used to build the optimizer.
    """
    optimizer = create_optimizer(hparams)

    if FLAGS.sync_replicas:
        # Aggregate gradient updates across replicas before applying them.
        replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
        optimizer = tf.LegacySyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            replica_id=replica_id,
            total_num_replicas=FLAGS.total_num_replicas)
        sync_optimizer = optimizer
        startup_delay_steps = 0
    else:
        startup_delay_steps = 0
        sync_optimizer = None

    train_op = slim.learning.create_train_op(
        loss,
        optimizer,
        summarize_gradients=True,
        clip_gradient_norm=FLAGS.clip_gradient_norm)

    slim.learning.train(train_op=train_op,
                        train_step_fn=train_step,
                        train_step_kwargs={
                            'CharacterAccuracy': CharacterAccuracy,
                            'SequenceAccuracy': SequenceAccuracy
                        },
                        logdir=FLAGS.train_log_dir,
                        graph=loss.graph,
                        master=FLAGS.master,
                        is_chief=(FLAGS.task == 0),
                        number_of_steps=FLAGS.max_number_of_steps,
                        save_summaries_secs=FLAGS.save_summaries_secs,
                        save_interval_secs=FLAGS.save_interval_secs,
                        startup_delay_steps=startup_delay_steps,
                        sync_optimizer=sync_optimizer,
                        init_fn=init_fn)
Exemplo n.º 5
0
def train_multigpu(losses, init_fn, hparams):
    """Wraps slim.learning.train to run a multi-GPU training loop.

    Each clone's loss is scaled by 1/num_clones, its gradients are
    computed on that clone's GPU, the per-clone gradients are summed,
    clipped by norm, and applied once to the shared variables.

    Args:
      losses: iterable of (loss_tensor, clone_index) pairs, one per GPU
        clone; clone 0 additionally receives the regularization losses.
      init_fn: A callable to be executed after all other initialization is
        done.
      hparams: a model hyper parameters
    """
    # Global step and optimizer state live on the CPU.
    with tf.device("/cpu:0"):
        global_step = slim.create_global_step()

    with tf.device("/cpu:0"):
        optimizer = create_optimizer(hparams)

    if FLAGS.sync_replicas:
        # Aggregate gradient updates across replicas before applying them.
        replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
        optimizer = tf.LegacySyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            replica_id=replica_id,
            total_num_replicas=FLAGS.total_num_replicas)
        sync_optimizer = optimizer
        startup_delay_steps = 0
    else:
        startup_delay_steps = 0
        sync_optimizer = None

    #train_op = slim.learning.create_train_op(
    #    loss,
    #    optimizer,
    #    summarize_gradients=True,
    #    clip_gradient_norm=FLAGS.clip_gradient_norm)
    #with tf.device("/cpu:0"):
    #  tf.summary.scalar('TotalLoss_all', total_loss)
    #  grad = optimizer.compute_gradients(total_loss)
    #with tf.device("/cpu:0"):
    #  with ops.name_scope('summarize_grads'):
    #    add_gradients_summaries(grad)
    #  clipped_grad = tf.contrib.training.clip_gradient_norms(grad, FLAGS.clip_gradient_norm)
    #  update = optimizer.apply_gradients(clipped_grad, global_step=global_step)
    #with tf.control_dependencies([update]):
    #  train_op = tf.identity(total_loss, name='train_op')

    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, "clone_0")

    grads = []
    total_loss = []
    for loss, i in losses:
        # Compute each clone's gradients on its own GPU.
        with tf.device("/gpu:{0}".format(i)):
            # Scale so the summed loss averages over clones.
            scaled_loss = tf.div(loss, 1.0 * FLAGS.num_clones)
            if i == 0:
                # Regularization is added exactly once (on clone 0) so it
                # is not double-counted across clones.
                regularization_loss = tf.add_n(
                    tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
                scaled_loss = scaled_loss + regularization_loss
            total_loss.append(scaled_loss)
            grad = optimizer.compute_gradients(scaled_loss)
            #if i == 0:
            #  with tf.device("/cpu:0"):
            #    with ops.name_scope("summarize_grads_{0}".format(i)):
            #      add_gradients_summaries(grad)
            grads.append(grad)
    total_loss = tf.add_n(total_loss)
    with tf.device("/cpu:0"):
        tf.summary.scalar('Total_Loss', total_loss)
    # Sum per-clone gradients, clip by norm, and apply once.
    sum_grad = _sum_clones_gradients(grads)
    clipped_grad = tf.contrib.training.clip_gradient_norms(
        sum_grad, FLAGS.clip_gradient_norm)
    update = optimizer.apply_gradients(clipped_grad, global_step=global_step)
    update_ops.append(update)

    # train_op yields total_loss only after the gradient application and
    # all batch-norm style update ops have run.
    with tf.control_dependencies([tf.group(*update_ops)]):
        train_op = tf.identity(total_loss, name='train_op')

    # Allocate GPU memory on demand instead of grabbing it all up front.
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    #session_config.log_device_placement = True

    with tf.device("/cpu:0"):
        slim.learning.train(train_op=train_op,
                            logdir=FLAGS.train_log_dir,
                            graph=total_loss.graph,
                            master=FLAGS.master,
                            is_chief=(FLAGS.task == 0),
                            number_of_steps=FLAGS.max_number_of_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            trace_every_n_steps=1000,
                            save_interval_secs=FLAGS.save_interval_secs,
                            startup_delay_steps=startup_delay_steps,
                            sync_optimizer=sync_optimizer,
                            init_fn=init_fn,
                            session_config=session_config)
Exemplo n.º 6
0
def train(hparams):
    """Run training loop.

    Builds a CNF model (tree or sequence variant), an exponentially
    decaying learning-rate schedule, and one of sgd/adam/rmsprop
    optimizers, then trains under a tf.Supervisor. The chief task
    (FLAGS.task == 0) additionally runs a periodic summary loop that
    samples formulas from the model.

    Args:
      hparams: hyper parameters providing learning_rate, decay_steps,
        learning_rate_decay_factor, optimizer, grad_max_norm, max_steps.

    Raises:
      RuntimeError: if hparams.optimizer is not one of
        'sgd'/'adam'/'rmsprop'.
    """
    data_iterator, clause_metadata = load_data(random_start=True)

    with tf.device(tf.train.replica_device_setter(FLAGS.ps_tasks)):
        # The following three lines prevent hangs during distributed training.
        vs = tf.get_variable_scope()
        if vs.caching_device is None:
            vs.set_caching_device(lambda op: op.device)

        # Build the graph.
        global_step = slim.variables.get_or_create_global_step()
        if FLAGS.model_type == 'tree':
            m = cnf_model.CNFTreeModel(data_iterator, hparams, clause_metadata)
        else:
            m = cnf_model.CNFSequenceModel(data_iterator, hparams,
                                           clause_metadata)

        variables = tf.trainable_variables()

        # Staircase decay: the rate drops by decay_factor every decay_steps.
        learning_rate = tf.train.exponential_decay(
            hparams.learning_rate,
            global_step,
            hparams.decay_steps,
            hparams.learning_rate_decay_factor,
            staircase=True)

        if hparams.optimizer == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        elif hparams.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        elif hparams.optimizer == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate,
                                                  decay=0.9,
                                                  momentum=0.9,
                                                  epsilon=1e-5)
        else:
            raise RuntimeError('Unknown optimizer %s' % hparams.optimizer)

        # Sync replicas only applies in a real distributed setup.
        if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
            replica_id = tf.constant(FLAGS.task, tf.int32, shape=())
            optimizer = tf.LegacySyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                replica_id=replica_id,
                total_num_replicas=FLAGS.worker_replicas)

        tf.contrib.deprecated.scalar_summary('lr', learning_rate)
        tf.contrib.deprecated.scalar_summary('loss', m.loss)
        for metric_name, metric_value in m.metrics.items():
            tf.contrib.deprecated.scalar_summary('metric/' + metric_name,
                                                 metric_value)

        grads_and_vars = optimizer.compute_gradients(m.loss, variables)
        if hparams.grad_max_norm > 0:
            # Clip by the global norm across all gradients.
            g, v = zip(*grads_and_vars)
            g, global_norm = tf.clip_by_global_norm(g, hparams.grad_max_norm)
            tf.contrib.deprecated.scalar_summary('global_norm', global_norm)
            # NOTE(review): under Python 3 this zip is a one-shot iterator;
            # it is consumed exactly once by apply_gradients below.
            grads_and_vars = zip(g, v)
        train_op = optimizer.apply_gradients(grads_and_vars, global_step)
        summary_op = tf.get_summary_op()

        if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
            init_token_op = optimizer.get_init_tokens_op()
            chief_queue_runner = optimizer.get_chief_queue_runner()

        saver = tf.Saver(keep_checkpoint_every_n_hours=1.0)

        supervisor = tf.Supervisor(
            is_chief=(FLAGS.task == 0),
            logdir=FLAGS.tf_log_dir,
            global_step=global_step,
            saver=saver,
            # We are going to compute summaries ourselves.
            summary_op=None,
            save_model_secs=FLAGS.save_model_secs,
            # But we set this so that this computes global_step/sec.
            save_summaries_secs=FLAGS.save_summaries_secs)
        sess = supervisor.prepare_or_wait_for_session(FLAGS.master)

        # TODO(ricshin):
        # Rewrite this to use supervisor.managed_session().
        # Look at how slim/learning.py handles SyncReplicas, in particular
        # init_token_op.  Use normal text summaries once they exist.
        # Use supervisor.should_stop().
        if FLAGS.task == 0:
            if FLAGS.master not in ('', 'local') and FLAGS.sync_replicas:
                supervisor.start_queue_runners(sess, [chief_queue_runner])
                sess.run(init_token_op)

            sampling_temps = [
                float(x) for x in FLAGS.sampling_temps.split(',')
            ]

            def summarize():
                # Runs periodically on the chief: evaluates the summary op,
                # samples formulas from the model, and writes both to the
                # summary writer.
                try:
                    summary_strs, global_step_val = sess.run(
                        [summary_op, global_step])
                    summaries = tf.Summary.FromString(summary_strs)

                    for i, temp in itertools.product(
                            xrange(FLAGS.num_summary_samples), sampling_temps):
                        # NOTE(review): `temp` is never passed to m.sample,
                        # so the temperature in the tag does not affect the
                        # sample itself — confirm whether m.sample should
                        # take a temperature argument.
                        cnf = textwrap.wrap(
                            cnf_utils.unparse_cnf(m.sample(sess)))
                        summaries.value.add(
                            tag='formula_temp%g_%d' % (temp, i),
                            tensor=make_tensor_proto('\n'.join(cnf)))

                    supervisor.summary_writer.add_summary(
                        summaries.SerializeToString(), global_step_val)
                    status_str = ', '.join('%s=%f' %
                                           (value.tag, value.simple_value)
                                           for value in summaries.value
                                           if value.HasField('simple_value'))
                    tf.logging.info('step=%d: %s', global_step_val, status_str)
                except:
                    # The supervisor eats the backtrace, so print it here.
                    traceback.print_exc()
                    raise
            supervisor.loop(FLAGS.save_summaries_secs, summarize)

        # Run the trainer.
        for unused_i in xrange(hparams.max_steps):
            sess.run(train_op)