Example #1
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.

  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'

  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()

  # Build inference Graph.
  logits = cifar10.inference(images)

  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)

  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)

  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')

  # Attach a scalar summary to all individual losses and the total loss.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    tf.contrib.deprecated.scalar_summary(loss_name, l)

  return total_loss
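For context, tower_loss is normally invoked once per GPU inside the multi-GPU training loop. A minimal sketch of that loop, assuming the usual cifar10_multi_gpu_train.py layout (FLAGS.num_gpus and the optimizer choice are assumptions here, not shown in the example; cifar10.TOWER_NAME is 'tower'):

# Hedged sketch of the surrounding multi-GPU loop; FLAGS.num_gpus and the
# GradientDescentOptimizer are assumptions, not part of the example above.
tower_grads = []
opt = tf.train.GradientDescentOptimizer(0.1)
for i in xrange(FLAGS.num_gpus):
  with tf.device('/gpu:%d' % i):
    with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
      loss = tower_loss(scope)                   # per-tower loss from the function above
      tf.get_variable_scope().reuse_variables()  # share model variables across towers
      tower_grads.append(opt.compute_gradients(loss))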
Example #2
def tower_loss(scope):
  """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
  # Get images and labels for CIFAR-10.
  images, labels = cifar10.distorted_inputs()
  # Build inference Graph.
  logits = cifar10.inference(images)
  # Build the portion of the Graph calculating the losses. Note that we will
  # assemble the total_loss using a custom function below.
  _ = cifar10.loss(logits, labels)
  # Assemble all of the losses for the current tower only.
  losses = tf.get_collection('losses', scope)
  # Calculate the total loss for the current tower.
  total_loss = tf.add_n(losses, name='total_loss')
  # Compute the moving average of all individual losses and the total loss.
  loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
  loss_averages_op = loss_averages.apply(losses + [total_loss])
  # Attach a scalar summary to all individual losses and the total loss; do the
  # same for the averaged version of the losses.
  for l in losses + [total_loss]:
    # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
    # session. This helps the clarity of presentation on tensorboard.
    loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
    tf.scalar_summary(loss_name +' (raw)', l)
    tf.scalar_summary(loss_name, loss_averages.average(l))
  with tf.control_dependencies([loss_averages_op]):
    total_loss = tf.identity(total_loss)
  return total_loss
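The closing with tf.control_dependencies block is what makes the moving-average update actually run: total_loss is re-bound to a tf.identity op that depends on loss_averages_op, so fetching total_loss also triggers the average update. The same idiom in isolation (some_metric is a stand-in tensor, purely for illustration):

# Illustrative sketch of the dependency idiom used above; 'some_metric' is hypothetical.
ema = tf.train.ExponentialMovingAverage(0.9, name='avg')
update_op = ema.apply([some_metric])
with tf.control_dependencies([update_op]):
  some_metric = tf.identity(some_metric)  # evaluating some_metric now also runs update_op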
Example #3
def train():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar10.distorted_inputs()

        logits = cifar10_resnet(images)

        loss = cifar10.loss(logits, labels)

        train_op = cifar10.train(loss, global_step)

        summary_op = tf.merge_all_summaries()

        init = tf.initialize_all_variables()

        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        tf.train.start_queue_runners(sess=sess)

        for step in xrange(FLAGS.max_steps):
            _, loss_value = sess.run([train_op, loss])

            if step % 10 == 0:
                print 'step %d, loss = %.3f' % (step, loss_value)
Example #4
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference6(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
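Most of the train() variants on this page assume the same module-level imports and command-line flags, which the snippets omit. A plausible preamble, matching the defaults of the stock cifar10_train.py but offered here only as an assumption:

# Assumed preamble shared by most snippets on this page (not shown in the examples).
from datetime import datetime
import os
import re
import time
import numpy as np
import tensorflow as tf
from six.moves import xrange
from tensorflow.models.image.cifar10 import cifar10

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                           """Directory where to write event logs and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000, """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")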
Example #5
def tower_loss(scope):
    """Calculate the total loss on a single tower running the CIFAR model.
  Args:
    scope: unique prefix string identifying the CIFAR tower, e.g. 'tower_0'
  Returns:
     Tensor of shape [] containing the total loss for a batch of data
  """
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    # Build inference Graph.
    logits = cifar10.inference(images)
    # Build the portion of the Graph calculating the losses. Note that we will
    # assemble the total_loss using a custom function below.
    _ = cifar10.loss(logits, labels)
    # Assemble all of the losses for the current tower only.
    losses = tf.get_collection('losses', scope)
    # Calculate the total loss for the current tower.
    total_loss = tf.add_n(losses, name='total_loss')
    # Compute the moving average of all individual losses and the total loss.
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    loss_averages_op = loss_averages.apply(losses + [total_loss])
    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
        # Remove 'tower_[0-9]/' from the name in case this is a multi-GPU training
        # session. This helps the clarity of presentation on tensorboard.
        loss_name = re.sub('%s_[0-9]*/' % cifar10.TOWER_NAME, '', l.op.name)
        # Name each loss as '(raw)' and name the moving average version of the loss
        # as the original loss name.
        tf.scalar_summary(loss_name + ' (raw)', l)
        tf.scalar_summary(loss_name, loss_averages.average(l))
    with tf.control_dependencies([loss_averages_op]):
        total_loss = tf.identity(total_loss)
    return total_loss
Example #6
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()
        print('Finished getting images & labels')

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = modified_inference(images)
        print('Finished building inference graph')

        # Calculate loss.
        loss = cifar10.loss(logits, labels)
        print('Finished building loss graph')

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)
        print('Finished building train graph')

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1

            def before_run(self, run_context):
                self._step += 1
                self._start_time = time.time()
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                duration = time.time() - self._start_time
                loss_value = run_values.results
                if self._step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    _LoggerHook()
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:
            print('Hooks attached, starting training')
            while not mon_sess.should_stop():
                mon_sess.run(train_op)
Example #7
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.contrib.framework.get_or_create_global_step()

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    #images, labels = cifar10.inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    class _LoggerHook(tf.train.SessionRunHook):
      """Logs loss and runtime."""

      def begin(self):
        self._step = -1

      def before_run(self, run_context):
        self._step += 1
        self._start_time = time.time()
        return tf.train.SessionRunArgs(loss)  # Asks for loss value.

      def after_run(self, run_context, run_values):
        duration = time.time() - self._start_time
        loss_value = run_values.results
        if self._step % 10 == 0:
          num_examples_per_step = FLAGS.batch_size
          examples_per_sec = num_examples_per_step / duration
          sec_per_batch = float(duration)

          format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
          print (format_str % (datetime.now(), self._step, loss_value,
                               examples_per_sec, sec_per_batch))

    with tf.train.MonitoredTrainingSession(
        checkpoint_dir=FLAGS.train_dir,
        hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
               tf.train.NanTensorHook(loss),
               _LoggerHook()],
        config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)) as mon_sess:
      while not mon_sess.should_stop():
        mon_sess.run(train_op)
Example #8
def multilevel_train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        init = tf.initialize_all_variables()
        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        # train_op = cifar10.train(loss, global_step)
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Create a saver.
        # saver = tf.train.Saver(tf.all_variables())
        model_dir_exp = os.path.expanduser(
            "/home/chenz/Workspace/Analysis/Parameters")
        ckpt_file = "model.ckpt-1500"
        meta_file = "model.ckpt-1500.meta"
        saver = tf.train.import_meta_graph(
            os.path.join(model_dir_exp, meta_file))
        #saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
        saver.restore(sess, os.path.join(model_dir_exp, ckpt_file))
        #saver = load_model("/home/chenz/Workspace/Analysis/Parameters", "model.ckpt-1500.meta", "model.ckpt-1500")

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.

        # Start running operations on the Graph.
        #sess = tf.Session(config=tf.ConfigProto(
        #    log_device_placement=FLAGS.log_device_placement))
        #sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        loss_value = sess.run(loss)
        print(loss_value)
Example #9
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    images, labels = cifar10.distorted_inputs()

    logits = cifar10.inference(images)
    
    ## EDIT: Softmax activation
    softmax = tf.nn.softmax(logits)

    loss = cifar10.loss(softmax, labels)
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()


    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example #10
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)
    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()
    print(labels)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)
    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())
    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()
    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)
    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                            graph_def=sess.graph_def)
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # Save the model checkpoint periodically.
      if step % 10 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        print("module save")
        saver.save(sess, checkpoint_path, global_step=step)
Example #11
def test_read_cifar10():
    from tensorflow.models.image.cifar10 import cifar10
    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string(
        'my_train_dir', '../cifar10_model/model1',
        """Directory where to write event logs """
        """and checkpoint.""")
    cifar10.maybe_download_and_extract()
    if tf.gfile.Exists(FLAGS.my_train_dir):
        tf.gfile.DeleteRecursively(FLAGS.my_train_dir)
    tf.gfile.MakeDirs(FLAGS.my_train_dir)
    with tf.Session() as sess:
        images, labels = cifar10.distorted_inputs()
        sess.run(tf.initialize_all_variables())
        # distorted_inputs() builds a queue-based input pipeline, so the queue
        # runners must be started or the sess.run() below will block forever.
        tf.train.start_queue_runners(sess=sess)
        a, b = sess.run([images, labels])
        print(len(a), len(a[0]))
Example #12
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    #GETTING THE TRAINING IMAGES
    images, labels = cifar10.distorted_inputs()
    # DATA FOR GRAPH.
    logits = cifar10.inference(images)
    # LOSS FUNCTION
    loss = cifar10.loss(logits, labels)

    # CREATING AND RUNNING A TENSORBOARD GRAPH
    train_op = cifar10.train(loss, global_step)
    saver = tf.train.Saver(tf.all_variables())
    summary_op = tf.merge_all_summaries()
    init = tf.initialize_all_variables()
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # START THE QUEUE
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time
      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)
        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))
      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
      # SAVE CHECKPOINT TO EVALUATE PERIODICALLY
      if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example #13
def train():
    # ops
    global_step = tf.Variable(0, trainable=False)
    images, labels = cifar10.distorted_inputs()
    logits = cifar10.inference(tf.image.resize_images(images, cifar10.IMAGE_SIZE, cifar10.IMAGE_SIZE))
    loss = cifar10.loss(logits, labels)
    train_op = cifar10.train(loss, global_step)
    summary_op = tf.merge_all_summaries()

    with tf.Session() as sess:
        saver = tf.train.Saver(tf.all_variables(), max_to_keep=21)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # restore or initialize variables
        ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            sess.run(tf.initialize_all_variables())

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        start = sess.run(global_step)
        for step in xrange(start, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            print '%d: %f (%.3f sec/batch)' % (step, loss_value, duration)

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
            if step % 500 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example #14
def train():
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  print ('PS hosts are: %s' % ps_hosts)
  print ('Worker hosts are: %s' % worker_hosts)

  server = tf.train.Server(
      {'ps': ps_hosts, 'worker': worker_hosts},
      job_name = FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()

  is_chief = (FLAGS.task_id == 0)
  if is_chief:
    if tf.gfile.Exists(FLAGS.train_dir):
      tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

  """Train CIFAR-10 for a number of steps."""
  cluster = tf.train.ClusterSpec({'ps': ps_hosts, 'worker': worker_hosts})
  device_setter = tf.train.replica_device_setter(cluster=cluster)
  with tf.device(device_setter):
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
                                    global_step,
                                    decay_steps,
                                    LEARNING_RATE_DECAY_FACTOR,
                                    staircase=True)
    tf.scalar_summary('learning_rate', lr)
    opt = tf.train.GradientDescentOptimizer(lr)


    # Track the moving averages of all trainable variables.
    exp_moving_averager = tf.train.ExponentialMovingAverage(
        MOVING_AVERAGE_DECAY, global_step)
    variables_to_average = (
        tf.trainable_variables() + tf.moving_average_variables())

    opt = tf.train.SyncReplicasOptimizer(
        opt,
        replicas_to_aggregate=len(worker_hosts),
        replica_id=FLAGS.task_id,
        total_num_replicas=len(worker_hosts),
        variable_averages=exp_moving_averager,
        variables_to_average=variables_to_average)


    # Compute gradients with respect to the loss.
    grads = opt.compute_gradients(loss)

    # Add histograms for gradients.
    for grad, var in grads:
      if grad is not None:
        tf.histogram_summary(var.op.name + '/gradients', grad)

    apply_gradients_op = opt.apply_gradients(grads, global_step=global_step)

    with tf.control_dependencies([apply_gradients_op]):
      train_op = tf.identity(loss, name='train_op')


    chief_queue_runners = [opt.get_chief_queue_runner()]
    init_tokens_op = opt.get_init_tokens_op()

    saver = tf.train.Saver()
    # We run the summaries in the same thread as the training operations by
    # passing in None for summary_op to avoid a summary_thread being started.
    # Running summaries and training operations in parallel could run out of
    # GPU memory.
    sv = tf.train.Supervisor(is_chief=is_chief,
                             logdir=FLAGS.train_dir,
                             init_op=tf.initialize_all_variables(),
                             summary_op=tf.merge_all_summaries(),
                             global_step=global_step,
                             saver=saver,
                             save_model_secs=60)

    tf.logging.info('%s Supervisor' % datetime.now())

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    print ("Before session init")
    # Get a session.
    sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
    print ("Before session init done")

    # Start the queue runners.
    queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
    sv.start_queue_runners(sess, queue_runners)
    print ('Started %d queues for processing input data.' % len(queue_runners))

    sv.start_queue_runners(sess, chief_queue_runners)
    sess.run(init_tokens_op)

    print ('Start training')
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, gs = sess.run([train_op, loss, global_step])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, gs, loss_value,
                             examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
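The distributed variant above additionally expects cluster-related flags (and constants such as NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN and INITIAL_LEARNING_RATE) that are not shown. A plausible set of flag definitions, stated as an assumption rather than the original script's exact code:

# Assumed flag definitions for the distributed example (not shown above).
tf.app.flags.DEFINE_string('ps_hosts', '',
                           """Comma-separated list of parameter server host:port pairs.""")
tf.app.flags.DEFINE_string('worker_hosts', '',
                           """Comma-separated list of worker host:port pairs.""")
tf.app.flags.DEFINE_string('job_name', 'worker', """Either 'ps' or 'worker'.""")
tf.app.flags.DEFINE_integer('task_id', 0, """Index of the task within its job.""")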
Example #15
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        if FLAGS.checkpoint_dir is not None:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_dir)
            print("checkpoint path is %s" % ckpt.model_checkpoint_path)
            tf.train.Saver().restore(sess, ckpt.model_checkpoint_path)

        # Start the queue runners.
        print("FLAGS.checkpoint_dir is %s" % FLAGS.checkpoint_dir)
        tf.train.start_queue_runners(sess=sess)
        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        cur_step = sess.run(global_step)
        print("current step is %s" % cur_step)
        interrupt_check_duration = 0.0
        elapsed_time = time.time()
        flag = 0
        for step in xrange(cur_step, FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time
            interrupt_check_duration += duration
            if float(interrupt_check_duration) > 5.0:
                print("checking for interruption: %s", interrupt_check_duration)
                if decision_for_migration():
                    print("have to migrate")
                    checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                    print("checkpoint path is %s" % checkpoint_path)
                    saver.save(sess, checkpoint_path, global_step=step)
                    random_id = generate_random_prefix()
                    start_new_instance(checkpoint_path, step, random_id)
                    upload_checkpoint_to_s3(checkpoint_path, step, "mj-bucket-1", random_id)
                    break
                else:
                    print("not interrupted")
                interrupt_check_duration = 0.0
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
            elapsed = (int(time.time() - elapsed_time))
            if elapsed % 300 == 0 and flag == 0:
                print("uploading current status")
                uploading_current_status_to_rds(step)
                flag = 1
            elif elapsed % 300 != 0 and flag == 1:
                flag = 0
Example #16
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        with tf.variable_scope("model") as scope:
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()
            images_eval, labels_eval = cifar10.inputs(eval_data=True)

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)
            scope.reuse_variables()
            logits_eval = cifar10.inference(images_eval)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)

            # For evaluation
            top_k = tf.nn.in_top_k(logits, labels, 1)
            top_k_eval = tf.nn.in_top_k(logits_eval, labels_eval, 1)

            # Add precision summary
            summary_train_prec = tf.placeholder(tf.float32)
            summary_eval_prec = tf.placeholder(tf.float32)
            tf.scalar_summary('precision/train', summary_train_prec)
            tf.scalar_summary('precision/eval', summary_eval_prec)

            # Build a Graph that trains the model with one batch of examples and
            # updates the model parameters.
            train_op = cifar10.train(loss, global_step)

            # Create a saver.
            saver = tf.train.Saver(tf.all_variables())

            # Build the summary operation based on the TF collection of Summaries.
            summary_op = tf.merge_all_summaries()

            # Build an initialization operation to run below.
            init = tf.initialize_all_variables()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement))
            sess.run(init)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)

            summary_writer = tf.train.SummaryWriter(FLAGS.train_dir,
                                                    graph_def=sess.graph_def)

            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                EVAL_STEP = 10
                EVAL_NUM_EXAMPLES = 1024
                if step % EVAL_STEP == 0:
                    prec_train = evaluate_set(sess, top_k, EVAL_NUM_EXAMPLES)
                    prec_eval = evaluate_set(sess, top_k_eval,
                                             EVAL_NUM_EXAMPLES)
                    print('%s: precision train = %.3f' %
                          (datetime.now(), prec_train))
                    print('%s: precision eval  = %.3f' %
                          (datetime.now(), prec_eval))

                if step % 100 == 0:
                    summary_str = sess.run(summary_op,
                                           feed_dict={
                                               summary_train_prec: prec_train,
                                               summary_eval_prec: prec_eval
                                           })
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    checkpoint_path = os.path.join(FLAGS.train_dir,
                                                   'model.ckpt')
                    saver.save(sess, checkpoint_path, global_step=step)
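The precision logging above calls an evaluate_set helper that is not included in the snippet. A minimal sketch of what such a helper could look like, assuming it simply runs the given in_top_k op over enough batches to cover num_examples:

# Hypothetical helper assumed by the snippet above (requires 'import math').
def evaluate_set(sess, top_k_op, num_examples):
    """Run top_k_op over roughly num_examples inputs and return the precision."""
    num_iter = int(math.ceil(num_examples / float(FLAGS.batch_size)))
    true_count = 0
    for _ in xrange(num_iter):
        true_count += np.sum(sess.run(top_k_op))
    return true_count / float(num_iter * FLAGS.batch_size)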
Example #17
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print('PS hosts are: %s' % ps_hosts)
    print('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server({
        'ps': ps_hosts,
        'worker': worker_hosts
    },
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)

    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(
                allow_soft_placement=True,
                log_device_placement=FLAGS.log_device_placement)

            print("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)
            print("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print('Started %d queues for processing input data.' %
                  len(queue_runners))
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = (
                        '%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)'
                    )
                    print(format_str % (datetime.now(), step, gs, loss_value,
                                        examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess,
                   os.path.join(FLAGS.train_dir, 'model.ckpt'),
                   global_step=global_step)
Example #18
                          ksize=[1, 3, 3, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME')


cifar10.maybe_download_and_extract()
with tf.Graph().as_default(), tf.device('/cpu:0'):
    opt = tf.train.AdamOptimizer(1e-4)

    # Calculate the gradients for each model tower.
    tower_grads = []
    for i in xrange(NUM_GPU):
        with tf.device('/gpu:%d' % i):
            with tf.name_scope('%s_%d' % ('tower', i)) as scope:
                # Contains the train-time augmentation code; it lives in tensorflow/models/image/cifar10.py.
                images, labels = cifar10.distorted_inputs()

                # conv1. A deliberately short layer stack is used to keep the example easy to follow.
                with tf.variable_scope('h_conv1') as scope:
                    W_conv1 = weight_variable([5, 5, 3, 64], 'W_conv1')
                    b_conv1 = bias_variable([64], 'b_conv1')
                    h_conv1 = tf.nn.relu(conv2d(images, W_conv1) + b_conv1,
                                         name=scope.name)

                # pool1
                h_pool1 = tf.nn.max_pool(h_conv1,
                                         ksize=[1, 3, 3, 1],
                                         strides=[1, 2, 2, 1],
                                         padding='SAME',
                                         name='h_pool1')
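Example #18 relies on helper functions (weight_variable, bias_variable, conv2d) and a pooling helper whose truncated tail appears at the top of the snippet; their definitions were cut off. A sketch consistent with standard TensorFlow tutorial-style helpers, offered as an assumption rather than the original code:

# Assumed helper definitions (truncated in the original snippet).
def weight_variable(shape, name):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1), name=name)

def bias_variable(shape, name):
    return tf.Variable(tf.constant(0.1, shape=shape), name=name)

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')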
Example #19
def main(unused_argv):
    cifar10.maybe_download_and_extract()
    if FLAGS.download_only:
        sys.exit(0)
    if FLAGS.job_name is None or FLAGS.job_name == "":
        raise ValueError("Must specify an explicit `job_name`")
    if FLAGS.task_index is None or FLAGS.task_index == "":
        raise ValueError("Must specify an explicit `task_index`")

    print("job name = %s" % FLAGS.job_name)
    print("task index = %d" % FLAGS.task_index)

    #Construct the cluster and start the server
    ps_spec = FLAGS.ps_hosts.split(",")
    worker_spec = FLAGS.worker_hosts.split(",")

    #Approximation Layers
    approx_layers = FLAGS.layers_to_train.split(",")
    len_approx_layers = len(approx_layers)

    # Get the number of workers.
    num_workers = len(worker_spec)
    num_ps = len(ps_spec)

    cluster = tf.train.ClusterSpec({"ps": ps_spec, "worker": worker_spec})

    if not FLAGS.existing_servers:
        # Not using existing servers. Create an in-process server.
        server = tf.train.Server(cluster,
                                 job_name=FLAGS.job_name,
                                 task_index=FLAGS.task_index)
        if FLAGS.job_name == "ps":
            server.join()

    is_chief = (FLAGS.task_index == 0)
    if FLAGS.num_gpus > 0:
        if FLAGS.num_gpus < num_workers:
            raise ValueError("number of gpus is less than number of workers")
        # Avoid gpu allocation conflict: now allocate task_num -> #gpu
        # for each worker in the corresponding machine
        gpu = (FLAGS.task_index % FLAGS.num_gpus)
        worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
    elif FLAGS.num_gpus == 0:
        # Just allocate the CPU to worker server
        cpu = 0
        worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
    # The device setter will automatically place Variables ops on separate
    # parameter servers (ps). The non-Variable ops will be placed on the workers.
    # The ps use CPU and workers use corresponding GPU
    with tf.device(
            tf.train.replica_device_setter(
                worker_device=worker_device,
                ps_device="/job:ps/cpu:0",
                cluster=cluster,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_ps, tf.contrib.training.byte_size_load_fn))):
        global_step = tf.Variable(0, name="global_step", trainable=False)
        #variables_to_update = tf.Placeholder(, name="variables_to_update")

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        #train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()
        # Variables that affect learning rate.
        num_batches_per_epoch = 50000 / FLAGS.batch_size
        decay_steps = int(num_batches_per_epoch * 350)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(0.1,
                                        global_step,
                                        decay_steps,
                                        0.1,
                                        staircase=True)

        opt = tf.train.GradientDescentOptimizer(lr)

        if FLAGS.sync_replicas:
            if FLAGS.replicas_to_aggregate is None:
                replicas_to_aggregate = num_workers
            else:
                replicas_to_aggregate = FLAGS.replicas_to_aggregate

            opt = tf.train.SyncReplicasOptimizerV2(
                opt,
                replicas_to_aggregate=replicas_to_aggregate,
                total_num_replicas=num_workers,
                name="cifar10_sync_replicas")

        #trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        train_step = opt.minimize(loss, global_step=global_step)

        # Approximation Training
        var_list = []
        for i in range(len_approx_layers):
            var_list = var_list + tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope=approx_layers[i])

        train_step_approx = opt.minimize(loss,
                                         global_step=global_step,
                                         var_list=var_list)

        if FLAGS.sync_replicas:
            local_init_op = opt.local_step_init_op
            if is_chief:
                local_init_op = opt.chief_init_op

            ready_for_local_init_op = opt.ready_for_local_init_op

            # Initial token and chief queue runners required by the sync_replicas mode
            chief_queue_runner = opt.get_chief_queue_runner()
            sync_init_op = opt.get_init_tokens_op()

        init_op = tf.global_variables_initializer()
        train_dir = tempfile.mkdtemp(dir="/mnt",
                                     suffix="data",
                                     prefix="cifar10_train")

        if FLAGS.sync_replicas:
            sv = tf.train.Supervisor(
                is_chief=is_chief,
                logdir=train_dir,
                init_op=init_op,
                local_init_op=local_init_op,
                saver=None,
                summary_op=summary_op,
                save_summaries_secs=120,
                save_model_secs=600,
                checkpoint_basename='model.ckpt',
                ready_for_local_init_op=ready_for_local_init_op,
                recovery_wait_secs=1,
                global_step=global_step)
        else:
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=train_dir,
                                     init_op=init_op,
                                     saver=None,
                                     summary_op=summary_op,
                                     save_summaries_secs=120,
                                     save_model_secs=600,
                                     checkpoint_basename='model.ckpt',
                                     recovery_wait_secs=1,
                                     global_step=global_step)

        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False,
                                     device_filters=[
                                         "/job:ps",
                                         "/job:worker/task:%d" %
                                         FLAGS.task_index
                                     ])

        # The chief worker (task_index==0) session will prepare the session,
        # while the remaining workers will wait for the preparation to complete.
        if is_chief:
            print("Worker %d: Initializing session..." % FLAGS.task_index)
        else:
            print("Worker %d: Waiting for session to be initialized..." %
                  FLAGS.task_index)

        if FLAGS.existing_servers:
            server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
            print("Using existing server at: %s" % server_grpc_url)

            sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                                  config=sess_config)
        else:
            sess = sv.prepare_or_wait_for_session(server.target,
                                                  config=sess_config)

        print("Worker %d: Session initialization complete." % FLAGS.task_index)

        if FLAGS.sync_replicas and is_chief:
            # Chief worker will start the chief queue runner and call the init op.
            sess.run(sync_init_op)
            sv.start_queue_runners(sess, [chief_queue_runner])

        # Restore from Checkpoint
        if FLAGS.checkpoint_restore > 0:
            checkpoint_directory = FLAGS.checkpoint_dir + str(
                FLAGS.checkpoint_restore)
            ckpt = tf.train.get_checkpoint_state(checkpoint_directory)
            if ckpt and ckpt.model_checkpoint_path:
                # Restores from checkpoint
                saver.restore(sess, ckpt.model_checkpoint_path)
                # Assuming model_checkpoint_path looks something like:
                #   /my-favorite-path/cifar10_train/model.ckpt-0,
                # extract global_step from it.
                #global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
            else:
                print('No checkpoint file found')
                return

        # Perform training
        time_begin = time.time()
        print("Training begins @ %f" % time_begin)

        local_step = 0
        num_examples_per_step = 128
        f = open('/mnt/train_output.log', 'w')
        #f.write("Training begins @ " + str(time_begin) +"\n")
        f.write(
            "Duration\tWorker\tLocalStep\tGlobalStep\tLoss\tExamplesPerSec\n")
        f.close()
        last = time_begin
        while True:
            start_time = time.time()
            if local_step < FLAGS.approx_step:
                _, step, loss_value = sess.run([train_step, global_step, loss])
            else:
                if local_step % FLAGS.approx_interval == 0:
                    _, step, loss_value = sess.run(
                        [train_step_approx, global_step, loss])
                else:
                    _, step, loss_value = sess.run(
                        [train_step, global_step, loss])

            duration = time.time() - start_time
            local_step += 1
            if local_step % 10 == 0:
                now = time.time()
                examples_per_sec = 10 * num_examples_per_step / (now - last)
                print(
                    "%f: Worker %d: step %d (global step: %d of %d) loss = %.2f examples_per_sec = %.2f \n"
                    % (now - last, FLAGS.task_index, local_step, step,
                       FLAGS.train_steps, loss_value, examples_per_sec))
                f = open('/mnt/train_output.log', 'a')
                f.write(
                    str(now - last) + "\t" + str(FLAGS.task_index) + "\t" +
                    str(local_step) + "\t" + str(step) + "\t" +
                    str(loss_value) + "\t" + str(examples_per_sec) + "\n")
                f.close()
                last = now

            if step >= FLAGS.train_steps:
                break

            if sv.should_stop():
                print('Stopped due to abort')
                break
            # Save the model checkpoint periodically.
            #if is_chief and (step % 1000 == 0 or (step + 1) == FLAGS.train_steps):
            if (step % 1000 == 0 or (step + 1) == FLAGS.train_steps):
                print('Taking a Checkpoint @ Global Step ' + str(step))
                checkpoint_dir = "/mnt/checkpoint" + str(step)
                if tf.gfile.Exists(checkpoint_dir):
                    tf.gfile.DeleteRecursively(checkpoint_dir)
                tf.gfile.MakeDirs(checkpoint_dir)
                checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")
                saver.save(sess, checkpoint_path, global_step=step)

        time_end = time.time()
        print("Training ends @ %f" % time_end)
        f = open('/mnt/train_output.log', 'a')
        #f.write("Training ends @ " + str(time_end) +"\n")
        training_time = time_end - time_begin
        print("Training elapsed time: %f s" % training_time)
        f.write("Training elapsed time: " + str(training_time) + " s\n")
        f.close()
Example #20
FLAGS = tf.app.flags.FLAGS

# import cifar10 data
from tensorflow.models.image.cifar10 import cifar10
cifar10.maybe_download_and_extract()

# global variable to select which (and how many) GPU's to use
# (tensorflow can be hungry with resources if not properly controlled)
gpus_to_use = [3]

# network input (data and correct labels)
# x = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
# y_ = tf.placeholder(tf.float32, shape=[None, 10])

train_images, train_labels = cifar10.distorted_inputs()
test_images, test_labels = cifar10.inputs(eval_data=True)

# select stream to use (train or test)
select_test = tf.placeholder(dtype=bool, shape=[], name='select_test')
x = tf.cond(
    select_test,
    lambda: test_images,
    lambda: train_images
)
y_ = tf.cond(
    select_test,
    lambda: test_labels,
    lambda: train_labels
)
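The tf.cond switch above lets one graph serve both the training and the evaluation input stream; feeding select_test chooses which pair of tensors x and y_ evaluate to. A hedged usage sketch (note that because both input pipelines are built outside the tf.cond branch functions, both typically dequeue a batch on every run regardless of the predicate):

# Illustrative usage, assuming a session with queue runners already started.
train_x, train_y = sess.run([x, y_], feed_dict={select_test: False})
test_x, test_y = sess.run([x, y_], feed_dict={select_test: True})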
def train():
    print("\nSource code of training file {}:\n\n{}".format(__file__, open(__file__).read()))

    log('loading CIFAR')
    # Import data
    training_batch = cifar10.distorted_inputs()

    lm = LayerManager(forward_biased_estimate=False)
    batch = tf.Variable(0)

    with tf.name_scope('input'):
        fed_input_data = tf.placeholder(tf.float32, [None, IMAGE_SIZE, IMAGE_SIZE, 3])
        fed_input_labels = tf.placeholder(tf.int32, [None])
        drop_probs = [tf.Variable(tf.constant(DEFAULT_KEEP_PROB, shape=[1, 1, 1, ], dtype=tf.float32), trainable=False, collections=['Dropout']) for _ in range(NUM_DROPOUT_LAYERS)]

    with tf.name_scope('posterior'):
        training_batch_error, _, _, _ = full_model(lm, drop_probs, *training_batch)
    training_merged = lm.summaries.merge_all_summaries()
    lm.is_training = False
    tf.get_variable_scope().reuse_variables()
    lm.summaries.reset()
    with tf.name_scope('test'):
        _, test_percent_error, _, _ = full_model(lm, drop_probs, *cifar10.inputs(eval_data=True))
    with tf.name_scope('forward'):
        _, _, forward_per_example_error, forward_incorrect_examples = full_model(lm, drop_probs, fed_input_data, fed_input_labels)

    def compute_test_percent_error():
        return numpy.mean([sess.run([test_percent_error]) for _ in range(int(numpy.ceil(FLAGS.num_test_examples / FLAGS.batch_size)))])

    saver = tf.train.Saver(tf.trainable_variables() + tf.get_collection('BatchNormInternal'))

    learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, batch, 5000, 0.8, staircase=True)

    train_step = tf.train.AdamOptimizer(learning_rate).minimize(training_batch_error, global_step=batch, var_list=lm.filter_factory.variables + lm.weight_factory.variables + lm.bias_factory.variables + lm.scale_factory.variables)

    fed_drop_probs = tf.placeholder(tf.float32, [None, None, None, None])
    update_drop_probs = [tf.assign(drop_prob, fed_drop_probs, validate_shape=False) for drop_prob in drop_probs]

    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        sess.run(tf.initialize_variables(tf.get_collection('BatchNormInternal')))
        sess.run(tf.initialize_variables(tf.get_collection('Dropout')))

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        if TRAIN:
            train_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/train', sess.graph)
            # test_writer = tf.train.SummaryWriter(FLAGS.summaries_dir + '/test')
            try:
                log('starting training')
                for i in range(FLAGS.max_steps):
                    if i % 1000 == 999: # Do test set
                        err = compute_test_percent_error()
                        for j in range(NUM_DROPOUT_LAYERS):
                            sess.run([update_drop_probs[j]], feed_dict={fed_drop_probs: [[[[1.0]]]]})
                        det_err = compute_test_percent_error()
                        for j in range(NUM_DROPOUT_LAYERS):
                            sess.run([update_drop_probs[j]], feed_dict={fed_drop_probs: [[[[DEFAULT_KEEP_PROB]]]]})
                        log('batch %s: Random test classification error = %s%%, deterministic test classification error = %s%%' % (i, err, det_err))
                    if i % 100 == 99: # Record a summary
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, _ = sess.run([training_merged, train_step],
                                              options=run_options,
                                              run_metadata=run_metadata)
                        train_writer.add_summary(summary, i)
                        train_writer.add_run_metadata(run_metadata, 'batch%d' % i)
                    else:
                        sess.run([train_step])
            finally:
                log('saving')
                saver.save(sess, FLAGS.train_dir, global_step=batch)
                log('done')
        else:
            restore_latest(saver, sess, '/tmp/derandomizing_dropout', suffix='-100000')

        if DERANDOMIZE_DROPOUT:
            # NUM_RUNS = 10
            # runs = []
            # for _ in range(NUM_RUNS):
            #     new_output_probs, = sess.run([forward_output], feed_dict={fed_input_data: mnist.train.images, fed_input_labels: mnist.train.labels})
            #     new_output = numpy.argmax(new_output_probs, 1)
            #     runs.append(new_output)
            #
            # all_runs = numpy.vstack(runs).T
            # entropy = numpy.array([scipy.stats.entropy(numpy.bincount(row), base=2.0) for row in all_runs])


            derandomized_drop_probs = [DEFAULT_KEEP_PROB * numpy.ones((1, HIDDEN_LAYER_SIZE)) for _ in range(NUM_DROPOUT_LAYERS)]

            num_tests_performed = 0

            for pass_count in range(1):
                for j in range(HIDDEN_LAYER_SIZE):
                    for i in range(NUM_DROPOUT_LAYERS):  # range(NUM_DROPOUT_LAYERS-1,-1,-1):
                        if derandomized_drop_probs[i][0, j] == 0.0 or derandomized_drop_probs[i][0, j] == 1.0:
                            continue
                        num_tests_performed += 1
                        for k in range(NUM_DROPOUT_LAYERS):
                            if k == i:
                                # curr_drop_probs = numpy.tile(derandomized_drop_probs[i], (BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, 1))
                                # to_randomize = HIDDEN_LAYER_SIZE - j - 1
                                # randperms = numpy.argsort(numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, to_randomize), axis=1)
                                #
                                # to_keep = max(int(HIDDEN_LAYER_SIZE*DEFAULT_KEEP_PROB-derandomized_drop_probs[i][:j].sum()), 1)
                                # curr_drop_probs[:, j+1:] = (randperms < to_keep)


                                curr_drop_probs = (numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE, HIDDEN_LAYER_SIZE) < derandomized_drop_probs[i]).astype(numpy.float32)
                                curr_drop_probs[:, j] = 0.0
                                # curr_drop_probs[:, j+1:j+2] = 1.0
                                sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: curr_drop_probs})
                            else:
                                sess.run([update_drop_probs[k]], feed_dict={fed_drop_probs: numpy.random.rand(BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE, HIDDEN_LAYER_SIZE) < derandomized_drop_probs[k]})

                        #indices = numpy.argmax(entropy[:, numpy.newaxis] + -numpy.log(-numpy.log(numpy.random.rand(entropy.shape[0], BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE))), axis=0)

                        #  indices = [numpy.argmax(1000*entropy + -numpy.log(-numpy.log(numpy.random.rand(*entropy.shape)))) for _ in range(BATCHES_PER_DERANDOMIZE_STEP*BATCH_SIZE)]
                        # examples = mnist.train.images[indices, :]
                        # labels = mnist.train.labels[indices]
                        # Collect a bunch of 64-example batches together
                        examples, labels = [numpy.concatenate(things, axis=0) for things in zip(*[sess.run(training_batch) for _ in range(BATCHES_PER_DERANDOMIZE_STEP)])]
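                        # (Each sess.run(training_batch) dequeues one (images, labels) pair;
                        #  zip(*...) regroups images with images and labels with labels, and the
                        #  concatenations stack BATCHES_PER_DERANDOMIZE_STEP batches into one feed batch.)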

                        # Might want to use cross entropy, but why not use percent error since we're not differentiating?
                        # Using "test" expressions so we can manually feed in data, but we are feeding training data (same data for obj0 and obj1)
                        err0, cross_entropies0 = sess.run([forward_incorrect_examples, forward_per_example_error], feed_dict={fed_input_data: examples, fed_input_labels: labels})
                        curr_drop_probs[:, j] = 1.0
                        # curr_drop_probs[:, j+1:] = (randperms < to_keep - 1)
                        # curr_drop_probs[:, j+1:j+2] = 0.0
                        sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: curr_drop_probs})
                        err1, cross_entropies1 = sess.run([forward_incorrect_examples, forward_per_example_error], feed_dict={fed_input_data: examples, fed_input_labels: labels})

                        # One-sided paired-sample t-test
                        cross_entropy_diff = cross_entropies0 - cross_entropies1
                        t = numpy.sqrt(BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE)*cross_entropy_diff.mean()/cross_entropy_diff.std(ddof=1)
                        p = scipy.stats.t.sf(-t, df=BATCHES_PER_DERANDOMIZE_STEP * BATCH_SIZE - 1)

                        b = numpy.sum(err0 & ~err1)
                        c = numpy.sum(err1 & ~err0)
                        # if b + c < BINOMIAL_TEST_CUTOFF:
                        #     p = 0.5
                        #     stat_message = "too small"
                        # else:
                        #     # McNemar's test
                        #     if b + c >= CHI2_TEST_CUTOFF:
                        #         chi2 = (b-c)**2/(b+c)
                        #         p = scipy.stats.distributions.chi2.sf(chi2, df=1)  # Two-sided
                        #     else:
                        #         p = scipy.stats.binom_test([b,c]) - scipy.stats.binom.pmf(b, b+c, 0.5)  # Mid-p test
                        #     # Form one-sided p-value
                        #     if b > c:
                        #         p = 1-0.5*p
                        #     else:
                        #         p = 0.5*p
                        #     if b + c >= CHI2_TEST_CUTOFF:
                        #         stat_message = "p = %.4f, chi square test" % p
                        #     else:
                        #         stat_message = "p = %.4f, binomial mid-p test" % p

                        if p < SIGNIFICANCE_LEVEL:  # cross_entropies0.mean() <= cross_entropies1.mean():  # b <= c:
                            new_drop_prob = 0.0
                            neuron_status = "drop"
                        elif p > 1 - SIGNIFICANCE_LEVEL:
                            new_drop_prob = 1.0
                            neuron_status = "keep"
                        else:
                            new_drop_prob = DEFAULT_KEEP_PROB
                            neuron_status = "hmmm"

                        #log(neuron_status + ' L{} N{}: b + c = {}, {}'.format(i, j, b+c, stat_message))
                        log(neuron_status + ' P{} L{} N{}: b = {}, c = {}, p = {}'.format(pass_count, i, j, b, c, p))
                        derandomized_drop_probs[i][0, j] = new_drop_prob
                for i in range(NUM_DROPOUT_LAYERS):
                    num_dropped = (derandomized_drop_probs[i] == 0.0).sum()
                    num_kept = (derandomized_drop_probs[i] == 1.0).sum()
                    num_hmmm = HIDDEN_LAYER_SIZE - num_dropped - num_kept
                    sess.run([update_drop_probs[i]], feed_dict={fed_drop_probs: numpy.ceil(derandomized_drop_probs[i])})

                    log('layer {}: {} neurons dropped, {} kept, {} undecided'.format(i, num_dropped, num_kept, num_hmmm))
                log('Performed {} statistical tests'.format(num_tests_performed))
            log('saving')
            saver.save(sess, FLAGS.train_dir, global_step=batch+1)
            log('done')
        else:
            restore_latest(saver, sess, '/tmp/derandomizing_dropout', suffix='-100001')

        err = compute_test_percent_error()
        log('Test classification error = %s%%' % err)

        coord.request_stop()
        coord.join(threads)
        sess.close()
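
The derandomization step above reduces to a one-sided paired-sample t-test per
unit; a stand-alone sketch of that decision rule with hypothetical numbers
(0.05 standing in for SIGNIFICANCE_LEVEL):

import numpy
import scipy.stats

ce0 = numpy.random.randn(512) + 0.02   # hypothetical per-example cross-entropies, unit dropped
ce1 = numpy.random.randn(512)          # hypothetical per-example cross-entropies, unit kept
diff = ce0 - ce1
t = numpy.sqrt(diff.size) * diff.mean() / diff.std(ddof=1)
p = scipy.stats.t.sf(-t, df=diff.size - 1)   # small p: dropping lowered the error
if p < 0.05:
    decision = 'drop'        # dropping this unit significantly reduced the error
elif p > 0.95:
    decision = 'keep'        # dropping this unit significantly increased the error
else:
    decision = 'undecided'   # keep the default dropout probability
print(decision, p)
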
Example No. 22
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

        # One extra writer per snapshot directory train_dir0 .. train_dir19.
        summary_writers = [
            tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
            for i in range(20)]

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)
                for writer in summary_writers:
                    writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
            #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
            #   saver.save(sess, checkpoint_path, global_step=step/100)

            # Hard-coded snapshot schedule: every 100 steps up to step 2000,
            # save a checkpoint into the matching train_dir0 .. train_dir19.
            if step % 100 == 0 and 100 <= step <= 2000:
                snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
                checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
Example No. 23
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)


    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

    # One extra writer per snapshot directory train_dir0 .. train_dir19.
    summary_writers = [
        tf.train.SummaryWriter(getattr(FLAGS, 'train_dir%d' % i))
        for i in range(20)]


    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'


      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        for writer in summary_writers:
          writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      # if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
      #   checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #   saver.save(sess, checkpoint_path, global_step=step/100)

      # Hard-coded snapshot schedule: every 100 steps up to step 2000,
      # save a checkpoint into the matching train_dir0 .. train_dir19.
      if step % 100 == 0 and 100 <= step <= 2000:
        snapshot_dir = getattr(FLAGS, 'train_dir%d' % (step // 100 - 1))
        checkpoint_path = os.path.join(snapshot_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example No. 24
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # #Adding dropout
        # keep_drop_prob = tf.placeholder(tf.float32)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # ###########Changes for visualization ###############
        with tf.variable_scope('conv1') as scope_conv:
            tf.get_variable_scope().reuse_variables()
            weights = tf.get_variable('weights')
            grid_x = grid_y = 8  # to get a square grid for 64 conv1 features
            grid = put_kernels_on_grid(weights, (grid_y, grid_x))
            tf.image_summary('conv1/features', grid, max_images=1)
        # ####################################################


        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in range(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
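
put_kernels_on_grid is not defined in this snippet; a minimal sketch of one way
such a helper could look (hypothetical name put_kernels_on_grid_sketch, assuming
the kernel has static shape [height, width, channels, grid_y * grid_x] and the
pre-1.0 tf.split/tf.concat argument order used throughout this file):

def put_kernels_on_grid_sketch(kernel, grid_shape, pad=1):
    grid_y, grid_x = grid_shape
    # Normalize to [0, 1] so tf.image_summary renders the filters visibly.
    k_min = tf.reduce_min(kernel)
    k_max = tf.reduce_max(kernel)
    k = (kernel - k_min) / (k_max - k_min + 1e-8)
    # Add a small border around every filter.
    k = tf.pad(k, [[pad, pad], [pad, pad], [0, 0], [0, 0]])
    h = int(kernel.get_shape()[0]) + 2 * pad
    w = int(kernel.get_shape()[1]) + 2 * pad
    c = int(kernel.get_shape()[2])
    # [h, w, c, n] -> [n, h, w, c], then lay the n filters out on a grid_y x grid_x grid.
    k = tf.transpose(k, (3, 0, 1, 2))
    rows = tf.split(0, grid_y, k)  # grid_y tensors of shape [grid_x, h, w, c]
    rows = [tf.reshape(tf.transpose(r, (1, 0, 2, 3)), [h, grid_x * w, c])
            for r in rows]
    grid = tf.concat(0, rows)      # [grid_y * h, grid_x * w, c]
    return tf.expand_dims(grid, 0)  # 4-D image batch for tf.image_summary
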
def train():
    ps_hosts = FLAGS.ps_hosts.split(',')
    worker_hosts = FLAGS.worker_hosts.split(',')
    print ('PS hosts are: %s' % ps_hosts)
    print ('Worker hosts are: %s' % worker_hosts)

    server = tf.train.Server(
        {'ps': ps_hosts, 'worker': worker_hosts},
        job_name = FLAGS.job_name,
        task_index=FLAGS.task_id)

    if FLAGS.job_name == 'ps':
        server.join()

    is_chief = (FLAGS.task_id == 0)
    if is_chief:
        if tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.DeleteRecursively(FLAGS.train_dir)
        tf.gfile.MakeDirs(FLAGS.train_dir)
  
    device_setter = tf.train.replica_device_setter(ps_tasks=1)
    with tf.device('/job:worker/task:%d' % FLAGS.task_id):
        with tf.device(device_setter):
            global_step = tf.Variable(0, trainable=False)

            # Get images and labels for CIFAR-10.
            images, labels = cifar10.distorted_inputs()

            # Build a Graph that computes the logits predictions from the
            # inference model.
            logits = cifar10.inference(images)

            # Calculate loss.
            loss = cifar10.loss(logits, labels)
            train_op = cifar10.train(loss, global_step)

            saver = tf.train.Saver()
            # We run the summaries in the same thread as the training operations by
            # passing in None for summary_op to avoid a summary_thread being started.
            # Running summaries and training operations in parallel could run out of
            # GPU memory.
            sv = tf.train.Supervisor(is_chief=is_chief,
                                     logdir=FLAGS.train_dir,
                                     init_op=tf.initialize_all_variables(),
                                     summary_op=tf.merge_all_summaries(),
                                     global_step=global_step,
                                     saver=saver,
                                     save_model_secs=60)

            tf.logging.info('%s Supervisor' % datetime.now())

            sess_config = tf.ConfigProto(allow_soft_placement=True,
                                         log_device_placement=FLAGS.log_device_placement)

            print ("Before session init")
            # Get a session.
            sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)
            print ("Session init done")

            # Start the queue runners.
            queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
            sv.start_queue_runners(sess, queue_runners)
            print ('Started %d queues for processing input data.' % len(queue_runners))
  
            """Train CIFAR-10 for a number of steps."""
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value, gs = sess.run([train_op, loss, global_step])
                duration = time.time() - start_time

                assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = float(duration)

                    format_str = ('%s: step %d (global_step %d), loss = %.2f (%.1f examples/sec; %.3f sec/batch)')
                    print (format_str % (datetime.now(), step, gs, loss_value, examples_per_sec, sec_per_batch))

    if is_chief:
        saver.save(sess, os.path.join(FLAGS.train_dir, 'model.ckpt'), global_step=global_step)
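
This distributed variant reads its cluster layout from command-line flags;
hypothetical flag definitions matching the FLAGS attributes used above (the
actual definitions and defaults live elsewhere in the repository):

tf.app.flags.DEFINE_string('ps_hosts', 'localhost:2222',
                           'Comma-separated list of parameter server host:port pairs.')
tf.app.flags.DEFINE_string('worker_hosts', 'localhost:2223,localhost:2224',
                           'Comma-separated list of worker host:port pairs.')
tf.app.flags.DEFINE_string('job_name', 'worker', "Either 'ps' or 'worker'.")
tf.app.flags.DEFINE_integer('task_id', 0, 'Index of this task within its job.')
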
Example No. 26
		#test_model.index = ii
		
		
		
		
		#print test_model.weights
		

		
		models.append(test_model)
		with test_model.g.as_default():
			
			global_step = tf.Variable(0, trainable=False)

			# Get images and labels for CIFAR-10.
			images, labels = cifar10.distorted_inputs()
			test_images, test_labels = cifar10.inputs(eval_data='test')

			# Build a Graph that computes the logits predictions from the
			# inference model.
			logits = test_model.predict(images)
			logit_test = test_model.predict(test_images)

			# Calculate loss.
			loss = cifar10.loss(logits, labels)

			# Build a Graph that trains the model with one batch of examples and
			# updates the model parameters.
			train_op = cifar10.train(loss, global_step)

Example No. 27
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    eval_data = FLAGS.eval_data == 'test'
    #timages, tlabels = cifar10.inputs(eval_data=eval_data)
    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    #tlogits = cifar10.inference(timages)
    # Calculate loss.
    top_k_op = tf.nn.in_top_k(logits, labels, 1)
    loss = cifar10.loss(logits, labels)
    #precision = tf.Variable(0.8, name='precision')
    #tf.scalar_summary('accuracy', precision)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=FLAGS.log_device_placement))
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)
    sess.graph.finalize()

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value = sess.run([train_op, loss])
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 100 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

        # Run the in-graph predictions over enough batches to cover the
        # evaluation set and count the correct ones.
        num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
        true_count = 0  # Counts the number of correct predictions.
        total_sample_count = num_iter * FLAGS.batch_size
        i_step = 0
        while i_step < num_iter:
          predictions = sess.run([top_k_op])
          true_count += np.sum(predictions)
          i_step += 1

        # Compute precision @ 1 and record it in the summary as 'accuracy'.
        prec = true_count / total_sample_count
        print(prec)
        summary = tf.Summary()
        summary.ParseFromString(sess.run(summary_op))
        summary.value.add(tag='accuracy', simple_value=prec)
        summary_writer.add_summary(summary, step)

      # Save the model checkpoint periodically.
      if step % 100 == 0 or (step + 1) == FLAGS.max_steps:
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example No. 28
def train():
  """Train CIFAR-10 for a number of steps."""
  with tf.Graph().as_default():
    
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    #with tf.device('/gpu:%d' % FLAGS.gpu_number):
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)
    loss_per_batch = cifar10.loss_per_batch(logits, labels)
    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    train_op = cifar10.train(loss, global_step, FLAGS.gpu_number)

    # Create a saver.
    saver = tf.train.Saver(tf.all_variables(), max_to_keep=None)

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth=True
    config.allow_soft_placement=True
    config.log_device_placement=FLAGS.log_device_placement
    sess = tf.Session(config=config)
    
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         "cifar10_train.pb", False)
    sess.run(init)

    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

    train_start_time = time.time()
    for step in xrange(FLAGS.max_steps):
      start_time = time.time()
      _, loss_value, logits_value, loss_per_batch_value, labels_value = sess.run([train_op, loss, logits, loss_per_batch, labels])
      duration = time.time() - start_time
      #logits_str = print_logits(logits_value, labels_value, loss_per_batch_value)
      
      #with open(os.path.join(FLAGS.train_dir, 'logits_%d.log' % step),'w') as f:
      #  f.write("%s" % logits_str)

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = FLAGS.batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        log_str  = (format_str % (datetime.now(), step, loss_value,
                                  examples_per_sec, sec_per_batch))
        print(log_str)
        with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
          f.write("%s\n" % log_str)

      if step % 500 == 0:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
        checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
        save_path = saver.save(sess, checkpoint_path, global_step=step)
    train_duration = time.time() - train_start_time

    log_str = ("Finishing. Training %d batches of %d images took %fs\n" %
               (FLAGS.max_steps, FLAGS.batch_size, float(train_duration)))
    print(log_str)
    with open(os.path.join(FLAGS.train_dir, 'train.log'),'a+') as f:
      f.write("%s" % log_str)
def main(unused_argv):
  #mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
  cifar10.maybe_download_and_extract()
  if FLAGS.download_only:
    sys.exit(0)
  #cifar10.maybe_download_and_extract()
  if FLAGS.job_name is None or FLAGS.job_name == "":
    raise ValueError("Must specify an explicit `job_name`")
  if FLAGS.task_index is None or FLAGS.task_index =="":
    raise ValueError("Must specify an explicit `task_index`")

  print("job name = %s" % FLAGS.job_name)
  print("task index = %d" % FLAGS.task_index)

  #Construct the cluster and start the server
  ps_spec = FLAGS.ps_hosts.split(",")
  worker_spec = FLAGS.worker_hosts.split(",")

  # Get the number of workers.
  num_workers = len(worker_spec)

  cluster = tf.train.ClusterSpec({
      "ps": ps_spec,
      "worker": worker_spec})

  if not FLAGS.existing_servers:
    # Not using existing servers. Create an in-process server.
    server = tf.train.Server(
        cluster, job_name=FLAGS.job_name, task_index=FLAGS.task_index)
    if FLAGS.job_name == "ps":
      server.join()

  is_chief = (FLAGS.task_index == 0)
  if FLAGS.num_gpus > 0:
    if FLAGS.num_gpus < num_workers:
      raise ValueError("number of gpus is less than number of workers")
    # Avoid gpu allocation conflict: now allocate task_num -> #gpu 
    # for each worker in the corresponding machine
    gpu = (FLAGS.task_index % FLAGS.num_gpus)
    worker_device = "/job:worker/task:%d/gpu:%d" % (FLAGS.task_index, gpu)
  elif FLAGS.num_gpus == 0:
    # Just allocate the CPU to worker server
    cpu = 0
    worker_device = "/job:worker/task:%d/cpu:%d" % (FLAGS.task_index, cpu)
  # The device setter will automatically place Variables ops on separate
  # parameter servers (ps). The non-Variable ops will be placed on the workers.
  # The ps use CPU and workers use corresponding GPU
  with tf.device(
      tf.train.replica_device_setter(
          worker_device=worker_device,
          ps_device="/job:ps/cpu:0",
          cluster=cluster)):
    cifar10.maybe_download_and_extract()
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # # Variables of the hidden layer
    # hid_w = tf.Variable(
    #     tf.truncated_normal(
    #         [IMAGE_PIXELS * IMAGE_PIXELS, FLAGS.hidden_units],
    #         stddev=1.0 / IMAGE_PIXELS),
    #     name="hid_w")
    # hid_b = tf.Variable(tf.zeros([FLAGS.hidden_units]), name="hid_b")

    # # Variables of the softmax layer
    # sm_w = tf.Variable(
    #     tf.truncated_normal(
    #         [FLAGS.hidden_units, 10],
    #         stddev=1.0 / math.sqrt(FLAGS.hidden_units)),
    #     name="sm_w")
    # sm_b = tf.Variable(tf.zeros([10]), name="sm_b")

    # # Ops: located on the worker specified with FLAGS.task_index
    # x = tf.placeholder(tf.float32, [None, IMAGE_PIXELS * IMAGE_PIXELS])
    # y_ = tf.placeholder(tf.float32, [None, 10])

    # hid_lin = tf.nn.xw_plus_b(x, hid_w, hid_b)
    # hid = tf.nn.relu(hid_lin)

    # y = tf.nn.softmax(tf.nn.xw_plus_b(hid, sm_w, sm_b))
    # cross_entropy = -tf.reduce_sum(y_ * tf.log(tf.clip_by_value(y, 1e-10, 1.0)))

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.
    logits = cifar10.inference(images)

    # Calculate loss.
    loss = cifar10.loss(logits, labels)

    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.
    #train_op = cifar10.train(loss, global_step)

    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()
    # Variables that affect learning rate.
    num_batches_per_epoch = 50000 / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * 350)

    # Decay the learning rate exponentially based on the number of steps.
    lr = tf.train.exponential_decay(0.1,
                                    global_step,
                                    decay_steps,
                                    0.1,
                                    staircase=True)

    # Generate moving averages of all losses and associated summaries.
    #loss_averages_op = _add_loss_summaries(total_loss)

    opt = tf.train.GradientDescentOptimizer(lr)
    
    #opt = tf.train.AdamOptimizer(FLAGS.learning_rate)

    if FLAGS.sync_replicas:
      if FLAGS.replicas_to_aggregate is None:
        replicas_to_aggregate = num_workers
      else:
        replicas_to_aggregate = FLAGS.replicas_to_aggregate

      opt = tf.train.SyncReplicasOptimizerV2(
          opt,
          replicas_to_aggregate=replicas_to_aggregate,
          total_num_replicas=num_workers,
          name="cifar10_sync_replicas")

    train_step = opt.minimize(loss, global_step=global_step)

    if FLAGS.sync_replicas:
      local_init_op = opt.local_step_init_op
      if is_chief:
        local_init_op = opt.chief_init_op

      ready_for_local_init_op = opt.ready_for_local_init_op

      # Initial token and chief queue runners required by the sync_replicas mode
      chief_queue_runner = opt.get_chief_queue_runner()
      sync_init_op = opt.get_init_tokens_op()

    init_op = tf.global_variables_initializer()
    train_dir = tempfile.mkdtemp(dir="/mnt")

    if FLAGS.sync_replicas:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          local_init_op=local_init_op,
          ready_for_local_init_op=ready_for_local_init_op,
          recovery_wait_secs=1,
          global_step=global_step)
    else:
      sv = tf.train.Supervisor(
          is_chief=is_chief,
          logdir=train_dir,
          init_op=init_op,
          recovery_wait_secs=1,
          global_step=global_step)

    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        device_filters=["/job:ps", "/job:worker/task:%d" % FLAGS.task_index])

    # The chief worker (task_index==0) session will prepare the session,
    # while the remaining workers will wait for the preparation to complete.
    if is_chief:
      print("Worker %d: Initializing session..." % FLAGS.task_index)
    else:
      print("Worker %d: Waiting for session to be initialized..." %
            FLAGS.task_index)

    if FLAGS.existing_servers:
      server_grpc_url = "grpc://" + worker_spec[FLAGS.task_index]
      print("Using existing server at: %s" % server_grpc_url)

      sess = sv.prepare_or_wait_for_session(server_grpc_url,
                                            config=sess_config)
    else:
      sess = sv.prepare_or_wait_for_session(server.target, config=sess_config)

    print("Worker %d: Session initialization complete." % FLAGS.task_index)

    if FLAGS.sync_replicas and is_chief:
      # Chief worker will start the chief queue runner and call the init op.
      sess.run(sync_init_op)
      sv.start_queue_runners(sess, [chief_queue_runner])

    # Perform training
    time_begin = time.time()
    print("Training begins @ %f" % time_begin)

    local_step = 0
    while True:
      start_time = time.time()
      _, step = sess.run([train_step, global_step])
      duration = time.time() - start_time
      local_step += 1
      #assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      #if step % 10 == 0:
      #  num_examples_per_step = FLAGS.batch_size
      #  examples_per_sec = num_examples_per_step / duration
      #  sec_per_batch = float(duration)
      #  loss_value = 0
      #  format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
      #                'sec/batch)')
      #  print (format_str % (datetime.now(), local_step, loss_value,
      #                       examples_per_sec, sec_per_batch))
      now = time.time()
      print("%f: Worker %d: training step %d done (global step: %d)" % (now, FLAGS.task_index, local_step, step))

      if step >= FLAGS.train_steps:
        break

      #if step % 100 == 0:
      #  summary_str = sess.run(summary_op)
      #  summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      #if step % 1000 == 0 or (step + 1) == FLAGS.train_steps:
      #  checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
      #  saver.save(sess, checkpoint_path, global_step=step)
    # local_step = 0
    # while True:
    #   # Training feed
    #   batch_xs, batch_ys = mnist.train.next_batch(FLAGS.batch_size)
    #   train_feed = {x: batch_xs, y_: batch_ys}

    #   _, step = sess.run([train_step, global_step], feed_dict=train_feed)
    #   local_step += 1

    #   now = time.time()
    #   print("%f: Worker %d: training step %d done (global step: %d)" %
    #         (now, FLAGS.task_index, local_step, step))

    #   if step >= FLAGS.train_steps:
    #     break

    time_end = time.time()
    print("Training ends @ %f" % time_end)
    training_time = time_end - time_begin
    print("Training elapsed time: %f s" % training_time)
Example No. 30
def multilevel_train_1ord():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        # Get images and labels for CIFAR-10.
        images, labels = cifar10.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar10.inference(images)

        # Calculate loss.
        loss = cifar10.loss(logits, labels)

        # Accuracy (top-1) on the current training batch.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar10.train(loss, global_step)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph.
        sess = tf.Session(config=tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(FLAGS.train_dir, sess.graph)

        for step in xrange(FLAGS.max_steps):
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss])
            accurarcy = sess.run(top_k_op)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            output_list = []
            # Do something with intermediate data (intermediate)
            # Save data on iterations of 0, 1000, 2000, 3000
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print(v.name)
                        output_list.append(
                            tf.get_default_graph().get_tensor_by_name(v.name))
                        break
                if (step == 0):
                    conv1_data_0 = sess.run(output_list)
                if (step == 1000):
                    conv1_data_1000 = sess.run(output_list)
                if (step == 2000):
                    conv1_data_2000 = sess.run(output_list)
                if (step == 3000):
                    conv1_data_3000 = sess.run(output_list)
                    (A, B, C, D, E) = np.array(conv1_data_3000).shape

            # do something.
            # do experiments
            if step == 3000 or (step + 1) == FLAGS.max_steps:
                print("************\n Chen process executing")
                _, new_data = process.exp_2_commMax(conv1_data_0,
                                                    conv1_data_1000,
                                                    conv1_data_2000,
                                                    conv1_data_3000)
                for v in tf.all_variables():
                    if "conv1/weights:" in v.name:
                        print("start assign: ")
                        sess.run(
                            tf.assign(
                                tf.get_default_graph().get_tensor_by_name(
                                    v.name), new_data[0]))
                        break
                value = sess.run(loss)
                pred = process.Count(accurarcy)
                print("new loss value is: " + str(value) + " accurarcy :" +
                      str(pred))

            if step % 10 == 0:
                num_examples_per_step = FLAGS.batch_size
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = float(duration)
                predict = process.Count(accurarcy)
                format_str = (
                    '%s: step %d, loss = %.2f, accu = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value, predict,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                checkpoint_path = os.path.join(FLAGS.train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
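
The weight-swap step above locates a variable by name and writes new values back
with tf.assign; a minimal stand-alone sketch of that pattern (assumes a live
session sess and a variable scoped as 'conv1/weights' in the default graph; note
that, like the original, each call adds a new assign op to the graph):

conv1_weights = [v for v in tf.all_variables() if 'conv1/weights:' in v.name][0]
current = sess.run(conv1_weights)                  # numpy array of kernel values
sess.run(tf.assign(conv1_weights, current * 0.5))  # write modified values back
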
Example No. 31
		def SGDBead(self, bead, thresh, maxindex):
			
			finalerror = 0.
			
			#thresh = .05

			# Parameters
			learning_rate = 0.001
			training_epochs = 15
			batch_size = 100
			display_step = 1
			
			curWeights, curBiases = self.AllBeads[bead]
			#test_model = multilayer_perceptron(w=curWeights, b=curBiases)
			test_model = convnet(w=curWeights, b=curBiases)

			
			with test_model.g.as_default():

				global_step = tf.Variable(0, trainable=False)

				# Get images and labels for CIFAR-10.
				images, labels = cifar10.distorted_inputs()
				test_images, test_labels = cifar10.inputs(eval_data='test')

				# Build a Graph that computes the logits predictions from the
				# inference model.
				logits = test_model.predict(images)
				logit_test = test_model.predict(test_images)

				# Calculate loss.
				loss = cifar10.loss(logits, labels)

				# Build a Graph that trains the model with one batch of examples and
				# updates the model parameters.
				train_op = cifar10.train(loss, global_step)


				top_k_op = tf.nn.in_top_k(logit_test, test_labels, 1)


				# Build an initialization operation to run below.
				init = tf.initialize_all_variables()

				# Start running operations on the Graph.
				#sess = tf.Session(config=tf.ConfigProto(
				#    log_device_placement=FLAGS.log_device_placement))

				with tf.Session(config=tf.ConfigProto(
					log_device_placement=False)) as sess:
					sess.run(init)

					tf.train.start_queue_runners(sess=sess)

					step = 0
					stopcond = True
					while step < max_steps and stopcond:


						start_time = time.time()
						_, loss_value = sess.run([train_op, loss])
						duration = time.time() - start_time

						assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

						if step % 10 == 0:
							num_examples_per_step = batch_size
							examples_per_sec = num_examples_per_step / duration
							sec_per_batch = float(duration)

							format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
									  'sec/batch)')
							print (format_str % (datetime.now(), step, loss_value,
											 examples_per_sec, sec_per_batch))

						if step % 100 == 0:

							num_iter = int(math.ceil(num_examples / batch_size))
							true_count = 0  # Counts the number of correct predictions.
							total_sample_count = num_iter * batch_size
							stepp = 0
							while stepp < num_iter:
								predictions = sess.run([top_k_op])
								true_count += np.sum(predictions)
								stepp += 1


							# Compute precision @ 1.
							precision = true_count / total_sample_count
							print('%s: precision @ 1 = %.3f' % (datetime.now(), precision))

							if precision > 1 - thresh:
								stopcond = False
								test_model.params = sess.run(test_model.weightslist), sess.run(test_model.biaseslist)
								self.AllBeads[bead]=test_model.params
								finalerror = 1 - precision
								print ("Final bead error: ",str(finalerror))
								
						step += 1        
				return finalerror