示例#1
0
def train():
    """Run the CIFAR-100 training loop for FLAGS.max_steps steps.

    The graph is assembled from `cifar100.distorted_inputs`,
    `cifar100.inference`, `cifar100.loss` and `cifar100.train`. Progress is
    printed every 10 steps, and a checkpoint is written to FLAGS.train_dir
    every 1000 steps and on the final step.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)

        images, labels = cifar100.distorted_inputs()

        logits = cifar100.inference(images)

        loss = cifar100.loss(logits, labels)

        train_op = cifar100.train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())

        init = tf.initialize_all_variables()

        session_config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)

        # The context manager closes the session even if training raises.
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # A coordinator lets the queue-runner threads be stopped and
            # joined before the session they use is closed.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(loss_value), \
                        'Model diverged with loss = NaN'

                    if step % 10 == 0:
                        num_examples_per_step = FLAGS.batch_size
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = float(duration)

                        format_str = (
                            '%s: step %d, loss = %.2f (%.1f examples/sec; '
                            '%.3f sec/batch)')
                        print(format_str % (datetime.now(), step, loss_value,
                                            examples_per_sec, sec_per_batch))

                    # Save the model checkpoint periodically.
                    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                coord.request_stop()
                coord.join(threads)
示例#2
0
def train():
    """Train CIFAR-100 for a number of steps.

    Runs `train_op` inside a `tf.train.MonitoredTrainingSession` until the
    session reports that it should stop. A `StopAtStepHook` is installed
    with `last_step=FLAGS.max_steps` and a `NanTensorHook` watches the loss.
    Afterwards the elapsed wall-clock time and the loss fetched by the most
    recent run are printed.
    """
    with tf.Graph().as_default():
        global_step = tf.contrib.framework.get_or_create_global_step()

        # Get images and labels for CIFAR-100.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar100.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = cifar100.inference(images)

        # Calculate loss.
        loss = cifar100.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = cifar100.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime, and remembers the most recent loss."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()
                # Defined up front so last_loss() never hits a missing
                # attribute.
                self._last_loss = None

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                loss_value = run_values.results
                # Record the loss on every run, not only on logging steps,
                # so last_loss() really is the loss of the latest step.
                self._last_loss = loss_value

                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    examples_per_sec = (FLAGS.log_frequency *
                                        FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))

            def last_loss(self):
                """Return the loss fetched by the latest run, or None."""
                return self._last_loss

        loghook = _LoggerHook()
        with tf.train.MonitoredTrainingSession(
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss), loghook
                ],
                config=tf.ConfigProto(log_device_placement=FLAGS.
                                      log_device_placement)) as mon_sess:

            t1 = time.time()
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

            t2 = time.time()
            # Reported once; the original emitted this identical line twice.
            print('spent %f seconds to train %d step' %
                  (t2 - t1, FLAGS.max_steps))

            final_loss = loghook.last_loss()
            if final_loss is not None:
                print('last loss value: %.2f ' % final_loss)
示例#3
0
def train():
    """Train CIFAR-100 for a number of steps, alternating two train ops.

    `cifar100.inference` returns two logits tensors here. Each gets its own
    loss and its own train op, and both train ops are built with the same
    `global_step`. The loop runs `train_opA` then `train_opB` until the
    monitored session reports that it should stop.

    On logging steps, the step counter and `lossA` are also appended to a
    timestamped text file under `output_data/`.
    """
    log_path = 'output_data/output_' + str(time.time()) + '.txt'

    # Opening the log file in the `with` statement guarantees it is flushed
    # and closed even when graph construction or training raises.
    with open(log_path, 'w') as output, tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Get images and labels for CIFAR-100.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = cifar100.distorted_inputs()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logitsA, logitsB = cifar100.inference(images)

        # Calculate loss.
        lossA = cifar100.loss(logitsA, labels)
        lossB = cifar100.loss(logitsB, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_opA = cifar100.train(lossA, global_step)
        train_opB = cifar100.train(lossB, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                # The counter advances on every run() call this hook sees,
                # and lossA is the tensor requested each time.
                self._step += 1
                return tf.train.SessionRunArgs(lossA)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                    loss_value = run_values.results
                    examples_per_sec = (FLAGS.log_frequency *
                                        FLAGS.batch_size / duration)
                    sec_per_batch = float(duration / FLAGS.log_frequency)

                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), self._step, loss_value,
                                        examples_per_sec, sec_per_batch))
                    print((str(self._step) + '\t' +
                           str(loss_value) + '\n'), file=output)

        with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                   tf.train.NanTensorHook(lossA),
                   tf.train.NanTensorHook(lossB),
                   _LoggerHook()],
            config=tf.ConfigProto(
                log_device_placement=FLAGS.log_device_placement)) as mon_sess:

            # Constructed with the session graph so the graph is written
            # to 'tb-logs/'.
            file_writer = tf.summary.FileWriter('tb-logs/', mon_sess.graph)
            try:
                while not mon_sess.should_stop():
                    print("stepA")
                    mon_sess.run(train_opA)
                    print("stepB")
                    mon_sess.run(train_opB)
            finally:
                # Flush pending events and release the event file.
                file_writer.close()
def train():
    """Train CIFAR-100 for a number of steps on FLAGS.num_gpus GPU towers.

    One tower is built per GPU with shared variables. The per-tower
    gradients are combined by `average_gradients` and applied once per step,
    together with an exponential-moving-average update of the trainable
    variables.

    Progress is printed every 10 steps, summaries are written every 100
    steps, and a checkpoint is saved every 1000 steps and on the final step.

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar100.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          cifar100.NUM_EPOCHS_PER_DECAY)

        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar100.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar100.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)

        # Get images and labels for CIFAR-100.
        images, labels = cifar100.distorted_inputs()
        #batch_queue = tf.contrib.slim.prefetch_queue.prefetch_queue(
        #      [images, labels], capacity=2 * FLAGS.num_gpus)
        # Calculate the gradients for each model tower.
        tower_grads = []
        with tf.variable_scope(tf.get_variable_scope()):
            for i in xrange(FLAGS.num_gpus):
                with tf.device('/gpu:%d' % i):
                    with tf.name_scope('%s_%d' %
                                       (cifar100.TOWER_NAME, i)) as scope:
                        # Dequeues one batch for the GPU
                        #image_batch, label_batch = batch_queue.dequeue()
                        # Calculate the loss for one tower of the CIFAR model. This function
                        # constructs the entire CIFAR model but shares the variables across
                        # all towers.
                        #loss = tower_loss(scope, image_batch, label_batch)
                        loss = tower_loss(scope, images, labels)

                        # Reuse variables for the next tower.
                        tf.get_variable_scope().reuse_variables()

                        # Retain the summaries from the final tower.
                        summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                      scope)

                        # Calculate the gradients for the batch of data on this CIFAR tower.
                        grads = opt.compute_gradients(loss)

                        # Keep track of the gradients across all towers.
                        tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.summary.scalar('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.summary.histogram(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.summary.histogram(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar100.MOVING_AVERAGE_DECAY, global_step)
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op)

        # Create a saver.
        saver = tf.train.Saver(tf.global_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.summary.merge(summaries)

        # Build an initialization operation to run below.
        init = tf.global_variables_initializer()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=FLAGS.log_device_placement)

        # `loss` fetched in the loop below is the loss of the last tower built.
        loss_value = None

        # The context manager closes the session even if training raises.
        with tf.Session(config=session_config) as sess:
            sess.run(init)

            # Start the queue runners under a coordinator so their threads
            # can be stopped and joined before the session is closed.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

            try:
                t1 = time.time()
                for step in xrange(FLAGS.max_steps):
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(loss_value), \
                        'Model diverged with loss = NaN'

                    if step % 10 == 0:
                        num_examples_per_step = (FLAGS.batch_size *
                                                 FLAGS.num_gpus)
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = duration / FLAGS.num_gpus

                        format_str = (
                            '%s: step %d, loss = %.2f (%.1f examples/sec; '
                            '%.3f sec/batch)')
                        print(format_str % (datetime.now(), step, loss_value,
                                            examples_per_sec, sec_per_batch))

                    if step % 100 == 0:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save the model checkpoint periodically.
                    if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)

                # Taken before cleanup so the figure covers training only.
                t2 = time.time()
            finally:
                # Flush pending summary events, then stop the input threads.
                summary_writer.close()
                coord.request_stop()
                coord.join(threads)

        # Reported once; the original emitted this identical line twice.
        print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps))
        # loss_value stays None when FLAGS.max_steps is 0 (no step was run).
        if loss_value is not None:
            print('last loss value: %.2f ' % loss_value)
def train():
    """Run one task of a distributed CIFAR-100 training job.

    The cluster is described by the comma-separated FLAGS.ps_hosts and
    FLAGS.worker_hosts. A task whose FLAGS.job_name is 'ps' only joins its
    server. Every other task builds the model with a
    `tf.train.SyncReplicasOptimizer` wrapped around gradient descent and
    runs FLAGS.max_steps training steps.

    The task with FLAGS.task_index == 0 is treated as chief. It alone builds
    and writes summaries (every 100 steps) and saves checkpoints (every 1000
    steps and on the final step).

    Raises:
        AssertionError: if the fetched loss value is NaN.
    """
    print('FLAGS.data_dir: %s' % FLAGS.data_dir)
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    # Create a cluster from the parameter server and worker hosts.
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.task_index)
    if FLAGS.job_name == 'ps':
        server.join()
        # Should join() ever return, a parameter-server task must not fall
        # through and build the worker graph below.
        return
    is_chief = (FLAGS.task_index == 0)
    with tf.device(
            tf.train.replica_device_setter(
                worker_device="/job:worker/task:%d" % FLAGS.task_index,
                ps_device="/job:ps/task:0",
                cluster=cluster)):
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        # Get images and labels for CIFAR-100.
        images, labels = cifar100.distorted_inputs()
        num_workers = len(worker_hosts)
        num_replicas_to_aggregate = num_workers
        logits = cifar100.inference(images)
        # Calculate loss.
        loss = cifar100.loss(logits, labels)
        # Retain the summaries from the chief.
        # Calculate the learning rate schedule.
        num_batches_per_epoch = (cifar100.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
                                 FLAGS.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          cifar100.NUM_EPOCHS_PER_DECAY)
        # Decay the learning rate exponentially based on the number of steps.
        lr = tf.train.exponential_decay(cifar100.INITIAL_LEARNING_RATE,
                                        global_step,
                                        decay_steps,
                                        cifar100.LEARNING_RATE_DECAY_FACTOR,
                                        staircase=True)
        if is_chief:
            summaries = tf.get_collection(tf.GraphKeys.SUMMARIES)
            # Add a summary to track the learning rate.
            summaries.append(tf.summary.scalar('learning_rate', lr))

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)
        opt = tf.train.SyncReplicasOptimizer(
            opt,
            replicas_to_aggregate=num_replicas_to_aggregate,
            total_num_replicas=num_workers,
            #use_locking=True)
            use_locking=False)
        # Calculate the gradients for the batch
        grads = opt.compute_gradients(loss)
        # Add histograms for gradients at the chief worker.
        if is_chief:
            for grad, var in grads:
                if grad is not None:
                    summaries.append(
                        tf.summary.histogram(var.op.name + '/gradients', grad))
        # apply gradients to variable
        train_op = opt.apply_gradients(grads, global_step=global_step)
        # Add histograms for trainable variables.
        if is_chief:
            for var in tf.trainable_variables():
                summaries.append(tf.summary.histogram(var.op.name, var))

        #variable_averages = tf.train.ExponentialMovingAverage(
        #      cifar100.MOVING_AVERAGE_DECAY, global_step)
        #variables_averages_op = variable_averages.apply(tf.trainable_variables())
        #train_op = tf.group(train_op, variables_averages_op)

        if is_chief:
            #Build the summary operation at the chief worker
            summary_op = tf.summary.merge(summaries)

    chief_queue_runner = opt.get_chief_queue_runner()
    init_token_op = opt.get_init_tokens_op()
    # Build an initialization operation to run below.
    init_op = tf.global_variables_initializer()
    # Create a saver.
    saver = tf.train.Saver(tf.global_variables())

    sv = tf.train.Supervisor(is_chief=is_chief,
                             global_step=global_step,
                             init_op=init_op)
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=FLAGS.log_device_placement)

    with sv.prepare_or_wait_for_session(server.target,
                                        config=sess_config) as sess:
        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        # start sync queue runner and run the init token op at the chief worker
        queue_runners = tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS)
        sv.start_queue_runners(sess, queue_runners)

        if is_chief:
            sv.start_queue_runners(sess, [chief_queue_runner])
            sess.run(init_token_op)
        #open the summary writer
        summary_writer = tf.summary.FileWriter(FLAGS.train_dir, sess.graph)

        # Stays None when FLAGS.max_steps is 0 (no step was run).
        loss_value = None
        try:
            t1 = time.time()
            for step in xrange(FLAGS.max_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss])
                duration = time.time() - start_time
                assert not np.isnan(loss_value), \
                    'Model diverged with loss = NaN'
                if step % 10 == 0:
                    num_examples_per_step = FLAGS.batch_size * num_workers
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / num_workers
                    format_str = (
                        '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                        'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))
                if step % 100 == 0:
                    if is_chief:
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)
                # Save the model checkpoint periodically.
                if step % 1000 == 0 or (step + 1) == FLAGS.max_steps:
                    if is_chief:
                        checkpoint_path = os.path.join(FLAGS.train_dir,
                                                       'model.ckpt')
                        saver.save(sess, checkpoint_path, global_step=step)

            t2 = time.time()
        finally:
            # Flush pending summary events and release the event file even
            # when a training step raises.
            summary_writer.close()

        print('spent %f seconds to train %d step' % (t2 - t1, FLAGS.max_steps))
        logger.info('spent %f seconds to train %d step' %
                    (t2 - t1, FLAGS.max_steps))
        if loss_value is not None:
            logger.info('last loss value: %.2f ' % loss_value)