Example #1
def setConfig():    
    config = network_config.getConfig()
    config['train_dir'] = 'cifar10_train_highway'
    config['max_steps'] = 78125  # 200 epochs (Vinh: use ResNet's max steps; previously 64000, 1000000)
    config['log_device_placement'] = False
    config['batch_size'] = 128
    config['data_dir'] = 'cifar10_data'
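The snippet above only works if network_config.getConfig() always returns the same mutable dict, so that the keys set in setConfig() are visible to every later caller. A minimal sketch of such a module, assuming a plain module-level dict (hypothetical; the real network_config is not shown in these examples):

# network_config.py -- hypothetical sketch, not the original module
_config = {}

def getConfig():
    # Every caller gets the same dict, so setConfig() must run before any
    # code that reads these keys builds the graph.
    return _config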
Example #2
def inputs(eval_data):
    """Construct input for CIFAR evaluation using the Reader ops.

  Args:
    eval_data: bool, indicating if one should use the train or eval data set.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir
  """
    config = network_config.getConfig()
    data_dir = config['data_dir']
    batch_size = config['batch_size']

    # Vinh: debug
    print("THe batch size for input test is %d " % (batch_size))

    #if not FLAGS.data_dir:
    #  raise ValueError('Please supply a data_dir')
    data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')
    return cifar10_input.inputs(eval_data=eval_data,
                                data_dir=data_dir,
                                batch_size=batch_size)
Example #3
def main(argv=None):  # pylint: disable=unused-argument
    cifar10_train.setConfig()
    setConfig()
    eval_dir = network_config.getConfig()['eval_dir']
    cifar10.maybe_download_and_extract()
    if gfile.Exists(eval_dir):
        gfile.DeleteRecursively(eval_dir)
    gfile.MakeDirs(eval_dir)
    evaluate()
Example #4
def main(argv=None):  # pylint: disable=unused-argument
    setConfig()
    config = network_config.getConfig()
    train_dir = config['train_dir']

    cifar10.maybe_download_and_extract()
    if gfile.Exists(train_dir):
        gfile.DeleteRecursively(train_dir)
    gfile.MakeDirs(train_dir)
    train()
Example #5
def setConfig():
    config = network_config.getConfig()
    config['eval_dir'] = 'cifar10_eval'
    config['eval_data'] = 'test'
    config['checkpoint_dir'] = 'cifar10_train'
    config['eval_interval_secs'] = 60 * 5
    config['num_examples'] = 10000
    config['run_once'] = False
    config['batch_size'] = 100  # To make sure all the test instances are evaluated
Example #6
def main(argv=None):  # pylint: disable=unused-argument
  # Have to set config first
  # TODO: remove the need for this; will check how Python initializes a module
  setConfig()
  cifar10.maybe_download_and_extract()
  config = network_config.getConfig()
  train_dir = config['train_dir']
  if gfile.Exists(train_dir):
    gfile.DeleteRecursively(train_dir)
  gfile.MakeDirs(train_dir)
  train()
Example #7
def loss(logits, labels):
    """Add L2Loss to all the trainable variables.

  Add summary for "Loss" and "Loss/avg".
  Args:
    logits: Logits from inference().
    labels: Labels from distorted_inputs or inputs(). 1-D tensor
            of shape [batch_size]

  Returns:
    Loss tensor of type float.
  """
    # Reshape the labels into a dense Tensor of
    # shape [batch_size, NUM_CLASSES].
    config = network_config.getConfig()
    batch_size = config['batch_size']

    sparse_labels = tf.reshape(labels, [batch_size, 1])
    indices = tf.reshape(tf.range(batch_size), [batch_size, 1])
    concated = tf.concat(1, [indices, sparse_labels])
    dense_labels = tf.sparse_to_dense(concated, [batch_size, NUM_CLASSES], 1.0,
                                      0.0)

    #   # Vinh: NAN problem turns out exactly as I suspected:
    #     # http://stackoverflow.com/questions/33712178/tensorflow-nan-bug
    #     # Can't prevent NAN using this method because I have no control over the scaled
    #     # logits. Why didn't they prevent this in their (presumably C++?) implementation
    #     # of softmax_cross_entropy_with_logits?
    #     #cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits, onehot_labels)
    #     # Thanks to the solution from the above link
    #     # The NaN fix makes the tf.reduce_mean loss converge very slowly
    #     logits = tf.clip_by_value(tf.nn.softmax(logits), 1e-10,1.0)
    #     cross_entropy = - onehot_labels * tf.log(logits)
    #
    #     # Why it changes so much in this case?
    #     loss = tf.reduce_mean(cross_entropy, name = 'xentropy_mean')
    #
    #     # In case of highway network, reduce_sum converges much faster
    #     # than reduce_mean does, which is a bit strange because reduce_mean
    #     # is supposed to reduce the variance in stochastic gradient descent
    #     # Will fix this after I get back or tomorrow
    #     #loss = tf.reduce_sum(cross_entropy, name = 'xentropy_sum')

    # Calculate the average cross entropy loss across the batch.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        logits, dense_labels, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('losses', cross_entropy_mean)

    # The total loss is defined as the cross entropy loss plus all of the weight
    # decay terms (L2 loss).
    return tf.add_n(tf.get_collection('losses'), name='total_loss')
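The commented-out block above works around the NaN by clipping the softmax output before taking its log. A more robust route is to compute the cross entropy directly from the logits with the log-sum-exp trick, which is what a numerically stable softmax_cross_entropy_with_logits does internally. A minimal NumPy sketch of that formulation (for illustration only, not the TensorFlow implementation):

import numpy as np

def stable_cross_entropy(logits, onehot_labels):
    # Subtracting the per-row max keeps exp() from overflowing, and the
    # log-sum-exp form never evaluates log(0), so no clipping is needed.
    shifted = logits - np.max(logits, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return -np.sum(onehot_labels * log_probs, axis=1)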
Example #8
def distorted_inputs():
    """Construct distorted input for CIFAR training using the Reader ops.

  Returns:
    images: Images. 4D tensor of [batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] size.
    labels: Labels. 1D tensor of [batch_size] size.

  Raises:
    ValueError: If no data_dir
  """
    #   if not FLAGS.data_dir:
    #     raise ValueError('Please supply a data_dir')

    config = network_config.getConfig()
    data_dir = config['data_dir']
    batch_size = config['batch_size']
    data_dir = os.path.join(data_dir, 'cifar-10-batches-bin')
    return cifar10_input.distorted_inputs(data_dir=data_dir,
                                          batch_size=batch_size)
Example #9
def maybe_download_and_extract():
    """Download and extract the tarball from Alex's website."""
    config = network_config.getConfig()
    data_dir = config['data_dir']
    dest_directory = data_dir
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):

        def _progress(count, block_size, total_size):
            sys.stdout.write('\r>> Downloading %s %.1f%%' %
                             (filename, float(count * block_size) /
                              float(total_size) * 100.0))
            sys.stdout.flush()

        filepath, _ = urllib.request.urlretrieve(DATA_URL,
                                                 filepath,
                                                 reporthook=_progress)
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    # Extract even if the tarball was already downloaded on a previous run.
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)
Example #10
def train():
    """Train CIFAR-10 for a number of steps."""
    with tf.Graph().as_default(), tf.device('/cpu:0'):
        # Create a variable to count the number of train() calls. This equals the
        # number of batches processed * FLAGS.num_gpus.
        global_step = tf.get_variable('global_step', [],
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)

        config = network_config.getConfig()
        batch_size = config['batch_size']
        num_gpus = config['num_gpus']
        log_device_placement = config['log_device_placement']
        train_dir = config['train_dir']
        max_steps = config['max_steps']

        # Calculate the learning rate schedule.
        #     num_batches_per_epoch = (cifar10.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN /
        #                              batch_size)
        #     decay_steps = int(num_batches_per_epoch * cifar10.NUM_EPOCHS_PER_DECAY)
        #
        #     # Decay the learning rate exponentially based on the number of steps.
        #     lr = tf.train.exponential_decay(cifar10.INITIAL_LEARNING_RATE,
        #                                     global_step,
        #                                     decay_steps,
        #                                     cifar10.LEARNING_RATE_DECAY_FACTOR,
        #                                     staircase=True)

        # Vinh: use ResNet's simple learning rate
        lr = tf.placeholder(tf.float32, [], "learning_rate")
        momentum = 0.9

        # Create an optimizer that performs gradient descent.
        opt = tf.train.GradientDescentOptimizer(lr)
        # Vinh: Use momentum as in ResNet
        #opt = tf.train.MomentumOptimizer(lr, momentum)

        # Calculate the gradients for each model tower.
        tower_grads = []
        for i in xrange(num_gpus):
            with tf.device('/gpu:%d' % i):
                with tf.name_scope('%s_%d' % (cifar10.TOWER_NAME, i)) as scope:
                    # Calculate the loss for one tower of the CIFAR model. This function
                    # constructs the entire CIFAR model but shares the variables across
                    # all towers.
                    loss = tower_loss(scope)

                    # Reuse variables for the next tower.
                    tf.get_variable_scope().reuse_variables()

                    # Retain the summaries from the final tower.
                    summaries = tf.get_collection(tf.GraphKeys.SUMMARIES,
                                                  scope)

                    # Calculate the gradients for the batch of data on this CIFAR tower.
                    grads = opt.compute_gradients(loss)

                    # Keep track of the gradients across all towers.
                    tower_grads.append(grads)

        # We must calculate the mean of each gradient. Note that this is the
        # synchronization point across all towers.
        grads = average_gradients(tower_grads)

        # Add a summary to track the learning rate.
        summaries.append(tf.scalar_summary('learning_rate', lr))

        # Add histograms for gradients.
        for grad, var in grads:
            if grad is not None:
                summaries.append(
                    tf.histogram_summary(var.op.name + '/gradients', grad))

        # Apply the gradients to adjust the shared variables.
        apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

        # Add histograms for trainable variables.
        for var in tf.trainable_variables():
            summaries.append(tf.histogram_summary(var.op.name, var))

        # Track the moving averages of all trainable variables.
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY, global_step)

        # Vinh: don't use moving average for weights for now to have
        # fair comparison with ResNet paper. Well, I'll just save them here
        # and I'll do 2 evaluations: one without restoring them and one
        # with:
        variables_averages_op = variable_averages.apply(
            tf.trainable_variables())
        batchNormCol = tf.get_collection(BatchNormLayer.batchNormCollectionID)
        batchNormAverageOp = variable_averages.apply(batchNormCol)

        # Group all updates to into a single train op.
        train_op = tf.group(apply_gradient_op, variables_averages_op,
                            batchNormAverageOp)

        # Create a saver.
        saver = tf.train.Saver(tf.all_variables())

        # Build the summary operation from the last tower summaries.
        summary_op = tf.merge_summary(summaries)

        # Build an initialization operation to run below.
        init = tf.initialize_all_variables()

        # Start running operations on the Graph. allow_soft_placement must be set to
        # True to build towers on GPU, as some of the ops do not have GPU
        # implementations.
        sess = tf.Session(
            config=tf.ConfigProto(allow_soft_placement=True,
                                  log_device_placement=log_device_placement))
        sess.run(init)

        # Start the queue runners.
        tf.train.start_queue_runners(sess=sess)

        summary_writer = tf.train.SummaryWriter(train_dir,
                                                graph_def=sess.graph_def)

        # Vinh: warm up at lr 0.01 for the first 800 steps, then use 0.1 and
        # drop it at steps 32k and 48k, terminating at step 64k (counts from 1),
        # as in the ResNet paper.
        feed_dict = {lr: 0.01}
        for step in xrange(max_steps):
            start_time = time.time()

            if (step + 1) == 800:
                feed_dict = {lr: 0.1}
            elif (step + 1) == 32000:
                feed_dict = {lr: 0.01}
            elif (step + 1) == 48000:
                feed_dict = {lr: 0.001}

            _, loss_value = sess.run([train_op, loss], feed_dict=feed_dict)
            duration = time.time() - start_time

            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % 10 == 0:
                num_examples_per_step = batch_size * num_gpus
                examples_per_sec = num_examples_per_step / duration
                sec_per_batch = duration / num_gpus

                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str % (datetime.now(), step, loss_value,
                                    examples_per_sec, sec_per_batch))

            if step % 100 == 0:
                summary_str = sess.run(summary_op, feed_dict)
                summary_writer.add_summary(summary_str, step)

            # Save the model checkpoint periodically.
            if step % 1000 == 0 or (step + 1) == max_steps:
                checkpoint_path = os.path.join(train_dir, 'model.ckpt')
                saver.save(sess, checkpoint_path, global_step=step)
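The synchronization point mentioned above is average_gradients(), which is called but not defined in this example. A sketch of the usual implementation, following the TensorFlow CIFAR-10 multi-GPU tutorial this code is based on (assumed, since the function body is not shown here):

def average_gradients(tower_grads):
    """Average the gradients for each shared variable across all towers."""
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # grad_and_vars is ((grad0_gpu0, var0), ..., (grad0_gpuN, var0)).
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars]
        grad = tf.reduce_mean(tf.concat(0, grads), 0)
        # Variables are shared across towers, so the first tower's pointer
        # stands in for all of them.
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads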
Example #11
def eval_once(saver, summary_writer, top_k_op, summary_op, ema):
    """Run Eval once.

  Args:
    saver: Saver.
    summary_writer: Summary writer.
    top_k_op: Top K op.
    summary_op: Summary op.
    ema: ExponentialMovingAverage used to restore batch norm statistics.
  """

    config = network_config.getConfig()
    checkpoint_dir = config['checkpoint_dir']
    num_examples = config['num_examples']
    batch_size = config['batch_size']

    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
        if ckpt and ckpt.model_checkpoint_path:

            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)

            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            global_step = ckpt.model_checkpoint_path.split('/')[-1].split(
                '-')[-1]

            print('Gonna restore for batch norm here')
            # Vinh: restore the moving averages of batch norm's mean and variance.
            # It seems like the normal Saver object doesn't restore the moving averages of
            # untrainable variables
            # Have to hack by copying some ideas in
            # moving_average()'s source code
            # TODO: temporarily comment out because I forgot to make
            # change in BatchNorm layer with regards to get_variable()
            # TODO: test with the 2 options: using population mean and variance
            # and mini-batch mean and variance
            #       untrainableVars = list(set(variables.all_variables()) -
            #                              set(variables.trainable_variables()))
            #
            #       variables_to_restore = {val.name : val     #{ema.average_name(val) : val
            #                               for val in untrainableVars}
            #
            #       batchNormCol = tf.get_collection(BatchNormLayer.batchNormCollectionID)
            #       # a bit clunky
            #       restoredVarMap =  {ema.average_name(variables_to_restore[key])
            #                           : variables_to_restore[key]
            #                           for key in batchNormCol
            #                           if key in variables_to_restore}
            #       batchNormSaver = tf.train.Saver(restoredVarMap)
            #       batchNormSaver.restore(sess, ckpt.model_checkpoint_path)

            # Vinh: adjust after a new change in BatchNormLayer
            batchNormVarCol = tf.get_collection(
                BatchNormLayer.batchNormCollectionID)
            restoredVarMap = {
                ema.average_name(var): var
                for var in batchNormVarCol
            }
            batchNormSaver = tf.train.Saver(restoredVarMap)
            batchNormSaver.restore(sess, ckpt.model_checkpoint_path)

            # Vinh: So from this point, batch norm's means and variances are restored to their
            # moving average values

        else:
            print('No checkpoint file found')
            return

        # Start the queue runners.
        coord = tf.train.Coordinator()
        try:
            threads = []
            for qr in tf.get_collection(tf.GraphKeys.QUEUE_RUNNERS):
                threads.extend(
                    qr.create_threads(sess,
                                      coord=coord,
                                      daemon=True,
                                      start=True))

            # Vinh: the original tensorflow way will underestimate the true precision
            num_iter = int(math.ceil(num_examples / batch_size))
            true_count = 0  # Counts the number of correct predictions.

            # If num_examples is not divisible by batch_size, then total_sample_count
            # will be greater than num_examples. If we assume, in this case, the
            # filename_queue will be read until the last file, then the precision
            # calculated here will be less than the correct precision.
            total_sample_count = num_iter * batch_size
            step = 0
            while step < num_iter and not coord.should_stop():
                predictions = sess.run([top_k_op])
                true_count += np.sum(predictions)
                step += 1

            # Compute precision @ 1.
            precision = true_count / total_sample_count
            print('%s: Original precision @ 1 = %.3f' %
                  (datetime.now(), precision))

            totalExamplesEvaluated = step * batch_size
            correctPrecision = true_count / totalExamplesEvaluated
            print('The correct precision over %d evaluated examples is %.3f' %
                  (totalExamplesEvaluated, correctPrecision))

            summary = tf.Summary()
            summary.ParseFromString(sess.run(summary_op))
            summary.value.add(tag='Precision @ 1', simple_value=precision)
            summary_writer.add_summary(summary, global_step)
            print('Reached here with total steps: %d' % step)
        except Exception as e:  # pylint: disable=broad-except
            coord.request_stop(e)

        coord.request_stop()
        coord.join(threads, stop_grace_period_secs=10)
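The batch_size = 100 forced in setConfig() matters here: with the original defaults (num_examples = 10000, batch_size = 128) the denominator overshoots the data set, which is exactly the underestimate the comment above describes. A quick check of the arithmetic:

import math

num_examples, batch_size = 10000, 128
num_iter = int(math.ceil(num_examples / batch_size))  # 79
total_sample_count = num_iter * batch_size            # 10112 > 10000
# Dividing true_count by 10112 instead of by the number of examples actually
# evaluated deflates the reported precision; with batch_size = 100 the two
# denominators coincide (100 * 100 = 10000).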
Example #12
def evaluate():
    """Eval CIFAR-10 for a number of steps."""
    config = network_config.getConfig()
    test_or_train = config['eval_data']
    eval_dir = config['eval_dir']
    run_once = config['run_once']
    eval_interval_secs = config['eval_interval_secs']
    with tf.Graph().as_default():
        # Get images and labels for CIFAR-10.
        eval_data = test_or_train == 'test'
        images, labels = cifar10.inputs(eval_data=eval_data)

        # Build a Graph that computes the logits predictions from the
        # inference model.
        #logits = cifar10.inference(images)
        # 20 layer network
        # logits = cifar10_model.buildResidualStyleNetwork(images, is_train_phase = False)
        # 56 layers
        #     logits = cifar10_model.buildResidualStyleNetwork(images, is_train_phase = False,
        #                                                      numStackedBlocks = 9)
        # is_train_phase should be false but I need to change BatchNormLayer first
        # so use poor man's batch norm for now
        #     logits = cifar10_model.buildNetworkWithVariableScope(images,
        #                                  is_train_phase = True,
        #                                  gateType = MywayFFLayer.HIGHWAY_GATE,
        #                                  numStackedBlocks = 3)
        #logits = cifar10_model.inference(images)

        logits = cifar10_model.buildNetworkWithVariableScope(
            images,
            is_train_phase=False,
            gateType=MywayFFLayer.RESIDUAL_GATE,
            numStackedBlocks=3)

        #     logits = cifar10_model.buildNetworkWithVariableScope(images,
        #                               is_train_phase = False,
        #                               gateType = MywayFFLayer.HIGHWAY_GATE,
        #                               numStackedBlocks = 9)

        # Calculate predictions.
        top_k_op = tf.nn.in_top_k(logits, labels, 1)

        # Restore the moving average version of the learned variables for eval.
        # Vinh: disable this for now
        variable_averages = tf.train.ExponentialMovingAverage(
            cifar10.MOVING_AVERAGE_DECAY)
        # Vinh: Will restore weights after I do a fair comparison with ResNet paper
        #     variables_to_restore = variable_averages.variables_to_restore()
        #     saver = tf.train.Saver(variables_to_restore)
        saver = tf.train.Saver()

        # Build the summary operation based on the TF collection of Summaries.
        summary_op = tf.merge_all_summaries()

        graph_def = tf.get_default_graph().as_graph_def()
        summary_writer = tf.train.SummaryWriter(eval_dir, graph_def=graph_def)

        while True:
            eval_once(saver, summary_writer, top_k_op, summary_op,
                      variable_averages)
            if run_once:
                break
            time.sleep(eval_interval_secs)
Example #13
def train():
  """Train CIFAR-10 for a number of steps."""
  #setConfig() # Already set in main
  config = network_config.getConfig()
  train_dir = config['train_dir']
  max_steps = config['max_steps']
  log_device_placement = config['log_device_placement']
  batch_size = config['batch_size']

  with tf.Graph().as_default():
    global_step = tf.Variable(0, trainable=False)

    # Get images and labels for CIFAR-10.
    images, labels = cifar10.distorted_inputs()

    # Build a Graph that computes the logits predictions from the
    # inference model.       
    
    # 20 layer network
#     logits = cifar10_model.buildResidualStyleNetwork(images, is_train_phase = True)
                                                     
    # 56 layers
#     logits = cifar10_model.buildResidualStyleNetwork(images, is_train_phase = True, 
#                                                      numStackedBlocks = 9)

    
    logits = cifar10_model.buildNetworkWithVariableScope(images, 
                          is_train_phase = True, 
                          gateType = MywayFFLayer.HIGHWAY_GATE, 
                          numStackedBlocks = 9)
    
    # Calculate loss.
    loss = cifar10.loss(logits, labels)
    print('Loss used logits from cifar10_model')
    
    # Build a Graph that trains the model with one batch of examples and
    # updates the model parameters.    
    lr = tf.placeholder(tf.float32, [], "learning_rate")
    train_op = cifar10.train(loss, global_step, lr)   

    # Build the summary operation based on the TF collection of Summaries.
    summary_op = tf.merge_all_summaries()

    # Build an initialization operation to run below.
    init = tf.initialize_all_variables()

    # Start running operations on the Graph.
    sess = tf.Session(config=tf.ConfigProto(
        log_device_placement=log_device_placement))
    sess.run(init)

    # Create a saver.
    # Vinh: this should save the moving averages of batch mean and variance
    # =============== Maybe not, I need to check it now ============================= 
    saver = tf.train.Saver(tf.all_variables())
    #saver = tf.train.Saver() # What is the difference here?


    # Start the queue runners.
    tf.train.start_queue_runners(sess=sess)

    summary_writer = tf.train.SummaryWriter(train_dir,
                                            graph_def=sess.graph_def)    
    
    for step in xrange(max_steps):
      start_time = time.time()
      
      # Vinh: change learning rates at steps 32k and 48k, terminating
      # at step 64k (counts from 1) (as in ResNet paper)      
      feed_dict = {lr : 0.1}      
      # Not reducing the learning rate for now
      if (step + 1) == 32000:
        feed_dict = {lr : 0.01}
      elif (step + 1) == 48000:
        feed_dict = {lr : 0.001}
      _, loss_value = sess.run([train_op, loss], feed_dict = feed_dict)
      duration = time.time() - start_time

      assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

      if step % 10 == 0:
        num_examples_per_step = batch_size
        examples_per_sec = num_examples_per_step / duration
        sec_per_batch = float(duration)

        format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                      'sec/batch)')
        print (format_str % (datetime.now(), step, loss_value,
                             examples_per_sec, sec_per_batch))

      if step % 100 == 0:
        # Vinh: If I monitor the learning rate in cifar10.py, I'd need to
        # pass the feed_dict above to the running of summary_op 
        summary_str = sess.run(summary_op) 
        summary_writer.add_summary(summary_str, step)

      # Save the model checkpoint periodically.
      if step % 1000 == 0 or (step + 1) == max_steps:
        checkpoint_path = os.path.join(train_dir, 'model.ckpt')
        saver.save(sess, checkpoint_path, global_step=step)
Example #14
def train2(total_loss, global_step, optimizer):
    """Train CIFAR-10 model.

  Apply the given optimizer to all trainable variables. Add moving
  average for all trainable variables.

  Args:
    total_loss: Total loss from loss().
    global_step: Integer Variable counting the number of training steps
      processed.
    optimizer: Optimizer instance used to compute and apply the gradients.
  Returns:
    train_op: op for training.
  """
    config = network_config.getConfig()
    batch_size = config['batch_size']
    # Variables that affect learning rate.
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)

    # Decay the learning rate exponentially based on the number of steps.
    #   lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,
    #                                   global_step,
    #                                   decay_steps,
    #                                   LEARNING_RATE_DECAY_FACTOR,
    #                                   staircase=True)

    # Vinh: implement ResNet's simple learning rate reduction
    # Use mrry's answer to change the learning rate
    # http://stackoverflow.com/questions/33919948/how-to-set-adaptive-learning-rate-for-gradientdescentoptimizer
    #lr = INITIAL_LEARNING_RATE
    #lr = tf.placeholder(tf.float32, [], "learning_rate")
    # No need to monitor this now because I use ResNet's adaptive learning rates
    # tf.scalar_summary('learning_rate', lr)

    #momentum = 0.9

    # Generate moving averages of all losses and associated summaries.
    loss_averages_op = _add_loss_summaries(total_loss)

    # Compute gradients.
    with tf.control_dependencies([loss_averages_op]):
        # Vinh: as in ResNet paper
        #opt = tf.train.GradientDescentOptimizer(lr)
        #opt = tf.train.MomentumOptimizer(lr, momentum)
        #grads = opt.compute_gradients(total_loss)
        grads = optimizer.compute_gradients(total_loss)

    # Apply gradients.
    apply_gradient_op = optimizer.apply_gradients(grads,
                                                  global_step=global_step)

    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)

    # Add histograms for gradients.
    for grad, var in grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables.
    # Vinh: moved this to cifar10_train to see if that solves
    # the problem of not all variables having an ExponentialMovingAverage.
    # Disabled here for now; will fix it later
    # with a much smaller subset of CIFAR-10.


    #   variable_averages = tf.train.ExponentialMovingAverage(
    #       MOVING_AVERAGE_DECAY, global_step)
    #   variables_averages_op = variable_averages.apply(tf.trainable_variables())

    #with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
    with tf.control_dependencies([apply_gradient_op]):
        train_op = tf.no_op(name='train')

    return train_op
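Since train2() takes the optimizer as a parameter, the ResNet-style setup from the commented-out lines can be wired up at the call site instead. A minimal sketch of such a call (hypothetical; total_loss comes from loss() and the lr placeholder is fed per step, as in the training scripts above):

global_step = tf.Variable(0, trainable=False)
lr = tf.placeholder(tf.float32, [], "learning_rate")
opt = tf.train.MomentumOptimizer(lr, momentum=0.9)
train_op = train2(total_loss, global_step, opt)
# Each step then feeds the current rate, e.g.
# sess.run([train_op], feed_dict={lr: 0.1})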
Example #15
def inference(images):
    """Build the CIFAR-10 model.

  Args:
    images: Images returned from distorted_inputs() or inputs().

  Returns:
    Logits.
  """
    # We instantiate all variables using tf.get_variable() instead of
    # tf.Variable() in order to share variables across multiple GPU training runs.
    # If we only ran this model on a single GPU, we could simplify this function
    # by replacing all instances of tf.get_variable() with tf.Variable().
    #
    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 3, 64],
                                             stddev=1e-4,
                                             wd=0.0)
        conv = tf.nn.conv2d(images, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        bias = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv1)

    # pool1
    pool1 = tf.nn.max_pool(conv1,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name='pool1')
    # norm1
    norm1 = tf.nn.lrn(pool1,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm1')

    # conv2
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights',
                                             shape=[5, 5, 64, 64],
                                             stddev=1e-4,
                                             wd=0.0)
        conv = tf.nn.conv2d(norm1, kernel, [1, 1, 1, 1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.1))
        bias = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(bias, name=scope.name)
        _activation_summary(conv2)

    # norm2
    norm2 = tf.nn.lrn(conv2,
                      4,
                      bias=1.0,
                      alpha=0.001 / 9.0,
                      beta=0.75,
                      name='norm2')
    # pool2
    pool2 = tf.nn.max_pool(norm2,
                           ksize=[1, 3, 3, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name='pool2')

    config = network_config.getConfig()
    batch_size = config['batch_size']
    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        dim = 1
        for d in pool2.get_shape()[1:].as_list():
            dim *= d
        reshape = tf.reshape(pool2, [batch_size, dim])

        weights = _variable_with_weight_decay('weights',
                                              shape=[dim, 384],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [384],
                                  tf.constant_initializer(0.1))
        local3 = tf.nn.relu_layer(reshape, weights, biases, name=scope.name)
        _activation_summary(local3)

    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights',
                                              shape=[384, 192],
                                              stddev=0.04,
                                              wd=0.004)
        biases = _variable_on_cpu('biases', [192],
                                  tf.constant_initializer(0.1))
        local4 = tf.nn.relu_layer(local3, weights, biases, name=scope.name)
        _activation_summary(local4)

    # softmax, i.e. softmax(WX + b)
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1 / 192.0,
                                              wd=0.0)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.nn.xw_plus_b(local4,
                                         weights,
                                         biases,
                                         name=scope.name)
        _activation_summary(softmax_linear)

    return softmax_linear
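The tf.get_variable() convention in the comment above is what lets the multi-GPU train() example call tf.get_variable_scope().reuse_variables() and build the same model once per tower without duplicating weights. A minimal sketch of that sharing mechanism, independent of the CIFAR model (illustrative only):

def tiny_layer(x):
    # Same variable name under the same scope: with reuse enabled, the second
    # call returns the variable created by the first call instead of
    # allocating a new one.
    w = tf.get_variable('w', shape=[3, 3],
                        initializer=tf.constant_initializer(0.1))
    return tf.matmul(x, w)

with tf.variable_scope('shared') as scope:
    y0 = tiny_layer(tf.zeros([1, 3]))
    scope.reuse_variables()
    y1 = tiny_layer(tf.zeros([1, 3]))  # reuses 'shared/w'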