Example #1
def evaluate(hps):
    images, labels = cifar_input.build_input(
        FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)

    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:

        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            continue

        tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        total_prediction, correct_prediction = 0, 0
        for _ in six.moves.range(FLAGS.eval_batch_count):
            (loss, predictions, truth, train_step) = sess.run(
                [model.cost, model.predictions,
                 model.labels, model.global_step])

            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        precision_summ = tf.Summary()
        precision_summ.value.add(
            tag='Precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, train_step)

        best_precision_summ = tf.Summary()
        best_precision_summ.value.add(
            tag='Best Precision', simple_value=best_precision)
        summary_writer.add_summary(best_precision_summ, train_step)

        tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                        (loss, precision, best_precision))

        summary_writer.flush()

        if FLAGS.eval_once:
            break

        time.sleep(60)
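All the examples on this page assume the cifar_input.build_input(dataset, data_path, batch_size, mode) helper from the TensorFlow ResNet sample, which returns a queue-fed batch of images and one-hot labels. Below is a minimal sketch of how the evaluate(hps) loop above is typically driven; the HParams fields mirror Examples #4 and #5 (the exact fields depend on the resnet_model fork in use), and the FLAGS referenced by evaluate() are assumed to be defined elsewhere.

import tensorflow as tf
import resnet_model


def main(_):
    # Hyperparameters for the evaluation graph (values follow Examples #4/#5).
    hps = resnet_model.HParams(
        batch_size=100,
        num_classes=10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer='mom',
        num_gpus=0)
    with tf.device('/cpu:0'):
        evaluate(hps)


if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run()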
Example #2
def main(_):
    config_initialization()
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.eval_data_path,
                                             FLAGS.batch_size,
                                             mode='eval')
    eval(images, labels)
Example #3
File: eval.py  Project: WeixiZhu94/MNIST
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
    if checkpoint_dir is None:
        checkpoint_dir = log_dir
    with tf.device('/cpu:0'):
      images, labels = build_input('cifar10', 100, 'test')
      predictions, total_loss = network(images, labels)
    
      tf.summary.scalar('loss', total_loss)
      predictions = tf.to_int32(tf.argmax(predictions, 1))
    
      tf.summary.scalar('accuracy', slim.metrics.accuracy(predictions, labels))

      # These are streaming metrics which compute the "running" metric,
      # e.g. running accuracy
      metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map({
          'accuracy': slim.metrics.streaming_accuracy(predictions, labels),
          'streaming_mse': slim.metrics.streaming_mean_squared_error(predictions, labels),
      })

      # Define the streaming summaries to write:
      for metric_name, metric_value in metrics_to_values.items():
          tf.summary.scalar(metric_name, metric_value)

      # Evaluate every 60 seconds
      slim.evaluation.evaluation_loop(
          '',
          checkpoint_dir,
          log_dir,
          num_evals=num_batches,
          eval_op=list(metrics_to_updates.values()),
          summary_op=tf.summary.merge_all(),
          eval_interval_secs=60,
          max_number_of_evaluations = 100000000)
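The slim streaming metrics used above come as (value_op, update_op) pairs: evaluation_loop runs the update ops over num_evals batches while the summaries read the value ops. A tiny standalone illustration of that split, not taken from this project (the placeholder tensors are assumptions):

import tensorflow as tf
slim = tf.contrib.slim

preds = tf.placeholder(tf.int64, [None])
labels = tf.placeholder(tf.int64, [None])
accuracy, update_op = slim.metrics.streaming_accuracy(preds, labels)

with tf.Session() as sess:
    # Streaming metrics keep their running counts in local variables.
    sess.run(tf.local_variables_initializer())
    for p, l in [([0, 1], [0, 0]), ([1, 1], [1, 1])]:
        sess.run(update_op, feed_dict={preds: p, labels: l})
    print(sess.run(accuracy))  # running accuracy over both batches: 0.75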
Example #4
    def __init__(self, data, dataset, eval_batch_count, eval_dir):
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        hps = resnet_model.HParams(
            batch_size=100,
            num_classes=100 if dataset == "cifar100" else 10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer="mom",
            num_gpus=0)
        with tf.device("/cpu:0"):
            # Builds the testing network.
            images, labels = cifar_input.build_input(data, hps.batch_size,
                                                     dataset, False)
            self.model = resnet_model.ResNet(hps, images, labels, "eval")
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            init = tf.global_variables_initializer()
            sess.run(init)

            # Initializing parameters for tensorboard.
            self.best_precision = 0.0
            self.eval_batch_count = eval_batch_count
            self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
        # The IP address where tensorboard logs will be on.
        self.ip_addr = ray.services.get_node_ip_address()
Example #5
File: resnet_main.py  Project: kmax12/ray
  def __init__(self, data, dataset, eval_batch_count, eval_dir):
    hps = resnet_model.HParams(
        batch_size=100,
        num_classes=100 if dataset == "cifar100" else 10,
        min_lrn_rate=0.0001,
        lrn_rate=0.1,
        num_residual_units=5,
        use_bottleneck=False,
        weight_decay_rate=0.0002,
        relu_leakiness=0.1,
        optimizer="mom",
        num_gpus=0)
    input_images = data[0]
    input_labels = data[1]
    with tf.device("/cpu:0"):
      # Builds the testing network.
      images, labels = cifar_input.build_input([input_images, input_labels],
                                               hps.batch_size, dataset, False)
      self.model = resnet_model.ResNet(hps, images, labels, "eval")
      self.model.build_graph()
      config = tf.ConfigProto(allow_soft_placement=True)
      sess = tf.Session(config=config)
      self.model.variables.set_session(sess)
      self.coord = tf.train.Coordinator()
      tf.train.start_queue_runners(sess, coord=self.coord)
      init = tf.global_variables_initializer()
      sess.run(init)

      # Initializing parameters for tensorboard.
      self.best_precision = 0.0
      self.eval_batch_count = eval_batch_count
      self.summary_writer = tf.summary.FileWriter(eval_dir, sess.graph)
    # The IP address where tensorboard logs will be on.
    self.ip_addr = ray.services.get_node_ip_address()
Example #6
def evaluate(hps):
    """Eval loop."""
    images, labels = cifar_input.build_input('cifar10', FLAGS.eval_data_path,
                                             hps.batch_size, 'eval')
    model = resnet_model.ResNet(hps, images, labels, 'eval')
    model.build_graph()

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    saver = tf.train.Saver()

    ##################################
    ## FIXME: Make a summary writer ##
    ##################################
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    try:
        ckpt_state = tf.train.get_checkpoint_state(FLAGS.ckpt_dir)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
        return
    if not ckpt_state:
        tf.logging.info('No model to eval yet at %s', FLAGS.ckpt_dir)
        return

    best_precision = 0.
    for i in range(len(ckpt_state.all_model_checkpoint_paths)):
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.all_model_checkpoint_paths[i])
        saver.restore(sess, ckpt_state.all_model_checkpoint_paths[i])
        total_prediction, correct_prediction = 0, 0

        for _ in six.moves.range(FLAGS.eval_batch_count):
            (summaries, loss, predictions, truth, train_step) = sess.run([
                model.summaries, model.cost, model.predictions, model.labels,
                model.global_step
            ])

            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        ########################################################
        ## FIXME: Add summary of precision and best precision ##
        ########################################################
        summ_precision = tf.Summary()
        summ_precision.value.add(tag='precision', simple_value=precision)
        summary_writer.add_summary(summ_precision, train_step)

        summ_best_precision = tf.Summary()
        summ_best_precision.value.add(tag='best_precision',
                                      simple_value=best_precision)
        summary_writer.add_summary(summ_best_precision, train_step)

        tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                        (loss, precision, best_precision))
        summary_writer.flush()
Example #7
    def _train_resnet_model(self):
        ''' Shared functionality for different resnet model_fn.'''
        data_path = get_filenames(is_training=True, data_dir=cifar10_data_path)
        features, labels = cifar_input.build_input(dataset='cifar10',
                                                   data_path=data_path,
                                                   batch_size=32,
                                                   mode='train')

        tf.summary.image('images', features, max_outputs=6)
        features = tf.cast(features, dtype=self.dtype)
        logits = self.network(features, training=True)
        logits = tf.cast(logits, tf.float32)

        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }

        cross_entropy = tf.losses.sparse_softmax_cross_entropy(logits=logits,
                                                               labels=labels)

        # Create a tensor named cross_entropy for logging purposes.
        tf.identity(cross_entropy, name='cross_entropy')
        tf.summary.scalar('cross_entropy', cross_entropy)

        # Add weight decay to the loss.
        l2_loss = self._weight_decay * tf.add_n([
            tf.nn.l2_loss(tf.cast(v, tf.float32))
            for v in tf.trainable_variables()
        ])

        tf.summary.scalar('l2_loss', l2_loss)

        total_loss = cross_entropy + l2_loss
        global_step = tf.train.get_or_create_global_step()
        learning_rate = self._learning_rate_fn(global_step)
        tf.identity(learning_rate, name='learning_rate')
        tf.summary.scalar('learning_rate', learning_rate)

        optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate,
                                               momentum=0.9)
        # TODO: add a loss_scale.
        if self._loss_scale != 1:
            scaled_grad_vars = optimizer.compute_gradients(total_loss *
                                                           self._loss_scale)
            unscaled_grad_vars = [(grad / self._loss_scale, var)
                                  for grad, var in scaled_grad_vars]
            minimize_op = optimizer.apply_gradients(unscaled_grad_vars)
        else:
            minimize_op = optimizer.minimize(total_loss, global_step)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        train_op = tf.group(minimize_op, update_ops)

        # accuracy = tf.metrics.accuracy(labels, predictions['classes'])
        # metrics = {'accuracy': accuracy}
        # tf.identity(accuracy[1], name='train_accuracy')
        # tf.summary.scalar('train_accuracy', accuracy[1])

        return train_op, total_loss, predictions, global_step
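The loss-scaling branch in _train_resnet_model follows the usual mixed-precision recipe: multiply the loss by a scale before computing gradients so small gradients do not underflow, then divide the gradients by the same scale before applying them. A self-contained sketch of that arithmetic with placeholder values (not taken from the example):

import tensorflow as tf

loss_scale = 128.0
x = tf.Variable(2.0)
loss = tf.square(x)

opt = tf.train.MomentumOptimizer(learning_rate=0.1, momentum=0.9)
scaled_grad_vars = opt.compute_gradients(loss * loss_scale)
unscaled_grad_vars = [(g / loss_scale, v) for g, v in scaled_grad_vars]
train_op = opt.apply_gradients(unscaled_grad_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)
    print(sess.run(x))  # same update as minimizing the unscaled loss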
Example #8
def train(hps):
  """Training loop."""
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  summary_writer = tf.train.SummaryWriter(FLAGS.train_dir)

  sv = tf.train.Supervisor(logdir=FLAGS.log_root,
                           is_chief=True,
                           summary_op=None,
                           save_summaries_secs=60,
                           save_model_secs=300,
                           global_step=model.global_step)
  sess = sv.prepare_or_wait_for_session()

  step = 0
  total_prediction = 0
  correct_prediction = 0
  precision = 0.0
  lrn_rate = 0.1

  while not sv.should_stop():
    (_, summaries, loss, predictions, truth, train_step) = sess.run(
        [model.train_op, model.summaries, model.cost, model.predictions,
         model.labels, model.global_step],
        feed_dict={model.lrn_rate: lrn_rate})

    if train_step < 40000:
      lrn_rate = 0.1
    elif train_step < 60000:
      lrn_rate = 0.01
    elif train_step < 80000:
      lrn_rate = 0.001
    else:
      lrn_rate = 0.0001

    predictions = np.argmax(predictions, axis=1)
    truth = np.argmax(truth, axis=1)
    for (t, p) in zip(truth, predictions):
      if t == p:
        correct_prediction += 1
      total_prediction += 1
    precision = float(correct_prediction) / total_prediction
    correct_prediction = total_prediction = 0

    step += 1
    if step % 100 == 0:
      precision_summ = tf.Summary()
      precision_summ.value.add(
          tag='Precision', simple_value=precision)
      summary_writer.add_summary(precision_summ, train_step)
      summary_writer.add_summary(summaries, train_step)
      tf.logging.info('loss: %.3f, precision: %.3f\n' % (loss, precision))
      summary_writer.flush()

  sv.Stop()
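Example #8 uses the pre-1.0 API names tf.train.SummaryWriter and tf.train.Supervisor. On TF 1.x the same loop is usually written with tf.summary.FileWriter and tf.train.MonitoredTrainingSession; a hedged sketch of that variant, reusing the model and FLAGS from the example:

summary_writer = tf.summary.FileWriter(FLAGS.train_dir)
with tf.train.MonitoredTrainingSession(
    checkpoint_dir=FLAGS.log_root,
    save_checkpoint_secs=300,
    save_summaries_steps=0,  # summaries are written manually below
    config=tf.ConfigProto(allow_soft_placement=True)) as sess:
  lrn_rate = 0.1
  while not sess.should_stop():
    _, summaries, loss, train_step = sess.run(
        [model.train_op, model.summaries, model.cost, model.global_step],
        feed_dict={model.lrn_rate: lrn_rate})
    if train_step % 100 == 0:
      summary_writer.add_summary(summaries, train_step)
      summary_writer.flush()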
Example #9
def evaluate(hps):
  """Eval loop."""
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  saver = tf.train.Saver()
  summary_writer = tf.train.SummaryWriter(FLAGS.eval_dir)

  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    time.sleep(60)
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      continue
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    total_prediction, correct_prediction = 0, 0
    for _ in xrange(FLAGS.eval_batch_count):
      (summaries, loss, predictions, truth, train_step) = sess.run(
          [model.summaries, model.cost, model.predictions,
           model.labels, model.global_step])

      best_predictions = np.argmax(predictions, axis=1)
      truth = np.argmax(truth, axis=1)
      for (t, p) in zip(truth, best_predictions):
        if t == p:
          correct_prediction += 1
        total_prediction += 1

    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f\n' %
                    (loss, precision, best_precision))
    summary_writer.flush()

    if FLAGS.eval_once:
      break
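Several of the eval loops on this page (Examples #1, #9, #18, #24) emit scalar summaries by building a tf.Summary protobuf by hand instead of running a summary op, which lets them log values computed in Python such as precision. A minimal standalone sketch of that pattern with a placeholder log directory:

import tensorflow as tf

writer = tf.summary.FileWriter('/tmp/eval_demo')  # placeholder directory
for step, precision in enumerate([0.52, 0.61, 0.74]):
    summ = tf.Summary()
    summ.value.add(tag='Precision', simple_value=precision)
    writer.add_summary(summ, step)
writer.flush()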
Example #10
File: train.py  Project: WeixiZhu94/MNIST
def main(train_dir, batch_size, num_batches, log_dir):

    images, labels = build_input('cifar10', 100, 'train')
    predictions, total_loss = network(images, labels)

    tf.summary.scalar('loss', total_loss)
    optimizer = tf.train.GradientDescentOptimizer(0.1)
    train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=True)

    slim.learning.train(train_op, log_dir, save_summaries_secs=20, save_interval_secs=20)
Example #11
def main(train_dir, batch_size, num_batches, log_dir, checkpoint_dir=None):
    if checkpoint_dir is None:
        checkpoint_dir = log_dir
    with tf.device('/cpu:0'):
        images, labels = build_input('cifar10', 100, 'test')
        logits, logits_cat1, logits_cat2, loss, loss_cat1, loss_cat2, labels_cat1, labels_cat2 = network(
            images, labels)

        tf.summary.scalar('losses/loss', loss)
        tf.summary.scalar('losses/loss_cat1', loss_cat1)
        tf.summary.scalar('losses/loss_cat2', loss_cat2)

        logits = tf.argmax(logits, axis=1)
        logits_cat1 = tf.argmax(logits_cat1, axis=1)
        logits_cat2 = tf.argmax(logits_cat2, axis=1)

        tf.summary.scalar('accuracy',
                          slim.metrics.accuracy(logits, tf.to_int64(labels)))
        tf.summary.scalar(
            'accuracy_cat_1',
            slim.metrics.accuracy(logits_cat1, tf.to_int64(labels_cat1)))
        tf.summary.scalar(
            'accuracy_cat_2',
            slim.metrics.accuracy(logits_cat2, tf.to_int64(labels_cat2)))

        # These are streaming metrics which compute the "running" metric,
        # e.g. running accuracy
        metrics_to_values, metrics_to_updates = slim.metrics.aggregate_metric_map(
            {
                'accuracies/accuracy':
                slim.metrics.streaming_accuracy(logits, labels),
                'accuracies/accuracy_cat_1':
                slim.metrics.streaming_accuracy(logits_cat1, labels_cat1),
                'accuracies/accuracy_cat_2':
                slim.metrics.streaming_accuracy(logits_cat2, labels_cat2),
            })

        # Define the streaming summaries to write:
        for metric_name, metric_value in metrics_to_values.items():
            tf.summary.scalar(metric_name, metric_value)

        # Evaluate every 60 seconds
        slim.evaluation.evaluation_loop('',
                                        checkpoint_dir,
                                        log_dir,
                                        num_evals=num_batches,
                                        eval_op=list(
                                            metrics_to_updates.values()),
                                        summary_op=tf.summary.merge_all(),
                                        eval_interval_secs=60,
                                        max_number_of_evaluations=100000000)
Example #12
def train(hps):
    """Training loop."""

    single_gpu_graph = tf.Graph()
    with single_gpu_graph.as_default():

        images, labels = cifar_input.build_input('cifar10',
                                                 FLAGS.train_data_path,
                                                 hps.batch_size, 'train')
        model = resnet_model.ResNet(hps, images, labels, 'train')
        model.build_graph()

        truth = tf.argmax(model.labels, axis=1)
        predictions = tf.argmax(model.predictions, axis=1)
        precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    ########################################################################
    #### FIXME: Get session for distributed environments using Parallax ####
    #### Pass parallax_config as an argument                            ####
    ########################################################################

    parallax_sess, num_workers, worker_id, num_replicas_per_worker = \
          parallax.parallel_run(single_gpu_graph,
                                FLAGS.resource_info_file,
                                sync=FLAGS.sync,
                                parallax_config=parallax_config.build_config())

    for i in range(350000):

        _, global_step, cost, precision_ = \
            parallax_sess.run([model.train_op, model.global_step, model.cost, precision])

        if i % 10 == 0:
            print('step: %d, loss: %.3f, precision: %.3f' %
                  (global_step[0], cost[0], precision_[0]))

            # Tuning learning rate
            train_step = global_step[0]
            if train_step < 10000:
                lrn_rate = 0.1
            elif train_step < 15000:
                lrn_rate = 0.01
            elif train_step < 20000:
                lrn_rate = 0.001
            else:
                lrn_rate = 0.0001
            feed_dict = {model.lrn_rate: []}
            for worker in range(num_replicas_per_worker):
                feed_dict[model.lrn_rate].append(lrn_rate)
            parallax_sess.run(model.global_step, feed_dict=feed_dict)
Example #13
def main(train_dir, batch_size, num_batches, log_dir):

    images, labels = build_input('cifar10', 100, 'train')
    predictions, total_loss = network(images, labels)
    
    report()

    tf.summary.scalar('loss', total_loss)
    predictions = tf.argmax(predictions, axis=1)
    tf.summary.scalar('accuracy', slim.metrics.accuracy(predictions, tf.to_int64(labels)))

    optimizer = tf.train.GradientDescentOptimizer(0.1)
    train_op = slim.learning.create_train_op(total_loss, optimizer, summarize_gradients=True)

    slim.learning.train(train_op, log_dir, save_summaries_secs=20, save_interval_secs=20)
Example #14
def evaluate():
    eval_batch_count = 50  # faster validation, change to 40000/batch_size for the report
    validation_error = 0
    validation_accuracy = 0

    with tf.device('/cpu:0'):
        with tf.Graph().as_default() as g:
            images, labels = cifar_input.build_input(
                'cifar100', '../../cifar/cifar100/validation.bin', batch_size,
                'eval')  #TEST.BIN OR VALIDATION.BIN

            logits = inference(images, NUM_CLASSES=100)
            saver = tf.train.Saver()

            losses = loss(logits, labels)

            accuracies = accuracy(logits, labels)

            sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
            tf.train.start_queue_runners(sess)

            ckpt_state = tf.train.get_checkpoint_state(train_dir)
            saver.restore(sess, ckpt_state.model_checkpoint_path)

            for _ in six.moves.range(eval_batch_count):
                (value_losses, value_accuracy) = sess.run([losses, accuracies])
                validation_error += value_losses
                validation_accuracy += value_accuracy
            validation_error /= eval_batch_count
            validation_accuracy /= eval_batch_count

            step = str(ckpt_state.model_checkpoint_path).split('-')[1]
            tf.logging.info('loss: %.3f, accuracy: %.3f' %
                            (validation_error, validation_accuracy))
            f = open(train_dir + "validation_data.csv", 'a')
            f.write('{0},{1},{2}\n'.format(step, validation_error,
                                           validation_accuracy))
            f.close()
            f = open(train_dir + "log.txt", 'a')
            f.write('loss: {0}, accuracy: {1}\n'.format(
                validation_error, validation_accuracy))
            f.close()
Example #15
    def __init__(self, data, dataset, num_gpus):
        if num_gpus > 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                [str(i) for i in ray.get_gpu_ids()])
        hps = resnet_model.HParams(
            batch_size=128,
            num_classes=100 if dataset == "cifar100" else 10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer="mom",
            num_gpus=num_gpus)

        # We seed each actor differently so that each actor operates on a
        # different subset of data.
        if num_gpus > 0:
            tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
        else:
            # Only a single actor in this case.
            tf.set_random_seed(1)

        input_images = data[0]
        input_labels = data[1]
        with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
            # Build the model.
            images, labels = cifar_input.build_input([input_images,
                                                      input_labels],
                                                     hps.batch_size, dataset,
                                                     False)
            self.model = resnet_model.ResNet(hps, images, labels, "train")
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            self.coord = tf.train.Coordinator()
            tf.train.start_queue_runners(sess, coord=self.coord)
            init = tf.global_variables_initializer()
            sess.run(init)
            self.steps = 10
Example #16
    def __init__(self, data, dataset, num_gpus):
        if num_gpus > 0:
            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
                [str(i) for i in ray.get_gpu_ids()])
        hps = resnet_model.HParams(
            batch_size=128,
            num_classes=100 if dataset == "cifar100" else 10,
            min_lrn_rate=0.0001,
            lrn_rate=0.1,
            num_residual_units=5,
            use_bottleneck=False,
            weight_decay_rate=0.0002,
            relu_leakiness=0.1,
            optimizer="mom",
            num_gpus=num_gpus)

        # We seed each actor differently so that each actor operates on a
        # different subset of data.
        if num_gpus > 0:
            tf.set_random_seed(ray.get_gpu_ids()[0] + 1)
        else:
            # Only a single actor in this case.
            tf.set_random_seed(1)

        with tf.device("/gpu:0" if num_gpus > 0 else "/cpu:0"):
            # Build the model.
            images, labels = cifar_input.build_input(data,
                                                     hps.batch_size, dataset,
                                                     False)
            self.model = resnet_model.ResNet(hps, images, labels, "train")
            self.model.build_graph()
            config = tf.ConfigProto(allow_soft_placement=True)
            config.gpu_options.allow_growth = True
            sess = tf.Session(config=config)
            self.model.variables.set_session(sess)
            init = tf.global_variables_initializer()
            sess.run(init)
            self.steps = 10
Example #17
File: train.py  Project: WeixiZhu94/MNIST
def main(train_dir, batch_size, num_batches, log_dir):

    images, labels = build_input('cifar10', 100, 'train')
    logits, logits_cat1, logits_cat2, loss, loss_cat1, loss_cat2, labels_cat1, labels_cat2 = network(
        images, labels)

    report()

    tf.summary.scalar('losses/loss', loss)
    tf.summary.scalar('losses/loss_cat1', loss_cat1)
    tf.summary.scalar('losses/loss_cat2', loss_cat2)

    logits = tf.argmax(logits, axis=1)
    logits_cat1 = tf.argmax(logits_cat1, axis=1)
    logits_cat2 = tf.argmax(logits_cat2, axis=1)

    tf.summary.scalar('accuracy',
                      slim.metrics.accuracy(logits, tf.to_int64(labels)))
    tf.summary.scalar(
        'accuracy_cat_1',
        slim.metrics.accuracy(logits_cat1, tf.to_int64(labels_cat1)))
    tf.summary.scalar(
        'accuracy_cat_2',
        slim.metrics.accuracy(logits_cat2, tf.to_int64(labels_cat2)))

    optimizer = tf.train.GradientDescentOptimizer(0.1)
    total_loss = loss_cat2
    #total_loss = loss_cat1
    #total_loss = loss
    train_op = slim.learning.create_train_op(total_loss,
                                             optimizer,
                                             summarize_gradients=True)

    slim.learning.train(train_op,
                        log_dir,
                        save_summaries_secs=20,
                        save_interval_secs=20)
Example #18
def evaluate(hps):
    """Eval loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.eval_data_path,
                                             hps.batch_size, FLAGS.mode,
                                             hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            break
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        global_step = ckpt_state.model_checkpoint_path.split('/')[-1].split(
            '-')[-1]
        if not global_step.isdigit():
            global_step = 0
        else:
            global_step = int(global_step)

        total_prediction, correct_prediction, correct_prediction_top5 = 0, 0, 0
        start_time = time.time()
        for _ in six.moves.range(FLAGS.eval_batch_count):
            (summaries, loss, predictions, truth, train_step) = sess.run([
                model.summaries, model.cost, model.predictions, model.labels,
                model.global_step
            ])

            if not FLAGS.time_inference:
                for (indiv_truth, indiv_prediction) in zip(truth, predictions):
                    indiv_truth = np.argmax(indiv_truth)
                    top5_prediction = np.argsort(indiv_prediction)[-5:]
                    top1_prediction = np.argsort(indiv_prediction)[-1]
                    correct_prediction += (indiv_truth == top1_prediction)
                    if indiv_truth in top5_prediction:
                        correct_prediction_top5 += 1
                    total_prediction += 1

        if FLAGS.time_inference:
            print("Time for inference: %.4f" % (time.time() - start_time))
        else:
            precision = 1.0 * correct_prediction / total_prediction
            precision_top5 = 1.0 * correct_prediction_top5 / total_prediction
            best_precision = max(precision, best_precision)

            precision_summ = tf.Summary()
            precision_summ.value.add(tag='Precision', simple_value=precision)
            summary_writer.add_summary(precision_summ, train_step)
            best_precision_summ = tf.Summary()
            best_precision_summ.value.add(tag='Best Precision',
                                          simple_value=best_precision)
            summary_writer.add_summary(best_precision_summ, train_step)
            summary_writer.add_summary(summaries, train_step)
            print('Precision @ 1 = %.4f, Recall @ 5 = %.4f, Global step = %d' %
                  (precision, precision_top5, global_step))
            summary_writer.flush()

        if FLAGS.eval_once:
            break

        time.sleep(60)
Example #19
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = incre_resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    sub_vars_ = 0
    restore_vars = []
    for v in tf.trainable_variables():
        if v.name.find('bn') == -1 and v.name.find('logit') == -1:
            restore_vars.append(v)
            var_ = v  #tf.get_default_graph().get_tensor_by_name(v.name+':0')
            oral_var_ = tf.contrib.framework.load_variable(
                './ckpt_change/', v.name)
            oral_var_ = tf.Variable(oral_var_,
                                    name=v.name.split(':')[0] + '_oral')
            sub_vars_ = sub_vars_ + tf.reduce_sum(
                tf.abs(tf.to_float(var_) - tf.to_float(oral_var_)))
    saver = tf.train.Saver(restore_vars)

    incre_loss = 0.001 * sub_vars_ + model.cost
    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(incre_loss, trainable_variables)
    optimizer = tf.train.MomentumOptimizer(model.lrn_rate, 0.5)
    incre_train_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                               global_step=model.global_step,
                                               name='train_step')
    train_op = [incre_train_op] + model._extra_train_ops
    train_ops = tf.group(*train_op)

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'incre loss': incre_loss,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        saver.restore(
            mon_sess,
            '/home/fuxianya/models/research/resnet/ckpt_change/model.ckpt-16072'
        )
        while not mon_sess.should_stop():
            mon_sess.run(train_ops)
Example #20
def train(hps, res_dict):

    with tf.Graph().as_default():

        with tf.device('/cpu:0'):
            train_images, train_labels = cifar_input.build_input(FLAGS.dataset, FLAGS.train_data_path, 128, 'train')
            test_images, test_labels = cifar_input.build_input(FLAGS.dataset, FLAGS.eval_data_path, 100, 'eval')

        lr = tf.placeholder(tf.float32)
        phase = tf.placeholder(tf.bool) # true for training
        x_input = tf.cond(phase, lambda:train_images, lambda:test_images)
        Y_true = tf.cond(phase, lambda:train_labels, lambda:test_labels)

        loss, precision = vgg16(x_input, Y_true, phase, True)
        tf.summary.scalar('Precision', precision)

        global_step = tf.Variable(0, name='global_step', trainable=False)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            opt = tf.train.MomentumOptimizer(lr, 0.9)
            grads_and_vars = opt.compute_gradients(loss)
            optimizer = opt.apply_gradients(grads_and_vars, global_step=global_step)

        # Add histograms for gradients
        for grad, var in grads_and_vars:
            if grad is not None:
                tf.summary.histogram(var.op.name + '/gradients', grad)

        # Summary op
        summary_op = tf.summary.merge_all()

        # Initializer for the variables
        init = tf.global_variables_initializer()

        # Saver op
        saver = tf.train.Saver()
        writer = tf.summary.FileWriter(FLAGS.res_dir, graph=tf.get_default_graph())

        train_learning_rate = FLAGS.training_lr_init
        TEST_ACCURACY=0.1
        compression_ratio = 1.0
        zero_layers=0
        with tf.Session() as sess:

            if FLAGS.resume:
                # restore the model
                try:
                    ckpt_state = tf.train.get_checkpoint_state(FLAGS.train_dir)
                except tf.errors.OutOfRangeError as e:
                    tf.logging.error('Cannot restore checkpoint: %s', e)

                if not (ckpt_state and ckpt_state.model_checkpoint_path):
                    tf.logging.info('No model to load yet at %s', FLAGS.train_dir)

                tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
                saver.restore(sess, ckpt_state.model_checkpoint_path)
                model_step = ckpt_state.model_checkpoint_path.split('/')[-1].split('-')[-1]
                print("\n Resume model was saved at step {}".format(model_step))
                print('train steps:{}, model step:{}'.format(TRAIN_STEPS, model_step))
                train_learning_rate = FLAGS.resume_lr_init
            else:
                sess.run(init)

            # initialize regularization parameter
            if FLAGS.use_growl | FLAGS.use_group_lasso:
                layer_reg_params, hps = reg_params_init(sess, hps)

            # Start input enqueue threads
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                prev_time = time.clock()
                while not coord.should_stop():
                    step = sess.run(global_step)

                    # check whether to decay learning rate
                    if (step+1) % FLAGS.lr_decay_step == 0:
                        train_learning_rate = train_learning_rate * FLAGS.lr_decay_rate

                    if step % BATCH_PER_EPOCH == 0:
                        summary_extend = tf.Summary()
                        _, summary, train_loss, train_accuracy = sess.run([optimizer, summary_op, loss, precision],
                                                                           feed_dict={lr:train_learning_rate, phase:True})
                        summary_extend.ParseFromString(summary)
                        summary_extend.value.add(tag='testing accuracy', simple_value=TEST_ACCURACY)
                        summary_extend.value.add(tag='compression', simple_value=compression_ratio)
                        writer.add_summary(summary_extend, step)
                        print('step: {}, lr_rate: {}, train_loss: {:.4f}, train_accuracy: {:.4f}'.format(step, train_learning_rate, train_loss, train_accuracy))
                    else:
                        sess.run([optimizer], feed_dict={lr:train_learning_rate,phase:True})

                    # save the model and evaluate
                    if (step % FLAGS.checkpoint_freq == 0 and step != 0) or (step == TRAIN_STEPS):
                        print('Checkpoint! Now saving model...')
                        saver.save(sess, FLAGS.train_dir, global_step=step)

                    if step % FLAGS.eval_freq == 0:
                        current_time = time.clock()
                        test_loss = 0
                        test_accuracy = 0
                        for i in range(EVAL_BATCHES):
                            test_img_vals = sess.run(test_images)
                            test_label_vals = sess.run(test_labels)
                            test_loss_i, test_accur_i = sess.run([loss, precision], feed_dict={phase:False})
                            test_loss += test_loss_i
                            test_accuracy += test_accur_i

                        test_loss = test_loss / EVAL_BATCHES
                        TEST_ACCURACY = test_accuracy = test_accuracy / EVAL_BATCHES
                        res_dict['test_accur_arr'].append(test_accuracy)
                        res_dict['training_accur_arr'].append(train_accuracy)
                        res_dict['steps'].append(step)
                        batch_time = (current_time-prev_time)/FLAGS.checkpoint_freq
                        print('    TEST_ACCURACY: {:.4f}, 1 batch takes: {:.4f}'.format(test_accuracy, batch_time))
                        prev_time = current_time

                    # apply proximal gradient update, and update the mask
                    if (step % REG_APPLY_FREQ==0) and (step>0) and FLAGS.use_sparse_reg:
                        apply_reg_prox(sess, train_learning_rate, layer_reg_params, hps)

                        # update mask
                        zero_layers, layer_ID = update_mask(sess, FLAGS.mask_threshold, hps, res_dict, step)
                        compression_ratio = measure_compression(sess, res_dict, step, True, hps)

                        if zero_layers >= 1:
                            print("There exists zero value layers at step:{0}, layers IDs:{1}".format(step, layer_ID))
                            coord.request_stop()

                    if ((step >= TRAIN_STEPS) and FLAGS.retrain_on) or ((step % FLAGS.display_similarity_freq==0) and step>1):
                        print("Get the group information! \n")
                        group_info, num_clusters_arr = display_similarity(sess, FLAGS.num_training_epochs, hps, res_dict)
                        np.save(FLAGS.train_dir + 'group_info.npy', group_info)
                        np.save(FLAGS.train_dir + 'num_clusters_arr.npy', num_clusters_arr)

                    if step >= TRAIN_STEPS:
                        coord.request_stop()

            except tf.errors.OutOfRangeError:
                np.save(FLAGS.res_dir + 'res_dict.npy', res_dict)
                print('Done training')
            finally:
                coord.request_stop()

            coord.join(threads)
            sess.close()

            return zero_layers, step
Example #21
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 40000:
                self._lrn_rate = 0.1
            elif train_step < 60000:
                self._lrn_rate = 0.01
            elif train_step < 80000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
Example #22
def evaluate(hps):
    """Eval loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.eval_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = resnet_model.ResNet(hps, FLAGS.mode)
    global_step = tf.train.get_or_create_global_step()
    model.build_graph(images, labels, True)
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    while True:
        try:
            ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
        except tf.errors.OutOfRangeError as e:
            tf.logging.error('Cannot restore checkpoint: %s', e)
            continue
        if not (ckpt_state and ckpt_state.model_checkpoint_path):
            tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
            continue
        tf.logging.info('Loading checkpoint %s',
                        ckpt_state.model_checkpoint_path)
        saver.restore(sess, ckpt_state.model_checkpoint_path)

        vars_ = {}
        for v in tf.trainable_variables():
            vars_[v.name] = v

        total_prediction, correct_prediction, total_preds = 0, 0, 0
        for _ in six.moves.range(FLAGS.eval_batch_count):
            cost, pred, logits = model.forward_prob(images, labels, vars_)
            tmp = tf.argmax(labels, axis=1)
            preds = tf.argmax(pred, axis=1)
            co_pred = tf.reduce_mean(tf.to_float(tf.equal(preds, tmp)))
            total_preds += co_pred

            (summaries, loss, predictions, truth, train_step,
             total_preds) = sess.run([
                 model.summaries, model.cost, model.predictions, labels,
                 global_step, total_preds
             ])

            truth = np.argmax(truth, axis=1)
            predictions = np.argmax(predictions, axis=1)
            correct_prediction += np.sum(truth == predictions)
            total_prediction += predictions.shape[0]

        t_preds = total_preds / FLAGS.eval_batch_count
        precision = 1.0 * correct_prediction / total_prediction
        best_precision = max(precision, best_precision)

        precision_summ = tf.Summary()
        precision_summ.value.add(tag='Precision', simple_value=precision)
        summary_writer.add_summary(precision_summ, train_step)
        best_precision_summ = tf.Summary()
        best_precision_summ.value.add(tag='Best Precision',
                                      simple_value=best_precision)
        summary_writer.add_summary(best_precision_summ, train_step)
        summary_writer.add_summary(summaries, train_step)
        tf.logging.info(
            'loss: %.3f, precision: %.3f, best precision: %.3f, t preds: %.3f'
            % (loss, precision, best_precision, t_preds))
        summary_writer.flush()

        if FLAGS.eval_once:
            break

        time.sleep(60)
Example #23
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode)
    model = Net(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write("total_params: %d\n" % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    labeled_examples = tf.greater(tf.reduce_max(model.labels, axis=1),
                                  tf.zeros([hps.batch_size, 1]))
    labeled_examples = tf.cast(labeled_examples, tf.float32)
    correct_predictions = tf.cast(tf.equal(predictions, truth), tf.float32)
    correct_predictions = tf.multiply(correct_predictions, labeled_examples)
    precision = tf.reduce_sum(correct_predictions) / tf.reduce_sum(
        labeled_examples)

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar("Precision", precision)]),
    )

    logging_hook = tf.train.LoggingTensorHook(
        tensors={
            "step": model.global_step,
            "loss": model.cost,
            "wmc": model.wmc,
            "cross_entropy": model.cross_entropy,
            "precision": precision,
        },
        every_n_iter=100,
    )

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step, feed_dict={model.lrn_rate: self._lrn_rate})

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 10000:
                self._lrn_rate = 0.1
            elif train_step < 20000:
                self._lrn_rate = 0.05
            elif train_step < 35000:
                self._lrn_rate = 0.01
            else:
                self._lrn_rate = 0.001

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True),
    ) as mon_sess:
        while not mon_sess.should_stop():
            mon_sess.run(model.train_op)
Example #24
def evaluate(hps):
  # Build the input data (starts reader queue runners)
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  # Build the residual network model
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  # Saver for the model variables
  saver = tf.train.Saver()
  # Summary file writer
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)
  
  # Create the session
  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  
  # Start all queue runners
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    # Check for checkpoint files
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      continue
  
    # Load the model data (produced during training)
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Run evaluation batch by batch
    total_prediction, correct_prediction = 0, 0
    for _ in six.moves.range(FLAGS.eval_batch_count):
      # Run prediction
      (loss, predictions, truth, train_step) = sess.run(
          [model.cost, model.predictions,
           model.labels, model.global_step])
      # Accumulate prediction results
      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]

    # Compute precision
    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    # Add the precision summary
    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    
    # Add the best-precision summary
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    
    # Add the evaluation summaries
    #summary_writer.add_summary(summaries, train_step)
    
    # Log the results
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    
    # Flush summaries to disk
    summary_writer.flush()

    if FLAGS.eval_once:
      break

    time.sleep(60)
Example #25
def train(hps):
  # Build the input data (starts reader queue runners)
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  # Build the residual network model
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  # Compute prediction precision
  truth = tf.argmax(model.labels, axis=1)
  predictions = tf.argmax(model.predictions, axis=1)
  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  # Create the summary saver hook; it writes every 100 steps
  summary_hook = tf.train.SummarySaverHook(
              save_steps=100,
              output_dir=FLAGS.train_dir,
              summary_op=tf.summary.merge(
                              [model.summaries,
                               tf.summary.scalar('Precision', precision)]))
  # Create the logging hook; it logs every 100 steps
  logging_hook = tf.train.LoggingTensorHook(
      tensors={'step': model.global_step,
               'loss': model.cost,
               'precision': precision},
      every_n_iter=100)

  # Learning-rate setter based on the global step
  class _LearningRateSetterHook(tf.train.SessionRunHook):

    def begin(self):
      # Initial learning rate
      self._lrn_rate = 0.1

    def before_run(self, run_context):
      return tf.train.SessionRunArgs(
          model.global_step,  # Ask for the global step value.
          feed_dict={model.lrn_rate: self._lrn_rate})  # Set the learning rate.

    def after_run(self, run_context, run_values):
      # Dynamically update the learning rate
      train_step = run_values.results
      if train_step < 40000:
        self._lrn_rate = 0.1
      elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  # Create the monitored training session
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=FLAGS.log_root,
      hooks=[logging_hook, _LearningRateSetterHook()],
      chief_only_hooks=[summary_hook],
      # Disable the default SummarySaverHook by setting save_summaries_steps to 0
      save_summaries_steps=0, 
      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    while not mon_sess.should_stop():
      # Run the training op
      mon_sess.run(model.train_op)
Example #26
def main(_):
    config_initialization()
    images, labels = cifar_input.build_input(FLAGS.dataset, FLAGS.train_data_path, FLAGS.batch_size, mode='train')
    train_op, train_step_fn = create_train_op(images, labels)
    train(train_op, train_step_fn)
Example #27
def main(_):
    inputs, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.eval_data_path,
                                             FLAGS.batch_size, 'eval')
    is_training = True

    with slim.arg_scope(nets.resnet_v2.resnet_arg_scope()):
        net, endpoints = nets.resnet_v2.resnet_v2_101(inputs,
                                                      num_classes=None,
                                                      is_training=is_training)
    with tf.variable_scope('Logits'):
        net = tf.squeeze(net, axis=[1, 2])
        net = slim.dropout(net, keep_prob=0.5, scope='scope')
        logits = slim.fully_connected(net,
                                      num_outputs=FLAGS.num_classes,
                                      activation_fn=None,
                                      scope='fc')

    # Selectively restore variables
    checkpoint_exclude_scopes = 'Logits'
    exclusions = None
    if checkpoint_exclude_scopes:
        exclusions = [
            scope.strip() for scope in checkpoint_exclude_scopes.split(',')
        ]
    variables_to_restore = []
    for var in slim.get_model_variables():
        excluded = False
        for exclusion in exclusions:
            if var.op.name.startswith(exclusion):
                excluded = True
                break
        if not excluded:
            variables_to_restore.append(var)

    logits = tf.nn.softmax(logits)
    classes = tf.argmax(logits, axis=1, name='classes')
    accuracy = tf.reduce_mean(
        tf.cast(tf.equal(tf.cast(classes, dtype=tf.int32), labels),
                dtype=tf.float32))

    # Get the latest checkpoint
    ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_model_path)
    saver_restore = tf.train.Saver()

    with tf.Session() as sess:
        # Restore the trained model
        saver_restore.restore(sess, ckpt.model_checkpoint_path)

        # Start the input queues
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        correct_prediction = 0

        for i in range(FLAGS.num_steps):
            correct_prediction += sess.run(accuracy)

        # Report the evaluation result
        precision = correct_prediction / FLAGS.num_steps
        validate_log = 'Validation precision: {:.4f}'.format(precision)
        print(validate_log)
        # Stop the queues
        coord.request_stop()
        coord.join(threads)
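
The variables_to_restore list built above is never handed to a Saver, so the restore call still loads every model variable. A minimal sketch of how such a list is usually consumed when fine-tuning (the excluded 'Logits' scope stays at its fresh initialization); the dummy variable below only stands in for the collected model variables and is not part of the example.

import tensorflow as tf

# Hypothetical stand-in for the variables collected in the example above.
dummy_var = tf.Variable(0.0, name='dummy')
variables_to_restore = [dummy_var]

# Passing an explicit var_list makes the Saver skip the excluded scope.
saver_restore = tf.train.Saver(var_list=variables_to_restore)
# saver_restore.restore(sess, ckpt.model_checkpoint_path) would then leave
# the freshly initialized 'Logits' variables untouched.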
Example #28
def train(hps):
    """Training loop."""
    class_loss = []
    images1, labels1 = cifar_input.build_input(
        FLAGS.dataset, '/home/fuxianya/data/bin/train_batch', hps.batch_size,
        FLAGS.mode)
    model = resnet_model.ResNet(hps, FLAGS.mode)
    model.build_graph(images1, labels1, True)

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(labels1, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision_o = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))
    loss_o = model.cost

    vars_ = {}
    for v in tf.trainable_variables():
        #if v.name.find('bn')==-1:
        #  print(v.name)
        vars_[v.name] = v
    cost, pred, logits = model.forward_prob(images1, labels1, vars_, True)
    inner_grad = tf.gradients(cost, list(vars_.values()))
    inner_grad = [tf.stop_gradient(grad) for grad in inner_grad]
    inner_grad_dict = dict(zip(vars_.keys(), inner_grad))
    new_vars = dict(
        zip(vars_.keys(), [
            vars_[key] - model.lrn_rate * inner_grad_dict[key]
            for key in vars_.keys()
        ]))

    class_preds = []
    costb = []
    for i in range(0, 10):
        class_image, class_label = cifar_input.build_input(
            FLAGS.dataset, FLAGS.train_data_path + '_' + str(i),
            hps.batch_size, FLAGS.mode)

        cost1, pred, _ = model.forward_prob(class_image, class_label, new_vars,
                                            True)
        costb.append(cost1)
        tmp = tf.argmax(class_label, axis=1)
        preds = tf.argmax(pred, axis=1)
        co_pred = tf.reduce_mean(tf.to_float(tf.equal(preds, tmp)))
        class_preds.append(co_pred)

    meta_loss = tf.to_float(0.5) * tf.reduce_sum(costb) / tf.to_float(
        10) + tf.to_float(0.5) * loss_o
    #meta_loss = tf.reduce_mean(costb, 0, keep_dims=True)
    #meta_optimizer = tf.train.AdamOptimizer(model.lrn_rate).minimize(meta_loss, global_step=global_step)
    trainable_variables = tf.trainable_variables()
    grads = tf.gradients(meta_loss, trainable_variables)
    optimizer = tf.train.MomentumOptimizer(model.lrn_rate, 0.9)
    meta_train_op = optimizer.apply_gradients(zip(grads, trainable_variables),
                                              global_step=model.global_step,
                                              name='train_step')
    train_op = [meta_train_op] + model.extra_train_ops
    train_ops = tf.group(*train_op)
    total_accs = tf.reduce_sum(class_preds) / tf.to_float(10)
    '''
  gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.666)
  config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)
  sess = tf.Session(config=config)
  sess.run(tf.global_variables_initializer())
  for i in range(2000):
      sess.run(meta_train_op)
      l, p, tl, tp = sess.run([loss_o, precision_o, meta_loss, total_accs])
      print('epoch:%d, loss_0:%f, acc_0:%f, loss:%f, acc:%f' % (i, l, p, tl, tp))
      saver.save(sess, 'ckpt/model.ckpt', global_step=i+1)

  '''
    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', total_accs)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss_o': loss_o,
        'precision_o': precision_o,
        'total precision': total_accs,
        'total losses': meta_loss
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < 2000:
                self._lrn_rate = 0.1
            elif train_step < 4000:
                self._lrn_rate = 0.01
            elif train_step < 6000:
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    epoch = 0
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True,
                                  gpu_options=gpu_options)) as mon_sess:
        while not mon_sess.should_stop() and epoch < 4000:
            #mon_sess.run(meta_optimizer)
            #mon_sess.run([meta_loss, meta_train_op])
            #mon_sess.run(meta_train_op)
            mon_sess.run(train_ops)
            epoch = epoch + 1
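
The training above follows a MAML-style one-step lookahead: it takes one SGD step on the training loss to obtain new_vars, evaluates the per-class losses at those lookahead weights, and mixes their mean with the original loss (weighted 0.5/0.5) to form meta_loss. A framework-free toy sketch of that update pattern, with placeholder quadratic losses rather than the ResNet costs:

import numpy as np

# Toy stand-ins for the batch losses; not the ResNet costs from the example.
def loss(w, target):
    return 0.5 * np.sum((w - target) ** 2)

def grad(w, target):
    return w - target

w = np.zeros(3)                      # current weights (vars_)
lrn_rate = 0.1
train_target = np.ones(3)

inner_grad = grad(w, train_target)   # gradient of the training loss
new_w = w - lrn_rate * inner_grad    # lookahead weights (new_vars)

class_targets = [np.full(3, float(c)) for c in range(10)]
class_losses = [loss(new_w, t) for t in class_targets]
meta_loss = 0.5 * np.mean(class_losses) + 0.5 * loss(w, train_target)
print(meta_loss)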
Example #29
def train(hps):
    """Training loop."""
    images, labels = cifar_input.build_input(FLAGS.dataset,
                                             FLAGS.train_data_path,
                                             hps.batch_size, FLAGS.mode,
                                             hps.data_format)
    model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=FLAGS.train_dir,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    num_steps_per_epoch = 391  # TODO: Don't hardcode this.

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.01

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if train_step < num_steps_per_epoch:
                self._lrn_rate = 0.01
            elif train_step < (91 * num_steps_per_epoch):
                self._lrn_rate = 0.1
            elif train_step < (136 * num_steps_per_epoch):
                self._lrn_rate = 0.01
            elif train_step < (181 * num_steps_per_epoch):
                self._lrn_rate = 0.001
            else:
                self._lrn_rate = 0.0001

    class _SaverHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self.saver = tf.train.Saver(max_to_keep=10000)
            subprocess.call("rm -rf %s; mkdir -p %s" %
                            (FLAGS.checkpoint_dir, FLAGS.checkpoint_dir),
                            shell=True)
            self.f = open(os.path.join(FLAGS.checkpoint_dir, "times.log"), 'w')

        def after_create_session(self, sess, coord):
            self.sess = sess
            self.start_time = time.time()

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step  # Asks for global step value.
            )

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            epoch = train_step / num_steps_per_epoch
            if train_step % num_steps_per_epoch == 0:
                end_time = time.time()
                directory = os.path.join(FLAGS.checkpoint_dir,
                                         ("%5d" % epoch).replace(' ', '0'))
                subprocess.call("mkdir -p %s" % directory, shell=True)
                ckpt_name = 'model.ckpt'
                self.saver.save(self.sess,
                                os.path.join(directory, ckpt_name),
                                global_step=train_step)
                self.f.write("Step: %d\tTime: %s\n" %
                             (train_step, end_time - self.start_time))
                print("Saved checkpoint after %d epoch(s) to %s..." %
                      (epoch, directory))
                sys.stdout.flush()
                self.start_time = time.time()

        def end(self, sess):
            self.f.close()

    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.log_root,
            hooks=[logging_hook, _LearningRateSetterHook()],
            chief_only_hooks=[summary_hook, _SaverHook()],
            save_checkpoint_secs=None,
            # Since we provide a SummarySaverHook, we need to disable the default
            # SummarySaverHook. To do that we set both save_summaries_steps and
            # save_summaries_secs to None.
            save_summaries_steps=None,
            save_summaries_secs=None,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        for i in range(num_steps_per_epoch * 181):
            mon_sess.run(model.train_op)
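
The hard-coded num_steps_per_epoch = 391 above corresponds to the 50,000 CIFAR-10 training images divided by the batch size of 128, rounded up. A small sketch of how the TODO could be resolved, assuming those two quantities are available:

import math

num_train_examples = 50000  # CIFAR-10 training-set size (assumed)
batch_size = 128            # assumed to match hps.batch_size in the example
num_steps_per_epoch = int(math.ceil(num_train_examples / float(batch_size)))
print(num_steps_per_epoch)  # 391, matching the hard-coded value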
Example #30
File: main.py Project: frhrdr/pixeldp
def train(hps, model, dir_name=None):
    """Training loop."""
    if dir_name is None:
        dir_name = FLAGS.data_dir + "/" + FLAGS.model_dir

    if FLAGS.dataset == 'mnist':
        mnist = tf.contrib.learn.datasets.load_dataset("mnist")
        dataset = mnist.train
        images = tf.placeholder(tf.float32, [hps.batch_size, 784],
                                name='x-input')
        labels = tf.placeholder(tf.int64, [hps.batch_size], name='y-input')
    elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
        images, labels = cifar_input.build_input(FLAGS.dataset,
                                                 FLAGS.eval_data_path,
                                                 hps.batch_size,
                                                 hps.image_standardization,
                                                 FLAGS.mode)
    model = model.Model(hps, images, labels, FLAGS.mode)
    model.build_graph()

    param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.
        TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
    sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

    truth = tf.argmax(model.labels, axis=1)
    predictions = tf.argmax(model.predictions, axis=1)
    one_hot_preds = tf.one_hot(predictions,
                               depth=hps.num_classes,
                               dtype=tf.float32)
    votes = tf.reshape(one_hot_preds,
                       [hps.n_draws, hps.batch_size, hps.num_classes])
    predictions = tf.argmax(tf.reduce_sum(votes, axis=0), axis=1)
    precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

    summary_hook = tf.train.SummarySaverHook(
        save_steps=100,
        output_dir=dir_name,
        summary_op=tf.summary.merge(
            [model.summaries,
             tf.summary.scalar('Precision', precision)]))

    logging_hook = tf.train.LoggingTensorHook(tensors={
        'step': model.global_step,
        'loss': model.cost,
        'precision': precision
    },
                                              every_n_iter=100)

    class _LearningRateSetterHook(tf.train.SessionRunHook):
        """Sets learning_rate based on global step."""
        def begin(self):
            self._lrn_rate = 0.1
            self._schedule = list(zip(hps.lrn_rte_changes, hps.lrn_rte_vals))

        def before_run(self, run_context):
            return tf.train.SessionRunArgs(
                model.global_step,  # Asks for global step value.
                feed_dict={model.lrn_rate:
                           self._lrn_rate})  # Sets learning rate

        def after_run(self, run_context, run_values):
            train_step = run_values.results
            if len(self._schedule) > 0 and train_step >= self._schedule[0][0]:
                # Update learning rate according to the schedule.
                self._lrn_rate = self._schedule[0][1]
                self._schedule = self._schedule[1:]

    print("START TRAINING")
    steps = 0
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=dir_name,
            hooks=[
                logging_hook,
                _LearningRateSetterHook(),
                tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
            ],
            chief_only_hooks=[summary_hook],
            # Since we provide a SummarySaverHook, we need to disable default
            # SummarySaverHook. To do that we set save_summaries_steps to 0.
            save_summaries_steps=0,
            config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
        while not mon_sess.should_stop():
            s = 1.0 - min(0.99975**steps, 0.9)
            if s > 0.9: s = 1.0  # this triggers around 10k steps

            if FLAGS.dataset == 'mnist':
                xs, ys = dataset.next_batch(hps.batch_size, fake_data=False)
                args = {
                    model.noise_scale: s,
                    model._images: xs,
                    model._labels: ys
                }
            elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
                args = {model.noise_scale: s}

            mon_sess.run(model.train_op, args)
            steps += 1
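
The noise-scale ramp in the loop above starts at s = 0.1 (the min(...) term is clamped at 0.9) and then grows as 1 - 0.99975**steps; the s > 0.9 clamp snaps it to 1.0 once 0.99975**steps drops below 0.1. A quick check that this matches the "around 10k steps" remark in the code:

import math

# Step count at which 0.99975**steps falls below 0.1, i.e. where s exceeds 0.9.
steps_to_clamp = math.log(0.1) / math.log(0.99975)
print(int(steps_to_clamp))  # roughly 9209 steps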
Example #31
def train(hps):
  """Training loop."""
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  param_stats = tf.contrib.tfprof.model_analyzer.print_model_analysis(
      tf.get_default_graph(),
      tfprof_options=tf.contrib.tfprof.model_analyzer.
          TRAINABLE_VARS_PARAMS_STAT_OPTIONS)
  sys.stdout.write('total_params: %d\n' % param_stats.total_parameters)

  tf.contrib.tfprof.model_analyzer.print_model_analysis(
      tf.get_default_graph(),
      tfprof_options=tf.contrib.tfprof.model_analyzer.FLOAT_OPS_OPTIONS)

  truth = tf.argmax(model.labels, axis=1)
  predictions = tf.argmax(model.predictions, axis=1)
  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  summary_hook = tf.train.SummarySaverHook(
      save_steps=100,
      output_dir=FLAGS.train_dir,
      summary_op=[model.summaries,
                  tf.summary.scalar('Precision', precision)])

  logging_hook = tf.train.LoggingTensorHook(
      tensors={'step': model.global_step,
               'loss': model.cost,
               'precision': precision},
      every_n_iter=100)

  class _LearningRateSetterHook(tf.train.SessionRunHook):
    """Sets learning_rate based on global step."""

    def begin(self):
      self._lrn_rate = 0.1

    def before_run(self, run_context):
      return tf.train.SessionRunArgs(
          model.global_step,  # Asks for global step value.
          feed_dict={model.lrn_rate: self._lrn_rate})  # Sets learning rate

    def after_run(self, run_context, run_values):
      train_step = run_values.results
      if train_step < 40000:
        self._lrn_rate = 0.1
      elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=FLAGS.log_root,
      hooks=[logging_hook, _LearningRateSetterHook()],
      chief_only_hooks=[summary_hook],
      # Since we provide a SummarySaverHook, we need to disable default
      # SummarySaverHook. To do that we set save_summaries_steps to 0.
      save_summaries_steps=0,
      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    while not mon_sess.should_stop():
      mon_sess.run(model.train_op)
Example #32
File: main.py Project: frhrdr/pixeldp
def evaluate(hps, model, dir_name=None, rerun=False):
    """Evaluate the ResNet and log prediction counters to compute
    sensitivity."""
    if dir_name is None:
        dir_name = FLAGS.data_dir + "/" + FLAGS.model_dir

    if os.path.isfile(dir_name + "/eval_data.json") and not rerun:
        # run only new models
        return

    if FLAGS.dataset == 'mnist':
        mnist = tf.contrib.learn.datasets.load_dataset("mnist")
        dataset = mnist.test
        images = tf.placeholder(tf.float32, [hps.batch_size, 784],
                                name='x-input')
        labels = tf.placeholder(tf.int64, [hps.batch_size], name='y-input')
    elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
        images, labels = cifar_input.build_input(FLAGS.dataset,
                                                 FLAGS.eval_data_path,
                                                 hps.batch_size,
                                                 hps.image_standardization,
                                                 FLAGS.mode)
    model = model.Model(hps, images, labels, FLAGS.mode)
    model.build_graph()
    saver = tf.train.Saver()
    summary_writer = tf.summary.FileWriter(dir_name)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    tf.train.start_queue_runners(sess)

    best_precision = 0.0
    try:
        ckpt_state = tf.train.get_checkpoint_state(dir_name)
    except tf.errors.OutOfRangeError as e:
        tf.logging.error('Cannot restore checkpoint: %s', e)
        return

    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Make predictions on the dataset, keep the label distribution
    data = {
        'predictions': [],
        'pred_truth': [],
    }
    total_prediction, correct_prediction = 0, 0
    eval_data_size = FLAGS.eval_data_size
    eval_batch_size = hps.batch_size
    eval_batch_count = int(eval_data_size / eval_batch_size)
    for i in six.moves.range(eval_batch_count):
        if FLAGS.dataset == 'mnist':
            xs, ys = dataset.next_batch(hps.batch_size, fake_data=False)
            args = {
                model.noise_scale: 1.0,
                model._images: xs,
                model._labels: ys
            }
        elif FLAGS.dataset == 'cifar10' or FLAGS.dataset == 'cifar100':
            args = {model.noise_scale: 1.0}

        (
            summaries,
            loss,
            predictions,
            truth,
            train_step,
        ) = sess.run([
            model.summaries,
            model.cost,
            model.predictions,
            model.labels,
            model.global_step,
        ], args)

        print("Done: {}/{}".format(eval_batch_size * i, eval_data_size))
        truth = np.argmax(truth, axis=1)[:hps.batch_size]
        prediction_votes = np.zeros([hps.batch_size, hps.num_classes])
        predictions = np.argmax(predictions, axis=1)
        for i in range(hps.n_draws):
            for j in range(hps.batch_size):
                prediction_votes[j, predictions[i * hps.batch_size + j]] += 1
        predictions = np.argmax(prediction_votes, axis=1)

        data['predictions'] += prediction_votes.tolist()
        data['pred_truth'] += (truth == predictions).tolist()

        print("{} / {}".format(np.sum(truth == predictions), len(predictions)))

        correct_prediction += np.sum(truth == predictions)
        total_prediction += predictions.shape[0]

        current_precision = 1.0 * correct_prediction / total_prediction
        print(current_precision)
        print()

    # For Parseval, get the true sensitivity and use it to rescale the actual
    # attack bound, as the noise assumes a sensitivity of 1 but often it is not.
    if hps.noise_scheme == 'l2_l2_s1':
        # Parseval updates usually have a sensitivity higher than 1
        # despite the projection: we need to rescale when computing
        # sensitivity.
        sensitivity_multiplier = float(
            sess.run(model.sensitivity_multiplier, {model.noise_scale: 1.0}))
    else:
        sensitivity_multiplier = 1.0
    with open(dir_name + "/sensitivity_multiplier.json", 'w') as f:
        d = [sensitivity_multiplier]
        f.write(json.dumps(d))

    # Compute robustness and add it to the eval data.
    dp_mechs = {
        'l2_l2_s1': 'gaussian',
        'l1_l2_s1': 'gaussian',
        'l1_l1_s1': 'laplace',
        'l1_l1': 'laplace',
        'l1_l2': 'gaussian',
        'l2': 'gaussian',
        'l1': 'laplace',
    }
    robustness = [
        utils.robustness_size(counts=x,
                              dp_attack_size=hps.attack_norm_bound,
                              dp_epsilon=1.0,
                              dp_delta=0.05,
                              dp_mechanism=dp_mechs[hps.noise_scheme]) /
        sensitivity_multiplier for x in data['predictions']
    ]
    data['robustness'] = robustness
    data['sensitivity_mult_used'] = sensitivity_multiplier

    # Log eval data
    with open(dir_name + "/eval_data.json", 'w') as f:
        f.write(json.dumps(data))

    # Print stuff
    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    precision_summ = tf.Summary()
    precision_summ.value.add(tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(tag='Best Precision',
                                  simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    summary_writer.add_summary(summaries, train_step)
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    summary_writer.flush()
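
The vote-counting loop in the evaluation above turns n_draws noisy forward passes into one prediction per image: each draw casts a vote for its predicted class and the class with the most votes wins. A self-contained sketch of that aggregation with made-up shapes and values:

import numpy as np

n_draws, batch_size, num_classes = 3, 4, 10        # illustrative sizes only
per_draw_preds = np.random.randint(0, num_classes,
                                   size=n_draws * batch_size)

votes = np.zeros([batch_size, num_classes])
for d in range(n_draws):
    for j in range(batch_size):
        votes[j, per_draw_preds[d * batch_size + j]] += 1
final_preds = np.argmax(votes, axis=1)              # majority vote per image
print(final_preds)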
Example #33
def train(hps):
  # Build the input data (input-queue runners)
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.train_data_path, hps.batch_size, FLAGS.mode)
  # Build the ResNet model
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()

  # Compute the prediction precision
  truth = tf.argmax(model.labels, axis=1)
  predictions = tf.argmax(model.predictions, axis=1)
  precision = tf.reduce_mean(tf.to_float(tf.equal(predictions, truth)))

  # Summary saver hook: write summaries every 100 steps
  summary_hook = tf.train.SummarySaverHook(
              save_steps=100,
              output_dir=FLAGS.train_dir,
              summary_op=tf.summary.merge(
                              [model.summaries,
                               tf.summary.scalar('Precision', precision)]))
  # Logging hook: print every 100 steps
  logging_hook = tf.train.LoggingTensorHook(
      tensors={'step': model.global_step,
               'loss': model.cost,
               'precision': precision},
      every_n_iter=100)

  # Learning-rate setter hook, driven by the global step
  class _LearningRateSetterHook(tf.train.SessionRunHook):

    def begin(self):
      # Initial learning rate
      self._lrn_rate = 0.1

    def before_run(self, run_context):
      return tf.train.SessionRunArgs(
                      # Ask for the global step value
                      model.global_step,
                      # Set the learning rate for this run
                      feed_dict={model.lrn_rate: self._lrn_rate})

    def after_run(self, run_context, run_values):
      # Update the learning rate on a schedule
      train_step = run_values.results
      if train_step < 40000:
        self._lrn_rate = 0.1
      elif train_step < 60000:
        self._lrn_rate = 0.01
      elif train_step < 80000:
        self._lrn_rate = 0.001
      else:
        self._lrn_rate = 0.0001

  # Create the monitored training session
  with tf.train.MonitoredTrainingSession(
      checkpoint_dir=FLAGS.log_root,
      hooks=[logging_hook, _LearningRateSetterHook()],
      chief_only_hooks=[summary_hook],
      # Disable the default SummarySaverHook by setting save_summaries_steps to 0
      save_summaries_steps=0, 
      config=tf.ConfigProto(allow_soft_placement=True)) as mon_sess:
    while not mon_sess.should_stop():
      # Run one training step
      mon_sess.run(model.train_op)
Example #34
def evaluate(hps):
  # Build the input data (input-queue runners)
  images, labels = cifar_input.build_input(
      FLAGS.dataset, FLAGS.eval_data_path, hps.batch_size, FLAGS.mode)
  # Build the ResNet model
  model = resnet_model.ResNet(hps, images, labels, FLAGS.mode)
  model.build_graph()
  # Saver for the model variables
  saver = tf.train.Saver()
  # Summary file writer
  summary_writer = tf.summary.FileWriter(FLAGS.eval_dir)
  
  # Create the session
  sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
  
  # Start all queue runners
  tf.train.start_queue_runners(sess)

  best_precision = 0.0
  while True:
    # Check for a checkpoint file
    try:
      ckpt_state = tf.train.get_checkpoint_state(FLAGS.log_root)
    except tf.errors.OutOfRangeError as e:
      tf.logging.error('Cannot restore checkpoint: %s', e)
      continue
    if not (ckpt_state and ckpt_state.model_checkpoint_path):
      tf.logging.info('No model to eval yet at %s', FLAGS.log_root)
      continue
  
    # Load the model data produced during training
    tf.logging.info('Loading checkpoint %s', ckpt_state.model_checkpoint_path)
    saver.restore(sess, ckpt_state.model_checkpoint_path)

    # Evaluate batch by batch
    total_prediction, correct_prediction = 0, 0
    for _ in six.moves.range(FLAGS.eval_batch_count):
      # Run the predictions
      (loss, predictions, truth, train_step) = sess.run(
          [model.cost, model.predictions,
           model.labels, model.global_step])
      # Compute the predicted classes
      truth = np.argmax(truth, axis=1)
      predictions = np.argmax(predictions, axis=1)
      correct_prediction += np.sum(truth == predictions)
      total_prediction += predictions.shape[0]

    # Compute the precision
    precision = 1.0 * correct_prediction / total_prediction
    best_precision = max(precision, best_precision)

    # Add a precision summary
    precision_summ = tf.Summary()
    precision_summ.value.add(
        tag='Precision', simple_value=precision)
    summary_writer.add_summary(precision_summ, train_step)
    
    # Add a best-precision summary
    best_precision_summ = tf.Summary()
    best_precision_summ.value.add(
        tag='Best Precision', simple_value=best_precision)
    summary_writer.add_summary(best_precision_summ, train_step)
    
    # Add the test summaries
    #summary_writer.add_summary(summaries, train_step)
    
    # Log the results
    tf.logging.info('loss: %.3f, precision: %.3f, best precision: %.3f' %
                    (loss, precision, best_precision))
    
    # Flush summaries to disk
    summary_writer.flush()

    if FLAGS.eval_once:
      break

    time.sleep(60)
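
The evaluate(hps) loop above reads everything else it needs from FLAGS. A minimal sketch of the flag definitions it assumes; the names follow the example, while the paths and defaults are placeholders.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('dataset', 'cifar10', 'cifar10 or cifar100.')
tf.app.flags.DEFINE_string('mode', 'eval', 'train or eval.')
tf.app.flags.DEFINE_string('eval_data_path', '/tmp/cifar10/test_batch.bin',
                           'Path to the evaluation data.')
tf.app.flags.DEFINE_string('eval_dir', '/tmp/resnet/eval',
                           'Directory to write evaluation summaries.')
tf.app.flags.DEFINE_string('log_root', '/tmp/resnet',
                           'Directory holding the training checkpoints.')
tf.app.flags.DEFINE_integer('eval_batch_count', 50,
                            'Number of batches per evaluation pass.')
tf.app.flags.DEFINE_bool('eval_once', False,
                         'Evaluate once instead of looping.')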
Example #35
def get_model(hps, dataset, train_data_path, mode='train'):
    images, labels = cifar_input.build_input(dataset, train_data_path,
                                             hps.batch_size, mode)
    model = resnet_model.ResNet(hps, images, labels, mode)
    model.build_graph()
    return model
Example #36
def train(args):
    """
        Complete training and validation script. Additionally, saves inference model, trained weights and summaries.

        Parameters
        ----------
        args : argparse.parser object
            contains all necessary command line arguments

        Returns
        -------
    """

    if not args.resume:
        os.system("rm -rf %s" % args.save_path)
        os.system("mkdir -p %s" % args.save_path)
    else:
        print('Resuming training')

    num_classes = 10
    batch_size = 128

    # Define and build the network graph
    net = ResNet(num_classes, strides=[[1, 1], [1, 1], [2, 2], [2, 2]])

    # Parse the csv files and define input ops for training and validation I/O
    print('Loading data from {}'.format(args.data_dir))
    x_train, y_train = reader.build_input(
        'cifar10', os.path.join(args.data_dir, 'data_batch*'), batch_size,
        'train')

    # Define training metrics and optimisation ops
    train_net = net(x_train)
    train_logits_ = train_net['logits']
    train_pred_ = train_net['y_']
    train_truth_ = y_train

    train_acc_ = tf.reduce_mean(
        tf.cast(
            tf.equal(tf.cast(train_truth_, tf.int32),
                     tf.cast(train_pred_, tf.int32)), tf.float32))
    modules.scalar_summary(train_acc_,
                           'train/acc',
                           collections=['losses', 'metrics'])

    ce = modules.sparse_crossentropy(train_logits_,
                                     train_truth_,
                                     name='train/loss',
                                     collections=['losses', 'training'])
    l2 = modules.l2_regularization(net.get_variables(tf.GraphKeys.WEIGHTS),
                                   0.0002,
                                   name='train/l2',
                                   collections=['training', 'regularization'])
    train_loss_ = ce + l2

    lr_placeholder = tf.placeholder(tf.float32)
    train_op_ = tf.train.MomentumOptimizer(lr_placeholder,
                                           0.9).minimize(train_loss_)

    train_summaries = tf.summary.merge([
        tf.summary.merge_all('training'),
    ] + [
        tf.summary.histogram(var.name, var)
        for var in net.get_variables(tf.GraphKeys.MOVING_AVERAGE_VARIABLES)
    ])

    if args.run_validation:
        X_test, Y_test = reader.build_input(
            'cifar10', os.path.join(args.data_dir, 'test_batch*'), 100, 'eval')
        # Define validation outputs
        val_net = net(X_test, is_training=False)
        val_logits_ = val_net['logits']
        val_pred_ = val_net['y_']
        val_truth_ = Y_test

        val_loss_ = modules.sparse_crossentropy(
            val_logits_, val_truth_, collections=['losses', 'validation'])
        val_acc_ = tf.reduce_mean(
            tf.cast(
                tf.equal(tf.cast(val_truth_, tf.int32),
                         tf.cast(val_pred_, tf.int32)), tf.float32))

    # Define and setup a training supervisor
    global_step = tf.Variable(0, name='global_step', trainable=False)
    sv = tf.train.Supervisor(logdir=args.save_path,
                             is_chief=True,
                             summary_op=None,
                             save_summaries_secs=tps.save_summary_sec,
                             save_model_secs=tps.save_model_sec,
                             global_step=global_step)

    s = sv.prepare_or_wait_for_session(config=tf.ConfigProto())

    # Main training loop
    step = s.run(global_step) if args.resume else 0
    while not sv.should_stop():

        if step < 40000:
            lr = 0.1
        elif step < 60000:
            lr = 0.01
        elif step < 80000:
            lr = 0.001
        else:
            lr = 0.0001

        # Run the training op
        _ = s.run(train_op_, feed_dict={lr_placeholder: lr})

        # Evaluation of training and validation data
        if step % tps.steps_eval == 0:
            (train_loss, train_acc, train_pred, train_truth, t_sum) = s.run([
                train_loss_, train_acc_, train_pred_, train_truth_,
                train_summaries
            ])
            sv.summary_computed(s, t_sum, global_step=step)

            print("\nEval step= {:d}".format(step))
            print("Train: Loss= {:.6f}; Acc {:.6f}".format(
                train_loss, train_acc))

            # Evaluate all validation data
            if args.run_validation:
                all_loss = []
                all_acc = []
                for _ in range(50):
                    (val_loss, val_pred, val_truth, val_acc) = s.run(
                        [val_loss_, val_pred_, val_truth_, val_acc_])

                    all_loss.append(val_loss)
                    all_acc.append(val_acc)

                mean_loss = np.mean(all_loss, axis=0)
                mean_acc = np.mean(all_acc, axis=0)

                sv.summary_computed(s,
                                    modules.scalar_summary(
                                        mean_loss, 'val/loss'),
                                    global_step=step)
                sv.summary_computed(s,
                                    modules.scalar_summary(
                                        mean_acc, 'val/acc'),
                                    global_step=step)

                print("Valid: Loss= {:.6f}; Acc {:.6f}".format(
                    mean_loss, mean_acc))

        # Stopping condition
        if step >= tps.max_steps and tps.max_steps > 0:
            print('Run %d steps of %d steps - stopping now' %
                  (step, tps.max_steps))
            break

        step += 1
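
The train(args) script above expects an argparse namespace with at least data_dir, save_path, resume, and run_validation; the tps object it also reads (steps_eval, max_steps, and the summary/model save intervals) comes from the source project and is not defined in this excerpt. A minimal sketch of the command-line interface, with placeholder defaults:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', default='/tmp/cifar10',
                    help='Directory containing the CIFAR-10 batch files.')
parser.add_argument('--save_path', default='/tmp/resnet_run',
                    help='Where checkpoints and summaries are written.')
parser.add_argument('--resume', action='store_true',
                    help='Resume training from an existing save_path.')
parser.add_argument('--run_validation', action='store_true',
                    help='Periodically evaluate on the test batches.')
args = parser.parse_args([])  # empty list: use the placeholder defaults
# train(args) would then be invoked with this namespace.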