Пример #1
0
def imagenet_input(is_training):
  """Builds the batched ImageNet input pipeline.

  Selects the 'train' or 'validation' split, reads single examples through a
  slim DatasetDataProvider, runs the 'mobilenet_v1' preprocessing function on
  each image and groups the results into batches of FLAGS.batch_size.

  Args:
    is_training: bool; when truthy the 'train' split is read with shuffling,
      otherwise the 'validation' split is read without shuffling.

  Returns:
    A batch of images and labels.
  """
  split_name = 'train' if is_training else 'validation'
  dataset = dataset_factory.get_dataset('imagenet', split_name,
                                        FLAGS.dataset_dir)

  batch_size = FLAGS.batch_size
  data_provider = slim.dataset_data_provider.DatasetDataProvider(
      dataset,
      shuffle=is_training,
      common_queue_capacity=2 * batch_size,
      common_queue_min=batch_size)
  raw_image, raw_label = data_provider.get(['image', 'label'])

  preprocess = preprocessing_factory.get_preprocessing(
      'mobilenet_v1', is_training=is_training)
  side = FLAGS.image_size
  processed_image = preprocess(raw_image, side, side)

  # Four threads fill a queue that holds up to five batches.
  batched_images, batched_labels = tf.train.batch(
      tensors=[processed_image, raw_label],
      batch_size=batch_size,
      num_threads=4,
      capacity=5 * batch_size)
  return batched_images, batched_labels
Пример #2
0
def config_initialization():
    """Validate flags, set up logging and global config, return the dataset.

    Reads everything from FLAGS: initialises the file logger and the shared
    `config` module for training, records the batch sizes as summaries,
    sets the process name and finally loads the requested dataset split.

    Returns:
        The dataset object produced by dataset_factory.get_dataset.

    Raises:
        ValueError: if --dataset_dir is empty.
    """
    # image shape and feature layers shape inference
    height_width = (FLAGS.train_image_height, FLAGS.train_image_width)

    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    log_name = 'log_train_seglink_%d_%d.log' % height_width
    util.init_logger(log_file=log_name,
                     log_path=FLAGS.train_dir,
                     stdout=False,
                     mode='a')

    config.init_config(
        height_width,
        batch_size=FLAGS.batch_size,
        weight_decay=FLAGS.weight_decay,
        num_gpus=FLAGS.num_gpus,
        train_with_ignored=FLAGS.train_with_ignored,
        seg_loc_loss_weight=FLAGS.seg_loc_loss_weight,
        link_cls_loss_weight=FLAGS.link_cls_loss_weight,
    )

    # Export the batch sizes held by `config` after init_config as summaries.
    total_batch, per_gpu_batch = config.batch_size, config.batch_size_per_gpu
    tf.summary.scalar('batch_size', total_batch)
    tf.summary.scalar('batch_size_per_gpu', per_gpu_batch)

    process_name = FLAGS.model_name + '_' + FLAGS.dataset_name
    util.proc.set_proc_name(process_name)

    loaded = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
    config.print_config(FLAGS, loaded)
    return loaded
Пример #3
0
def main():
  """Builds the input pipeline and returns the contents of one batch.

  Uses the module-level settings (dataset_name, dataset_dir, batch_size,
  num_readers, ...) to read examples, preprocess them, batch them and push
  them through a prefetch queue; then evaluates a single dequeued batch in a
  fresh session.

  Returns:
    A tuple (image_data, label_data, fp_data) with the evaluated images,
    one-hot labels and file paths of one batch.

  Raises:
    ValueError: if `dataset_dir` is empty.
  """
  with tf.Graph().as_default():
    if not dataset_dir:
      raise ValueError('You must supply the dataset directory with --dataset_dir')

    deploy_config = model_deploy.DeploymentConfig(
        num_clones=num_clones,
        clone_on_cpu=clone_on_cpu,
        replica_id=task,
        num_replicas=worker_replicas,
        num_ps_tasks=num_ps_tasks)

    dataset = dataset_factory.get_dataset(
        dataset_name, dataset_split_name, dataset_dir)

    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=True)

    with tf.device(deploy_config.inputs_device()):
      with tf.name_scope('inputs'):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=num_readers,
            common_queue_capacity=20 * batch_size,
            common_queue_min=10 * batch_size)
        [image, label, fp] = provider.get(['image', 'label', 'filepath'])
        label -= labels_offset

    train_image_size = 224

    image = image_preprocessing_fn(image, train_image_size,
                                   train_image_size)

    images, labels, fps = tf.train.batch(
        [image, label, fp],
        batch_size=batch_size,
        num_threads=num_preprocessing_threads,
        capacity=5 * batch_size)
    # NOTE(review): tf.image_summary is the pre-1.0 spelling; later releases
    # expose tf.summary.image instead -- confirm the targeted TF version.
    tf.image_summary('image', images, max_images=5)
    labels = slim.one_hot_encoding(
        labels, dataset.num_classes - labels_offset)
    batch_queue = slim.prefetch_queue.prefetch_queue(
        [images, labels, fps], capacity=2 * deploy_config.num_clones)

    images, labels, fps = batch_queue.dequeue()

    # The context manager closes the session, and the finally clause stops
    # the queue-runner threads, even when evaluating the batch raises.
    with tf.Session() as sess:
      coord = tf.train.Coordinator()
      threads = tf.train.start_queue_runners(sess, coord)
      try:
        image_data, label_data, fp_data = sess.run([images, labels, fps])
      finally:
        coord.request_stop()
        coord.join(threads)
    return image_data, label_data, fp_data
Пример #4
0
def config_initialization():
    """Check flags, initialise the shared config and load the dataset.

    Returns:
        The dataset object produced by dataset_factory.get_dataset for
        FLAGS.dataset_name / FLAGS.dataset_split_name / FLAGS.dataset_dir.

    Raises:
        ValueError: if --dataset_dir is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    tf.logging.set_verbosity(tf.logging.DEBUG)

    # image shape and feature layers shape inference
    height_width = (FLAGS.train_image_height, FLAGS.train_image_width)
    config.init_config(height_width, batch_size=FLAGS.batch_size)

    process_name = FLAGS.model_name + '_' + FLAGS.dataset_name
    util.proc.set_proc_name(process_name)

    # config.print_config(FLAGS, dataset) is intentionally left disabled here.
    return dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
Пример #5
0
def main(_):
  """Builds the selected network on a placeholder and saves its GraphDef.

  The graph is fed by a float32 placeholder named 'input' of shape
  [1, size, size, 3] and is written in serialized binary form to
  FLAGS.output_file.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if --output_file is empty.
  """
  if not FLAGS.output_file:
    raise ValueError('You must supply the path to save to with --output_file')
  tf.logging.set_verbosity(tf.logging.INFO)

  with tf.Graph().as_default() as graph:
    # The dataset is only consulted for its number of classes.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
    class_count = dataset.num_classes - FLAGS.labels_offset
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=class_count,
        is_training=FLAGS.is_training)

    # Fall back to the network's own default when no size flag was given.
    side = FLAGS.image_size or network_fn.default_image_size
    input_tensor = tf.placeholder(
        name='input', dtype=tf.float32, shape=[1, side, side, 3])
    network_fn(input_tensor)

    exported = graph.as_graph_def()
    with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
      out_file.write(exported.SerializeToString())
Пример #6
0
def config_initialization():
    """Validate flags, initialise the evaluation config, return the dataset.

    The shared `config` module is initialised with a fixed batch size of 1
    and the confidence thresholds / loss weights taken from FLAGS.

    Returns:
        The dataset object produced by dataset_factory.get_dataset.

    Raises:
        ValueError: if --dataset_dir is empty.
    """
    # image shape and feature layers shape inference
    eval_shape = (FLAGS.eval_image_height, FLAGS.eval_image_width)

    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    tf.logging.set_verbosity(tf.logging.DEBUG)

    init_options = dict(
        batch_size=1,
        seg_conf_threshold=FLAGS.seg_conf_threshold,
        link_conf_threshold=FLAGS.link_conf_threshold,
        train_with_ignored=FLAGS.train_with_ignored,
        seg_loc_loss_weight=FLAGS.seg_loc_loss_weight,
        link_cls_loss_weight=FLAGS.link_cls_loss_weight,
    )
    config.init_config(eval_shape, **init_options)

    process_name = 'eval_' + FLAGS.model_name + '_' + FLAGS.dataset_name
    util.proc.set_proc_name(process_name)

    loaded = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)
    config.print_config(FLAGS, loaded, print_to_file=False)

    return loaded
Пример #7
0
def main(_):
  """Builds the selected network on a placeholder and saves its GraphDef.

  The placeholder is [batch, size, size, 3] for image models and
  [batch, num_frames, size, size, 3] for video models. Depending on
  FLAGS.write_text_graphdef the graph is written as text via
  tf.io.write_graph or as a serialized binary file.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if --output_file is empty, or if --is_video_model is set
      without --num_frames.
  """
  if not FLAGS.output_file:
    raise ValueError('You must supply the path to save to with --output_file')
  if FLAGS.is_video_model and not FLAGS.num_frames:
    raise ValueError(
        'Number of frames must be specified for video models with --num_frames')
  tf.logging.set_verbosity(tf.logging.INFO)

  with tf.Graph().as_default() as graph:
    # The dataset is only consulted for its number of classes.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, 'train', FLAGS.dataset_dir)
    class_count = dataset.num_classes - FLAGS.labels_offset
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=class_count,
        is_training=FLAGS.is_training)

    side = FLAGS.image_size or network_fn.default_image_size
    frame_dims = [side, side, 3]
    if FLAGS.is_video_model:
      input_shape = [FLAGS.batch_size, FLAGS.num_frames] + frame_dims
    else:
      input_shape = [FLAGS.batch_size] + frame_dims

    input_tensor = tf.placeholder(
        name='input', dtype=tf.float32, shape=input_shape)
    network_fn(input_tensor)

    if FLAGS.quantize:
      tf.contrib.quantize.create_eval_graph()

    exported = graph.as_graph_def()
    if FLAGS.write_text_graphdef:
      # os.path.split yields the same (dirname, basename) pair.
      out_dir, out_name = os.path.split(FLAGS.output_file)
      tf.io.write_graph(exported, out_dir, out_name, as_text=True)
    else:
      with gfile.GFile(FLAGS.output_file, 'wb') as out_file:
        out_file.write(exported.SerializeToString())
Пример #8
0
def main(_):
  """Evaluates one checkpoint of the selected model on a dataset split.

  Builds the evaluation graph (dataset provider, preprocessing, batching,
  network, streaming metrics) from FLAGS and runs a single evaluation pass
  with slim.evaluation.evaluate_once.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if --dataset_dir is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ####################
    # Select the model #
    ####################
    # Pair the i-th --number_hashing_functions entry with the i-th
    # --neuron_vector_length entry, both converted to int.
    n_hash = FLAGS.number_hashing_functions
    L_vec = FLAGS.neuron_vector_length
    quant_params = [[int(n), int(L_vec[i])] for i, n in enumerate(n_hash)]

    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        quant_params=quant_params, is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.compat.v1.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      # Restore the moving-average shadow values in place of the raw weights.
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(input=logits, axis=1)
    labels = tf.squeeze(labels)

    # Define the metrics. Every entry must be a (value_op, update_op) pair;
    # the 'Recall_5' entry used to be the bare tuple (logits, labels, 5)
    # with the metric call missing.
    names_to_values, names_to_updates = aggregate_metric_map({
        'Accuracy': tf.compat.v1.metrics.accuracy(labels, predictions),
        'Recall_5': tf.compat.v1.metrics.recall_at_k(
            tf.cast(labels, tf.int64), logits, 5),
    })

    # Print the summaries to screen.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.compat.v1.summary.scalar(summary_name, value, collections=[])
      op = tf.compat.v1.Print(op, [value], summary_name)
      tf.compat.v1.add_to_collection(tf.compat.v1.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    # --checkpoint_path may be a directory (use its latest checkpoint) or a
    # concrete checkpoint file.
    if tf.io.gfile.isdir(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.compat.v1.logging.info('Evaluating %s' % checkpoint_path)

    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        session_config=config,
        variables_to_restore=variables_to_restore)
Пример #9
0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf

from datasets import dataset_factory
from deployment import model_deploy
from nets import nets_factory
from preprocessing import preprocessing_factory

slim = tf.contrib.slim

# Dataset selection and reader settings for this script.
dataset_name = 'tianchi'
dataset_split_name = 'train'
dataset_dir = '/home/fangsh/tianchi/tianchi_dataset/tfrecord'
# Previously misspelled as `batcg_size`, which left `batch_size` undefined
# where the queue sizes are computed below.
batch_size = 32


dataset = dataset_factory.get_dataset(
    dataset_name, dataset_split_name, dataset_dir)

# Read single examples with four parallel readers; the shared queue holds
# between 10 and 20 batches worth of examples.
provider = slim.dataset_data_provider.DatasetDataProvider(
    dataset,
    num_readers=4,
    common_queue_capacity=20 * batch_size,
    common_queue_min=10 * batch_size)
[image, label] = provider.get(['image_raw', 'label'])
Пример #10
0
def main(_):
    """Builds the SSD training graph and runs it with slim.learning.train.

    All configuration is read from FLAGS. The function creates the
    deployment config and global step, the dataset and SSD network, the
    queue-based input pipeline, the model clones with their losses, the
    summaries, the optional moving averages and the optimizer, and then
    hands the resulting train op to slim.learning.train.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if --dataset_dir is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Config model_deploy. Keep TF Slim Models structure.
        # Useful if want to need multiple GPUs and/or servers in the future.
        # Replica/parameter-server settings are fixed to a single local worker.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Select the dataset.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, shape, glabels, gbboxes] = provider.get(['image', 'shape',
                                                             'object/label',
                                                             'object/bbox'])
            # Pre-processing image, labels and bboxes.
            image, glabels, gbboxes = \
                image_preprocessing_fn(image, glabels, gbboxes,
                                       out_shape=ssd_shape,
                                       data_format=DATA_FORMAT)
            # Encode groundtruth labels and bboxes.
            gclasses, glocalisations, gscores = \
                ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
            # Group sizes for tf_utils.reshape_list: 1 image, then
            # len(ssd_anchors) entries for each of classes, localisations and
            # scores (presumably one per anchor layer -- TODO confirm).
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # Training batches and queue.
            r = tf.train.batch(
                tf_utils.reshape_list([image, gclasses, glocalisations, gscores]),
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)

            # Intermediate queueing: unique batch computation pipeline for all
            # GPUs running the training.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list([b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        def clone_fn(batch_queue):
            """Builds one model clone: dequeues a batch, runs the SSD network
            on it and registers the SSD losses.

            Returns the network's end_points."""
            # Dequeue batch.
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                predictions, localisations, logits, end_points = \
                    ssd_net.net(b_image, is_training=True)
            # Add loss function.
            ssd_net.losses(logits, localisations,
                           b_gclasses, b_glocalisations, b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =================================================================== #
        # Add summaries from first clone.
        # =================================================================== #
        clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                            tf.nn.zero_fraction(x)))
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(FLAGS,
                                                             dataset.num_samples,
                                                             global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Compute the total loss and the gradients over all clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        # Evaluating train_tensor runs every update op and yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # registered under the first clone's scope.
        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master='',
            is_chief=True,
            init_fn=tf_utils.get_init_fn(FLAGS),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            sync_optimizer=None)
def main():
    """Restores a checkpoint and dumps per-class softmax scores to a CSV file.

    Overrides several FLAGS with hard-coded paths/names, builds the
    evaluation input pipeline and network, restores the latest checkpoint
    and runs a fixed number of batches. For every example, one CSV row
    [label, class_index (1-based), score] is written per scored class to
    'out_b_19_face.csv'.

    Raises:
        ValueError: if FLAGS.dataset_dir is empty.
    """
    MODEL_NAME = 'inception_resnet_v2'

    # Where the training (fine-tuned) checkpoint and logs will be saved to.
    TRAIN_DIR = 'D:/pig_recognize/pig_slim1/flowers-models/inception_resnet_v2/all'

    # Where the dataset is saved to.
    DATASET_DIR = 'D:/pig_recognize/pig_slim1/cifar10'

    # NOTE(review): the batch count and the number of scored classes are
    # hard-coded rather than derived from the dataset -- confirm they match
    # the data being evaluated.
    num_eval_batches = 60
    num_scored_classes = 30

    FLAGS.checkpoint_path = TRAIN_DIR
    FLAGS.eval_dir = TRAIN_DIR
    FLAGS.dataset_name = 'cifar10'
    FLAGS.dataset_split_name = 'test'
    FLAGS.dataset_dir = DATASET_DIR
    FLAGS.model_name = MODEL_NAME
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

        image1 = image_preprocessing_fn(image, eval_image_size,
                                        eval_image_size)

        images, labels = tf.train.batch(
            [image1, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)
        soft_result = tf.nn.softmax(logits)

        if FLAGS.moving_average_decay:
            # Restore the moving-average shadow values instead of raw weights.
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        labels = tf.squeeze(labels)

        # FLAGS.checkpoint_path may be a directory (use its latest
        # checkpoint) or a concrete checkpoint file.
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        restorer = tf.train.Saver(variables_to_restore)

        rows = []
        with tf.Session() as sess:
            restorer.restore(sess, checkpoint_path)
            print('Model restored.')
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)
            try:
                for i in range(num_eval_batches):
                    soft_result_, labels_ = sess.run([soft_result, labels])
                    for m in range(FLAGS.batch_size):
                        for n in range(num_scored_classes):
                            rows.append([
                                labels_[m],
                                str(n + 1),
                                '%.8f' % soft_result_[m, n],
                            ])
                    print(i)
            finally:
                # Always stop the input threads, even if a step raised.
                coord.request_stop()
                coord.join(threads)

        with open('out_b_19_face.csv', 'w', newline='') as csvfile:
            csv.writer(csvfile).writerows(rows)
def main(_):
  """Builds the training graph from FLAGS and runs slim.learning.train on it.

  All configuration is read from the module-level FLAGS object. The function
  selects the dataset, network and preprocessing through the project
  factories, batches the input, creates the model clones, and wires up
  summaries, optional moving averages, the optimizer, optional gradient
  clipping, and (when FLAGS.iter_size != 1) gradient accumulation over
  several steps.

  Side effects visible here: sets os.environ['CUDA_VISIBLE_DEVICES'], may
  overwrite FLAGS.num_clones, and assigns the module-level name
  `end_points_debug`.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if FLAGS.dataset_dir is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  # Restrict the visible GPUs to the ones listed in FLAGS.gpus. A num_clones
  # of -1 is replaced by the number of comma-separated entries in that list.
  os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpus
  if FLAGS.num_clones == -1:
    FLAGS.num_clones = len(FLAGS.gpus.split(','))

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # tf.set_random_seed(42)
    tf.set_random_seed(0)
    ######################
    # Config model_deploy#
    ######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    # FLAGS.dataset_dir is a comma-separated string; it is passed on as a list.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name,
        FLAGS.dataset_dir.split(','),
        dataset_list_dir=FLAGS.dataset_list_dir,
        num_samples=FLAGS.frames_per_video,
        modality=FLAGS.modality,
        split_id=FLAGS.split_id)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        batch_size=FLAGS.batch_size,
        weight_decay=FLAGS.weight_decay,
        is_training=True,
        dropout_keep_prob=(1.0-FLAGS.dropout),
        pooled_dropout_keep_prob=(1.0-FLAGS.pooled_dropout),
        batch_norm=FLAGS.netvlad_batch_norm)

    #####################################
    # Select the preprocessing function #
    #####################################
    # Fall back to the model name when no preprocessing name is given.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=True)  # in case of pooling images,
                           # now preprocessing is done video-level

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=FLAGS.num_readers,
        common_queue_capacity=20 * FLAGS.batch_size,
        common_queue_min=10 * FLAGS.batch_size,
        bgr_flips=FLAGS.bgr_flip)
      [image, label] = provider.get(['image', 'label'])
      # now note that the above image might be a 23 channel image if you have
      # both RGB and flow streams. It will need to split later, but all the
      # preprocessing will be done consistently for all frames over all streams
      # The provider yields the label as a string; convert it to a scalar
      # int32 before applying the label offset.
      label = tf.string_to_number(label, tf.int32)
      label.set_shape(())
      label -= FLAGS.labels_offset

      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      # NOTE(review): the trailing comma on the next line makes scale_ratios a
      # 1-tuple wrapping the list of floats, not the list itself. This looks
      # like a leftover from a keyword argument -- confirm what the
      # preprocessing function expects before changing it.
      scale_ratios=[float(el) for el in FLAGS.scale_ratios.split(',')],
      image = image_preprocessing_fn(image, train_image_size,
                                     train_image_size,
                                     scale_ratios=scale_ratios,
                                     out_dim_scale=FLAGS.out_dim_scale,
                                     model_name=FLAGS.model_name)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      if FLAGS.debug:
        # Print the labels of each batch whenever `images` is evaluated.
        images = tf.Print(images, [labels], 'Read batch')
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)
      summarize_images(images, provider.num_channels_stream)

    ####################
    # Define the model #
    ####################
    # conv_endpoint is only forwarded to the network when the flag is set.
    kwargs = {}
    if FLAGS.conv_endpoint is not None:
      kwargs['conv_endpoint'] = FLAGS.conv_endpoint
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(
          images, pool_type=FLAGS.pooling,
          classifier_type=FLAGS.classifier_type,
          num_channels_stream=provider.num_channels_stream,
          netvlad_centers=FLAGS.netvlad_initCenters.split(','),
          stream_pool_type=FLAGS.stream_pool_type,
          **kwargs)

      #############################
      # Specify the loss function #
      #############################
      # The auxiliary head, when the network has one, is weighted 0.4 against
      # 1.0 for the main logits.
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weight=0.4, scope='aux_loss')
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weight=1.0)
      return end_points

    # Gather initial summaries.
    # (Taken before the clones exist, so this is the set of summaries created
    # by the input pipeline above.)
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    # A copy of the first clone's end points, plus the input batch, is stored
    # in the module-level end_points_debug; it is not read again in this
    # function (presumably consumed by train_step -- TODO confirm).
    global end_points_debug
    end_points = clones[0].outputs
    end_points_debug = dict(end_points)
    end_points_debug['images'] = images
    end_points_debug['labels'] = labels
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.histogram_summary('activations/' + end_point, x))
      summaries.add(tf.scalar_summary('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.histogram_summary(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.scalar_summary('learning_rate', learning_rate,
                                      name='learning_rate'))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables,
          replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
          total_num_replicas=FLAGS.worker_replicas)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()
    logging.info('Training the following variables: %s' % (
      ' '.join([el.name for el in variables_to_train])))

    #  and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)

    # clip the gradients if needed
    if FLAGS.clip_gradients > 0:
      logging.info('Clipping gradients by %f' % FLAGS.clip_gradients)
      with tf.name_scope('clip_gradients'):
        clones_gradients = slim.learning.clip_gradient_norms(
            clones_gradients,
            FLAGS.clip_gradients)

    # Add total_loss to summary.
    summaries.add(tf.scalar_summary('total_loss', total_loss,
                                    name='total_loss'))

    # Create gradient updates.
    # train_ops ends up as a single tensor when iter_size == 1, and otherwise
    # as a dict mapping the sub-iteration index (0 .. iter_size - 1) to the op
    # to run at that sub-iteration.
    train_ops = {}
    if FLAGS.iter_size == 1:
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                        name='train_op')
      train_ops = train_tensor
    else:
      # Gradient accumulation: keep one local variable per gradient and sum
      # the gradients of iter_size consecutive runs into it before applying.
      gvs = [(grad, var) for grad, var in clones_gradients]
      varnames = [var.name for grad, var in gvs]
      varname_to_var = {var.name: var for grad, var in gvs}
      varname_to_grad = {var.name: grad for grad, var in gvs}
      varname_to_ref_grad = {}
      for vn in varnames:
        grad = varname_to_grad[vn]
        print("accumulating ... ", (vn, grad.get_shape()))
        with tf.variable_scope("ref_grad"):
          with tf.device(deploy_config.variables_device()):
            # vn[:-2] drops the last two characters of the variable name
            # (presumably the ':0' output suffix -- TODO confirm).
            ref_var = slim.local_variable(
                np.zeros(grad.get_shape(),dtype=np.float32),
                name=vn[:-2])
            varname_to_ref_grad[vn] = ref_var

      # assign overwrites the accumulator with the current gradient;
      # assign_add adds the current gradient to it.
      all_assign_ref_op = [ref.assign(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()]
      all_assign_add_ref_op = [ref.assign_add(varname_to_grad[vn]) for vn, ref in varname_to_ref_grad.items()]
      assign_gradients_ref_op = tf.group(*all_assign_ref_op)
      accmulate_gradients_op = tf.group(*all_assign_add_ref_op)
      # Ops created below depend on the accumulate op, so running train_tensor
      # first adds the current gradient and then applies the accumulated
      # gradient divided by iter_size.
      with tf.control_dependencies([accmulate_gradients_op]):
        final_gvs = [(varname_to_ref_grad[var.name] / float(FLAGS.iter_size), var) for grad, var in gvs]
        apply_gradients_op = optimizer.apply_gradients(final_gvs, global_step=global_step)
        update_ops.append(apply_gradients_op)
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
            total_loss, name='train_op')
      # Schedule: index 0 resets the accumulators, the middle indices
      # accumulate, and the last index accumulates and applies.
      for i in range(FLAGS.iter_size):
        if i == 0:
          train_ops[i] = assign_gradients_ref_op
        elif i < FLAGS.iter_size - 1:  # because apply_gradients also computes
                                       # (see control_dependency), so
                                       # no need of running an extra iteration
          train_ops[i] = accmulate_gradients_op
        else:
          train_ops[i] = train_tensor


    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.merge_summary(list(summaries), name='summary_op')

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.intra_op_parallelism_threads = FLAGS.cpu_threads
    # config.allow_soft_placement = True
    # config.gpu_options.per_process_gpu_memory_fraction=0.7

    ###########################
    # Kicks off the training. #
    ###########################
    logging.info('RUNNING ON SPLIT %d' % FLAGS.split_id)
    # train_step is defined elsewhere in this file; presumably it selects the
    # entry of train_ops for the current sub-iteration -- verify against its
    # definition.
    slim.learning.train(
        train_ops,
        train_step_fn=train_step,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None,
        session_config=config)
def main(_):
    """Builds a training graph with pruning hooks and runs slim.learning.train.

    All configuration is read from the module-level FLAGS object. On top of
    the usual dataset / network / optimizer wiring, the gradients are passed
    through apply_pruning_to_grad before being applied, and training uses a
    Saver subclass that calls apply_pruning_to_var after every restore. The
    pruning helpers are defined elsewhere in this file.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if FLAGS.dataset_dir is empty.
    """
    ###add for pruning
    # A session is opened with a per-process GPU memory fraction of 0.9 for
    # the "vgg" model and 0.3 otherwise. sessGPU is not referenced again in
    # this function and is never closed.
    # NOTE(review): presumably the session exists only to fix the GPU memory
    # cap for the process -- confirm before removing or closing it.
    if FLAGS.model_name == "vgg":
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=0.9)  #add by lzlu
        sessGPU = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    else:
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=0.3)  #add by lzlu
        sessGPU = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    print("FLAGS.model_name:", FLAGS.model_name)
    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth=True
    #sessGPU = tf.Session(config=config)
    #sessGPU = tf.Session(config=tf.ConfigProto(log_device_placement=True))
    #sess = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
    # Echo the flags that matter for a pruning run.
    print("FLAGS.max_number_of_steps:", FLAGS.max_number_of_steps)
    print("FLAGS.learning_rate:", FLAGS.learning_rate)
    print("FLAGS.weight_decay:", FLAGS.weight_decay)
    print("FLAGS.batch_size:", FLAGS.batch_size)
    print("FLAGS.trainable_scopes:", FLAGS.trainable_scopes)
    print("FLAGS.pruning_rates:", FLAGS.pruning_rates)
    print("FLAGS.train_dir:", FLAGS.train_dir)
    print("FLAGS.checkpoint_path:", FLAGS.checkpoint_path)
    print("FLAGS.pruning_gradient_update_ratio:",
          FLAGS.pruning_gradient_update_ratio)
    ###
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)
        print("deploy_config.variables_device():")
        print(deploy_config.variables_device())
        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        # Fall back to the model name when no preprocessing name is given.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            # The dequeue op is placed on the inputs device; the network
            # itself is built outside that device scope.
            with tf.device(deploy_config.inputs_device()):
                images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            # The auxiliary head, when the network has one, is weighted 0.4
            # against 1.0 for the main logits.
            if 'AuxLogits' in end_points:
                tf.losses.softmax_cross_entropy(
                    logits=end_points['AuxLogits'],
                    onehot_labels=labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=0.4,
                    scope='aux_loss')
            tf.losses.softmax_cross_entropy(
                logits=logits,
                onehot_labels=labels,
                label_smoothing=FLAGS.label_smoothing,
                weights=1.0)
            return end_points

        # Gather initial summaries.
        # (Taken before the clones exist.)
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))
            ##add for pruning
            # The value logged under 'pruning_rate/' is the fraction of
            # non-zero entries of the variable (1 - zero fraction).
            summaries.add(
                tf.summary.scalar('pruning_rate/' + variable.op.name,
                                  1 - tf.nn.zero_fraction(variable)))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        print("deploy_config.optimizer_device():")
        print(deploy_config.optimizer_device())
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        #  and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)

        ###add by lzlu
        # Print a summary of the model variables, then route the gradients
        # through the pruning helpers (defined elsewhere in this file;
        # presumably they mask the gradients of pruned weights -- TODO confirm).
        variables = tf.model_variables()
        slim.model_analyzer.analyze_vars(variables, print_info=True)
        ##print("variables_to_train:",variables_to_train)
        ##print("clones_gradients_before_pruning:",clones_gradients)
        variables_to_pruning = get_variables_to_pruning()
        pruningMask = get_pruning_mask(variables_to_pruning)
        ##print("pruningMask__grad:",pruningMask)
        ##print("My_variables_to_pruning__grad:",variables_to_pruning)
        clones_gradients = apply_pruning_to_grad(clones_gradients, pruningMask)
        ##print("clones_gradients_after_pruning:",clones_gradients)
        ##print("slim.get_model_variables():",slim.get_model_variables())
        ###

        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)

        update_ops.append(grad_updates)

        # Evaluating train_tensor returns total_loss after all update ops
        # (including the gradient step) have run.
        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        ### add for pruning
        #######################
        # Config mySaver      #
        #######################
        class mySaver(tf.train.Saver):
            """Saver that calls apply_pruning_to_var after each restore."""

            def restore(self, sess, save_path):
                """Restores the checkpoint, then applies pruning to the variables."""
                ##print("mySaver--restore...!")
                tf.train.Saver.restore(self, sess, save_path)
                variables_to_pruning = get_variables_to_pruning()
                ##print("My_variables_to_pruning__restore:",variables_to_pruning)
                # The return value is bound but not used afterwards; the call
                # is presumably made for its effect on the variables in sess.
                pruningMask = apply_pruning_to_var(variables_to_pruning, sess)
                ##print("mySaver--restore done!")
            def save(self,
                     sess,
                     save_path,
                     global_step=None,
                     latest_filename=None,
                     meta_graph_suffix="meta",
                     write_meta_graph=True,
                     write_state=True):
                """Delegates to tf.train.Saver.save with the same arguments."""
                ##print("My Saver--save...!")
                tf.train.Saver.save(self, sess, save_path, global_step,
                                    latest_filename, meta_graph_suffix,
                                    write_meta_graph, write_state)
                ##print("My Saver--save done!")

        saver = mySaver(max_to_keep=2)
        ###

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            saver=saver,  #add for pruning
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #14
0
def eval_model(candidate, N, F):
    """Builds an evaluation graph for one network and runs slim's evaluation loop.

    The 'test' split of FLAGS.dataset_name is read, preprocessed at the
    network's default image size and fed through the network in batches of
    100. Streaming accuracy and recall@5 are the tracked metrics.

    Note: FLAGS.batch_size is overwritten with 100 as a side effect. The
    provider queue sizes are computed from the value it had on entry.

    Args:
        candidate: forwarded positionally to nets_factory.get_network_fn.
        N: forwarded positionally to nets_factory.get_network_fn.
        F: forwarded positionally to nets_factory.get_network_fn.

    Returns:
        None.
    """
    print("eval model")
    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        step_tensor = slim.get_or_create_global_step()

        # --- dataset (always the 'test' split) ---
        eval_dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, 'test', FLAGS.dataset_dir)

        # --- model ---
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name, candidate, N, F,
            num_classes=(eval_dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        # --- input pipeline: unshuffled reads from the dataset ---
        provider = slim.dataset_data_provider.DatasetDataProvider(
            eval_dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        image, label = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # --- preprocessing (named after the model unless overridden) ---
        preprocess = preprocessing_factory.get_preprocessing(
            FLAGS.preprocessing_name or FLAGS.model_name, is_training=False)
        side = network_fn.default_image_size
        image = preprocess(image, side, side)

        # From here on the evaluation batch size is fixed at 100.
        FLAGS.batch_size = 100
        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # --- forward pass ---
        logits, _ = network_fn(images)

        # Restore the moving-average shadow values when averaging is on,
        # otherwise the plain variables.
        if not FLAGS.moving_average_decay:
            variables_to_restore = slim.get_variables_to_restore()
        else:
            ema = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, step_tensor)
            variables_to_restore = ema.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[step_tensor.op.name] = step_tensor

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # --- metrics ---
        accuracy_metric = slim.metrics.streaming_accuracy(predictions, labels)
        recall_metric = slim.metrics.streaming_recall_at_k(logits, labels, 5)
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy': accuracy_metric,
            'Recall_5': recall_metric,
        })

        # Each metric gets a scalar summary that also prints its value.
        for metric_name, metric_value in names_to_values.items():
            tag = 'eval/%s' % metric_name
            summary = tf.summary.scalar(tag, metric_value, collections=[])
            summary = tf.Print(summary, [metric_value], tag)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

        # TODO(sguada) use num_epochs=1
        # Without an explicit cap, run enough batches for one full pass.
        num_batches = FLAGS.max_num_batches or math.ceil(
            eval_dataset.num_samples / float(FLAGS.batch_size))

        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        log_device_placement=False)
        session_config.gpu_options.allow_growth = True

        slim.evaluation.evaluation_loop(
            master=FLAGS.master,
            checkpoint_dir=FLAGS.train_dir,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore,
            session_config=session_config)
Пример #15
0
def main(_):
    """Build the training graph for ``FLAGS.model_name`` and start training.

    When ``FLAGS.model_name`` is ``"crnn"`` the model is built once through
    ``clone_fn`` and the training loop is delegated to
    ``network_fn.train_crnn``.  For any other model the TF-Slim
    ``model_deploy`` flow is used: clones are created, summaries, optional
    moving averages and the optimizer are configured, and
    ``slim.learning.train`` runs the loop.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():

        # =================================================================== #
        # Config model_deploy.                                                #
        # Keep TF Slim Models structure.                                      #
        # Useful if multiple GPUs and/or servers are needed in the future.    #
        # =================================================================== #
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)

        # Create global_step.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # =================================================================== #
        # Select the dataset.
        # =================================================================== #
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # =================================================================== #
        # Select the network
        # =================================================================== #
        if FLAGS.model_name == "crnn":
            # For CRNN the factory returns a class; the instance exposes
            # build_CRNNnet() and train_crnn(), both used below.
            crnn_net = nets_factory.get_network(FLAGS.model_name)
            network_fn = crnn_net(phase='Train',
                                  num_classes=(dataset.num_classes -
                                               FLAGS.labels_offset))

        else:
            network_fn = nets_factory.get_network_fn(
                FLAGS.model_name,
                num_classes=(dataset.num_classes - FLAGS.labels_offset),
                weight_decay=FLAGS.weight_decay,
                is_training=True)

        # =================================================================== #
        # Select the preprocessing function.
        # =================================================================== #
        # NOTE(review): the returned function is looked up but not applied;
        # the image is resized and rescaled by hand further down.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        #tf_utils.print_configuration(FLAGS.__flags,
        #                             dataset.data_sources, save_dir=FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device(deploy_config.inputs_device()):
            if FLAGS.dataset_name == "ocr":
                # The OCR dataset is read straight from a fixed TFRecord file.
                image, label = tf_utils.read_features(ops.join(
                    FLAGS.dataset_dir, "ocr_train_000.tfrecord"),
                                                      num_epochs=None)

            else:
                with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                    provider = slim.dataset_data_provider.DatasetDataProvider(
                        dataset,
                        num_readers=FLAGS.num_readers,
                        common_queue_capacity=20 * FLAGS.batch_size,
                        common_queue_min=10 * FLAGS.batch_size,
                        shuffle=True)

                [image, label] = provider.get(['image', 'label'])

            #image = image_preprocessing_fn(image, 32, 256)

            # Resize the image to the specified height and width.
            # resize_bilinear works on batches, hence the temporary axis 0.
            image = tf.expand_dims(image, 0)
            image = tf.image.resize_bilinear(image, [IMAGE_H, IMAGE_W],
                                             align_corners=False)
            image = tf.squeeze(image, [0])
            # Shift and scale: (x - 0.5) * 2.
            # NOTE(review): presumably maps [0, 1] inputs to [-1, 1] -- confirm
            # the value range produced by the readers above.
            image = tf.subtract(image, 0.5)
            image = tf.multiply(image, 2.0)

            #label = tf.reshape(label,[MAX_CHAR_LEN])
            # NOTE(review): the batch size is hard-coded to 32 here while the
            # provider queues above are sized from FLAGS.batch_size.
            images, labels = tf.train.shuffle_batch(
                tensors=[image, label],
                batch_size=32,
                capacity=1000 + 2 * 32,
                min_after_dequeue=100,
                #enqueue_many=True,
                num_threads=1)
            images = tf.cast(x=images, dtype=tf.float32)

            if FLAGS.model_name != "crnn":
                labels = slim.one_hot_encoding(
                    labels, dataset.num_classes - FLAGS.labels_offset)
                # NOTE(review): this queue is built but clone_fn consumes
                # `images`/`labels` directly (see create_clones below).
                batch_queue = slim.prefetch_queue.prefetch_queue(
                    [images, labels], capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        #def clone_fn(batch_queue):
        def clone_fn(images, labels):
            """Build the network and register its loss for one clone.

            Returns:
                Tuple ``(end_points, ctc_loss, sequence_dist, labels,
                decoded)``.  ``ctc_loss``, ``sequence_dist`` and ``decoded``
                are ``None`` unless ``FLAGS.model_name`` is ``"crnn"``.
            """
            # Dequeue batch.
            #images, labels = batch_queue.dequeue()

            # Only the CRNN branch produces these values; default them so the
            # return statement is valid for every model.
            ctc_loss, sequence_dist, decoded = None, None, None

            #############################
            # Specify the loss function #
            #############################

            if FLAGS.model_name == "crnn":
                with tf.variable_scope('crnn'):
                    logits, end_points = network_fn.build_CRNNnet(images)

                if FLAGS.dataset_name == "mnist":
                    # ctc_loss needs sparse int32 labels: keep the non-zero
                    # entries of the dense label tensor.
                    idx = tf.where(tf.not_equal(labels, 0))
                    labels = tf.SparseTensor(idx, tf.gather_nd(labels, idx),
                                             labels.get_shape())
                    labels = tf.cast(labels, tf.int32)

                ctc_loss = tf.nn.ctc_loss(
                    labels=labels,
                    inputs=logits,
                    sequence_length=SEQ_LENGTH,
                    ctc_merge_repeated=True,
                    ignore_longer_outputs_than_inputs=True,
                    time_major=True)

                ctc_loss = tf.reduce_mean(ctc_loss)
                ctc_loss = tf.Print(ctc_loss, [ctc_loss], message='* Loss : ')

                tf.losses.add_loss(ctc_loss)
                decoded, log_prob = tf.nn.ctc_beam_search_decoder(
                    logits, sequence_length=SEQ_LENGTH, merge_repeated=False)

                # Mean edit distance between the best decoded path and labels.
                sequence_dist = tf.reduce_mean(
                    tf.edit_distance(tf.cast(decoded[0], tf.int32), labels))

            else:
                # Networks from get_network_fn are plain callables; they have
                # no build_CRNNnet method.
                logits, end_points = network_fn(images)

                if 'AuxLogits' in end_points:
                    slim.losses.softmax_cross_entropy(
                        end_points['AuxLogits'],
                        labels,
                        label_smoothing=FLAGS.label_smoothing,
                        weights=0.4,
                        scope='aux_loss')
                slim.losses.softmax_cross_entropy(
                    logits,
                    labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=1.0)

            return end_points, ctc_loss, sequence_dist, labels, decoded

        if FLAGS.model_name == "crnn":
            end_points, ctc_loss, sequence_dist, labels, decoded = clone_fn(
                images, labels)

            network_fn.train_crnn(FLAGS, global_step, ctc_loss, sequence_dist,
                                  labels, decoded)

        else:
            # =================================================================== #
            # Add summaries from first clone.
            # =================================================================== #
            # Gather initial summaries.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            #clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
            clones = model_deploy.create_clones(deploy_config, clone_fn,
                                                [images, labels])
            first_clone_scope = deploy_config.clone_scope(0)
            # Gather update_ops from the first clone. These contain, for example,
            # the updates for the batch_norm variables created by network_fn.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                           first_clone_scope)

            # Add summaries for end_points.
            end_points, ctc_loss, sequence_dist, labels, decoded = clones[
                0].outputs

            for end_point in end_points:
                x = end_points[end_point]
                summaries.add(
                    tf.summary.histogram('activations/' + end_point, x))
                summaries.add(
                    tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))
            # Add summaries for losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                          first_clone_scope):
                summaries.add(
                    tf.summary.scalar('losses/%s' % loss.op.name, loss))
            # The CTC metrics only exist when clone_fn built a CRNN model.
            if ctc_loss is not None:
                summaries.add(
                    tf.summary.scalar('losses/ctc_loss', tensor=ctc_loss))
            if sequence_dist is not None:
                summaries.add(
                    tf.summary.scalar('Seq_Dist', tensor=sequence_dist))

            # Add summaries for variables.
            for variable in slim.get_model_variables():
                summaries.add(tf.summary.histogram(variable.op.name, variable))

            # =================================================================== #
            # Configure the moving averages.
            # =================================================================== #
            if FLAGS.moving_average_decay:
                moving_average_variables = slim.get_model_variables()
                variable_averages = tf.train.ExponentialMovingAverage(
                    FLAGS.moving_average_decay, global_step)
            else:
                moving_average_variables, variable_averages = None, None

            # =================================================================== #
            # Configure the optimization procedure.
            # =================================================================== #
            with tf.device(deploy_config.optimizer_device()):
                learning_rate = tf_utils.configure_learning_rate(
                    FLAGS, dataset.num_samples, global_step)
                optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
                summaries.add(
                    tf.summary.scalar('learning_rate', tensor=learning_rate))

            if FLAGS.sync_replicas:
                # If sync_replicas is enabled, the averaging will be done in the chief
                # queue runner.
                optimizer = tf.train.SyncReplicasOptimizer(
                    opt=optimizer,
                    replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                    total_num_replicas=FLAGS.worker_replicas,
                    variable_averages=variable_averages,
                    variables_to_average=moving_average_variables)
            elif FLAGS.moving_average_decay:
                # Update ops executed locally by trainer.
                update_ops.append(
                    variable_averages.apply(moving_average_variables))

            # Variables to train.
            variables_to_train = tf_utils.get_variables_to_train(FLAGS)

            # and returns a train_tensor and summary_op
            total_loss, clones_gradients = model_deploy.optimize_clones(
                clones,
                optimizer,
                #regularization_losses = ctc_loss,
                var_list=variables_to_train)

            # Add total_loss to summary.
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Create gradient updates.
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)
            update_op = tf.group(*update_ops)
            # Evaluating train_tensor runs every update op exactly once.  No
            # second train op is built here: slim.learning.create_train_op
            # would compute and apply its own gradients on top of
            # grad_updates and would bypass variables_to_train.
            with tf.control_dependencies([update_op]):
                train_tensor = tf.identity(total_loss, name='train_op')

            # Add the summaries from the first clone. These contain the summaries
            #summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
            #                                   first_clone_scope))
            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

            # =================================================================== #
            # Configure the saver procedure.
            # =================================================================== #
            saver = tf.train.Saver(max_to_keep=5,
                                   keep_checkpoint_every_n_hours=1.0,
                                   write_version=2,
                                   pad_step_number=False)

            # NOTE(review): this directory is created but slim.learning.train
            # below is given logdir=FLAGS.train_dir -- confirm it is needed.
            model_save_dir = './checkpoints/' + FLAGS.model_name
            if not ops.exists(model_save_dir):
                os.makedirs(model_save_dir)

            # =================================================================== #
            # Kicks off the training.
            # =================================================================== #
            #summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num),graph=tf.get_default_graph())

            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
            config = tf.ConfigProto(log_device_placement=False,
                                    gpu_options=gpu_options)

            slim.learning.train(
                train_tensor,
                logdir=FLAGS.train_dir,
                master='',
                is_chief=True,
                init_fn=tf_utils.get_init_fn(FLAGS),
                summary_op=summary_op,
                number_of_steps=FLAGS.max_number_of_steps,
                log_every_n_steps=FLAGS.log_every_n_steps,
                save_summaries_secs=FLAGS.save_summaries_secs,
                saver=saver,
                save_interval_secs=FLAGS.save_interval_secs,
                session_config=config,
                #session_wrapper=tfdbg.LocalCLIDebugWrapperSession,
                sync_optimizer=None)
Пример #16
0
def main(_):
    '''Build the training graph with ``model_deploy`` and run training.

    Sets up the deployment config, the input pipeline for the "train" split,
    one or more network clones with a sparse softmax cross-entropy loss,
    summaries, optional moving averages and the optimizer, then hands the
    resulting train op to ``slim.learning.train``.

    If building the input tensors raises, the exception is printed and the
    function returns without training.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If ``FLAGS.dataset_dir`` is empty.
    '''
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        network_fn = get_network_fn(num_classes=FLAGS.num_classes,
                                    is_training=True)

        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        train_set = dataset_factory.get_dataset(FLAGS.dataset_name, "train",
                                                FLAGS.dataset_dir)
        #val_set = dataset_factory.get_dataset(FLAGS.dataset_name, "val", FLAGS.dataset_dir)

        with tf.device(deploy_config.inputs_device()):
            #####Consider Replace the following until #####
            #options = tf.python_io.TFRecordOptions(TFRecordCompressionType.ZLIB)
            train_provider = slim.dataset_data_provider.DatasetDataProvider(
                train_set,
                num_readers=FLAGS.num_readers,
                #   reader_kwargs={'options':options},
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            # Any failure while fetching/preprocessing the tensors is printed
            # and aborts main() without raising.
            try:
                [train_image, train_label, train_boxes
                 ] = train_provider.get(['image', 'label', 'gt_boxes'])
                #[train_image, train_boxes, train_masks] = train_provider.get(['image', 'gt_boxes', 'gt_masks'])
                print(train_image, train_label, train_boxes)
                # train_boxes / train_masks are returned but only the image
                # and label are batched below.
                train_image, train_label, train_boxes, train_masks = _preprocessing.preprocess_image(
                    train_image, train_label, train_boxes, is_training=True)
            except Exception as e:
                print(e)
                return
            train_images, train_labels = tf.train.batch(
                [train_image, train_label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)

            train_batch_queue = slim.prefetch_queue.prefetch_queue(
                [train_images, train_labels], capacity=2 * FLAGS.num_clones)
            print(train_batch_queue)

            #val_provider = slim.dataset_data_provider.DatasetDataProvider(
            #   val_set,
            #   num_readers=FLAGS.num_readers,
            #   reader_kwargs={'options':options},
            #   common_queue_capacity=20 * FLAGS.batch_size,
            #   common_queue_min=10 * FLAGS.batch_size)
            #
            #[val_image, val_label, val_boxes, val_masks] = val_provider.get(['image', 'label', 'gt_boxes', 'gt_masks'])
            #
            #val_image, val_label, val_boxes, val_masks = _preprocessing.preprocess_image(val_image, val_label, val_boxes, val_masks)
            #
            #val_images, val_labels = tf.train.batch(
            #   [val_image, val_label],
            #   batch_size=FLAGS.batch_size,
            #   num_threads=FLAGS.num_preprocessing_threads,
            #   capacity=5 * FLAGS.batch_size)
            #
            #val_batch_queue = slim.prefetch_queue.prefetch_queue(
            #   [val_images, val_labels], capacity=2 * FLAGS.num_clones)

        def clone_fn(batch_queue):
            """Build one network clone and register its loss.

            Returns:
                Tuple ``(images, labels, pred_annotation, end_points)``.
            """
            images, labels = batch_queue.dequeue()
            #print(images, labels)
            # Drop the size-1 axis at position 1 before feeding the network.
            images = tf.squeeze(images, [1])
            pred_annotation, fc8s, end_points = network_fn(images=images)
            ############################
            ## Loss function #
            ############################
            #print("Pred_annot", pred_annotation, "Labels", labels,"fc8s", fc8s)

            # The loss is registered in the LOSSES collection; optimize_clones
            # picks it up from there, so the return value is not needed.
            tf.losses.sparse_softmax_cross_entropy(
                logits=tf.to_float(pred_annotation),
                labels=tf.to_int32(labels),
                weights=1.0,
                scope="entropy")

            #loss = tf.reduce_mean((tf.losses.sparse_softmax_cross_entropy(logits=tf.to_float(pred_annotation),labels=tf.to_int32(labels),scope="entropy")))

            return images, labels, pred_annotation, end_points

        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [train_batch_queue])
        clone_scope = deploy_config.clone_scope(0)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, clone_scope)

        images, labels, pred_annotation, end_points = clones[0].outputs

        summaries.add(tf.summary.image("Original_images", images))
        summaries.add(tf.summary.image("Ground_truth_masks", labels))
        summaries.add(
            tf.summary.image("Prediction_masks", tf.to_float(pred_annotation)))

        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        for loss in tf.get_collection(tf.GraphKeys.LOSSES, clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # Configure the moving averages.  Both names are read by the
        # sync_replicas / moving_average_decay branches below, so they must be
        # defined on every path.
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        with tf.device(deploy_config.optimizer_device()):
            learning_rate = utils._configure_learning_rate(
                train_set.num_samples, global_step)
            optimizer = utils._configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            # NOTE(review): `replica_id` is not passed by the other training
            # scripts in this file -- confirm the installed
            # SyncReplicasOptimizer still accepts it.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        variables_to_train = utils._get_variables_to_train()
        for var in variables_to_train:
            print(var.op.name)

        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        print('total_loss', total_loss, 'clone_gradients', clones_gradients)
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)

        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)

        # Evaluating train_tensor runs all update ops and yields total_loss.
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                           clone_scope))

        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # Validate Set Evaluation options
        #network_fn_eval = get_network_fn(num_classes=NUM_OF_CLASSES, is_training=False)
        #print("val_images", val_images)
        #val_preds, fc8s, _ = network_fn(images=val_images)
        #names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        #    'mean_iou': slim.metrics.streaming_mean_iou(val_preds, val_labels, num_classes=NUM_OF_CLASSES),
        #})

        #for name, value in names_to_values.items():
        #   summary_name = 'eval/%s' % name
        #   op = tf.summary.scalar(summary_name, value, collections=[])
        #   op = tf.Print(op, [value], summary_name)
        #   tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        #if FLAGS.moving_average_decay:
        #   variable_averages = tf.train.ExponentialMovingAverage(
        #       FLAGS.moving_average_decay, tf_global_step)
        #   variables_to_restore = variable_averages.variables_to_restore(
        #       slim.get_model_variables())
        #   variables_to_restore[tf_global_step.op.name] = tf_global_step
        #else:
        #   variables_to_restore = slim.get_variables_to_restore()

        #for i in range(FLAGS.max_steps / FLAGS.iter_train_steps):

        slim.learning.train(
            train_tensor,
            logdir=FLAGS.logs_dir,
            master='',
            is_chief=(FLAGS.task == 0),
            init_fn=utils._get_init_fn(),
            summary_op=summary_op,
            #FLAGS.iter_train_steps*(i+1) if FLAGS.max_steps > FLAGS.iter_train_steps*(i+1) else FLAGS.max_steps,
            number_of_steps=FLAGS.max_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #17
0
def main(_):
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    if FLAGS.variable_update == "horovod":
        import horovod.tensorflow as hvd
        hvd.init()

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            if FLAGS.variable_update == "horovod":
                import horovod.tensorflow as hvd
                # Set different seeds for shuffle queue, so that different workers
                # start to read different input files.
                # it's copied from tf_cnn_benchmarks/benchmark_cnn.py.
                seed_value = 1234 + int(hvd.rank())
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size,
                seed=seed_value
                if FLAGS.variable_update == "horovod" else None)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'],
                    labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=0.4,
                    scope='aux_loss')
            slim.losses.softmax_cross_entropy(
                logits,
                labels,
                label_smoothing=FLAGS.label_smoothing,
                weights=1.0)
            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                total_num_replicas=FLAGS.worker_replicas,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        #  and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        update_op = tf.group(*update_ops)
        with tf.control_dependencies([update_op]):
            train_tensor = tf.identity(total_loss, name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        config = tf.ConfigProto()
        config.intra_op_parallelism_threads = FLAGS.num_intra_threads
        config.inter_op_parallelism_threads = FLAGS.num_inter_threads

        if FLAGS.variable_update == "horovod":
            import horovod.tensorflow as hvd
            local_init_op_ = hvd.broadcast_global_variables(0)
            is_chief = hvd.rank() == 0
        else:
            local_init_op_ = _USE_DEFAULT
            is_chief = FLAGS.task == 0

        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir if is_chief else None,
            master=FLAGS.master,
            # all horovod workers are 'chiefs'
            is_chief=is_chief or FLAGS.variable_update == "horovod",
            init_fn=_get_init_fn(),
            local_init_op=local_init_op_,
            summary_op=summary_op if is_chief else _USE_DEFAULT,
            session_config=config,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #18
0
            hm_a = hm.max(axis=0, keepdims=True)
            dense_wh_mask = np.concatenate([hm_a, hm_a], axis=0)
            ret.update({'dense_wh': dense_wh, 'dense_wh_mask': dense_wh_mask})
            del ret['wh']
        elif self.opt.cat_spec_wh:
            ret.update({
                'cat_spec_wh': cat_spec_wh,
                'cat_spec_mask': cat_spec_mask
            })
            del ret['wh']
        if self.opt.reg_offset:
            ret.update({'reg': reg})
        if self.opt.debug > 0 or not self.split == 'train':
            gt_det = np.array(gt_det, dtype=np.float32) if len(gt_det) > 0 else \
                np.zeros((1, 6), dtype=np.float32)
            meta = {'c': c, 's': s, 'gt_det': gt_det, 'img_id': img_id}
            ret['meta'] = meta
        return ret


if __name__ == '__main__':
    # Script entry point: build the Pascal 'ctdet' training dataset and
    # index every sample once, discarding the results.
    from datasets.dataset_factory import get_dataset
    from opts import opts

    option_parser = opts()
    opt = option_parser.parse()

    dataset_cls = get_dataset('pascal', 'ctdet')
    dataset = dataset_cls(opt, 'train')

    # Let the option object pick up dataset-dependent settings before use.
    option_parser.update_dataset_info_and_set_heads(opt, dataset)

    num_samples = len(dataset)
    for sample_idx in range(num_samples):
        dataset[sample_idx]
Пример #19
0
def main():
    """Build the multi-tower training graph from the module-level ``opts`` and run it.

    The function performs, in order:

    1. Reads run settings (title, seed, mode, devices, dataset, network,
       optimizer, schedules and checkpoint paths) from ``opts``.
    2. Picks a log file name under ``../log/`` that does not collide with an
       existing file (unless the title is ``'temp'``) and enables logging.
    3. Resolves the checkpoint to save to / load from under ``../model/``.
    4. Builds the train/test input iterators and one copy ("tower") of the
       model per worker, plus a test network (and an attack network in
       ``'attack'`` mode) sharing the same variables.
    5. Depending on ``mode``: benchmarks the input pipeline, evaluates,
       attacks, exports, or trains until ``lr_decay`` evaluates to a
       non-positive value, saving the model whenever the test error improves.

    Returns:
        0 once the training loop has finished. In the ``'test'``,
        ``'export'`` and ``'attack'`` modes the process leaves through
        ``exit(0)`` instead, and the ``'input_train'`` / ``'input_test'``
        benchmark loops never terminate on their own.

    Raises:
        FileNotFoundError: if ``path_load`` matches no checkpoint file, or
            more than one.
        NotImplementedError: for an unrecognized device setting.
    """

    ####################################################################################################
    # Read the run configuration from the module-level options object.

    title = opts.title
    seed = opts.seed
    mode = opts.mode

    gpu_list = opts.gpu_list
    batch_size = opts.batch_size

    dataset = opts.dataset
    preprocess = opts.preprocess
    network = opts.network
    optimizer = opts.optimizer
    lr_decay = opts.lr_decay
    epoch_step = opts.epoch_step
    learning_step = opts.learning_step

    path_load = opts.path_load
    path_save = opts.path_save

    print_line()

    ####################################################################################################
    # Choose a log file name and start logging.

    time_tag = get_time('%y-%m-%d %X')
    time_tag_short = time_tag[:8]
    seed = set_seed(seed)

    # Append '_1', '_2', ... to the title until the log path is unused.
    num_check_log = 0
    title_temp = title
    while True:
        path_log = '../log/' + time_tag_short + '(' + title_temp + ').txt'
        # If title is 'temp', the existing log is overwritten instead.
        if os.path.isfile(path_log) and title != 'temp':
            num_check_log += 1
            title_temp = title + '_%d' % num_check_log
        else:
            title = title_temp
            del num_check_log, title_temp
            break

    print('title: ' + title)
    set_log(path_log)
    print_line()

    ####################################################################################################

    print(time_tag)
    print('SEED = %d' % seed)

    print_opts('options/' + OPTION + '.py')
    print_line()

    ####################################################################################################
    # Resolve checkpoint paths.

    model_dir = '../model/'

    if isinstance(path_save, bool):
        # A boolean path_save means "derive the name from the title";
        # if title is 'temp', we will not save the model.
        if path_save and title != 'temp':
            path_save = model_dir + time_tag_short + '(' + title + ').tf'
        else:
            path_save = None

    if path_load is not None:
        # path_load is treated as a key word that must match exactly one
        # checkpoint data file in model_dir.
        model_files = glob.glob(model_dir + '*' + path_load + '*.tf.data*')
        if not model_files:
            raise FileNotFoundError(
                'Could not find any model file match the key words' +
                path_load)
        elif len(model_files) > 1:
            for model_file in model_files:
                print(model_file)
            raise FileNotFoundError(
                'Find more than one model file match the key words' +
                path_load)

        # Keep everything up to and including the '.tf' suffix.
        model_file = model_files[0]
        path_load = model_file[:model_file.find('.tf.') + 3]
        print('Find model in', path_load)

    ####################################################################################################
    # Devices and input pipelines.

    os.environ['CUDA_VISIBLE_DEVICES'] = ''.join(
        str(gpu) + ',' for gpu in gpu_list)
    # os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # With no GPU there is still one (CPU) worker.
    num_worker = max(len(gpu_list), 1)
    dataset_train = get_dataset(dataset, split='train')
    dataset_test = get_dataset(dataset, split='test')

    num_batch_train = dataset_train.num_sample // batch_size
    # The test pipeline always uses batches of 100 samples (see below).
    num_batch_test = dataset_test.num_sample // 100

    assert batch_size % num_worker == 0, 'batch_size %d can not be divided by number of workers %d' % (
        batch_size, num_worker)

    iterator_train = get_batch(dataset_train,
                               preprocess,
                               True,
                               batch_size // num_worker,
                               num_worker,
                               seed=seed)
    iterator_test = get_batch(dataset_test,
                              preprocess,
                              False,
                              100,
                              num_worker,
                              seed=seed)

    ####################################################################################################
    # Input-pipeline benchmark modes: pull batches forever, never train.

    if mode in ['input_train', 'input_test']:
        if mode == 'input_train':
            num_batch = num_batch_train
            batch_input = iterator_train.get_next()
        else:
            num_batch = num_batch_test
            batch_input = iterator_test.get_next()

        print('Testing the speed of data input pipeline.')
        sess = get_session(gpu_list)
        while True:
            for _ in tqdm(range(num_batch),
                          desc='Input pipeline',
                          leave=False,
                          smoothing=0.1):
                sess.run(batch_input)

    ####################################################################################################
    # Decide where parameters live.

    nets = []
    net = get_net_fn(network)

    if num_worker == 1:
        if len(gpu_list) == 0:
            print('Multi-CPU training, it might be slow')
            print(
                'All parameters are pinned to CPU, all Ops are pinned to CPU')
            is_cpu_ps = True
        else:
            print('Single-GPU training with gpu', gpu_list[0])
            print(
                'All parameters are pinned to GPU, all Ops are pinned to GPU')
            is_cpu_ps = False

    elif num_worker > 1:
        print('Multi-GPU training tower with gpu list', gpu_list)
        print('All parameters are pinned to CPU, all Ops are pinned to GPU')
        print(
            'Get batchnorm moving average updates from data in the first GPU for speed'
        )
        print('Get L2 decay grads in the second GPU for speed')
        is_cpu_ps = True
    else:
        raise NotImplementedError('Unrecognized device settings')

    tower_grads = []
    tower_losses = []
    tower_errors = []

    # Loops over the number of workers and creates a copy ("tower") of the model on each worker.
    for i in range(num_worker):

        worker = '/gpu:%d' % i if gpu_list else '/cpu:0'

        # Creates a device setter used to determine where Ops are to be placed.
        if is_cpu_ps:
            # tf.train.replica_device_setter supports placing variables on the CPU, all
            # on one GPU, or on ps_servers defined in a cluster_spec.
            device_setter = tf.train.replica_device_setter(
                worker_device=worker, ps_device='/cpu:0', ps_tasks=1)
        else:
            device_setter = worker

        # 1. pin ops to GPU
        # 2. pin parameters to CPU (multi-GPU training) or GPU (single-GPU training)
        # 3. reuse parameters multi-GPU training
        #
        # Creates variables on the first loop.  On subsequent loops reuse is set
        # to True, which results in the "towers" sharing variables.
        # tf.device calls the device_setter for each Op that is created.
        # device_setter returns the device the Op is to be placed on.
        with tf.variable_scope(tf.get_variable_scope(), reuse=bool(i != 0)), \
             tf.device(device_setter):

            if gpu_list:
                print('Training model on GPU %d' % gpu_list[i])
            else:
                print('Training model on CPUs')

            batch_train = iterator_train.get_next()

            if mode == 'speed_net':
                # Replace the real batch by all-zero tensors of the same
                # per-sample shape so that only the network is timed.
                with tf.device('/cpu:0'):
                    print(
                        'Testing the speed of model by synthesized data, '
                        'which is theoretically the maximum speed for training this model'
                    )
                    batch_train = iterator_train.get_next()
                    shape_x = [batch_size // num_worker
                               ] + batch_train[0].get_shape().as_list()[1:]
                    shape_y = [batch_size // num_worker
                               ] + batch_train[1].get_shape().as_list()[1:]

                    batch_train_x = tf.zeros(shape_x, dtype=tf.float32)
                    batch_train_y = tf.zeros(shape_y, dtype=tf.float32)
                batch_train = [batch_train_x, batch_train_y]

            nets.append(
                net(batch_train[0],
                    batch_train[1],
                    opts=opts,
                    is_training=True))

            tower_losses.append(nets[i].loss)
            tower_errors.append(nets[i].error)

            if i == 0:
                # We only get batchnorm moving average updates from data in the first worker for speed
                update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                nets[-1].count_parameters()
                nets[-1].count_MACs()
                nets[-1].count_MEMs()

            loss_worker = nets[i].loss
            if num_worker == 1:
                # Single-GPU or multi-CPU training
                loss_worker += nets[i].get_l2_loss()
            elif i == 1:
                # We only compute L2 grads in the second worker for speed.
                # In this case, L2 grads should multiple num_worker to maintain the equivalence
                loss_worker += num_worker * nets[i].get_l2_loss()
            tower_grads.append(
                optimizer.compute_gradients(loss_worker,
                                            colocate_gradients_with_ops=True))

            if i == num_worker - 1:
                # The last worker additionally hosts the evaluation network,
                # which reuses the training variables.
                if gpu_list:
                    print('Testing model on GPU %d' % gpu_list[i])
                else:
                    print('Testing model on CPUs')
                tf.get_variable_scope().reuse_variables()
                batch_test = iterator_test.get_next()
                nets.append(
                    net(batch_test[0],
                        batch_test[1],
                        opts=opts,
                        is_training=False))
                error_batch_test = nets[-1].error

                if mode in ['attack']:
                    # NOTE(review): the message reports gpu_list[i - 1] although
                    # this network is built in worker i's device scope - confirm.
                    if gpu_list:
                        print('Attack model on GPU %d' % gpu_list[i - 1])
                    else:
                        print('Attack model on CPUs')
                    tf.get_variable_scope().reuse_variables()
                    # The attack network is fed through placeholders shaped
                    # like a test batch.
                    batch_attack_x = tf.placeholder(
                        shape=batch_test[0].get_shape(),
                        dtype=batch_test[0].dtype)
                    batch_attack_y = tf.placeholder(
                        shape=batch_test[1].get_shape(),
                        dtype=batch_test[1].dtype)
                    nets.append(
                        net(batch_attack_x,
                            batch_attack_y,
                            opts=opts,
                            is_training=False))
                    error_batch_attack = nets[-1].error

    # Combine the per-tower results and build the single training op.
    with tf.device('/cpu:0' if is_cpu_ps else worker):
        grad_batch_train = aggregate_gradients(tower_grads)
        loss_batch_train = aggregate_statistics(tower_losses)
        error_batch_train = aggregate_statistics(tower_errors)

        # Run the collected update ops before applying gradients.
        with tf.control_dependencies(update_ops):
            train_op = optimizer.apply_gradients(grad_batch_train,
                                                 global_step=learning_step)

    ####################################################################################################

    if hasattr(opts, 'delay'):
        delay4gpus(opts.delay, gpu_list=gpu_list)

    sess = get_session(gpu_list)
    saver = tf.train.Saver(max_to_keep=None)

    def evaluate():
        """Return the test error averaged over ``num_batch_test`` batches."""
        error_test = 0.
        for _ in tqdm(range(num_batch_test),
                      desc='Test',
                      leave=False,
                      smoothing=0.1):
            error_test += sess.run(error_batch_test)
        return error_test / num_batch_test

    def attack(black=False):
        """Return the error of the attack network on adversarial samples.

        With ``black`` false, samples are generated by stepping ``delta``
        along the sign of the input gradient; otherwise they are loaded from
        ``adversial_sample.npz``. In both cases the samples are written to
        ``adversial_sample.npz`` afterwards.
        """
        error_fgsm = 0.
        delta = 1. / 32

        if black is False:
            adversial_x = []
            adversial_y = []
            for _ in tqdm(range(num_batch_test),
                          desc='Attack',
                          leave=False,
                          smoothing=0.1):
                # NOTE(review): nets[1] is the test network only when
                # num_worker == 1; with more workers it is the second
                # training tower - confirm this mode is single-worker only.
                test_x, test_y, grads = sess.run(
                    [nets[1].H[0], nets[1].Y[0], nets[1].grads_H[0]])
                fsgm_x = test_x + delta * np.sign(grads)
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={
                                           batch_attack_x: fsgm_x,
                                           batch_attack_y: test_y
                                       })
                adversial_x.append(fsgm_x)
                adversial_y.append(test_y)
        else:
            adversial_sample = np.load('adversial_sample.npz')
            adversial_x = adversial_sample['x']
            adversial_y = adversial_sample['y']
            for i in tqdm(range(adversial_x.shape[0]),
                          desc='Attack',
                          leave=False,
                          smoothing=0.1):
                error_fgsm += sess.run(error_batch_attack,
                                       feed_dict={
                                           batch_attack_x: adversial_x[i, ...],
                                           batch_attack_y: adversial_y[i, ...]
                                       })

        adversial_x = np.array(adversial_x)
        adversial_y = np.array(adversial_y)
        np.savez('adversial_sample.npz', x=adversial_x, y=adversial_y)

        return error_fgsm / num_batch_test

    def save_model(path):
        """Save the session variables to ``path`` and print an 'S' marker."""
        saver.save(sess, path)
        print('S', end='')

    def load_model(path):
        """Restore the session variables from the checkpoint at ``path``."""
        # Use the argument rather than the enclosing path_load variable.
        print('Loading model from %s ...' % path)
        saver.restore(sess, path)

    if path_load is not None:
        load_model(path_load)
        error_test_best = evaluate()
        print('Test: %.4f' % error_test_best)

    if mode == 'attack':
        print(attack(black=False))

    if mode == 'export':
        vars_list = get_variable('batchnorm/gamma:')
        vars_numpy = sess.run(vars_list)
        export(vars_numpy, 'gamma')

    # These modes never train.
    if mode in ['test', 'export', 'attack']:
        exit(0)

    if mode == 'restart':
        sess.run(epoch_step.assign(0))

    print_line()

    ####################################################################################################
    # Training loop: one iteration per epoch.

    while True:
        # update learning rate; a non-positive value ends training
        lr_epoch = sess.run(lr_decay)
        if lr_epoch <= 0:
            break
        epoch = sess.run(epoch_step)
        print('Epoch: %03d' % epoch, end=' ')

        loss_epoch = 0.
        error_epoch = 0.
        t0 = get_time()
        for _ in tqdm(range(num_batch_train),
                      desc='Epoch: %03d' % epoch,
                      leave=False,
                      smoothing=0.1):

            if mode == 'debug':
                print('DEBUG: ')
                # The extra fetches are not used below; they are available
                # for inspection while debugging.
                _, loss_delta, error_delta, H, W, gradsH, gradsW, label_ = sess.run(
                    [
                        train_op, loss_batch_train, error_batch_train,
                        nets[0].H, nets[0].W, nets[0].grads_H, nets[0].grads_W,
                        nets[0].Y
                    ])
            else:
                _, loss_delta, error_delta = sess.run(
                    [train_op, loss_batch_train, error_batch_train])

            loss_epoch += loss_delta
            error_epoch += error_delta

        print('Loss: %.6f Train: %.4f' %
              (loss_epoch / num_batch_train, error_epoch / num_batch_train),
              end=' ')
        # Training throughput in samples per unit of get_time().
        fps = num_batch_train * batch_size / (get_time() - t0)

        error_test = evaluate()
        assert error_test > 1e-4, (
            'Invalid test error %f, something goes wrong' % error_test)
        print('Test: %.4f lr: %.4f FPS: %d' % (error_test, lr_epoch, fps),
              end=' ')

        sess.run(epoch_step.assign(epoch + 1))

        # NOTE(review): error_test_best is bound only after loading a model or
        # once epoch == 1; presumably epoch_step starts at 1 - confirm.
        if epoch == 1:
            error_test_best = min(error_test, 0.9)
        if error_test < error_test_best:
            # 'B' marks a new best test error.
            print('B', end=' ')
            if path_save is not None:
                save_model(path_save)
            error_test_best = error_test

        print('')

    print_line()

    ####################################################################################################

    sess.close()
    print('Optimization ended at ' + get_time('%y-%m-%d %X'))
    return 0
Пример #20
0
def main(_):
    """Evaluate one single-image batch from a checkpoint and report accuracy.

    The dataset, model, preprocessing, image size, checkpoint and output
    directory are taken from module-level names (``dataset_name``,
    ``dataset_split_name``, ``dataset_dir``, ``model_name``,
    ``preprocessing_name``, ``eval_image_size``, ``checkpoint_path``,
    ``eval_dir``).

    Args:
        _: unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    with tf.Graph().as_default():

        # The returned handle is not needed here; only the call matters.
        slim.get_or_create_global_step()

        tf.logging.info("Preparing dataset")

        dataset = dataset_factory.get_dataset(
            dataset_name, dataset_split_name, dataset_dir)

        network_fn = nets_factory.get_network_fn(
            model_name,
            num_classes=dataset.num_classes,
            is_training=False)

        tf.logging.info("Initializing dataset provider")

        # Read the data in order, with a small queue.
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=32,
            common_queue_min=1)

        tf.logging.info("Initialized provider, now getting image and label")

        raw_image, label = data_provider.get(['image', 'label'])

        tf.logging.info("Got image with label %s" % label)

        preprocess_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)
        image = preprocess_fn(raw_image, eval_image_size, eval_image_size)

        # Batches of exactly one example.
        images, labels = tf.train.batch([image, label], batch_size=1)

        logits, _ = network_fn(images)

        variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # Define the metrics: only streaming accuracy is tracked.
        metric_map = {
            'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        }
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map(metric_map))

        # Print the summaries to screen.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary_op = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary_op = tf.Print(summary_op, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary_op)

        # A single evaluation step.
        num_batches = 1

        tf.logging.info('Evaluating %s' % checkpoint_path)

        update_ops = list(names_to_updates.values())
        slim.evaluation.evaluate_once(
            master='',
            checkpoint_path=checkpoint_path,
            logdir=eval_dir,
            num_evals=num_batches,
            eval_op=update_ops,
            variables_to_restore=variables_to_restore)
Пример #21
0
def main(_):
  """Evaluate a model on clean or adversarial examples.

  Builds the evaluation graph for ``FLAGS.model_name`` on ``FLAGS.dataset``,
  optionally wrapping the inputs in the adversarial attack selected by
  ``FLAGS.adv_method``, and reports the 'Accuracy' and 'Top5' metrics.

  With ``FLAGS.train_dir`` set, checkpoints in that directory are evaluated
  via ``evaluate_repeatedly`` and summaries are written to
  ``<train_dir>/<eval_name>``. Otherwise the single checkpoint
  ``FLAGS.checkpoint_path`` is evaluated once and, if ``FLAGS.output_file``
  is set, a ``name,accuracy,top5`` line is appended to that file.

  Args:
    _: unused positional argument.
  """
  # NOTE(review): these flag checks only print a message; execution continues
  # regardless - confirm whether an early exit was intended.
  if not FLAGS.train_dir and not FLAGS.checkpoint_path:
    print('Either --train_dir or --checkpoint_path flags has to be provided.')
  if FLAGS.train_dir and FLAGS.checkpoint_path:
    print('Only one of --train_dir or --checkpoint_path should be provided.')
  params = model_lib.default_hparams()
  params.parse(FLAGS.hparams)
  tf.logging.info('User provided hparams: %s', FLAGS.hparams)
  tf.logging.info('All hyper parameters: %s', params)
  batch_size = params.eval_batch_size
  graph = tf.Graph()
  with graph.as_default():
    # dataset
    dataset, num_examples, num_classes, bounds = dataset_factory.get_dataset(
        FLAGS.dataset,
        FLAGS.split_name,
        batch_size,
        FLAGS.dataset_image_size,
        is_training=False)
    dataset_iterator = dataset.make_one_shot_iterator()
    images, labels = dataset_iterator.get_next()
    # A positive --num_examples caps the number of evaluated examples.
    if FLAGS.num_examples > 0:
      num_examples = min(num_examples, FLAGS.num_examples)

    # setup model
    global_step = tf.train.get_or_create_global_step()
    model_fn_two_args = model_lib.get_model(FLAGS.model_name, num_classes)
    model_fn = lambda x: model_fn_two_args(x, is_training=False)
    if not FLAGS.adv_method or FLAGS.adv_method == 'clean':
      logits = model_fn(images)
    else:
      # Evaluate on adversarial versions of the input images.
      adv_examples = adversarial_attack.generate_adversarial_examples(
          images, bounds, model_fn, FLAGS.adv_method)
      logits = model_fn(adv_examples)

    # update trainable variables if fine tuning is used
    model_lib.filter_trainable_variables(FLAGS.trainable_scopes)

    # Setup the moving averages
    if FLAGS.moving_average_decay and (FLAGS.moving_average_decay > 0):
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          tf.contrib.framework.get_model_variables())
      variables_to_restore[global_step.op.name] = global_step
    else:
      variables_to_restore = tf.contrib.framework.get_variables_to_restore()

    # Setup evaluation metric
    with tf.name_scope('Eval'):
      names_to_values, names_to_updates = (
          tf.contrib.metrics.aggregate_metric_map({
              'Accuracy': tf.metrics.accuracy(labels, tf.argmax(logits, 1)),
              'Top5': tf.metrics.recall_at_k(tf.to_int64(labels), logits, 5)
          }))

      # items() works on both Python 2 and 3; iteritems() is Python 2 only.
      for name, value in names_to_values.items():
        tf.summary.scalar(name, value)

    # Materialize the update ops as a list: on Python 3, dict.values() is a
    # view object rather than a list.
    eval_ops = list(names_to_updates.values())

    # Run evaluation
    num_batches = int(num_examples / batch_size)
    if FLAGS.train_dir:
      output_dir = os.path.join(FLAGS.train_dir, FLAGS.eval_name)
      if not tf.gfile.Exists(output_dir):
        tf.gfile.MakeDirs(output_dir)
      tf.contrib.training.evaluate_repeatedly(
          FLAGS.train_dir,
          master=FLAGS.master,
          scaffold=tf.train.Scaffold(
              saver=tf.train.Saver(variables_to_restore)),
          eval_ops=eval_ops,
          eval_interval_secs=FLAGS.eval_interval_secs,
          hooks=[
              tf.contrib.training.StopAfterNEvalsHook(num_batches),
              tf.contrib.training.SummaryAtEndHook(output_dir),
              tf.train.LoggingTensorHook(names_to_values, at_end=True),
          ],
          max_number_of_evaluations=1 if FLAGS.eval_once else None)
    else:
      result = tf.contrib.training.evaluate_once(
          FLAGS.checkpoint_path,
          master=FLAGS.master,
          scaffold=tf.train.Scaffold(
              saver=tf.train.Saver(variables_to_restore)),
          eval_ops=eval_ops,
          final_ops=names_to_values,
          hooks=[
              tf.contrib.training.StopAfterNEvalsHook(num_batches),
              tf.train.LoggingTensorHook(names_to_values, at_end=True),
          ])
      if FLAGS.output_file:
        # Append one CSV line per evaluation run.
        with tf.gfile.Open(FLAGS.output_file, 'a') as f:
          f.write('%s,%.3f,%.3f\n'
                  % (FLAGS.eval_name, result['Accuracy'], result['Top5']))
Пример #22
0
def main(_):
    """Evaluate a slim classification model on a dataset split.

    Everything is configured through ``FLAGS``: dataset, model,
    preprocessing, batch size, checkpoint location and output directory.
    Reports the streaming 'Accuracy' and 'Recall_5' metrics.

    Args:
        _: unused positional argument.

    Raises:
        ValueError: if ``--dataset_dir`` was not supplied.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        global_step = slim.get_or_create_global_step()

        # --- Dataset -------------------------------------------------------
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        # --- Model ---------------------------------------------------------
        num_classes = dataset.num_classes - FLAGS.labels_offset
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name, num_classes=num_classes, is_training=False)

        # --- Data provider reading from the dataset, in order --------------
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        raw_image, label = data_provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # --- Preprocessing (defaults to the one named after the model) -----
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        preprocess_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        # Fall back to the network's default input size.
        eval_image_size = (FLAGS.eval_image_size
                           or network_fn.default_image_size)

        image = preprocess_fn(raw_image, eval_image_size, eval_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        # --- Build the network ---------------------------------------------
        logits, _ = network_fn(images)

        if not FLAGS.moving_average_decay:
            variables_to_restore = slim.get_variables_to_restore()
        else:
            # Restore the moving-average versions of the model variables,
            # plus the global step.
            ema = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
            variables_to_restore = ema.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[global_step.op.name] = global_step

        print(slim.get_model_variables())

        predictions = tf.argmax(logits, 1)
        labels = tf.squeeze(labels)

        # --- Metrics -------------------------------------------------------
        metric_map = {
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            'Recall_5':
            slim.metrics.streaming_sparse_recall_at_k(logits, labels, 5),
        }
        names_to_values, names_to_updates = (
            slim.metrics.aggregate_metric_map(metric_map))

        # Print the summaries to screen.
        for metric_name, metric_value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            summary_op = tf.summary.scalar(
                summary_name, metric_value, collections=[])
            summary_op = tf.Print(summary_op, [metric_value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary_op)

        # TODO(sguada) use num_epochs=1
        # Without an explicit limit, make a single pass over all of the data.
        num_batches = (FLAGS.max_num_batches if FLAGS.max_num_batches else
                       math.ceil(dataset.num_samples /
                                 float(FLAGS.batch_size)))

        # A directory means "use its most recent checkpoint".
        checkpoint_path = (
            tf.train.latest_checkpoint(FLAGS.checkpoint_path)
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
            else FLAGS.checkpoint_path)

        tf.logging.info('Evaluating %s' % checkpoint_path)

        update_ops = list(names_to_updates.values())
        slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=update_ops,
            variables_to_restore=variables_to_restore)
Пример #23
0
def main(_):
  """Build the detection training graph and hand it to `slim.learning.train`.

  Everything is configured through `FLAGS` and the module-level `config`
  object.  The graph is assembled in this order: deployment config and global
  step, dataset and network selection, the input pipeline (provider ->
  preprocessing -> annotation encoding -> batch -> prefetch queue), the
  per-clone model and losses, summaries, optional moving averages, the
  optimizer and finally the train op.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if `FLAGS.dataset_dir` is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
      num_clones=FLAGS.num_clones,
      clone_on_cpu=FLAGS.clone_on_cpu,
      replica_id=FLAGS.task,
      num_replicas=FLAGS.worker_replicas,
      num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
      FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
      FLAGS.model_name,
      num_classes=(dataset.num_classes - FLAGS.labels_offset),
      weight_decay=FLAGS.weight_decay,
      is_training=True,
      width_multiplier=FLAGS.width_multiplier)

    #####################################
    # Select the preprocessing function #
    #####################################
    # Falls back to the model name when no explicit preprocessing is requested.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
      preprocessing_name,
      is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        num_readers=FLAGS.num_readers,
        common_queue_capacity=20 * FLAGS.batch_size,
        common_queue_min=10 * FLAGS.batch_size)

      # gt_bboxes format [ymin, xmin, ymax, xmax]
      # NOTE(review): img_shape is fetched but only referenced by the
      # commented-out scale_bboxes call below.
      [image, img_shape, gt_labels, gt_bboxes] = provider.get(['image', 'shape',
                                                               'object/label',
                                                               'object/bbox'])

      # Preprocessing
      # gt_bboxes = scale_bboxes(gt_bboxes, img_shape)  # bboxes format [0,1) for tf draw

      image, gt_labels, gt_bboxes = image_preprocessing_fn(image,
                                                           config.IMG_HEIGHT,
                                                           config.IMG_WIDTH,
                                                           labels=gt_labels,
                                                           bboxes=gt_bboxes,
                                                           )

      #############################################
      # Encode annotations for losses computation #
      #############################################

      # anchors format [cx, cy, w, h]
      anchors = tf.convert_to_tensor(config.ANCHOR_SHAPE, dtype=tf.float32)

      # encode annos, box_input format [cx, cy, w, h]
      input_mask, labels_input, box_delta_input, box_input = encode_annos(gt_labels,
                                                                          gt_bboxes,
                                                                          anchors,
                                                                          config.NUM_CLASSES)

      # Batch the image together with its encoded targets; the "b_" prefix marks
      # the batched counterpart of each per-example tensor.
      images, b_input_mask, b_labels_input, b_box_delta_input, b_box_input = tf.train.batch(
        [image, input_mask, labels_input, box_delta_input, box_input],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

      # Each clone dequeues its own batch from this queue inside clone_fn.
      batch_queue = slim.prefetch_queue.prefetch_queue(
        [images, b_input_mask, b_labels_input, b_box_delta_input, b_box_input], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, b_input_mask, b_labels_input, b_box_delta_input, b_box_input = batch_queue.dequeue()
      # The anchor tensor is rebuilt here rather than reusing the one created
      # for the input pipeline above.
      anchors = tf.convert_to_tensor(config.ANCHOR_SHAPE, dtype=tf.float32)
      end_points = network_fn(images)
      # "viz_*" entries are stored for later use and are skipped by the
      # activation summaries further down.
      end_points["viz_images"] = images
      conv_ds_14 = end_points['MobileNet/conv_ds_14/depthwise_conv']
      dropout = slim.dropout(conv_ds_14, keep_prob=0.5, is_training=True)
      # One group of (NUM_CLASSES + 1 + 4) output channels per anchor.
      # NOTE(review): presumably class scores, one confidence value and four box
      # deltas -- confirm against interpre_prediction.
      num_output = config.NUM_ANCHORS * (config.NUM_CLASSES + 1 + 4)
      predict = slim.conv2d(dropout, num_output, kernel_size=(3, 3), stride=1, padding='SAME',
                            activation_fn=None,
                            weights_initializer=tf.truncated_normal_initializer(stddev=0.0001),
                            scope="MobileNet/conv_predict")

      # `scope` is bound but not used in either name_scope block below.
      with tf.name_scope("Interpre_prediction") as scope:
        pred_box_delta, pred_class_probs, pred_conf, ious, det_probs, det_boxes, det_class = \
          interpre_prediction(predict, b_input_mask, anchors, b_box_input)
        end_points["viz_det_probs"] = det_probs
        end_points["viz_det_boxes"] = det_boxes
        end_points["viz_det_class"] = det_class

      with tf.name_scope("Losses") as scope:
        # The return value is discarded.
        # NOTE(review): presumably losses() registers its terms in the LOSSES
        # collection, which is read below -- confirm.
        losses(b_input_mask, b_labels_input, ious, b_box_delta_input, pred_class_probs, pred_conf, pred_box_delta)

      return end_points

    # Gather initial summaries.
    # Snapshot taken before the clones exist; clone summaries are merged in
    # near the end of this function.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      if end_point not in ["viz_images", "viz_det_probs", "viz_det_boxes", "viz_det_class"]:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                        tf.nn.zero_fraction(x)))

    # Add summaries for det result TODO(shizehao): visualize prediction


    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
        FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
        opt=optimizer,
        replicas_to_aggregate=FLAGS.replicas_to_aggregate,
        variable_averages=variable_averages,
        variables_to_average=moving_average_variables,
        replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
        total_num_replicas=FLAGS.worker_replicas)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # Compute the total loss and the per-variable gradients across all clones.
    total_loss, clones_gradients = model_deploy.optimize_clones(
      clones,
      optimizer,
      var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    # Group every update op and make total_loss depend on the group, so that
    # evaluating train_tensor runs the updates and yields the loss value.
    update_op = tf.group(*update_ops)
    train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                      name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
      train_tensor,
      logdir=FLAGS.train_dir,
      master=FLAGS.master,
      is_chief=(FLAGS.task == 0),
      init_fn=_get_init_fn(),
      summary_op=summary_op,
      number_of_steps=FLAGS.max_number_of_steps,
      log_every_n_steps=FLAGS.log_every_n_steps,
      save_summaries_secs=FLAGS.save_summaries_secs,
      save_interval_secs=FLAGS.save_interval_secs,
      sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #24
0
def prefetch_test(opt):
    """Run the detector over the 'test' split for one or more checkpoints.

    When ``opt.load_model`` is non-empty, only that checkpoint is evaluated
    (a single pass with ``model_id`` 0).  Otherwise the checkpoints
    ``model_100.pth`` .. ``model_140.pth`` under the save directory are
    evaluated in turn.

    For every checkpoint the per-image ``results_rel`` outputs are collected;
    with ``opt.test_with_eval`` they are scored and the best-scoring
    checkpoint is tracked, and with ``opt.save_predictions`` they are written
    to ``predictions_model_<id>.json``.  The best predictions are finally
    saved as ``best_predictions.json``.

    Args:
        opt: parsed option object.  Its ``load_model`` attribute is
            overwritten with each swept checkpoint path.

    Raises:
        ValueError: if ``opt.test_with_eval`` is set and ``opt.dataset`` does
            not contain 'hico', 'vcoco' or 'hoia'.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str

    Dataset = get_dataset(opt.dataset)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    print(opt)
    Logger(opt)
    Detector = detector_factory[opt.task]
    dataset = Dataset(opt, 'test')

    # Decide once whether the caller pinned a checkpoint.  Re-testing
    # `opt.load_model == ''` inside the loop is wrong because the loop itself
    # fills that attribute in, which froze the sweep on the first checkpoint.
    explicit_model = opt.load_model != ''
    if explicit_model:
        model_begin = model_end = 0
    else:
        model_begin, model_end = 100, 140

    # Output/checkpoint directory; a trailing 'TEST' on save_dir is stripped.
    # Resolved before the loop so it is also defined when an explicit
    # checkpoint is given (previously a NameError when saving predictions).
    if opt.save_dir.endswith('TEST'):
        model_path = opt.save_dir[:-4]
    else:
        model_path = opt.save_dir

    best = {'best_id': 0, 'best_map': 0}
    best_output = []

    for model_id in range(model_begin, model_end + 1):
        if not explicit_model:
            opt.load_model = os.path.join(model_path,
                                          'model_' + str(model_id) + '.pth')
        detector = Detector(opt)

        data_loader = torch.utils.data.DataLoader(
            PrefetchDataset(opt, dataset, detector.pre_process),
            batch_size=1,
            shuffle=False,
            num_workers=1,
            pin_memory=True)

        num_iters = len(dataset)
        print("----epoch :{} -----".format(model_id))
        bar = Bar('{}'.format(opt.exp_id), max=num_iters)
        time_stats = ['tot', 'load', 'pre', 'net', 'dec', 'post', 'merge']
        avg_time_stats = {t: AverageMeter() for t in time_stats}
        output_hoi = []

        for ind, (img_id, pre_processed_images) in enumerate(data_loader):
            ret = detector.run(pre_processed_images)
            output_i = ret['results_rel'].copy()
            output_i['file_name'] = dataset.hoi_annotations[int(
                img_id)]['file_name']
            output_hoi.append(output_i)
            # NOTE(review): the suffix is assigned on the Bar class rather
            # than on the `bar` instance; kept as in the original.
            Bar.suffix = '[{0}/{1}]|Tot: {total:} |ETA: {eta:} '.format(
                ind, num_iters, total=bar.elapsed_td, eta=bar.eta_td)
            for t in avg_time_stats:
                avg_time_stats[t].update(ret[t])
                Bar.suffix = Bar.suffix + '|{} {tm.val:.3f}s ({tm.avg:.3f}s) '.format(
                    t, tm=avg_time_stats[t])
            bar.next()
        bar.finish()

        if opt.test_with_eval:
            if 'hico' in opt.dataset:
                hoi_eval = hico(
                    os.path.join(opt.root_path,
                                 'hico_det/annotations/test_hico.json'))
            elif 'vcoco' in opt.dataset:
                hoi_eval = vcoco(
                    os.path.join(opt.root_path,
                                 'verbcoco/annotations/test_vcoco.json'))
            elif 'hoia' in opt.dataset:
                hoi_eval = hoia(
                    os.path.join(opt.root_path,
                                 'hoia/annotations/test_hoia.json'))
            else:
                # Previously fell through with `hoi_eval` unbound.
                raise ValueError(
                    'No evaluator available for dataset %r' % (opt.dataset,))
            # Named `mean_ap` to avoid shadowing the builtin `map`.
            mean_ap = hoi_eval.evalution(output_hoi)
            if mean_ap > best['best_map']:
                best['best_map'] = mean_ap
                best['best_id'] = model_id
                best_output = output_hoi

        if opt.save_predictions:
            save_json(output_hoi, model_path,
                      'predictions_model_' + str(model_id) + '.json')

    if opt.test_with_eval:
        print('best model id: {}, best map: {}'.format(best['best_id'],
                                                       best['best_map']))
        save_json(best_output, model_path, 'best_predictions.json')
Пример #25
0
def train():
    """Train the "color" classifier until ``FLAGS.max_steps`` is reached.

    Builds the CPU-side input pipeline for the "color" dataset, the model via
    ``colors.inference``, loss and accuracy via ``colors.loss`` and the
    training op via ``colors.train``, then runs them in a
    ``MonitoredTrainingSession`` that checkpoints into ``FLAGS.train_dir``.
    Whenever the manually incremented global step is a multiple of 1000, the
    step and the first value of the 'learning_rate' collection are appended
    to ``result.txt``.
    """
    # Input-pipeline constants (values unchanged, previously inlined literals).
    batch_size = 12
    num_threads = 4

    with tf.Graph().as_default():
        global_step = tf.train.get_or_create_global_step()

        # Force input pipeline to CPU:0 to avoid operations sometimes ending
        # up on GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            dataset = dataset_factory.get_dataset("color", "train",
                                                  "D:/colors")
            examples_per_shard = 1024
            min_queue_examples = examples_per_shard * 50
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=8,
                common_queue_capacity=min_queue_examples + 3 * batch_size,
                common_queue_min=min_queue_examples)
            [image, label] = provider.get(['image', 'label'])

            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                "color", is_training=True)
            image = image_preprocessing_fn(image, 24, 24)

            images, labels = tf.train.shuffle_batch(
                [image, label],
                batch_size=batch_size,
                num_threads=num_threads,
                capacity=2 * num_threads * batch_size,
                min_after_dequeue=48)

            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=16 * 1, num_threads=4)
            images, labels = batch_queue.dequeue()

        # Build a Graph that computes the logits predictions from the
        # inference model.
        logits = colors.inference(images)

        # Calculate loss and accuracy.
        loss, accuracy = colors.loss(logits, labels)

        # Build a Graph that trains the model with one batch of examples and
        # updates the model parameters.
        train_op = colors.train(loss, global_step)

        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss, batch accuracy and throughput every N runs."""

            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                # Only the values that are actually printed are fetched.
                return tf.train.SessionRunArgs([loss, accuracy])

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency != 0:
                    return
                current_time = time.time()
                duration = current_time - self._start_time
                self._start_time = current_time

                loss_value, acc_value = run_values.results
                # NOTE(review): throughput uses FLAGS.batch_size although the
                # pipeline above batches 12 examples -- confirm they agree.
                examples_per_sec = (FLAGS.log_frequency * FLAGS.batch_size /
                                    duration)
                sec_per_batch = float(duration / FLAGS.log_frequency)

                format_str = (
                    '%s: step %d, loss = %.2f,  batch_accuracy=%.4f   (%.1f examples/sec; %.3f '
                    'sec/batch)')
                print(format_str %
                      (datetime.now(), self._step, loss_value, acc_value,
                       examples_per_sec, sec_per_batch))

        config = tf.ConfigProto(
            log_device_placement=FLAGS.log_device_placement)
        config.gpu_options.allow_growth = True
        # NOTE(review): colors.train() also receives global_step; if it
        # already increments it, this extra op doubles the count -- confirm.
        add_global = global_step.assign_add(1)

        # NOTE(review): var_list is not consumed anywhere in this function;
        # presumably it was meant for a Saver -- confirm before removing.
        var_list = tf.trainable_variables()
        g_list = tf.global_variables()
        bn_moving_vars = [g for g in g_list if 'moving_mean' in g.name]
        bn_moving_vars += [g for g in g_list if 'moving_variance' in g.name]
        var_list += bn_moving_vars

        # Looked up once; the collection content does not change per step.
        lr_tensors = tf.get_collection('learning_rate')

        # The result file is managed by the same `with` so it is flushed and
        # closed even when training stops through an exception.
        with tf.train.MonitoredTrainingSession(
                save_checkpoint_secs=60,
                checkpoint_dir=FLAGS.train_dir,
                hooks=[
                    tf.train.StopAtStepHook(last_step=FLAGS.max_steps),
                    tf.train.NanTensorHook(loss),
                    tf.train.SummarySaverHook(
                        save_steps=1000,
                        output_dir=FLAGS.train_dir,
                        summary_op=tf.summary.merge_all()),
                    _LoggerHook()
                ],
                config=config) as mon_sess, \
                open("result.txt", 'a+') as f:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

                step = mon_sess.run(add_global)
                if step % 1000 == 0:
                    lr = mon_sess.run(lr_tensors)
                    f.write("step %d-----------------------------" % step)
                    f.write("lr>>%.5f        " % lr[0])
Пример #26
0
    def train_input_fn():
        """Build the queue-based training input pipeline.

        Returns:
            A tuple ``(b_image, targets)`` where ``targets`` is a dict with
            the keys 'b_gclasses', 'b_glocalisations' and 'b_gscores', all
            produced by ``tf.train.batch`` with ``FLAGS.batch_size``.
        """
        # Dataset selection and configuration dump.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.data_dir)
        tf_utils.print_configuration(
            FLAGS.__flags, ron_params, dataset.data_sources, FLAGS.model_dir)

        # Shuffling provider that reads the raw examples.
        provider_scope = FLAGS.dataset_name + '_data_provider'
        with tf.name_scope(provider_scope):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=120 * FLAGS.batch_size,
                common_queue_min=80 * FLAGS.batch_size,
                shuffle=True)

        # Items for the RON network; gbboxes are (ymin, xmin, ymax, xmax).
        requested_items = [
            'image', 'shape', 'object/label', 'object/bbox', 'object/difficult'
        ]
        image, shape, glabels, gbboxes, isdifficult = provider.get(
            requested_items)

        # Count the entries whose 'object/difficult' value differs from one.
        all_ones = tf.ones_like(isdifficult)
        differs_from_one = tf.logical_not(tf.equal(all_ones, isdifficult))
        num_differing = tf.reduce_sum(tf.cast(differs_from_one, tf.float32))

        def _first_entry_only():
            # Boolean vector that is True at index 0 only.
            return tf.one_hot(0,
                              tf.shape(isdifficult)[0],
                              on_value=True,
                              off_value=False,
                              dtype=tf.bool)

        def _below_one():
            # True wherever the 'object/difficult' value is below one.
            return isdifficult < tf.ones_like(isdifficult)

        # When no entry differs from one, fall back to keeping the first entry
        # (presumably so that at least one box survives -- confirm).
        isdifficult_mask = tf.cond(num_differing < 1., _first_entry_only,
                                   _below_one)

        glabels = tf.boolean_mask(glabels, isdifficult_mask)
        gbboxes = tf.boolean_mask(gbboxes, isdifficult_mask)

        # Preprocessing defaults to the one registered under the model name.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)
        image, glabels, gbboxes = image_preprocessing_fn(
            image,
            glabels,
            gbboxes,
            out_shape=ron_shape,
            data_format=DATA_FORMAT)

        # Encode the ground truth against the anchors:
        #   gclasses       - ground-truth label
        #   glocalisations - regression target
        #   gscores        - jaccard score with the ground truth
        gclasses, glocalisations, gscores = ron_net.bboxes_encode(
            glabels,
            gbboxes,
            ron_anchors,
            positive_threshold=FLAGS.match_threshold,
            ignore_threshold=FLAGS.neg_threshold)

        # Flatten, batch, then regroup: one image tensor followed by
        # len(ron_anchors) tensors for each of the three encoded targets.
        flat_inputs = tf_utils.reshape_list(
            [image, gclasses, glocalisations, gscores])
        batched = tf.train.batch(flat_inputs,
                                 batch_size=FLAGS.batch_size,
                                 num_threads=FLAGS.num_preprocessing_threads,
                                 capacity=120 * FLAGS.batch_size,
                                 shared_name=None)
        batch_shape = [1] + 3 * [len(ron_anchors)]
        b_image, b_gclasses, b_glocalisations, b_gscores = (
            tf_utils.reshape_list(batched, batch_shape))

        targets = {
            'b_gclasses': b_gclasses,
            'b_glocalisations': b_glocalisations,
            'b_gscores': b_gscores
        }
        return b_image, targets
Пример #27
0
def main():
    args = parse_args()
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)

    tf.logging.info('Using Config:')
    pprint.pprint(cfg)

    train_dir = get_output_dir(
        'default' if args.cfg_file is None else args.cfg_file)
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPUS
    num_clones = len(cfg.GPUS.split(','))

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        ######################
        # Config model_deploy#
        ######################
        tf.set_random_seed(cfg.RNG_SEED)
        deploy_config = model_deploy.DeploymentConfig(num_clones=num_clones,
                                                      clone_on_cpu=False,
                                                      replica_id=0,
                                                      num_replicas=1,
                                                      num_ps_tasks=0)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        kwargs = {}
        if cfg.TRAIN.VIDEO_FRAMES_PER_VIDEO > 1:
            kwargs['num_samples'] = cfg.TRAIN.VIDEO_FRAMES_PER_VIDEO
            kwargs['randomFromSegmentStyle'] = cfg.TRAIN.READ_SEGMENT_STYLE
            kwargs['modality'] = cfg.INPUT.VIDEO.MODALITY
            kwargs['split_id'] = cfg.INPUT.SPLIT_ID
        if cfg.DATASET_LIST_DIR != '':
            kwargs['dataset_list_dir'] = cfg.DATASET_LIST_DIR
        if cfg.INPUT_FILE_STYLE_LABEL != '':
            kwargs['input_file_style_label'] = cfg.INPUT_FILE_STYLE_LABEL
        dataset, num_pose_keypoints = dataset_factory.get_dataset(
            cfg.DATASET_NAME, cfg.TRAIN.DATASET_SPLIT_NAME, cfg.DATASET_DIR,
            **kwargs)

        ####################
        # Select the network #
        ####################
        network_fn = nets_factory.get_network_fn(
            cfg.MODEL_NAME,
            num_classes=(dataset.num_classes),
            num_pose_keypoints=num_pose_keypoints,
            weight_decay=cfg.TRAIN.WEIGHT_DECAY,
            is_training=True,
            cfg=cfg)  # advanced network creation controlled with cfg.NET

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = cfg.MODEL_NAME
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=cfg.NUM_READERS,
                common_queue_capacity=20 * cfg.TRAIN.BATCH_SIZE,
                common_queue_min=10 * cfg.TRAIN.BATCH_SIZE)

            [image, pose_label_hmap, pose_label_valid,
             action_label] = train_preprocess_pipeline(provider, cfg,
                                                       network_fn,
                                                       num_pose_keypoints,
                                                       image_preprocessing_fn)
            # input_data = [preprocess_pipeline(
            #   provider, cfg, network_fn, num_pose_keypoints, image_preprocessing_fn)
            #   for _ in range(cfg.NUM_PREPROCESSING_THREADS)]

            images, pose_labels_hmap, pose_labels_valid, action_labels = tf.train.batch(
                [image, pose_label_hmap, pose_label_valid, action_label],
                # input_data,
                batch_size=cfg.TRAIN.BATCH_SIZE,
                num_threads=cfg.NUM_PREPROCESSING_THREADS,
                capacity=5 * cfg.TRAIN.BATCH_SIZE)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, pose_labels_hmap, pose_labels_valid, action_labels],
                capacity=5 * deploy_config.num_clones * cfg.TRAIN.ITER_SIZE)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels_pose, labels_pose_valid, labels_action = batch_queue.dequeue(
            )
            # due to the multi-frame/video thing, need to squeeze first 2 dimensions
            labels_pose = tf.concat(tf.unstack(labels_pose), axis=0)
            labels_pose_valid = tf.concat(tf.unstack(labels_pose_valid),
                                          axis=0)
            logits, end_points = network_fn(images)
            pose_logits = end_points['PoseLogits']

            #############################
            # Specify the loss function #
            #############################
            # if 'AuxLogits' in end_points:
            #   slim.losses.softmax_cross_entropy(
            #       end_points['AuxLogits'], labels,
            #       label_smoothing=cfg.TRAIN.LABEL_SMOOTHING, weight=0.4, scope='aux_loss')
            # slim.losses.softmax_cross_entropy(
            #     logits, labels, label_smoothing=cfg.TRAIN.LABEL_SMOOTHING, weight=1.0)
            end_points['Images'] = images
            end_points['PoseLabels'] = labels_pose
            end_points['ActionLabels'] = labels_action
            end_points['ActionLogits'] = logits
            tf.logging.info('PoseLogits shape is {}.'.format(
                pose_logits.get_shape().as_list()))

            gen_losses(labels_action, logits, cfg.TRAIN.LOSS_FN_ACTION,
                       dataset.num_classes, cfg.TRAIN.LOSS_FN_ACTION_WT,
                       labels_pose, pose_logits, cfg.TRAIN.LOSS_FN_POSE,
                       labels_pose_valid, cfg.TRAIN.LOSS_FN_POSE_WT,
                       end_points, cfg)

            return end_points

        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs

        # store the end points in a global variable for debugging in train_step
        global end_points_debug
        end_points_debug = end_points

        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            # summaries.add(tf.summary.scalar(tf.nn.zero_fraction(x),
            #                                 name='sparsity/' + end_point))
        sum_img = tf.concat(tf.unstack(end_points['Images']), axis=0)
        if sum_img.get_shape().as_list()[-1] not in [1, 3, 4]:
            sum_img = tf.reduce_sum(sum_img, axis=-1, keep_dims=True)
            sum_img = sum_img - tf.reduce_min(sum_img)
            sum_img = sum_img / (tf.reduce_max(sum_img) + cfg.EPS)
        summaries.add(tf.summary.image('images', sum_img))
        for epname in cfg.TRAIN.OTHER_IMG_SUMMARIES_TO_ADD:
            if epname in end_points:
                summaries.add(
                    tf.summary.image('image_vis/' + epname,
                                     end_points[epname]))
        summaries = summaries.union(
            _summarize_heatmaps('labels', end_points['PoseLabels'], sum_img))
        summaries = summaries.union(
            _summarize_heatmaps('pose', end_points['PoseLogits'], sum_img))
        if 'PoseLossMask' in end_points:
            summaries = summaries.union(
                _summarize_heatmaps('loss_mask/pose',
                                    end_points['PoseLossMask'], sum_img))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(
                tf.summary.scalar(tensor=loss,
                                  name='losses/%s' % loss.op.name))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if cfg.TRAIN.MOVING_AVERAGE_VARIABLES:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                cfg.TRAIN.MOVING_AVERAGE_VARIABLES, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     num_clones, global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(
                tf.summary.scalar(tensor=learning_rate, name='learning_rate'))

        # if cfg.sync_replicas:
        #   # If sync_replicas is enabled, the averaging will be done in the chief
        #   # queue runner.
        #   optimizer = tf.train.SyncReplicasOptimizer(
        #       opt=optimizer,
        #       replicas_to_aggregate=,
        #       variable_averages=variable_averages,
        #       variables_to_average=moving_average_variables,
        #       replica_id=tf.constant(cfg.task, tf.int32, shape=()),
        #       total_num_replicas=cfg.worker_replicas)
        # elif cfg.moving_average_decay:
        #   # Update ops executed locally by trainer.
        #   update_ops.append(variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()
        tf.logging.info('Training the following variables: {}'.format(
            ', '.join([var.op.name for var in variables_to_train])))

        #  and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones,
            optimizer,
            var_list=variables_to_train,
            clip_gradients=cfg.TRAIN.CLIP_GRADIENTS)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar(tensor=total_loss, name='total_loss'))

        # Create gradient updates.
        train_ops = {}
        if cfg.TRAIN.ITER_SIZE == 1:
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)

            update_op = tf.group(*update_ops)
            train_tensor = control_flow_ops.with_dependencies([update_op],
                                                              total_loss,
                                                              name='train_op')
            train_ops = train_tensor
        else:
            with tf.name_scope('AccumulateGradients'):
                # copied as is from my previous code
                gvs = [(grad, var) for grad, var in clones_gradients]
                varnames = [var.name for grad, var in gvs]
                varname_to_var = {var.name: var for grad, var in gvs}
                varname_to_grad = {var.name: grad for grad, var in gvs}
                varname_to_ref_grad = {}
                for vn in varnames:
                    grad = varname_to_grad[vn]
                    print("accumulating ... ", (vn, grad.get_shape()))
                    with tf.variable_scope("ref_grad"):
                        with tf.device(deploy_config.variables_device()):
                            ref_var = slim.local_variable(np.zeros(
                                grad.get_shape(), dtype=np.float32),
                                                          name=vn[:-2])
                            varname_to_ref_grad[vn] = ref_var

                all_assign_ref_op = [
                    ref.assign(varname_to_grad[vn])
                    for vn, ref in varname_to_ref_grad.items()
                ]
                all_assign_add_ref_op = [
                    ref.assign_add(varname_to_grad[vn])
                    for vn, ref in varname_to_ref_grad.items()
                ]
                assign_gradients_ref_op = tf.group(*all_assign_ref_op)
                accmulate_gradients_op = tf.group(*all_assign_add_ref_op)
                with tf.control_dependencies([accmulate_gradients_op]):
                    final_gvs = [(varname_to_ref_grad[var.name] /
                                  float(cfg.TRAIN.ITER_SIZE), var)
                                 for grad, var in gvs]
                    apply_gradients_op = optimizer.apply_gradients(
                        final_gvs, global_step=global_step)
                    update_ops.append(apply_gradients_op)
                    update_op = tf.group(*update_ops)
                    train_tensor = control_flow_ops.with_dependencies(
                        [update_op], total_loss, name='train_op')
                for i in range(cfg.TRAIN.ITER_SIZE):
                    if i == 0:
                        train_ops[i] = assign_gradients_ref_op
                    elif i < cfg.TRAIN.ITER_SIZE - 1:  # because apply_gradients also computes
                        # (see control_dependency), so
                        # no need of running an extra iteration
                        train_ops[i] = accmulate_gradients_op
                    else:
                        train_ops[i] = train_tensor

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        config.intra_op_parallelism_threads = 4  # to avoid too many threads
        # The following seems optimal... though not sure
        config.inter_op_parallelism_threads = max(
            cfg.NUM_PREPROCESSING_THREADS, 12)
        ###########################
        # Kicks off the training. #
        ###########################
        slim.learning.train(train_ops,
                            train_step_fn=_train_step,
                            logdir=train_dir,
                            master='',
                            is_chief=True,
                            init_fn=_get_init_fn(train_dir),
                            summary_op=summary_op,
                            number_of_steps=cfg.TRAIN.MAX_NUMBER_OF_STEPS,
                            log_every_n_steps=cfg.TRAIN.LOG_EVERY_N_STEPS,
                            save_summaries_secs=cfg.TRAIN.SAVE_SUMMARIES_SECS,
                            save_interval_secs=cfg.TRAIN.SAVE_INTERVAL_SECS,
                            sync_optimizer=None,
                            session_config=config)
Пример #28
0
def main(_):
    """Build a TF-Slim classification training graph and run training.

    Every setting is read from the module-level FLAGS. Network clones are
    created through model_deploy, summaries and the optimizer are configured,
    and the resulting train op is handed to learning.train together with an
    error-rate tensor.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If FLAGS.dataset_dir is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        #######################
        # Config model_deploy #
        #######################
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=FLAGS.task,
            num_replicas=FLAGS.worker_replicas,
            num_ps_tasks=FLAGS.num_ps_tasks)

        # Create global_step
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ######################
        # Select the network #
        ######################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            weight_decay=FLAGS.weight_decay,
            is_training=True)

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        with tf.device(deploy_config.inputs_device()):
            provider = slim.dataset_data_provider.DatasetDataProvider(
                dataset,
                num_readers=FLAGS.num_readers,
                common_queue_capacity=20 * FLAGS.batch_size,
                common_queue_min=10 * FLAGS.batch_size)
            [image, label] = provider.get(['image', 'label'])
            label -= FLAGS.labels_offset

            train_image_size = FLAGS.train_image_size or network_fn.default_image_size

            image = image_preprocessing_fn(image, train_image_size,
                                           train_image_size)

            images, labels = tf.train.batch(
                [image, label],
                batch_size=FLAGS.batch_size,
                num_threads=FLAGS.num_preprocessing_threads,
                capacity=5 * FLAGS.batch_size)
            labels = slim.one_hot_encoding(
                labels, dataset.num_classes - FLAGS.labels_offset)
            batch_queue = slim.prefetch_queue.prefetch_queue(
                [images, labels], capacity=2 * deploy_config.num_clones)

        ####################
        # Define the model #
        ####################
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple clones of network_fn."""
            images, labels = batch_queue.dequeue()
            logits, end_points = network_fn(images)

            # Publish this clone's logits/labels at module level so main() can
            # build the error-rate metric once the clones exist. Each call
            # overwrites the previous values.
            global logits_global
            global labels_global
            logits_global = logits
            labels_global = labels

            #############################
            # Specify the loss function #
            #############################
            if 'AuxLogits' in end_points:
                print('auxlogits')
                slim.losses.softmax_cross_entropy(
                    end_points['AuxLogits'],
                    labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weight=0.4,
                    scope='aux_loss')
            # The primary loss is registered for every network. It used to sit
            # inside the 'AuxLogits' branch, which left networks without an
            # auxiliary head with no classification loss at all.
            slim.losses.softmax_cross_entropy(
                logits,
                labels,
                label_smoothing=FLAGS.label_smoothing,
                weight=1.0)
            return end_points

        global logits_global
        global labels_global
        # Gather initial summaries.
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])

        print(logits_global)
        print(labels_global)

        # Labels are one-hot encoded above, so argmax turns both tensors back
        # into class indices.
        predictions = tf.squeeze(tf.argmax(logits_global, 1))
        labels = tf.squeeze(tf.argmax(labels_global, 1))

        # Despite its name this is 1 - accuracy, i.e. an error rate. Element
        # [1] of the streaming_accuracy result is used (presumably the update
        # op -- confirm against the slim.metrics version in use).
        accurancy = 1.0 - slim.metrics.streaming_accuracy(predictions,
                                                          labels)[1]
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.histogram_summary('activations/' + end_point, x))
            summaries.add(
                tf.scalar_summary('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))

        # Add summaries for losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.scalar_summary('losses/%s' % loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.histogram_summary(variable.op.name, variable))

        #################################
        # Configure the moving averages #
        #################################
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        #########################################
        # Configure the optimization procedure. #
        #########################################
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = _configure_learning_rate(dataset.num_samples,
                                                     global_step)
            optimizer = _configure_optimizer(learning_rate)
            summaries.add(
                tf.scalar_summary('learning_rate',
                                  learning_rate,
                                  name='learning_rate'))

        if FLAGS.sync_replicas:
            # If sync_replicas is enabled, the averaging will be done in the chief
            # queue runner.
            optimizer = tf.train.SyncReplicasOptimizer(
                opt=optimizer,
                replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                variable_averages=variable_averages,
                variables_to_average=moving_average_variables,
                replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                total_num_replicas=FLAGS.worker_replicas)
        elif FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = _get_variables_to_train()

        #  and returns a train_tensor and summary_op
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(
            tf.scalar_summary('total_loss', total_loss, name='total_loss'))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # Evaluating train_tensor runs every update op and yields total_loss.
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries from the first clone. These contain the summaries
        # created by model_fn and either optimize_clones() or _gather_clone_loss().
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

        # Merge all summaries together.
        summary_op = tf.merge_summary(list(summaries), name='summary_op')

        ###########################
        # Kicks off the training. #
        ###########################
        # NOTE(review): `accurancies` is not a parameter of stock
        # slim.learning.train; `learning` is presumably a locally modified
        # copy -- confirm.
        loss = learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            init_fn=_get_init_fn(),
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            log_every_n_steps=FLAGS.log_every_n_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            sync_optimizer=optimizer if FLAGS.sync_replicas else None,
            accurancies=accurancy)

        print('Training loss: ' + str(loss))
Пример #29
0
def main(_):
  """Build a TF-Slim classification training graph and run training.

  Every setting is read from the module-level FLAGS. Network clones are
  created through model_deploy, summaries, optional quantization rewriting
  and the optimizer are configured, and the resulting train op is handed to
  slim.learning.train.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If FLAGS.dataset_dir is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():  # Create the graph everything below lives in.
    #######################
    # Config model_deploy # Configure model deployment via model_deploy.
    #######################
    # Author's note (presumably the default flag values -- confirm):
    #   num_clones=1, clone_on_cpu=False,
    #   replica_id=0, worker_replicas=1, num_ps_tasks=0
    # This used to be a tab-indented bare string literal, which Python 3
    # rejects with a TabError inside this space-indented block.
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step.
    # tf.device() wraps Graph.device() for the default graph; its argument is
    # the device name or function to use in the context, e.g.
    # device='/device:CPU:0'.
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ###################### 'name','train'or'validation'or'test','/tmp/'
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      if 'AuxLogits' in end_points:
        slim.losses.softmax_cross_entropy(
            end_points['AuxLogits'], labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4,
            scope='aux_loss')
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    # A negative quantize_delay leaves the graph untouched.
    if FLAGS.quantize_delay >= 0:
      tf.contrib.quantize.create_training_graph(
          quant_delay=FLAGS.quantize_delay)

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    #  and returns a train_tensor and summary_op
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    # Evaluating train_tensor runs every update op and yields total_loss.
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def main(_):
    """Build the SSD training graph and run training with slim.learning.train.

    Every setting is read from the module-level FLAGS. As a side effect the
    CUDA_VISIBLE_DEVICES environment variable is overwritten. The variables in
    the list returned by ssd_net.net are initialised from
    FLAGS.checkpoint_path before training starts.

    Args:
        _: Unused positional argument.

    Raises:
        ValueError: If FLAGS.dataset_dir is empty.
    """
    # "-1" is set when training on CPU is requested (hides all CUDA devices).
    if FLAGS.train_on_cpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpu_device

    if not FLAGS.dataset_dir:
        raise ValueError(
            "You must supply the dataset directory with --dataset-dir.")

    tf.logging.set_verbosity(tf.logging.DEBUG)

    g = tf.Graph()
    with g.as_default():
        # select the dataset
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # create global step, used for optimizer moving average decay
        with tf.device("/cpu:0"):
            global_step = tf.train.create_global_step()

        # get the ssd network and its anchors
        ssd_cls = ssd.SSDnet
        ssd_params = ssd_cls.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_cls(ssd_params)
        image_size = ssd_net.params.img_shape

        ssd_anchors = ssd_net.anchors(img_shape=image_size)

        # select the preprocessing function
        preprocessing_name = FLAGS.preprocessing_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)

        # create a dataset provider and batches.
        with tf.device("/cpu:0"):
            with tf.name_scope(FLAGS.dataset_name + "_data_provider"):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
                # get for ssd network: image,labels,bboxes
                [image, shape, glabels, gbboxes] = provider.get(
                    ["image", "shape", "object/label", "object/bbox"])

                # preprocessing
                image, glabels, gbboxes = image_preprocessing_fn(
                    image,
                    glabels,
                    gbboxes,
                    out_shape=image_size,
                    data_format="NHWC")

                # encode groundtruth labels and bboxes
                gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                    glabels, gbboxes, ssd_anchors)
                # Layout passed to reshape_list: one image entry followed by
                # three groups (classes, localisations, scores) with one
                # entry per anchor layer.
                batch_shape = [1] + [len(ssd_anchors)] * 3

                # training batches and queue
                r = tf.train.batch(
                    tf_utils.reshape_list(
                        [image, gclasses, glocalisations, gscores]),
                    batch_size=FLAGS.batch_size,
                    num_threads=FLAGS.num_preprocessing_threads,
                    capacity=5 * FLAGS.batch_size)
                b_image, b_gclasses, b_glocalisations, b_gscores = \
                    tf_utils.reshape_list(r, batch_shape)

                # prefetch queue
                batch_queue = slim.prefetch_queue.prefetch_queue(
                    tf_utils.reshape_list(
                        [b_image, b_gclasses, b_glocalisations, b_gscores]),
                    capacity=8)

        # dequeue batch
        b_image, b_gclasses, b_glocalisations, b_gscores = \
            tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

        # gather initial summaries
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))
        arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay)
        with slim.arg_scope(arg_scope):
            predictions, localisations, logits, end_points, mobilenet_var_list = \
                ssd_net.net(b_image, is_training=True)

        # add loss function
        ssd_net.losses(logits,
                       localisations,
                       b_gclasses,
                       b_glocalisations,
                       b_gscores,
                       match_threshold=FLAGS.match_threshold,
                       negative_ratio=FLAGS.negative_ratio,
                       alpha=FLAGS.loss_alpha,
                       label_smoothing=FLAGS.label_smoothing)

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        # add summaries for end_points
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram("activations/" + end_point, x))
            summaries.add(
                tf.summary.scalar("sparsity/" + end_point,
                                  tf.nn.zero_fraction(x)))

        # add summaries for losses and extra losses
        for loss in tf.get_collection(tf.GraphKeys.LOSSES):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection("EXTRA_LOSSES"):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # add summaries for variables
        for var in slim.get_model_variables():
            summaries.add(tf.summary.histogram(var.op.name, var))

        # configure the moving averages
        if FLAGS.moving_average_decay:  # use moving average decay on weights variables
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # configure the optimization procedure
        with tf.device("/cpu:0"):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar("learning_rate", learning_rate))

        if FLAGS.moving_average_decay:
            # update ops executed by trainer
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # get variables to train
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # return a train tensor and summary op
        total_losses = tf.get_collection(tf.GraphKeys.LOSSES)
        total_loss = tf.add_n(total_losses, name="total_loss")
        summaries.add(tf.summary.scalar("total_loss", total_loss))

        # create gradient updates
        grads = optimizer.compute_gradients(total_loss,
                                            var_list=variables_to_train)
        grad_updates = optimizer.apply_gradients(grads,
                                                 global_step=global_step)
        update_ops.append(grad_updates)

        # create train op: evaluating it runs every update op and yields
        # total_loss
        update_op = tf.group(*update_ops)
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name="train_op")

        # merge all summaries together
        summary_op = tf.summary.merge(list(summaries), name="summary_op")

        # start training
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction,
            allow_growth=FLAGS.allow_growth)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=2,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)

        # create initial assignment op
        init_assign_op, init_feed_dict = slim.assign_from_checkpoint(
            FLAGS.checkpoint_path,
            mobilenet_var_list,
            ignore_missing_vars=FLAGS.ignore_missing_vars)

        # Find the feed-dict key for global_step, if the restored variable
        # list contains one. The last match wins, as in the original loop.
        g_step = None
        for feed_key in init_feed_dict:
            if "global_step" in feed_key.name:
                g_step = feed_key

        # Restart the step counter at zero. This is skipped when global_step
        # is not among the restored variables; previously that case raised
        # UnboundLocalError because g_step was never bound.
        if g_step is not None:
            init_feed_dict[g_step] = 0
        init_fn = lambda sess: sess.run(init_assign_op, init_feed_dict)

        # run training
        slim.learning.train(
            train_tensor,
            logdir=FLAGS.train_dir,
            init_fn=init_fn,
            summary_op=summary_op,
            number_of_steps=FLAGS.max_number_of_steps,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            session_config=config,
            saver=saver,
        )
Пример #31
0
def get_from_tfrecord(dataset_dir=r'D:\Data\VOC\train'):
    """Return the 'train' split of the 'pascalvoc_2007' dataset.

    Args:
        dataset_dir: Directory handed to ``dataset_factory.get_dataset``.
            The default is the location this helper previously hard-coded.

    Returns:
        Whatever ``dataset_factory.get_dataset`` returns for that split.
    """
    # The default is a raw string: the former literal mixed escaped and
    # unescaped backslashes and only worked because the unescaped ones
    # happened to be invalid escape sequences, which newer Python versions
    # warn about. The resulting path value is unchanged.
    return dataset_factory.get_dataset('pascalvoc_2007', 'train', dataset_dir)
Пример #32
0
def main(_):
    """Evaluate an SSD network on a dataset split and report loss/AP metrics.

    Builds, in a fresh graph, the input pipeline, the SSD network, its losses
    and the VOC07 / VOC12 average-precision metrics. It then either evaluates
    a single checkpoint (``FLAGS.wait_for_checkpoints`` false) or repeatedly
    evaluates ``FLAGS.checkpoint_path`` in a waiting loop.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``--dataset_dir`` is empty, or if
            ``FLAGS.checkpoint_path`` is a directory in which
            ``tf.train.latest_checkpoint`` finds no checkpoint.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        # =================================================================== #
        # Dataset + SSD model + Pre-processing
        # =================================================================== #
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)

        # Evaluation shape and associated anchors: eval_image_size
        ssd_shape = ssd_net.params.img_shape
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function; it defaults to the one named
        # after the model when no explicit name is given.
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.eval_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device('/cpu:0'):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    common_queue_capacity=2 * FLAGS.batch_size,
                    common_queue_min=FLAGS.batch_size,
                    shuffle=False)
            # Get for SSD network: image, labels, bboxes.
            [image, shape, glabels, gbboxes] = provider.get(
                ['image', 'shape', 'object/label', 'object/bbox'])
            if FLAGS.remove_difficult:
                [gdifficults] = provider.get(['object/difficult'])
            else:
                # No object is flagged as difficult when the flag is off.
                gdifficults = tf.zeros(tf.shape(glabels), dtype=tf.int64)

            # Pre-processing image, labels and bboxes.
            image, glabels, gbboxes, gbbox_img = \
                image_preprocessing_fn(image, glabels, gbboxes,
                                       out_shape=ssd_shape,
                                       data_format=DATA_FORMAT,
                                       resize=FLAGS.eval_resize,
                                       difficults=None)

            # Encode groundtruth labels and bboxes.
            gclasses, glocalisations, gscores = \
                ssd_net.bboxes_encode(glabels, gbboxes, ssd_anchors)
            # Five single tensors followed by three per-anchor-layer lists;
            # used to un-flatten the batched tensors below.
            batch_shape = [1] * 5 + [len(ssd_anchors)] * 3

            # Evaluation batch.
            r = tf.train.batch(tf_utils.reshape_list([
                image, glabels, gbboxes, gdifficults, gbbox_img, gclasses,
                glocalisations, gscores
            ]),
                               batch_size=FLAGS.batch_size,
                               num_threads=FLAGS.num_preprocessing_threads,
                               capacity=5 * FLAGS.batch_size,
                               dynamic_pad=True)
            (b_image, b_glabels, b_gbboxes, b_gdifficults, b_gbbox_img,
             b_gclasses, b_glocalisations,
             b_gscores) = tf_utils.reshape_list(r, batch_shape)

        # =================================================================== #
        # SSD Network + Outputs decoding.
        # =================================================================== #
        arg_scope = ssd_net.arg_scope(data_format=DATA_FORMAT)
        with slim.arg_scope(arg_scope):
            predictions, localisations, logits, end_points = \
                ssd_net.net(b_image, is_training=False)
        # Add losses functions.
        ssd_net.losses(logits, localisations, b_gclasses, b_glocalisations,
                       b_gscores)

        # Performing post-processing on CPU: loop-intensive, usually more efficient.
        with tf.device('/device:CPU:0'):
            # Detected objects from SSD output.
            localisations = ssd_net.bboxes_decode(localisations, ssd_anchors)
            rscores, rbboxes = \
                ssd_net.detected_bboxes(predictions, localisations,
                                        select_threshold=FLAGS.select_threshold,
                                        nms_threshold=FLAGS.nms_threshold,
                                        clipping_bbox=None,
                                        top_k=FLAGS.select_top_k,
                                        keep_top_k=FLAGS.keep_top_k)
            # Compute TP and FP statistics.
            num_gbboxes, tp, fp, rscores = \
                tfe.bboxes_matching_batch(rscores.keys(), rscores, rbboxes,
                                          b_glabels, b_gbboxes, b_gdifficults,
                                          matching_threshold=FLAGS.matching_threshold)

        # Variables to restore: moving avg. or normal weights.
        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        # =================================================================== #
        # Evaluation metrics.
        # =================================================================== #
        with tf.device('/device:CPU:0'):
            dict_metrics = {}
            # First add all losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES):
                dict_metrics[loss.op.name] = slim.metrics.streaming_mean(loss)
            # Extra losses as well.
            for loss in tf.get_collection('EXTRA_LOSSES'):
                dict_metrics[loss.op.name] = slim.metrics.streaming_mean(loss)

            # Add metrics to summaries.
            for name, metric in dict_metrics.items():
                summary_name = name
                op = tf.summary.scalar(summary_name, metric[0], collections=[])
                tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

            # FP and TP metrics.
            tp_fp_metric = tfe.streaming_tp_fp_arrays(num_gbboxes, tp, fp,
                                                      rscores)
            for c in tp_fp_metric[0].keys():
                dict_metrics['tp_fp_%s' % c] = (tp_fp_metric[0][c],
                                                tp_fp_metric[1][c])

            # Add to summaries precision/recall values.
            aps_voc07 = {}
            aps_voc12 = {}
            for c in tp_fp_metric[0].keys():
                # Precision and recall values.
                prec, rec = tfe.precision_recall(*tp_fp_metric[0][c])

                # Average precision VOC07.
                v = tfe.average_precision_voc07(prec, rec)
                summary_name = 'AP_VOC07/%s' % c
                op = tf.summary.scalar(summary_name, v, collections=[])
                tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
                aps_voc07[c] = v

                # Average precision VOC12.
                v = tfe.average_precision_voc12(prec, rec)
                summary_name = 'AP_VOC12/%s' % c
                op = tf.summary.scalar(summary_name, v, collections=[])
                tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)
                aps_voc12[c] = v

            # Mean average precision VOC07 (also printed via tf.Print).
            summary_name = 'AP_VOC07/mAP'
            mAP = tf.add_n(list(aps_voc07.values())) / len(aps_voc07)
            op = tf.summary.scalar(summary_name, mAP, collections=[])
            op = tf.Print(op, [mAP], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

            # Mean average precision VOC12 (also printed via tf.Print).
            summary_name = 'AP_VOC12/mAP'
            mAP = tf.add_n(list(aps_voc12.values())) / len(aps_voc12)
            op = tf.summary.scalar(summary_name, mAP, collections=[])
            op = tf.Print(op, [mAP], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # Split into values and updates ops.
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
            dict_metrics)

        # =================================================================== #
        # Evaluation loop.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)

        # Number of batches...
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # int(...) keeps the count integral on Python 2, where
            # math.ceil returns a float.
            num_batches = int(math.ceil(dataset.num_samples /
                                        float(FLAGS.batch_size)))

        if not FLAGS.wait_for_checkpoints:
            if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
                checkpoint_path = tf.train.latest_checkpoint(
                    FLAGS.checkpoint_path)
                # latest_checkpoint returns None when the directory holds
                # no checkpoint; fail loudly instead of passing None on.
                if checkpoint_path is None:
                    raise ValueError('No checkpoint found in directory %s' %
                                     FLAGS.checkpoint_path)
            else:
                checkpoint_path = FLAGS.checkpoint_path
            tf.logging.info('Evaluating %s' % checkpoint_path)

            # Standard evaluation loop.
            start = time.time()
            slim.evaluation.evaluate_once(
                master=FLAGS.master,
                checkpoint_path=checkpoint_path,
                logdir=FLAGS.eval_dir,
                num_evals=num_batches,
                eval_op=list(names_to_updates.values()),
                variables_to_restore=variables_to_restore,
                session_config=config)
            # Log time spent.
            elapsed = time.time() - start
            print('Time spent : %.3f seconds.' % elapsed)
            print('Time spent per BATCH: %.3f seconds.' %
                  (elapsed / num_batches))

        else:
            checkpoint_path = FLAGS.checkpoint_path
            tf.logging.info('Evaluating %s' % checkpoint_path)

            # Waiting loop.
            slim.evaluation.evaluation_loop(
                master=FLAGS.master,
                checkpoint_dir=checkpoint_path,
                logdir=FLAGS.eval_dir,
                num_evals=num_batches,
                eval_op=list(names_to_updates.values()),
                variables_to_restore=variables_to_restore,
                eval_interval_secs=60,
                max_number_of_evaluations=np.inf,
                session_config=config,
                timeout=None)
Пример #33
0
def main(_):
  """Train 'mobilenet_v1' on the 'flowers' training split with TF-Slim.

  Builds a single-clone deployment, the input queue, the network with a
  softmax cross-entropy loss, an RMSProp optimizer driven by a staircase
  exponential learning-rate decay, and then hands the resulting train op to
  ``slim.learning.train``.

  Args:
    _: Unused.

  Raises:
    ValueError: If ``--dataset_dir`` is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=1,
        clone_on_cpu=False,
        replica_id=0,
        num_replicas=1,
        num_ps_tasks=0)

    # Create global_step
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        'flowers', 'train', FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        'mobilenet_v1',
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        'mobilenet_v1',
        is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=4,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      label -= FLAGS.labels_offset

      train_image_size = network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=4,
          capacity=5 * FLAGS.batch_size)
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      slim.losses.softmax_cross_entropy(
          logits, labels, label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points.
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses.
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):

      num_epochs_per_decay = 2.5
      # Explicit float division so the steps-per-epoch ratio is not
      # truncated when this runs without true division (Python 2).
      decay_steps = int(dataset.num_samples / float(FLAGS.batch_size) *
                        num_epochs_per_decay)
      learning_rate = tf.train.exponential_decay(
          FLAGS.learning_rate,
          global_step,
          decay_steps,
          _LEARNING_RATE_DECAY_FACTOR,
          staircase=True,
          name='exponential_decay_learning_rate')

      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.rmsprop_momentum,
          epsilon=FLAGS.opt_epsilon)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # Compute the total loss and the gradients over all clones.
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    # Evaluating train_tensor forces every update op to run first.
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    session_config = tf.ConfigProto(allow_soft_placement=True,
                                    log_device_placement=False)

    ###########################
    # Kicks off the training. #
    ###########################
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=True,
        session_config=session_config,
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=10,
        save_summaries_secs=300,
        save_interval_secs=300,
        # Synchronous replica training is not used here.
        sync_optimizer=None)
Пример #34
0
def main(opt):
    """Train a model with Paddle according to the options in ``opt``.

    Loads the dataset described by the JSON file ``opt.data_cfg``, creates
    the model and the task-specific trainer, optionally restores weights
    from ``opt.load_model``, then trains for ``opt.num_epochs`` epochs while
    logging metrics and saving checkpoints under ``opt.save_dir``.

    Args:
        opt: Options object. It is replaced by the result of
            ``opts().update_dataset_info_and_set_heads`` and gets a
            ``device`` attribute assigned here.
    """
    paddle.seed(opt.seed)
    print('Setting up data...')
    Dataset = get_dataset(opt.dataset, opt.task)
    # Context manager so the config file is closed even if parsing fails.
    with open(opt.data_cfg) as f:
        data_config = json.load(f)
    trainset_paths = data_config['train']
    dataset_root = data_config['root']
    transforms = T.Compose([T.ToTensor()])
    dataset = Dataset(opt,
                      dataset_root,
                      trainset_paths, (1088, 608),
                      augment=True,
                      transforms=transforms)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    opt.device = paddle.get_device()

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)

    start_epoch = 0

    # Get dataloader
    train_loader = DataLoader(dataset,
                              batch_size=opt.batch_size,
                              shuffle=True,
                              num_workers=opt.num_workers,
                              use_shared_memory=False,
                              drop_last=True)
    print('Starting training...')
    Trainer = train_factory[opt.task]

    # The trainer builds its own optimizer and identity classifier
    # (see base_trainer.py), so none is passed in here.
    trainer = Trainer(opt, model)
    optimizer = trainer.optimizer  # see base_trainer.py
    id_classifier = trainer.loss.classifier  # see base_trainer.py
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)
    if 'fairmot_hrnet_w18' in opt.load_model:
        # For this checkpoint only the model weights are loaded.
        model = load_model(model, opt.load_model)
    elif opt.load_model != '':
        model, optimizer, start_epoch, id_classifier = load_model(
            model, opt.load_model, trainer.optimizer, trainer.loss.classifier,
            opt.resume, opt.lr, opt.lr_step)

    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        logger.write('epoch: {} |'.format(epoch))
        for k, v in log_dict_train.items():
            logger.scalar_summary('train_{}'.format(k), v, epoch)
            logger.write('{} {:8f} | '.format(k, v))

        if opt.val_intervals > 0 and epoch % opt.val_intervals == 0:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(mark)),
                epoch, model, optimizer, id_classifier)
        else:
            save_model(os.path.join(opt.save_dir, 'model_last.pdparams'),
                       epoch, model, optimizer, id_classifier)
        logger.write('\n')
        if epoch in opt.lr_step:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(epoch)),
                epoch, model, optimizer, id_classifier)
            # Each milestone in opt.lr_step divides the base rate by 10 again.
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            optimizer.set_lr(lr)
        if epoch % 5 == 0 or epoch >= 25:
            save_model(
                os.path.join(opt.save_dir, 'model_{}.pdparams'.format(epoch)),
                epoch, model, optimizer, id_classifier)
    logger.close()
Пример #35
0
def main(_):
    """Visually sanity-check pre-processed ground truth on two samples.

    Reads samples through the dataset provider, runs the SSD/VGG training
    pre-processing, and for two samples draws the same objects three ways on
    copies of the image: axis-aligned boxes, the four-point polygons, and the
    minimum-area rectangles computed from those points. The drawn images are
    handed to ``util.sit`` and its return value is printed.

    Args:
        _: Unused.

    Raises:
        ValueError: If ``--dataset_dir`` is empty.
    """
    if not FLAGS.dataset_dir:
        raise ValueError('You must supply the dataset directory with --dataset_dir')
    tf.logging.set_verbosity(tf.logging.DEBUG)
    batch_size = FLAGS.batch_size
    with tf.Graph().as_default():
        # Select the dataset.
        dataset = dataset_factory.get_dataset(
            FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

        util.proc.set_proc_name(FLAGS.model_name + '_' + FLAGS.dataset_name)

        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device('/cpu:0'):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * batch_size,
                    common_queue_min=10 * batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, shape, gignored, gbboxes,
             x1, x2, x3, x4, y1, y2, y3, y4] = provider.get([
                 'image', 'shape',
                 'object/ignored',
                 'object/bbox',
                 'object/oriented_bbox/x1',
                 'object/oriented_bbox/x2',
                 'object/oriented_bbox/x3',
                 'object/oriented_bbox/x4',
                 'object/oriented_bbox/y1',
                 'object/oriented_bbox/y2',
                 'object/oriented_bbox/y3',
                 'object/oriented_bbox/y4',
             ])
            gxs = tf.transpose(tf.stack([x1, x2, x3, x4]))  # shape = (N, 4)
            gys = tf.transpose(tf.stack([y1, y2, y3, y4]))
            image = tf.identity(image, 'input_image')
            # Pre-processing image, labels and bboxes.
            image_shape = (FLAGS.train_image_size, FLAGS.train_image_size)
            image, gignored, gbboxes, gxs, gys = \
                ssd_vgg_preprocessing.preprocess_image(
                    image, gignored, gbboxes, gxs, gys,
                    out_shape=image_shape,
                    is_training=True)
            # Scale the point coordinates by the output width / height.
            gxs = gxs * tf.cast(image_shape[1], gxs.dtype)
            gys = gys * tf.cast(image_shape[0], gys.dtype)
            gorbboxes = tfe_seglink.tf_min_area_rect(gxs, gys)
            image = tf.identity(image, 'processed_image')

            import cv2
            # OpenCV >= 3 exposes boxPoints at module level; OpenCV 2.x only
            # has it as cv2.cv.BoxPoints. Prefer the former, fall back lazily.
            box_points = getattr(cv2, 'boxPoints', None) or cv2.cv.BoxPoints

            with tf.Session() as sess:
                coord = tf.train.Coordinator()
                threads = tf.train.start_queue_runners(sess=sess, coord=coord)
                try:
                    for _sample in range(2):
                        image_data, label_data, bbox_data, xs_data, ys_data, orbboxes = \
                            sess.run([image, gignored, gbboxes, gxs, gys, gorbboxes])
                        # NOTE(review): presumably re-adds per-channel means
                        # removed by the pre-processing -- confirm.
                        image_data = image_data + [123., 117., 104.]
                        image_data = np.asarray(image_data, np.uint8)
                        h, w = image_data.shape[0:-1]
                        bbox_data = bbox_data * [h, w, h, w]
                        I_bbox = image_data.copy()
                        I_xys = image_data.copy()
                        I_orbbox = image_data.copy()

                        for idx in range(bbox_data.shape[0]):
                            # Axis-aligned box, stored as (y1, x1, y2, x2).
                            ymin, xmin, ymax, xmax = bbox_data[idx, :]
                            util.img.rectangle(I_bbox, (xmin, ymin), (xmax, ymax),
                                               color=util.img.COLOR_WHITE)

                            # Polygon through the four points. list(...) because
                            # zip is a one-shot iterator on Python 3.
                            points = list(zip(xs_data[idx, :], ys_data[idx, :]))
                            cnts = util.img.points_to_contours(points)
                            util.img.draw_contours(I_xys, cnts, -1,
                                                   color=util.img.COLOR_GREEN)

                            # Rotated rectangle given as five values: the first
                            # two pairs and the last value form the cv2 rect.
                            orbox = orbboxes[idx, :]
                            rect = ((orbox[0], orbox[1]), (orbox[2], orbox[3]), orbox[4])
                            box = np.int0(box_points(rect))
                            cv2.drawContours(I_orbbox, [box], 0,
                                             util.img.COLOR_RGB_RED, 1)

                        print(util.sit(I_bbox))
                        print(util.sit(I_xys))
                        print(util.sit(I_orbbox))
                        print('check the images and make sure that bboxes in difference colors are the same.')
                finally:
                    # Always stop and join the queue-runner threads, even when
                    # a sess.run call or the drawing code raises.
                    coord.request_stop()
                    coord.join(threads)
Пример #36
0
def main(_):
    """Build the SSD training graph and run the TF-Slim training loop.

    All settings are read from the module-level ``FLAGS``. The function
    creates the dataset provider and batching queues, builds the SSD network
    and its losses on every clone, collects summaries, configures the
    optimizer and finally hands the train op to ``slim.learning.train``.

    Args:
        _: Unused; never referenced in the body (presumably the argv list
            supplied by the application runner -- confirm against caller).

    Raises:
        ValueError: If ``--dataset_dir`` is not supplied, or if
            ``FLAGS.DSSD_FLAG`` is set and no usable checkpoint state is
            found under ``--checkpoint_path``.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.DEBUG)
    with tf.Graph().as_default():
        # Config model_deploy. Keep TF Slim Models structure.
        # Useful if multiple GPUs and/or servers are needed in the future.
        deploy_config = model_deploy.DeploymentConfig(
            num_clones=FLAGS.num_clones,
            clone_on_cpu=FLAGS.clone_on_cpu,
            replica_id=0,
            num_replicas=1,
            num_ps_tasks=0)
        # Create global_step on the device chosen for variables.
        with tf.device(deploy_config.variables_device()):
            global_step = slim.create_global_step()

        # Select the dataset (fetch the data source).
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # Get the SSD network and its anchors.
        # (Original note: get_network returns ssd_vgg_300.SSDNet.)
        ssd_class = nets_factory.get_network(FLAGS.model_name)
        ssd_params = ssd_class.default_params._replace(
            num_classes=FLAGS.num_classes)
        ssd_net = ssd_class(ssd_params)
        ssd_shape = ssd_net.params.img_shape
        # Generate the anchors for every feature map.
        ssd_anchors = ssd_net.anchors(ssd_shape)

        # Select the preprocessing function (falls back to the model name
        # when --preprocessing_name is not given).
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=True)

        tf_utils.print_configuration(FLAGS.__flags, ssd_params,
                                     dataset.data_sources, FLAGS.train_dir)
        # =================================================================== #
        # Create a dataset provider and batches.
        # =================================================================== #
        with tf.device(deploy_config.inputs_device()):
            with tf.name_scope(FLAGS.dataset_name + '_data_provider'):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size,
                    shuffle=True)
            # Get for SSD network: image, labels, bboxes.
            [image, glabels,
             gbboxes] = provider.get(['image', 'object/label', 'object/bbox'])

            # Pre-process the image together with its labels and bboxes.
            image, glabels, gbboxes = image_preprocessing_fn(
                image,
                glabels,
                gbboxes,
                out_shape=ssd_shape,
                data_format=DATA_FORMAT)

            # Encode groundtruth labels and bboxes against the anchors.
            # (The original author marked this step as "not understood".)
            gclasses, glocalisations, gscores = ssd_net.bboxes_encode(
                glabels, gbboxes, ssd_anchors)
            # One entry for the image followed by len(ssd_anchors) entries
            # for each of gclasses / glocalisations / gscores; presumably
            # the grouping tf_utils.reshape_list uses to rebuild the nested
            # lists from a flat tensor list -- confirm against tf_utils.
            batch_shape = [1] + [len(ssd_anchors)] * 3

            # Training batches and queue.
            r = tf.train.batch(tf_utils.reshape_list(
                [image, gclasses, glocalisations, gscores]),
                               batch_size=FLAGS.batch_size,
                               num_threads=FLAGS.num_preprocessing_threads,
                               capacity=5 * FLAGS.batch_size)
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(r, batch_shape)

            # Intermediate queueing: unique batch computation pipeline for all
            # GPUs running the training.
            batch_queue = slim.prefetch_queue.prefetch_queue(
                tf_utils.reshape_list(
                    [b_image, b_gclasses, b_glocalisations, b_gscores]),
                capacity=2 * deploy_config.num_clones)

        # =================================================================== #
        # Define the model running on every GPU.
        # =================================================================== #
        def clone_fn(batch_queue):
            """Allows data parallelism by creating multiple
            clones of network_fn."""
            # Dequeue batch.
            b_image, b_gclasses, b_glocalisations, b_gscores = \
                tf_utils.reshape_list(batch_queue.dequeue(), batch_shape)

            # Construct SSD network.
            arg_scope = ssd_net.arg_scope(weight_decay=FLAGS.weight_decay,
                                          data_format=DATA_FORMAT)
            with slim.arg_scope(arg_scope):
                predictions, localisations, logits, end_points = \
                    ssd_net.net(b_image, is_training=True,DSSD_FLAG = FLAGS.DSSD_FLAG)
            # Add loss function.
            ssd_net.losses(logits,
                           localisations,
                           b_gclasses,
                           b_glocalisations,
                           b_gscores,
                           match_threshold=FLAGS.match_threshold,
                           negative_ratio=FLAGS.negative_ratio,
                           alpha=FLAGS.loss_alpha,
                           label_smoothing=FLAGS.label_smoothing)
            return end_points

        # Gather initial summaries (must happen before the clones are built
        # so that only pre-existing summaries are captured here).
        summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

        # =================================================================== #
        # Add summaries from first clone.
        # =================================================================== #
        # (The original author marked this step as "not understood".)
        clones = model_deploy.create_clones(deploy_config, clone_fn,
                                            [batch_queue])
        first_clone_scope = deploy_config.clone_scope(0)
        # Gather update_ops from the first clone. These contain, for example,
        # the updates for the batch_norm variables created by network_fn.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                       first_clone_scope)

        # Add summaries for end_points.
        end_points = clones[0].outputs
        for end_point in end_points:
            x = end_points[end_point]
            summaries.add(tf.summary.histogram('activations/' + end_point, x))
            summaries.add(
                tf.summary.scalar('sparsity/' + end_point,
                                  tf.nn.zero_fraction(x)))
        # Add summaries for losses and extra losses.
        for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))
        for loss in tf.get_collection('EXTRA_LOSSES', first_clone_scope):
            summaries.add(tf.summary.scalar(loss.op.name, loss))

        # Add summaries for variables.
        for variable in slim.get_model_variables():
            summaries.add(tf.summary.histogram(variable.op.name, variable))

        # =================================================================== #
        # Configure the moving averages.
        # =================================================================== #
        if FLAGS.moving_average_decay:
            moving_average_variables = slim.get_model_variables()
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, global_step)
        else:
            moving_average_variables, variable_averages = None, None

        # =================================================================== #
        # Configure the optimization procedure.
        # =================================================================== #
        with tf.device(deploy_config.optimizer_device()):
            learning_rate = tf_utils.configure_learning_rate(
                FLAGS, dataset.num_samples, global_step)
            optimizer = tf_utils.configure_optimizer(FLAGS, learning_rate)
            summaries.add(tf.summary.scalar('learning_rate', learning_rate))

        if FLAGS.moving_average_decay:
            # Update ops executed locally by trainer.
            update_ops.append(
                variable_averages.apply(moving_average_variables))

        # Variables to train.
        variables_to_train = tf_utils.get_variables_to_train(FLAGS)

        # Compute the total loss and the per-variable gradients over all
        # clones.
        total_loss, clones_gradients = model_deploy.optimize_clones(
            clones, optimizer, var_list=variables_to_train)
        # Add total_loss to summary.
        summaries.add(tf.summary.scalar('total_loss', total_loss))

        # Create gradient updates.
        grad_updates = optimizer.apply_gradients(clones_gradients,
                                                 global_step=global_step)
        update_ops.append(grad_updates)
        update_op = tf.group(*update_ops)
        # Evaluating train_tensor returns total_loss after running update_op.
        train_tensor = control_flow_ops.with_dependencies([update_op],
                                                          total_loss,
                                                          name='train_op')

        # Add the summaries created inside the first clone's scope.
        summaries |= set(
            tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))
        # Merge all summaries together.
        summary_op = tf.summary.merge(list(summaries), name='summary_op')

        # =================================================================== #
        # Kicks off the training.
        # =================================================================== #
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
        config = tf.ConfigProto(log_device_placement=False,
                                gpu_options=gpu_options)
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=1.0,
                               write_version=2,
                               pad_step_number=False)
        if FLAGS.DSSD_FLAG:
            ckpt = tf.train.get_checkpoint_state(FLAGS.checkpoint_path)
            # get_checkpoint_state() yields None when no checkpoint state is
            # found; fail with a clear message instead of an AttributeError
            # on `ckpt.model_checkpoint_path` further down.
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise ValueError(
                    'No checkpoint found in --checkpoint_path=%s' %
                    FLAGS.checkpoint_path)

            # Names of all variables that have "_box" at character offset 18
            # or 19 of their name; these are excluded from the restore.
            # NOTE(review): the fixed offsets presumably target the
            # "<scope>/blockN_box" layers of one specific network scope --
            # confirm before reusing with a differently named model.
            box_variable_names = [
                var.name for var in tf.all_variables()
                if var.name.startswith("_box", 18)
                or var.name.startswith("_box", 19)
            ]

            variables_to_restore = slim.get_variables_to_restore(
                exclude=box_variable_names)
            init_fn = slim.assign_from_checkpoint_fn(
                ckpt.model_checkpoint_path,
                variables_to_restore,
                ignore_missing_vars=True,
                reshape_variables=False)
        else:
            init_fn = tf_utils.get_init_fn(FLAGS)

        slim.learning.train(train_tensor,
                            logdir=FLAGS.train_dir,
                            master='',
                            is_chief=True,
                            init_fn=init_fn,
                            summary_op=summary_op,
                            number_of_steps=FLAGS.max_number_of_steps,
                            log_every_n_steps=FLAGS.log_every_n_steps,
                            save_summaries_secs=FLAGS.save_summaries_secs,
                            saver=saver,
                            save_interval_secs=FLAGS.save_interval_secs,
                            session_config=config,
                            sync_optimizer=None)
def main_fun(argv, ctx):
  """Define the training flags and run one node of a distributed slim job.

  The function declares all command-line flags, starts the cluster server
  for this node and then either serves as a parameter server (``ps`` job)
  or builds the classification training graph and runs
  ``slim.learning.train`` (any other job).

  Args:
    argv: Argument list; assigned to ``sys.argv`` before the flags are read.
    ctx: Node context. This function reads ``ctx.job_name``,
      ``ctx.task_index``, ``ctx.cluster_spec`` and ``ctx.worker_num`` and
      passes ``ctx`` on to ``TFNode.start_cluster_server``.

  Raises:
    AssertionError: If ``--num_ps_tasks`` does not match the number of
      ``ps`` entries in ``ctx.cluster_spec``.
    ValueError: If this node is not a ``ps`` job and ``--dataset_dir`` is
      not supplied, or if ``--optimizer`` / ``--learning_rate_decay_type``
      is not one of the recognized values.
  """
  import tensorflow as tf
  from tensorflow.python.ops import control_flow_ops
  from datasets import dataset_factory
  from deployment import model_deploy
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim

  # Integer default (was the string '1'), consistent with the other
  # integer flags below.
  tf.app.flags.DEFINE_integer(
      'num_gpus', 1, 'The number of GPUs to use per node')

  tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

  tf.app.flags.DEFINE_string(
      'master', '', 'The address of the TensorFlow master to use.')

  tf.app.flags.DEFINE_string(
      'train_dir', '/tmp/tfmodel/',
      'Directory where checkpoints and event logs are written to.')

  tf.app.flags.DEFINE_integer('num_clones', 1,
                              'Number of model clones to deploy.')

  tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                              'Use CPUs to deploy clones.')

  tf.app.flags.DEFINE_integer('worker_replicas', 1, 'Number of worker replicas.')

  tf.app.flags.DEFINE_integer(
      'num_ps_tasks', 0,
      'The number of parameter servers. If the value is 0, then the parameters '
      'are handled locally by the worker.')

  tf.app.flags.DEFINE_integer(
      'num_readers', 4,
      'The number of parallel readers that read data from the dataset.')

  tf.app.flags.DEFINE_integer(
      'num_preprocessing_threads', 4,
      'The number of threads used to create the batches.')

  tf.app.flags.DEFINE_integer(
      'log_every_n_steps', 10,
      'The frequency with which logs are print.')

  tf.app.flags.DEFINE_integer(
      'save_summaries_secs', 600,
      'The frequency with which summaries are saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'save_interval_secs', 600,
      'The frequency with which the model is saved, in seconds.')

  tf.app.flags.DEFINE_integer(
      'task', 0, 'Task id of the replica running the training.')

  ######################
  # Optimization Flags #
  ######################

  tf.app.flags.DEFINE_float(
      'weight_decay', 0.00004, 'The weight decay on the model weights.')

  tf.app.flags.DEFINE_string(
      'optimizer', 'rmsprop',
      'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
      '"ftrl", "momentum", "sgd" or "rmsprop".')

  tf.app.flags.DEFINE_float(
      'adadelta_rho', 0.95,
      'The decay rate for adadelta.')

  tf.app.flags.DEFINE_float(
      'adagrad_initial_accumulator_value', 0.1,
      'Starting value for the AdaGrad accumulators.')

  tf.app.flags.DEFINE_float(
      'adam_beta1', 0.9,
      'The exponential decay rate for the 1st moment estimates.')

  tf.app.flags.DEFINE_float(
      'adam_beta2', 0.999,
      'The exponential decay rate for the 2nd moment estimates.')

  tf.app.flags.DEFINE_float('opt_epsilon', 1.0, 'Epsilon term for the optimizer.')

  tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                            'The learning rate power.')

  tf.app.flags.DEFINE_float(
      'ftrl_initial_accumulator_value', 0.1,
      'Starting value for the FTRL accumulators.')

  tf.app.flags.DEFINE_float(
      'ftrl_l1', 0.0, 'The FTRL l1 regularization strength.')

  tf.app.flags.DEFINE_float(
      'ftrl_l2', 0.0, 'The FTRL l2 regularization strength.')

  tf.app.flags.DEFINE_float(
      'momentum', 0.9,
      'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

  tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

  #######################
  # Learning Rate Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'learning_rate_decay_type',
      'exponential',
      'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
      ' or "polynomial"')

  tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

  tf.app.flags.DEFINE_float(
      'end_learning_rate', 0.0001,
      'The minimal end learning rate used by a polynomial decay learning rate.')

  tf.app.flags.DEFINE_float(
      'label_smoothing', 0.0, 'The amount of label smoothing.')

  tf.app.flags.DEFINE_float(
      'learning_rate_decay_factor', 0.94, 'Learning rate decay factor.')

  tf.app.flags.DEFINE_float(
      'num_epochs_per_decay', 2.0,
      'Number of epochs after which learning rate decays.')

  tf.app.flags.DEFINE_bool(
      'sync_replicas', False,
      'Whether or not to synchronize the replicas during training.')

  tf.app.flags.DEFINE_integer(
      'replicas_to_aggregate', 1,
      'The Number of gradients to collect before updating params.')

  tf.app.flags.DEFINE_float(
      'moving_average_decay', None,
      'The decay to use for the moving average.'
      'If left as None, then moving averages are not used.')

  #######################
  # Dataset Flags #
  #######################

  tf.app.flags.DEFINE_string(
      'dataset_name', 'imagenet', 'The name of the dataset to load.')

  tf.app.flags.DEFINE_string(
      'dataset_split_name', 'train', 'The name of the train/test split.')

  tf.app.flags.DEFINE_string(
      'dataset_dir', None, 'The directory where the dataset files are stored.')

  tf.app.flags.DEFINE_integer(
      'labels_offset', 0,
      'An offset for the labels in the dataset. This flag is primarily used to '
      'evaluate the VGG and ResNet architectures which do not use a background '
      'class for the ImageNet dataset.')

  tf.app.flags.DEFINE_string(
      'model_name', 'inception_v3', 'The name of the architecture to train.')

  tf.app.flags.DEFINE_string(
      'preprocessing_name', None, 'The name of the preprocessing to use. If left '
      'as `None`, then the model_name flag is used.')

  tf.app.flags.DEFINE_integer(
      'batch_size', 32, 'The number of samples in each batch.')

  tf.app.flags.DEFINE_integer(
      'train_image_size', None, 'Train image size')

  tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                              'The maximum number of training steps.')

  #####################
  # Fine-Tuning Flags #
  #####################

  tf.app.flags.DEFINE_string(
      'checkpoint_path', None,
      'The path to a checkpoint from which to fine-tune.')

  tf.app.flags.DEFINE_string(
      'checkpoint_exclude_scopes', None,
      'Comma-separated list of scopes of variables to exclude when restoring '
      'from a checkpoint.')

  tf.app.flags.DEFINE_string(
      'trainable_scopes', None,
      'Comma-separated list of scopes to filter the set of variables to train.'
      'By default, None would train all the variables.')

  tf.app.flags.DEFINE_boolean(
      'ignore_missing_vars', False,
      'When restoring a checkpoint would ignore missing variables.')

  # Override the cluster-related flags with the values of this node's context.
  FLAGS = tf.app.flags.FLAGS
  # NOTE(review): 'job_name' is not declared with DEFINE_* above; some flag
  # implementations reject assignment to undeclared flags -- confirm against
  # the TensorFlow version in use.
  FLAGS.job_name = ctx.job_name
  FLAGS.task = ctx.task_index
  FLAGS.num_clones = FLAGS.num_gpus
  FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
  assert(FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps']) if 'ps' in ctx.cluster_spec else 0))

  def _configure_learning_rate(num_samples_per_epoch, global_step):
    """Configures the learning rate.

    Args:
      num_samples_per_epoch: The number of samples in each epoch of training.
      global_step: The global_step tensor.

    Returns:
      A `Tensor` representing the learning rate.

    Raises:
      ValueError: if FLAGS.learning_rate_decay_type is not recognized.
    """
    decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                      FLAGS.num_epochs_per_decay)
    if FLAGS.sync_replicas:
      decay_steps /= FLAGS.replicas_to_aggregate

    if FLAGS.learning_rate_decay_type == 'exponential':
      return tf.train.exponential_decay(FLAGS.learning_rate,
                                        global_step,
                                        decay_steps,
                                        FLAGS.learning_rate_decay_factor,
                                        staircase=True,
                                        name='exponential_decay_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'fixed':
      return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
    elif FLAGS.learning_rate_decay_type == 'polynomial':
      return tf.train.polynomial_decay(FLAGS.learning_rate,
                                       global_step,
                                       decay_steps,
                                       FLAGS.end_learning_rate,
                                       power=1.0,
                                       cycle=False,
                                       name='polynomial_decay_learning_rate')
    else:
      # Interpolate with `%`; passing the value as a second exception
      # argument would leave the "%s" placeholder unformatted.
      raise ValueError('learning_rate_decay_type [%s] was not recognized' %
                       FLAGS.learning_rate_decay_type)


  def _configure_optimizer(learning_rate):
    """Configures the optimizer used for training.

    Args:
      learning_rate: A scalar or `Tensor` learning rate.

    Returns:
      An instance of an optimizer.

    Raises:
      ValueError: if FLAGS.optimizer is not recognized.
    """
    if FLAGS.optimizer == 'adadelta':
      optimizer = tf.train.AdadeltaOptimizer(
          learning_rate,
          rho=FLAGS.adadelta_rho,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'adagrad':
      optimizer = tf.train.AdagradOptimizer(
          learning_rate,
          initial_accumulator_value=FLAGS.adagrad_initial_accumulator_value)
    elif FLAGS.optimizer == 'adam':
      optimizer = tf.train.AdamOptimizer(
          learning_rate,
          beta1=FLAGS.adam_beta1,
          beta2=FLAGS.adam_beta2,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'ftrl':
      optimizer = tf.train.FtrlOptimizer(
          learning_rate,
          learning_rate_power=FLAGS.ftrl_learning_rate_power,
          initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
          l1_regularization_strength=FLAGS.ftrl_l1,
          l2_regularization_strength=FLAGS.ftrl_l2)
    elif FLAGS.optimizer == 'momentum':
      optimizer = tf.train.MomentumOptimizer(
          learning_rate,
          momentum=FLAGS.momentum,
          name='Momentum')
    elif FLAGS.optimizer == 'rmsprop':
      optimizer = tf.train.RMSPropOptimizer(
          learning_rate,
          decay=FLAGS.rmsprop_decay,
          momentum=FLAGS.momentum,
          epsilon=FLAGS.opt_epsilon)
    elif FLAGS.optimizer == 'sgd':
      optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    else:
      # Interpolate with `%` so the offending name appears in the message.
      raise ValueError('Optimizer [%s] was not recognized' % FLAGS.optimizer)
    return optimizer


  def _add_variables_summaries(learning_rate):
    """Returns histogram summaries for all model variables plus a
    learning-rate scalar summary. Not called within main_fun."""
    summaries = []
    for variable in slim.get_model_variables():
      summaries.append(tf.summary.histogram(variable.op.name, variable))
    summaries.append(tf.summary.scalar('training/Learning Rate', learning_rate))
    return summaries


  def _get_init_fn():
    """Returns a function run by the chief worker to warm-start the training.

    Note that the init_fn is only run when initializing the model during the very
    first global step.

    Returns:
      An init function run by the supervisor, or None when no
      --checkpoint_path is given or a checkpoint already exists in
      --train_dir.
    """
    if FLAGS.checkpoint_path is None:
      return None

    # Warn the user if a checkpoint exists in the train_dir. Then we'll be
    # ignoring the checkpoint anyway.
    if tf.train.latest_checkpoint(FLAGS.train_dir):
      tf.logging.info(
          'Ignoring --checkpoint_path because a checkpoint already exists in %s'
          % FLAGS.train_dir)
      return None

    exclusions = []
    if FLAGS.checkpoint_exclude_scopes:
      exclusions = [scope.strip()
                    for scope in FLAGS.checkpoint_exclude_scopes.split(',')]

    # TODO(sguada) variables.filter_variables()
    # Keep every model variable whose op name does not start with one of
    # the excluded scope prefixes.
    variables_to_restore = []
    for var in slim.get_model_variables():
      excluded = False
      for exclusion in exclusions:
        if var.op.name.startswith(exclusion):
          excluded = True
          break
      if not excluded:
        variables_to_restore.append(var)

    # A directory is resolved to the latest checkpoint it contains.
    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Fine-tuning from %s' % checkpoint_path)

    return slim.assign_from_checkpoint_fn(
        checkpoint_path,
        variables_to_restore,
        ignore_missing_vars=FLAGS.ignore_missing_vars)


  def _get_variables_to_train():
    """Returns a list of variables to train.

    Returns:
      A list of variables to train by the optimizer: all trainable variables
      when --trainable_scopes is unset, otherwise the trainable variables
      collected from each listed scope.
    """
    if FLAGS.trainable_scopes is None:
      return tf.trainable_variables()
    else:
      scopes = [scope.strip() for scope in FLAGS.trainable_scopes.split(',')]

    variables_to_train = []
    for scope in scopes:
      variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
      variables_to_train.extend(variables)
    return variables_to_train

  # main
  cluster_spec, server = TFNode.start_cluster_server(ctx=ctx, num_gpus=FLAGS.num_gpus, rdma=FLAGS.rdma)
  if ctx.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    if not FLAGS.dataset_dir:
      raise ValueError('You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
      #######################
      # Config model_deploy #
      #######################
      deploy_config = model_deploy.DeploymentConfig(
          num_clones=FLAGS.num_clones,
          clone_on_cpu=FLAGS.clone_on_cpu,
          replica_id=FLAGS.task,
          num_replicas=FLAGS.worker_replicas,
          num_ps_tasks=FLAGS.num_ps_tasks)

      # Create global_step. It is pinned explicitly to the first ps task
      # rather than to deploy_config.variables_device().
      with tf.device("/job:ps/task:0"):
        global_step = tf.Variable(0, name="global_step")

      ######################
      # Select the dataset #
      ######################
      dataset = dataset_factory.get_dataset(
          FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

      ######################
      # Select the network #
      ######################
      network_fn = nets_factory.get_network_fn(
          FLAGS.model_name,
          num_classes=(dataset.num_classes - FLAGS.labels_offset),
          weight_decay=FLAGS.weight_decay,
          is_training=True)

      #####################################
      # Select the preprocessing function #
      #####################################
      preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
      image_preprocessing_fn = preprocessing_factory.get_preprocessing(
          preprocessing_name,
          is_training=True)

      ##############################################################
      # Create a dataset provider that loads data from the dataset #
      ##############################################################
      with tf.device(deploy_config.inputs_device()):
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            num_readers=FLAGS.num_readers,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size)
        [image, label] = provider.get(['image', 'label'])
        label -= FLAGS.labels_offset

        # Fall back to the network's default input size when the flag is
        # unset.
        train_image_size = FLAGS.train_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, train_image_size, train_image_size)

        images, labels = tf.train.batch(
            [image, label],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)
        labels = slim.one_hot_encoding(
            labels, dataset.num_classes - FLAGS.labels_offset)
        batch_queue = slim.prefetch_queue.prefetch_queue(
            [images, labels], capacity=2 * deploy_config.num_clones)

      ####################
      # Define the model #
      ####################
      def clone_fn(batch_queue):
        """Allows data parallelism by creating multiple clones of network_fn."""
        images, labels = batch_queue.dequeue()
        logits, end_points = network_fn(images)

        #############################
        # Specify the loss function #
        #############################
        if 'AuxLogits' in end_points:
          tf.losses.softmax_cross_entropy(
              logits=end_points['AuxLogits'], onehot_labels=labels,
              label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
        tf.losses.softmax_cross_entropy(
            logits=logits, onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing, weights=1.0)
        return end_points

      # Gather initial summaries.
      summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

      clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
      first_clone_scope = deploy_config.clone_scope(0)
      # Gather update_ops from the first clone. These contain, for example,
      # the updates for the batch_norm variables created by network_fn.
      update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

      # Add summaries for end_points.
      end_points = clones[0].outputs
      for end_point in end_points:
        x = end_points[end_point]
        summaries.add(tf.summary.histogram('activations/' + end_point, x))
        summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                        tf.nn.zero_fraction(x)))

      # Add summaries for losses.
      for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
        summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

      # Add summaries for variables.
      for variable in slim.get_model_variables():
        summaries.add(tf.summary.histogram(variable.op.name, variable))

      #################################
      # Configure the moving averages #
      #################################
      if FLAGS.moving_average_decay:
        moving_average_variables = slim.get_model_variables()
        variable_averages = tf.train.ExponentialMovingAverage(
            FLAGS.moving_average_decay, global_step)
      else:
        moving_average_variables, variable_averages = None, None

      #########################################
      # Configure the optimization procedure. #
      #########################################
      with tf.device(deploy_config.optimizer_device()):
        learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
        optimizer = _configure_optimizer(learning_rate)
        summaries.add(tf.summary.scalar('learning_rate', learning_rate))

      if FLAGS.sync_replicas:
        # If sync_replicas is enabled, the averaging will be done in the chief
        # queue runner.
        optimizer = tf.train.SyncReplicasOptimizer(
            opt=optimizer,
            replicas_to_aggregate=FLAGS.replicas_to_aggregate,
            variable_averages=variable_averages,
            variables_to_average=moving_average_variables,
            replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
            total_num_replicas=FLAGS.worker_replicas)
      elif FLAGS.moving_average_decay:
        # Update ops executed locally by trainer.
        update_ops.append(variable_averages.apply(moving_average_variables))

      # Variables to train.
      variables_to_train = _get_variables_to_train()

      # Compute the total loss and the gradients over all clones.
      total_loss, clones_gradients = model_deploy.optimize_clones(
          clones,
          optimizer,
          var_list=variables_to_train)
      # Add total_loss to summary.
      summaries.add(tf.summary.scalar('total_loss', total_loss))

      # Create gradient updates.
      grad_updates = optimizer.apply_gradients(clones_gradients,
                                               global_step=global_step)
      update_ops.append(grad_updates)

      update_op = tf.group(*update_ops)
      # Evaluating train_tensor returns total_loss after running update_op.
      train_tensor = control_flow_ops.with_dependencies([update_op], total_loss,
                                                        name='train_op')

      # Add the summaries from the first clone. These contain the summaries
      # created by model_fn and either optimize_clones() or _gather_clone_loss().
      summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                         first_clone_scope))

      # Merge all summaries together.
      summary_op = tf.summary.merge(list(summaries), name='summary_op')


      ###########################
      # Kicks off the training. #
      ###########################
      # Each worker writes its events to its own "tensorboard_<worker_num>"
      # directory.
      summary_writer = tf.summary.FileWriter("tensorboard_%d" %(ctx.worker_num), graph=tf.get_default_graph())
      slim.learning.train(
          train_tensor,
          logdir=FLAGS.train_dir,
          master=server.target,
          is_chief=(FLAGS.task == 0),
          init_fn=_get_init_fn(),
          summary_op=summary_op,
          number_of_steps=FLAGS.max_number_of_steps,
          log_every_n_steps=FLAGS.log_every_n_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs,
          summary_writer=summary_writer,
          sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #38
0
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
from preprocessing import preprocessing_factory
from configs.kitti_config import config
from nets.mobilenetdet import scale_bboxes

from datasets import dataset_factory
from tensorflow.contrib import slim

# Load the 'train' split of the 'kitti' dataset through the project's dataset
# factory. NOTE(review): the dataset directory is a hard-coded absolute path;
# it has to be adjusted before this script can run on another machine.
dataset = dataset_factory.get_dataset(
  'kitti', 'train', '/home/zehao/Dataset/KITII/tfrecord')

# def conver_box(bboxes, img_h, img_w):
#   [ymin, xmin, ymax, xmax] = tf.unstack(bboxes, axis=1)
#   img_h = tf.cast(img_h, tf.float32)
#   img_w = tf.cast(img_w, tf.float32)
#   ymin = tf.truediv(ymin, img_h)
#   xmin = tf.truediv(xmin, img_w)
#   ymax = tf.truediv(ymax, img_h)
#   xmax = tf.truediv(xmax, img_w)
#   return tf.expand_dims(tf.stack([ymin,xmin,ymax,xmax], axis=1), axis=0)

with tf.Graph().as_default() as graph:
  with tf.device('/cpu:0'):
    provider = slim.dataset_data_provider.DatasetDataProvider(
      dataset,
      num_readers=1,
      common_queue_capacity=20 * 1,
Пример #39
0
def main(opt):
    """Train the model described by ``opt``, or only evaluate it if ``opt.test``.

    Seeds torch, builds model, optimizer and trainer from ``opt``, optionally
    restores a checkpoint from ``opt.load_model`` and then trains for epochs
    ``start_epoch + 1 .. opt.num_epochs``. Per-epoch metrics are written to
    the ``Logger`` and to two CSV histories stored under ``opt.save_dir``.

    Args:
        opt: Options object. It is replaced by the return value of
            ``opts().update_dataset_info_and_set_heads`` before further use.
    """
    torch.manual_seed(opt.seed)
    torch.backends.cudnn.benchmark = not (opt.not_cuda_benchmark or opt.test)
    dataset_cls = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, dataset_cls)
    print(opt)

    logger = Logger(opt)

    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpus_str
    device_name = 'cuda' if opt.gpus[0] >= 0 else 'cpu'
    opt.device = torch.device(device_name)

    print('Creating model...')
    model = create_model(opt.arch, opt.heads, opt.head_conv)
    optimizer = torch.optim.Adam(model.parameters(), opt.lr)
    start_epoch = 0
    if opt.load_model != '':
        # Restoring may also hand back the optimizer state and the epoch to
        # resume from.
        model, optimizer, start_epoch = load_model(
            model, opt.load_model, optimizer, opt.resume, opt.lr, opt.lr_step)

    trainer_cls = train_factory[opt.task]
    trainer = trainer_cls(opt, model, optimizer)
    trainer.set_device(opt.gpus, opt.chunk_sizes, opt.device)

    print('Setting up data...')
    val_loader = torch.utils.data.DataLoader(
        dataset_cls(opt, 'val'),
        batch_size=1,
        shuffle=False,
        num_workers=1,
        pin_memory=True)

    if opt.test:
        # Evaluation-only mode: one validation pass, then dataset-level eval.
        _, preds = trainer.val(0, val_loader)
        val_loader.dataset.run_eval(preds, opt.save_dir)
        return

    train_loader = torch.utils.data.DataLoader(
        dataset_cls(opt, 'train'),
        batch_size=opt.batch_size,
        shuffle=True,
        num_workers=opt.num_workers,
        pin_memory=True,
        drop_last=True)

    csvl_train = CSVLoggerDL(os.path.join(opt.save_dir, "hist_train.csv"))
    csvl_val = CSVLoggerDL(os.path.join(opt.save_dir, "hist_val.csv"))

    def _report(tag_format, metrics, step):
        """Send every metric to the scalar summary and to the text log."""
        for key, value in metrics.items():
            logger.scalar_summary(tag_format.format(key), value, step)
            logger.write('{} {:8f} | '.format(key, value))

    print('Starting training...')
    best = 1e10  # lowest validation metric seen so far
    for epoch in range(start_epoch + 1, opt.num_epochs + 1):
        mark = epoch if opt.save_all else 'last'
        log_dict_train, _ = trainer.train(epoch, train_loader)
        csvl_train.add(log_dict_train, epoch=epoch)
        logger.write('epoch: {} |'.format(epoch))
        _report('train_{}', log_dict_train, epoch)

        validate_now = (opt.val_intervals > 0
                        and epoch % opt.val_intervals == 0)
        if not validate_now:
            last_path = os.path.join(opt.save_dir, 'model_last.pth')
            save_model(last_path, epoch, model, optimizer)
        else:
            marked_path = os.path.join(opt.save_dir,
                                       'model_{}.pth'.format(mark))
            save_model(marked_path, epoch, model, optimizer)
            with torch.no_grad():
                log_dict_val, preds = trainer.val(epoch, val_loader)
                csvl_val.add(log_dict_val, epoch=epoch)
            _report('val_{}', log_dict_val, epoch)
            # A smaller metric value counts as an improvement.
            if log_dict_val[opt.metric] < best:
                best = log_dict_val[opt.metric]
                best_path = os.path.join(opt.save_dir, 'model_best.pth')
                save_model(best_path, epoch, model)
        logger.write('\n')

        if epoch in opt.lr_step:
            # Snapshot the model, then shrink the LR by 10x for every
            # milestone reached so far.
            step_path = os.path.join(opt.save_dir,
                                     'model_{}.pth'.format(epoch))
            save_model(step_path, epoch, model, optimizer)
            lr = opt.lr * (0.1**(opt.lr_step.index(epoch) + 1))
            print('Drop LR to', lr)
            for group in optimizer.param_groups:
                group['lr'] = lr
    logger.close()
Пример #40
0
def main(_):
    """Evaluate a checkpoint on one dataset split and dump per-sample results.

    Builds the evaluation graph (dataset provider, preprocessing, network),
    restores the variables from ``FLAGS.checkpoint_path`` and runs
    ``num_batches`` batches. One CSV row is written per sample with the
    columns ``src`` (sample name), ``label`` and ``predict`` (softmax output).

    Args:
      _: Unused positional argument.

    Raises:
      ValueError: if ``FLAGS.dataset_dir`` is not set.
    """
    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=(dataset.num_classes - FLAGS.labels_offset),
            is_training=False)

        ##############################################################
        # Create a dataset provider that loads data from the dataset #
        ##############################################################
        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            shuffle=False,
            common_queue_capacity=2 * FLAGS.batch_size,
            common_queue_min=FLAGS.batch_size)
        [image, label, name] = provider.get(['image', 'label', 'name'])
        label -= FLAGS.labels_offset

        #####################################
        # Select the preprocessing function #
        #####################################
        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

        image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

        images, labels, names = tf.train.batch(
            [image, label, name],
            batch_size=FLAGS.batch_size,
            num_threads=FLAGS.num_preprocessing_threads,
            capacity=5 * FLAGS.batch_size)

        ####################
        # Define the model #
        ####################
        logits, _ = network_fn(images)

        if FLAGS.moving_average_decay:
            # Restore the moving-average shadow values in place of the raw
            # model variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        prob = tf.nn.softmax(logits, 1)
        labels = tf.squeeze(labels)

        # Define the metrics:
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
            #'Recall_5': slim.metrics.streaming_sparse_recall_at_k(
            #    logits, labels, 3),
            #"summary_result" : _get_streaming_metrics(logits, label,
            #                                           2),
        })

        # Print the summaries to screen.
        for metric_name, value in names_to_values.items():
            summary_name = 'eval/%s' % metric_name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        restore_fn = slim.assign_from_checkpoint_fn(checkpoint_path,
                                                    variables_to_restore)
        sv = tf.train.Supervisor(logdir=FLAGS.eval_dir,
                                 saver=None,
                                 init_fn=restore_fn)

        # Per-sample results accumulated across all evaluated batches.
        pi_prediction = []
        pi_name = []
        pi_label = []
        with sv.managed_session() as sess:
            for _ in range(int(num_batches)):
                sess.run(sv.global_step)
                (s_pred, s_label, s_name) = sess.run([prob, labels, names])
                pi_prediction.extend(s_pred)
                pi_label.extend(s_label)
                pi_name.extend(s_name)

        # Default to a timestamped file in eval_dir unless --csv_name is given.
        if FLAGS.csv_name is None:
            csv_path = os.path.join(
                FLAGS.eval_dir,
                strftime("%Y_%m_%d_%H_%M_%S", gmtime()) + "_" +
                FLAGS.dataset_split_name + "_prediciton.csv")
        else:
            csv_path = FLAGS.csv_name
        with open(csv_path, "w+") as f:
            fieldnames = ['src', 'label', 'predict']
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            for src, sample_label, sample_prob in zip(pi_name, pi_label,
                                                      pi_prediction):
                writer.writerow({
                    'src': src,
                    'label': sample_label,
                    'predict': sample_prob
                })
        # Interpolate the path into the message; passing it as a second
        # print() argument would leave the literal "%s" in the output.
        print("write file to %s" % csv_path)
def main_fun(argv, ctx):
  """Define the evaluation flags, build the eval graph and evaluate once.

  Args:
    argv: Argument list installed as ``sys.argv`` before the flags are read.
    ctx: Node context handed to ``TFNode.start_cluster_server``.

  Raises:
    ValueError: if the --dataset_dir flag is not supplied.
  """
  import math
  import six
  import tensorflow as tf

  from datasets import dataset_factory
  from nets import nets_factory
  from preprocessing import preprocessing_factory

  sys.argv = argv

  slim = tf.contrib.slim
  flags = tf.app.flags

  # (definer, flag name, default value, help text), registered in this order.
  flag_specs = [
      (flags.DEFINE_integer, 'batch_size', 100,
       'The number of samples in each batch.'),
      (flags.DEFINE_integer, 'max_num_batches', None,
       'Max number of batches to evaluate by default use all.'),
      (flags.DEFINE_string, 'master', '',
       'The address of the TensorFlow master to use.'),
      (flags.DEFINE_string, 'checkpoint_path', '/tmp/tfmodel/',
       'The directory where the model was written to or an absolute path to a '
       'checkpoint file.'),
      (flags.DEFINE_string, 'eval_dir', '/tmp/tfmodel/',
       'Directory where the results are saved to.'),
      (flags.DEFINE_integer, 'num_preprocessing_threads', 4,
       'The number of threads used to create the batches.'),
      (flags.DEFINE_string, 'dataset_name', 'imagenet',
       'The name of the dataset to load.'),
      (flags.DEFINE_string, 'dataset_split_name', 'test',
       'The name of the train/test split.'),
      (flags.DEFINE_string, 'dataset_dir', None,
       'The directory where the dataset files are stored.'),
      (flags.DEFINE_integer, 'labels_offset', 0,
       'An offset for the labels in the dataset. This flag is primarily used to '
       'evaluate the VGG and ResNet architectures which do not use a background '
       'class for the ImageNet dataset.'),
      (flags.DEFINE_string, 'model_name', 'inception_v3',
       'The name of the architecture to evaluate.'),
      (flags.DEFINE_string, 'preprocessing_name', None,
       'The name of the preprocessing to use. If left '
       'as `None`, then the model_name flag is used.'),
      (flags.DEFINE_float, 'moving_average_decay', None,
       'The decay to use for the moving average.'
       'If left as None, then moving averages are not used.'),
      (flags.DEFINE_integer, 'eval_image_size', None, 'Eval image size'),
  ]
  for define_flag, flag_name, default_value, help_text in flag_specs:
    define_flag(flag_name, default_value, help_text)

  FLAGS = flags.FLAGS

  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    # The global step is created as a plain variable rather than through
    # slim.get_or_create_global_step().
    tf_global_step = tf.Variable(0, name="global_step")

    # --- Dataset ---------------------------------------------------------
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # --- Model -----------------------------------------------------------
    class_count = dataset.num_classes - FLAGS.labels_offset
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name, num_classes=class_count, is_training=False)

    # --- Data provider ---------------------------------------------------
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label] = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    # --- Preprocessing ---------------------------------------------------
    # Fall back to the model name when no preprocessing name was given.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name, is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size
    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    # --- Network and variables to restore --------------------------------
    logits, _ = network_fn(images)

    if not FLAGS.moving_average_decay:
      variables_to_restore = slim.get_variables_to_restore()
    else:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # --- Metrics ---------------------------------------------------------
    metric_map = {
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
    }
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
        metric_map)

    # Print the summaries to screen.
    for metric_name, metric_value in six.iteritems(names_to_values):
      summary_name = 'eval/%s' % metric_name
      summary = tf.summary.scalar(summary_name, metric_value, collections=[])
      summary = tf.Print(summary, [metric_value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, summary)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # Enough batches for a single pass over all of the data.
      num_batches = math.ceil(dataset.num_samples / float(FLAGS.batch_size))

    # A directory means "use its latest checkpoint"; otherwise the flag is
    # taken as the checkpoint file itself.
    checkpoint_path = FLAGS.checkpoint_path
    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=variables_to_restore)
def main_fun(argv, ctx):
    import tensorflow as tf
    from tensorflow.python.ops import control_flow_ops
    from datasets import dataset_factory
    from deployment import model_deploy
    from nets import nets_factory
    from preprocessing import preprocessing_factory

    sys.argv = argv

    slim = tf.contrib.slim

    tf.app.flags.DEFINE_integer('num_gpus', '1',
                                'The number of GPUs to use per node')

    tf.app.flags.DEFINE_boolean('rdma', False, 'Whether to use rdma.')

    tf.app.flags.DEFINE_string('master', '',
                               'The address of the TensorFlow master to use.')

    tf.app.flags.DEFINE_string(
        'train_dir', '/tmp/tfmodel/',
        'Directory where checkpoints and event logs are written to.')

    tf.app.flags.DEFINE_integer('num_clones', 1,
                                'Number of model clones to deploy.')

    tf.app.flags.DEFINE_boolean('clone_on_cpu', False,
                                'Use CPUs to deploy clones.')

    tf.app.flags.DEFINE_integer('worker_replicas', 1,
                                'Number of worker replicas.')

    tf.app.flags.DEFINE_integer(
        'num_ps_tasks', 0,
        'The number of parameter servers. If the value is 0, then the parameters '
        'are handled locally by the worker.')

    tf.app.flags.DEFINE_integer(
        'num_readers', 4,
        'The number of parallel readers that read data from the dataset.')

    tf.app.flags.DEFINE_integer(
        'num_preprocessing_threads', 4,
        'The number of threads used to create the batches.')

    tf.app.flags.DEFINE_integer('log_every_n_steps', 10,
                                'The frequency with which logs are print.')

    tf.app.flags.DEFINE_integer(
        'save_summaries_secs', 600,
        'The frequency with which summaries are saved, in seconds.')

    tf.app.flags.DEFINE_integer(
        'save_interval_secs', 600,
        'The frequency with which the model is saved, in seconds.')

    tf.app.flags.DEFINE_integer(
        'task', 0, 'Task id of the replica running the training.')

    ######################
    # Optimization Flags #
    ######################

    tf.app.flags.DEFINE_float('weight_decay', 0.00004,
                              'The weight decay on the model weights.')

    tf.app.flags.DEFINE_string(
        'optimizer', 'rmsprop',
        'The name of the optimizer, one of "adadelta", "adagrad", "adam",'
        '"ftrl", "momentum", "sgd" or "rmsprop".')

    tf.app.flags.DEFINE_float('adadelta_rho', 0.95,
                              'The decay rate for adadelta.')

    tf.app.flags.DEFINE_float('adagrad_initial_accumulator_value', 0.1,
                              'Starting value for the AdaGrad accumulators.')

    tf.app.flags.DEFINE_float(
        'adam_beta1', 0.9,
        'The exponential decay rate for the 1st moment estimates.')

    tf.app.flags.DEFINE_float(
        'adam_beta2', 0.999,
        'The exponential decay rate for the 2nd moment estimates.')

    tf.app.flags.DEFINE_float('opt_epsilon', 1.0,
                              'Epsilon term for the optimizer.')

    tf.app.flags.DEFINE_float('ftrl_learning_rate_power', -0.5,
                              'The learning rate power.')

    tf.app.flags.DEFINE_float('ftrl_initial_accumulator_value', 0.1,
                              'Starting value for the FTRL accumulators.')

    tf.app.flags.DEFINE_float('ftrl_l1', 0.0,
                              'The FTRL l1 regularization strength.')

    tf.app.flags.DEFINE_float('ftrl_l2', 0.0,
                              'The FTRL l2 regularization strength.')

    tf.app.flags.DEFINE_float(
        'momentum', 0.9,
        'The momentum for the MomentumOptimizer and RMSPropOptimizer.')

    tf.app.flags.DEFINE_float('rmsprop_decay', 0.9, 'Decay term for RMSProp.')

    #######################
    # Learning Rate Flags #
    #######################

    tf.app.flags.DEFINE_string(
        'learning_rate_decay_type', 'exponential',
        'Specifies how the learning rate is decayed. One of "fixed", "exponential",'
        ' or "polynomial"')

    tf.app.flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')

    tf.app.flags.DEFINE_float(
        'end_learning_rate', 0.0001,
        'The minimal end learning rate used by a polynomial decay learning rate.'
    )

    tf.app.flags.DEFINE_float('label_smoothing', 0.0,
                              'The amount of label smoothing.')

    tf.app.flags.DEFINE_float('learning_rate_decay_factor', 0.94,
                              'Learning rate decay factor.')

    tf.app.flags.DEFINE_float(
        'num_epochs_per_decay', 2.0,
        'Number of epochs after which learning rate decays.')

    tf.app.flags.DEFINE_bool(
        'sync_replicas', False,
        'Whether or not to synchronize the replicas during training.')

    tf.app.flags.DEFINE_integer(
        'replicas_to_aggregate', 1,
        'The Number of gradients to collect before updating params.')

    tf.app.flags.DEFINE_float(
        'moving_average_decay', None,
        'The decay to use for the moving average.'
        'If left as None, then moving averages are not used.')

    #######################
    # Dataset Flags #
    #######################

    tf.app.flags.DEFINE_string('dataset_name', 'imagenet',
                               'The name of the dataset to load.')

    tf.app.flags.DEFINE_string('dataset_split_name', 'train',
                               'The name of the train/test split.')

    tf.app.flags.DEFINE_string(
        'dataset_dir', None,
        'The directory where the dataset files are stored.')

    tf.app.flags.DEFINE_integer(
        'labels_offset', 0,
        'An offset for the labels in the dataset. This flag is primarily used to '
        'evaluate the VGG and ResNet architectures which do not use a background '
        'class for the ImageNet dataset.')

    tf.app.flags.DEFINE_string('model_name', 'inception_v3',
                               'The name of the architecture to train.')

    tf.app.flags.DEFINE_string(
        'preprocessing_name', None,
        'The name of the preprocessing to use. If left '
        'as `None`, then the model_name flag is used.')

    tf.app.flags.DEFINE_integer('batch_size', 32,
                                'The number of samples in each batch.')

    tf.app.flags.DEFINE_integer('train_image_size', None, 'Train image size')

    tf.app.flags.DEFINE_integer('max_number_of_steps', None,
                                'The maximum number of training steps.')

    #####################
    # Fine-Tuning Flags #
    #####################

    tf.app.flags.DEFINE_string(
        'checkpoint_path', None,
        'The path to a checkpoint from which to fine-tune.')

    tf.app.flags.DEFINE_string(
        'checkpoint_exclude_scopes', None,
        'Comma-separated list of scopes of variables to exclude when restoring '
        'from a checkpoint.')

    tf.app.flags.DEFINE_string(
        'trainable_scopes', None,
        'Comma-separated list of scopes to filter the set of variables to train.'
        'By default, None would train all the variables.')

    tf.app.flags.DEFINE_boolean(
        'ignore_missing_vars', False,
        'When restoring a checkpoint would ignore missing variables.')

    FLAGS = tf.app.flags.FLAGS
    FLAGS.job_name = ctx.job_name
    FLAGS.task = ctx.task_index
    FLAGS.num_clones = FLAGS.num_gpus
    FLAGS.worker_replicas = len(ctx.cluster_spec['worker'])
    assert (FLAGS.num_ps_tasks == (len(ctx.cluster_spec['ps'])
                                   if 'ps' in ctx.cluster_spec else 0))

    def _configure_learning_rate(num_samples_per_epoch, global_step):
        """Configures the learning rate.

        Args:
          num_samples_per_epoch: The number of samples in each epoch of
            training.
          global_step: The global_step tensor.

        Returns:
          A `Tensor` representing the learning rate.

        Raises:
          ValueError: if FLAGS.learning_rate_decay_type is not one of
            "exponential", "fixed" or "polynomial".
        """
        decay_steps = int(num_samples_per_epoch / FLAGS.batch_size *
                          FLAGS.num_epochs_per_decay)
        if FLAGS.sync_replicas:
            # Each update aggregates several replicas' gradients, so fewer
            # global steps make up one decay period.
            decay_steps /= FLAGS.replicas_to_aggregate

        if FLAGS.learning_rate_decay_type == 'exponential':
            return tf.train.exponential_decay(
                FLAGS.learning_rate,
                global_step,
                decay_steps,
                FLAGS.learning_rate_decay_factor,
                staircase=True,
                name='exponential_decay_learning_rate')
        elif FLAGS.learning_rate_decay_type == 'fixed':
            return tf.constant(FLAGS.learning_rate, name='fixed_learning_rate')
        elif FLAGS.learning_rate_decay_type == 'polynomial':
            return tf.train.polynomial_decay(
                FLAGS.learning_rate,
                global_step,
                decay_steps,
                FLAGS.end_learning_rate,
                power=1.0,
                cycle=False,
                name='polynomial_decay_learning_rate')
        else:
            # Interpolate the offending value into the message; passing it as
            # a second exception argument leaves "%s" unformatted.
            raise ValueError(
                'learning_rate_decay_type [%s] was not recognized' %
                FLAGS.learning_rate_decay_type)

    def _configure_optimizer(learning_rate):
        """Configures the optimizer used for training.

        The optimizer class is chosen by FLAGS.optimizer and its
        hyper-parameters are read from the matching flags.

        Args:
          learning_rate: A scalar or `Tensor` learning rate.

        Returns:
          An instance of an optimizer.

        Raises:
          ValueError: if FLAGS.optimizer is not recognized.
        """
        if FLAGS.optimizer == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(learning_rate,
                                                   rho=FLAGS.adadelta_rho,
                                                   epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(
                learning_rate,
                initial_accumulator_value=(
                    FLAGS.adagrad_initial_accumulator_value))
        elif FLAGS.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate,
                                               beta1=FLAGS.adam_beta1,
                                               beta2=FLAGS.adam_beta2,
                                               epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'ftrl':
            optimizer = tf.train.FtrlOptimizer(
                learning_rate,
                learning_rate_power=FLAGS.ftrl_learning_rate_power,
                initial_accumulator_value=FLAGS.ftrl_initial_accumulator_value,
                l1_regularization_strength=FLAGS.ftrl_l1,
                l2_regularization_strength=FLAGS.ftrl_l2)
        elif FLAGS.optimizer == 'momentum':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=FLAGS.momentum,
                                                   name='Momentum')
        elif FLAGS.optimizer == 'rmsprop':
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  decay=FLAGS.rmsprop_decay,
                                                  momentum=FLAGS.momentum,
                                                  epsilon=FLAGS.opt_epsilon)
        elif FLAGS.optimizer == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        else:
            # Interpolate the offending value into the message; passing it as
            # a second exception argument leaves "%s" unformatted.
            raise ValueError('Optimizer [%s] was not recognized' %
                             FLAGS.optimizer)
        return optimizer

    def _add_variables_summaries(learning_rate):
        """Build one histogram summary per model variable plus an LR scalar.

        Args:
          learning_rate: Value recorded under 'training/Learning Rate'.

        Returns:
          A list holding the histogram summaries (in the order returned by
          slim.get_model_variables()) followed by the learning-rate summary.
        """
        histograms = [
            tf.summary.histogram(model_var.op.name, model_var)
            for model_var in slim.get_model_variables()
        ]
        lr_summary = tf.summary.scalar('training/Learning Rate',
                                       learning_rate)
        return histograms + [lr_summary]

    def _get_init_fn():
        """Returns a function run by the chief worker to warm-start training.

        Note that the init_fn is only run when initializing the model during
        the very first global step.

        Returns:
          An init function run by the supervisor, or None when no
          --checkpoint_path was given or when FLAGS.train_dir already
          contains a checkpoint.
        """
        if FLAGS.checkpoint_path is None:
            return None

        # A checkpoint already present in train_dir takes precedence, so the
        # fine-tuning checkpoint is ignored (and the user is told about it).
        if tf.train.latest_checkpoint(FLAGS.train_dir):
            tf.logging.info(
                'Ignoring --checkpoint_path because a checkpoint already exists in %s'
                % FLAGS.train_dir)
            return None

        excluded_prefixes = []
        if FLAGS.checkpoint_exclude_scopes:
            raw_scopes = FLAGS.checkpoint_exclude_scopes.split(',')
            excluded_prefixes = [raw.strip() for raw in raw_scopes]

        # TODO(sguada) variables.filter_variables()
        # Keep every model variable whose op name matches no excluded prefix.
        variables_to_restore = [
            model_var for model_var in slim.get_model_variables()
            if not any(
                model_var.op.name.startswith(prefix)
                for prefix in excluded_prefixes)
        ]

        # A directory means "use its latest checkpoint".
        checkpoint_path = FLAGS.checkpoint_path
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)

        tf.logging.info('Fine-tuning from %s' % checkpoint_path)

        return slim.assign_from_checkpoint_fn(
            checkpoint_path,
            variables_to_restore,
            ignore_missing_vars=FLAGS.ignore_missing_vars)

    def _get_variables_to_train():
        """Returns a list of variables to train.

        With no --trainable_scopes every trainable variable is returned;
        otherwise only the trainable variables collected for each of the
        comma-separated scopes, in the order the scopes are listed.

        Returns:
          A list of variables to train by the optimizer.
        """
        if FLAGS.trainable_scopes is None:
            return tf.trainable_variables()

        scope_names = [
            raw.strip() for raw in FLAGS.trainable_scopes.split(',')
        ]
        return [
            variable
            for scope_name in scope_names
            for variable in tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
        ]

    # main
    cluster_spec, server = TFNode.start_cluster_server(ctx=ctx,
                                                       num_gpus=FLAGS.num_gpus,
                                                       rdma=FLAGS.rdma)
    if ctx.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()
    else:
        # `worker` jobs will actually do the work.
        if not FLAGS.dataset_dir:
            raise ValueError(
                'You must supply the dataset directory with --dataset_dir')

        tf.logging.set_verbosity(tf.logging.INFO)
        with tf.Graph().as_default():
            #######################
            # Config model_deploy #
            #######################
            deploy_config = model_deploy.DeploymentConfig(
                num_clones=FLAGS.num_clones,
                clone_on_cpu=FLAGS.clone_on_cpu,
                replica_id=FLAGS.task,
                num_replicas=FLAGS.worker_replicas,
                num_ps_tasks=FLAGS.num_ps_tasks)

            # Create global_step
            #with tf.device(deploy_config.variables_device()):
            #  global_step = slim.create_global_step()
            with tf.device("/job:ps/task:0"):
                global_step = tf.Variable(0, name="global_step")

            ######################
            # Select the dataset #
            ######################
            dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                                  FLAGS.dataset_split_name,
                                                  FLAGS.dataset_dir)

            ######################
            # Select the network #
            ######################
            network_fn = nets_factory.get_network_fn(
                FLAGS.model_name,
                num_classes=(dataset.num_classes - FLAGS.labels_offset),
                weight_decay=FLAGS.weight_decay,
                is_training=True)

            #####################################
            # Select the preprocessing function #
            #####################################
            preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
            image_preprocessing_fn = preprocessing_factory.get_preprocessing(
                preprocessing_name, is_training=True)

            ##############################################################
            # Create a dataset provider that loads data from the dataset #
            ##############################################################
            with tf.device(deploy_config.inputs_device()):
                provider = slim.dataset_data_provider.DatasetDataProvider(
                    dataset,
                    num_readers=FLAGS.num_readers,
                    common_queue_capacity=20 * FLAGS.batch_size,
                    common_queue_min=10 * FLAGS.batch_size)
                [image, label] = provider.get(['image', 'label'])
                label -= FLAGS.labels_offset

                train_image_size = FLAGS.train_image_size or network_fn.default_image_size

                image = image_preprocessing_fn(image, train_image_size,
                                               train_image_size)

                images, labels = tf.train.batch(
                    [image, label],
                    batch_size=FLAGS.batch_size,
                    num_threads=FLAGS.num_preprocessing_threads,
                    capacity=5 * FLAGS.batch_size)
                labels = slim.one_hot_encoding(
                    labels, dataset.num_classes - FLAGS.labels_offset)
                batch_queue = slim.prefetch_queue.prefetch_queue(
                    [images, labels], capacity=2 * deploy_config.num_clones)

            ####################
            # Define the model #
            ####################
            def clone_fn(batch_queue):
                """Allows data parallelism by creating multiple clones of network_fn."""
                images, labels = batch_queue.dequeue()
                logits, end_points = network_fn(images)

                #############################
                # Specify the loss function #
                #############################
                if 'AuxLogits' in end_points:
                    tf.losses.softmax_cross_entropy(
                        logits=end_points['AuxLogits'],
                        onehot_labels=labels,
                        label_smoothing=FLAGS.label_smoothing,
                        weights=0.4,
                        scope='aux_loss')
                tf.losses.softmax_cross_entropy(
                    logits=logits,
                    onehot_labels=labels,
                    label_smoothing=FLAGS.label_smoothing,
                    weights=1.0)
                return end_points

            # Gather initial summaries.
            summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

            clones = model_deploy.create_clones(deploy_config, clone_fn,
                                                [batch_queue])
            first_clone_scope = deploy_config.clone_scope(0)
            # Gather update_ops from the first clone. These contain, for example,
            # the updates for the batch_norm variables created by network_fn.
            update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                           first_clone_scope)

            # Add summaries for end_points.
            end_points = clones[0].outputs
            for end_point in end_points:
                x = end_points[end_point]
                summaries.add(
                    tf.summary.histogram('activations/' + end_point, x))
                summaries.add(
                    tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

            # Add summaries for losses.
            for loss in tf.get_collection(tf.GraphKeys.LOSSES,
                                          first_clone_scope):
                summaries.add(
                    tf.summary.scalar('losses/%s' % loss.op.name, loss))

            # Add summaries for variables.
            for variable in slim.get_model_variables():
                summaries.add(tf.summary.histogram(variable.op.name, variable))

            #################################
            # Configure the moving averages #
            #################################
            if FLAGS.moving_average_decay:
                moving_average_variables = slim.get_model_variables()
                variable_averages = tf.train.ExponentialMovingAverage(
                    FLAGS.moving_average_decay, global_step)
            else:
                moving_average_variables, variable_averages = None, None

            #########################################
            # Configure the optimization procedure. #
            #########################################
            with tf.device(deploy_config.optimizer_device()):
                learning_rate = _configure_learning_rate(
                    dataset.num_samples, global_step)
                optimizer = _configure_optimizer(learning_rate)
                summaries.add(tf.summary.scalar('learning_rate',
                                                learning_rate))

            if FLAGS.sync_replicas:
                # If sync_replicas is enabled, the averaging will be done in the chief
                # queue runner.
                optimizer = tf.train.SyncReplicasOptimizer(
                    opt=optimizer,
                    replicas_to_aggregate=FLAGS.replicas_to_aggregate,
                    variable_averages=variable_averages,
                    variables_to_average=moving_average_variables,
                    replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
                    total_num_replicas=FLAGS.worker_replicas)
            elif FLAGS.moving_average_decay:
                # Update ops executed locally by trainer.
                update_ops.append(
                    variable_averages.apply(moving_average_variables))

            # Variables to train.
            variables_to_train = _get_variables_to_train()

            #  and returns a train_tensor and summary_op
            total_loss, clones_gradients = model_deploy.optimize_clones(
                clones, optimizer, var_list=variables_to_train)
            # Add total_loss to summary.
            summaries.add(tf.summary.scalar('total_loss', total_loss))

            # Create gradient updates.
            grad_updates = optimizer.apply_gradients(clones_gradients,
                                                     global_step=global_step)
            update_ops.append(grad_updates)

            update_op = tf.group(*update_ops)
            train_tensor = control_flow_ops.with_dependencies([update_op],
                                                              total_loss,
                                                              name='train_op')

            # Add the summaries from the first clone. These contain the summaries
            # created by model_fn and either optimize_clones() or _gather_clone_loss().
            summaries |= set(
                tf.get_collection(tf.GraphKeys.SUMMARIES, first_clone_scope))

            # Merge all summaries together.
            summary_op = tf.summary.merge(list(summaries), name='summary_op')

            ###########################
            # Kicks off the training. #
            ###########################
            summary_writer = tf.summary.FileWriter(
                "tensorboard_%d" % (ctx.worker_num),
                graph=tf.get_default_graph())
            slim.learning.train(
                train_tensor,
                logdir=FLAGS.train_dir,
                master=server.target,
                is_chief=(FLAGS.task == 0),
                init_fn=_get_init_fn(),
                summary_op=summary_op,
                number_of_steps=FLAGS.max_number_of_steps,
                log_every_n_steps=FLAGS.log_every_n_steps,
                save_summaries_secs=FLAGS.save_summaries_secs,
                save_interval_secs=FLAGS.save_interval_secs,
                summary_writer=summary_writer,
                sync_optimizer=optimizer if FLAGS.sync_replicas else None)
Пример #43
0
def main(_):
  """Builds the training graph and runs synchronous-replica training.

  Parses hyper-parameters from --hparams, builds the input pipeline and the
  model (optionally appending adversarial examples to every batch), sums the
  cross-entropy, optional pairing and optional weight-decay losses, wraps the
  optimizer in a SyncReplicasOptimizer and then runs train_op inside a
  MonitoredTrainingSession until the session asks to stop.

  Args:
    _: unused positional argument (presumably supplied by tf.app.run --
      confirm against the script's entry point).

  Raises:
    AssertionError: if --output_dir is empty.
  """
  assert FLAGS.output_dir, '--output_dir has to be provided'
  if not tf.gfile.Exists(FLAGS.output_dir):
    tf.gfile.MakeDirs(FLAGS.output_dir)
  # Start from the library defaults and overlay the user's --hparams string.
  params = model_lib.default_hparams()
  params.parse(FLAGS.hparams)
  tf.logging.info('User provided hparams: %s', FLAGS.hparams)
  tf.logging.info('All hyper parameters: %s', params)
  batch_size = params.batch_size
  graph = tf.Graph()
  with graph.as_default():
    with tf.device(tf.train.replica_device_setter(ps_tasks=FLAGS.ps_tasks)):
      # dataset
      dataset, examples_per_epoch, num_classes, bounds = (
          dataset_factory.get_dataset(
              FLAGS.dataset,
              'train',
              batch_size,
              FLAGS.dataset_image_size,
              is_training=True))
      dataset_iterator = dataset.make_one_shot_iterator()
      images, labels = dataset_iterator.get_next()
      one_hot_labels = tf.one_hot(labels, num_classes)

      # set up model
      global_step = tf.train.get_or_create_global_step()
      model_fn = model_lib.get_model(FLAGS.model_name, num_classes)
      if params.train_adv_method == 'clean':
        # Clean training: the batch is fed to the model unchanged.
        logits = model_fn(images, is_training=True)
        adv_examples = None
      else:
        # The attack queries the model in eval mode; the training pass below
        # then sees clean and adversarial examples stacked along axis 0.
        model_fn_eval_mode = lambda x: model_fn(x, is_training=False)
        adv_examples = adversarial_attack.generate_adversarial_examples(
            images, bounds, model_fn_eval_mode, params.train_adv_method)
        all_examples = tf.concat([images, adv_examples], axis=0)
        logits = model_fn(all_examples, is_training=True)
        # Labels are duplicated so they line up with the doubled batch.
        one_hot_labels = tf.concat([one_hot_labels, one_hot_labels], axis=0)

      # update trainable variables if fine tuning is used
      model_lib.filter_trainable_variables(
          FLAGS.finetune_trainable_scopes)

      # set up losses
      total_loss = tf.losses.softmax_cross_entropy(
          onehot_labels=one_hot_labels,
          logits=logits,
          label_smoothing=params.label_smoothing)
      tf.summary.scalar('loss_xent', total_loss)

      if params.train_lp_weight > 0:
        # NOTE: despite their names, images1/images2 are the two halves of the
        # logits tensor, not images. With adversarial training enabled the
        # halves are the clean and adversarial logits (see the concat above);
        # in 'clean' mode they are simply two halves of the same batch.
        images1, images2 = tf.split(logits, 2)
        loss_lp = tf.losses.mean_squared_error(
            images1, images2, weights=params.train_lp_weight)
        tf.summary.scalar('loss_lp', loss_lp)
        total_loss += loss_lp

      if params.weight_decay > 0:
        # L2 penalty over every variable currently in the trainable
        # collection (read after the fine-tuning filter call above).
        loss_wd = (
            params.weight_decay
            * tf.add_n([tf.nn.l2_loss(v) for v in tf.trainable_variables()])
        )
        tf.summary.scalar('loss_wd', loss_wd)
        total_loss += loss_wd

      # Setup the moving averages:
      if FLAGS.moving_average_decay and (FLAGS.moving_average_decay > 0):
        with tf.name_scope('moving_average'):
          moving_average_variables = tf.contrib.framework.get_model_variables()
          variable_averages = tf.train.ExponentialMovingAverage(
              FLAGS.moving_average_decay, global_step)
      else:
        # Both are passed as None to SyncReplicasOptimizer below.
        moving_average_variables = None
        variable_averages = None

      # set up optimizer and training op
      learning_rate, steps_per_epoch = model_lib.get_lr_schedule(
          params, examples_per_epoch, FLAGS.replicas_to_aggregate)

      optimizer = model_lib.get_optimizer(params, learning_rate)

      # The sync wrapper is applied unconditionally; it also receives the
      # moving-average configuration prepared above.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          total_num_replicas=FLAGS.worker_replicas,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables)

      train_op = tf.contrib.training.create_train_op(
          total_loss, optimizer,
          update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))

      tf.summary.image('images', images[0:FLAGS.num_summary_images])
      if adv_examples is not None:
        tf.summary.image('adv_images', adv_examples[0:FLAGS.num_summary_images])
      tf.summary.scalar('total_loss', total_loss)
      tf.summary.scalar('learning_rate', learning_rate)
      tf.summary.scalar('current_epoch',
                        tf.to_double(global_step) / steps_per_epoch)

      # Training
      is_chief = FLAGS.task == 0

      scaffold = tf.train.Scaffold(
          init_fn=_get_finetuning_init_fn(variable_averages))
      # Hooks that run on every worker: per-step logging and a NaN guard.
      hooks = [
          tf.train.LoggingTensorHook({'total_loss': total_loss,
                                      'global_step': global_step},
                                     every_n_iter=1),
          tf.train.NanTensorHook(total_loss),
      ]
      # Summary and checkpoint writing is limited to the chief.
      chief_only_hooks = [
          tf.train.SummarySaverHook(save_steps=FLAGS.save_summaries_steps,
                                    save_secs=FLAGS.save_summaries_secs,
                                    output_dir=FLAGS.output_dir,
                                    scaffold=scaffold),
          tf.train.CheckpointSaverHook(FLAGS.output_dir,
                                       save_steps=FLAGS.save_model_steps,
                                       scaffold=scaffold),
      ]

      # A non-positive --max_steps means no step limit is installed.
      if FLAGS.max_steps > 0:
        hooks.append(
            tf.train.StopAtStepHook(last_step=FLAGS.max_steps))

      # hook for sync replica training
      hooks.append(optimizer.make_session_run_hook(is_chief))

      # The session's own periodic savers are switched off (None) because the
      # chief-only hooks above already write summaries and checkpoints.
      with tf.train.MonitoredTrainingSession(
          master=FLAGS.master,
          is_chief=is_chief,
          checkpoint_dir=FLAGS.output_dir,
          scaffold=scaffold,
          hooks=hooks,
          chief_only_hooks=chief_only_hooks,
          save_checkpoint_secs=None,
          save_summaries_steps=None,
          save_summaries_secs=None) as session:
        while not session.should_stop():
          session.run([train_op])
Пример #44
0
# Builds the beginning of a two-network training graph: deployment config,
# global step, dataset handle, two network functions and the preprocessing
# function.
with tf.Graph().as_default():

  # NOTE(review): slim's DeploymentConfig appears to reject num_replicas > 1
  # when num_ps_tasks is 0 -- confirm the intended replica / ps counts.
  deploy_config = model_deploy.DeploymentConfig(
      num_clones=1,
      clone_on_cpu=False,
      replica_id=0,
      num_replicas=2,
      num_ps_tasks=0)

  # Create the global step on the device chosen for variables.
  with tf.device(deploy_config.variables_device()):
    global_step = slim.create_global_step()

  # Dataset, split, model and preprocessing names are looked up by string.
  dataset = dataset_factory.get_dataset(
      'cve_diseases', 'train', "/home/johnnyof/workspace/slim/tmp")

  cnn1 = nets_factory.get_network_fn(
      'inception_resnet_v2',
      num_classes=1001,
      weight_decay=0.00004,
      is_training=True)

  cnn2 = nets_factory.get_network_fn(
      'alexnet_v2',
      num_classes=1001,
      weight_decay=0.00004,
      is_training=True)

  # NOTE(review): the original snippet is cut off right after the
  # preprocessing name; is_training=True is assumed here to match the
  # training-mode networks above -- confirm.
  image_preprocessing_fn = preprocessing_factory.get_preprocessing(
      'inception',
      is_training=True)
Пример #45
0
def main(_):
  """Evaluates a classifier and reports overall and per-coarse-label accuracy.

  Builds an evaluation graph over the dataset selected by the flags, defines
  streaming loss/accuracy metrics plus a running accuracy per coarse label,
  and hands everything to slim.evaluation.evaluation_loop, which evaluates
  checkpoints found in --checkpoint_path.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if --dataset_dir is not supplied.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # Number of logits produced by the network; reused for the one-hot depth.
    num_outputs = dataset.num_classes - FLAGS.labels_offset

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=num_outputs,
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size)
    [image, label, coarse_label] = provider.get(
        ['image', 'label', 'coarse_label'])
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

#    image = tf.image.grayscale_to_rgb(image)

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size)

    images, labels, coarse_labels = tf.train.batch(
        [image, label, coarse_label],
        batch_size=FLAGS.batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)
    coarse_labels = tf.cast(coarse_labels, tf.int32)
    # NOTE: tf.image_summary / tf.scalar_summary / tf.select used in this
    # function are the pre-1.0 TensorFlow names; they are kept because the
    # TensorFlow version this script targets is not visible from here.
    tf.image_summary('image', images, max_images=5)

    ####################
    # Define the model #
    ####################
    logits, _ = network_fn(images)

    if FLAGS.moving_average_decay:
      # Restore the moving-average shadow values into the model variables.
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    # The one-hot depth must match the logits width; it was hard-coded to 2,
    # which only works for two-class datasets.
    one_hot_labels = slim.one_hot_encoding(labels, num_outputs)
    loss = slim.losses.softmax_cross_entropy(logits, one_hot_labels)

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Total_Loss': slim.metrics.streaming_mean(loss),
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
    })

  # NOTE(review): this block sits outside the `with tf.Graph().as_default()`
  # above; it presumably relies on `values=` to re-enter the graph that owns
  # those tensors -- confirm before restructuring.
  with tf.variable_scope('coarse_label_accuracy',
                         values=[predictions, labels, coarse_labels]):
    # Running per-coarse-label sums of correct predictions and of examples
    # seen. Both are local variables, so they are not part of checkpoints.
    totals = tf.Variable(
        initial_value=tf.zeros([len(dataset.coarse_labels_to_names)]),
        trainable=False,
        collections=[tf.GraphKeys.LOCAL_VARIABLES],
        dtype=tf.float32,
        name='totals')

    counts = tf.Variable(
        initial_value=tf.zeros([len(dataset.coarse_labels_to_names)]),
        trainable=False,
        collections=[tf.GraphKeys.LOCAL_VARIABLES],
        dtype=tf.float32,
        name='counts')

    correct = tf.cast(tf.equal(predictions, labels), tf.int32)
    accuracy_ops = []
    for index, coarse_key in enumerate(dataset.coarse_labels_to_names):
      # Correctness flags of the batch entries carrying this coarse label.
      label_correct = tf.boolean_mask(correct, tf.equal(coarse_key, coarse_labels))
      sum_correct = tf.reduce_sum(label_correct)
      sum_correct = tf.cast(tf.expand_dims(sum_correct, 0), tf.float32)
      # One-element sparse deltas so only slot `index` is incremented.
      delta_totals = tf.SparseTensor([[index]], sum_correct, totals.get_shape())
      label_count = tf.cast(tf.shape(label_correct), tf.float32)
      delta_counts = tf.SparseTensor([[index]], label_count, counts.get_shape())

      totals_compute_op = tf.assign_add(
          totals,
          tf.sparse_tensor_to_dense(delta_totals),
          use_locking=True)
      counts_compute_op = tf.assign_add(
          counts,
          tf.sparse_tensor_to_dense(delta_counts),
          use_locking=True)

      accuracy_ops.append(totals_compute_op)
      accuracy_ops.append(counts_compute_op)
    with tf.control_dependencies(accuracy_ops):
      # Per-label accuracy; labels that have not been seen yet report 0
      # instead of dividing by zero.
      update_op = tf.select(tf.equal(counts, 0),
                            tf.zeros_like(counts, tf.float32),
                            tf.div(totals, counts))
      names_to_updates['Coarse_Label_Accuracy'] = update_op

    if FLAGS.recall:
      recall_value, recall_update = slim.metrics.streaming_recall_at_k(
          logits, labels, 5)
      names_to_values['Recall@5'] = recall_value
      names_to_updates['Recall@5'] = recall_update

    # Print the summaries to screen.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.scalar_summary(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # One scalar summary per coarse label. These read update_op, which is
    # built under the control dependencies on the accumulation ops above.
    for index, label_name in enumerate(dataset.coarse_labels_to_names.values()):
      summary_name = 'eval/%s' % label_name
      op = tf.scalar_summary(summary_name, update_op[index], collections=[])
      op = tf.Print(op, [update_op[index]], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      # int() keeps num_evals integral on Python 2, where ceil returns float.
      num_batches = int(
          math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    # evaluation_loop receives --checkpoint_path as its checkpoint_dir, so the
    # flag is expected to name a directory here.
    tf.logging.info('Evaluating %s' % FLAGS.checkpoint_path)

    # Pass the mapping built above so that --moving_average_decay actually
    # restores the averaged weights; previously a fresh
    # slim.get_variables_to_restore() was passed and the mapping was ignored.
    slim.evaluation.evaluation_loop(
        master=FLAGS.master,
        checkpoint_dir=FLAGS.checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        eval_interval_secs=FLAGS.eval_interval_secs,
        variables_to_restore=variables_to_restore)
Пример #46
0
def main(_):
    """Evaluates one checkpoint and reports which style uids were missed.

    Builds an evaluation graph over the dataset selected by the flags, runs
    slim.evaluation.evaluate_once, prints how often each uid was
    misclassified and, when --metadir contains `sprite.png` and `labels.tsv`,
    writes a `mask.png` heat map of mistakes next to the evaluation output.

    Args:
      _: unused positional argument.

    Raises:
      ValueError: if --dataset_dir is not supplied.
    """
    # Function-scope imports, as in the original (it imported these midway
    # through the function body).
    import collections
    import sys

    import numpy as np

    if not FLAGS.dataset_dir:
        raise ValueError(
            'You must supply the dataset directory with --dataset_dir')

    tf.logging.set_verbosity(tf.logging.INFO)
    with tf.Graph().as_default():
        tf_global_step = slim.get_or_create_global_step()

        ######################
        # Select the dataset #
        ######################
        dataset = dataset_factory.get_dataset(FLAGS.dataset_name,
                                              FLAGS.dataset_split_name,
                                              FLAGS.dataset_dir)

        # The label space is either styles or content classes; every consumer
        # below (network, one-hot encoding, confusion matrix) uses this size.
        num_classes = (dataset.num_styles
                       if FLAGS.target_style else dataset.num_classes)
        num_outputs = num_classes - FLAGS.labels_offset

        ####################
        # Select the model #
        ####################
        network_fn = nets_factory.get_network_fn(
            FLAGS.model_name,
            num_classes=num_outputs,
            is_training=False)

        preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            preprocessing_name, is_training=False)

        eval_image_size = (FLAGS.eval_image_size
                           or network_fn.default_image_size)

        provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            common_queue_capacity=20 * FLAGS.batch_size,
            common_queue_min=10 * FLAGS.batch_size,
            shuffle=False)
        [data, content_label, style_label, file_path, u_data, u_labels,
         u_file_paths] = provider.get([
             'image', 'label', 'uid', 'file_path', 'user_set/images',
             'user_set/labels', 'user_set/file_paths'
         ])
        content_label -= FLAGS.labels_offset

        data = image_preprocessing_fn(data, eval_image_size, eval_image_size)
        # Apply the same preprocessing to every image of the user set.
        u_data = tf.map_fn(lambda u_instance: image_preprocessing_fn(
            u_instance, eval_image_size, eval_image_size),
                           u_data,
                           dtype=tf.float32)

        (image_batch, content_label_batch, style_label_batch, f_path_batch,
         u_data_batch, u_labels_batch, u_file_paths_batch) = tf.train.batch(
             [data, content_label, style_label, file_path, u_data, u_labels,
              u_file_paths],
             batch_size=FLAGS.batch_size,
             num_threads=FLAGS.num_preprocessing_threads,
             capacity=5 * FLAGS.batch_size,
             dynamic_pad=True)

        label_batch = (style_label_batch // 2
                       if FLAGS.target_style else content_label_batch)
        label_batch = slim.one_hot_encoding(label_batch, num_outputs)
        image_batch = tf.Print(image_batch, [tf.reduce_mean(image_batch)],
                               message="mean")
        ####################
        # Define the model #
        ####################
        ds = DataStream(Task.CLASSIFICATION, DataType.IMAGE)
        # One encoded instance per batch element.
        data_instance_list = [
            ds.encode(*t) for t in zip(
                tf.unstack(image_batch), tf.unstack(content_label_batch),
                tf.unstack(style_label_batch), tf.unstack(u_data_batch),
                tf.unstack(u_labels_batch), tf.unstack(f_path_batch),
                tf.unstack(u_file_paths_batch))
        ]
        f_path_batch = tf.Print(f_path_batch, [tf.shape(f_path_batch)],
                                message='path batch')
        with tf.variable_scope('network_fn'):
            logits, _ = network_fn(data_instance_list)

        if FLAGS.moving_average_decay:
            variable_averages = tf.train.ExponentialMovingAverage(
                FLAGS.moving_average_decay, tf_global_step)
            variables_to_restore = variable_averages.variables_to_restore(
                slim.get_model_variables())
            variables_to_restore[tf_global_step.op.name] = tf_global_step
        else:
            variables_to_restore = slim.get_variables_to_restore()

        predictions = tf.argmax(logits, 1)
        loss = tf.losses.softmax_cross_entropy(label_batch,
                                               logits,
                                               weights=1.0)
        # Back from one-hot to class indices for the metrics below.
        label_batch = tf.argmax(label_batch, 1)

        # Define the metrics:
        # Positions in the batch where the prediction is wrong.
        idxs = tf.squeeze(tf.where(tf.not_equal(predictions, label_batch)))
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, label_batch),
            'Recall@5':
            slim.metrics.streaming_recall_at_k(logits, label_batch, 5),
            # Accumulates the uid of every misclassified example.
            'uid_missed':
            slim.metrics.streaming_concat(
                tf.reshape(tf.gather(style_label_batch, idxs), [-1]))
        })

        # Make sure the output directory exists without going through a shell
        # (the path is built from flags and was previously interpolated into
        # an `mkdir -p` command line unescaped).
        _md = os.path.join(FLAGS.eval_dir, FLAGS.dataset_split_name,
                           "mistakes/")
        if not os.path.isdir(_md):
            os.makedirs(_md)

        # Print the summaries to screen.
        # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
        for name, value in names_to_values.items():
            if name == 'uid_missed':
                # Not a scalar; it is fetched through final_op instead.
                continue
            summary_name = 'eval/%s' % name
            op = tf.summary.scalar(summary_name, value, collections=[])
            op = tf.Print(op, [value], summary_name)
            tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

        # TODO(sguada) use num_epochs=1
        if FLAGS.max_num_batches:
            num_batches = FLAGS.max_num_batches
        else:
            # This ensures that we make a single pass over all of the data.
            num_batches = math.ceil(dataset.num_samples /
                                    float(FLAGS.batch_size))

        if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
            checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        else:
            checkpoint_path = FLAGS.checkpoint_path

        tf.logging.info('Evaluating %s' % checkpoint_path)

        print("debug %s %s " % (label_batch, predictions))
        _um, conf_matrix, otp, pred = slim.evaluation.evaluate_once(
            master=FLAGS.master,
            checkpoint_path=checkpoint_path,
            logdir=FLAGS.eval_dir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            variables_to_restore=variables_to_restore,
            final_op=[
                names_to_values['uid_missed'],
                # Sized like the label space actually evaluated, so style
                # labels do not overflow a content-sized matrix.
                tf.confusion_matrix(label_batch,
                                    predictions,
                                    num_classes=num_outputs),
                label_batch, predictions
            ])

        # Full print of arrays. threshold=np.nan is rejected by current
        # NumPy releases; sys.maxsize is the documented way to disable
        # summarisation.
        np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf)

        print(len(pred), pred)
        print(len(otp), otp)

        _um = list(_um)

        font_freq = collections.Counter(_um)
        print("Mistakes per font label: %s" % font_freq)
        print("Total number of mistaken uids %d" % len(_um))

        if FLAGS.metadir:
            # NOTE(review): scipy.misc.imread/imsave were removed from recent
            # SciPy releases; this needs an older SciPy or a port to another
            # image library.
            import scipy.misc as misc
            im_file = os.path.join(FLAGS.metadir, "sprite.png")
            labels_file = os.path.join(FLAGS.metadir, "labels.tsv")
            if os.path.exists(im_file) and os.path.exists(labels_file):
                sprite_img = misc.imread(im_file)
                with open(labels_file, "r") as lf:
                    # Blank lines (e.g. a trailing newline) are skipped.
                    labels = [int(line) for line in lf if line.strip()]
                # Smallest square grid that holds one cell per label.
                ncol = int(math.sqrt(len(labels))) + 1
                ms_arr = np.zeros([ncol, ncol], np.int32)
                for li, label in enumerate(labels):
                    ms_arr[li // ncol, li % ncol] = font_freq[label]

                _ims = eval_image_size
                mask = np.zeros([_ims * ncol, _ims * ncol])
                # With no mistakes at all the counter is empty: keep the mask
                # black instead of calling max() on nothing / dividing by 0.
                mx = max(font_freq.values()) if font_freq else 0
                if mx > 0:
                    for r in range(ncol):
                        for c in range(ncol):
                            # float() keeps this a true division on Python 2.
                            mask[r * _ims:(r + 1) * _ims,
                                 c * _ims:(c + 1) * _ims] = (
                                     ms_arr[r][c] / float(mx)) * 255.
                misc.imsave(
                    os.path.join(FLAGS.eval_dir, FLAGS.dataset_split_name,
                                 "mask.png"), mask)
                print("Number of mistakes per font %s" % ms_arr)
            else:
                print(
                    "Metadata dir supplied is missing either the sprite image or labels file"
                )
Пример #47
0
def main(_):
  """Runs one evaluation pass of a classification network over a dataset split.

  A fresh graph is populated with the input queue, the network and two
  streaming metrics (accuracy and recall@5); `slim.evaluation.evaluate_once`
  then restores the checkpoint and executes the metric update ops.

  Raises:
    ValueError: if --dataset_dir is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    global_step = slim.get_or_create_global_step()

    # Dataset split that will be evaluated.
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    # Network constructor, requested in inference mode.
    class_count = dataset.num_classes - FLAGS.labels_offset
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name, num_classes=class_count, is_training=False)

    # Unshuffled provider that reads single (image, label) pairs.
    batch_size = FLAGS.batch_size
    provider = slim.dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=False,
        common_queue_capacity=2 * batch_size,
        common_queue_min=batch_size)
    image, label = provider.get(['image', 'label'])
    label -= FLAGS.labels_offset

    # The preprocessing name defaults to the model name when not given.
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        FLAGS.preprocessing_name or FLAGS.model_name, is_training=False)

    # The evaluation resolution defaults to the network's own default size.
    side = FLAGS.eval_image_size
    if not side:
      side = network_fn.default_image_size
    image = image_preprocessing_fn(image, side, side)

    images, labels = tf.train.batch(
        [image, label],
        batch_size=batch_size,
        num_threads=FLAGS.num_preprocessing_threads,
        capacity=5 * batch_size)

    # Forward pass; the second return value is not needed here.
    logits, _ = network_fn(images)

    # Pick the variables the checkpoint is restored into.
    if not FLAGS.moving_average_decay:
      restore_vars = slim.get_variables_to_restore()
    else:
      ema = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
      restore_vars = ema.variables_to_restore(slim.get_model_variables())
      restore_vars[global_step.op.name] = global_step

    predictions = tf.argmax(logits, 1)
    labels = tf.squeeze(labels)

    # Streaming metrics, split into value tensors and update ops.
    metric_map = {
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall_5': slim.metrics.streaming_recall_at_k(logits, labels, 5),
    }
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map(
        metric_map)

    # Each metric gets a scalar summary that is also printed when evaluated.
    for metric_name, metric_value in names_to_values.items():
      tag = 'eval/%s' % metric_name
      printed_summary = tf.Print(
          tf.summary.scalar(tag, metric_value, collections=[]),
          [metric_value],
          tag)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, printed_summary)

    # TODO(sguada) use num_epochs=1
    # Without an explicit cap, use enough batches to cover every sample once.
    num_batches = (
        FLAGS.max_num_batches
        if FLAGS.max_num_batches
        else math.ceil(dataset.num_samples / float(FLAGS.batch_size)))

    # A directory resolves to its newest checkpoint; a file is used verbatim.
    checkpoint_path = (
        tf.train.latest_checkpoint(FLAGS.checkpoint_path)
        if tf.gfile.IsDirectory(FLAGS.checkpoint_path)
        else FLAGS.checkpoint_path)

    tf.logging.info('Evaluating %s' % checkpoint_path)

    slim.evaluation.evaluate_once(
        master=FLAGS.master,
        checkpoint_path=checkpoint_path,
        logdir=FLAGS.eval_dir,
        num_evals=num_batches,
        eval_op=list(names_to_updates.values()),
        variables_to_restore=restore_vars)
Пример #48
0
import cv2
import pycocotools.coco as coco
from lib.detectors.detector_factory import detector_factory
from datasets.dataset_factory import get_dataset
from utils.debugger import Debugger
from opts import opts

# Directory that the annotation file's image names are joined onto.
# NOTE(review): this is a raw string, so every doubled backslash below stays
# doubled in the resulting path — confirm that is intended.
#img_dir = os.path.join(os.getcwd(), 'data\\egg\\val')
img_dir = r'P:\\Robert\\tf-test\\workspace\\egg-counting\\images\\test-2'

if __name__ == '__main__':
    # Result table; the first row holds the column headers.
    performance_results = [[
        'filename', 'num labelled', 'num predicted', 'abs. error', 'pct. error'
    ]]
    # Parse options, then let the dataset class for (dataset, task) refine them.
    opt = opts().init()
    Dataset = get_dataset(opt.dataset, opt.task)
    opt = opts().update_dataset_info_and_set_heads(opt, Dataset)
    # Look up the detector class for the task and time its construction.
    Detector = detector_factory[opt.task]
    start_t = timeit.default_timer()
    detector = Detector(opt)
    print('model load time:', timeit.default_timer() - start_t)
    # opt.demo is handed to the COCO API — presumably the path of an
    # annotation file; verify against the option definitions.
    parsed = coco.COCO(opt.demo)
    for i, imgId in enumerate(parsed.imgs):
        file_name = parsed.imgs[imgId]['file_name']
        img = cv2.imread(os.path.join(img_dir, file_name))
        print('processing image at', os.path.join(img_dir, file_name))
        run_dict = detector.run(img)
        # Count entries under results key 1 whose last element exceeds 0.3
        # (presumably a confidence score for class 1 — TODO confirm).
        num_predicted = len(
            [result for result in run_dict['results'][1] if result[-1] > 0.3])
        # NOTE(review): the statement below is cut off after its first element
        # expression; the rest of the list is not visible in this sample.
        num_labelled = len([
            parsed.loadAnns(ids=[annID])
def main(_):
  """Builds the training graph with model_deploy clones and runs training.

  The function selects the dataset, network and preprocessing from FLAGS,
  creates the input queue, replicates the model through
  `model_deploy.create_clones`, wires the optimizer, moving averages and
  summaries, and finally calls `slim.learning.train`.

  Raises:
    ValueError: if --dataset_dir is empty.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    #######################
    # Config model_deploy #
    #######################
    deploy_config = model_deploy.DeploymentConfig(
        num_clones=FLAGS.num_clones,
        clone_on_cpu=FLAGS.clone_on_cpu,
        replica_id=FLAGS.task,
        num_replicas=FLAGS.worker_replicas,
        num_ps_tasks=FLAGS.num_ps_tasks)

    # Create global_step on the device chosen for variables.
    with tf.device(deploy_config.variables_device()):
      global_step = slim.create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name, FLAGS.dataset_dir)

    ######################
    # Select the network #
    ######################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        weight_decay=FLAGS.weight_decay,
        is_training=True)

    #####################################
    # Select the preprocessing function #
    #####################################
    # Falls back to the model name when no preprocessing name is given.
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=True)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    with tf.device(deploy_config.inputs_device()):
      provider = slim.dataset_data_provider.DatasetDataProvider(
          dataset,
          num_readers=FLAGS.num_readers,
          common_queue_capacity=20 * FLAGS.batch_size,
          common_queue_min=10 * FLAGS.batch_size)
      [image, label] = provider.get(['image', 'label'])
      # Shift the labels by the configured offset.
      label -= FLAGS.labels_offset

      # Falls back to the network's default size when no size is given.
      train_image_size = FLAGS.train_image_size or network_fn.default_image_size

      image = image_preprocessing_fn(image, train_image_size, train_image_size)

      images, labels = tf.train.batch(
          [image, label],
          batch_size=FLAGS.batch_size,
          num_threads=FLAGS.num_preprocessing_threads,
          capacity=5 * FLAGS.batch_size)
      # One-hot width matches the num_classes value passed to the network.
      labels = slim.one_hot_encoding(
          labels, dataset.num_classes - FLAGS.labels_offset)
      # Prefetch queue that each clone dequeues its batches from.
      batch_queue = slim.prefetch_queue.prefetch_queue(
          [images, labels], capacity=2 * deploy_config.num_clones)

    ####################
    # Define the model #
    ####################
    def clone_fn(batch_queue):
      """Allows data parallelism by creating multiple clones of network_fn."""
      with tf.device(deploy_config.inputs_device()):
        images, labels = batch_queue.dequeue()
      logits, end_points = network_fn(images)

      #############################
      # Specify the loss function #
      #############################
      # The loss tensors returned by tf.losses are not kept here; they are
      # read back below through the tf.GraphKeys.LOSSES collection.
      if 'AuxLogits' in end_points:
        tf.losses.softmax_cross_entropy(
            logits=end_points['AuxLogits'], onehot_labels=labels,
            label_smoothing=FLAGS.label_smoothing, weights=0.4, scope='aux_loss')
      tf.losses.softmax_cross_entropy(
          logits=logits, onehot_labels=labels,
          label_smoothing=FLAGS.label_smoothing, weights=1.0)
      return end_points

    # Gather initial summaries. This snapshot is taken before the clones are
    # created, so it holds only summaries that already exist at this point.
    summaries = set(tf.get_collection(tf.GraphKeys.SUMMARIES))

    clones = model_deploy.create_clones(deploy_config, clone_fn, [batch_queue])
    first_clone_scope = deploy_config.clone_scope(0)
    # Gather update_ops from the first clone. These contain, for example,
    # the updates for the batch_norm variables created by network_fn.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, first_clone_scope)

    # Add summaries for end_points (taken from the first clone's outputs).
    end_points = clones[0].outputs
    for end_point in end_points:
      x = end_points[end_point]
      summaries.add(tf.summary.histogram('activations/' + end_point, x))
      summaries.add(tf.summary.scalar('sparsity/' + end_point,
                                      tf.nn.zero_fraction(x)))

    # Add summaries for losses (first clone only).
    for loss in tf.get_collection(tf.GraphKeys.LOSSES, first_clone_scope):
      summaries.add(tf.summary.scalar('losses/%s' % loss.op.name, loss))

    # Add summaries for variables.
    for variable in slim.get_model_variables():
      summaries.add(tf.summary.histogram(variable.op.name, variable))

    #################################
    # Configure the moving averages #
    #################################
    if FLAGS.moving_average_decay:
      moving_average_variables = slim.get_model_variables()
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, global_step)
    else:
      moving_average_variables, variable_averages = None, None

    #########################################
    # Configure the optimization procedure. #
    #########################################
    with tf.device(deploy_config.optimizer_device()):
      learning_rate = _configure_learning_rate(dataset.num_samples, global_step)
      optimizer = _configure_optimizer(learning_rate)
      summaries.add(tf.summary.scalar('learning_rate', learning_rate))

    if FLAGS.sync_replicas:
      # If sync_replicas is enabled, the averaging will be done in the chief
      # queue runner.
      optimizer = tf.train.SyncReplicasOptimizer(
          opt=optimizer,
          replicas_to_aggregate=FLAGS.replicas_to_aggregate,
          variable_averages=variable_averages,
          variables_to_average=moving_average_variables,
          replica_id=tf.constant(FLAGS.task, tf.int32, shape=()),
          total_num_replicas=FLAGS.worker_replicas)
    elif FLAGS.moving_average_decay:
      # Update ops executed locally by trainer.
      update_ops.append(variable_averages.apply(moving_average_variables))

    # Variables to train.
    variables_to_train = _get_variables_to_train()

    # Obtain the total loss and the gradients for the trainable variables
    # from all clones.
    total_loss, clones_gradients = model_deploy.optimize_clones(
        clones,
        optimizer,
        var_list=variables_to_train)
    # Add total_loss to summary.
    summaries.add(tf.summary.scalar('total_loss', total_loss))

    # Create gradient updates.
    grad_updates = optimizer.apply_gradients(clones_gradients,
                                             global_step=global_step)
    update_ops.append(grad_updates)

    # train_tensor is total_loss behind a control dependency on every update
    # op, so fetching it also runs the gradient and auxiliary updates.
    update_op = tf.group(*update_ops)
    with tf.control_dependencies([update_op]):
      train_tensor = tf.identity(total_loss, name='train_op')

    # Add the summaries from the first clone. These contain the summaries
    # created by model_fn and either optimize_clones() or _gather_clone_loss().
    summaries |= set(tf.get_collection(tf.GraphKeys.SUMMARIES,
                                       first_clone_scope))

    # Merge all summaries together.
    summary_op = tf.summary.merge(list(summaries), name='summary_op')

    ###########################
    # Kicks off the training. #
    ###########################
    # The task with index 0 acts as chief; the optimizer is handed over as
    # sync_optimizer only when synchronous replicas are enabled.
    slim.learning.train(
        train_tensor,
        logdir=FLAGS.train_dir,
        master=FLAGS.master,
        is_chief=(FLAGS.task == 0),
        init_fn=_get_init_fn(),
        summary_op=summary_op,
        number_of_steps=FLAGS.max_number_of_steps,
        log_every_n_steps=FLAGS.log_every_n_steps,
        save_summaries_secs=FLAGS.save_summaries_secs,
        save_interval_secs=FLAGS.save_interval_secs,
        sync_optimizer=optimizer if FLAGS.sync_replicas else None)
def main(_):
  """Evaluates a checkpoint once, or dumps selected end points to an HDF5 file.

  The evaluation graph (dataset, provider, preprocessing, network, streaming
  accuracy and recall@5) is always built. Afterwards one of two things happens:

  * If --store_feat is set, the comma-separated end points it names are
    fetched for `num_batches` batches and written, concatenated along axis 0,
    to --store_feat_path as gzip-compressed HDF5 datasets.
  * Otherwise `slim.evaluation.evaluate_once` runs the metric update ops.

  As a side effect FLAGS.eval_dir is overwritten with an 'eval' directory next
  to (or inside) FLAGS.checkpoint_path, and that directory is created.

  Raises:
    ValueError: if --dataset_dir is empty, or if --store_feat is given
      without --store_feat_path.
    KeyError: if a name in --store_feat is not a known end point.
  """
  if not FLAGS.dataset_dir:
    raise ValueError('You must supply the dataset directory with --dataset_dir')

  # Put the eval directory inside the checkpoint directory, or beside the
  # checkpoint when a single file was given.
  if not os.path.isfile(FLAGS.checkpoint_path):
    FLAGS.eval_dir = os.path.join(FLAGS.checkpoint_path, 'eval')
  else:
    FLAGS.eval_dir = os.path.join(
        os.path.dirname(FLAGS.checkpoint_path), 'eval')

  # Best effort: the directory may already exist, and the feature-dump path
  # below never writes into it.
  try:
    os.makedirs(FLAGS.eval_dir)
  except OSError:
    pass

  tf.logging.set_verbosity(tf.logging.INFO)
  with tf.Graph().as_default():
    tf_global_step = slim.get_or_create_global_step()

    ######################
    # Select the dataset #
    ######################
    dataset = dataset_factory.get_dataset(
        FLAGS.dataset_name, FLAGS.dataset_split_name,
        FLAGS.dataset_dir.split(','),
        FLAGS.dataset_list_dir,
        num_samples=FLAGS.frames_per_video,
        modality=FLAGS.modality,
        split_id=FLAGS.split_id)

    ####################
    # Select the model #
    ####################
    network_fn = nets_factory.get_network_fn(
        FLAGS.model_name,
        num_classes=(dataset.num_classes - FLAGS.labels_offset),
        batch_size=FLAGS.batch_size,
        is_training=False)

    ##############################################################
    # Create a dataset provider that loads data from the dataset #
    ##############################################################
    provider = dataset_data_provider.DatasetDataProvider(
        dataset,
        shuffle=FLAGS.force_random_shuffle,
        common_queue_capacity=2 * FLAGS.batch_size,
        common_queue_min=FLAGS.batch_size,
        bgr_flips=FLAGS.bgr_flip)
    [image, label] = provider.get(['image', 'label'])
    # The provider yields the label as a string; parse it and widen to int64.
    label = tf.cast(tf.string_to_number(label, tf.int32),
        tf.int64)
    label.set_shape(())
    label -= FLAGS.labels_offset

    #####################################
    # Select the preprocessing function #
    #####################################
    preprocessing_name = FLAGS.preprocessing_name or FLAGS.model_name
    image_preprocessing_fn = preprocessing_factory.get_preprocessing(
        preprocessing_name,
        is_training=False)

    eval_image_size = FLAGS.eval_image_size or network_fn.default_image_size

    image = image_preprocessing_fn(image, eval_image_size, eval_image_size,
                                   model_name=FLAGS.model_name,
                                   ncrops=FLAGS.ncrops,
                                   out_dim_scale=FLAGS.out_dim_scale)

    # A single batching thread is used whenever features are being stored —
    # presumably to keep the stored rows in a deterministic order; TODO confirm.
    images, labels = tf.train.batch(
        [image, label],
        batch_size=FLAGS.batch_size,
        num_threads=1 if FLAGS.store_feat is not None else FLAGS.num_preprocessing_threads,
        capacity=5 * FLAGS.batch_size)

    ####################
    # Define the model #
    ####################
    kwargs = {}
    if FLAGS.conv_endpoint is not None:
      kwargs['conv_endpoint'] = FLAGS.conv_endpoint
    logits, end_points = network_fn(
        images, pool_type=FLAGS.pooling,
        classifier_type=FLAGS.classifier_type,
        num_channels_stream=provider.num_channels_stream,
        netvlad_centers=FLAGS.netvlad_initCenters.split(','),
        stream_pool_type=FLAGS.stream_pool_type,
        **kwargs)
    # Expose the inputs as end points too, so --store_feat can select them.
    end_points['images'] = images
    end_points['labels'] = labels

    if FLAGS.moving_average_decay:
      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, tf_global_step)
      variables_to_restore = variable_averages.variables_to_restore(
          slim.get_model_variables())
      variables_to_restore[tf_global_step.op.name] = tf_global_step
    else:
      variables_to_restore = slim.get_variables_to_restore()

    predictions = tf.argmax(logits, 1)
    # rgirdhar: squeezing is skipped for batch_size=1, where it would also
    # drop the batch dimension.
    if FLAGS.batch_size > 1:
      labels = tf.squeeze(labels)

    # Define the metrics:
    names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
        'Accuracy': slim.metrics.streaming_accuracy(predictions, labels),
        'Recall@5': slim.metrics.streaming_recall_at_k(
            logits, labels, 5),
    })

    # Print the summaries to screen.
    # items() (rather than the Python-2-only iteritems()) works on both
    # Python 2 and Python 3.
    # NOTE(review): tf.scalar_summary is the legacy summary API; it is left
    # untouched because switching APIs may alter the emitted tag names —
    # confirm against the TensorFlow version in use.
    for name, value in names_to_values.items():
      summary_name = 'eval/%s' % name
      op = tf.scalar_summary(summary_name, value, collections=[])
      op = tf.Print(op, [value], summary_name)
      tf.add_to_collection(tf.GraphKeys.SUMMARIES, op)

    # TODO(sguada) use num_epochs=1
    if FLAGS.max_num_batches:
      num_batches = FLAGS.max_num_batches
    else:
      # This ensures that we make a single pass over all of the data.
      num_batches = int(math.ceil(dataset.num_samples /
                                  float(FLAGS.batch_size)))

    if tf.gfile.IsDirectory(FLAGS.checkpoint_path):
      checkpoint_path = tf.train.latest_checkpoint(FLAGS.checkpoint_path)
    else:
      checkpoint_path = FLAGS.checkpoint_path

    tf.logging.info('Evaluating %s' % checkpoint_path)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.allow_soft_placement = True

    if FLAGS.store_feat is not None:
      # Explicit validation instead of `assert`, which is stripped under -O.
      if FLAGS.store_feat_path is None:
        raise ValueError(
            'You must supply --store_feat_path when --store_feat is set')
      from tensorflow.python.training import supervisor
      from tensorflow.python.framework import ops
      import h5py
      saver = tf.train.Saver(variables_to_restore)
      # The supervisor is created with no saver, summaries or logdir; the
      # checkpoint is restored explicitly through `saver` below.
      sv = supervisor.Supervisor(graph=ops.get_default_graph(),
                                 logdir=None,
                                 summary_op=None,
                                 summary_writer=None,
                                 global_step=None,
                                 saver=None)
      ept_names_to_store = FLAGS.store_feat.split(',')
      try:
        ept_to_store = [end_points[el] for el in ept_names_to_store]
      except KeyError:
        # Only a missing end point is handled here; re-raising keeps the
        # offending name in the exception.
        logging.error('Endpoint not found')
        logging.error('Choose from %s' % ','.join(end_points.keys()))
        raise
      # One list of per-batch arrays for each requested end point.
      res = {epname: [] for epname in ept_names_to_store}
      with sv.managed_session(
          FLAGS.master, start_standard_services=False,
          config=config) as sess:
        saver.restore(sess, checkpoint_path)
        sv.start_queue_runners(sess)
        for j in range(num_batches):
          if j % 10 == 0:
            logging.info('Doing batch %d/%d' % (j, num_batches))
          feats = sess.run(ept_to_store)
          for epname, feat in zip(ept_names_to_store, feats):
            res[epname].append(feat)
      logging.info('Writing out features to %s' % FLAGS.store_feat_path)
      with h5py.File(FLAGS.store_feat_path, 'w') as fout:
        for epname, batches in res.items():
          fout.create_dataset(epname,
              data=np.concatenate(batches, axis=0),
              compression='gzip',
              compression_opts=FLAGS.feat_store_compression_opt)
    else:
      # The update ops are passed as a list: on Python 3 a bare dict view is
      # not a valid fetch structure.
      slim.evaluation.evaluate_once(
          master=FLAGS.master,
          checkpoint_path=checkpoint_path,
          logdir=FLAGS.eval_dir,
          num_evals=num_batches,
          eval_op=list(names_to_updates.values()),
          variables_to_restore=variables_to_restore,
          session_config=config)