Example #1
def main(_):
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_train.train(dataset)
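
Each main() in these examples assumes module-level imports, tf.app.flags definitions, and the TF 1.x tf.app.run() entry point. A minimal sketch of that assumed boilerplate for Example #1 (the flag defaults here are placeholders, not taken from the original script):

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS

# Flag names match the ones main() reads above; the defaults are placeholders.
tf.app.flags.DEFINE_string('subset', 'train',
                           'Dataset subset: "train" or "validation".')
tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train',
                           'Directory for checkpoints and event files.')


def main(_):
    pass  # body as in Example #1


if __name__ == '__main__':
    tf.app.run()  # parses the command line into FLAGS, then calls main()

tf.app.run() is why the examples can read FLAGS.subset and FLAGS.train_dir without parsing any arguments themselves.
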
Example #2
def main(unused_argv=None):
  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  inception_eval.evaluate(dataset)
Example #3
def main(unused_args):
  assert FLAGS.job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  # Extract all the hostnames for the ps and worker jobs to construct the
  # cluster spec.
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  tf.logging.info('PS hosts are: %s' % ps_hosts)
  tf.logging.info('Worker hosts are: %s' % worker_hosts)

  cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
                                       'worker': worker_hosts})
  server = tf.train.Server(
      {'ps': ps_hosts,
       'worker': worker_hosts},
      job_name=FLAGS.job_name,
      task_index=FLAGS.task_id,
      protocol=FLAGS.protocol)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec)
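
The distributed main() above additionally relies on cluster flags. A hedged sketch of the definitions it implies (flag names are taken from the code above; defaults and help strings are assumptions). Every process in the cluster is launched with the same --ps_hosts/--worker_hosts but its own --job_name and --task_id:

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS

# Flag names mirror those used above; defaults and help strings are assumptions.
tf.app.flags.DEFINE_string('job_name', '', 'Either "ps" or "worker".')
tf.app.flags.DEFINE_integer('task_id', 0, 'Index of this task within its job.')
tf.app.flags.DEFINE_string('ps_hosts', '',
                           'Comma-separated list of ps host:port pairs.')
tf.app.flags.DEFINE_string('worker_hosts', '',
                           'Comma-separated list of worker host:port pairs.')
tf.app.flags.DEFINE_string('protocol', 'grpc', 'Server protocol, e.g. "grpc".')
tf.app.flags.DEFINE_string('subset', 'train', 'Dataset subset to train on.')
tf.app.flags.DEFINE_string('train_dir', '/tmp/imagenet_train',
                           'Directory for checkpoints and event files.')
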
Example #4
def main(unused_argv=None):
  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)
  inception_eval.evaluate(dataset)
Example #5
def main(unused_argv):
    ps_hosts = FLAGS.ps_hosts.split(",")
    worker_hosts = FLAGS.worker_hosts.split(",")
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})
    server = tf.train.Server(cluster,
                             job_name=FLAGS.job_name,
                             task_index=FLAGS.worker_index)

    if FLAGS.job_name == "ps":
        server.join()
        sys.exit(0)

    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.worker_index == 0:
        if not tf.gfile.Exists(FLAGS.train_dir):
            tf.gfile.MakeDirs(FLAGS.train_dir)

    num_workers = len(worker_hosts)
    worker_grpc_url = 'grpc://' + worker_hosts[0]
    print("Worker GRPC URL: %s" % worker_grpc_url)
    print("Worker index = %d" % FLAGS.worker_index)
    print("Number of workers = %d" % num_workers)

    # Every worker (not just the chief) runs the training loop, using the
    # ClusterSpec built above as `cluster`.
    inception_distributed_train.train(server.target, dataset, cluster)
Example #6
def main(unused_args):
  assert FLAGS.job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  # Extract all the hostnames for the ps and worker jobs to construct the
  # cluster spec.
  ps_hosts = FLAGS.ps_hosts.split(',')
  worker_hosts = FLAGS.worker_hosts.split(',')
  tf.logging.info('PS hosts are: %s' % ps_hosts)
  tf.logging.info('Worker hosts are: %s' % worker_hosts)

  cluster_spec = tf.train.ClusterSpec({'ps': ps_hosts,
                                       'worker': worker_hosts})
  server = tf.train.Server(
      {'ps': ps_hosts,
       'worker': worker_hosts},
      job_name=FLAGS.job_name,
      task_index=FLAGS.task_id)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec)
Example #7
def main(_):
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_train.train(dataset)
Example #8
def main(unused_argv=None):
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)
    FLAGS.dataset_name = 'imagenet'
    FLAGS.num_examples = dataset.num_examples_per_epoch()
    inception_eval.evaluate(dataset)
Example #9
def main(_):
    # Load dataset
    tf.app.flags.FLAGS.data_dir = '/work/haeusser/data/imagenet/shards'
    dataset = ImagenetData(subset='validation')
    assert dataset.data_files()

    num_labels = dataset.num_classes() + 1
    image_shape = [FLAGS.image_size, FLAGS.image_size, 3]

    graph = tf.Graph()
    with graph.as_default():

        images, labels = image_processing.batch_inputs(
            dataset,
            32,
            train=True,
            num_preprocess_threads=16,
            num_readers=FLAGS.num_readers)

        # Set up semisup model.
        model = semisup.SemisupModel(semisup.architectures.inception_model,
                                     num_labels,
                                     image_shape,
                                     test_in=images)

        # Add moving average variables.
        for var in tf.get_collection('moving_vars'):
            tf.add_to_collection(tf.GraphKeys.MOVING_AVERAGE_VARIABLES, var)
        for var in slim.get_model_variables():
            tf.add_to_collection(tf.GraphKeys.MOVING_AVERAGE_VARIABLES, var)

        # Get prediction tensor from semisup model.
        predictions = tf.argmax(model.test_logit, 1)

        # Accuracy metric for summaries.
        names_to_values, names_to_updates = slim.metrics.aggregate_metric_map({
            'Accuracy':
            slim.metrics.streaming_accuracy(predictions, labels),
        })
        for name, value in names_to_values.items():
            tf.summary.scalar(name, value)

        # Run the actual evaluation loop.
        num_batches = int(math.ceil(dataset.num_examples_per_epoch() /
                                    float(FLAGS.eval_batch_size)))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        slim.evaluation.evaluation_loop(
            master=FLAGS.master,
            checkpoint_dir=FLAGS.logdir,
            logdir=FLAGS.logdir,
            num_evals=num_batches,
            eval_op=list(names_to_updates.values()),
            eval_interval_secs=FLAGS.eval_interval_secs,
            session_config=config)
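
The accuracy metric above uses slim's streaming (value, update) op pairs: evaluation_loop repeatedly runs the update ops for num_evals batches and then reads the value ops feeding the scalar summaries. A minimal, self-contained sketch of that pattern with hypothetical stand-in tensors:

import tensorflow as tf

slim = tf.contrib.slim

# Stand-ins for the model predictions and ground-truth labels used above.
predictions = tf.constant([1, 2, 3, 4], dtype=tf.int64)
labels = tf.constant([1, 2, 0, 4], dtype=tf.int64)

# streaming_accuracy returns a (value_op, update_op) pair backed by local variables.
accuracy, update_op = slim.metrics.streaming_accuracy(predictions, labels)

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    sess.run(update_op)        # accumulate one "batch" of statistics
    print(sess.run(accuracy))  # 0.75
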
Example #10
def main_fun(argv, ctx):
    import tensorflow as tf
    from inception import inception_eval
    from inception.imagenet_data import ImagenetData

    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS._parse_flags()
    print("FLAGS:", FLAGS.__dict__['__flags'])

    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    if tf.gfile.Exists(FLAGS.eval_dir):
        tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    tf.gfile.MakeDirs(FLAGS.eval_dir)

    cluster_spec, server = TFNode.start_cluster_server(ctx, 1, FLAGS.rdma)

    inception_eval.evaluate(dataset)
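
main_fun(argv, ctx) above is not called directly; TensorFlowOnSpark invokes it on each Spark executor with that executor's node context. A minimal driver-side sketch, assuming the TFCluster API and hypothetical sizing values:

import sys

from pyspark import SparkContext
from tensorflowonspark import TFCluster

sc = SparkContext(appName='imagenet_eval')

# main_fun is the function defined above (imported into the driver script).
# Sizing values are hypothetical; num_ps is 0 because evaluation needs no parameter servers.
cluster = TFCluster.run(sc, main_fun, sys.argv, num_executors=1, num_ps=0,
                        tensorboard=False, input_mode=TFCluster.InputMode.TENSORFLOW)
cluster.shutdown()
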
Example #11
def main_fun(argv, ctx):
  import tensorflow as tf
  from inception import inception_eval
  from inception.imagenet_data import ImagenetData

  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS._parse_flags()
  print("FLAGS:", FLAGS.__dict__['__flags'])

  dataset = ImagenetData(subset=FLAGS.subset)
  assert dataset.data_files()
  if tf.gfile.Exists(FLAGS.eval_dir):
    tf.gfile.DeleteRecursively(FLAGS.eval_dir)
  tf.gfile.MakeDirs(FLAGS.eval_dir)

  cluster_spec, server = TFNode.start_cluster_server(ctx)

  inception_eval.evaluate(dataset)
Example #12
def main_fun(argv, ctx):

    # extract node metadata from ctx
    worker_num = ctx.worker_num
    job_name = ctx.job_name
    task_index = ctx.task_index

    assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

    from inception import inception_distributed_train
    from inception.imagenet_data import ImagenetData
    import tensorflow as tf

    # instantiate FLAGS on workers using argv from driver and add job_name and task_id
    print("argv:", argv)
    sys.argv = argv

    FLAGS = tf.app.flags.FLAGS
    FLAGS.job_name = job_name
    FLAGS.task_id = task_index
    print("FLAGS:", FLAGS.__dict__['__flags'])

    # Get TF cluster and server instances
    cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus,
                                                       FLAGS.rdma)

    if FLAGS.job_name == 'ps':
        # `ps` jobs wait for incoming connections from the workers.
        server.join()
    else:
        # `worker` jobs will actually do the work.
        dataset = ImagenetData(subset=FLAGS.subset)
        assert dataset.data_files()
        # Only the chief checks for or creates train_dir.
        if FLAGS.task_id == 0:
            if not tf.gfile.Exists(FLAGS.train_dir):
                tf.gfile.MakeDirs(FLAGS.train_dir)
        inception_distributed_train.train(server.target, dataset, cluster_spec,
                                          ctx)
Example #13
def main_fun(argv, ctx):

  # extract node metadata from ctx
  worker_num = ctx.worker_num
  job_name = ctx.job_name
  task_index = ctx.task_index

  assert job_name in ['ps', 'worker'], 'job_name must be ps or worker'

  from inception import inception_distributed_train
  from inception.imagenet_data import ImagenetData
  import tensorflow as tf

  # instantiate FLAGS on workers using argv from driver and add job_name and task_id
  print("argv:", argv)
  sys.argv = argv

  FLAGS = tf.app.flags.FLAGS
  FLAGS.job_name = job_name
  FLAGS.task_id = task_index
  print("FLAGS:", FLAGS.__dict__['__flags'])

  # Get TF cluster and server instances
  cluster_spec, server = TFNode.start_cluster_server(ctx, FLAGS.num_gpus, FLAGS.rdma)

  if FLAGS.job_name == 'ps':
    # `ps` jobs wait for incoming connections from the workers.
    server.join()
  else:
    # `worker` jobs will actually do the work.
    dataset = ImagenetData(subset=FLAGS.subset)
    assert dataset.data_files()
    # Only the chief checks for or creates train_dir.
    if FLAGS.task_id == 0:
      if not tf.gfile.Exists(FLAGS.train_dir):
        tf.gfile.MakeDirs(FLAGS.train_dir)
    inception_distributed_train.train(server.target, dataset, cluster_spec, ctx)
Example #14
def main(_):
    from inception.imagenet_data import ImagenetData
    from inception import image_processing
    dataset = ImagenetData(subset='train')
    assert dataset.data_files()
    NUM_LABELS = dataset.num_classes() + 1
    IMAGE_SHAPE = [FLAGS.image_size, FLAGS.image_size, 3]
    graph = tf.Graph()
    with graph.as_default():
        model = semisup.SemisupModel(inception_model, NUM_LABELS,
                                     IMAGE_SHAPE)

        # t_sup_images, t_sup_labels = tools.get_data('train')
        # t_unsup_images, _ = tools.get_data('unlabeled')

        images, labels = image_processing.batch_inputs(
            dataset, 32, train=True,
            num_preprocess_threads=FLAGS.num_readers,
            num_readers=FLAGS.num_readers)

        t_sup_images, t_sup_labels = tf.train.batch(
            [images, labels],
            batch_size=FLAGS.sup_batch_size,
            enqueue_many=True,
            num_threads=FLAGS.num_readers,
            capacity=1000 + 3 * FLAGS.sup_batch_size,
        )

        t_unsup_images, t_unsup_labels = tf.train.batch(
            [images, labels],
            batch_size=FLAGS.sup_batch_size,
            enqueue_many=True,
            num_threads=FLAGS.num_readers,
            capacity=1000 + 3 * FLAGS.sup_batch_size,
        )

        # Compute embeddings and logits.
        t_sup_emb = model.image_to_embedding(t_sup_images)
        t_unsup_emb = model.image_to_embedding(t_unsup_images)
        t_sup_logit = model.embedding_to_logit(t_sup_emb)

        # Add losses.
        model.add_semisup_loss(
            t_sup_emb, t_unsup_emb, t_sup_labels, visit_weight=FLAGS.visit_weight)

        model.add_logit_loss(t_sup_logit, t_sup_labels)


        t_learning_rate = tf.maximum(
                tf.train.exponential_decay(
                    FLAGS.learning_rate,
                    model.step,
                    FLAGS.decay_steps,
                    FLAGS.decay_factor,
                    staircase=True),
                FLAGS.minimum_learning_rate)

        # Create training operation and start the actual training loop.
        train_op = model.create_train_op(t_learning_rate)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        slim.learning.train(
            train_op,
            logdir=FLAGS.logdir,
            save_summaries_secs=FLAGS.save_summaries_secs,
            save_interval_secs=FLAGS.save_interval_secs,
            master=FLAGS.master,
            is_chief=(FLAGS.task == 0),
            startup_delay_steps=(FLAGS.task * 20),
            log_every_n_steps=FLAGS.log_every_n_steps,
            session_config=config)
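
Both tf.train.batch calls in Example #14 re-batch the already-batched (images, labels) tensors: with enqueue_many=True the leading batch dimension is sliced into the queue element by element and regrouped into batches of sup_batch_size. A minimal sketch of that behavior with placeholder shapes:

import tensorflow as tf

# Stand-ins for the output of image_processing.batch_inputs (a batch of 32).
images = tf.random_uniform([32, 299, 299, 3])
labels = tf.random_uniform([32], maxval=1001, dtype=tf.int32)

# enqueue_many=True: dimension 0 is sliced into the queue and regrouped into
# batches of 8, so small_images has shape [8, 299, 299, 3] and small_labels [8].
small_images, small_labels = tf.train.batch(
    [images, labels], batch_size=8, enqueue_many=True,
    num_threads=4, capacity=1000 + 3 * 8)
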