Example #1
def main(argv):
  del argv

  # Hyperparameters derived from the paper
  hparams = mobilenet_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_eval_examples=FLAGS.num_eval_examples,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      num_shards=FLAGS.num_shards,
      num_batches_per_epoch=FLAGS.num_examples_per_epoch / FLAGS.batch_size,
  )

  # Make sure the model directory exists before writing hparams.json into it.
  tf.gfile.MakeDirs(FLAGS.model_dir)
  with tf.gfile.GFile(FLAGS.model_dir + "/hparams.json", "w") as f:
    f.write(hparams.to_json())

  num_training_examples = FLAGS.num_examples_per_epoch * params["num_epochs"]
  num_eval_batches = FLAGS.num_eval_examples // FLAGS.batch_size
  num_training_batches = num_training_examples // FLAGS.batch_size

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after each epoch of the training set is processed.
  for _ in range(FLAGS.num_epochs):
    tf.logging.info("Training one epoch: %s steps",
                    num_training_batches // FLAGS.num_epochs)
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=num_training_batches // FLAGS.num_epochs)

    tf.logging.info("Running evaluation")
    tf.logging.info("%s",
                    estimator.evaluate(
                        input_fn=data_pipeline.InputReader(
                            FLAGS.data_dir, is_training=False),
                        steps=num_eval_batches,
                    ))
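
Note: these main() snippets read a number of command-line flags that are defined elsewhere in their projects. Below is a minimal sketch of flag definitions compatible with Example #1, using TF 1.x tf.flags (an alias for absl.flags); the flag names follow the FLAGS references above, while every default value is purely illustrative:

import tensorflow as tf

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("master", None, "Address of the TPU master, e.g. grpc://<ip>:8470.")
flags.DEFINE_string("model_dir", None, "Directory for checkpoints and hparams.json.")
flags.DEFINE_string("data_dir", None, "Directory containing the training/eval data.")
flags.DEFINE_string("hparams", "", "Comma-separated hyperparameter overrides.")
flags.DEFINE_bool("use_tpu", True, "Run on TPU if True, otherwise on CPU/GPU.")
flags.DEFINE_integer("batch_size", 1024, "Global batch size for training and eval.")
flags.DEFINE_integer("num_epochs", 100, "Number of passes over the training set.")
flags.DEFINE_integer("num_shards", 8, "Number of TPU shards (cores).")
flags.DEFINE_integer("num_examples_per_epoch", 1281167, "Training examples per epoch.")
flags.DEFINE_integer("num_eval_examples", 50000, "Number of evaluation examples.")
flags.DEFINE_integer("save_checkpoints_secs", 600, "Checkpoint interval in seconds.")
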
Example #2
def main(argv):
    del argv

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    training_examples = 1300 * 1000 * FLAGS.num_epochs
    eval_examples = 50 * 1000

    params = {
        "num_classes": 1001,
        "lr": FLAGS.learning_rate,
        "min_lr": 0.005,
        "momentum": FLAGS.momentum,
        "optimizer": FLAGS.optimizer,
        "num_eval_examples": eval_examples,
        "num_shards": FLAGS.num_shards,
        "num_epochs": FLAGS.num_epochs,
    }

    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=100,
            num_shards=FLAGS.num_shards,
        ),
    )

    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=squeezenet_model.model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        params=dict(params, use_tpu=FLAGS.use_tpu),
    )

    num_evals = max(FLAGS.num_evals, 1)
    examples_per_eval = training_examples // num_evals
    for _ in range(num_evals):
        estimator.train(input_fn=data_pipeline.InputReader(FLAGS.data_dir,
                                                           is_training=True),
                        steps=examples_per_eval // FLAGS.batch_size)

        tf.logging.info("Running evaluation")
        tf.logging.info(
            "%s",
            estimator.evaluate(
                input_fn=data_pipeline.InputReader(FLAGS.data_dir,
                                                   is_training=False),
                steps=eval_examples // FLAGS.batch_size,
            ))
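
For a concrete sense of the step counts (with illustrative flag values, not project defaults): --num_epochs=10, --batch_size=1024 and --num_evals=10 give training_examples = 13,000,000 and examples_per_eval = 1,300,000, so each cycle runs 1,300,000 // 1024 = 1,269 training steps followed by 50,000 // 1024 = 48 evaluation steps.
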
Example #3
def main(argv):
  del argv
  training_examples = 1300 * 1000 * FLAGS.num_epochs
  eval_examples = 50 * 1000

  params = {
      "num_classes": 1001,
      "lr": 0.04,
      "min_lr": 0.0004,
      "momentum": FLAGS.momentum,
      "optimizer": FLAGS.optimizer,
      "num_eval_examples": eval_examples,
      "num_shards": FLAGS.num_shards,
      "num_epochs": FLAGS.num_epochs,
  }

  run_config = tpu_config.RunConfig(
      master=FLAGS.master,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=squeezenet_model.model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after 5% of training examples are finished.
  num_evals = 20
  for _ in range(num_evals):
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=training_examples // (num_evals * FLAGS.batch_size))

    tf.logging.info("Running evaluation")
    tf.logging.info("%s",
                    estimator.evaluate(
                        input_fn=data_pipeline.InputReader(
                            FLAGS.data_dir, is_training=False),
                        steps=eval_examples // FLAGS.batch_size,
                    ))
Example #4
def train_and_eval(deeplab_estimator, train_dataset, eval_dataset,
                   num_batches_per_epoch):
  """Interleaves training and evaluation."""
  # pylint: disable=protected-access
  current_step = estimator._load_global_step_from_checkpoint_dir(
      FLAGS.model_dir)
  tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                  ' step %d.' %
                  (FLAGS.train_steps,
                   FLAGS.train_steps / num_batches_per_epoch,
                   current_step))
  start_timestamp = time.time()
  while current_step < FLAGS.train_steps:
    # Train for up to steps_per_eval number of steps. At the end of training,
    # a checkpoint will be written to --model_dir.
    next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                          FLAGS.train_steps)

    train_input_fn = data_pipeline.InputReader(
        train_dataset,
        FLAGS.train_split,
        is_training=True,
        model_variant=FLAGS.model_variant
    )
    deeplab_estimator.train(
        input_fn=train_input_fn,
        max_steps=next_checkpoint
    )
    current_step = next_checkpoint

    elapsed_time = int(time.time() - start_timestamp)
    tf.logging.info('Finished training up to step %d. Elapsed seconds %d.' %
                    (current_step, elapsed_time))

    tf.logging.info('Starting to evaluate.')

    eval_input_fn = data_pipeline.InputReader(
        eval_dataset,
        FLAGS.eval_split,
        is_training=False,
        model_variant=FLAGS.model_variant
    )
    eval_results = deeplab_estimator.evaluate(
        input_fn=eval_input_fn,
        steps=eval_dataset.num_samples // FLAGS.eval_batch_size
    )
    tf.logging.info('Eval results: %s' % eval_results)
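
The loop above reads the current global step through Estimator's private helper, which is why the protected-access pragma is needed. A minimal sketch of the same lookup using only public TF 1.x checkpoint APIs, assuming the step is stored under the default global_step name:

import tensorflow as tf

def load_global_step_from_checkpoint_dir(checkpoint_dir):
  """Returns the last saved global step, or 0 if no checkpoint exists yet."""
  ckpt_path = tf.train.latest_checkpoint(checkpoint_dir)
  if ckpt_path is None:
    return 0
  reader = tf.train.NewCheckpointReader(ckpt_path)
  return reader.get_tensor(tf.GraphKeys.GLOBAL_STEP)  # i.e. "global_step"
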
Example #5
def main(argv):
    del argv

    if FLAGS.master is None and FLAGS.tpu_name is None:
        raise RuntimeError("You must specify either --master or --tpu_name.")

    if FLAGS.master is not None:
        if FLAGS.tpu_name is not None:
            tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                            "--tpu_name and using --master.")
        tpu_grpc_url = FLAGS.master
    else:
        tpu_cluster_resolver = (tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project))
        tpu_grpc_url = tpu_cluster_resolver.get_master()

    training_examples = 1300 * 1000 * FLAGS.num_epochs
    eval_examples = 50 * 1000

    params = {
        "num_classes": 1001,
        "lr": FLAGS.learning_rate,
        "min_lr": 0.005,
        "momentum": FLAGS.momentum,
        "optimizer": FLAGS.optimizer,
        "num_eval_examples": eval_examples,
        "num_shards": FLAGS.num_shards,
        "num_epochs": FLAGS.num_epochs,
    }

    run_config = tpu_config.RunConfig(
        master=tpu_grpc_url,
        model_dir=FLAGS.model_dir,
        save_checkpoints_secs=FLAGS.save_checkpoints_secs,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=tpu_config.TPUConfig(
            iterations_per_loop=FLAGS.iterations,
            num_shards=FLAGS.num_shards,
        ),
    )

    estimator = tpu_estimator.TPUEstimator(
        model_fn=squeezenet_model.model_fn,
        use_tpu=FLAGS.use_tpu,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        params=dict(params, use_tpu=FLAGS.use_tpu),
    )

    #num_evals = max(FLAGS.num_evals, 1)
    #examples_per_eval = training_examples // num_evals
    #for _ in range(num_evals):
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        #steps=examples_per_eval // FLAGS.batch_size)
        steps=FLAGS.train_steps)
Example #6
def main(unused_argv):
    params = params_dict.ParamsDict(squeezenet_config.SQUEEZENET_CFG,
                                    squeezenet_config.SQUEEZENET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)

    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    total_steps = (
        (params.train.num_epochs * params.train.num_examples_per_epoch) //
        params.train.train_batch_size)
    params.override(
        {
            "train": {
                "total_steps": total_steps
            },
            "eval": {
                "num_steps_per_eval": (total_steps // params.eval.num_evals)
            },
        },
        is_strict=False)

    params.validate()
    params.lock()

    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    if not params.use_async_checkpointing:
        save_checkpoints_steps = max(5000, params.train.iterations_per_loop)
    else:
        # Avoid a NameError below; with async checkpointing the RunConfig is
        # not given a step-based checkpoint schedule.
        save_checkpoints_steps = None

    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=params.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        session_config=tf.ConfigProto(allow_soft_placement=True,
                                      log_device_placement=False),
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=params.train.iterations_per_loop,
            num_shards=params.train.num_cores_per_replica,
        ),
    )

    estimator = contrib_tpu.TPUEstimator(
        model_fn=squeezenet_model.model_fn,
        use_tpu=params.use_tpu,
        config=run_config,
        train_batch_size=params.train.train_batch_size,
        eval_batch_size=params.eval.eval_batch_size,
        params=params.as_dict(),
    )

    for eval_cycle in range(params.eval.num_evals):
        current_cycle_last_train_step = ((eval_cycle + 1) *
                                         params.eval.num_steps_per_eval)
        # This is an absolute global-step target, so pass it as max_steps;
        # steps= would train that many additional steps each cycle.
        estimator.train(input_fn=data_pipeline.InputReader(FLAGS.data_dir,
                                                           is_training=True),
                        max_steps=current_cycle_last_train_step)

        tf.logging.info("Running evaluation")
        tf.logging.info(
            "%s",
            estimator.evaluate(input_fn=data_pipeline.InputReader(
                FLAGS.data_dir, is_training=False),
                               steps=(params.eval.num_eval_examples //
                                      params.eval.eval_batch_size)))
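
The ParamsDict attributes read above imply a nested configuration roughly shaped like the dictionary below. This is only an illustration inferred from those attribute accesses, not the actual contents of squeezenet_config.SQUEEZENET_CFG, and all values are placeholders:

ILLUSTRATIVE_SQUEEZENET_CFG = {
    "model_dir": "",
    "use_tpu": True,
    "use_async_checkpointing": False,
    "train": {
        "num_epochs": 150,
        "num_examples_per_epoch": 1281167,
        "train_batch_size": 1024,
        "iterations_per_loop": 100,
        "num_cores_per_replica": 8,
        "total_steps": 0,  # filled in by params.override() above
    },
    "eval": {
        "eval_batch_size": 1024,
        "num_evals": 10,
        "num_eval_examples": 50000,
        "num_steps_per_eval": 0,  # filled in by params.override() above
    },
}
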
Example #7
File: main.py  Project: grananqvist/tpu
def main(unused_argv):
    train_dataset = segmentation_dataset.get_dataset(
        FLAGS.dataset_name, FLAGS.train_split, dataset_dir=FLAGS.dataset_dir)
    eval_dataset = segmentation_dataset.get_dataset(
        FLAGS.dataset_name, FLAGS.eval_split, dataset_dir=FLAGS.dataset_dir)

    num_train_images = train_dataset.num_samples
    num_classes = train_dataset.num_classes
    ignore_label = train_dataset.ignore_label

    num_batches_per_epoch = num_train_images / FLAGS.train_batch_size

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_shards))

    params = get_params(ignore_label, num_classes, num_batches_per_epoch)

    deeplab_estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model.model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        params=params)

    if FLAGS.mode == 'train':
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total).' %
            (FLAGS.train_steps, FLAGS.train_steps / num_batches_per_epoch))
        train_input_fn = data_pipeline.InputReader(
            train_dataset,
            FLAGS.train_split,
            is_training=True,
            model_variant=FLAGS.model_variant)
        deeplab_estimator.train(input_fn=train_input_fn,
                                max_steps=FLAGS.train_steps)
    elif FLAGS.mode == 'train_and_eval':
        train_and_eval(deeplab_estimator, train_dataset, eval_dataset,
                       num_batches_per_epoch)
    elif FLAGS.mode == 'eval':

        eval_input_fn = data_pipeline.InputReader(
            eval_dataset,
            FLAGS.eval_split,
            is_training=False,
            model_variant=FLAGS.model_variant)

        # Run evaluation when there's a new checkpoint
        for ckpt in tf.contrib.training.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):

            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = deeplab_estimator.evaluate(
                    input_fn=eval_input_fn,
                    steps=eval_dataset.num_samples // FLAGS.eval_batch_size)
                tf.logging.info('Eval results: %s' % eval_results)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d' %
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint' %
                    ckpt)
    else:
        tf.logging.error('Mode not found.')
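
In 'eval' mode the loop relies on tf.contrib.training.checkpoints_iterator, which blocks until a new checkpoint file appears under FLAGS.model_dir (or until eval_timeout seconds pass without one, at which point the iterator ends). Each checkpoint is therefore evaluated once, and the job exits either on that timeout or after scoring the checkpoint at FLAGS.train_steps.
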
Example #8
def main(argv):
  del argv

  if FLAGS.master is None and FLAGS.tpu_name is None:
    raise RuntimeError("You must specify either --master or --tpu_name.")

  if FLAGS.master is not None:
    if FLAGS.tpu_name is not None:
      tf.logging.warn("Both --master and --tpu_name are set. Ignoring "
                      "--tpu_name and using --master.")
    tpu_grpc_url = FLAGS.master
  else:
    tpu_cluster_resolver = (
        tf.contrib.cluster_resolver.python.training.TPUClusterResolver(
            tpu_names=[FLAGS.tpu_name],
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project))
    tpu_grpc_url = tpu_cluster_resolver.get_master()

  # Hyperparameters derived from the paper
  hparams = mobilenet_hparams()
  hparams.parse(FLAGS.hparams)

  params = dict(
      hparams.values(),
      num_eval_examples=FLAGS.num_eval_examples,
      num_examples_per_epoch=FLAGS.num_examples_per_epoch,
      num_shards=FLAGS.num_shards,
      num_batches_per_epoch=FLAGS.num_examples_per_epoch / FLAGS.batch_size,
  )

  # Make sure the model directory exists before writing hparams.json into it.
  tf.gfile.MakeDirs(FLAGS.model_dir)
  with tf.gfile.GFile(FLAGS.model_dir + "/hparams.json", "w") as f:
    f.write(hparams.to_json())

  num_training_examples = FLAGS.num_examples_per_epoch * params["num_epochs"]
  num_eval_batches = FLAGS.num_eval_examples // FLAGS.batch_size
  num_training_batches = num_training_examples // FLAGS.batch_size

  run_config = tpu_config.RunConfig(
      master=tpu_grpc_url,
      model_dir=FLAGS.model_dir,
      save_checkpoints_secs=FLAGS.save_checkpoints_secs,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=False),
      tpu_config=tpu_config.TPUConfig(
          iterations_per_loop=100,
          num_shards=FLAGS.num_shards,
      ),
  )

  estimator = tpu_estimator.TPUEstimator(
      model_fn=model_fn,
      use_tpu=FLAGS.use_tpu,
      config=run_config,
      train_batch_size=FLAGS.batch_size,
      eval_batch_size=FLAGS.batch_size,
      params=dict(params, use_tpu=FLAGS.use_tpu),
  )

  # Evaluate the test set after each epoch of the training set is processed.
  for _ in range(FLAGS.num_epochs):
    tf.logging.info("Training one epoch: %s steps",
                    num_training_batches // FLAGS.num_epochs)
    estimator.train(
        input_fn=data_pipeline.InputReader(FLAGS.data_dir, is_training=True),
        steps=num_training_batches // FLAGS.num_epochs)

    tf.logging.info("Running evaluation")
    tf.logging.info("%s",
                    estimator.evaluate(
                        input_fn=data_pipeline.InputReader(
                            FLAGS.data_dir, is_training=False),
                        steps=num_eval_batches,
                    ))
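
Every example above builds its input_fn as data_pipeline.InputReader(data_dir, is_training=...), a class defined elsewhere in these projects. As a rough sketch of the shape TPUEstimator expects from such an object (a callable that takes a params dict, in which TPUEstimator supplies the batch size to use, and returns a tf.data.Dataset), assuming TFRecord input files and later TF 1.x tf.data APIs; the file patterns and tf.Example feature keys are assumptions, not the projects' actual layout:

import os
import tensorflow as tf

class InputReader(object):
  """Sketch of a callable input pipeline in the shape these examples use."""

  def __init__(self, data_dir, is_training):
    self._data_dir = data_dir
    self._is_training = is_training

  def _parse(self, serialized):
    # Hypothetical tf.Example layout; the real feature keys depend on how the
    # data was written.
    features = tf.parse_single_example(
        serialized,
        {
            "image/encoded": tf.FixedLenFeature([], tf.string),
            "image/class/label": tf.FixedLenFeature([], tf.int64),
        })
    image = tf.image.decode_jpeg(features["image/encoded"], channels=3)
    image = tf.image.resize_images(image, [224, 224]) / 255.0
    label = tf.cast(features["image/class/label"], tf.int32)
    return image, label

  def __call__(self, params):
    # TPUEstimator supplies the batch size for this invocation of input_fn
    # via params["batch_size"].
    batch_size = params["batch_size"]
    pattern = "train-*" if self._is_training else "validation-*"
    dataset = tf.data.Dataset.list_files(
        os.path.join(self._data_dir, pattern), shuffle=self._is_training)
    dataset = dataset.apply(
        tf.data.experimental.parallel_interleave(
            tf.data.TFRecordDataset, cycle_length=8))
    if self._is_training:
      dataset = dataset.shuffle(1024).repeat()
    dataset = dataset.map(self._parse, num_parallel_calls=8)
    # drop_remainder=True keeps batch shapes static, which TPUs require.
    dataset = dataset.batch(batch_size, drop_remainder=True)
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)
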