Example #1
class TrainLowLevelRunner(object):
    def __init__(self, iterations, hparams, per_host_v1=False):
        tf.logging.info("TrainLowLevelRunner: constructor")

        self.feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.enqueue_ops = []
        self.dataset_initializer = []
        self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
        self.per_host_v1 = per_host_v1
        self.iterations = iterations
        self.sess = None
        self.graph = tf.Graph()
        self.hparams = hparams
        with self.graph.as_default():
            self.tpu_init = [tpu.initialize_system()]
            self.tpu_shutdown = tpu.shutdown_system()

        self.resolver = get_resolver(hparams)
        session_config = tf.ConfigProto(allow_soft_placement=True,
                                        isolate_session_state=True)
        if self.hparams.tpu_name is None:
            master = self.hparams.master
        else:
            cluster_spec = self.resolver.cluster_spec()
            tf.logging.info(cluster_spec)
            if cluster_spec:
                session_config.cluster_def.CopyFrom(
                    cluster_spec.as_cluster_def())
            master = self.resolver.get_master()
        self.sess = tf.Session(master, graph=self.graph, config=session_config)
        self.sess.run(self.tpu_init)

        self.hooks = lottery.hooks_from_flags(hparams.values())
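
A minimal, self-contained sketch of the TPU bring-up pattern this constructor relies on (assuming TF 1.x with tf.contrib.tpu; the master address below is a placeholder, not taken from the source):

# Sketch only: build the initialize/shutdown ops in a dedicated graph, then
# run them in a session pointed at the TPU master.
import tensorflow as tf
from tensorflow.contrib import tpu

graph = tf.Graph()
with graph.as_default():
    tpu_init = tpu.initialize_system()
    tpu_shutdown = tpu.shutdown_system()

with tf.Session('grpc://10.0.0.2:8470', graph=graph) as sess:  # placeholder address
    sess.run(tpu_init)
    # ... infeed enqueue ops and the training loop would run here ...
    sess.run(tpu_shutdown)
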
def train_and_eval_fn(hparams):
    """Train and evaluation function."""
    hooks = lottery.hooks_from_flags(hparams.values())

    mlperf_log.gnmt_print(key=mlperf_log.RUN_START)
    hparams.tgt_sos_id, hparams.tgt_eos_id = 1, 2
    model_fn = make_model_fn(hparams, hooks)
    input_fn = make_input_fn(hparams, tf.contrib.learn.ModeKeys.TRAIN)
    run_config = _get_tpu_run_config(hparams, False)
    estimator = tf.contrib.tpu.TPUEstimator(
        model_fn=model_fn,
        config=run_config,
        use_tpu=hparams.use_tpu,
        train_batch_size=hparams.batch_size,
        eval_batch_size=hparams.batch_size,
        predict_batch_size=hparams.infer_batch_size,
    )

    score = 0.0
    mlperf_log.gnmt_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.gnmt_print(key=mlperf_log.EVAL_TARGET,
                          value=hparams.target_bleu)

    for i in range(hparams.max_train_epochs):
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_EPOCH, value=i)
        tf.logging.info("Start training epoch %d", i)
        mlperf_log.gnmt_print(key=mlperf_log.INPUT_SIZE,
                              value=hparams.num_examples_per_epoch)
        steps_per_epoch = int(hparams.num_examples_per_epoch /
                              hparams.batch_size)
        max_steps = steps_per_epoch * (i + 1)
        estimator.train(input_fn=input_fn, max_steps=max_steps)
        mlperf_log.gnmt_print(key=mlperf_log.TRAIN_CHECKPOINT,
                              value=("Under " + hparams.out_dir))
        tf.logging.info("End training epoch %d", i)

        mlperf_log.gnmt_print(key=mlperf_log.EVAL_START)
        score = get_metric_from_estimator(hparams, estimator)
        tf.logging.info("Score after epoch %d: %f", i, score)
        mlperf_log.gnmt_print(key=mlperf_log.EVAL_ACCURACY,
                              value={
                                  "value": score,
                                  "epoch": i
                              })
        mlperf_log.gnmt_print(key=mlperf_log.EVAL_STOP, value=i)

    mlperf_log.gnmt_print(key=mlperf_log.RUN_STOP, value={"success": False})
    return score
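
For context, a hypothetical invocation of train_and_eval_fn might look like the sketch below. The field names mirror the attributes read in the loop above; the values, the output directory, and any extra fields needed by make_model_fn, make_input_fn, and _get_tpu_run_config are assumptions rather than values from the source.

# Hypothetical usage sketch, not part of the original example.
import tensorflow as tf

hparams = tf.contrib.training.HParams(
    use_tpu=True,
    batch_size=128,
    infer_batch_size=64,
    max_train_epochs=2,
    num_examples_per_epoch=3442299,
    target_bleu=24.0,
    out_dir='/tmp/gnmt_out',
)
# score = train_and_eval_fn(hparams)
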
Example #3
def vgg_main(
    flags_obj, model_function, input_function, dataset_name, shape=None):
  """Shared main loop for VGG Models.

  Args:
    flags_obj: An object containing parsed flags. See define_vgg_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This is
      used for logging purposes.
    shape: list of ints representing the shape of the images used for training.
      This is only used if flags_obj.export_dir is passed.

  Returns:
    Dict of results of the run.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Creates session config. allow_soft_placement=True is required for
  # multi-GPU training and is not harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  # Creates a `RunConfig` that checkpoints every 24 hours which essentially
  # results in checkpoints determined only by `epochs_between_evals`.
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60*60*24)

  # Initializes model with all but the dense layer from pretrained VGG.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      warm_start_from=warm_start_settings, params={
          'vgg_size': flags_obj.vgg_size,
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'vgg_size': flags_obj.vgg_size,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('vgg', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  train_hooks = list(train_hooks) + lottery.hooks_from_flags(
      flags_obj.flag_values_dict())

  def input_fn_train(num_epochs):
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        dtype=flags_core.get_tf_dtype(flags_obj),
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        num_parallel_batches=flags_obj.datasets_num_parallel_batches)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  if flags_obj.lth_generate_predictions:
    ckpt = tf.train.latest_checkpoint(flags_obj.model_dir)

    if flags_obj.lth_no_pruning:
      m_hooks = []
    else:
      m_hooks = lottery.hooks_from_flags(flags_obj.flag_values_dict())

    eval_results = classifier.predict(
        input_fn=input_fn_eval,
        checkpoint_path=ckpt,
        hooks=m_hooks,
    )

    assert flags_obj.lth_prediction_result_dir
    # CIFAR-10 test_batch.bin: 1 label byte then 32*32*3 image bytes per record.
    with tf.gfile.Open(
        os.path.join(flags_obj.data_dir, 'test_batch.bin'), 'rb') as f:
      labels = list(f.read()[::32*32*3+1])

    eval_results = list(eval_results)
    if not tf.gfile.Exists(flags_obj.lth_prediction_result_dir):
      tf.gfile.MakeDirs(flags_obj.lth_prediction_result_dir)
    with tf.gfile.Open(os.path.join(
        flags_obj.lth_prediction_result_dir, 'predictions'), 'wb') as f:
      for label, res in zip(labels, eval_results):
        res['label'] = label
      pickle.dump(eval_results, f)
    return

  try:
    cpr = tf.train.NewCheckpointReader(
        tf.train.latest_checkpoint(flags_obj.model_dir))
    current_step = cpr.get_tensor('global_step')
  except:  # pylint: disable=bare-except
    # No usable checkpoint yet; start training from step 0.
    current_step = 0

  while current_step < flags_obj.max_train_steps:
    next_checkpoint = min(current_step + 10000, flags_obj.max_train_steps)
    classifier.train(input_fn=lambda: input_fn_train(1000),
                     hooks=train_hooks, max_steps=next_checkpoint)
    current_step = next_checkpoint
    tf.logging.info('Starting to evaluate.')
    eval_results = classifier.evaluate(input_fn=input_fn_eval)
    benchmark_logger.log_evaluation_result(eval_results)

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    export_dtype = flags_core.get_tf_dtype(flags_obj)
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(
          image_bytes_serving_input_fn, shape, dtype=export_dtype)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size, dtype=export_dtype)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn,
                                 strip_default_attrs=True)
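
The --lth_generate_predictions branch above pickles the list of per-example prediction dicts, each augmented with a 'label' key. A hedged sketch of reading that file back (the directory is an assumed value of --lth_prediction_result_dir):

# Sketch only: load and inspect the pickled predictions written above.
import os
import pickle

import tensorflow as tf

result_dir = '/tmp/lth_predictions'  # assumed --lth_prediction_result_dir value
with tf.gfile.Open(os.path.join(result_dir, 'predictions'), 'rb') as f:
  predictions = pickle.load(f)

for p in predictions[:5]:
  print(p['label'], sorted(p.keys()))
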
def main(unused_argv):
    params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                                 FLAGS.hparams_file, FLAGS,
                                                 FLAGS.hparams)
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(5000, params['iterations_per_loop'])
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=resnet_model_fn,
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        predict_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)

    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=use_bfloat16,
                transpose_input=params['transpose_input'],
                selection=selection)
            for (is_training,
                 selection) in [(True, select_train), (False, select_eval)]
        ]
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']
    lottery_hooks = lottery.hooks_from_flags(params)

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                FLAGS.model_dir, timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                # This time will include compilation time.
                start_timestamp = time.time()
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    elif FLAGS.mode == 'predict':
        tf.logging.info('Starting to evaluate.')
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)

        if FLAGS.lth_no_pruning:
            m_hooks = []
        else:
            m_hooks = lottery_hooks

        # This time will include compilation time.
        start_timestamp = time.time()
        eval_results = resnet_classifier.predict(
            input_fn=imagenet_eval.input_fn,
            checkpoint_path=ckpt,
            hooks=m_hooks,
        )

        assert FLAGS.lth_prediction_result_dir
        with tf.gfile.Open(
                os.path.join(FLAGS.lth_prediction_result_dir, 'predictions'),
                'wb') as f:
            pickle.dump(list(eval_results), f)

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        # This time will include compilation time.
        start_timestamp = time.time()

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(5000, params['iterations_per_loop'])))
            if FLAGS.profile_every_n_steps > 0:
                hooks.append(
                    tpu_profiler_hook.TPUProfilerHook(
                        save_steps=FLAGS.profile_every_n_steps,
                        output_dir=FLAGS.model_dir,
                        tpu=FLAGS.tpu))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=int(params['train_steps']),
                                    hooks=hooks + lottery_hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=int(next_checkpoint),
                                        hooks=lottery_hooks)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
            if FLAGS.add_warmup_requests:
                inference_warmup.write_warmup_requests(
                    export_path,
                    FLAGS.model_name,
                    params['image_size'],
                    batch_sizes=FLAGS.inference_batch_sizes,
                    image_format='JPEG')
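
A hypothetical entry point for a script like this (the FLAGS referenced above are assumed to be defined elsewhere in the same file):

# Sketch only: standard TF 1.x launch boilerplate for the main() above.
if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main)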