Example #1
def get_training_hooks(mode, model_dir, checkpoint_path=None, skip_checkpoint_variables=None):

    assert mode in ('train', 'eval')

    training_hooks = [
        AutoLoggingHook(
            # log_every_n_steps=RUNNING_CONFIG.display_step,
            log_every_n_steps=5 if "NGC_JOB_ID" not in os.environ else 100,
            # warmup_steps=RUNNING_CONFIG.warmup_steps,
            warmup_steps=100,
            is_training=True
        )
    ]

    if not MPI_is_distributed() or MPI_rank() == 0:
        training_hooks.append(PretrainedWeightsLoadingHook(
            prefix="resnet50/",
            checkpoint_path=checkpoint_path,
            skip_variables_regex=skip_checkpoint_variables
        ))

    if MPI_is_distributed() and mode == "train":
        training_hooks.append(hvd.BroadcastGlobalVariablesHook(0))

    if not MPI_is_distributed() or MPI_rank() == 0:
        training_hooks.append(CheckpointSaverHook(
            checkpoint_dir=model_dir,
            checkpoint_basename="model.ckpt"
        ))

    return training_hooks
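
These hooks are consumed directly by tf.estimator.Estimator.train; a minimal wiring sketch mirroring Example #10 below, where train_estimator and train_input_fn are assumed to have been built already and the model directory is a placeholder:

# Hypothetical usage: `train_estimator` and `train_input_fn` are assumed to be
# constructed as in Example #10; the model_dir below is a placeholder.
training_hooks = get_training_hooks(
    mode="train",
    model_dir="/results/mask_rcnn",       # placeholder output directory
    checkpoint_path=None,                 # no pretrained backbone in this sketch
    skip_checkpoint_variables=None,
)

train_estimator.train(
    input_fn=train_input_fn,
    max_steps=1000,
    hooks=training_hooks,
)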
Example #2
  def __init__(self, runtime_config, model_fn):
    super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

    if MPI_is_distributed():
      os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
      os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
      # os.environ['HOROVOD_AUTOTUNE'] = '2'

      logging.info("SageMaker Distributed Data Parallel successfully initialized ...")

    os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
    os.environ['TF_GPU_THREAD_COUNT'] = '1' if not MPI_is_distributed() else str(hvd.size())

    os.environ['TF_SYNC_ON_FINISH'] = '0'
Example #3
def create_optimizer(learning_rate, params):
    """Creates optimized based on the specified flags."""

    optimizer = tf.compat.v1.train.MomentumOptimizer(
        learning_rate, momentum=params['momentum'])

    if MPI_is_distributed():
        optimizer = hvd.DistributedOptimizer(
            optimizer,
            name=None,
            device_dense='/gpu:0',
            device_sparse='',
            bucket_cap_mb=64,
            # compression=hvd.Compression.fp16,
            # compression=hvd.Compression.none,
            # sparse_as_dense=False
        )

    if params["amp"]:
        loss_scale = tf.train.experimental.DynamicLossScale(
            initial_loss_scale=(2**12), increment_period=2000, multiplier=2.0)
        optimizer = tf.compat.v1.train.experimental.MixedPrecisionLossScaleOptimizer(
            optimizer, loss_scale=loss_scale)

    return optimizer
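
A minimal sketch of where create_optimizer plugs into an Estimator model_fn; the toy dense model, loss, and params keys below are illustrative assumptions, only the optimizer wiring matches the code above:

import tensorflow as tf

def toy_model_fn(features, labels, mode, params):
    # Illustrative single-layer regression model; only the optimizer wiring matters here.
    logits = tf.compat.v1.layers.dense(features["x"], units=1)
    loss = tf.compat.v1.losses.mean_squared_error(labels, logits)

    optimizer = create_optimizer(learning_rate=params["init_learning_rate"], params=params)
    train_op = optimizer.minimize(
        loss, global_step=tf.compat.v1.train.get_or_create_global_step())

    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)

# e.g. params = {"init_learning_rate": 0.01, "momentum": 0.9, "amp": False}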
Example #4
  def build_strategy_configuration(self, mode):
    """Retrieves model configuration for running TF Estimator."""

    run_config = tf.estimator.RunConfig(
        tf_random_seed=(
            self._runtime_config.seed
            if not MPI_is_distributed() or self._runtime_config.seed is None else
            self._runtime_config.seed + MPI_rank()
        ),
        model_dir=self._runtime_config.model_dir,
        save_summary_steps=None,  # disabled
        save_checkpoints_steps=None,  # disabled
        save_checkpoints_secs=None,  # disabled
        keep_checkpoint_max=20,  # keep the 20 most recent checkpoints
        keep_checkpoint_every_n_hours=None,  # disabled
        log_step_count_steps=None,  # disabled
        session_config=self._get_session_config(
            mode=mode,
            use_xla=self._runtime_config.xla,
            use_amp=self._runtime_config.amp,
            use_tf_distributed=False,
            allow_xla_at_inference=self._runtime_config.allow_xla_at_inference  # TODO: Remove when XLA at inference fixed
        ),
        protocol=None,
        device_fn=None,
        train_distribute=None,
        eval_distribute=None,
        experimental_distribute=None
    )

    return run_config
Example #5
    def after_create_session(self, session, coord):  # pylint: disable=unused-argument
        """Called when new TensorFlow session is created.
        This is called to signal the hooks that a new session has been created. This
        has two essential differences with the situation in which `begin` is called:
        * When this is called, the graph is finalized and ops can no longer be added
            to the graph.
        * This method will also be called as a result of recovering a wrapped
            session, not only at the beginning of the overall session.
        Args:
          session: A TensorFlow Session that has been created.
          coord: A Coordinator object which keeps track of all threads.
        """

        # ========= Collect the number of GPUs ======== #
        if self._is_training:

            if MPI_is_distributed():
                self._n_gpus = MPI_size()

            elif tf.distribute.has_strategy():
                self._n_gpus = tf.distribute.get_strategy(
                ).num_replicas_in_sync

            else:
                self._n_gpus = 1

        else:
            self._n_gpus = 1

        # =========== TensorFlow Hook Setup =========== #
        _global_step, _metrics = setup_tensorflow_hook(
            sess=session,
            logging_proxy=self._logging_proxy,
            is_training=self._is_training,
            is_initialized=self._initialized)

        if _global_step >= 0:
            self._current_step = self._amp_steps_non_skipped = _global_step

        self._metrics.update(_metrics)

        if not self._is_training:

            for metric_name in self._metrics.keys():
                self._metrics[metric_name]["aggregator"].reset()

        self._initialized = True
        # %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% #

        self._session_t0 = time.time()
Example #6
    def __init__(self, runtime_config, model_fn):
        super(EstimatorExecuter, self).__init__(runtime_config, model_fn)

        # Handle recipe cache. Skip if externally set or empty.
        recipe_cache = runtime_config.recipe_cache
        if 'TF_RECIPE_CACHE_PATH' not in os.environ.keys() and recipe_cache:
            os.environ['TF_RECIPE_CACHE_PATH'] = recipe_cache

            # Clear previous recipe cache.
            if not MPI_is_distributed() or MPI_rank() == 0:
                if os.path.exists(recipe_cache) and os.path.isdir(
                        recipe_cache):
                    shutil.rmtree(recipe_cache)

        if MPI_is_distributed():
            os.environ['HOROVOD_GPU_ALLREDUCE'] = 'NCCL'
            os.environ['HOROVOD_NUM_NCCL_STREAMS'] = '1'
            # os.environ['HOROVOD_AUTOTUNE'] = '2'

            if runtime_config.device == "HPU":
                from TensorFlow.common.horovod_helpers import hvd_init, Framework
                hvd = hvd_init(framework=Framework.TENSORFLOW)
            else:
                hvd.init()

            # Other ranks should wait for recipe cache to be removed.
            # This operation can't be done before hvd_init.
            from mpi4py import MPI
            MPI.COMM_WORLD.Barrier()

            logging.info("Horovod successfully initialized ...")

        os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
        os.environ['TF_GPU_THREAD_COUNT'] = '1' if not MPI_is_distributed(
        ) else str(hvd.size())

        os.environ['TF_SYNC_ON_FINISH'] = '0'
Example #7
    def get_last_checkpoint_path(self):
        """Search for proper checkpoint directory"""

        checkpoint_path = Path(self._runtime_config.model_dir)
        if MPI_is_distributed():
            # Checkpoints are saved only to 'worker_0' directory.
            worker_0_path = checkpoint_path.parent.joinpath('worker_0')
            if worker_0_path.exists():
                checkpoint_path = worker_0_path
            else:
                # Case when no worker directories are available.
                checkpoint_path = checkpoint_path.parent

        return tf.train.latest_checkpoint(str(checkpoint_path),
                                          latest_filename=None)
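
The same lookup logic as a standalone sketch, with the MPI check replaced by an explicit flag so it can run outside the executer class (the function name and flag are hypothetical):

from pathlib import Path
import tensorflow as tf

def resolve_latest_checkpoint(model_dir, distributed=False):
    # Prefer the sibling 'worker_0' directory when running distributed;
    # if it does not exist, fall back to the parent directory, then let TF
    # locate the most recent checkpoint (returns None when there is none).
    checkpoint_path = Path(model_dir)
    if distributed:
        worker_0_path = checkpoint_path.parent.joinpath('worker_0')
        checkpoint_path = worker_0_path if worker_0_path.exists() else checkpoint_path.parent
    return tf.train.latest_checkpoint(str(checkpoint_path))

# e.g. resolve_latest_checkpoint('/results/model/worker_3', distributed=True)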
Example #8
    def __init__(self, runtime_config, model_fn):

        if MPI_is_distributed():
            runtime_config.model_dir = os.path.join(runtime_config.model_dir,
                                                    f'worker_{MPI_rank()}')

        self._runtime_config = runtime_config
        self._model_fn = model_fn

        os.environ['CUDA_CACHE_DISABLE'] = '0'

        os.environ['TF_USE_CUDNN_BATCHNORM_SPATIAL_PERSISTENT'] = '1'

        os.environ['TF_ADJUST_HUE_FUSED'] = '1'
        os.environ['TF_ADJUST_SATURATION_FUSED'] = '1'
        os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

        os.environ['TF_AUTOTUNE_THRESHOLD'] = '2'
Example #9
    def __call__(self, params, input_context=None):

        batch_size = params['batch_size'] if 'batch_size' in params else 1

        try:
            seed = params['seed'] if not MPI_is_distributed(
            ) else params['seed'] * MPI_rank()
        except (KeyError, TypeError):
            seed = None

        if MPI_is_distributed():
            n_gpus = MPI_size()

        elif input_context is not None:
            n_gpus = input_context.num_input_pipelines

        else:
            n_gpus = 1

        ##################################################

        dataset = tf.data.Dataset.list_files(self._file_pattern, shuffle=False)

        if self._mode == tf.estimator.ModeKeys.TRAIN:

            if input_context is not None:
                logging.info("Using Dataset Sharding with TF Distributed")
                _num_shards = input_context.num_input_pipelines
                _shard_idx = input_context.input_pipeline_id

            elif MPI_is_distributed():
                logging.info("Using Dataset Sharding with Horovod")
                _shard_idx, _num_shards = MPI_rank_and_size()

            try:
                dataset = dataset.shard(num_shards=_num_shards,
                                        index=_shard_idx)
                dataset = dataset.shuffle(math.ceil(256 / _num_shards))

            except NameError:  # Not a distributed training setup
                pass

        def _prefetch_dataset(filename):
            return tf.data.TFRecordDataset(filename).prefetch(1)

        dataset = dataset.interleave(
            map_func=_prefetch_dataset,
            cycle_length=32,
            block_length=64,
            num_parallel_calls=tf.data.experimental.AUTOTUNE,
        )

        if self._num_examples is not None and self._num_examples > 0:
            logging.info("[*] Limiting the amount of sample to: %d" %
                         self._num_examples)
            dataset = dataset.take(self._num_examples)

        dataset = dataset.cache()

        if self._mode == tf.estimator.ModeKeys.TRAIN:
            dataset = dataset.shuffle(buffer_size=4096,
                                      reshuffle_each_iteration=True,
                                      seed=seed)

            dataset = dataset.repeat()

        # Parse the fetched records to input tensors for model function.
        dataset = dataset.map(
            map_func=self._create_dataset_parser_fn(params),
            num_parallel_calls=16,
        )

        dataset = dataset.batch(batch_size=batch_size, drop_remainder=True)

        if self._use_fake_data:
            # Turn this dataset into a semi-fake dataset which always loop at the
            # first batch. This reduces variance in performance and is useful in
            # testing.
            logging.info("Using Fake Dataset Loop...")
            dataset = dataset.take(1).cache().repeat()

            if self._mode != tf.estimator.ModeKeys.TRAIN:
                dataset = dataset.take(int(5000 / batch_size))

        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        if not tf.distribute.has_strategy():
            dataset = dataset.apply(
                tf.data.experimental.prefetch_to_device(
                    '/gpu:0',  # With Horovod the local GPU is always 0
                    buffer_size=1,
                ))

        data_options = tf.data.Options()

        data_options.experimental_deterministic = seed is not None
        if LooseVersion(tf.__version__) <= LooseVersion("2.0.0"):
            data_options.experimental_distribute.auto_shard = False
        else:
            data_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
        # data_options.experimental_distribute.auto_shard = False
        data_options.experimental_slack = True

        data_options.experimental_threading.max_intra_op_parallelism = 1
        # data_options.experimental_threading.private_threadpool_size = int(multiprocessing.cpu_count() / n_gpus) * 2

        # ================= experimental_optimization ================= #

        data_options.experimental_optimization.apply_default_optimizations = False

        # data_options.experimental_optimization.autotune = True
        data_options.experimental_optimization.filter_fusion = True
        data_options.experimental_optimization.map_and_batch_fusion = True
        data_options.experimental_optimization.map_and_filter_fusion = True
        data_options.experimental_optimization.map_fusion = True
        data_options.experimental_optimization.map_parallelization = True

        map_vectorization_options = tf.data.experimental.MapVectorizationOptions(
        )
        map_vectorization_options.enabled = True
        map_vectorization_options.use_choose_fastest = True

        data_options.experimental_optimization.map_vectorization = map_vectorization_options

        data_options.experimental_optimization.noop_elimination = True
        data_options.experimental_optimization.parallel_batch = True
        data_options.experimental_optimization.shuffle_and_repeat_fusion = True

        # ========== Stats on TF Data =============
        # aggregator = tf.data.experimental.StatsAggregator()
        # data_options.experimental_stats.aggregator = aggregator
        # data_options.experimental_stats.latency_all_edges = True

        dataset = dataset.with_options(data_options)

        return dataset
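
The reader instance above is itself used as the Estimator input_fn; a short usage sketch, assuming the dataloader module from these examples is importable and using a hypothetical TFRecord path:

train_input_fn = dataloader.InputReader(
    file_pattern="/data/coco/train-*.tfrecord",   # hypothetical path
    mode=tf.estimator.ModeKeys.TRAIN,
    num_examples=None,
    use_fake_data=False,
    use_instance_mask=True,
    seed=42,
)

# The Estimator normally calls the instance with its params dict; calling it by
# hand returns the batched, prefetched tf.data.Dataset built above.
dataset = train_input_fn(params={"batch_size": 2, "seed": 42})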
Example #10
  def train_and_eval(self, train_input_fn, eval_input_fn):
    """Run distributed train and eval on Mask RCNN model."""

    self._save_config()
    output_dir = os.path.join(self._runtime_config.model_dir, 'eval')
    tf.io.gfile.makedirs(output_dir)

    train_run_config = self.build_strategy_configuration('train')
    train_params = self.build_model_parameters('train')
    train_estimator = self.build_mask_rcnn_estimator(train_params, train_run_config, 'train')

    eval_estimator = None
    eval_results = None

    num_cycles = math.ceil(self._runtime_config.total_steps / self._runtime_config.num_steps_per_eval)

    training_hooks = get_training_hooks(
        mode="train",
        model_dir=self._runtime_config.model_dir,
        checkpoint_path=self._runtime_config.checkpoint,
        skip_checkpoint_variables=self._runtime_config.skip_checkpoint_variables
    )

    for cycle in range(1, num_cycles + 1):

      if not MPI_is_distributed() or MPI_rank() == 0:

        print()  # Visual Spacing
        logging.info("=================================")
        logging.info('     Start training cycle %02d' % cycle)
        logging.info("=================================\n")

      max_cycle_step = min(int(cycle * self._runtime_config.num_steps_per_eval), self._runtime_config.total_steps)

      PROFILER_ENABLED = False

      if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
          profiler_context_manager = tf.contrib.tfprof.ProfileContext

      else:
          from contextlib import suppress
          profiler_context_manager = lambda *args, **kwargs: suppress()  # No-Op context manager

      with profiler_context_manager(
              '/workspace/profiling/',
              trace_steps=range(100, 200, 3),
              dump_steps=[200]
      ) as pctx:

          if (not MPI_is_distributed() or MPI_rank() == 0) and PROFILER_ENABLED:
            opts = tf.compat.v1.profiler.ProfileOptionBuilder.time_and_memory()
            pctx.add_auto_profiling('op', opts, [150, 200])

          train_estimator.train(
              input_fn=train_input_fn,
              max_steps=max_cycle_step,
              hooks=training_hooks,
          )

      if not MPI_is_distributed() or MPI_rank() == 0:

          print()  # Visual Spacing
          logging.info("=================================")
          logging.info('    Start evaluation cycle %02d' % cycle)
          logging.info("=================================\n")

          if eval_estimator is None:
              eval_run_config = self.build_strategy_configuration('eval')
              eval_params = self.build_model_parameters('eval')
              eval_estimator = self.build_mask_rcnn_estimator(eval_params, eval_run_config, 'eval')

          last_ckpt = tf.train.latest_checkpoint(self._runtime_config.model_dir, latest_filename=None)
          logging.info("Restoring parameters from %s\n" % last_ckpt)

          eval_results, predictions = evaluation.evaluate(
              eval_estimator,
              eval_input_fn,
              self._runtime_config.eval_samples,
              self._runtime_config.eval_batch_size,
              self._runtime_config.include_mask,
              self._runtime_config.val_json_file,
              report_frequency=self._runtime_config.report_frequency
          )

          self._write_summary(output_dir, eval_results, predictions, max_cycle_step)

      if MPI_is_distributed():
          from mpi4py import MPI
          comm = hvd.get_worker_comm()
          comm.Barrier()  # Waiting for all MPI processes to sync

    return eval_results
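
For a concrete sense of the cycle loop above, a small worked example of how num_cycles and max_cycle_step evolve; the step counts are illustrative, not defaults from the repo:

import math

total_steps = 90000          # illustrative
num_steps_per_eval = 25000   # illustrative

num_cycles = math.ceil(total_steps / num_steps_per_eval)    # -> 4

for cycle in range(1, num_cycles + 1):
    max_cycle_step = min(int(cycle * num_steps_per_eval), total_steps)
    print(cycle, max_cycle_step)    # 1 25000 / 2 50000 / 3 75000 / 4 90000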
Example #11
  def _get_session_config(mode, use_xla, use_amp, use_tf_distributed=False, allow_xla_at_inference=False):

      assert mode in ('train', 'eval')

      rewrite_options = rewriter_config_pb2.RewriterConfig(
          # arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # arithmetic_optimization=rewriter_config_pb2.RewriterConfig.ON,

          # constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
          # constant_folding=rewriter_config_pb2.RewriterConfig.ON,           # TO TEST

          # debug_stripper=rewriter_config_pb2.RewriterConfig.OFF,
          # debug_stripper=rewriter_config_pb2.RewriterConfig.ON,           # TO TEST

          # dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # dependency_optimization=rewriter_config_pb2.RewriterConfig.ON,           # TO TEST

          # disable_model_pruning=False,           # INCOMPATIBLE with AMP
          # function_optimization=True,
          # implementation_selector=True,

          # loop_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # loop_optimization=rewriter_config_pb2.RewriterConfig.ON,           # TO TEST

          # The default setting (SCHEDULING and SWAPPING HEURISTICS only)
          # memory_optimization=rewriter_config_pb2.RewriterConfig.DEFAULT_MEM_OPT,

          # Disabled in the meta-optimizer.
          # memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT,

          # Driven by manual op-level annotations.
          # memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,

          # Swapping heuristic will move a tensor from the GPU to the CPU and move it
          # back when needed to reduce peak memory usage..
          # memory_optimization=rewriter_config_pb2.RewriterConfig.SWAPPING_HEURISTICS,

          # Recomputation heuristics will recompute ops (such as Relu activation)
          # during backprop instead of storing them, reducing peak memory usage.
          # memory_optimization=rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS,

          # Scheduling will split big ops such as AddN and try to enforce a schedule of
          # the new computations that decreases peak memory usage.
          # memory_optimization=rewriter_config_pb2.RewriterConfig.SCHEDULING_HEURISTICS,

          # Use any combination of swapping and recomputation heuristics.
          # memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS,

          meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.TWO,
          # meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
          # meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.DEFAULT_NUM_ITERS,

          # pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.ON,         # TO TEST
          #
          # remapping=rewriter_config_pb2.RewriterConfig.OFF,
          # remapping=rewriter_config_pb2.RewriterConfig.ON,                   # TO TEST

          # scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # scoped_allocator_optimization=rewriter_config_pb2.RewriterConfig.ON,  # TO TEST

          # shape_optimization=rewriter_config_pb2.RewriterConfig.OFF,
          # shape_optimization=rewriter_config_pb2.RewriterConfig.ON,           # TO TEST
      )

      if use_amp:
          logging.info("[%s] AMP is activated - Experiment Feature" % mode)
          rewrite_options.auto_mixed_precision = True

      config = tf.compat.v1.ConfigProto(
          allow_soft_placement=True,
          log_device_placement=False,
          graph_options=tf.compat.v1.GraphOptions(
              rewrite_options=rewrite_options,
              # infer_shapes=True  # Heavily drops throughput by 30%
          )
      )

      if use_tf_distributed:
        config.gpu_options.force_gpu_compatible = False

      else:
        config.gpu_options.force_gpu_compatible = True  # Force pinned memory

        if MPI_is_distributed():
            config.gpu_options.visible_device_list = str(MPI_local_rank())

      if use_xla and (mode == "train" or allow_xla_at_inference):
          logging.info("[%s] XLA is activated - Experiment Feature" % mode)
          config.graph_options.optimizer_options.global_jit_level = tf.compat.v1.OptimizerOptions.ON_1
          # config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_2

      if mode == 'train':
          config.intra_op_parallelism_threads = 1  # Avoid pool of Eigen threads

          if MPI_is_distributed():
              config.inter_op_parallelism_threads = max(2, multiprocessing.cpu_count() // hvd.local_size())

          elif not use_tf_distributed:
              config.inter_op_parallelism_threads = 4

      return config
Example #12
def main(argv):
    del argv  # Unused.

    # ============================ Configure parameters ============================ #
    RUN_CONFIG = mask_rcnn_params.default_config()

    temp_config = FLAGS.flag_values_dict()
    if temp_config['device'] == 'HPU':
        if not MPI_is_distributed():
            from habana_frameworks.tensorflow import load_habana_module
            load_habana_module()

    temp_config['learning_rate_decay_levels'] = [
        float(decay) for decay in temp_config['learning_rate_decay_levels']
    ]
    temp_config['learning_rate_levels'] = [
        decay * temp_config['init_learning_rate']
        for decay in temp_config['learning_rate_decay_levels']
    ]
    temp_config['learning_rate_steps'] = [
        int(step) for step in temp_config['learning_rate_steps']
    ]

    RUN_CONFIG = params_io.override_hparams(RUN_CONFIG, temp_config)

    if temp_config['deterministic']:
        tf.config.threading.set_inter_op_parallelism_threads(1)
        tf.config.threading.set_intra_op_parallelism_threads(1)
        if temp_config['seed']:
            os.environ['TF_DETERMINISTIC_OPS'] = '1'
            tf.compat.v1.reset_default_graph()
            tf.random.set_seed(temp_config['seed'])
            if temp_config['device'] == "GPU":
                os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
        else:
            raise RuntimeError("Set seed to run in deterministic mode")
    # ============================ Configure parameters ============================ #

    if RUN_CONFIG.use_tf_distributed and MPI_is_distributed():
        raise RuntimeError(
            "Incompatible Runtime. Impossible to use `--use_tf_distributed` with MPIRun Horovod"
        )

    if RUN_CONFIG.mode in ('train', 'train_and_eval'
                           ) and not RUN_CONFIG.training_file_pattern:
        raise RuntimeError(
            'You must specify `training_file_pattern` for training.')

    if RUN_CONFIG.mode in ('eval', 'train_and_eval'):
        if not RUN_CONFIG.validation_file_pattern:
            raise RuntimeError(
                'You must specify `validation_file_pattern` for evaluation.')

        if RUN_CONFIG.val_json_file == "" and not RUN_CONFIG.include_groundtruth_in_features:
            raise RuntimeError(
                'You must specify `val_json_file` or include_groundtruth_in_features=True for evaluation.'
            )

        if not RUN_CONFIG.include_groundtruth_in_features and not os.path.isfile(
                RUN_CONFIG.val_json_file):
            raise FileNotFoundError("Validation JSON File not found: %s" %
                                    RUN_CONFIG.val_json_file)

    dllogger.init(backends=[
        dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   filename=RUN_CONFIG.log_path)
    ])

    if RUN_CONFIG.mode in ('train', 'train_and_eval'):

        train_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            num_examples=None,
            use_fake_data=RUN_CONFIG.use_fake_data,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed)

    else:
        train_input_fn = None

    if RUN_CONFIG.mode in ('eval', 'train_and_eval') or (
            RUN_CONFIG.mode == 'train' and RUN_CONFIG.eval_after_training):

        eval_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.validation_file_pattern,
            mode=tf.estimator.ModeKeys.PREDICT,
            num_examples=RUN_CONFIG.eval_samples,
            use_fake_data=False,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed)

    else:
        eval_input_fn = None

    with tf.profiler.experimental.Profile(
            RUN_CONFIG.model_dir) if RUN_CONFIG.profile else suppress():
        run_executer(RUN_CONFIG, train_input_fn, eval_input_fn)
Example #13
def compute_coco_eval_metric(predictor,
                             num_batches=-1,
                             include_mask=True,
                             annotation_json_file="",
                             eval_batch_size=-1,
                             report_frequency=None):
    """Compute COCO eval metric given a prediction generator.

    Args:
    predictor: a generator that iteratively pops a dictionary of predictions
      with the format compatible with COCO eval tool.
    num_batches: the number of batches to be aggregated in eval. This is how
      many times the predictor gets pulled.
    include_mask: a boolean that indicates whether we include the mask eval.
    annotation_json_file: the annotation json file of the eval dataset.

    Returns:
    eval_results: the aggregated COCO metric eval results.
    """

    if annotation_json_file == "":
        annotation_json_file = None

    use_groundtruth_from_json = (annotation_json_file is not None)

    predictions = dict()
    batch_idx = 0

    if use_groundtruth_from_json:
        eval_metric = coco_metric.EvaluationMetric(annotation_json_file,
                                                   include_mask=include_mask)

    else:
        eval_metric = coco_metric.EvaluationMetric(filename=None,
                                                   include_mask=include_mask)

    def evaluation_preds(preds):

        # Essential to avoid modifying the source dict
        _preds = copy.deepcopy(preds)

        for k, v in six.iteritems(_preds):
            _preds[k] = np.concatenate(_preds[k], axis=0)

        if 'orig_images' in _preds and _preds['orig_images'].shape[0] > 10:
            # Only samples a few images for visualization.
            _preds['orig_images'] = _preds['orig_images'][:10]

        if use_groundtruth_from_json:
            eval_results = eval_metric.predict_metric_fn(_preds)

        else:
            images, annotations = coco_utils.extract_coco_groundtruth(
                _preds, include_mask)
            coco_dataset = coco_utils.create_coco_format_dataset(
                images, annotations)
            eval_results = eval_metric.predict_metric_fn(
                _preds, groundtruth_data=coco_dataset)

        return eval_results

    # Take into account cuDNN & Tensorflow warmup
    # Drop N first steps for avg throughput calculation
    BURNIN_STEPS = 100
    model_throughput_list = list()
    inference_time_list = list()

    if MPI_is_distributed():
        eval_batch_size *= MPI_size()
        num_batches /= MPI_size()

    while num_batches < 0 or batch_idx < num_batches:

        try:
            step_t0 = time.time()
            step_predictions = six.next(predictor)
            if MPI_is_distributed():
                from mpi4py import MPI
                MPI.COMM_WORLD.Barrier(
                )  # Need to get time for all predictors for given batch_idx
            batch_time = time.time() - step_t0

            throughput = eval_batch_size / batch_time
            model_throughput_list.append(throughput)
            inference_time_list.append(batch_time)

            logging.info(
                'Running inference on batch %03d/%03d... - Step Time: %.4fs - Throughput: %.1f imgs/s'
                % (batch_idx + 1, num_batches, batch_time, throughput))

        except StopIteration:
            logging.info('Get StopIteration at %d batch.' % (batch_idx + 1))
            break

        step_predictions = process_prediction_for_eval(step_predictions)

        for k, v in step_predictions.items():

            if k not in predictions:
                predictions[k] = [v]

            else:
                predictions[k].append(v)

        batch_idx = batch_idx + 1

        # Optionally report intermediate eval results every `report_frequency` batches;
        # each report then covers eval_batch_size * report_frequency samples.
        if report_frequency and batch_idx % report_frequency == 0:
            eval_results = evaluation_preds(preds=predictions)
            logging.info('Eval results: %s' %
                         pprint.pformat(eval_results, indent=4))

    if MPI_is_distributed():
        from mpi4py import MPI
        all_predictions = MPI.COMM_WORLD.gather(predictions, root=0)
        MPI.COMM_WORLD.Barrier(
        )  # FIXME: first gather is calling MPI_FINALIZE causing crash
        if MPI_rank() == 0:
            predictions.clear()
            for pred in all_predictions:
                for k in pred.keys():
                    if k not in predictions:
                        predictions[k] = pred[k]
                    else:
                        predictions[k].extend(pred[k])

    inference_time_list.sort()
    eval_results = evaluation_preds(preds=predictions)

    if not MPI_is_distributed() or MPI_rank() == 0:

        average_time = np.mean(inference_time_list)
        latency_50 = max(
            inference_time_list[:int(len(inference_time_list) * 0.5)])
        latency_90 = max(
            inference_time_list[:int(len(inference_time_list) * 0.90)])
        latency_95 = max(
            inference_time_list[:int(len(inference_time_list) * 0.95)])
        latency_99 = max(
            inference_time_list[:int(len(inference_time_list) * 0.99)])
        latency_100 = max(
            inference_time_list[:int(len(inference_time_list) * 1)])

        print()  # Visual Spacing
        logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")
        logging.info("         Evaluation Performance Summary          ")
        logging.info("# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ #")

        total_processing_hours, rem = divmod(np.sum(model_throughput_list),
                                             3600)
        total_processing_minutes, total_processing_seconds = divmod(rem, 60)

        if len(model_throughput_list) > BURNIN_STEPS:
            # Take into account cuDNN & Tensorflow warmup
            # Drop N first steps for avg throughput calculation
            # Also drop last step which may have a different batch size
            avg_throughput = np.mean(model_throughput_list[BURNIN_STEPS:-1])
        else:
            avg_throughput = -1.

        print()  # Visual Spacing
        logging.info("Average throughput: {throughput:.1f} samples/sec".format(
            throughput=avg_throughput))
        logging.info("Inference Latency Average (s) = {avg:.4f}".format(
            avg=average_time))
        logging.info(
            "Inference Latency 50% (s) = {cf_50:.4f}".format(cf_50=latency_50))
        logging.info("Inference Latency 90%  (s) = {cf_90:.4f}".format(
            cf_90=latency_90))
        logging.info("Inference Latency 95%  (s) = {cf_95:.4f}".format(
            cf_95=latency_95))
        logging.info("Inference Latency 99%  (s) = {cf_99:.4f}".format(
            cf_99=latency_99))
        logging.info("Inference Latency 100%  (s) = {cf_100:.4f}".format(
            cf_100=latency_100))
        logging.info("Total processed steps: {total_steps}".format(
            total_steps=len(model_throughput_list)))
        logging.info(
            "Total processing time: {hours}h {minutes:02d}m {seconds:02d}s".
            format(hours=total_processing_hours,
                   minutes=int(total_processing_minutes),
                   seconds=int(total_processing_seconds)))
        dllogger.log(step=(),
                     data={"avg_inference_throughput": avg_throughput},
                     verbosity=Verbosity.DEFAULT)
        avg_inference_time = float(total_processing_hours * 3600 +
                                   int(total_processing_minutes) * 60 +
                                   int(total_processing_seconds))
        dllogger.log(step=(),
                     data={"avg_inference_time": avg_inference_time},
                     verbosity=Verbosity.DEFAULT)
        logging.info("==================== Metrics ====================")

        # logging.info('Eval Epoch results: %s' % pprint.pformat(eval_results, indent=4))
        for key, value in sorted(eval_results.items(),
                                 key=operator.itemgetter(0)):
            logging.info("%s: %.9f" % (key, value))
        print()  # Visual Spacing

    return eval_results, predictions
Example #14
def main(argv):
    del argv  # Unused.

    # ============================ Configure parameters ============================ #
    RUN_CONFIG = mask_rcnn_params.default_config()

    temp_config = FLAGS.flag_values_dict()
    temp_config['learning_rate_decay_levels'] = [float(decay) for decay in temp_config['learning_rate_decay_levels']]
    temp_config['learning_rate_levels'] = [
        decay * temp_config['init_learning_rate'] for decay in temp_config['learning_rate_decay_levels']
    ]
    temp_config['learning_rate_steps'] = [int(step) for step in temp_config['learning_rate_steps']]

    RUN_CONFIG = params_io.override_hparams(RUN_CONFIG, temp_config)
    # ============================ Configure parameters ============================ #

    if RUN_CONFIG.use_tf_distributed and MPI_is_distributed():
        raise RuntimeError("Incompatible Runtime. Impossible to use `--use_tf_distributed` with MPIRun Horovod")

    if RUN_CONFIG.mode in ('train', 'train_and_eval') and not RUN_CONFIG.training_file_pattern:
        raise RuntimeError('You must specify `training_file_pattern` for training.')

    if RUN_CONFIG.mode in ('eval', 'train_and_eval'):
        if not RUN_CONFIG.validation_file_pattern:
            raise RuntimeError('You must specify `validation_file_pattern` for evaluation.')

        if RUN_CONFIG.val_json_file == "" and not RUN_CONFIG.include_groundtruth_in_features:
            raise RuntimeError(
                'You must specify `val_json_file` or include_groundtruth_in_features=True for evaluation.')

        if not RUN_CONFIG.include_groundtruth_in_features and not os.path.isfile(RUN_CONFIG.val_json_file):
            raise FileNotFoundError("Validation JSON File not found: %s" % RUN_CONFIG.val_json_file)

    dllogger.init(backends=[dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                                       filename=RUN_CONFIG.log_path)])

    if RUN_CONFIG.mode in ('train', 'train_and_eval'):

        train_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.training_file_pattern,
            mode=tf.estimator.ModeKeys.TRAIN,
            num_examples=None,
            use_fake_data=RUN_CONFIG.use_fake_data,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed
        )

    else:
        train_input_fn = None

    if RUN_CONFIG.mode in ('eval', 'train_and_eval') or (RUN_CONFIG.mode == 'train' and RUN_CONFIG.eval_after_training):

        eval_input_fn = dataloader.InputReader(
            file_pattern=RUN_CONFIG.validation_file_pattern,
            mode=tf.estimator.ModeKeys.PREDICT,
            num_examples=RUN_CONFIG.eval_samples,
            use_fake_data=False,
            use_instance_mask=RUN_CONFIG.include_mask,
            seed=RUN_CONFIG.seed
        )

    else:
        eval_input_fn = None

    run_executer(RUN_CONFIG, train_input_fn, eval_input_fn)