Example #1
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu)
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
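
Each of these `main(_)` entry points assumes that the absl flags it reads (`--experiment`, `--mode`, `--model_dir`, `--gin_file`, `--gin_params`, ...) have already been defined before `app.run` hands control to it. A minimal sketch of that wiring, assuming the shared flag definitions live in `official.common.flags` (the module path and the exact list of required flags are assumptions and may differ per project):

from absl import app
from absl import flags

from official.common import flags as tfm_flags  # assumed location of the shared flag definitions

FLAGS = flags.FLAGS

if __name__ == '__main__':
    tfm_flags.define_flags()  # defines --experiment, --mode, --model_dir, --gin_file, --gin_params, ...
    flags.mark_flags_as_required(['experiment', 'mode', 'model_dir'])
    app.run(main)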
Example #2
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    train_utils.serialize_config(params, model_dir)
    run_continuous_finetune(FLAGS.mode, params, model_dir,
                            FLAGS.pretrain_steps)
Example #3
    def test_recovery_nan_error(self, distribution_strategy, flag_mode):
        model_dir = self.get_temp_dir()
        flags_dict = dict(experiment='mock',
                          mode=flag_mode,
                          model_dir=model_dir,
                          params_override=json.dumps(self._test_config))
        with flagsaver.flagsaver(**flags_dict):
            params = train_utils.parse_configuration(flags.FLAGS)
            train_utils.serialize_config(params, model_dir)
            with distribution_strategy.scope():
                # task = task_factory.get_task(params.task, logging_dir=model_dir)
                task = mock_task.MockTask(params.task, logging_dir=model_dir)

                # Set the loss to NaN to trigger a RuntimeError.
                def build_losses(labels, model_outputs, aux_losses=None):
                    del labels, model_outputs
                    return tf.constant([np.nan], tf.float32) + aux_losses

                task.build_losses = build_losses

            with self.assertRaises(RuntimeError):
                train_lib.run_experiment(
                    distribution_strategy=distribution_strategy,
                    task=task,
                    mode=flag_mode,
                    params=params,
                    model_dir=model_dir)
Example #4
    def test_end_to_end(self, distribution_strategy, flag_mode, run_post_eval):
        model_dir = self.get_temp_dir()
        flags_dict = dict(experiment='mock',
                          mode=flag_mode,
                          model_dir=model_dir,
                          params_override=json.dumps(self._test_config))
        with flagsaver.flagsaver(**flags_dict):
            params = train_utils.parse_configuration(flags.FLAGS)
            train_utils.serialize_config(params, model_dir)
            with distribution_strategy.scope():
                task = task_factory.get_task(params.task,
                                             logging_dir=model_dir)

            _, logs = train_lib.run_experiment(
                distribution_strategy=distribution_strategy,
                task=task,
                mode=flag_mode,
                params=params,
                model_dir=model_dir,
                run_post_eval=run_post_eval)

        if run_post_eval:
            self.assertNotEmpty(logs)
        else:
            self.assertEmpty(logs)
        self.assertNotEmpty(
            tf.io.gfile.glob(os.path.join(model_dir, 'params.yaml')))
        if flag_mode != 'eval':
            self.assertNotEmpty(
                tf.io.gfile.glob(os.path.join(model_dir, 'checkpoint')))
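
Both test methods receive `distribution_strategy` and `flag_mode` from a parameterized decorator that is not shown here (the upstream tests typically use tf.distribute's test combinations). A minimal sketch of one way to drive such methods with absl's `parameterized`; the class name, method name, and parameter values below are illustrative only:

from absl.testing import parameterized
import tensorflow as tf


class TrainLibTest(tf.test.TestCase, parameterized.TestCase):

    @parameterized.parameters(('train',), ('eval',), ('train_and_eval',))
    def test_end_to_end_one_device(self, flag_mode):
        distribution_strategy = tf.distribute.OneDeviceStrategy('/cpu:0')
        # ...reuse the body of test_end_to_end above with run_post_eval=True...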
Example #5
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if "train" in FLAGS.mode:
    train_utils.serialize_config(params, model_dir)

  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())

  with distribution_strategy.scope():
    if params.task.use_crf:
      task = ap_parsing_task.APParsingTaskCRF(params.task)
    else:
      task = ap_parsing_task.APParsingTaskBase(params.task)

    ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter(
        params, model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train="train" in FLAGS.mode,
        evaluate=("eval" in FLAGS.mode),
        checkpoint_exporter=ckpt_exporter)

  model, _ = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      trainer=trainer,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)

  # Export saved model.
  if "train" in FLAGS.mode:
    saved_model_path = os.path.join(model_dir, "saved_models/latest")
    logging.info("Exporting SavedModel to %s", saved_model_path)
    tf.saved_model.save(model, saved_model_path)

    if ckpt_exporter:
      logging.info("Loading best checkpoint for export")
      trainer.checkpoint.restore(ckpt_exporter.best_ckpt_path)
      saved_model_path = os.path.join(model_dir, "saved_models/best")

      # Make sure restored and not re-initialized.
      if trainer.global_step > 0:
        logging.info(
            "Exporting best saved model by %s (from global step: %d) to %s",
            params.trainer.best_checkpoint_eval_metric,
            trainer.global_step.numpy(), saved_model_path)
        tf.saved_model.save(trainer.model, saved_model_path)
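
Once exported, the SavedModel directories written above can be reloaded for inference with the standard TensorFlow API. A short sketch, using an illustrative model_dir path:

import tensorflow as tf

reloaded = tf.saved_model.load('/tmp/model_dir/saved_models/best')  # path layout from the example above
print(list(reloaded.signatures))  # inspect any serving signatures attached at save time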
Example #6
def main(_):
    # TODO(b/177863554): consolidate to nlp/train.py
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    train_utils.serialize_config(params, model_dir)
    continuous_finetune_lib.run_continuous_finetune(
        FLAGS.mode, params, model_dir, pretrain_steps=FLAGS.pretrain_steps)
    train_utils.save_gin_config(FLAGS.mode, model_dir)
Example #7
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    if 'train_and_eval' in FLAGS.mode:
        assert (
            params.task.train_data.feature_shape ==
            params.task.validation_data.feature_shape), (
                f'train {params.task.train_data.feature_shape} != validate '
                f'{params.task.validation_data.feature_shape}')

    if 'assemblenet' in FLAGS.experiment:
        if 'eval' in FLAGS.mode:
            # Use the feature shape in validation_data for all jobs. The number of
            # frames in train_data will be used to construct the Assemblenet model.
            params.task.model.backbone.assemblenet.num_frames = (
                params.task.validation_data.feature_shape[0])
            shape = params.task.validation_data.feature_shape
        else:
            params.task.model.backbone.assemblenet.num_frames = (
                params.task.train_data.feature_shape[0])
            shape = params.task.train_data.feature_shape
        logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                     params.task.model.backbone.assemblenet.num_frames, shape)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu)
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
Example #8
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype,
            params.runtime.loss_scale,
            use_experimental_api=True)

    input_partition_dims = None
    if FLAGS.mode == 'train_and_eval':
        if np.prod(params.task.train_input_partition_dims) != np.prod(
                params.task.eval_input_partition_dims):
            raise ValueError('Train and eval input partition dims can not be '
                             'partitioned on the same node')
        else:
            input_partition_dims = get_computation_shape_for_model_parallelism(
                params.task.train_input_partition_dims)
    elif FLAGS.mode == 'train':
        if params.task.train_input_partition_dims:
            input_partition_dims = get_computation_shape_for_model_parallelism(
                params.task.train_input_partition_dims)
    elif FLAGS.mode == 'eval' or FLAGS.mode == 'continuous_eval':
        if params.task.eval_input_partition_dims:
            input_partition_dims = get_computation_shape_for_model_parallelism(
                params.task.eval_input_partition_dims)

    distribution_strategy = create_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        num_gpus=params.runtime.num_gpus,
        input_partition_dims=input_partition_dims,
        tpu_address=params.runtime.tpu)
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)
Example #9
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
                                           params.runtime.loss_scale)
Example #10
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    superglue_flags.validate_flags(FLAGS, file_exists_fn=tf.io.gfile.exists)

    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus,
        tpu_address=FLAGS.tpu)

    with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
        input_meta_data = json.loads(reader.read().decode('utf-8'))

    with distribution_strategy.scope():
        task = None
        if 'train_eval' in FLAGS.mode:
            logging.info('Starting training and eval...')
            logging.info('Model dir: %s', FLAGS.model_dir)

            exp_config = _get_exp_config(input_meta_data=input_meta_data,
                                         exp_config_files=FLAGS.config_file)
            train_utils.serialize_config(exp_config, FLAGS.model_dir)
            task = task_factory.get_task(exp_config.task,
                                         logging_dir=FLAGS.model_dir)
            train_lib.run_experiment(
                distribution_strategy=distribution_strategy,
                task=task,
                mode='train_and_eval',
                params=exp_config,
                model_dir=FLAGS.model_dir)

        if 'predict' in FLAGS.mode:
            logging.info('Starting predict...')
            # When mode is `predict`, `task` will be None.
            if task is None:
                exp_config = _get_exp_config(input_meta_data=input_meta_data,
                                             exp_config_files=[
                                                 os.path.join(
                                                     FLAGS.model_dir,
                                                     'params.yaml')
                                             ])
                task = task_factory.get_task(exp_config.task,
                                             logging_dir=FLAGS.model_dir)
            _write_submission_file(task, input_meta_data['max_seq_length'])
Example #11
    def test_recovery(self, distribution_strategy, flag_mode):
        loss_threshold = 1.0
        model_dir = self.get_temp_dir()
        flags_dict = dict(experiment='mock',
                          mode=flag_mode,
                          model_dir=model_dir,
                          params_override=json.dumps(self._test_config))
        with flagsaver.flagsaver(**flags_dict):
            params = train_utils.parse_configuration(flags.FLAGS)
            params.trainer.loss_upper_bound = loss_threshold
            params.trainer.recovery_max_trials = 1
            train_utils.serialize_config(params, model_dir)
            with distribution_strategy.scope():
                task = task_factory.get_task(params.task,
                                             logging_dir=model_dir)

            # Saves a checkpoint for reference.
            model = task.build_model()
            checkpoint = tf.train.Checkpoint(model=model)
            checkpoint_manager = tf.train.CheckpointManager(
                checkpoint, self.get_temp_dir(), max_to_keep=2)
            checkpoint_manager.save()
            before_weights = model.get_weights()

            def build_losses(labels, model_outputs, aux_losses=None):
                del labels, model_outputs
                return tf.constant([loss_threshold], tf.float32) + aux_losses

            task.build_losses = build_losses

            model, _ = train_lib.run_experiment(
                distribution_strategy=distribution_strategy,
                task=task,
                mode=flag_mode,
                params=params,
                model_dir=model_dir)
            after_weights = model.get_weights()
            for left, right in zip(before_weights, after_weights):
                self.assertAllEqual(left, right)
Example #12
def config_override(params, flags_obj):
    """Override ExperimentConfig according to flags."""
    # Change runtime.tpu to the real tpu.
    params.override({'runtime': {
        'tpu': flags_obj.tpu,
    }})

    # Get the first level of override from `--config_file`.
    #   `--config_file` is typically used as a template that specifies the common
    #   override for a particular experiment.
    for config_file in flags_obj.config_file or []:
        params = hyperparams.override_params_dict(params,
                                                  config_file,
                                                  is_strict=True)

    # Get the second level of override from `--params_override`.
    #   `--params_override` is typically used as a further override over the
    #   template. For example, one may define a particular template for training
    #   ResNet50 on ImageNet in a config file and pass it via `--config_file`,
    #   then define different learning rates and pass it via `--params_override`.
    if flags_obj.params_override:
        params = hyperparams.override_params_dict(params,
                                                  flags_obj.params_override,
                                                  is_strict=True)

    params.validate()
    params.lock()

    pp = pprint.PrettyPrinter()
    logging.info('Final experiment parameters: %s',
                 pp.pformat(params.as_dict()))

    model_dir = flags_obj.model_dir
    if 'train' in flags_obj.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    return params
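
A hedged usage sketch of the layering that `config_override` implements: the base config comes from the experiment registry, `--config_file` templates are applied first, then `--params_override` on top. The experiment name, flag values, and the stand-in FLAGS object below are illustrative only:

from official.core import exp_factory

params = exp_factory.get_exp_config('resnet_imagenet')  # illustrative registered experiment


class _Flags:  # minimal stand-in for the parsed absl FLAGS object
    tpu = ''
    config_file = []  # optionally a list of YAML templates, applied first
    params_override = {'trainer': {'train_steps': 100}}  # applied second, on top of the templates
    mode = 'eval'  # any 'train*' mode would also serialize params.yaml into model_dir
    model_dir = '/tmp/model_dir'


params = config_override(params, _Flags())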
Example #13
def main(_) -> None:
  """Train and evaluate the Ranking model."""
  params = train_utils.parse_configuration(FLAGS)
  mode = FLAGS.mode
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  if FLAGS.seed is not None:
    logging.info('Setting tf seed.')
    tf.random.set_seed(FLAGS.seed)

  task = RankingTask(
      params=params.task,
      optimizer_config=params.trainer.optimizer_config,
      logging_dir=model_dir,
      steps_per_execution=params.trainer.steps_per_loop,
      name='RankingTask')

  enable_tensorboard = params.trainer.callbacks.enable_tensorboard

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  with strategy.scope():
    model = task.build_model()

  def get_dataset_fn(params):
    return lambda input_context: task.build_inputs(params, input_context)

  train_dataset = None
  if 'train' in mode:
    train_dataset = strategy.distribute_datasets_from_function(
        get_dataset_fn(params.task.train_data),
        options=tf.distribute.InputOptions(experimental_fetch_to_device=False))

  validation_dataset = None
  if 'eval' in mode:
    validation_dataset = strategy.distribute_datasets_from_function(
        get_dataset_fn(params.task.validation_data),
        options=tf.distribute.InputOptions(experimental_fetch_to_device=False))

  if params.trainer.use_orbit:
    with strategy.scope():
      checkpoint_exporter = train_utils.maybe_create_best_ckpt_exporter(
          params, model_dir)
      trainer = RankingTrainer(
          config=params,
          task=task,
          model=model,
          optimizer=model.optimizer,
          train='train' in mode,
          evaluate='eval' in mode,
          train_dataset=train_dataset,
          validation_dataset=validation_dataset,
          checkpoint_exporter=checkpoint_exporter)

    train_lib.run_experiment(
        distribution_strategy=strategy,
        task=task,
        mode=mode,
        params=params,
        model_dir=model_dir,
        trainer=trainer)

  else:  # Compile/fit
    checkpoint = tf.train.Checkpoint(model=model, optimizer=model.optimizer)

    latest_checkpoint = tf.train.latest_checkpoint(model_dir)
    if latest_checkpoint:
      checkpoint.restore(latest_checkpoint)
      logging.info('Loaded checkpoint %s', latest_checkpoint)

    checkpoint_manager = tf.train.CheckpointManager(
        checkpoint,
        directory=model_dir,
        max_to_keep=params.trainer.max_to_keep,
        step_counter=model.optimizer.iterations,
        checkpoint_interval=params.trainer.checkpoint_interval)
    checkpoint_callback = keras_utils.SimpleCheckpoint(checkpoint_manager)

    time_callback = keras_utils.TimeHistory(
        params.task.train_data.global_batch_size,
        params.trainer.time_history.log_steps,
        logdir=model_dir if enable_tensorboard else None)
    callbacks = [checkpoint_callback, time_callback]

    if enable_tensorboard:
      tensorboard_callback = tf.keras.callbacks.TensorBoard(
          log_dir=model_dir,
          update_freq=min(1000, params.trainer.validation_interval),
          profile_batch=FLAGS.profile_steps)
      callbacks.append(tensorboard_callback)

    num_epochs = (params.trainer.train_steps //
                  params.trainer.validation_interval)
    current_step = model.optimizer.iterations.numpy()
    initial_epoch = current_step // params.trainer.validation_interval

    eval_steps = params.trainer.validation_steps if 'eval' in mode else None

    if mode in ['train', 'train_and_eval']:
      logging.info('Training started')
      history = model.fit(
          train_dataset,
          initial_epoch=initial_epoch,
          epochs=num_epochs,
          steps_per_epoch=params.trainer.validation_interval,
          validation_data=validation_dataset,
          validation_steps=eval_steps,
          callbacks=callbacks,
      )
      model.summary()
      logging.info('Train history: %s', history.history)
    elif mode == 'eval':
      logging.info('Evaluation started')
      validation_output = model.evaluate(validation_dataset, steps=eval_steps)
      logging.info('Evaluation output: %s', validation_output)
    else:
      raise NotImplementedError('The mode is not implemented: %s' % mode)
Example #14
def export(input_type: str,
           batch_size: Optional[int],
           input_image_size: List[int],
           params: cfg.ExperimentConfig,
           checkpoint_path: str,
           export_dir: str,
           num_channels: Optional[int] = 3,
           export_module: Optional[export_base.ExportModule] = None,
           export_checkpoint_subdir: Optional[str] = None,
           export_saved_model_subdir: Optional[str] = None,
           save_options: Optional[tf.saved_model.SaveOptions] = None):
    """Exports the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir
      to store checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir
      to store saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`.
  """

    if export_checkpoint_subdir:
        output_checkpoint_directory = os.path.join(export_dir,
                                                   export_checkpoint_subdir)
    else:
        output_checkpoint_directory = None

    if export_saved_model_subdir:
        output_saved_model_directory = os.path.join(export_dir,
                                                    export_saved_model_subdir)
    else:
        output_saved_model_directory = export_dir

    if export_module is None:
        export_module = export_module_factory.get_export_module(
            params,
            input_type=input_type,
            batch_size=batch_size,
            input_image_size=input_image_size,
            num_channels=num_channels)

    export_base.export(export_module,
                       function_keys=[input_type],
                       export_savedmodel_dir=output_saved_model_directory,
                       checkpoint_path=checkpoint_path,
                       timestamped=False,
                       save_options=save_options)

    if output_checkpoint_directory:
        ckpt = tf.train.Checkpoint(model=export_module.model)
        ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
    train_utils.serialize_config(params, export_dir)
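
A hedged invocation sketch for the export helper above; the experiment name, checkpoint path, and output paths are placeholders:

from official.core import exp_factory

params = exp_factory.get_exp_config('resnet_imagenet')  # illustrative experiment config
export(
    input_type='image_tensor',
    batch_size=1,
    input_image_size=[224, 224],
    params=params,
    checkpoint_path='/tmp/model_dir',  # checkpoint prefix or a directory containing one
    export_dir='/tmp/export',
    export_checkpoint_subdir='checkpoint',
    export_saved_model_subdir='saved_model')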
Example #15
def export_inference_graph(
        input_type: str,
        batch_size: Optional[int],
        input_image_size: List[int],
        params: cfg.ExperimentConfig,
        checkpoint_path: str,
        export_dir: str,
        num_channels: Optional[int] = 3,
        export_module: Optional[export_base.ExportModule] = None,
        export_checkpoint_subdir: Optional[str] = None,
        export_saved_model_subdir: Optional[str] = None,
        save_options: Optional[tf.saved_model.SaveOptions] = None,
        log_model_flops_and_params: bool = False,
        checkpoint: Optional[tf.train.Checkpoint] = None,
        input_name: Optional[str] = None):
    """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example` or `tflite`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir
      to store checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir
      to store saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`.
    log_model_flops_and_params: If True, writes model FLOPs to model_flops.txt
      and model parameters to model_params.txt.
    checkpoint: An optional tf.train.Checkpoint. If provided, the export module
      will use it to read the weights.
    input_name: The input tensor name, default at `None` which produces input
      tensor name `inputs`.
  """

    if export_checkpoint_subdir:
        output_checkpoint_directory = os.path.join(export_dir,
                                                   export_checkpoint_subdir)
    else:
        output_checkpoint_directory = None

    if export_saved_model_subdir:
        output_saved_model_directory = os.path.join(export_dir,
                                                    export_saved_model_subdir)
    else:
        output_saved_model_directory = export_dir

    # TODO(arashwan): Offers a direct path to use ExportModule with Task objects.
    if not export_module:
        if isinstance(params.task,
                      configs.image_classification.ImageClassificationTask):
            export_module = image_classification.ClassificationModule(
                params=params,
                batch_size=batch_size,
                input_image_size=input_image_size,
                input_type=input_type,
                num_channels=num_channels,
                input_name=input_name)
        elif isinstance(params.task,
                        configs.retinanet.RetinaNetTask) or isinstance(
                            params.task, configs.maskrcnn.MaskRCNNTask):
            export_module = detection.DetectionModule(
                params=params,
                batch_size=batch_size,
                input_image_size=input_image_size,
                input_type=input_type,
                num_channels=num_channels,
                input_name=input_name)
        elif isinstance(
                params.task,
                configs.semantic_segmentation.SemanticSegmentationTask):
            export_module = semantic_segmentation.SegmentationModule(
                params=params,
                batch_size=batch_size,
                input_image_size=input_image_size,
                input_type=input_type,
                num_channels=num_channels,
                input_name=input_name)
        elif isinstance(params.task,
                        configs.video_classification.VideoClassificationTask):
            export_module = video_classification.VideoClassificationModule(
                params=params,
                batch_size=batch_size,
                input_image_size=input_image_size,
                input_type=input_type,
                num_channels=num_channels,
                input_name=input_name)
        else:
            raise ValueError(
                'Export module not implemented for {} task.'.format(
                    type(params.task)))

    export_base.export(export_module,
                       function_keys=[input_type],
                       export_savedmodel_dir=output_saved_model_directory,
                       checkpoint=checkpoint,
                       checkpoint_path=checkpoint_path,
                       timestamped=False,
                       save_options=save_options)

    if output_checkpoint_directory:
        ckpt = tf.train.Checkpoint(model=export_module.model)
        ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
    train_utils.serialize_config(params, export_dir)

    if log_model_flops_and_params:
        inputs_kwargs = None
        if isinstance(
                params.task,
            (configs.retinanet.RetinaNetTask, configs.maskrcnn.MaskRCNNTask)):
            # Create the inputs_kwargs argument to specify input shapes for a
            # subclassed model whose call() takes multiple inputs, e.g., the
            # RetinaNet model.
            inputs_kwargs = {
                'images':
                tf.TensorSpec([1] + input_image_size + [num_channels],
                              tf.float32),
                'image_shape':
                tf.TensorSpec([1, 2], tf.float32)
            }
            dummy_inputs = {
                k: tf.ones(v.shape.as_list(), tf.float32)
                for k, v in inputs_kwargs.items()
            }
            # Must do forward pass to build the model.
            export_module.model(**dummy_inputs)
        else:
            logging.info(
                'Logging model flops and params not implemented for %s task.',
                type(params.task))
            return
        train_utils.try_count_flops(
            export_module.model, inputs_kwargs,
            os.path.join(export_dir, 'model_flops.txt'))
        train_utils.write_model_params(
            export_module.model, os.path.join(export_dir, 'model_params.txt'))
Example #16
def export_inference_graph(
    input_type: str,
    batch_size: Optional[int],
    input_image_size: List[int],
    params: cfg.ExperimentConfig,
    checkpoint_path: str,
    export_dir: str,
    num_channels: Optional[int] = 3,
    export_module: Optional[export_base.ExportModule] = None,
    export_checkpoint_subdir: Optional[str] = None,
    export_saved_model_subdir: Optional[str] = None,
    save_options: Optional[tf.saved_model.SaveOptions] = None,
    argmax_outputs: bool = False,
    visualise_outputs: bool = False,
    class_present_outputs: bool = False):
  """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    num_channels: The number of input image channels.
    export_module: Optional export module to be used instead of using params
      to create one. If None, the params will be used to create an export
      module.
    export_checkpoint_subdir: Optional subdirectory under export_dir
      to store checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir
      to store saved model.
    save_options: `SaveOptions` for `tf.saved_model.save`.

  (applicable for Segmentation and MultiTask export definitions only)
    argmax_outputs: Set true to argmax the last channel of all outputs.
    visualise_outputs: Set true to apply colormap to all single channel outputs.
    class_present_outputs: Set true to gather unique values of outputs.
  """

  if export_checkpoint_subdir:
    output_checkpoint_directory = os.path.join(
        export_dir, export_checkpoint_subdir)
  else:
    output_checkpoint_directory = export_dir

  if export_saved_model_subdir:
    output_saved_model_directory = os.path.join(
        export_dir, export_saved_model_subdir)
  else:
    output_saved_model_directory = export_dir

  # TODO(arashwan): Offers a direct path to use ExportModule with Task objects.
  if not export_module:
    if isinstance(params.task,
                  configs.image_classification.ImageClassificationTask):
      export_module = image_classification.ClassificationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task, configs.retinanet.RetinaNetTask) or isinstance(
        params.task, configs.maskrcnn.MaskRCNNTask):
      export_module = detection.DetectionModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task,
                    configs.semantic_segmentation.SemanticSegmentationTask):
      export_module = semantic_segmentation.SegmentationModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels,
          argmax_outputs=argmax_outputs,
          visualise_outputs=visualise_outputs)
    elif isinstance(params.task, configs.yolo.YoloTask):
      export_module = yolo.YoloModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels)
    elif isinstance(params.task, multi_cfg.MultiTaskConfig):
      export_module = multitask.MultitaskModule(
          params=params,
          batch_size=batch_size,
          input_image_size=input_image_size,
          num_channels=num_channels,
          argmax_outputs=argmax_outputs,
          visualise_outputs=visualise_outputs,
          class_present_outputs=class_present_outputs)
    else:
      raise ValueError('Export module not implemented for {} task.'.format(
          type(params.task)))

  export_base.export(
      export_module,
      function_keys=[input_type],
      export_savedmodel_dir=output_saved_model_directory,
      checkpoint_path=checkpoint_path,
      timestamped=False,
      save_options=save_options)

  ckpt = tf.train.Checkpoint(model=export_module.model)
  ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))
  train_utils.serialize_config(params, export_dir)
Example #17
def export_inference_graph(input_type,
                           batch_size,
                           input_image_size,
                           params,
                           checkpoint_path,
                           export_dir,
                           export_checkpoint_subdir=None,
                           export_saved_model_subdir=None):
    """Exports inference graph for the model specified in the exp config.

  Saved model is stored at export_dir/saved_model, checkpoint is saved
  at export_dir/checkpoint, and params is saved at export_dir/params.yaml.

  Args:
    input_type: One of `image_tensor`, `image_bytes`, `tf_example`.
    batch_size: 'int', or None.
    input_image_size: List or Tuple of height and width.
    params: Experiment params.
    checkpoint_path: Trained checkpoint path or directory.
    export_dir: Export directory path.
    export_checkpoint_subdir: Optional subdirectory under export_dir
      to store checkpoint.
    export_saved_model_subdir: Optional subdirectory under export_dir
      to store saved model.
  """

    if export_checkpoint_subdir:
        output_checkpoint_directory = os.path.join(export_dir,
                                                   export_checkpoint_subdir)
    else:
        output_checkpoint_directory = export_dir

    if export_saved_model_subdir:
        output_saved_model_directory = os.path.join(export_dir,
                                                    export_saved_model_subdir)
    else:
        output_saved_model_directory = export_dir

    if isinstance(params.task,
                  configs.image_classification.ImageClassificationTask):
        export_module = image_classification.ClassificationModule(
            params=params,
            batch_size=batch_size,
            input_image_size=input_image_size)
    elif isinstance(params.task,
                    configs.retinanet.RetinaNetTask) or isinstance(
                        params.task, configs.maskrcnn.MaskRCNNTask):
        export_module = detection.DetectionModule(
            params=params,
            batch_size=batch_size,
            input_image_size=input_image_size)
    else:
        raise ValueError('Export module not implemented for {} task.'.format(
            type(params.task)))

    model = export_module.build_model()

    ckpt = tf.train.Checkpoint(model=model)

    ckpt_dir_or_file = checkpoint_path
    if tf.io.gfile.isdir(ckpt_dir_or_file):
        ckpt_dir_or_file = tf.train.latest_checkpoint(ckpt_dir_or_file)
    status = ckpt.restore(ckpt_dir_or_file).expect_partial()

    if input_type == 'image_tensor':
        input_signature = tf.TensorSpec(
            shape=[batch_size, input_image_size[0], input_image_size[1], 3],
            dtype=tf.uint8)
        signatures = {
            'serving_default':
            export_module.inference_from_image_tensors.get_concrete_function(
                input_signature)
        }
    elif input_type == 'image_bytes':
        input_signature = tf.TensorSpec(shape=[batch_size], dtype=tf.string)
        signatures = {
            'serving_default':
            export_module.inference_from_image_bytes.get_concrete_function(
                input_signature)
        }
    elif input_type == 'tf_example':
        input_signature = tf.TensorSpec(shape=[batch_size], dtype=tf.string)
        signatures = {
            'serving_default':
            export_module.inference_from_tf_example.get_concrete_function(
                input_signature)
        }
    else:
        raise ValueError('Unrecognized `input_type`')

    status.assert_existing_objects_matched()

    ckpt.save(os.path.join(output_checkpoint_directory, 'ckpt'))

    tf.saved_model.save(export_module,
                        output_saved_model_directory,
                        signatures=signatures)

    train_utils.serialize_config(params, export_dir)
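
The SavedModel written above carries an explicit `serving_default` signature. A short sketch of loading it back and inspecting the expected input and outputs (the export path is illustrative):

import tensorflow as tf

imported = tf.saved_model.load('/tmp/export')  # output_saved_model_directory from the example above
serve = imported.signatures['serving_default']
print(serve.structured_input_signature)  # e.g. a uint8 image tensor when input_type='image_tensor'
print(serve.structured_outputs)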
Example #18
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        import os
        #TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if params.task.train_data.deterministic or params.task.validation_data.deterministic:
        import os
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    hls_addresses = str(os.environ.get("MULTI_HLS_IPS",
                                       "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    # Prepare a comma-separated list of device addresses.
    worker_list = []
    for address in hls_addresses:
        for rank in range(mpi_size // len(hls_addresses)):
            worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
    worker_hosts = ",".join(worker_list)
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)