예제 #1
0
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu,
        **params.runtime.model_parallelism())

    with distribution_strategy.scope():
        task = classification_example.ClassificationExampleTask(params.task)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
예제 #2
0
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if "train" in FLAGS.mode:
    train_utils.serialize_config(params, model_dir)

  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu,
      **params.runtime.model_parallelism())

  with distribution_strategy.scope():
    if params.task.use_crf:
      task = ap_parsing_task.APParsingTaskCRF(params.task)
    else:
      task = ap_parsing_task.APParsingTaskBase(params.task)

    ckpt_exporter = train_utils.maybe_create_best_ckpt_exporter(
        params, model_dir)
    trainer = train_utils.create_trainer(
        params,
        task,
        train="train" in FLAGS.mode,
        evaluate=("eval" in FLAGS.mode),
        checkpoint_exporter=ckpt_exporter)

  model, _ = train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      trainer=trainer,
      model_dir=model_dir)

  train_utils.save_gin_config(FLAGS.mode, model_dir)

  # Export saved model.
  if "train" in FLAGS.mode:
    saved_model_path = os.path.join(model_dir, "saved_models/latest")
    logging.info("Exporting SavedModel to %s", saved_model_path)
    tf.saved_model.save(model, saved_model_path)

    if ckpt_exporter:
      logging.info("Loading best checkpoint for export")
      trainer.checkpoint.restore(ckpt_exporter.best_ckpt_path)
      saved_model_path = os.path.join(model_dir, "saved_models/best")

      # Make sure restored and not re-initialized.
      if trainer.global_step > 0:
        logging.info(
            "Exporting best saved model by %s (from global step: %d) to %s",
            params.trainer.best_checkpoint_eval_metric,
            trainer.global_step.numpy(), saved_model_path)
        tf.saved_model.save(trainer.model, saved_model_path)
예제 #3
0
def main(_):
    # TODO(b/177863554): consolidate to nlp/train.py
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    train_utils.serialize_config(params, model_dir)
    continuous_finetune_lib.run_continuous_finetune(
        FLAGS.mode, params, model_dir, pretrain_steps=FLAGS.pretrain_steps)
    train_utils.save_gin_config(FLAGS.mode, model_dir)
예제 #4
0
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)
    model_dir = FLAGS.model_dir
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    if 'train_and_eval' in FLAGS.mode:
        assert (
            params.task.train_data.feature_shape ==
            params.task.validation_data.feature_shape), (
                f'train {params.task.train_data.feature_shape} != validate '
                f'{params.task.validation_data.feature_shape}')

    if 'assemblenet' in FLAGS.experiment:
        if 'eval' in FLAGS.mode:
            # Use the feature shape in validation_data for all jobs. The number of
            # frames in train_data will be used to construct the Assemblenet model.
            params.task.model.backbone.assemblenet.num_frames = params.task.validation_data.feature_shape[
                0]
            shape = params.task.validation_data.feature_shape
        else:
            params.task.model.backbone.assemblenet.num_frames = params.task.train_data.feature_shape[
                0]
            shape = params.task.train_data.feature_shape
        logging.info('mode %r num_frames %r feature shape %r', FLAGS.mode,
                     params.task.model.backbone.assemblenet.num_frames, shape)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)
    distribution_strategy = distribute_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        tpu_address=params.runtime.tpu)
    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)
예제 #5
0
def main(_):
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
  params = train_utils.parse_configuration(FLAGS)
  model_dir = FLAGS.model_dir
  if 'train' in FLAGS.mode:
    # Pure eval modes do not output yaml files. Otherwise continuous eval job
    # may race against the train job for writing the same file.
    train_utils.serialize_config(params, model_dir)

  # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
  # can have significant impact on model speeds by utilizing float16 in case of
  # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
  # dtype is float16
  if params.runtime.mixed_precision_dtype:
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype)
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)

  if isinstance(params, cfg.ExperimentConfig):
    with distribution_strategy.scope():
      task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)

  elif isinstance(params, multi_cfg.MultiTaskExperimentConfig):
    with distribution_strategy.scope():
      task = multitask.MultiTask.from_config(params.task, model_dir)
      model = multihead_model.build_model(params.task)

    train_lib_multitask.run_experiment(
        distribution_strategy=distribution_strategy,
        task=task,
        model=model,
        mode=FLAGS.mode,
        params=params,
        model_dir=model_dir)

  else:
    raise ValueError("Expected config to be either type cfg.ExperimentConfig" + \
      "or multi_cfg.MultiTaskExperimentConfig, got %s" %type(params))

  train_utils.save_gin_config(FLAGS.mode, model_dir)
예제 #6
0
    performance.set_mixed_precision_policy(params.runtime.mixed_precision_dtype,
<<<<<<< HEAD
                                           params.runtime.loss_scale)
=======
                                           params.runtime.loss_scale,
                                           use_experimental_api=True)
>>>>>>> upstream/master
  distribution_strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=params.runtime.distribution_strategy,
      all_reduce_alg=params.runtime.all_reduce_alg,
      num_gpus=params.runtime.num_gpus,
      tpu_address=params.runtime.tpu)
  with distribution_strategy.scope():
    task = task_factory.get_task(params.task, logging_dir=model_dir)

  train_lib.run_experiment(
      distribution_strategy=distribution_strategy,
      task=task,
      mode=FLAGS.mode,
      params=params,
      model_dir=model_dir)

<<<<<<< HEAD
=======
  train_utils.save_gin_config(FLAGS.mode, model_dir)

>>>>>>> upstream/master
if __name__ == '__main__':
  tfm_flags.define_flags()
  app.run(main)
예제 #7
0
def main(_):
    gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_params)
    params = train_utils.parse_configuration(FLAGS)

    if params.runtime.num_hpus > 0:
        import os
        #TODO: remove when SW-49334 is fixed [SW-49404]
        os.environ["TF_DISABLE_EAGER_TO_FUNC_REWRITER"] = "1"
        from habana_frameworks.tensorflow import load_habana_module
        load_habana_module()

    if params.task.train_data.deterministic or params.task.validation_data.deterministic:
        import os
        os.environ['PYTHONHASHSEED'] = '0'
        os.environ['TF_DETERMINISTIC_OPS'] = '1'
        import numpy
        numpy.random.seed(0)
        import tensorflow as tf
        tf.random.set_seed(0)
        tf.compat.v1.set_random_seed(0)
        import random
        random.seed(0)

    if FLAGS.dtype == "bf16":
        print("Using bf16 config list {}".format(FLAGS.bf16_config_path))
        os.environ['TF_BF16_CONVERSION'] = FLAGS.bf16_config_path

    hls_addresses = str(os.environ.get("MULTI_HLS_IPS",
                                       "127.0.0.1")).split(",")
    TF_BASE_PORT = 2410
    mpi_rank = comm_rank()
    mpi_size = comm_size()

    if params.runtime.num_hpus > 1:
        model_dir = os.path.join(FLAGS.model_dir, "worker_" + str(mpi_rank))
    else:
        model_dir = FLAGS.model_dir

    #prepare a comma-seperated list of device addreses
    worker_list = []
    for address in hls_addresses:
        for rank in range(mpi_size // len(hls_addresses)):
            worker_list.append(address + ':' + str(TF_BASE_PORT + rank))
    worker_hosts = ",".join(worker_list)
    task_index = mpi_rank

    # Configures cluster spec for distribution strategy.
    distribution_utils.configure_cluster(worker_hosts, task_index)
    if 'train' in FLAGS.mode:
        # Pure eval modes do not output yaml files. Otherwise continuous eval job
        # may race against the train job for writing the same file.
        train_utils.serialize_config(params, model_dir)

    # Sets mixed_precision policy. Using 'mixed_float16' or 'mixed_bfloat16'
    # can have significant impact on model speeds by utilizing float16 in case of
    # GPUs, and bfloat16 in the case of TPUs. loss_scale takes effect only when
    # dtype is float16
    if params.runtime.mixed_precision_dtype:
        performance.set_mixed_precision_policy(
            params.runtime.mixed_precision_dtype)

    distribution_strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=params.runtime.distribution_strategy,
        all_reduce_alg=params.runtime.all_reduce_alg,
        num_gpus=params.runtime.num_gpus,
        num_hpus=params.runtime.num_hpus,
        tpu_address=params.runtime.tpu)

    with distribution_strategy.scope():
        task = task_factory.get_task(params.task, logging_dir=model_dir)

    train_lib.run_experiment(distribution_strategy=distribution_strategy,
                             task=task,
                             mode=FLAGS.mode,
                             params=params,
                             model_dir=model_dir)

    train_utils.save_gin_config(FLAGS.mode, model_dir)