Example #1
  def _tpu_estimator_spec_eval(self, features, logits, labels, loss,
                               losses_dict):
    """Construct EstimatorSpec for TPU EVAL mode."""
    del losses_dict
    hparams = self.hparams

    if not hasattr(hparams, "problem"):
      raise NotImplementedError(
          "hparams is missing attribute `problem`. NasSeq2Seq must "
          "be used with a problem.")

    problem = hparams.problem
    t2t_model.remove_summaries()
    eval_metrics_fn = t2t_model.create_tpu_eval_metrics_fn(problem, hparams)
    if isinstance(logits, dict):
      # For TPU, the logits dict will be passed as keyword arguments to
      # eval_metrics_fn, so we add the labels to those arguments here.
      logits.update({"labels": labels})
      return contrib.tpu().TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, logits),
          loss=loss)
    else:
      return contrib.tpu().TPUEstimatorSpec(
          tf.estimator.ModeKeys.EVAL,
          eval_metrics=(eval_metrics_fn, [logits, labels]),
          loss=loss)
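
A minimal sketch (not part of the original module) of the contract this spec relies on: for eval_metrics=(metrics_fn, tensors), TPUEstimator calls metrics_fn(*tensors) when tensors is a list and metrics_fn(**tensors) when it is a dict, which is why labels is merged into the logits dict above. The metric fn below is hypothetical and assumes dense integer labels alongside the logits.

import tensorflow.compat.v1 as tf

def example_eval_metrics_fn(logits, labels):
  # Hypothetical metric fn for the list case: TPUEstimator would invoke it as
  # example_eval_metrics_fn(logits, labels).
  predictions = tf.argmax(logits, axis=-1, output_type=labels.dtype)
  accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions)
  return {"accuracy": accuracy}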
Example #2
def create_run_config(model_name,
                      master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      optionally_use_dist_strat=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      use_tpu_estimator=False,
                      xla_jit_level=tf.OptimizerOptions.OFF,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None,
                      cloud_tpu_name="",
                      cloud_tpu_zone=None):
    """Create RunConfig, TPUConfig, and Parallelism object."""
    session_config = create_session_config(
        log_device_placement=log_device_placement,
        enable_graph_rewriter=enable_graph_rewriter,
        gpu_mem_fraction=gpu_mem_fraction,
        use_tpu=use_tpu,
        xla_jit_level=xla_jit_level,
        inter_op_parallelism_threads=inter_op_parallelism_threads,
        intra_op_parallelism_threads=intra_op_parallelism_threads)
    run_config_args = {
        "master": master,
        "evaluation_master": master,
        "model_dir": model_dir,
        "session_config": session_config,
        "save_summary_steps": 100,
        "save_checkpoints_steps": save_checkpoints_steps,
        "save_checkpoints_secs": save_checkpoints_secs,
        "keep_checkpoint_max": keep_checkpoint_max,
        "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
        "tf_random_seed": random_seed,
        "log_step_count_steps": log_step_count_steps,
    }
    if save_checkpoints_secs:
        del run_config_args["save_checkpoints_steps"]
    run_config_cls = contrib.learn().RunConfig

    if use_tpu or use_tpu_estimator:
        # If using TPUEstimator, use TPU RunConfig, add TPUConfig, and add
        # additional args.
        tpu_config_kwargs = {
            "iterations_per_loop": iterations_per_loop,
            "num_shards": num_shards,
            "per_host_input_for_training": True,
            "initial_infeed_sleep_secs": tpu_infeed_sleep_secs,
        }
        if tpu_config_extra_kwargs is not None:
            tpu_config_kwargs.update(tpu_config_extra_kwargs)
        run_config_cls = contrib.tpu().RunConfig
        tpu_config = contrib.tpu().TPUConfig(**tpu_config_kwargs)
        run_config_args["tpu_config"] = tpu_config
        if not master and "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS" in os.environ:
            # If running on TPU with no master set and the KUBE env var present,
            # we are running on ML Engine, so set the master from the env var.
            run_config_args["master"] = os.environ[
                "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"]
            run_config_args["evaluation_master"] = run_config_args["master"]
        elif not master and cloud_tpu_name:
            # Update run_config to use cluster instead of master/evaluation_master,
            # since we need the cluster spec to use Cloud TPU Pods.
            tpu_cluster_resolver = contrib.cluster_resolver(
            ).TPUClusterResolver(tpu=cloud_tpu_name, zone=cloud_tpu_zone)
            run_config_args["cluster"] = tpu_cluster_resolver
            del run_config_args["master"]
            del run_config_args["evaluation_master"]
    elif is_cloud_async_distributed():
        run_config_cls = tf.estimator.RunConfig
        del run_config_args["master"]
        del run_config_args["evaluation_master"]

    # tf.estimator RunConfig construction changed incompatibly in TF2: the
    # master now has to be specified via a global environment variable, so
    # drop these arguments here.
    if contrib.is_tf2:
        del run_config_args["evaluation_master"]
        del run_config_args["master"]

    config = run_config_cls(**run_config_args)

    # If not using TPU, add device info for data_parallelism
    config.use_tpu = use_tpu
    if not use_tpu:
        config.t2t_device_info = {
            "num_async_replicas": num_async_replicas,
        }
        use_distribution_strategy = (
            optionally_use_dist_strat
            and t2t_model.T2TModel.has_symmetric_shards(model_name)
            and not no_data_parallelism and ps_replicas == 0 and ps_gpu == 0
            and num_async_replicas == 1)

        if use_distribution_strategy:
            tf.logging.info(
                "Configuring MirroredStrategy DistributionStrategy to replicate the "
                "model.")
            distribution = contrib.distribute().MirroredStrategy()
            config = config.replace(train_distribute=distribution)
            config.data_parallelism = None
        else:
            tf.logging.info(
                "Configuring DataParallelism to replicate the model.")
            config.data_parallelism = devices.data_parallelism(
                daisy_chain_variables=daisy_chain_variables,
                ps_replicas=ps_replicas,
                ps_job=ps_job,
                ps_gpu=ps_gpu,
                schedule=schedule,
                sync=sync,
                worker_gpu=num_gpus,
                worker_replicas=num_async_replicas,
                worker_id=worker_id,
                gpu_order=gpu_order,
                worker_job=worker_job,
                no_data_parallelism=no_data_parallelism)

    return config
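
A minimal usage sketch, assuming this is the create_run_config defined in tensor2tensor.utils.trainer_lib; the model name must be registered and the model directory below is a placeholder.

from tensor2tensor.utils import trainer_lib

# GPU-only training config: checkpoint every 2000 steps under a placeholder dir.
run_config = trainer_lib.create_run_config(
    model_name="transformer",
    model_dir="/tmp/t2t_model",
    num_gpus=1,
    save_checkpoints_steps=2000,
    use_tpu=False)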
Example #3
def create_estimator(model_name,
                     hparams,
                     run_config,
                     schedule="train_and_evaluate",
                     decode_hparams=None,
                     use_tpu=False,
                     use_tpu_estimator=False,
                     use_xla=False,
                     export_saved_model_api_version=1,
                     use_guarantee_const_getter=False):
    """Create a T2T Estimator."""
    model_fn = t2t_model.T2TModel.make_estimator_model_fn(
        model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu)

    del use_xla
    if use_tpu or use_tpu_estimator:
        from tensorflow.contrib.tpu.python.tpu import tpu_estimator  # pylint: disable=g-import-not-at-top
        problem = hparams.problem
        batch_size = (problem.tpu_batch_size_per_shard(hparams) *
                      run_config.tpu_config.num_shards)
        mlperf_log.transformer_print(key=mlperf_log.INPUT_BATCH_SIZE,
                                     value=batch_size)
        if getattr(hparams, "mtf_mode", False):
            batch_size = problem.tpu_batch_size_per_shard(hparams)
        predict_batch_size = batch_size
        if decode_hparams and decode_hparams.batch_size:
            predict_batch_size = decode_hparams.batch_size
        if decode_hparams and run_config.tpu_config:
            decode_hparams.add_hparam(
                "iterations_per_loop",
                run_config.tpu_config.iterations_per_loop)
        if export_saved_model_api_version == 1:
            api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V1
            estimator_model_fn = model_fn
        elif export_saved_model_api_version == 2:
            api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V2

            def maybe_use_guarantee_const_getter_model_fn(
                    features, labels, mode, params):
                """Wrapper model_fn with guarantee_const getter."""
                if not use_guarantee_const_getter:
                    return model_fn(features, labels, mode, params)

                # It marks all weights as constant, which may improve TPU
                # inference performance because it prevents the weights from
                # being transferred to the TPU. It will increase HBM "program"
                # usage and reduce HBM "arguments" usage during TPU model
                # serving.
                def guarantee_const_getter(getter, name, *args, **kwargs):
                    with tf.control_dependencies(None):
                        return tf.guarantee_const(
                            getter(name, *args, **kwargs),
                            name=name + "/GuaranteeConst")

                @contextlib.contextmanager
                def guarantee_const_scope():
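                    # Temporarily install the guarantee_const getter and cache
                    # variable reads on the consuming op's device, restoring
                    # the previous getter and caching device on exit.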
                    var_scope = tf.get_variable_scope()
                    prev_custom_getter = var_scope.custom_getter
                    prev_caching_device = var_scope.caching_device
                    var_scope.set_custom_getter(guarantee_const_getter)
                    var_scope.set_caching_device(lambda op: op.device)
                    yield
                    var_scope.set_custom_getter(prev_custom_getter)
                    var_scope.set_caching_device(prev_caching_device)

                with guarantee_const_scope():
                    return model_fn(features, labels, mode, params)

            def tpu_model_fn(features, labels, mode, params):
                """Wrapper model_fn with tpu.rewrite / TPUPartitionedCall."""
                if mode == tf.estimator.ModeKeys.PREDICT and params["use_tpu"]:
                    batch_config = tpu_estimator.BatchConfig(
                        num_batch_threads=2,
                        max_batch_size=predict_batch_size,
                        batch_timeout_micros=60 * 1000,
                        allowed_batch_sizes=[predict_batch_size])
                    return tpu_estimator.model_fn_inference_on_tpu(
                        maybe_use_guarantee_const_getter_model_fn,
                        features=features,
                        labels=labels,
                        config=None,
                        params=params,
                        batch_config=batch_config)
                else:
                    return model_fn(features, labels, mode, params)

            estimator_model_fn = tpu_model_fn
        else:
            raise ValueError(
                "Flag export_saved_model_api_version must be 1 or 2.")
        estimator = contrib.tpu().TPUEstimator(
            model_fn=estimator_model_fn,
            model_dir=run_config.model_dir,
            config=run_config,
            use_tpu=use_tpu,
            train_batch_size=batch_size,
            eval_batch_size=batch_size if "eval" in schedule else None,
            predict_batch_size=predict_batch_size,
            export_saved_model_api_version=api_version_enum_name)
    else:
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            model_dir=run_config.model_dir,
            config=run_config,
        )
    return estimator
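
A minimal sketch of wiring hparams and a RunConfig into create_estimator, assuming these are the trainer_lib helpers from tensor2tensor and that the "transformer" model, "transformer_base" hparams set, and "translate_ende_wmt32k" problem are registered; the data and model directories are placeholders.

from tensor2tensor.utils import trainer_lib

hparams = trainer_lib.create_hparams(
    "transformer_base",
    data_dir="/tmp/t2t_data",  # placeholder; must contain the problem's vocab
    problem_name="translate_ende_wmt32k")
run_config = trainer_lib.create_run_config(
    "transformer", model_dir="/tmp/t2t_model", use_tpu=False)
estimator = trainer_lib.create_estimator(
    "transformer", hparams, run_config,
    schedule="train_and_evaluate", use_tpu=False)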
Example #4
def optimize(loss, learning_rate, hparams, use_tpu=False, variables=None):
    """Minimize loss."""
    loss = weight_decay_and_noise(loss, hparams, learning_rate)
    loss = tf.identity(loss, name="total_loss")
    if variables is None:
        variables = tf.trainable_variables()
    # Log trainable variable sizes.
    log_variable_sizes(variables, verbose=hparams.summarize_vars)
    # Log non-trainable variable sizes.
    non_trainable_variables = list(set(tf.global_variables()) - set(variables))
    log_variable_sizes(non_trainable_variables,
                       tag="Non-trainable variables",
                       verbose=hparams.summarize_vars)
    if hparams.summarize_vars:
        summarize_variables(variables)
        # Summarize non-trainable variables as well
        summarize_variables(non_trainable_variables,
                            tag="Non-trainable variables")
    diet_vars = [
        v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
    ]
    log_variable_sizes(diet_vars,
                       "Diet Variables",
                       verbose=hparams.summarize_vars)
    opt = ConditionalOptimizer(hparams.optimizer, learning_rate, hparams,
                               use_tpu)
    if use_tpu:
        opt = contrib.tpu().CrossShardOptimizer(opt)
    if getattr(hparams, "gpu_automatic_mixed_precision", False):
        if use_tpu:
            raise RuntimeError(
                "GPU auto mixed precision cannot be used with TPU")
        elif _mixed_precision_is_enabled(hparams):
            raise RuntimeError(
                "GPU auto mixed precision cannot be used with manual mixed precision"
            )
        else:
            setattr(opt, "_use_locking", "True")
            setattr(opt, "_name", "ConditionalOptimizer")
            opt = tf.train.experimental.enable_mixed_precision_graph_rewrite(
                opt)

    opt_summaries = []
    if common_layers.should_generate_summaries():
        tf.summary.scalar("learning_rate", learning_rate)
        opt_summaries.append("loss")
        if hparams.summarize_grads:
            tf.logging.info("Summarizing gradients")
            opt_summaries.extend(
                ["gradients", "gradient_norm", "global_gradient_norm"])

    if hparams.clip_grad_norm:
        tf.logging.info("Clipping gradients, norm: %0.5f",
                        hparams.clip_grad_norm)
    if hparams.grad_noise_scale:
        tf.logging.info("Adding noise to gradients, noise scale: %0.5f",
                        hparams.grad_noise_scale)

    train_op = contrib.layers().optimize_loss(
        name="training",
        loss=loss,
        global_step=tf.train.get_or_create_global_step(),
        learning_rate=learning_rate,
        clip_gradients=hparams.clip_grad_norm or None,
        gradient_noise_scale=hparams.grad_noise_scale or None,
        optimizer=opt,
        summaries=opt_summaries,
        colocate_gradients_with_ops=True,
        variables=variables)
    return train_op
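
A minimal sketch of calling optimize() on a toy loss, assuming the tensor2tensor modules import as below and that the "transformer_base" hparams set supplies the fields referenced above (optimizer, clip_grad_norm, grad_noise_scale, summarize_vars, etc.); the quadratic loss is a placeholder.

import tensorflow.compat.v1 as tf
from tensor2tensor.utils import learning_rate as lr_lib
from tensor2tensor.utils import optimize as t2t_optimize
from tensor2tensor.utils import trainer_lib

tf.disable_v2_behavior()  # the graph-mode APIs above require TF1 behavior

hparams = trainer_lib.create_hparams("transformer_base")
w = tf.get_variable("w", shape=[10], initializer=tf.zeros_initializer())
loss = tf.reduce_mean(tf.square(w - 1.0))  # toy quadratic loss
learning_rate = lr_lib.learning_rate_schedule(hparams)
train_op = t2t_optimize.optimize(loss, learning_rate, hparams, use_tpu=False)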