def _tpu_estimator_spec_eval(self, features, logits, labels, loss, losses_dict):
  """Construct EstimatorSpec for TPU EVAL mode."""
  del losses_dict  # Unused in EVAL mode.
  hparams = self.hparams
  # A Problem instance is required to build the TPU eval metric fns.
  if not hasattr(hparams, "problem"):
    raise NotImplementedError(
        "hparams is missing attribute `problem`. NasSeq2Seq must "
        "be used with a problem.")

  # TPU summaries are not supported in EVAL mode; drop any that were created.
  t2t_model.remove_summaries()
  metrics_fn = t2t_model.create_tpu_eval_metrics_fn(hparams.problem, hparams)

  if isinstance(logits, dict):
    # Keyword form: the logits dict is passed to the metrics fn as keyword
    # arguments, so fold the labels in under the "labels" key.
    logits["labels"] = labels
    metric_args = logits
  else:
    # Positional form: the metrics fn receives [logits, labels].
    metric_args = [logits, labels]

  return contrib.tpu().TPUEstimatorSpec(
      tf.estimator.ModeKeys.EVAL,
      eval_metrics=(metrics_fn, metric_args),
      loss=loss)
def create_run_config(model_name,
                      master="",
                      model_dir=None,
                      iterations_per_loop=1000,
                      num_shards=8,
                      log_device_placement=False,
                      save_checkpoints_steps=1000,
                      save_checkpoints_secs=None,
                      keep_checkpoint_max=20,
                      keep_checkpoint_every_n_hours=10000,
                      num_gpus=1,
                      gpu_order="",
                      num_async_replicas=1,
                      enable_graph_rewriter=False,
                      gpu_mem_fraction=0.95,
                      no_data_parallelism=False,
                      optionally_use_dist_strat=False,
                      daisy_chain_variables=True,
                      schedule="continuous_train_and_eval",
                      worker_job="/job:localhost",
                      worker_id=0,
                      ps_replicas=0,
                      ps_job="/job:ps",
                      ps_gpu=0,
                      random_seed=None,
                      sync=False,
                      tpu_infeed_sleep_secs=None,
                      use_tpu=False,
                      use_tpu_estimator=False,
                      xla_jit_level=tf.OptimizerOptions.OFF,
                      inter_op_parallelism_threads=0,
                      log_step_count_steps=100,
                      intra_op_parallelism_threads=0,
                      tpu_config_extra_kwargs=None,
                      cloud_tpu_name="",
                      cloud_tpu_zone=None):
  """Create RunConfig, TPUConfig, and Parallelism object.

  Picks a RunConfig class depending on the target platform
  (tf.contrib.learn for the default case, TPU RunConfig when
  `use_tpu`/`use_tpu_estimator` is set, tf.estimator for cloud async
  distributed runs), then attaches either a MirroredStrategy
  distribution or a data-parallelism object onto the resulting config.

  Returns:
    A RunConfig instance with extra attributes set on it:
    `use_tpu` (bool), `t2t_device_info` (dict, only when not on TPU),
    and `data_parallelism` (a devices.data_parallelism object or None).
  """
  session_config = create_session_config(
      log_device_placement=log_device_placement,
      enable_graph_rewriter=enable_graph_rewriter,
      gpu_mem_fraction=gpu_mem_fraction,
      use_tpu=use_tpu,
      xla_jit_level=xla_jit_level,
      inter_op_parallelism_threads=inter_op_parallelism_threads,
      intra_op_parallelism_threads=intra_op_parallelism_threads)
  run_config_args = {
      "master": master,
      "evaluation_master": master,
      "model_dir": model_dir,
      "session_config": session_config,
      "save_summary_steps": 100,
      "save_checkpoints_steps": save_checkpoints_steps,
      "save_checkpoints_secs": save_checkpoints_secs,
      "keep_checkpoint_max": keep_checkpoint_max,
      "keep_checkpoint_every_n_hours": keep_checkpoint_every_n_hours,
      "tf_random_seed": random_seed,
      "log_step_count_steps": log_step_count_steps,
  }
  # Seconds-based checkpointing takes precedence over steps-based: the two
  # are mutually exclusive in RunConfig, so drop the steps entry.
  if save_checkpoints_secs:
    del run_config_args["save_checkpoints_steps"]
  run_config_cls = contrib.learn().RunConfig

  if use_tpu or use_tpu_estimator:
    # If using TPUEstimator, use TPU RunConfig, add TPUConfig, and add
    # additional args.
    tpu_config_kwargs = {
        "iterations_per_loop": iterations_per_loop,
        "num_shards": num_shards,
        "per_host_input_for_training": True,
        "initial_infeed_sleep_secs": tpu_infeed_sleep_secs,
    }
    # Caller-supplied TPUConfig overrides win over the defaults above.
    if tpu_config_extra_kwargs is not None:
      tpu_config_kwargs.update(tpu_config_extra_kwargs)
    run_config_cls = contrib.tpu().RunConfig
    tpu_config = contrib.tpu().TPUConfig(**tpu_config_kwargs)
    run_config_args["tpu_config"] = tpu_config
    if not master and "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS" in os.environ:
      # If running on TPU but no master is set and the KUBE env var is present
      # then we're running on ML Engine. Set the master.
      run_config_args["master"] = os.environ[
          "KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"]
      run_config_args["evaluation_master"] = run_config_args["master"]
    elif not master and cloud_tpu_name:
      # Update run_config to use cluster instead of master/evaluation_master
      # as we need the cluster spec to use Cloud Pods.
      tpu_cluster_resolver = contrib.cluster_resolver().TPUClusterResolver(
          tpu=cloud_tpu_name, zone=cloud_tpu_zone)
      run_config_args["cluster"] = tpu_cluster_resolver
      del run_config_args["master"]
      del run_config_args["evaluation_master"]
  elif is_cloud_async_distributed():
    # Async distributed training on cloud: the plain tf.estimator RunConfig
    # reads cluster topology from the environment, not from master args.
    run_config_cls = tf.estimator.RunConfig
    del run_config_args["master"]
    del run_config_args["evaluation_master"]

  # tf.estimator RunConfig construction got totally broken in TF2.
  # We now have to specify master in a global environment variable.
  if contrib.is_tf2:
    del run_config_args["evaluation_master"]
    del run_config_args["master"]

  config = run_config_cls(**run_config_args)

  # If not using TPU, add device info for data_parallelism.
  config.use_tpu = use_tpu
  if not use_tpu:
    config.t2t_device_info = {
        "num_async_replicas": num_async_replicas,
    }
  # MirroredStrategy is only used when explicitly requested, the model shards
  # symmetrically, and no other parallelism scheme (PS replicas, async
  # replicas, data parallelism opt-out) is in play.
  use_distribution_strategy = (
      optionally_use_dist_strat and
      t2t_model.T2TModel.has_symmetric_shards(model_name) and
      not no_data_parallelism and ps_replicas == 0 and ps_gpu == 0 and
      num_async_replicas == 1)

  if use_distribution_strategy:
    tf.logging.info(
        "Configuring MirroredStrategy DistributionStrategy to replicate the "
        "model.")
    distribution = contrib.distribute().MirroredStrategy()
    config = config.replace(train_distribute=distribution)
    config.data_parallelism = None
  else:
    tf.logging.info("Configuring DataParallelism to replicate the model.")
    config.data_parallelism = devices.data_parallelism(
        daisy_chain_variables=daisy_chain_variables,
        ps_replicas=ps_replicas,
        ps_job=ps_job,
        ps_gpu=ps_gpu,
        schedule=schedule,
        sync=sync,
        worker_gpu=num_gpus,
        worker_replicas=num_async_replicas,
        worker_id=worker_id,
        gpu_order=gpu_order,
        worker_job=worker_job,
        no_data_parallelism=no_data_parallelism)

  return config
def create_estimator(model_name,
                     hparams,
                     run_config,
                     schedule="train_and_evaluate",
                     decode_hparams=None,
                     use_tpu=False,
                     use_tpu_estimator=False,
                     use_xla=False,
                     export_saved_model_api_version=1,
                     use_guarantee_const_getter=False):
  """Create a T2T Estimator.

  Returns a TPUEstimator when `use_tpu` or `use_tpu_estimator` is set,
  otherwise a plain tf.estimator.Estimator wrapping the T2T model_fn.

  Raises:
    ValueError: if `export_saved_model_api_version` is not 1 or 2.
  """
  model_fn = t2t_model.T2TModel.make_estimator_model_fn(
      model_name, hparams, decode_hparams=decode_hparams, use_tpu=use_tpu)

  # XLA is controlled elsewhere (via the session config); the flag is kept
  # only for interface compatibility.
  del use_xla
  if use_tpu or use_tpu_estimator:
    from tensorflow.contrib.tpu.python.tpu import tpu_estimator  # pylint: disable=g-import-not-at-top
    problem = hparams.problem
    # Global batch size = per-shard batch size * number of TPU shards.
    batch_size = (
        problem.tpu_batch_size_per_shard(hparams) *
        run_config.tpu_config.num_shards)
    mlperf_log.transformer_print(
        key=mlperf_log.INPUT_BATCH_SIZE, value=batch_size)
    # In Mesh-TensorFlow mode the per-shard size is used directly —
    # presumably mtf handles the sharding itself; verify against mtf models.
    if getattr(hparams, "mtf_mode", False):
      batch_size = problem.tpu_batch_size_per_shard(hparams)
    predict_batch_size = batch_size
    if decode_hparams and decode_hparams.batch_size:
      predict_batch_size = decode_hparams.batch_size
    # Expose iterations_per_loop to decoding code via decode_hparams.
    if decode_hparams and run_config.tpu_config:
      decode_hparams.add_hparam("iterations_per_loop",
                                run_config.tpu_config.iterations_per_loop)
    if export_saved_model_api_version == 1:
      api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V1
      estimator_model_fn = model_fn
    elif export_saved_model_api_version == 2:
      api_version_enum_name = tpu_estimator.ExportSavedModelApiVersion.V2

      def maybe_use_guarantee_const_getter_model_fn(features, labels, mode,
                                                    params):
        """Wrapper model_fn with guarantee_const getter."""
        if not use_guarantee_const_getter:
          return model_fn(features, labels, mode, params)

        # It marks all weights as constant, which may improve TPU inference
        # performance because it prevents the weights being transferred to
        # the TPU. It will increase HBM "program" usage and reduce HBM
        # "arguments" usage during TPU model serving.
        def guarantee_const_getter(getter, name, *args, **kwargs):
          # Wrap every variable read in tf.guarantee_const, outside any
          # surrounding control dependencies.
          with tf.control_dependencies(None):
            return tf.guarantee_const(
                getter(name, *args, **kwargs), name=name + "/GuaranteeConst")

        @contextlib.contextmanager
        def guarantee_const_scope():
          # Temporarily install the guarantee_const getter (and per-op
          # caching device) on the current variable scope, restoring the
          # previous getter/device on exit.
          var_scope = tf.get_variable_scope()
          prev_custom_getter = var_scope.custom_getter
          prev_caching_device = var_scope.caching_device
          var_scope.set_custom_getter(guarantee_const_getter)
          var_scope.set_caching_device(lambda op: op.device)
          yield
          var_scope.set_custom_getter(prev_custom_getter)
          var_scope.set_caching_device(prev_caching_device)

        with guarantee_const_scope():
          return model_fn(features, labels, mode, params)

      def tpu_model_fn(features, labels, mode, params):
        """Wrapper model_fn with tpu.rewrite / TPUPartitionedCall."""
        # Only PREDICT-on-TPU goes through the batched inference path;
        # all other modes fall through to the plain model_fn.
        if mode == tf.estimator.ModeKeys.PREDICT and params["use_tpu"]:
          batch_config = tpu_estimator.BatchConfig(
              num_batch_threads=2,
              max_batch_size=predict_batch_size,
              batch_timeout_micros=60 * 1000,
              allowed_batch_sizes=[predict_batch_size])
          return tpu_estimator.model_fn_inference_on_tpu(
              maybe_use_guarantee_const_getter_model_fn,
              features=features,
              labels=labels,
              config=None,
              params=params,
              batch_config=batch_config)
        else:
          return model_fn(features, labels, mode, params)

      estimator_model_fn = tpu_model_fn
    else:
      raise ValueError("Flag export_saved_model_api_version must be 1 or 2.")
    estimator = contrib.tpu().TPUEstimator(
        model_fn=estimator_model_fn,
        model_dir=run_config.model_dir,
        config=run_config,
        use_tpu=use_tpu,
        train_batch_size=batch_size,
        # Only pass an eval batch size when the schedule actually evaluates.
        eval_batch_size=batch_size if "eval" in schedule else None,
        predict_batch_size=predict_batch_size,
        export_saved_model_api_version=api_version_enum_name)
  else:
    estimator = tf.estimator.Estimator(
        model_fn=model_fn,
        model_dir=run_config.model_dir,
        config=run_config,
    )
  return estimator
def optimize(loss, learning_rate, hparams, use_tpu=False, variables=None):
  """Minimize loss."""
  loss = weight_decay_and_noise(loss, hparams, learning_rate)
  loss = tf.identity(loss, name="total_loss")
  if variables is None:
    variables = tf.trainable_variables()

  # Log sizes of trainable, non-trainable and fp16 ("diet") variables.
  verbose = hparams.summarize_vars
  log_variable_sizes(variables, verbose=verbose)
  frozen_vars = list(set(tf.global_variables()) - set(variables))
  log_variable_sizes(
      frozen_vars, tag="Non-trainable variables", verbose=verbose)
  if verbose:
    summarize_variables(variables)
    summarize_variables(frozen_vars, tag="Non-trainable variables")
  fp16_vars = [
      v for v in tf.global_variables() if v.dtype == dtypes.float16_ref
  ]
  log_variable_sizes(fp16_vars, "Diet Variables", verbose=verbose)

  optimizer = ConditionalOptimizer(
      hparams.optimizer, learning_rate, hparams, use_tpu)
  if use_tpu:
    optimizer = contrib.tpu().CrossShardOptimizer(optimizer)
  if getattr(hparams, "gpu_automatic_mixed_precision", False):
    # Automatic mixed precision is mutually exclusive with both TPU
    # training and manual mixed-precision setups.
    if use_tpu:
      raise RuntimeError(
          "GPU auto mixed precision cannot be used with TPU")
    if _mixed_precision_is_enabled(hparams):
      raise RuntimeError(
          "GPU auto mixed precision cannot be used with manual mixed precision"
      )
    # The graph-rewrite wrapper expects these private optimizer attributes.
    setattr(optimizer, "_use_locking", "True")
    setattr(optimizer, "_name", "ConditionalOptimizer")
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer)

  summaries = []
  if common_layers.should_generate_summaries():
    tf.summary.scalar("learning_rate", learning_rate)
    summaries.append("loss")
    if hparams.summarize_grads:
      tf.logging.info("Summarizing gradients")
      summaries.extend(
          ["gradients", "gradient_norm", "global_gradient_norm"])
  if hparams.clip_grad_norm:
    tf.logging.info("Clipping gradients, norm: %0.5f", hparams.clip_grad_norm)
  if hparams.grad_noise_scale:
    tf.logging.info("Adding noise to gradients, noise scale: %0.5f",
                    hparams.grad_noise_scale)

  return contrib.layers().optimize_loss(
      name="training",
      loss=loss,
      global_step=tf.train.get_or_create_global_step(),
      learning_rate=learning_rate,
      clip_gradients=hparams.clip_grad_norm or None,
      gradient_noise_scale=hparams.grad_noise_scale or None,
      optimizer=optimizer,
      summaries=summaries,
      colocate_gradients_with_ops=True,
      variables=variables)