def test_init_run_config_none_distribute_coordinator_mode(self): # We don't use distribute coordinator for local training. config = run_config_lib.RunConfig( train_distribute=mirrored_strategy.CoreMirroredStrategy()) dc_training.init_run_config(config, {}) self.assertIsNone(config._distribute_coordinator_mode) # With a master in the cluster, don't run distribute coordinator. with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertIsNone(config._distribute_coordinator_mode) # When `train_distribute` is not specified, don't use distribute # coordinator. with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config = run_config_lib.RunConfig() self.assertFalse(hasattr(config, "_distribute_coordinator_mode"))
def test_init_run_config_none_distribute_coordinator_mode(self): # We don't use distribute coordinator for local training. config = run_config_lib.RunConfig( train_distribute=mirrored_strategy.CoreMirroredStrategy()) dc_training.init_run_config(config, {}) self.assertIsNone(config._distribute_coordinator_mode) # With a master in the cluster, don't run distribute coordinator. with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}): config = run_config_lib.RunConfig( train_distribute=mirrored_strategy.CoreMirroredStrategy()) self.assertIsNone(config._distribute_coordinator_mode) # When `train_distribute` is not specified, don't use distribute # coordinator. with test.mock.patch.dict("os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}): config = run_config_lib.RunConfig() self.assertFalse(hasattr(config, "_distribute_coordinator_mode"))
def __init__(self, model_dir=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_steps=_USE_DEFAULT, save_checkpoints_secs=_USE_DEFAULT, session_config=None, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, log_step_count_steps=100, train_distribute=None, device_fn=None, protocol=None, eval_distribute=None, experimental_distribute=None, experimental_max_worker_delay_secs=None, session_creation_timeout_secs=7200): """Constructs a RunConfig. All distributed training related properties `cluster_spec`, `is_chief`, `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and `task_type` are set based on the `TF_CONFIG` environment variable, if the pertinent information is present. The `TF_CONFIG` environment variable is a JSON object with attributes: `cluster` and `task`. `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to a list of task addresses. `task` has two attributes: `type` and `index`, where `type` can be any of the task types in `cluster`. When `TF_CONFIG` contains said information, the following properties are set on this class: * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If present, must have one and only one node in the `chief` attribute of `cluster_spec`. * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if `cluster_spec` is present; must be `worker` (the default value) if `cluster_spec` is not set. * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if `cluster_spec` is present; must be 0 (the default value) if `cluster_spec` is not set. * `master` is determined by looking up `task_type` and `task_id` in the `cluster_spec`. Defaults to ''. * `num_ps_replicas` is set by counting the number of nodes listed in the `ps` attribute of `cluster_spec`. Defaults to 0. * `num_worker_replicas` is set by counting the number of nodes listed in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1. * `is_chief` is determined based on `task_type` and `cluster`. There is a special node with `task_type` as `evaluator`, which is not part of the (training) `cluster_spec`. It handles the distributed evaluation job. Example of non-chief node: ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'worker', 'index': 1}}) config = RunConfig() assert config.master == 'host4:2222' assert config.task_id == 1 assert config.num_ps_replicas == 2 assert config.num_worker_replicas == 4 assert config.cluster_spec == server_lib.ClusterSpec(cluster) assert config.task_type == 'worker' assert not config.is_chief ``` Example of chief node: ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'chief', 'index': 0}}) config = RunConfig() assert config.master == 'host0:2222' assert config.task_id == 0 assert config.num_ps_replicas == 2 assert config.num_worker_replicas == 4 assert config.cluster_spec == server_lib.ClusterSpec(cluster) assert config.task_type == 'chief' assert config.is_chief ``` Example of evaluator node (evaluator is not part of training cluster): ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'evaluator', 'index': 0}}) config = RunConfig() assert config.master == '' assert config.evaluator_master == '' assert config.task_id == 0 assert config.num_ps_replicas == 0 assert config.num_worker_replicas == 0 assert config.cluster_spec == {} assert config.task_type == 'evaluator' assert not config.is_chief ``` N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set, `keep_checkpoint_max` might need to be adjusted accordingly, especially in distributed training. For example, setting `save_checkpoints_secs` as 60 without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation that checkpoint would be garbage collected after 5 minutes. In distributed training, the evaluation job starts asynchronously and might fail to load or find the checkpoint due to race condition. Args: model_dir: directory where model parameters, graph, etc are saved. If `PathLike` object, the path will be resolved. If `None`, will use a default value set by the Estimator. tf_random_seed: Random seed for TensorFlow initializers. Setting this value allows consistency between reruns. save_summary_steps: Save summaries every this many steps. save_checkpoints_steps: Save checkpoints every this many steps. Can not be specified with `save_checkpoints_secs`. save_checkpoints_secs: Save checkpoints every this many seconds. Can not be specified with `save_checkpoints_steps`. Defaults to 600 seconds if both `save_checkpoints_steps` and `save_checkpoints_secs` are not set in constructor. If both `save_checkpoints_steps` and `save_checkpoints_secs` are `None`, then checkpoints are disabled. session_config: a ConfigProto used to set session parameters, or `None`. keep_checkpoint_max: The maximum number of recent checkpoint files to keep. As new files are created, older files are deleted. If `None` or 0, all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent checkpoint files are kept). If a saver is passed to the estimator, this argument will be ignored. keep_checkpoint_every_n_hours: Number of hours between each checkpoint to be saved. The default value of 10,000 hours effectively disables the feature. log_step_count_steps: The frequency, in number of global steps, that the global step and the loss will be logged during training. Also controls the frequency that the global steps / s will be logged (and written to summary) during training. train_distribute: An optional instance of `tf.distribute.Strategy`. If specified, then Estimator will distribute the user's model during training, according to the policy specified by that strategy. Setting `experimental_distribute.train_distribute` is preferred. device_fn: A callable invoked for every `Operation` that takes the `Operation` and returns the device string. If `None`, defaults to the device function returned by `tf.train.replica_device_setter` with round-robin strategy. protocol: An optional argument which specifies the protocol used when starting server. `None` means default to grpc. eval_distribute: An optional instance of `tf.distribute.Strategy`. If specified, then Estimator will distribute the user's model during evaluation, according to the policy specified by that strategy. Setting `experimental_distribute.eval_distribute` is preferred. experimental_distribute: An optional `tf.contrib.distribute.DistributeConfig` object specifying DistributionStrategy-related configuration. The `train_distribute` and `eval_distribute` can be passed as parameters to `RunConfig` or set in `experimental_distribute` but not both. experimental_max_worker_delay_secs: An optional integer specifying the maximum time a worker should wait before starting. By default, workers are started at staggered times, with each worker being delayed by up to 60 seconds. This is intended to reduce the risk of divergence, which can occur when many workers simultaneously update the weights of a randomly initialized model. Users who warm-start their models and train them for short durations (a few minutes or less) should consider reducing this default to improve training times. session_creation_timeout_secs: Max time workers should wait for a session to become available (on initialization or when recovering a session) with MonitoredTrainingSession. Defaults to 7200 seconds, but users may want to set a lower value to detect problems with variable / session (re)-initialization more quickly. Raises: ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs` are set. """ if (save_checkpoints_steps == _USE_DEFAULT and save_checkpoints_secs == _USE_DEFAULT): save_checkpoints_steps = None save_checkpoints_secs = 600 elif save_checkpoints_secs == _USE_DEFAULT: save_checkpoints_secs = None elif save_checkpoints_steps == _USE_DEFAULT: save_checkpoints_steps = None elif (save_checkpoints_steps is not None and save_checkpoints_secs is not None): raise ValueError(_SAVE_CKPT_ERR) tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}')) if tf_config: tf.compat.v1.logging.info('TF_CONFIG environment variable: %s', tf_config) model_dir = _get_model_dir(tf_config, compat_internal.path_to_str(model_dir)) RunConfig._replace( self, allowed_properties_list=_DEFAULT_REPLACEABLE_LIST, model_dir=model_dir, tf_random_seed=tf_random_seed, save_summary_steps=save_summary_steps, save_checkpoints_steps=save_checkpoints_steps, save_checkpoints_secs=save_checkpoints_secs, session_config=session_config, keep_checkpoint_max=keep_checkpoint_max, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, log_step_count_steps=log_step_count_steps, train_distribute=train_distribute, device_fn=device_fn, protocol=protocol, eval_distribute=eval_distribute, experimental_distribute=experimental_distribute, experimental_max_worker_delay_secs= experimental_max_worker_delay_secs, session_creation_timeout_secs=session_creation_timeout_secs) # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs. if ((train_distribute and not train_distribute.__class__.__name__.startswith('TPUStrategy')) or (eval_distribute and not eval_distribute.__class__.__name__.startswith('TPUStrategy')) or experimental_distribute): tf.compat.v1.logging.info( 'Initializing RunConfig with distribution strategies.') distribute_coordinator_training.init_run_config(self, tf_config) else: self._init_distributed_setting_from_environment_var(tf_config) self._maybe_overwrite_session_config_for_distributed_training()
def __init__(self, model_dir=None, tf_random_seed=None, save_summary_steps=100, save_checkpoints_steps=_USE_DEFAULT, save_checkpoints_secs=_USE_DEFAULT, session_config=None, keep_checkpoint_max=5, keep_checkpoint_every_n_hours=10000, log_step_count_steps=100, train_distribute=None, device_fn=None, protocol=None, eval_distribute=None, experimental_distribute=None): """Constructs a RunConfig. All distributed training related properties `cluster_spec`, `is_chief`, `master` , `num_worker_replicas`, `num_ps_replicas`, `task_id`, and `task_type` are set based on the `TF_CONFIG` environment variable, if the pertinent information is present. The `TF_CONFIG` environment variable is a JSON object with attributes: `cluster` and `task`. `cluster` is a JSON serialized version of `ClusterSpec`'s Python dict from `server_lib.py`, mapping task types (usually one of the `TaskType` enums) to a list of task addresses. `task` has two attributes: `type` and `index`, where `type` can be any of the task types in `cluster`. When `TF_CONFIG` contains said information, the following properties are set on this class: * `cluster_spec` is parsed from `TF_CONFIG['cluster']`. Defaults to {}. If present, must have one and only one node in the `chief` attribute of `cluster_spec`. * `task_type` is set to `TF_CONFIG['task']['type']`. Must set if `cluster_spec` is present; must be `worker` (the default value) if `cluster_spec` is not set. * `task_id` is set to `TF_CONFIG['task']['index']`. Must set if `cluster_spec` is present; must be 0 (the default value) if `cluster_spec` is not set. * `master` is determined by looking up `task_type` and `task_id` in the `cluster_spec`. Defaults to ''. * `num_ps_replicas` is set by counting the number of nodes listed in the `ps` attribute of `cluster_spec`. Defaults to 0. * `num_worker_replicas` is set by counting the number of nodes listed in the `worker` and `chief` attributes of `cluster_spec`. Defaults to 1. * `is_chief` is determined based on `task_type` and `cluster`. There is a special node with `task_type` as `evaluator`, which is not part of the (training) `cluster_spec`. It handles the distributed evaluation job. Example of non-chief node: ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'worker', 'index': 1}}) config = RunConfig() assert config.master == 'host4:2222' assert config.task_id == 1 assert config.num_ps_replicas == 2 assert config.num_worker_replicas == 4 assert config.cluster_spec == server_lib.ClusterSpec(cluster) assert config.task_type == 'worker' assert not config.is_chief ``` Example of chief node: ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'chief', 'index': 0}}) config = RunConfig() assert config.master == 'host0:2222' assert config.task_id == 0 assert config.num_ps_replicas == 2 assert config.num_worker_replicas == 4 assert config.cluster_spec == server_lib.ClusterSpec(cluster) assert config.task_type == 'chief' assert config.is_chief ``` Example of evaluator node (evaluator is not part of training cluster): ``` cluster = {'chief': ['host0:2222'], 'ps': ['host1:2222', 'host2:2222'], 'worker': ['host3:2222', 'host4:2222', 'host5:2222']} os.environ['TF_CONFIG'] = json.dumps( {'cluster': cluster, 'task': {'type': 'evaluator', 'index': 0}}) config = RunConfig() assert config.master == '' assert config.evaluator_master == '' assert config.task_id == 0 assert config.num_ps_replicas == 0 assert config.num_worker_replicas == 0 assert config.cluster_spec == {} assert config.task_type == 'evaluator' assert not config.is_chief ``` N.B.: If `save_checkpoints_steps` or `save_checkpoints_secs` is set, `keep_checkpoint_max` might need to be adjusted accordingly, especially in distributed training. For example, setting `save_checkpoints_secs` as 60 without adjusting `keep_checkpoint_max` (defaults to 5) leads to situation that checkpoint would be garbage collected after 5 minutes. In distributed training, the evaluation job starts asynchronously and might fail to load or find the checkpoint due to race condition. Args: model_dir: directory where model parameters, graph, etc are saved. If `PathLike` object, the path will be resolved. If `None`, will use a default value set by the Estimator. tf_random_seed: Random seed for TensorFlow initializers. Setting this value allows consistency between reruns. save_summary_steps: Save summaries every this many steps. save_checkpoints_steps: Save checkpoints every this many steps. Can not be specified with `save_checkpoints_secs`. save_checkpoints_secs: Save checkpoints every this many seconds. Can not be specified with `save_checkpoints_steps`. Defaults to 600 seconds if both `save_checkpoints_steps` and `save_checkpoints_secs` are not set in constructor. If both `save_checkpoints_steps` and `save_checkpoints_secs` are None, then checkpoints are disabled. session_config: a ConfigProto used to set session parameters, or None. keep_checkpoint_max: The maximum number of recent checkpoint files to keep. As new files are created, older files are deleted. If None or 0, all checkpoint files are kept. Defaults to 5 (that is, the 5 most recent checkpoint files are kept.) keep_checkpoint_every_n_hours: Number of hours between each checkpoint to be saved. The default value of 10,000 hours effectively disables the feature. log_step_count_steps: The frequency, in number of global steps, that the global step/sec and the loss will be logged during training. train_distribute: An optional instance of `tf.contrib.distribute.DistributionStrategy`. If specified, then Estimator will distribute the user's model during training, according to the policy specified by that strategy. Setting `experimental_distribute.train_distribute` is preferred. device_fn: A callable invoked for every `Operation` that takes the `Operation` and returns the device string. If `None`, defaults to the device function returned by `tf.train.replica_device_setter` with round-robin strategy. protocol: An optional argument which specifies the protocol used when starting server. None means default to grpc. eval_distribute: An optional instance of `tf.contrib.distribute.DistributionStrategy`. If specified, then Estimator will distribute the user's model during evaluation, according to the policy specified by that strategy. Setting `experimental_distribute.eval_distribute` is preferred. experimental_distribute: an optional `tf.contrib.distribute.DistributeConfig` object specifying DistributionStrategy-related configuration. The `train_distribute` and `eval_distribute` can be passed as parameters to `RunConfig` or set in `experimental_distribute` but not both. Raises: ValueError: If both `save_checkpoints_steps` and `save_checkpoints_secs` are set. """ if (save_checkpoints_steps == _USE_DEFAULT and save_checkpoints_secs == _USE_DEFAULT): save_checkpoints_steps = None save_checkpoints_secs = 600 elif save_checkpoints_secs == _USE_DEFAULT: save_checkpoints_secs = None elif save_checkpoints_steps == _USE_DEFAULT: save_checkpoints_steps = None elif (save_checkpoints_steps is not None and save_checkpoints_secs is not None): raise ValueError(_SAVE_CKPT_ERR) tf_config = json.loads(os.environ.get(_TF_CONFIG_ENV, '{}')) if tf_config: logging.info('TF_CONFIG environment variable: %s', tf_config) model_dir = _get_model_dir(tf_config, compat_internal.path_to_str(model_dir)) RunConfig._replace( self, allowed_properties_list=_DEFAULT_REPLACEABLE_LIST, model_dir=model_dir, tf_random_seed=tf_random_seed, save_summary_steps=save_summary_steps, save_checkpoints_steps=save_checkpoints_steps, save_checkpoints_secs=save_checkpoints_secs, session_config=session_config, keep_checkpoint_max=keep_checkpoint_max, keep_checkpoint_every_n_hours=keep_checkpoint_every_n_hours, log_step_count_steps=log_step_count_steps, train_distribute=train_distribute, device_fn=device_fn, protocol=protocol, eval_distribute=eval_distribute, experimental_distribute=experimental_distribute) # TODO(frankchn,priyag): Eventually use distributed coordinator for TPUs. if ((train_distribute and train_distribute.__class__.__name__ != 'TPUStrategy') or (eval_distribute and eval_distribute.__class__.__name__ != 'TPUStrategy') or experimental_distribute): logging.info('Initializing RunConfig with distribution strategies.') distribute_coordinator_training.init_run_config(self, tf_config) else: self._init_distributed_setting_from_environment_var(tf_config) self._maybe_overwrite_session_config_for_distributed_training()