def testClusterWithChief(self):
  """Checks `is_chief` on a cluster that declares an explicit chief job.

  Task 0 of the "chief" job must be reported as chief, and task 0 of the
  "worker" job must not.
  """
  spec = {
      "chief": ["127.0.0.1:1234"],
      "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
      "ps": ["127.0.0.1:1926", "127.0.0.1:3141"],
  }
  is_chief = multi_worker_util.is_chief
  self.assertTrue(is_chief(spec, "chief", 0))
  self.assertFalse(is_chief(spec, "worker", 0))
def _maybe_create_checkpoint_manager(self, checkpoint_or_checkpoint_manager,
                                     checkpoint_dir, cluster_resolver):
  """Adopts a given CheckpointManager, or builds managers from a checkpoint.

  Args:
    checkpoint_or_checkpoint_manager: either a
      `checkpoint_management.CheckpointManager`, which is used as-is for both
      reading and writing, or an object handed to the `CheckpointManager`
      constructor.
    checkpoint_dir: directory used for the read manager (and, on the chief,
      for the write manager too).
    cluster_resolver: provides `cluster_spec()`, `task_type` and `task_id`,
      which decide whether this task is the chief.
  """
  given = checkpoint_or_checkpoint_manager

  if isinstance(given, checkpoint_management.CheckpointManager):
    # The caller supplied a ready-made manager: share it for both roles.
    self._read_checkpoint_manager = given
    self._write_checkpoint_manager = given
    self._api_made_checkpoint_manager = False
    return

  self._api_made_checkpoint_manager = True

  # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
  # setup on chief and on other workers.
  self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
      given, directory=checkpoint_dir, max_to_keep=1)

  on_chief = multi_worker_util.is_chief(
      cluster_spec=cluster_resolver.cluster_spec(),
      task_type=cluster_resolver.task_type,
      task_id=cluster_resolver.task_id)
  if on_chief:
    self._write_checkpoint_manager = self._read_checkpoint_manager
    return

  # Non-chief tasks write through a separate manager rooted at the directory
  # returned by `_non_chief_checkpoint_dir`.
  self._write_checkpoint_manager = checkpoint_management.CheckpointManager(
      given,
      _non_chief_checkpoint_dir(checkpoint_dir, cluster_resolver.task_id),
      max_to_keep=1)
def testEvaluatorIsChief(self):
  """Checks that task 0 of the "evaluator" job is reported as chief."""
  spec = {
      "chief": ["127.0.0.1:1234"],
      "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
      "evaluator": ["127.0.0.1:2019"],
  }
  evaluator_is_chief = multi_worker_util.is_chief(spec, "evaluator", 0)
  self.assertTrue(evaluator_is_chief)
def _initialize_multi_worker(self, cluster_resolver):
  """Sets up this object for multi-worker training from a cluster resolver.

  Args:
    cluster_resolver: supplies `num_accelerators()`, `cluster_spec()`,
      `task_type` and `task_id` for the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the worker count for the cluster is zero.
  """
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id

  if task_type is None or task_id is None:
    raise ValueError(
        "When `cluster_spec` is given, you must also specify "
        "`task_type` and `task_id` in the `cluster_resolver`.")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  # The same "/job:<type>/task:<id>" string names both the worker device and
  # the default device assigned further down.
  task_device = "/job:%s/task:%d" % (task_type, task_id)
  self._worker_device = task_device
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  # One device string per GPU, or the bare worker device when there is none.
  local_devices = (
      tuple("%s/device:GPU:%d" % (self._worker_device, gpu_index)
            for gpu_index in range(num_gpus))
      if num_gpus else (self._worker_device,))

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)

  worker_to_devices = [(self._worker_device, self.worker_devices)]
  self._input_workers = input_lib.InputWorkers(self._device_map,
                                               worker_to_devices)
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = task_device

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def _save_checkpoint_and_exit(self):
  """Writes a checkpoint, stops the watcher threads and exits the process.

  The process exits through `sys.exit(self._restart_code)`, so this method
  does not return normally.
  """
  logging.info('Starting checkpoint and exit')
  self._checkpointed_runs.assign(self.total_runs)

  started_at = time.monotonic()
  self._write_checkpoint_manager.save()

  # All workers need to participate in saving a checkpoint to avoid
  # deadlock. They need to write to different paths so that they would not
  # override each other. We make temporary directories for non-chief
  # workers to write to, and clean them up afterward.
  resolver = self._cluster_resolver
  on_chief = multi_worker_util.is_chief(
      cluster_spec=resolver.cluster_spec(),
      task_type=resolver.task_type,
      task_id=resolver.task_id)
  if not on_chief:
    temp_parent = os.path.dirname(self._write_checkpoint_manager.directory)
    gfile.DeleteRecursively(temp_parent)
  finished_at = time.monotonic()

  logging.info('Checkpoint finished at path %s',
               self._write_checkpoint_manager.directory)
  logging.info('Checkpoint time: %f', finished_at - started_at)

  self._stop_poll_termination_signal_thread()
  self._stop_cluster_wise_termination_watcher_thread()
  sys.exit(self._restart_code)
def _initialize_multi_worker(self, cluster_resolver):
  """Sets up this object for multi-worker training from a cluster resolver.

  Args:
    cluster_resolver: supplies `num_accelerators()`, `cluster_spec()`,
      `task_type` and `task_id` for the current task. The value returned by
      `num_accelerators()` is used directly as the GPU count.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the worker count for the cluster is zero.
  """
  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  num_gpus = cluster_resolver.num_accelerators()
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id

  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id` in the `cluster_resolver`.")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  worker_total = multi_worker_util.worker_count(cluster_spec, task_type)
  self._num_workers = worker_total
  if not worker_total:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  job_task_device = "/job:%s/task:%d" % (task_type, task_id)
  self._worker_device = job_task_device
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if not num_gpus:
    # No GPUs reported: the worker device itself is the only local device.
    local_devices = (self._worker_device,)
  else:
    local_devices = tuple(
        "%s/device:GPU:%d" % (self._worker_device, gpu_index)
        for gpu_index in range(num_gpus))

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = job_task_device

  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def _mwms_write_checkpoint_dir(self, checkpoint_dir, cluster_spec, task_type,
                               task_id):
  """Returns the path this task should write checkpoints to.

  The chief gets `checkpoint_dir` re-joined from its dirname and basename.
  Any other task gets the same basename under a `workertemp_<task_id>`
  sub-directory of the parent, which is created before returning.

  Args:
    checkpoint_dir: the checkpoint path requested by the caller.
    cluster_spec: cluster description passed to `is_chief`.
    task_type: type of the current task.
    task_id: index of the current task.

  Returns:
    The checkpoint path to write to for this task.
  """
  parent, leaf = os.path.split(checkpoint_dir)

  running_on_chief = multi_worker_util.is_chief(
      cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
  if running_on_chief:
    return os.path.join(parent, leaf)

  worker_parent = os.path.join(parent, 'workertemp_' + str(task_id))
  gfile.MakeDirs(worker_parent)
  return os.path.join(worker_parent, leaf)
def _mwms_write_checkpoint_dir(checkpoint_dir, task_type, task_id,
                               cluster_spec):
  """Returns checkpoint_dir for chief and a temp dir for any other worker.

  For a non-chief task the temp directory `workertemp_<task_id>` is created
  next to the checkpoint (under the same parent directory) before the path
  inside it is returned.

  Args:
    checkpoint_dir: the checkpoint path requested by the caller.
    task_type: type of the current task.
    task_id: index of the current task.
    cluster_spec: cluster description passed to `is_chief`.

  Returns:
    The checkpoint path to write to for this task.
  """
  target_parent = os.path.dirname(checkpoint_dir)
  checkpoint_name = os.path.basename(checkpoint_dir)

  on_chief = multi_worker_util.is_chief(
      cluster_spec=cluster_spec, task_type=task_type, task_id=task_id)
  if not on_chief:
    temp_name = 'workertemp_' + str(task_id)
    target_parent = os.path.join(target_parent, temp_name)
    gfile.MakeDirs(target_parent)

  return os.path.join(target_parent, checkpoint_name)
def _make_checkpoint_manager(checkpoint, checkpoint_dir, cluster_resolver):
  """Builds a CheckpointManager whose directory depends on chief status.

  Args:
    checkpoint: object passed as the first argument to `CheckpointManager`.
    checkpoint_dir: directory used as-is when this task is the chief.
    cluster_resolver: provides `cluster_spec()`, `task_type` and `task_id`.

  Returns:
    A `checkpoint_management.CheckpointManager` with `max_to_keep=1`.
  """
  on_chief = multi_worker_util.is_chief(
      cluster_spec=cluster_resolver.cluster_spec(),
      task_type=cluster_resolver.task_type,
      task_id=cluster_resolver.task_id)

  if on_chief:
    directory = checkpoint_dir
  else:
    # Non-chief tasks use the directory computed by failure_handling.
    directory = failure_handling._non_chief_checkpoint_dir(
        checkpoint_dir, cluster_resolver.task_id)

  return checkpoint_management.CheckpointManager(
      checkpoint, directory=directory, max_to_keep=1)
def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                             cluster_spec, task_type, task_id):
  """Sets up this object for multi-worker collective all-reduce training.

  Args:
    container_strategy: forwarded to the parent class initializer.
    num_gpus_per_worker: number of GPU device strings to create for this task;
      a falsy value selects the bare worker device instead.
    cluster_spec: cluster description, normalized with
      `multi_worker_util.normalize_cluster_spec`.
    task_type: type of the current task; must be "chief" or "worker".
    task_id: index of the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the worker count for the cluster is zero.
  """
  if task_type is None or task_id is None:
    raise ValueError(
        "When `cluster_spec` is given, you must also specify "
        "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  task_device = "/job:%s/task:%d" % (task_type, task_id)
  local_devices = (
      ["%s/device:GPU:%d" % (task_device, gpu_index)
       for gpu_index in range(num_gpus_per_worker)]
      if num_gpus_per_worker else [task_device])

  self._collective_keys = cross_device_utils.CollectiveKeys()
  all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=all_reduce_ops)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = task_device

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Sets up this object for multi-worker collective all-reduce training.

  Args:
    num_gpus_per_worker: number of GPU device strings to create for this task;
      a falsy value selects the bare worker device instead.
    cluster_spec: cluster description, normalized with
      `multi_worker_util.normalize_cluster_spec`.
    task_type: type of the current task; must be "chief" or "worker".
    task_id: index of the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the worker count for the cluster is zero.
  """
  if task_type is None or task_id is None:
    raise ValueError(
        "When `cluster_spec` is given, you must also specify "
        "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  task_device = "/job:%s/task:%d" % (task_type, task_id)
  self._worker_device = task_device
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  # One device string per GPU, or the bare worker device when there is none.
  local_devices = (
      tuple("%s/device:GPU:%d" % (self._worker_device, gpu_index)
            for gpu_index in range(num_gpus_per_worker))
      if num_gpus_per_worker else (self._worker_device,))

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)

  worker_to_devices = [(self._worker_device, self.worker_devices)]
  self._input_workers = input_lib.InputWorkers(self._device_map,
                                               worker_to_devices)
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = task_device

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def _save_checkpoint(self):
  """Saves a checkpoint and, on non-chief tasks, removes the temp directory."""
  self._write_checkpoint_manager.save()

  # All workers need to participate in saving a checkpoint to avoid
  # deadlock. They need to write to different paths so that they would not
  # override each other. We make temporary directories for non-chief
  # workers to write to, and clean them up afterward.
  resolver = self._cluster_resolver
  on_chief = multi_worker_util.is_chief(
      cluster_spec=resolver.cluster_spec(),
      task_type=resolver.task_type,
      task_id=resolver.task_id)
  if on_chief:
    return

  temp_parent = os.path.dirname(self._write_checkpoint_manager.directory)
  gfile.DeleteRecursively(temp_parent)
def _initialize(self, cluster_spec, task_type, task_id):
  """Initializes the strategy for either cluster or single-task operation.

  With a truthy `cluster_spec` the task is validated and the worker count is
  taken from the "worker" and "chief" jobs. Otherwise the object is set up as
  a single chief worker with an empty device prefix.

  Args:
    cluster_spec: cluster description, or a falsy value for local mode.
    task_type: type of the current task; must be "chief" or "worker" when
      `cluster_spec` is given.
    task_id: index of the current task.

  Raises:
    ValueError: in cluster mode, if `task_type` or `task_id` is None, if
      `task_type` is not "chief" or "worker", or if the cluster has no
      "worker" or "chief" tasks.
  """
  if not cluster_spec:
    self._cluster_spec = None
    self._is_chief = True
    worker_device = ""
    num_workers = 1
  else:
    if task_type is None or task_id is None:
      raise ValueError(
          "When `cluster_spec` is given, you must also specify "
          "`task_type` and `task_id`")
    if task_type not in ("chief", "worker"):
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)

    self._cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_spec)
    worker_device = "/job:%s/task:%d" % (task_type, task_id)

    jobs = self._cluster_spec.as_dict()
    num_workers = len(jobs.get("worker", [])) + len(jobs.get("chief", []))
    if not num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

  self._num_workers = num_workers

  gpus_per_worker = self._num_gpus_per_worker
  local_devices = (
      ["%s/device:GPU:%d" % (worker_device, gpu_index)
       for gpu_index in range(gpus_per_worker)]
      if gpus_per_worker else [worker_device])

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
      num_workers=num_workers,
      num_gpus_per_worker=self._num_gpus_per_worker,
      collective_keys=self._collective_keys)
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices, cross_tower_ops=all_reduce_ops)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  if cluster_spec:
    self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id)
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Sets up this object for multi-worker collective all-reduce training.

  Args:
    num_gpus_per_worker: number of GPU device strings to create for this task;
      a falsy value selects the bare worker device instead.
    cluster_spec: cluster description, normalized with
      `multi_worker_util.normalize_cluster_spec`.
    task_type: type of the current task; must be "chief" or "worker".
    task_id: index of the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the worker count for the cluster is zero.
  """
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

  worker_total = multi_worker_util.worker_count(cluster_spec, task_type)
  self._num_workers = worker_total
  if not worker_total:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  job_task_device = "/job:%s/task:%d" % (task_type, task_id)
  self._worker_device = job_task_device
  self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)

  if not num_gpus_per_worker:
    # No GPUs requested: the worker device itself is the only local device.
    local_devices = (self._worker_device,)
  else:
    local_devices = tuple(
        "%s/device:GPU:%d" % (self._worker_device, gpu_index)
        for gpu_index in range(num_gpus_per_worker))

  self._collective_keys = cross_device_utils.CollectiveKeys()
  self._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = job_task_device

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
  """Sets up this strategy for multi-worker collective all-reduce training.

  Args:
    num_gpus_per_worker: number of GPU device strings to create for this task;
      a falsy value selects the bare worker device instead.
    cluster_spec: cluster description, normalized with
      `multi_worker_util.normalize_cluster_spec`.
    task_type: type of the current task; must be "chief" or "worker".
    task_id: index of the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, if `task_type` is not
      "chief" or "worker", or if the cluster has no "worker" or "chief" tasks.
  """
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ("chief", "worker"):
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)

  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

  # Workers are the tasks of the "worker" job plus those of the "chief" job.
  jobs = cluster_spec.as_dict()
  self._num_workers = len(jobs.get("worker", [])) + len(jobs.get("chief", []))
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  task_device = "/job:%s/task:%d" % (task_type, task_id)
  local_devices = (
      ["%s/device:GPU:%d" % (task_device, gpu_index)
       for gpu_index in range(num_gpus_per_worker)]
      if num_gpus_per_worker else [task_device])

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus_per_worker,
      collective_keys=self._collective_keys)
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices, cross_tower_ops=all_reduce_ops)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = task_device

  self._cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
def testClusterWithoutChief(self):
  """Checks `is_chief` on a cluster that has only a "worker" job.

  Worker 0 must be reported as chief and worker 1 must not. Asking about a
  job that is absent, or a task id past the end of the job, must raise
  ValueError with the expected message.
  """
  spec = {"worker": ["127.0.0.1:8964", "127.0.0.1:2333"]}
  is_chief = multi_worker_util.is_chief

  self.assertTrue(is_chief(spec, "worker", 0))
  self.assertFalse(is_chief(spec, "worker", 1))

  missing_job_message = (
      "The task_type \"chief\" is not in the `cluster_spec`.")
  with self.assertRaisesRegexp(ValueError, missing_job_message):
    is_chief(spec, "chief", 0)

  bad_task_id_message = "The `task_id` 2 exceeds the maximum id of worker."
  with self.assertRaisesRegexp(ValueError, bad_task_id_message):
    is_chief(spec, "worker", 2)
def on_train_begin(self, logs):
  """Flags a filtering failure when invoked on a non-chief worker.

  Args:
    logs: unused.
  """
  if multi_worker_util.is_chief():
    return
  # Non-chief workers shouldn't run this callback.
  self.filtered_correctly = False
def _test_minimize_loss_graph(self,
                              task_type,
                              task_id,
                              num_gpus,
                              use_core_strategy=False):
  """Runs ten graph-mode update steps and checks that the error shrinks.

  A single `Dense(1)` layer without bias is trained on the constant input
  `[[1.]]` with the loss `(layer(x) - 1) ** 2`, applying
  `v.assign_sub(0.05 * g)` to each variable after a SUM reduction of its
  gradient.

  Args:
    task_type: type of the current task; a falsy value selects the local
      (single worker) path.
    task_id: index of the current task.
    num_gpus: forwarded to `self._get_test_objects`.
    use_core_strategy: forwarded to `self._get_test_objects`.

  Returns:
    True if fewer GPUs are available than the strategy requires (nothing is
    run in that case), otherwise whether the error after the last step is
    smaller than the error before the first step.
  """
  d, master_target, sess_config = self._get_test_objects(
      task_type, task_id, num_gpus, use_core_strategy=use_core_strategy)
  if task_type:
    # Multi-worker
    assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
    num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
    # A chief job, when present, counts as one more participant.
    if CHIEF in d.extended._cluster_spec.as_dict():
      num_workers += 1
  else:
    # local
    num_workers = 1

  with ops.Graph().as_default(), \
       self.cached_session(target=master_target, config=sess_config) as sess, \
       d.scope():
    l = core.Dense(1, use_bias=False)

    def loss_fn(x):
      # Squared distance between the layer output and 1.
      y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
      return y * y

    # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
    # multiple graphs (b/111216820).
    def grad_fn(x):
      # Returns (gradient, variable) pairs for every trainable variable.
      loss = loss_fn(x)
      var_list = (variables.trainable_variables() + ops.get_collection(
          ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
      grads = gradients.gradients(loss, var_list)
      ret = list(zip(grads, var_list))
      return ret

    def update(v, g):
      return v.assign_sub(0.05 * g, use_locking=True)

    one = constant_op.constant([[1.]])

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.extended.call_for_each_replica(grad_fn, args=(one, ))
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.extended.read_var(v)
        before_list.append(fetched)
        # The control dependency makes the "before" read precede the
        # reduction, and therefore the update that consumes it.
        with ops.control_dependencies([fetched]):
          # TODO(yuefengz): support non-Mirrored variable as destinations.
          g = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, g, destinations=v)
        # The "after" read is ordered behind the update ops.
        with ops.control_dependencies(
            d.extended.update(v, update, args=(g, ), group=False)):
          after_list.append(d.extended.read_var(v))
      return before_list, after_list

    before_out, after_out = step()

    # Not enough GPUs on this machine for the strategy: skip the run.
    if context.num_gpus() < d.extended._num_gpus_per_worker:
      return True

    # Only the local task or the chief initializes the variables.
    if (not task_type or multi_worker_util.is_chief(
        d.extended._cluster_spec, task_type, task_id)):
      variables.global_variables_initializer().run()

    # Workers waiting for chief worker's initializing variables.
    self._init_condition.acquire()
    self._init_reached += 1
    while self._init_reached != num_workers:
      self._init_condition.wait()
    self._init_condition.notify_all()
    self._init_condition.release()

    for i in range(10):
      b, a = sess.run((before_out, after_out))
      # Keep the value before the first step and the value after the last.
      if i == 0:
        before, = b
      after, = a

    error_before = abs(before - 1)
    error_after = abs(after - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
    return error_after < error_before
def _test_minimize_loss_graph(self, task_type, task_id, num_gpus):
  """Runs ten graph-mode update steps and checks that the error shrinks.

  A single `Dense(1)` layer without bias is trained on the broadcast constant
  `[[1.]]` with the loss `(layer(x) - 1) ** 2`, applying
  `v.assign_sub(0.05 * g)` to each variable after a SUM reduction of its
  gradient.

  Args:
    task_type: type of the current task.
    task_id: index of the current task.
    num_gpus: forwarded to `self._get_test_objects`.

  Returns:
    True if fewer GPUs are available than the strategy requires (nothing is
    run in that case), otherwise whether the error after the last step is
    smaller than the error before the first step.
  """
  d, master_target, sess_config = self._get_test_objects(
      task_type, task_id, num_gpus)
  assert hasattr(d, '_cluster_spec') and d._cluster_spec
  num_workers = len(d._cluster_spec.as_dict().get(WORKER))
  # A chief job, when present, counts as one more participant.
  if CHIEF in d._cluster_spec.as_dict():
    num_workers += 1

  with ops.Graph().as_default(), \
       self.test_session(target=master_target, config=sess_config) as sess, \
       d.scope():
    l = core.Dense(1, use_bias=False)

    def loss_fn(x):
      # Squared distance between the layer output and 1.
      y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
      return y * y

    # TODO(yuefengz, apassos): eager.backprop.implicit_grad is not safe for
    # multiple graphs (b/111216820).
    def grad_fn(x):
      # Returns (gradient, variable) pairs for every trainable variable.
      loss = loss_fn(x)
      var_list = (
          variables.trainable_variables() + ops.get_collection(
              ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
      grads = gradients.gradients(loss, var_list)
      ret = list(zip(grads, var_list))
      return ret

    def update(v, g):
      return v.assign_sub(0.05 * g, use_locking=True)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Run forward & backward to get gradients, variables list.
      g_v = d.call_for_each_tower(grad_fn, one)
      # Update the variables using the gradients and the update() function.
      before_list = []
      after_list = []
      for g, v in g_v:
        fetched = d.read_var(v)
        before_list.append(fetched)
        # The control dependency makes the "before" read precede the
        # reduction, and therefore the update that consumes it.
        with ops.control_dependencies([fetched]):
          # TODO(yuefengz): support non-Mirrored variable as destinations.
          g = d.reduce(
              variable_scope.VariableAggregation.SUM, g, destinations=v)
        # The "after" read is ordered behind the update ops.
        with ops.control_dependencies(d.unwrap(d.update(v, update, g))):
          after_list.append(d.read_var(v))
      return before_list, after_list

    before_out, after_out = step()

    # Not enough GPUs on this machine for the strategy: skip the run.
    if context.num_gpus() < d._num_gpus_per_worker:
      return True

    # Only the chief initializes the variables.
    if multi_worker_util.is_chief(d._cluster_spec, task_type, task_id):
      variables.global_variables_initializer().run()

    # Workers waiting for chief worker's initializing variables.
    self._init_condition.acquire()
    self._init_reached += 1
    while self._init_reached != num_workers:
      self._init_condition.wait()
    self._init_condition.notify_all()
    self._init_condition.release()

    for i in range(10):
      b, a = sess.run((before_out, after_out))
      # Keep the value before the first step and the value after the last.
      if i == 0:
        before, = b
      after, = a

    error_before = abs(before - 1)
    error_after = abs(after - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
    return error_after < error_before
def _initialize_devices(self, num_gpus_per_worker, cluster_spec, task_type,
                        task_id):
  """Initialize internal devices.

  It creates variable devices and compute devices. Variables and operations
  will be assigned to them respectively. We have one compute device per tower.
  The variable device is a device function or device string. The default
  variable device assigns variables to parameter servers in a round-robin
  fashion.

  Args:
    num_gpus_per_worker: number of local GPUs or GPUs per worker.
    cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
      cluster configurations, or None for local mode.
      NOTE(review): this method calls `cluster_spec.as_dict()` directly, so
      the caller presumably normalizes it to a ClusterSpec first -- confirm.
    task_type: the current task type; a falsy value falls back to "worker".
    task_id: the current task id; a falsy value falls back to 0.

  Raises:
    ValueError: if the cluster_spec doesn't have ps jobs.
  """
  self._task_type = task_type or "worker"
  self._task_id = task_id or 0
  self._worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

  # TODO(yuefengz): maybe clearer to split it into two classes, one for
  # the distributed case and one for the local case, once we have the factory
  # class/method.

  # Define compute devices which is a list of device strings and one for each
  # tower. When there are GPUs, replicate operations on these GPUs. Otherwise,
  # place operations on CPU.
  if cluster_spec is None:
    # Local mode.
    if num_gpus_per_worker > 0:
      self._compute_devices = list(
          map("/device:GPU:{}".format, range(num_gpus_per_worker)))
    else:
      self._compute_devices = [_LOCAL_CPU]
  else:
    # Distributed mode.
    if num_gpus_per_worker > 0:
      self._compute_devices = [
          "%s/device:GPU:%d" % (self._worker_device, i)
          for i in range(num_gpus_per_worker)
      ]
    else:
      self._compute_devices = [self._worker_device]

  self._compute_devices = list(
      map(device_util.resolve, self._compute_devices))
  self._canonical_compute_device_set = set(self._compute_devices)

  # Define variable device which is a device string in the local case and a
  # device function in the distributed case. It is used to open a device scope
  # where variables are defined.
  # The `_parameter_devices` is needed for the `parameter_devices` property
  # and is a list of all variable devices.
  if cluster_spec is None:
    # Local mode. If there is only one GPU, put everything on that GPU.
    # Otherwise, place variables on CPU.
    if num_gpus_per_worker == 1:
      assert len(list(self._compute_devices)) == 1
      self._variable_device = _LOCAL_GPU_0
      self._parameter_devices = [_LOCAL_GPU_0]
    else:
      self._variable_device = _LOCAL_CPU
      self._parameter_devices = [_LOCAL_CPU]
  else:
    # Distributed mode. Place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
      raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=self._worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # Parameter devices are all tasks of the "ps" job. Materialize the list:
    # on Python 3 a bare `map` is a one-shot iterator, which would appear
    # empty after its first traversal and would not match the list stored in
    # local mode.
    self._parameter_devices = list(
        map("/job:ps/task:{}".format, range(num_ps_replicas)))

  # Define the default device in cross-tower mode. In the distributed case, we
  # set the default device to the corresponding worker to prevent these ops
  # from being placed on other workers.
  if cluster_spec is None:
    self._default_device = None
  else:
    self._default_device = self._worker_device

  # Local mode is always chief; otherwise defer to the cluster layout.
  self._is_chief = cluster_spec is None or multi_worker_util.is_chief(
      cluster_spec, task_type, task_id)
def _initialize_multi_worker(self, cluster_resolver):
  """Initializes the object for multi-worker training.

  Args:
    cluster_resolver: supplies `cluster_spec()`, `task_type`, `task_id`,
      `rpc_layer` and (for resolvers other than `TFConfigClusterResolver`)
      `num_accelerators()` for the current task.

  Raises:
    ValueError: if `task_type` or `task_id` is None, or if the worker count
      for the cluster is zero.
  """
  cluster_spec = multi_worker_util.normalize_cluster_spec(
      cluster_resolver.cluster_spec())
  task_type = cluster_resolver.task_type
  task_id = cluster_resolver.task_id
  if task_type is None or task_id is None:
    raise ValueError(
        "When `cluster_spec` is given, you must also specify "
        "`task_type` and `task_id`.")
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  self._num_workers = multi_worker_util.worker_count(
      cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError(
        "No `worker`, `chief` or `evaluator` tasks can be found "
        "in `cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
  self._host_input_device = numpy_dataset.SingleDevice(
      self._worker_device)

  # Configure collective ops on the eager context, unless this object was
  # marked as running in local or standalone client mode.
  if (ops.executing_eagerly_outside_functions() and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    context.context().configure_collective_ops(
        collective_leader=multi_worker_util.collective_leader(
            cluster_spec, task_type, task_id),
        scoped_allocator_enabled_ops=("CollectiveReduce", ),
        use_nccl_communication=(
            self._communication ==
            cross_device_ops_lib.CollectiveCommunication.NCCL),
        device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
    self._collective_ops_configured = True

  # Starting a std server in eager mode and in independent worker mode.
  if (context.executing_eagerly() and
      not getattr(self, "_std_server_started", False) and
      not getattr(self, "_local_or_standalone_client_mode", False)):
    # Checking _local_or_standalone_client_mode as well because we should not
    # create the std server in standalone client mode.
    config_proto = config_pb2.ConfigProto()
    config_proto = self._update_config_proto(config_proto)
    server_def = tensorflow_server_pb2.ServerDef(
        cluster=cluster_spec.as_cluster_def(),
        default_session_config=config_proto,
        job_name=task_type,
        task_index=task_id,
        protocol=cluster_resolver.rpc_layer or "grpc")
    context.context().enable_collective_ops(server_def)
    # Remember that the server exists so a later call does not start it again.
    self._std_server_started = True
    # The `ensure_initialized` is needed before calling
    # `context.context().devices()`.
    context.context().ensure_initialized()
    logging.info(
        "Enabled multi-worker collective ops with available devices: %r",
        context.context().devices())

  # TODO(yuefengz): The `num_gpus` is only for this particular task. It
  # assumes all workers have the same number of GPUs. We should remove this
  # assumption by querying all tasks for their numbers of GPUs.
  # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
  # some cases.
  if isinstance(cluster_resolver, TFConfigClusterResolver):
    num_gpus = context.num_gpus()
  else:
    num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

  # One device string per GPU, or the bare worker device when there is none.
  if num_gpus:
    local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                          for i in range(num_gpus))
  else:
    local_devices = (self._worker_device, )

  self._collective_keys = cross_device_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
  self._input_workers = input_lib.InputWorkers(
      self._device_map, [(self._worker_device, self.worker_devices)])
  self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
      num_workers=self._num_workers,
      num_gpus_per_worker=num_gpus,
      collective_keys=self._collective_keys)

  # Add a default device so that ops without specified devices will not end up
  # on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # Save the num_gpus_per_worker and rpc_layer for configure method.
  self._num_gpus_per_worker = num_gpus
  self._rpc_layer = cluster_resolver.rpc_layer
  self._warn_nccl_no_gpu()

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
      "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
      "communication = %s", cluster_spec.as_dict(), task_type, task_id,
      self._num_workers, local_devices, self._communication)
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if `task_type` or `task_id` is not given, or if the
        cluster_spec doesn't have ps jobs.
    """
    assert cluster_spec
    if not task_type or task_id is None:
        raise ValueError("When `cluster_spec` is given, you must also specify "
                         "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

    # Build the device string from the validated arguments rather than from
    # `self._task_type` / `self._task_id`: those attributes are only assigned
    # at the end of this method, so reading them here would use whatever they
    # held (if anything) before this call.
    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)

    # Define compute devices which is a list of device strings and one for each
    # replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus_per_worker > 0:
        self._compute_devices = [
            "%s/device:GPU:%d" % (self._worker_device, i)
            for i in range(num_gpus_per_worker)
        ]
    else:
        self._compute_devices = [self._worker_device]

    self._compute_devices = [
        device_util.resolve(device) for device in self._compute_devices
    ]
    self._canonical_compute_device_set = set(self._compute_devices)

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=self._worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job. It is materialized as a real list because a bare
    # `map(...)` is a one-shot iterator on Python 3 and would be empty after
    # its first traversal.
    self._parameter_devices = [
        "/job:ps/task:{}".format(i) for i in range(num_ps_replicas)
    ]

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = self._worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._compute_devices,
        self._variable_device)
def _checkpoint_if_preempted(self):
    """Checkpoint if any worker has received a preemption signal.

    This function handles preemption signal reported by any worker in the
    cluster. The current implementation relies on the fact that all workers in a
    MultiWorkerMirroredStrategy training cluster have a step number difference
    maximum of 1.

    - If the signal comes from the worker itself (i.e., where this failure
      handler sits), the worker will notify all peers to checkpoint after they
      finish CURRENT_STEP+1 steps, where CURRENT_STEP is the step this worker
      has just finished. And the worker will wait for all peers to acknowledge
      that they have received its preemption signal and the final-step number
      before the worker proceeds on training the final step.
    - If the signal comes from another member in the cluster but NO final-step
      info is available, proceed on training, because it will be available
      after finishing the next step.
    - If the signal comes from some other member in the cluster, and final-step
      info is available, if the worker has not finished these steps yet, keep
      training; otherwise, checkpoint and exit with a cluster-recognized
      restart code.
    """
    # Two distinct config keys are used: one for the first checkpoint after a
    # signal, and one for the final checkpoint once the grace-period countdown
    # has started.
    if self._final_checkpoint_countdown:
        run_count_config_key = _FINAL_RUN_COUNT_KEY
    else:
        run_count_config_key = _INITIAL_RUN_COUNT_KEY

    if self._received_checkpoint_step.is_set():
        # A step-to-checkpoint has been published for the cluster; read it.
        run_count_key = context.context().get_config_key_value(
            run_count_config_key)

        # Only act once this worker's run counter matches the agreed step. The
        # key value is compared as a string, matching how it is written below.
        if run_count_key == str(self._run_counter):
            self._save_checkpoint()

            if self._time_to_exit():
                # Stop both watcher threads before cleaning up and exiting.
                self._stop_poll_termination_signal_thread()
                self._stop_cluster_wise_termination_watcher_thread()
                # Non-chief workers remove the parent directory of their write
                # checkpoint manager, but only when this API created the
                # manager itself.
                # NOTE(review): presumably this is a per-worker scratch
                # directory -- confirm against where the manager is built.
                if self._api_made_checkpoint_manager and (
                        not multi_worker_util.is_chief(
                            cluster_spec=self._cluster_resolver.cluster_spec(),
                            task_type=self._cluster_resolver.task_type,
                            task_id=self._cluster_resolver.task_id)):
                    gfile.DeleteRecursively(
                        os.path.dirname(
                            self._write_checkpoint_manager.directory))
                logging.info(
                    'PreemptionCheckpointHandler: checkpoint saved. Exiting.'
                )

                # NOTE(review): `_exit_fn` presumably terminates the process;
                # verify before relying on code after this call.
                self._exit_fn()

            else:
                logging.info('Continue training for the grace period.')
                # Switch to the final-checkpoint key for later calls and re-arm
                # the event so the next published step can be detected.
                self._final_checkpoint_countdown = True
                self._received_checkpoint_step.clear()

    elif self._received_own_sigterm.is_set():
        # Only the worker who gets termination signal first among the cluster
        # will enter this branch. The following will happen in chronological
        # order:
        # 1. The worker just receives a preemption signal and enters this branch
        # for the first time. It will set a step-to-checkpoint and let the cluster
        # know.
        # 2. If there is a long grace period, it will also set
        # _final_checkpoint_countdown, so that during this grace period, it will
        # re-enter this branch to check if grace period is ending.
        # 3. If it is, set a step-to-checkpoint key again.
        if self._final_checkpoint_countdown:
            if self._target_time_for_termination < time.time():
                logging.info(
                    'Grace period almost ended. Final call to save a checkpoint!'
                )
            else:
                # Still inside the grace period: nothing to do on this call.
                return

        # Ask the cluster to checkpoint after the next step finishes.
        step_to_save_at = str(self._run_counter + 1)

        logging.info(
            'Termination caught in main thread on preempted worker')
        context.context().set_config_key_value(run_count_config_key,
                                               step_to_save_at)
        logging.info('%s set to %s', run_count_config_key, step_to_save_at)

        n_workers = multi_worker_util.worker_count(
            self._cluster_resolver.cluster_spec(),
            self._cluster_resolver.task_type)
        # Read one acknowledgement key per worker index.
        # NOTE(review): `get_config_key_value` presumably blocks until the key
        # has been set by the peer -- confirm.
        for i in range(n_workers):
            context.context().get_config_key_value(
                f'{_ACKNOWLEDGE_KEY}_{run_count_config_key}_{i}')
            logging.info(
                'Sigterm acknowledgement from replica %d received', i)

        self._setup_countdown_if_has_grace_period_and_not_already_counting_down(
        )
def _initialize_multi_worker(self, cluster_resolver):
    """Set up compute and variable devices for a multi-worker cluster.

    One compute device is created per replica on this task. Variables are
    handed to a device function (or device string) which, by default, assigns
    them to the parameter servers in a round-robin fashion.

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      ValueError: if the resolver has no task type or task id, or if the
        cluster doesn't have ps jobs.
    """
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if not isinstance(cluster_resolver, TFConfigClusterResolver):
        gpu_count = cluster_resolver.num_accelerators().get("GPU", 0)
    else:
        gpu_count = context.num_gpus()

    # Kept around for the configure method.
    self._num_gpus_per_worker = gpu_count

    raw_spec = cluster_resolver.cluster_spec()
    job_name = cluster_resolver.task_type
    task_index = cluster_resolver.task_id
    if not (job_name and task_index is not None):
        raise ValueError("When `cluster_spec` is given, you must also specify "
                         "`task_type` and `task_id`")
    spec = multi_worker_util.normalize_cluster_spec(raw_spec)
    assert spec.as_dict()

    host_device = "/job:%s/task:%d" % (job_name, task_index)
    self._input_host_device = numpy_dataset.SingleDevice(host_device)

    # Compute devices: one device string per replica. Operations are
    # replicated over the GPUs when there are any; otherwise they are placed
    # on the worker device itself.
    if gpu_count > 0:
        replica_devices = tuple(
            "%s/device:GPU:%d" % (host_device, gpu_index)
            for gpu_index in range(gpu_count))
    else:
        replica_devices = (host_device,)

    device_map = values.ReplicaDeviceMap(replica_devices)
    self._device_map = device_map
    self._input_workers = input_lib.InputWorkers(
        device_map, [(host_device, replica_devices)])

    # In distributed mode, place variables on ps jobs in a round-robin fashion.
    # Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    ps_tasks = spec.as_dict().get("ps", [])
    ps_count = len(ps_tasks)
    if ps_count == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=ps_count,
        worker_device=host_device,
        merge_devices=True,
        cluster=spec)

    # Backs the `parameter_devices` property: every task of the "ps" job.
    self._parameter_devices = tuple(
        "/job:ps/task:{}".format(ps_index) for ps_index in range(ps_count))

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = host_device

    self._is_chief = multi_worker_util.is_chief(spec, job_name, task_index)
    self._cluster_spec = spec
    self._task_type = job_name
    self._task_id = task_index

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
        "variable_device = %r", spec.as_dict(), job_name, task_index,
        ps_count, self._is_chief, self._device_map, self._variable_device)
def __init__(self, cluster_resolver, checkpoint, checkpoint_dir):
    """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`.
        You may also get it through the `cluster_resolver` attribute of the
        strategy in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption
        and loaded upon restart by the `CoordinatedCheckpointManager` API
        automatically.
      checkpoint_dir: a directory for the `CoordinatedCheckpointManager` to
        play with checkpoints. `CoordinatedCheckpointManager` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`.
        Since only one `tf.train.CheckpointManager` should be active in a
        particular directory at a time, this `checkpoint_dir` arg should
        preferably be separated from where the user saves their checkpoint
        for non-fault tolerance purpose.

    Raises:
      NotImplementedError: if the detected platform is neither
        `PlatformDevice.GCE_GPU` nor `PlatformDevice.INTERNAL`.
    """
    self._cluster_resolver = cluster_resolver
    self._checkpoint = checkpoint

    cluster_position = multi_worker_util.id_in_cluster(
        self._cluster_resolver.cluster_spec(),
        self._cluster_resolver.task_type,
        self._cluster_resolver.task_id)
    self._id_in_cluster = str(cluster_position)

    # Counts the calls made to `CoordinatedCheckpointManager.run` at the time
    # the most recent checkpoint was written.
    zero = constant_op.constant(0, dtype=dtypes.int64)
    self._checkpointed_runs = variables.Variable(
        initial_value=zero, trainable=False, name=_ITERATION_VARIABLE)
    if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
        setattr(self._checkpoint, _ITERATION_VARIABLE,
                self._checkpointed_runs)

    # Checkpoint managers. MultiWorkerMirroredStrategy needs the chief and the
    # other workers to be set up differently: the chief writes where it reads,
    # everyone else writes to a separate directory.
    self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
        checkpoint, directory=checkpoint_dir, max_to_keep=1)
    this_task_is_chief = multi_worker_util.is_chief(
        cluster_spec=cluster_resolver.cluster_spec(),
        task_type=cluster_resolver.task_type,
        task_id=cluster_resolver.task_id)
    if not this_task_is_chief:
        write_dir = _mwms_write_checkpoint_dir(
            checkpoint_dir, cluster_resolver.task_type,
            cluster_resolver.task_id, cluster_resolver.cluster_spec())
        self._write_checkpoint_manager = (
            checkpoint_management.CheckpointManager(
                checkpoint, write_dir, max_to_keep=1))
    else:
        self._write_checkpoint_manager = self._read_checkpoint_manager

    self._read_checkpoint_manager.restore_or_initialize()

    # Internal step counter, restored to the checkpointed iteration count when
    # training is restored. It goes up by one on every call to
    # `CoordinatedCheckpointManager.run`, which is why the user must pass a
    # single-step training function to `CoordinatedCheckpointManager.run`
    # rather than a multiple-step one.
    self._run_counter = self._checkpointed_runs.numpy()

    # Set when this worker itself has received a preemption signal.
    self._received_own_sigterm = threading.Event()

    # Set when some member (possibly this one) has received a preemption
    # signal and the step number to checkpoint at has been aligned.
    self._received_sigterm_and_step = threading.Event()

    # On interruption the cleanup methods for the local-signal thread and the
    # cluster-wise watcher thread are called explicitly before checkpointing
    # and exiting. When training finishes without interruption, cleanup is
    # left to __del__; since there is no guarantee when or whether __del__
    # runs, the threads are daemons so they cannot keep the program alive.
    watcher = threading.Thread(
        target=self._wait_for_signal,
        name='PeerTerminationWatcher-%s' % self._id_in_cluster,
        daemon=True)
    self._cluster_wise_termination_watcher_thread = watcher
    self._cluster_wise_termination_watcher_thread.start()

    self._poll_gce_signal_thread = None

    self._platform_device = gce_util.detect_platform()

    on_gce_gpu = self._platform_device is gce_util.PlatformDevice.GCE_GPU
    if not on_gce_gpu and (
            self._platform_device is not gce_util.PlatformDevice.INTERNAL):
        raise NotImplementedError(
            'CoordinatedCheckpointManager is only supported'
            ' for MultiWorkerMirroredStrategy with GPU.')

    if on_gce_gpu:
        self._start_polling_for_gce_signal()
        self._exit_code = gce_util._RESTARTABLE_EXIT_CODE
    else:
        self._start_watching_for_signal()
        self._exit_code = _RESTARTABLE_EXIT_CODE
def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training.

    Reads the cluster layout and this task's identity from `cluster_resolver`,
    configures collective ops (and, when none is configured yet, a coordination
    service) on the eager context, enables collective ops with a `ServerDef`
    when running eagerly, and finally builds the cross-device ops for the
    local devices.

    Args:
      cluster_resolver: object providing `cluster_spec()`, `task_type`,
        `task_id` and `rpc_layer`. An optional `port` attribute is used for
        the server definition when present; otherwise port 0 is used.

    Raises:
      ValueError: if `task_type` or `task_id` is None, or if
        `multi_worker_util.worker_count` reports no workers for this task
        type.
    """
    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
        raise ValueError(
            "When `cluster_spec` is given, you must also specify "
            "`task_type` and `task_id`.")
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id
    self._id_in_cluster = multi_worker_util.id_in_cluster(
        self._cluster_spec, self._task_type, self._task_id)

    self._num_workers = multi_worker_util.worker_count(
        cluster_spec, task_type)
    if not self._num_workers:
        raise ValueError(
            "No `worker`, `chief` or `evaluator` tasks can be found "
            "in `cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(
        self._worker_device)

    # Context-level collective configuration is skipped in local/standalone
    # client mode.
    if (ops.executing_eagerly_outside_functions() and
            not getattr(self, "_local_or_standalone_client_mode", False)):
        context.context().configure_collective_ops(
            collective_leader=multi_worker_util.collective_leader(
                cluster_spec, task_type, task_id),
            scoped_allocator_enabled_ops=("CollectiveReduce", ),
            device_filters=("/job:%s/task:%d" % (task_type, task_id), ))
        self._collective_ops_configured = True
        # Only configure a coordination service when none is set yet, and only
        # for tasks belonging to the coordinated jobs.
        if context.context().coordination_service is None:
            coordinated_jobs = ["chief", "worker"]
            if task_type in coordinated_jobs:
                context.context().configure_coordination_service(
                    service_type="standalone",
                    service_leader=multi_worker_util.coordination_leader(
                        cluster_spec),
                    coordinated_jobs=coordinated_jobs)

    # Starting a std server in eager mode and in independent worker mode.
    if (context.executing_eagerly() and
            not getattr(self, "_std_server_started", False) and
            not getattr(self, "_local_or_standalone_client_mode", False)):
        # Checking _local_or_standalone_client_mode as well because we should not
        # create the std server in standalone client mode.
        # Work on a copy so the context's own config object is not mutated here.
        config_proto = copy.deepcopy(context.context().config)
        config_proto = self._update_config_proto(config_proto)

        # If coordination service is enabled, use its internal heartbeat to detect
        # peer failures instead of the Python-level health check.
        if config_proto.experimental.coordination_config.service_type:
            self._enable_check_health = False

        # Not every resolver exposes a port; fall back to 0 when it is absent.
        if hasattr(cluster_resolver, "port"):
            port = cluster_resolver.port
        else:
            port = 0
        server_def = tensorflow_server_pb2.ServerDef(
            cluster=cluster_spec.as_cluster_def(),
            default_session_config=config_proto,
            job_name=task_type,
            task_index=task_id,
            protocol=cluster_resolver.rpc_layer or "grpc",
            port=port)
        context.context().enable_collective_ops(server_def)
        # Remembered so a later call does not enable collective ops again.
        self._std_server_started = True
        # The `ensure_initialized` is needed before calling
        # `context.context().devices()`.
        context.context().ensure_initialized()
        logging.info(
            "Enabled multi-worker collective ops with available devices: %r",
            context.context().devices())

    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    local_devices, local_device_type = self._initialize_local_devices(
        cluster_resolver, self._worker_device)
    if local_device_type == "TPU":
        tpu_strategy_util.initialize_tpu_system()

    self._collective_keys = cross_device_utils.CollectiveKeys(
        group_key_start=1 + self._collective_key_base)
    # The group spans every local device on every worker.
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        devices=local_devices,
        group_size=len(local_devices) * self._num_workers,
        options=self._communication_options,
        collective_keys=self._collective_keys)
    # CrossDeviceOps for per host tensors.
    self._host_cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        devices=[self._worker_device],
        group_size=self._num_workers,
        options=self._communication_options,
        collective_keys=self._collective_keys)
    super(CollectiveAllReduceExtended,
          self)._initialize_single_worker(local_devices)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    # Save the num_devices_per_worker and rpc_layer for configure method.
    self._num_devices_per_worker = len(local_devices)
    self._local_device_type = local_device_type
    self._rpc_layer = cluster_resolver.rpc_layer

    self._warn_nccl_no_gpu()

    # The Python-level health check only runs eagerly, and only when it has
    # not been disabled (for instance by the coordination-service check above).
    if self._enable_check_health and context.executing_eagerly():
        self._start_check_health_thread()
    else:
        logging.info("Check health not enabled.")

    logging.info(
        "MultiWorkerMirroredStrategy with cluster_spec = %r, task_type = %r, "
        "task_id = %r, num_workers = %r, local_devices = %r, "
        "communication = %s", cluster_spec.as_dict(), task_type, task_id,
        self._num_workers, local_devices,
        self._communication_options.implementation)
def _initialize_multi_worker(self, cluster_resolver):
    """Initializes the object for multi-worker training.

    Reads the cluster layout and this task's identity from `cluster_resolver`,
    derives the local devices, builds the collective all-reduce cross-device
    ops and, when running eagerly outside local/standalone client mode,
    enables collective ops with a `ServerDef` for this task.

    Args:
      cluster_resolver: object providing `cluster_spec()`, `task_type`,
        `task_id`, `rpc_layer` and `num_accelerators()`.

    Raises:
      ValueError: if `task_type` or `task_id` is None, or if
        `multi_worker_util.worker_count` reports no workers for this task
        type.
    """
    # TODO(yuefengz): The `num_gpus` is only for this particular task. It
    # assumes all workers have the same number of GPUs. We should remove this
    # assumption by querying all tasks for their numbers of GPUs.
    # TODO(b/126786766): TFConfigClusterResolver returns wrong number of GPUs in
    # some cases.
    if isinstance(cluster_resolver, TFConfigClusterResolver):
        num_gpus = context.num_gpus()
    else:
        num_gpus = cluster_resolver.num_accelerators().get("GPU", 0)

    cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_resolver.cluster_spec())
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if task_type is None or task_id is None:
        raise ValueError("When `cluster_spec` is given, you must also specify "
                         "`task_type` and `task_id`.")

    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
    if not self._num_workers:
        raise ValueError("No `worker`, `chief` or `evaluator` tasks can be found "
                         "in `cluster_spec`.")

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)

    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._host_input_device = numpy_dataset.SingleDevice(self._worker_device)
    # One device per local GPU, or the worker device itself when there are no
    # GPUs.
    if num_gpus:
        local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                              for i in range(num_gpus))
    else:
        local_devices = (self._worker_device,)

    self._collective_keys = cross_device_utils.CollectiveKeys()
    super(CollectiveAllReduceExtended, self)._initialize_local(local_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(self._worker_device, self.worker_devices)])
    self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
        num_workers=self._num_workers,
        num_gpus_per_worker=num_gpus,
        collective_keys=self._collective_keys)

    # Add a default device so that ops without specified devices will not end up
    # on other workers.
    self._default_device = "/job:%s/task:%d" % (task_type, task_id)

    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    # Save the num_gpus_per_worker and rpc_layer for configure method.
    self._num_gpus_per_worker = num_gpus
    self._rpc_layer = cluster_resolver.rpc_layer

    logging.info(
        "Multi-worker CollectiveAllReduceStrategy with cluster_spec = %r, "
        "task_type = %r, task_id = %r, num_workers = %r, local_devices = %r, "
        "communication = %s", cluster_spec.as_dict(), task_type, task_id,
        self._num_workers, local_devices, self._communication)

    if (context.executing_eagerly() and
            not getattr(self, "_std_server_started", False) and
            not getattr(self, "_local_or_standalone_client_mode", False)):
        # Checking _local_or_standalone_client_mode as well because we should not
        # create the std server in standalone client mode.
        config_proto = config_pb2.ConfigProto()
        config_proto = self._update_config_proto(config_proto)
        server_def = tensorflow_server_pb2.ServerDef(
            cluster=cluster_spec.as_cluster_def(),
            default_session_config=config_proto,
            job_name=task_type,
            task_index=task_id,
            protocol=cluster_resolver.rpc_layer or "grpc")
        context.context().enable_collective_ops(server_def)
        # Remembered so a later call does not enable collective ops again.
        self._std_server_started = True
        # The `ensure_initialized` is needed before calling
        # `context.context().devices()`; without it the device list logged
        # below may not be populated yet.
        context.context().ensure_initialized()
        logging.info(
            "Enabled multi-worker collective ops with available devices: %r",
            context.context().devices())
def __init__(self,
             cluster_resolver,
             checkpoint,
             checkpoint_dir,
             termination_config=None):
    """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`.
        You may also get it through the `cluster_resolver` attribute of the
        strategy in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption
        and loaded upon restart by the `WorkerPreemptionHandler` API
        automatically.
      checkpoint_dir: a directory for the `WorkerPreemptionHandler` to play
        with checkpoints. `WorkerPreemptionHandler` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`.
        Since only one `tf.train.CheckpointManager` should be active in a
        particular directory at a time, this `checkpoint_dir` arg should
        preferably be separated from where the user saves their checkpoint
        for non-fault tolerance purpose.
      termination_config: a `TerminationConfig` object to configure for a
        platform other than Google Borg or GCP. When omitted or `None`, a
        fresh `TerminationConfig()` is created for this handler.
    """
    # Build the default per call. A `TerminationConfig()` in the signature
    # would be evaluated once at definition time and the same instance would
    # then be shared by every handler created without an explicit config.
    if termination_config is None:
        termination_config = TerminationConfig()

    self._cluster_resolver = cluster_resolver
    self._checkpoint = checkpoint
    self._id_in_cluster = str(
        multi_worker_util.id_in_cluster(
            self._cluster_resolver.cluster_spec(),
            self._cluster_resolver.task_type,
            self._cluster_resolver.task_id))

    # The number of calls to `WorkerPreemptionHandler.run` when the latest
    # checkpoint was saved.
    self._checkpointed_runs = variables.Variable(
        initial_value=constant_op.constant(0, dtype=dtypes.int64),
        trainable=False,
        name=_ITERATION_VARIABLE)
    # Attach the counter to the checkpoint unless an attribute with that name
    # is already present.
    if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
        setattr(self._checkpoint, _ITERATION_VARIABLE,
                self._checkpointed_runs)

    # Make CheckpointManagers. MultiWorkerMirroredStrategy requires different
    # setup on chief and on other workers.
    self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
        checkpoint, directory=checkpoint_dir, max_to_keep=1)
    if multi_worker_util.is_chief(
            cluster_spec=cluster_resolver.cluster_spec(),
            task_type=cluster_resolver.task_type,
            task_id=cluster_resolver.task_id):
        self._write_checkpoint_manager = self._read_checkpoint_manager
    else:
        self._write_checkpoint_manager = (
            checkpoint_management.CheckpointManager(
                checkpoint,
                _mwms_write_checkpoint_dir(checkpoint_dir,
                                           cluster_resolver.task_type,
                                           cluster_resolver.task_id,
                                           cluster_resolver.cluster_spec()),
                max_to_keep=1))

    self._read_checkpoint_manager.restore_or_initialize()

    # Grace period countdown. Set to True for all workers once they finish
    # timing saving a checkpoint. Once entering this phase, new
    # preemption/maintenance notice will not be handled, since the whole cluster
    # goes down as the worker who first initiates the grace period goes down.
    self._final_checkpoint_countdown = False

    self._estimated_run_time = 0

    # An internal step counter that's restored to checkpointed_iterations when
    # training is restored. It increments by one every time
    # `WorkerPreemptionHandler.run` is called. Note that in this case, the
    # user must pass a single-step training function to
    # `WorkerPreemptionHandler.run` instead of a multiple-step one.
    self._run_counter = self._checkpointed_runs.numpy()

    # The worker itself has received preemption signal.
    self._received_own_sigterm = threading.Event()

    # Some member (could be oneself) has received preemption signal, and the
    # step number to save a checkpoint has been aligned.
    self._received_checkpoint_step = threading.Event()

    self._platform_device = gce_util.detect_platform()

    # Fill in whatever the caller's config left unspecified, based on the
    # detected platform.
    completed_termination_config = _complete_config_for_environement(
        self._platform_device, termination_config)
    self._termination_watcher_function = (
        completed_termination_config.termination_watcher_function)
    self._exit_fn = completed_termination_config.exit_fn
    self._grace_period = completed_termination_config.time_till_termination

    # When training is interrupted, we explicitly call the cleanup methods for
    # the thread watching for local worker's termination signal and the thread
    # watching for clusterwise information before we save a checkpoint and exit.
    # In the final chapter of the training where no interruption is encountered,
    # we rely on __del__ to clean up. However, there is no guarantee when or
    # whether __del__ is executed, thus we make the threads daemon to avoid it
    # preventing program from exit.
    self._cluster_wise_termination_watcher_thread = threading.Thread(
        target=self._watch_step_to_save_key,
        name='PeerTerminationWatcher-%s' % self._id_in_cluster,
        daemon=True)
    logging.info('Start watcher for peer\'s signal.')
    self._cluster_wise_termination_watcher_thread.start()

    self._poll_termination_signal_thread = None

    # Poll with the configured watcher function when there is one; otherwise
    # fall back to watching for a signal.
    if completed_termination_config.termination_watcher_function:
        self._start_polling_for_termination_signal()
    else:
        self._start_watching_for_signal()
def __init__(self, cluster_resolver, checkpoint, checkpoint_dir):
    """Creates the failure handler.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`.
        You may also get it through the `cluster_resolver` attribute of the
        strategy in use.
      checkpoint: a `tf.train.Checkpoint` that will be saved upon preemption
        and loaded upon restart by the `CoordinatedCheckpointManager` API
        automatically.
      checkpoint_dir: a directory for the `CoordinatedCheckpointManager` to
        play with checkpoints. `CoordinatedCheckpointManager` will create a
        `tf.train.CheckpointManager` to manage the passed-in `checkpoint`.
        Since only one `tf.train.CheckpointManager` should be active in a
        particular directory at a time, this `checkpoint_dir` arg should
        preferably be separated from where the user saves their checkpoint
        for non-fault tolerance purpose.
    """
    self._cluster_resolver = cluster_resolver
    self._checkpoint = checkpoint

    cluster_position = multi_worker_util.id_in_cluster(
        self._cluster_resolver.cluster_spec(),
        self._cluster_resolver.task_type,
        self._cluster_resolver.task_id)
    self._id_in_cluster = str(cluster_position)

    # Counts the calls made to `CoordinatedCheckpointManager.run` at the time
    # the most recent checkpoint was written.
    zero = constant_op.constant(0, dtype=dtypes.int64)
    self._checkpointed_runs = variables.Variable(
        initial_value=zero, trainable=False, name=_ITERATION_VARIABLE)
    if not hasattr(self._checkpoint, _ITERATION_VARIABLE):
        setattr(self._checkpoint, _ITERATION_VARIABLE,
                self._checkpointed_runs)

    # Checkpoint managers. MultiWorkerMirroredStrategy needs the chief and the
    # other workers to be set up differently: the chief writes where it reads,
    # everyone else writes to a separate directory.
    self._read_checkpoint_manager = checkpoint_management.CheckpointManager(
        checkpoint, directory=checkpoint_dir, max_to_keep=1)
    this_task_is_chief = multi_worker_util.is_chief(
        cluster_spec=cluster_resolver.cluster_spec(),
        task_type=cluster_resolver.task_type,
        task_id=cluster_resolver.task_id)
    if not this_task_is_chief:
        write_dir = _mwms_write_checkpoint_dir(
            checkpoint_dir, cluster_resolver.task_type,
            cluster_resolver.task_id, cluster_resolver.cluster_spec())
        self._write_checkpoint_manager = (
            checkpoint_management.CheckpointManager(
                checkpoint, write_dir, max_to_keep=1))
    else:
        self._write_checkpoint_manager = self._read_checkpoint_manager

    self._read_checkpoint_manager.restore_or_initialize()

    # Internal step counter, restored to the checkpointed iteration count when
    # training is restored. It goes up by one on every call to
    # `CoordinatedCheckpointManager.run`, which is why the user must pass a
    # single-step training function to `CoordinatedCheckpointManager.run`
    # rather than a multiple-step one.
    self._run_counter = self._checkpointed_runs.numpy()

    # Set when this worker itself has received a preemption signal.
    self._received_own_sigterm = threading.Event()

    # Set when some member (possibly this one) has received a preemption
    # signal and the step number to checkpoint at has been aligned.
    self._received_sigterm_and_step = threading.Event()

    # TODO(wxinyi): Enforce that only one instance of this class is created
    # per program.
    # TODO(wxinyi): make the thread non-daemon.
    signal_waiter = threading.Thread(
        target=self._wait_for_signal, daemon=True)
    signal_waiter.start()

    signal.signal(signal.SIGTERM, self._sigterm_handler_fn)
def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                             task_type, task_id):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      AssertionError: if `cluster_spec` is falsy.
      ValueError: if `task_type` is falsy or `task_id` is None, or if the
        cluster_spec doesn't have ps jobs.
    """
    assert cluster_spec
    if not task_type or task_id is None:
        raise ValueError(
            "When `cluster_spec` is given, you must also specify "
            "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

    # Build the device string from the arguments that were just validated.
    # `self._task_type` / `self._task_id` are only assigned at the end of this
    # method, so reading them here would rely on state set up elsewhere.
    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)

    # Define compute devices which is a list of device strings and one for
    # each replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus_per_worker > 0:
        self._compute_devices = [
            "%s/device:GPU:%d" % (self._worker_device, i)
            for i in range(num_gpus_per_worker)
        ]
    else:
        self._compute_devices = [self._worker_device]

    # Pass every compute device through `device_util.resolve` and keep a set
    # of the results for membership checks.
    self._compute_devices = [
        device_util.resolve(device) for device in self._compute_devices
    ]
    self._canonical_compute_device_set = set(self._compute_devices)

    # In distributed mode, place variables on ps jobs in a round-robin
    # fashion. Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=self._worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job. It is materialized as a real list: a bare `map`
    # object is a one-shot iterator on Python 3 and would appear empty after
    # its first traversal.
    self._parameter_devices = [
        "/job:ps/task:{}".format(ps_index)
        for ps_index in range(num_ps_replicas)
    ]

    # Add a default device so that ops without specified devices will not end
    # up on other workers.
    self._default_device = self._worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, compute_devices = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._compute_devices,
        self._variable_device)
def _initialize_multi_worker(self, cluster_resolver):
    """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      cluster_resolver: a descendant of `ClusterResolver` object.

    Raises:
      AssertionError: if the normalized cluster spec is empty.
      ValueError: if the resolver has no `task_type` or its `task_id` is
        None, or if the cluster doesn't have ps jobs.
    """
    # `num_accelerators()` may hand back either a plain GPU count or a mapping
    # from accelerator type to count; reduce both forms to the GPU count so
    # the comparison and `range()` below always operate on an int.
    num_gpus = cluster_resolver.num_accelerators()
    if isinstance(num_gpus, dict):
        num_gpus = num_gpus.get("GPU", 0)

    cluster_spec = cluster_resolver.cluster_spec()
    task_type = cluster_resolver.task_type
    task_id = cluster_resolver.task_id
    if not task_type or task_id is None:
        raise ValueError(
            "When `cluster_spec` is given, you must also specify "
            "`task_type` and `task_id`")
    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
    assert cluster_spec.as_dict()

    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    self._input_host_device = numpy_dataset.SingleDevice(worker_device)

    # Define compute devices which is a list of device strings and one for
    # each replica. When there are GPUs, replicate operations on these GPUs.
    # Otherwise, place operations on CPU.
    if num_gpus > 0:
        compute_devices = tuple(
            "%s/device:GPU:%d" % (worker_device, i) for i in range(num_gpus))
    else:
        compute_devices = (worker_device,)

    self._device_map = values.ReplicaDeviceMap(compute_devices)
    self._input_workers = input_lib.InputWorkers(
        self._device_map, [(worker_device, compute_devices)])

    # In distributed mode, place variables on ps jobs in a round-robin
    # fashion. Note that devices returned from `replica_device_setter` are not
    # canonical and therefore we don't canonicalize all variable devices to
    # make them consistent.
    # TODO(yuefengz): support passing a strategy object to control variable
    # assignment.
    # TODO(yuefengz): merge the logic of replica_device_setter into this
    # class.
    num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
    if num_ps_replicas == 0:
        raise ValueError("The cluster spec needs to have `ps` jobs.")
    self._variable_device = device_setter.replica_device_setter(
        ps_tasks=num_ps_replicas,
        worker_device=worker_device,
        merge_devices=True,
        cluster=cluster_spec)

    # The `_parameter_devices` is needed for the `parameter_devices` property
    # and is a list of all variable devices. Here parameter devices are all
    # tasks of the "ps" job.
    self._parameter_devices = tuple(
        map("/job:ps/task:{}".format, range(num_ps_replicas)))

    # Add a default device so that ops without specified devices will not end
    # up on other workers.
    self._default_device = worker_device

    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
    self._cluster_spec = cluster_spec
    self._task_type = task_type
    self._task_id = task_id

    logging.info(
        "Multi-worker ParameterServerStrategy with "
        "cluster_spec = %r, task_type = %r, task_id = %r, "
        "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
        "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
        num_ps_replicas, self._is_chief, self._device_map,
        self._variable_device)
def __init__(self,
             cluster_resolver,
             checkpoint_or_checkpoint_manager,
             checkpoint_dir=None,
             termination_config=None):
    """Creates the `PreemptionCheckpointHandler`.

    Args:
      cluster_resolver: a `tf.distribute.cluster_resolver.ClusterResolver`
        object. You may also obtain it through the `cluster_resolver`
        attribute of the distribution strategy in use.
      checkpoint_or_checkpoint_manager: a `tf.train.CheckpointManager` or a
        `tf.train.Checkpoint`. If you are using a `tf.train.CheckpointManager`
        to manage checkpoints outside the `PreemptionCheckpointHandler` for
        backup purpose as well, pass it as `checkpoint_or_checkpoint_manager`
        argument. Otherwise, pass a `tf.train.Checkpoint` and the
        `PreemptionCheckpointHandler` will create
        a `tf.train.CheckpointManager` to manage it in the `checkpoint_dir`.
      checkpoint_dir: a directory where the `PreemptionCheckpointHandler`
        saves and restores checkpoints. When a `PreemptionCheckpointHandler`
        is created, the latest checkpoint in the `checkpoint_dir` will be
        restored. (This is not needed if a `tf.train.CheckpointManager`
        instead of a `tf.train.Checkpoint` is passed as the
        `checkpoint_or_checkpoint_manager` argument.)
      termination_config: optional, a
        `tf.distribute.experimental.TerminationConfig` object to configure for
        a platform other than Google Borg or GCP.

    Raises:
      errors.InvalidArgumentError: if a `tf.train.Checkpoint` is passed
        without a `checkpoint_dir`.
      NotImplementedError: if the detected platform is a GCE TPU or GCE CPU
        device.
    """
    self._cluster_resolver = cluster_resolver
    if not cluster_resolver.cluster_spec().jobs:
        # For local-mode MultiWorkerMirroredStrategy, an empty cluster spec is
        # passed, and coordination service is not enabled nor is it needed
        # (since it's used for cross-worker communication). Thus we will
        # directly name the worker id and is_chief properties and also skip
        # the uploading/reading from coordination service logic.
        self._local_mode = True
        self._id_in_cluster = 'single_worker'
        self._is_chief = True
    else:
        self._local_mode = False
        self._id_in_cluster = str(
            multi_worker_util.id_in_cluster(
                self._cluster_resolver.cluster_spec(),
                self._cluster_resolver.task_type,
                self._cluster_resolver.task_id))
        self._is_chief = multi_worker_util.is_chief(
            cluster_spec=cluster_resolver.cluster_spec(),
            task_type=cluster_resolver.task_type,
            task_id=cluster_resolver.task_id)

    if isinstance(checkpoint_or_checkpoint_manager,
                  checkpoint_lib.Checkpoint) and not checkpoint_dir:
        # NOTE(review): assumes `errors` is TensorFlow's framework errors
        # module, whose `InvalidArgumentError` constructor takes
        # `(node_def, op, message)`. Passing only the message would fail with
        # a `TypeError` before the intended error could be raised, so the
        # first two positional arguments are supplied explicitly as `None`.
        raise errors.InvalidArgumentError(
            None, None,
            'When a checkpoint is passed, a '
            'checkpoint_dir must be passed as well'
            '.')

    # The number of calls to `PreemptionCheckpointHandler.run` when the latest
    # checkpoint was saved.
    self._checkpointed_runs = variables.Variable(
        initial_value=constant_op.constant(0, dtype=dtypes.int64),
        trainable=False,
        name=_ITERATION_VARIABLE)

    self._maybe_create_checkpoint_manager(checkpoint_or_checkpoint_manager,
                                          checkpoint_dir, cluster_resolver)

    # Attach the run counter to the checkpoint behind each manager unless that
    # checkpoint already exposes an attribute with the same name.
    if not hasattr(self._write_checkpoint_manager._checkpoint,
                   _ITERATION_VARIABLE):
        setattr(self._write_checkpoint_manager._checkpoint,
                _ITERATION_VARIABLE, self._checkpointed_runs)

    if not hasattr(self._read_checkpoint_manager._checkpoint,
                   _ITERATION_VARIABLE):
        setattr(self._read_checkpoint_manager._checkpoint,
                _ITERATION_VARIABLE, self._checkpointed_runs)

    self._read_checkpoint_manager.restore_or_initialize()

    # grace period countdown. Set to True for all workers once they finish
    # timing saving a checkpoint. Once entering this phase, new
    # preemption/maintenance notice will not be handled, since the whole
    # cluster goes down as the worker who first initiates the grace period
    # goes down.
    self._final_checkpoint_countdown = False

    self._estimated_run_time = 0

    # An internal step counter that's restored to checkpointed_iterations when
    # training is restored. It increments by one every time
    # `PreemptionCheckpointHandler.run` is called. Note that in this case, the
    # user must pass a single-step training function to
    # `PreemptionCheckpointHandler.run` instead of a multiple-step one.
    self._run_counter = self._checkpointed_runs.numpy()

    # The worker itself has received preemption signal.
    self._received_own_sigterm = threading.Event()

    # Some member (could be oneself) has received preemption signal, and the
    # step number to save a checkpoint has been aligned.
    self._received_checkpoint_step = threading.Event()

    self._platform_device = gce_util.detect_platform()

    if self._platform_device in (gce_util.PlatformDevice.GCE_TPU,
                                 gce_util.PlatformDevice.GCE_CPU):
        # While running MultiWorkerMirroredStrategy training with GPUs and
        # CPUs are the same on Borg, GCE CPU VM and GPU VM are different in
        # terms of live migration, grace period, etc. We can make it work upon
        # request.
        raise NotImplementedError(
            'PreemptionCheckpointHandler does not support '
            'training with TPU or CPU device on GCP.')

    completed_termination_config = _complete_config_for_environment(
        self._platform_device, termination_config)
    self._termination_watcher_fn = (
        completed_termination_config.termination_watcher_fn)
    self._exit_fn = completed_termination_config.exit_fn
    self._grace_period = completed_termination_config.grace_period

    if not self._local_mode:
        # When training is interrupted, we explicitly call the cleanup methods
        # for the thread watching for local worker's termination signal and
        # the thread watching for clusterwise information before we save a
        # checkpoint and exit. In the final chapter of the training where no
        # interruption is encountered, we rely on __del__ to clean up.
        # However, there is no guarantee when or whether __del__ is executed,
        # thus we make the threads daemon to avoid it preventing program from
        # exit.
        self._cluster_wise_termination_watcher_thread = threading.Thread(
            target=self._watch_step_to_save_key,
            name='PeerTerminationWatcher-%s' % self._id_in_cluster,
            daemon=True)
        logging.info('Start watcher for peer\'s signal.')
        self._cluster_wise_termination_watcher_thread.start()
    else:
        self._cluster_wise_termination_watcher_thread = None

    self._poll_termination_signal_thread = None

    # A configured watcher function means termination is detected by polling
    # it; otherwise fall back to watching for a signal.
    if completed_termination_config.termination_watcher_fn:
        self._start_polling_for_termination_signal()
    else:
        self._start_watching_for_signal()