def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
  """Returns collective all-reduce ops, their devices and a session target."""
  collective_keys = cross_tower_utils.CollectiveKeys(
      group_key_start=10 * num_gpus +
      MultiWorkerCollectiveAllReduceTest.collective_key_base,
      instance_key_start=num_gpus * 100 +
      MultiWorkerCollectiveAllReduceTest.collective_key_base,
      instance_key_with_id_start=num_gpus * 10000 +
      MultiWorkerCollectiveAllReduceTest.collective_key_base)
  if local_mode:
    collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
        1, num_gpus, collective_keys=collective_keys)
    if num_gpus:
      devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
    else:
      devices = ["/device:CPU:0"]
    return collective_all_reduce_ops, devices, "local"
  else:
    collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
        3, num_gpus, collective_keys=collective_keys)
    if num_gpus:
      devices = [
          "/job:%s/task:%d/device:GPU:%d" % (task_type, task_id, i)
          for i in range(num_gpus)
      ]
    else:
      devices = ["/job:%s/task:%d" % (task_type, task_id)]
    return (collective_all_reduce_ops, devices,
            self._workers[task_id].target)
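
# A minimal sketch of the key spacing used above. The helper and the concrete
# key base are hypothetical (the real base comes from
# MultiWorkerCollectiveAllReduceTest.collective_key_base); the point is that
# each GPU count starts its group/instance keys at distinct offsets, so
# CollectiveKeys built for different test configurations do not start from
# the same keys.
def _example_key_starts(num_gpus, collective_key_base=1000):
  return {
      "group_key_start": 10 * num_gpus + collective_key_base,
      "instance_key_start": num_gpus * 100 + collective_key_base,
      "instance_key_with_id_start": num_gpus * 10000 + collective_key_base,
  }

# _example_key_starts(2) ->
#   {"group_key_start": 1020, "instance_key_start": 1200,
#    "instance_key_with_id_start": 21000}
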
def _initialize_local_worker(self, container_strategy, num_gpus_per_worker):
  """Initializes the object for local training."""
  self._is_chief = True
  self._num_workers = 1

  if num_gpus_per_worker:
    local_devices = [
        "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = ["/device:CPU:0"]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=1,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  self._cluster_spec = None
  self._task_type = None
  self._task_id = None

  logging.info("CollectiveAllReduceStrategy with local_devices = %r",
               local_devices)
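
# For reference, the two device layouts the local initializer above produces
# (values follow directly from the code): with num_gpus_per_worker=2 the
# replica devices are ["/device:GPU:0", "/device:GPU:1"]; with no GPUs the
# single replica falls back to the CPU.
_example_local_devices = ["/device:GPU:%d" % i for i in range(2)]
# -> ["/device:GPU:0", "/device:GPU:1"]; num_gpus_per_worker=0 gives
# ["/device:CPU:0"] instead.
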
def _initialize_multi_worker(self, container_strategy, num_gpus_per_worker,
                             cluster_spec, task_type, task_id):
  """Initializes the object for multi-worker training."""
  if task_type is None or task_id is None:
    raise ValueError("When `cluster_spec` is given, you must also specify "
                     "`task_type` and `task_id`")
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)
  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
  self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
  if not self._num_workers:
    raise ValueError("No `worker` or `chief` tasks can be found in "
                     "`cluster_spec`.")

  self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                              task_id)

  worker_device = "/job:%s/task:%d" % (task_type, task_id)
  if num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceExtended, self).__init__(
      container_strategy,
      devices=local_devices,
      cross_device_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=self._num_workers,
          num_gpus_per_worker=num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  self._default_device = "/job:%s/task:%d" % (task_type, task_id)

  # `cluster_spec` was already normalized above.
  self._cluster_spec = cluster_spec
  self._task_type = task_type
  self._task_id = task_id

  logging.info(
      "Multi-worker CollectiveAllReduceStrategy with "
      "cluster_spec = %r, task_type = %r, task_id = %r, "
      "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
      task_type, task_id, self._num_workers, local_devices)
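
# A minimal sketch (hostnames hypothetical) of the cluster_spec shape the
# multi-worker initializer above accepts: at most one "chief" task plus a
# list of "worker" tasks. Per the code above, task_type="worker", task_id=1
# with two GPUs yields local_devices of "/job:worker/task:1/device:GPU:0" and
# "/job:worker/task:1/device:GPU:1", and the chief is counted toward
# self._num_workers alongside the workers.
_example_cluster_spec = {
    "chief": ["host0:2222"],
    "worker": ["host1:2222", "host2:2222"],
}
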
def _initialize(self, cluster_spec, task_type, task_id):
  """Initializes the object, either for local or multi-worker training."""
  if cluster_spec:
    if task_type is None or task_id is None:
      raise ValueError("When `cluster_spec` is given, you must also specify "
                       "`task_type` and `task_id`")
    if task_type not in ["chief", "worker"]:
      raise ValueError(
          "Unrecognized task_type: %r, valid task types are: \"chief\", "
          "\"worker\"." % task_type)
    self._cluster_spec = multi_worker_util.normalize_cluster_spec(
        cluster_spec)
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    num_workers = len(self._cluster_spec.as_dict().get("worker", [])) + len(
        self._cluster_spec.as_dict().get("chief", []))
    if not num_workers:
      raise ValueError("No `worker` or `chief` tasks can be found in "
                       "`cluster_spec`.")
    self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                task_id)
  else:
    self._cluster_spec = None
    self._is_chief = True
    worker_device = ""
    num_workers = 1
  self._num_workers = num_workers

  if self._num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(self._num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices,
      cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=num_workers,
          num_gpus_per_worker=self._num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  if cluster_spec:
    self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id)
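
# For illustration (task values hypothetical): with task_type="worker" and
# task_id=1, the default device set above is "/job:worker/replica:0/task:1",
# so ops created without an explicit device are pinned to this task instead
# of ending up on a peer worker.
_example_default_device = "/job:%s/replica:0/task:%d" % ("worker", 1)
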
def _initialize(self, cluster_spec, task_type, task_id):
  """Initializes the object, either for local or multi-worker training."""
  if task_type not in ["chief", "worker"]:
    raise ValueError(
        "Unrecognized task_type: %r, valid task types are: \"chief\", "
        "\"worker\"." % task_type)
  if cluster_spec:
    self._cluster_spec = _normalize_cluster_spec(cluster_spec)
    worker_device = "/job:%s/task:%d" % (task_type, task_id)
    # Count all participating tasks: every "worker" plus the "chief", if one
    # is declared.
    num_workers = len(self._cluster_spec.as_dict().get("worker", []))
    if "chief" in self._cluster_spec.as_dict():
      num_workers += 1
    if not num_workers:
      raise ValueError("`task_type` should be in `cluster_spec`.")

    # TODO(yuefengz): create a utility to infer chief.
    if "chief" in self._cluster_spec.as_dict() and task_type == "chief":
      assert task_id == 0
      self._is_chief = True
    else:
      assert task_type == "worker"
      self._is_chief = task_id == 0
  else:
    self._cluster_spec = None
    self._is_chief = True
    worker_device = ""
    num_workers = 1
  self._num_workers = num_workers

  if self._num_gpus_per_worker:
    local_devices = [
        "%s/device:GPU:%d" % (worker_device, i)
        for i in range(self._num_gpus_per_worker)
    ]
  else:
    local_devices = [worker_device]

  self._collective_keys = cross_tower_utils.CollectiveKeys()
  super(CollectiveAllReduceStrategy, self).__init__(
      devices=local_devices,
      cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
          num_workers=num_workers,
          num_gpus_per_worker=self._num_gpus_per_worker,
          collective_keys=self._collective_keys))

  # Add a default device so that ops without specified devices will not end
  # up on other workers.
  if cluster_spec:
    self._default_device = "/job:%s/replica:0/task:%d" % (task_type, task_id)
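
# A standalone sketch of the chief-inference rule above (the TODO notes it
# should become a utility). The helper is hypothetical; `cluster_dict` stands
# for self._cluster_spec.as_dict(). The rule: the "chief" task is chief when
# one is declared; otherwise worker 0 acts as chief.
def _example_is_chief(cluster_dict, task_type, task_id):
  if "chief" in cluster_dict and task_type == "chief":
    return True
  return task_type == "worker" and task_id == 0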