def _configure(self,
               session_config=None,
               cluster_spec=None,
               task_type=None,
               task_id=None):
  del task_type, task_id

  if session_config:
    session_config.isolate_session_state = True

  if cluster_spec:
    self._initialize_multi_worker(self._num_gpus, cluster_spec)

  if self._cross_device_ops is None:
    if self._cluster_spec:
      # It currently cannot detect the topology of remote workers. So we
      # hard-code the multi-worker all-reduce algorithm for now.
      if len(self._workers) == 1:
        # The default is "nccl".
        self._cross_device_ops = (
            cross_device_ops_lib.AllReduceCrossDeviceOps())
      else:
        # The default is hierarchical reduce and broadcast.
        self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
            self._workers, self._num_gpus)
    else:
      self._cross_device_ops = cross_device_ops_lib.choose_the_best(
          self._devices, session_config=session_config)

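# Usage sketch (illustrative, not part of the original source): the inference
# above only runs when no `cross_device_ops` was supplied, so a caller can pin
# the algorithm explicitly through the public tf.distribute API instead. The
# device names below are placeholder assumptions.
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"],
    cross_device_ops=tf.distribute.NcclAllReduce())
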
def _initialize_local(self, devices):
  """Initializes the object for local training."""
  self._local_mode = True
  assert devices, "Must specify at least one device."
  devices = tuple(device_util.resolve(d) for d in devices)
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument: %s" % (devices,))
  # TODO(josh11b): Require at least 2 devices?
  self._device_map = values.ReplicaDeviceMap(devices)
  self._input_workers = input_lib.InputWorkers(self._device_map)
  self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
      devices)
  self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")

def _initialize_local(self, devices):
  """Initializes the object for local training."""
  self._local_mode = True
  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = [device_util.resolve(d) for d in devices]
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerReplica(
      {d: i for i, d in enumerate(devices)})
  self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
      devices)

def testChooseAlgorithm(self):
  # Don't use NCCL if there is any CPU device.
  self.assertIsInstance(
      cross_device_ops_lib.choose_the_best(["/cpu:0"]),
      cross_device_ops_lib.ReductionToOneDevice)

  # Don't use NCCL if the requested device is not visible to TensorFlow.
  # TODO(yuefengz): make `choose_the_best` work with device strings
  # self.assertIsInstance(
  #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
  #     cross_device_ops_lib.ReductionToOneDevice)

  if context.num_gpus() < 1:
    return

  devices = ["/gpu:0"]

  def mock_get_registered_kernels_for_op(op):
    if op == "NcclAllReduce":
      return [object]
    else:
      return []

  # Use NCCL if an NCCL kernel is found.
  with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                              mock_get_registered_kernels_for_op):
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(devices),
        cross_device_ops_lib.NcclAllReduce)

  # Don't use NCCL if no NCCL kernel is found.
  with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                              lambda _: []):
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(devices),
        cross_device_ops_lib.ReductionToOneDevice)

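# Illustrative sketch (not part of the test): the registry probe the test
# mocks out can also be called directly. `kernels` here is the internal
# tensorflow.python.framework.kernels module being patched above; the call
# returns a KernelList proto whose repeated `kernel` field lists the kernels
# registered for the op.
from tensorflow.python.framework import kernels

kernel_list = kernels.get_registered_kernels_for_op("NcclAllReduce")
print("NCCL all-reduce kernel registered:", len(kernel_list.kernel) > 0)
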
def _initialize_single_worker(self, devices):
  """Initializes the object for single-worker training."""
  self._device_map = values.ReplicaDeviceMap(devices)
  self._input_workers = input_lib.InputWorkers(self._device_map)
  self._inferred_cross_device_ops = None if self._cross_device_ops else (
      cross_device_ops_lib.choose_the_best(devices))
  self._host_input_device = numpy_dataset.SingleDevice(
      self._input_workers.worker_devices[0])
  self._is_multi_worker_training = False

  device_spec = tf_device.DeviceSpec.from_string(
      self._input_workers.worker_devices[0])
  # Ensures when we enter strategy.scope() we use the correct default device.
  if device_spec.job is not None and device_spec.job != "localhost":
    self._default_device = "/job:%s/replica:%d/task:%d" % (
        device_spec.job, device_spec.replica, device_spec.task)

def _initialize_single_worker(self, devices):
  """Initializes the object for single-worker training."""
  self._devices = tuple(device_util.canonicalize(d) for d in devices)
  self._input_workers_devices = (
      (device_util.canonicalize("/device:CPU:0", devices[0]), devices),)
  self._inferred_cross_device_ops = None if self._cross_device_ops else (
      cross_device_ops_lib.choose_the_best(devices))
  self._host_input_device = numpy_dataset.SingleDevice(
      self._input_workers_devices[0][0])
  self._is_multi_worker_training = False
  logging.info("Using MirroredStrategy with devices %r", devices)

  device_spec = tf_device.DeviceSpec.from_string(
      self._input_workers_devices[0][0])
  # Ensures when we enter strategy.scope() we use the correct default device.
  if device_spec.job is not None and device_spec.job != "localhost":
    self._default_device = "/job:%s/replica:%d/task:%d" % (
        device_spec.job, device_spec.replica, device_spec.task)

def _initialize_local(self, devices):
  """Initializes the object for local training.

  ZJW - We modify this protected function to fix a bug: we must pass a
  `session_config` argument to cross_device_ops_lib.choose_the_best().
  Otherwise, all GPU memory is allocated when choose_the_best() calls
  device_lib.list_local_devices(). This is still a compromise, however,
  because the `allow_growth` session-config option ends up having no effect.
  """
  self._local_mode = True
  assert devices, "Must specify at least one device."
  assert len(set(devices)) == len(devices), (
      "No duplicates allowed in `devices` argument.")
  # TODO(josh11b): Require at least 2 devices?
  self._devices = tuple(device_util.resolve(d) for d in devices)
  self._canonical_device_set = set(self._devices)
  self._device_index = values.PerReplica(
      {d: i for i, d in enumerate(devices)})
  self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
      devices, session_config=self._zjw_session_config)

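# Illustrative sketch (assumption: `_zjw_session_config` is the fork's own
# attribute, set up before _initialize_local() runs; only ConfigProto and
# gpu_options are standard TF 1.x API). Forwarding a config like this lets
# choose_the_best() hand an explicit session config to
# device_lib.list_local_devices() instead of allocation-hungry defaults,
# though, as the docstring above notes, `allow_growth` itself still ends up
# having no effect.
import tensorflow as tf

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True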