def _configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
    del task_type, task_id

    if session_config:
      session_config.isolate_session_state = True

    if cluster_spec:
      self._initialize_multi_worker(self._num_gpus, cluster_spec)

    if self._cross_device_ops is None:
      if self._cluster_spec:
        # We currently cannot detect the topology of remote workers, so we
        # hard-code the multi-worker all-reduce algorithm for now.
        if len(self._workers) == 1:
          # The default is "nccl".
          self._cross_device_ops = (
              cross_device_ops_lib.AllReduceCrossDeviceOps())
        else:
          # The default is hierarchical reduce and broadcast.
          self._cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
              self._workers, self._num_gpus)
      else:
        self._cross_device_ops = cross_device_ops_lib.choose_the_best(
            self._devices, session_config=session_config)
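
The selection above can also be made explicitly by the caller instead of relying on choose_the_best(); a minimal sketch using the public tf.distribute classes (assumes a TF 1.14+ / 2.x API):

import tensorflow as tf

# Pick the cross-device reduction explicitly so choose_the_best() is never consulted.
strategy = tf.distribute.MirroredStrategy(
    devices=["/gpu:0", "/gpu:1"],
    cross_device_ops=tf.distribute.HierarchicalCopyAllReduce())
# tf.distribute.NcclAllReduce() or tf.distribute.ReductionToOneDevice() can be passed the same way.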
Example #2
 def _initialize_local(self, devices):
   """Initializes the object for local training."""
   self._local_mode = True
   assert devices, "Must specify at least one device."
   devices = tuple(device_util.resolve(d) for d in devices)
   assert len(set(devices)) == len(devices), (
       "No duplicates allowed in `devices` argument: %s" % devices)
   # TODO(josh11b): Require at least 2 devices?
   self._device_map = values.ReplicaDeviceMap(devices)
   self._input_workers = input_lib.InputWorkers(self._device_map)
   self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
       devices)
   self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
Example #3
 def _initialize_local(self, devices):
     """Initializes the object for local training."""
     self._local_mode = True
     assert devices, "Must specify at least one device."
     devices = tuple(device_util.resolve(d) for d in devices)
     assert len(set(devices)) == len(devices), (
         "No duplicates allowed in `devices` argument: %s" % (devices, ))
     # TODO(josh11b): Require at least 2 devices?
     self._device_map = values.ReplicaDeviceMap(devices)
     self._input_workers = input_lib.InputWorkers(self._device_map)
     self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
         devices)
     self._host_input_device = numpy_dataset.SingleDevice("/cpu:0")
Example #4
  def _initialize_local(self, devices):
    """Initializes the object for local training."""
    self._local_mode = True
    assert devices, "Must specify at least one device."
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument.")
    # TODO(josh11b): Require at least 2 devices?
    self._devices = [device_util.resolve(d) for d in devices]
    self._canonical_device_set = set(self._devices)
    self._device_index = values.PerReplica(
        {d: i for i, d in enumerate(devices)})

    self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
        devices)
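
For context, device_util.resolve() used above canonicalizes short device names; a minimal sketch of the effect (the exact job/replica/task fields depend on the runtime):

from tensorflow.python.distribute import device_util

# "/gpu:0" typically resolves to a fully qualified string such as
# "/job:localhost/replica:0/task:0/device:GPU:0"; this canonical form is what
# gets stored in self._devices above.
print(device_util.resolve("/gpu:0"))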
Example #5
  def testChooseAlgorithm(self):
    # Don't use NCCL if there is any CPU device.
    self.assertIsInstance(
        cross_device_ops_lib.choose_the_best(["/cpu:0"]),
        cross_device_ops_lib.ReductionToOneDevice)

    # Don't use NCCL if the requested device is not visible to TensorFlow.
    # TODO(yuefengz): make `choose_the_best` work with device strings
    # self.assertIsInstance(
    #     cross_device_ops_lib.choose_the_best(["/gpu:100"]),
    #     cross_device_ops_lib.ReductionToOneDevice)

    if context.num_gpus() < 1:
      return

    devices = ["/gpu:0"]

    def mock_get_registered_kernels_for_op(op):
      if op == "NcclAllReduce":
        return [object]
      else:
        return []

    # Use NCCL if the NCCL kernel is found.
    with test.mock.patch.object(kernels, "get_registered_kernels_for_op",
                                mock_get_registered_kernels_for_op):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.NcclAllReduce)

    # Don't use NCCL if the NCCL kernel is not found.
    with test.mock.patch.object(kernels,
                                "get_registered_kernels_for_op", lambda _: []):
      self.assertIsInstance(
          cross_device_ops_lib.choose_the_best(devices),
          cross_device_ops_lib.ReductionToOneDevice)
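
As the test above illustrates, choose_the_best() can also be exercised directly; a minimal sketch using the same internal module path imported by the test:

from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib

# With a CPU in the device list the heuristic falls back to ReductionToOneDevice;
# NcclAllReduce is only chosen when every device is a GPU and the NCCL kernel is registered.
xdo = cross_device_ops_lib.choose_the_best(["/cpu:0"])
print(type(xdo).__name__)  # ReductionToOneDevice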
Example #6
 def _initialize_single_worker(self, devices):
   """Initializes the object for single-worker training."""
   self._device_map = values.ReplicaDeviceMap(devices)
   self._input_workers = input_lib.InputWorkers(self._device_map)
   self._inferred_cross_device_ops = None if self._cross_device_ops else (
       cross_device_ops_lib.choose_the_best(devices))
   self._host_input_device = numpy_dataset.SingleDevice(
       self._input_workers.worker_devices[0])
   self._is_multi_worker_training = False
   device_spec = tf_device.DeviceSpec.from_string(
       self._input_workers.worker_devices[0])
   # Ensures that when we enter strategy.scope() we use the correct default device.
   if device_spec.job is not None and device_spec.job != "localhost":
     self._default_device = "/job:%s/replica:%d/task:%d" % (
         device_spec.job, device_spec.replica, device_spec.task)
Example #7
 def _initialize_single_worker(self, devices):
   """Initializes the object for single-worker training."""
   self._devices = tuple(device_util.canonicalize(d) for d in devices)
   self._input_workers_devices = (
       (device_util.canonicalize("/device:CPU:0", devices[0]), devices),)
   self._inferred_cross_device_ops = None if self._cross_device_ops else (
       cross_device_ops_lib.choose_the_best(devices))
   self._host_input_device = numpy_dataset.SingleDevice(
       self._input_workers_devices[0][0])
   self._is_multi_worker_training = False
   logging.info("Using MirroredStrategy with devices %r", devices)
   device_spec = tf_device.DeviceSpec.from_string(
       self._input_workers_devices[0][0])
   # Ensures that when we enter strategy.scope() we use the correct default device.
   if device_spec.job is not None and device_spec.job != "localhost":
     self._default_device = "/job:%s/replica:%d/task:%d" % (
         device_spec.job, device_spec.replica, device_spec.task)
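
The job check in the two examples above relies on DeviceSpec parsing; a minimal sketch of how a remote-worker device string yields a non-localhost default device:

from tensorflow.python.framework import device as tf_device

spec = tf_device.DeviceSpec.from_string("/job:worker/replica:0/task:1/device:GPU:0")
if spec.job is not None and spec.job != "localhost":
  default_device = "/job:%s/replica:%d/task:%d" % (spec.job, spec.replica, spec.task)
  # -> "/job:worker/replica:0/task:1"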
Example #8
    def _initialize_local(self, devices):
        """Initializes the object for local training.

        ZJW - We modified this protected function to fix a bug:
            We must pass a `session_config` argument to
            cross_device_ops_lib.choose_the_best(). Otherwise, all GPU memory
            gets allocated when choose_the_best() calls
            device_lib.list_local_devices().

            Even so, this remains a compromise: the session config option
            `allow_growth` then has no effect.
        """
        self._local_mode = True
        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = tuple(device_util.resolve(d) for d in devices)
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerReplica(
            {d: i for i, d in enumerate(devices)})

        self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
            devices, session_config=self._zjw_session_config)
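
A minimal sketch of the config this patched method expects to forward, assuming a TF 1.x-style ConfigProto (the `allow_growth` caveat from the docstring still applies):

import tensorflow as tf
from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# Forwarding the config keeps choose_the_best() from listing local devices with
# default GPU options and grabbing all GPU memory up front.
xdo = cross_device_ops_lib.choose_the_best(["/gpu:0", "/gpu:1"],
                                           session_config=config)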