Example #1
  def testInitializableIterator(self):
    with context.graph_mode():
      devices = ["/device:CPU:0"]
      # Use a random input because it is only allowed with an initializable
      # iterator.
      dataset = dataset_ops.Dataset.from_tensor_slices(
          random_ops.random_uniform((10,)))

      device_map = values.ReplicaDeviceMap(devices)
      input_workers = values.InputWorkers(device_map)
      per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
      iterator = per_replica_dataset.make_initializable_iterator()

      self.evaluate(iterator.initializer)
      next_element = iterator.get_next_as_list()
      for _ in range(10):
        self.evaluate(next_element)

      # Should fail after the input is finished.
      with self.assertRaises(errors.OutOfRangeError):
        self.evaluate(next_element)

      # After re-initializing the iterator, should be able to iterate again.
      self.evaluate(iterator.initializer)
      for _ in range(10):
        self.evaluate(next_element)
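
The test above shows the basic single-worker pattern: build a ReplicaDeviceMap over the replica devices, wrap it in InputWorkers, and feed a dataset through PerReplicaDataset. Below is a minimal standalone sketch of that pattern, assuming the TF 1.x-era internal module layout these examples use (these classes later moved out of `values.py`, so the import paths are version-specific).

from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import values
from tensorflow.python.eager import context

with context.graph_mode():
  devices = ["/device:CPU:0"]                      # one replica on the local CPU
  device_map = values.ReplicaDeviceMap(devices)    # maps replica index -> device
  input_workers = values.InputWorkers(device_map)  # worker_device_pairs omitted, as in the test above

  dataset = dataset_ops.Dataset.range(4)
  per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
  iterator = per_replica_dataset.make_initializable_iterator()
  # Run `iterator.initializer`, then evaluate `iterator.get_next_as_list()`
  # repeatedly until errors.OutOfRangeError is raised.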
Example #2
  def _initialize_multi_worker(self, devices):
    """Initializes the object for multi-worker training."""
    self._local_mode = False

    assert devices, "Must specify at least one device."
    devices = tuple(device_util.resolve(d) for d in devices)
    assert len(set(devices)) == len(devices), (
        "No duplicates allowed in `devices` argument: %s" % devices)
    # TODO(josh11b): Require at least 2 devices?

    device_dict = _group_device_list(devices)
    workers = []
    worker_devices = []
    for job in ("chief", "worker"):
      for task in range(len(device_dict.get(job, []))):
        worker = "/job:%s/task:%d" % (job, task)
        workers.append(worker)
        worker_devices.append((worker, device_dict[job][task]))

    # Setting `_default_device` adds a device scope in distribution.scope. We
    # set the default device to the first worker. When users specify a device
    # under distribution.scope with
    #   with tf.device("/cpu:0"):
    #     ...
    # their ops will end up on the CPU device of the first worker, e.g.
    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
    self._default_device = workers[0]

    self._device_map = values.ReplicaDeviceMap(devices)
    self._input_workers = values.InputWorkers(self._device_map, worker_devices)
    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
        workers, _infer_num_gpus_per_worker(devices))
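
In the multi-worker case above, InputWorkers receives an explicit list of (worker, per-worker device tuple) pairs in addition to the ReplicaDeviceMap built over all replica devices. A small sketch of that call shape, with purely illustrative worker and device strings (the pattern, not this exact topology, is taken from the example above):

from tensorflow.python.distribute import values

worker_devices = [
    ("/job:worker/task:0", ("/job:worker/task:0/device:GPU:0",
                            "/job:worker/task:0/device:GPU:1")),
    ("/job:worker/task:1", ("/job:worker/task:1/device:GPU:0",
                            "/job:worker/task:1/device:GPU:1")),
]
# Flatten the per-worker device tuples into the replica device list.
all_devices = tuple(d for _, ds in worker_devices for d in ds)
device_map = values.ReplicaDeviceMap(all_devices)
input_workers = values.InputWorkers(device_map, worker_devices)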
Example #3
    def _initialize_local(self, num_gpus_per_worker):
        """Initialize internal devices for local training."""
        worker_device = device_util.canonicalize("/device:CPU:0")
        # Define the compute devices, a list of device strings with one entry per
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on the CPU.
        if num_gpus_per_worker > 0:
            compute_devices = tuple(
                map("/device:GPU:{}".format, range(num_gpus_per_worker)))
        else:
            compute_devices = (_LOCAL_CPU, )

        self._device_map = values.ReplicaDeviceMap(compute_devices)
        self._input_workers = values.InputWorkers(
            self._device_map, [(worker_device, compute_devices)])

        # If there is only one GPU, put everything on that GPU. Otherwise, place
        # variables on CPU.
        if num_gpus_per_worker == 1:
            assert len(compute_devices) == 1
            self._variable_device = _LOCAL_GPU_0
            self._parameter_devices = (_LOCAL_GPU_0, )
        else:
            self._variable_device = _LOCAL_CPU
            self._parameter_devices = (_LOCAL_CPU, )

        self._is_chief = True
        self._cluster_spec = None
        self._task_type = None
        self._task_id = None

        logging.info(
            "ParameterServerStrategy with compute_devices = %r, "
            "variable_device = %r", compute_devices, self._variable_device)
Example #4
 def __init__(self, container_strategy, device):
   super(OneDeviceExtended, self).__init__(container_strategy)
   self._device = device
   self._default_device = device
   worker = device_util.canonicalize("/device:CPU:0")
   worker_device_pairs = [(worker, [self._device])]
   device_map = values.SingleDeviceMap(device)
   self._input_workers = values.InputWorkers(device_map, worker_device_pairs)
Example #5
 def testValueErrorForIterator(self):
   # Incompatible arguments.
   d1 = "/device:GPU:0"
   d2 = "/device:GPU:1"
   device_map = values.ReplicaDeviceMap([d1, d2])
   input_workers = values.InputWorkers(
       device_map, (("w1", (d1,)), ("w2", (d2,))))
   with self.assertRaises(ValueError):
     values.MultiWorkerDataIterator([("w1", None)], input_workers)
Example #6
 def _test_dataset(self, dataset_fn, worker_devices, devices,
                   expected_values):
   device_map = values.ReplicaDeviceMap(devices)
   input_workers = values.InputWorkers(device_map, worker_devices)
   multi_worker_dataset = values.MultiWorkerDataset(
       dataset_fn, input_workers)
   multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()
   with self.cached_session() as sess:
     sess.run(multi_worker_iterator.initializer)
     self._test_iterator(sess, multi_worker_iterator, devices, expected_values)
Example #7
 def _initialize_local(self, devices):
   """Initializes the object for local training."""
   self._local_mode = True
   assert devices, "Must specify at least one device."
   devices = tuple(device_util.resolve(d) for d in devices)
   assert len(set(devices)) == len(devices), (
       "No duplicates allowed in `devices` argument: %s" % devices)
   # TODO(josh11b): Require at least 2 devices?
   self._device_map = values.ReplicaDeviceMap(devices)
   self._input_workers = values.InputWorkers(self._device_map)
   self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
       devices)
Example #8
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initializes the object for multi-worker training."""
        if task_type is None or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        if task_type not in ("chief", "worker"):
            raise ValueError(
                "Unrecognized task_type: %r, valid task types are: \"chief\", "
                "\"worker\"." % task_type)
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._num_workers = multi_worker_util.worker_count(
            cluster_spec, task_type)
        if not self._num_workers:
            raise ValueError("No `worker` or `chief` tasks can be found in "
                             "`cluster_spec`.")

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)

        self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
        if num_gpus_per_worker:
            local_devices = tuple("%s/device:GPU:%d" % (self._worker_device, i)
                                  for i in range(num_gpus_per_worker))
        else:
            local_devices = (self._worker_device, )

        self._collective_keys = cross_device_utils.CollectiveKeys()
        self._initialize_local(local_devices)
        self._input_workers = values.InputWorkers(
            self._device_map, [(self._worker_device, self.worker_devices)])
        self._cross_device_ops = cross_device_ops_lib.CollectiveAllReduce(
            num_workers=self._num_workers,
            num_gpus_per_worker=num_gpus_per_worker,
            collective_keys=self._collective_keys)

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = "/job:%s/task:%d" % (task_type, task_id)

        self._cluster_spec = multi_worker_util.normalize_cluster_spec(
            cluster_spec)
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker CollectiveAllReduceStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_workers = %r, local_devices = %r", cluster_spec.as_dict(),
            task_type, task_id, self._num_workers, local_devices)
Example #9
    def __init__(self,
                 container_strategy,
                 tpu_cluster_resolver=None,
                 steps_per_run=None,
                 num_cores=None):
        super(TPUExtended, self).__init__(container_strategy)

        if tpu_cluster_resolver is None:
            tpu_cluster_resolver = resolver_lib.TPUClusterResolver("")

        if steps_per_run is None:
            # TODO(frankchn): Warn when we are being used by DS/Keras and this is
            # not specified.
            steps_per_run = 1

        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        # TODO(sourabhbajaj): Change this from num_cores to metadata_override
        self._num_cores_override = num_cores

        # TODO(jhseu): Switch to DeviceAssignment to support pods and model
        # parallelism.
        self._device_index = {
            d.name: i
            for i, d in enumerate(self._tpu_metadata.devices)
            if "device:TPU:" in d.name
        }
        self._host_device = self.get_host_cpu_device(0)
        self._tpu_devices = tuple(sorted(self._device_index.keys()))
        # Only create variables for the number of replicas we're running.
        self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
        self._device_map = values.ReplicaDeviceMap(self._tpu_devices)

        # For input:
        input_device_map = values.ReplicaDeviceMap(
            tuple(
                self.get_host_cpu_device(hid)
                for hid in range(self.num_hosts)))
        worker_devices = [(self.get_host(hid), [self.get_host_cpu_device(hid)])
                          for hid in range(self.num_hosts)]
        self._input_workers = values.InputWorkers(input_device_map,
                                                  worker_devices)

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run
        self._require_static_shapes = True

        # Initialize the TPU devices.
        self._initialize_tpu()
Example #10
  def _test_iterator(self, devices, dataset, expected_values):
    device_map = values.ReplicaDeviceMap(devices)
    input_workers = values.InputWorkers(device_map)
    per_replica_dataset = values.PerReplicaDataset(dataset, input_workers, 0)
    if context.executing_eagerly():
      iterator = per_replica_dataset.make_one_shot_iterator()
    else:
      iterator = per_replica_dataset.make_initializable_iterator()
      self.evaluate([iterator.initializer])

    for expected_value in expected_values:
      next_element = iterator.get_next_as_list()
      computed_value = self.evaluate(next_element)
      self.assertEqual(expected_value, computed_value)

    with self.assertRaises(errors.OutOfRangeError):
      next_element = iterator.get_next_as_list()
      self.evaluate(next_element)
Example #11
  def testInitializableIterator(self):
    worker_devices, devices = self._cpu_devices()
    with context.graph_mode(), self.cached_session() as sess:
      dataset_fn = lambda: dataset_ops.Dataset.range(8)
      device_map = values.ReplicaDeviceMap(devices)
      input_workers = values.InputWorkers(device_map, worker_devices)
      multi_worker_dataset = values.MultiWorkerDataset(
          dataset_fn, input_workers)
      multi_worker_iterator = multi_worker_dataset.make_initializable_iterator()

      sess.run(multi_worker_iterator.initializer)
      self._test_iterator(
          sess, multi_worker_iterator, devices,
          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])

      # After re-initializing the iterator, should be able to iterate again.
      sess.run(multi_worker_iterator.initializer)
      self._test_iterator(
          sess, multi_worker_iterator, devices,
          [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7]])
Example #12
  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
                     expected_values, sess=None, split_batch_by=None):
    devices = nest.flatten([ds for _, ds in worker_device_pairs])
    device_map = values.ReplicaDeviceMap(devices)
    input_workers = values.InputWorkers(device_map, worker_device_pairs)

    if input_type == "input_fn":
      input_contexts = [
          distribute_lib.InputContext() for _ in worker_device_pairs]
      input_fn = lambda _: dataset_fn()
      iterator = values.InputFunctionIterator(
          input_fn, input_workers, input_contexts)
    else:
      iterator = values.DatasetIterator(
          dataset_fn(), input_workers, split_batch_by)

    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)

    evaluate(control_flow_ops.group(iterator.initialize()))

    for expected_value in expected_values:
      next_element = iterator.get_next()
      computed_value = evaluate(
          [values.select_replica(r, next_element) for r in range(len(devices))])
      self.assertAllEqual(expected_value, computed_value)

    with self.assertRaises(errors.OutOfRangeError):
      next_element = iterator.get_next()
      evaluate([values.select_replica(r, next_element)
                for r in range(len(devices))])

    # After re-initializing the iterator, should be able to iterate again.
    evaluate(control_flow_ops.group(iterator.initialize()))

    for expected_value in expected_values:
      next_element = iterator.get_next()
      computed_value = evaluate(
          [values.select_replica(r, next_element) for r in range(len(devices))])
      self.assertAllEqual(expected_value, computed_value)
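
Example #12 above exercises the InputFunctionIterator and DatasetIterator entry points together with select_replica to pull out each replica's element. Below is a condensed single-worker sketch of the DatasetIterator path (same internal API as in the test; the third argument is the split_batch_by value, passed as None exactly as the test does by default):

from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.distribute import values

worker_device_pairs = [("/device:CPU:0", ["/device:CPU:0"])]  # one worker, one replica
devices = ["/device:CPU:0"]
device_map = values.ReplicaDeviceMap(devices)
input_workers = values.InputWorkers(device_map, worker_device_pairs)

iterator = values.DatasetIterator(dataset_ops.Dataset.range(4), input_workers, None)
init_ops = iterator.initialize()    # initializer ops; the test groups and runs these
next_element = iterator.get_next()  # per-replica structure
per_replica = [values.select_replica(r, next_element) for r in range(len(devices))]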
Example #13
    def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
                                 task_type, task_id):
        """Initialize devices for multiple workers.

    It creates variable devices and compute devices. Variables and operations
    will be assigned to them respectively. We have one compute device per
    replica. The variable device is a device function or device string. The
    default variable device assigns variables to parameter servers in a
    round-robin fashion.

    Args:
      num_gpus_per_worker: number of local GPUs or GPUs per worker.
      cluster_spec: a dict, ClusterDef or ClusterSpec object specifying the
        cluster configurations.
      task_type: the current task type.
      task_id: the current task id.

    Raises:
      ValueError: if the cluster_spec doesn't have ps jobs.
    """
        assert cluster_spec
        if not task_type or task_id is None:
            raise ValueError(
                "When `cluster_spec` is given, you must also specify "
                "`task_type` and `task_id`")
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)

        worker_device = "/job:%s/task:%d" % (self._task_type, self._task_id)

        # Define the compute devices, a list of device strings with one entry per
        # replica. When there are GPUs, replicate operations on these GPUs.
        # Otherwise, place operations on the CPU.
        if num_gpus_per_worker > 0:
            compute_devices = tuple("%s/device:GPU:%d" % (worker_device, i)
                                    for i in range(num_gpus_per_worker))
        else:
            compute_devices = (worker_device, )

        self._device_map = values.ReplicaDeviceMap(compute_devices)
        self._input_workers = values.InputWorkers(
            self._device_map, [(worker_device, compute_devices)])

        # In distributed mode, place variables on ps jobs in a round-robin fashion.
        # Note that devices returned from `replica_device_setter` are not
        # canonical and therefore we don't canonicalize all variable devices to
        # make them consistent.
        # TODO(yuefengz): support passing a strategy object to control variable
        # assignment.
        # TODO(yuefengz): merge the logic of replica_device_setter into this
        # class.
        num_ps_replicas = len(cluster_spec.as_dict().get("ps", []))
        if num_ps_replicas == 0:
            raise ValueError("The cluster spec needs to have `ps` jobs.")
        self._variable_device = device_setter.replica_device_setter(
            ps_tasks=num_ps_replicas,
            worker_device=worker_device,
            merge_devices=True,
            cluster=cluster_spec)

        # The `_parameter_devices` is needed for the `parameter_devices` property
        # and is a list of all variable devices. Here parameter devices are all
        # tasks of the "ps" job.
        self._parameter_devices = tuple(
            map("/job:ps/task:{}".format, range(num_ps_replicas)))

        # Add a default device so that ops without specified devices will not end up
        # on other workers.
        self._default_device = worker_device

        self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                    task_id)
        self._cluster_spec = cluster_spec
        self._task_type = task_type
        self._task_id = task_id

        logging.info(
            "Multi-worker ParameterServerStrategy with "
            "cluster_spec = %r, task_type = %r, task_id = %r, "
            "num_ps_replicas = %r, is_chief = %r, device_map = %r, "
            "variable_device = %r", cluster_spec.as_dict(), task_type, task_id,
            num_ps_replicas, self._is_chief, self._device_map,
            self._variable_device)
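
The round-robin variable placement described in the comments above comes from replica_device_setter, a standard TF 1.x device function. A minimal sketch of how such a device function is typically applied (task counts and variable names here are illustrative, not taken from the example):

import tensorflow as tf
from tensorflow.python.training import device_setter

variable_device = device_setter.replica_device_setter(
    ps_tasks=2, worker_device="/job:worker/task:0", merge_devices=True)
with tf.Graph().as_default(), tf.device(variable_device):
  # Variables created here land on /job:ps/task:0 and /job:ps/task:1 in
  # round-robin order; other ops stay on the worker device.
  w = tf.compat.v1.get_variable("w", shape=[10, 10])
  b = tf.compat.v1.get_variable("b", shape=[10])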