Example #1
  def testPassPerDevice(self):
    self._skip_eager_if_gpus_less_than(1)

    @function.defun
    def fn1(mock_model, factor):
      return mock_model(factor)

    factors = values.PerDevice({"CPU:0": 5.0, "GPU:0": 3.0})
    expected_result = values.PerDevice({"CPU:0": 5.0 * 1.25,
                                        "GPU:0": 3.0 * 1.25})
    self._call_and_check(fn1, [factors], expected_result, [fn1])
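
The only property of `values.PerDevice` these examples rely on is that it wraps a dict from a device string to the value placed on that device. A minimal pure-Python stand-in (not the tf.contrib.distribute class, purely an illustration of that shape) behaves like this:

# Illustrative stand-in for the device -> value mapping behind PerDevice.
# This is NOT TensorFlow's implementation; it only mirrors the structure
# the test above constructs and checks.
class SimplePerDevice(object):

  def __init__(self, index):
    # `index` maps a device string (e.g. "CPU:0") to the value on that device.
    self._index = dict(index)

  def get(self, device):
    return self._index[device]


factors = SimplePerDevice({"CPU:0": 5.0, "GPU:0": 3.0})
expected = SimplePerDevice({"CPU:0": 5.0 * 1.25, "GPU:0": 3.0 * 1.25})
assert expected.get("CPU:0") == factors.get("CPU:0") * 1.25
assert expected.get("GPU:0") == factors.get("GPU:0") * 1.25
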
Example #2
    def __init__(self,
                 devices=None,
                 num_gpus=None,
                 cross_tower_ops=None,
                 prefetch_on_device=None):
        super(MirroredStrategy, self).__init__()
        # Convert `num_gpus` into `devices`, shouldn't specify both.
        if devices is None:
            if num_gpus is None:
                num_gpus = context.num_gpus()
            devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
        elif num_gpus is not None:
            raise ValueError(
                "Must only specify one of `devices` and `num_gpus`.")

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = devices
        self._canonical_device_set = set(
            [device_util.canonicalize(d) for d in devices])
        self._device_index = values.PerDevice(
            dict((d, i) for i, d in enumerate(devices)))
        self._cross_tower_ops = cross_tower_ops
        self._prefetch_on_device = prefetch_on_device
Example #3
    def _initialize_local(self, num_gpus, devices):
        """Initializes the object for local training."""
        self._cluster_spec = None
        # Convert `num_gpus` into `devices`, shouldn't specify both.
        if devices is None:
            if num_gpus is None:
                num_gpus = context.num_gpus()
            if num_gpus == 0:
                devices = ["/device:CPU:0"]
            else:
                devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
        elif num_gpus is not None:
            raise ValueError(
                "Must only specify one of `devices` and `num_gpus`.")
        self._num_gpus = num_gpus
        # TODO(yuefengz): consider setting the default device.

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerDevice(
            {d: i
             for i, d in enumerate(devices)})
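
The `devices`/`num_gpus` reconciliation above can be read on its own: exactly one of the two may be given, and `num_gpus` is expanded into GPU device strings, falling back to a single CPU device when there are no GPUs. Below is a standalone sketch of that branch; the function name and the `detected_gpus` parameter are made up here and stand in for the `context.num_gpus()` query the real code makes.

def devices_from_num_gpus(devices=None, num_gpus=None, detected_gpus=0):
  # Standalone version of the branch above; `detected_gpus` stands in for
  # context.num_gpus().
  if devices is None:
    if num_gpus is None:
      num_gpus = detected_gpus
    if num_gpus == 0:
      devices = ["/device:CPU:0"]
    else:
      devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
  elif num_gpus is not None:
    raise ValueError("Must only specify one of `devices` and `num_gpus`.")
  return devices


assert devices_from_num_gpus(num_gpus=2) == ["/device:GPU:0", "/device:GPU:1"]
assert devices_from_num_gpus(detected_gpus=0) == ["/device:CPU:0"]
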
Example #4
  def _reduce(self, method_string, value, destinations):
    if len(self._devices) == 1 and not isinstance(value, values.PerDevice):
      value = values.PerDevice({self._devices[0]: value})
    assert isinstance(value, values.PerDevice)

    return self._get_cross_tower_ops().reduce(
        method_string, value, destinations=destinations)
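
`_reduce` first promotes a bare single-device value into a one-entry PerDevice before delegating to the cross-tower ops. A pure-Python approximation of that wrapping step, with a plain dict standing in for PerDevice (illustrative only):

def wrap_single_device_value(value, devices):
  # Mirrors the single-device promotion in _reduce above; a plain dict
  # stands in for PerDevice.
  if len(devices) == 1 and not isinstance(value, dict):
    value = {devices[0]: value}
  assert isinstance(value, dict)
  return value


assert wrap_single_device_value(7.0, ["/gpu:0"]) == {"/gpu:0": 7.0}
assert wrap_single_device_value({"/gpu:0": 7.0}, ["/gpu:0"]) == {"/gpu:0": 7.0}
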
Example #5
  def testContainsIndexedSlices_PerDevice(self):
    t0 = math_ops._as_indexed_slices(
        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
    t1 = math_ops._as_indexed_slices(
        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
    per_device = value_lib.PerDevice({"/gpu:0": t0, "/cpu:0": t1})
    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device))
Example #6
def _make_per_device(values, devices):
  devices = cross_tower_ops_lib._get_devices_from(devices)
  assert len(values) == len(devices)
  index = {}
  for d, v in zip(devices, values):
    with ops.device(d):
      placed_v = array_ops.identity(v)
    index[d] = placed_v
  return value_lib.PerDevice(index)
Example #7
  def map(self, map_over, fn, *args, **kwargs):
    # TODO(josh11b): In eager mode, use one thread per device.
    index = {}
    for i, m in enumerate(map_over):
      d = self._devices[i % len(self._devices)]
      with ops.device(d):
        l = index.get(d, [])
        l.append(fn(m,
                    *values.select_device_mirrored(d, args),
                    **values.select_device_mirrored(d, kwargs)))
        index[d] = l
    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
    # in addition to PerDevice data.
    return values.PerDevice({k: values.MapOutput(v) for k, v in index.items()})
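
The `map` method round-robins the elements of `map_over` across the mirrored devices and collects a list of results per device. The scheduling itself is just modular indexing; here is a minimal stand-in without the device scopes and the `MapOutput` wrapper:

def round_robin(map_over, devices, fn):
  # Assign item i to device i % len(devices), as map() does above
  # (illustrative only: no ops.device scope, no MapOutput wrapper).
  index = {}
  for i, m in enumerate(map_over):
    d = devices[i % len(devices)]
    index.setdefault(d, []).append(fn(m))
  return index


assert round_robin([1, 2, 3], ["/gpu:0", "/gpu:1"], lambda x: x * 10) == {
    "/gpu:0": [10, 30], "/gpu:1": [20]}
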
Example #8
  def testSimpleReduceWithIndexedSlices(self):
    devices = ["/cpu:0", "/gpu:0"]
    t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
    t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
    per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})
    result = cross_tower_ops_lib._simple_reduce(per_device, devices[0],
                                                math_ops.add_n, "sum")

    # Test that the result is semantically equal to both the concatenated
    # IndexedSlices with and without duplicate indices.
    total_with_dups = _make_indexed_slices(
        [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0])
    total_without_dups = _make_indexed_slices(
        [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0])
    self._assert_indexed_slices_equal(total_with_dups, result)
    self._assert_indexed_slices_equal(total_without_dups, result)
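
The two expected totals in this test are meant to be semantically equal: summing the rows at the duplicate index 1 turns `total_with_dups` into `total_without_dups`. That arithmetic can be checked with plain Python on the same numbers:

from collections import defaultdict


def sum_by_index(values_list, indices):
  # Accumulate rows that share an index, like the "sum" reduction of
  # IndexedSlices (plain Python, two-column rows only).
  acc = defaultdict(lambda: [0.0, 0.0])
  for idx, row in zip(indices, values_list):
    acc[idx] = [a + b for a, b in zip(acc[idx], row)]
  return sorted(acc.items())


with_dups = sum_by_index([[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3])
without_dups = sum_by_index([[4., 6.], [5., 6.]], [1, 3])
assert with_dups == without_dups == [(1, [4., 6.]), (3, [5., 6.])]
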
Example #9
    def _initialize_multi_worker(self, num_gpus, cluster_spec):
        """Initializes the object for multi-worker training."""
        cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
        self._cluster_spec = cluster_spec

        self._workers = []
        for job in ["chief", "worker"]:
            for task in range(len(cluster_spec.as_dict().get(job, []))):
                self._workers.append("/job:%s/task:%d" % (job, task))

        if num_gpus is None:
            raise ValueError(
                "`num_gpus` is required if `cluster_spec` is given.")
        if num_gpus > 0:
            self._worker_device_map = {
                worker: [
                    device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
                    for gpu in range(num_gpus)
                ]
                for worker in self._workers
            }
        else:
            self._worker_device_map = {
                worker: [device_util.canonicalize(worker, "/device:CPU:0")]
                for worker in self._workers
            }

        devices = nest.flatten(self._worker_device_map)

        # Setting `_default_device` will add a device scope in the
        # distribution.scope. We set the default device to the first worker. When
        # users specify device under distribution.scope by
        #   with tf.device("/cpu:0"):
        #     ...
        # their ops will end up on the cpu device of its first worker, e.g.
        # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
        self._default_device = self._workers[0]

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerDevice(
            {d: i
             for i, d in enumerate(devices)})
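
To make the structures built by `_initialize_multi_worker` concrete, here is what the worker list and `_worker_device_map` would look like for a made-up cluster with one chief, two workers, and `num_gpus=2`, computed with plain Python (device strings are written out directly rather than going through `device_util.canonicalize`):

# Plain-Python rendering of the structures built above for a made-up cluster.
cluster = {"chief": ["host0:2222"], "worker": ["host1:2222", "host2:2222"]}
workers = ["/job:%s/task:%d" % (job, task)
           for job in ["chief", "worker"]
           for task in range(len(cluster.get(job, [])))]
worker_device_map = {w: [w + "/device:GPU:%d" % g for g in range(2)]
                     for w in workers}

assert workers == ["/job:chief/task:0", "/job:worker/task:0",
                   "/job:worker/task:1"]
assert worker_device_map["/job:worker/task:1"] == [
    "/job:worker/task:1/device:GPU:0", "/job:worker/task:1/device:GPU:1"]
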
Example #10
def _make_per_device(values, devices, regroup=False):
    devices = cross_tower_ops_lib.get_devices_from(devices)
    assert len(values) == len(devices)

    # We simulate the result of regroup called on PerDevice which strips the
    # PerDevice wrapper if it has only one value.
    if len(values) == 1 and regroup:
        with ops.device(devices[0]):
            placed_v = array_ops.identity(values[0])
        return placed_v

    index = {}
    for d, v in zip(devices, values):
        with ops.device(d):
            placed_v = array_ops.identity(v)
        index[d] = placed_v
    return value_lib.PerDevice(index)
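
The `regroup=True` branch of this helper mimics `values.regroup` stripping the PerDevice wrapper when there is only one value. A pure-Python approximation of the two return shapes, with a plain dict standing in for PerDevice and no device placement:

def make_per_device_like(values_list, devices, regroup=False):
  # Mirrors the two return shapes of _make_per_device above, minus the
  # tf.identity placement (illustrative only).
  if len(values_list) == 1 and regroup:
    return values_list[0]
  return dict(zip(devices, values_list))


assert make_per_device_like([3.0], ["/gpu:0"], regroup=True) == 3.0
assert make_per_device_like([3.0, 4.0], ["/gpu:0", "/gpu:1"]) == {
    "/gpu:0": 3.0, "/gpu:1": 4.0}
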
Example #11
def _make_tensor_into_per_device(input_tensor):
    """Converts a single tensor into a PerDevice object."""
    if isinstance(input_tensor, (tuple, list)):
        raise ValueError(
            "Cannot convert `input_tensor` to a `PerDevice` object, "
            "got %r but expected a object that is not a tuple or list." %
            (input_tensor, ))
    if isinstance(input_tensor, value_lib.PerDevice):
        return input_tensor

    try:
        device = input_tensor.device
    except AttributeError:
        raise ValueError(
            "Cannot convert `input_tensor` to a `PerDevice` object "
            "because it doesn't have device set.")

    return value_lib.PerDevice({device: input_tensor})
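
The conversion above only needs the input to expose a `.device` attribute; tuples and lists are rejected, and device-less inputs raise. A pure-Python stand-in of the same three branches, using a made-up `FakeTensor` class rather than a real tensor and a plain dict in place of PerDevice:

class FakeTensor(object):
  # Made-up stand-in carrying only the .device attribute the converter reads.

  def __init__(self, device):
    self.device = device


def to_per_device_like(x):
  # Same three branches as _make_tensor_into_per_device above (illustrative).
  if isinstance(x, (tuple, list)):
    raise ValueError("expected a single tensor-like object, got %r" % (x,))
  if isinstance(x, dict):  # stands in for "already a PerDevice"
    return x
  try:
    device = x.device
  except AttributeError:
    raise ValueError("input has no device set")
  return {device: x}


t = FakeTensor("/device:GPU:0")
assert to_per_device_like(t) == {"/device:GPU:0": t}
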
Example #12
    def map(self, map_over, fn, *args, **kwargs):
        # TODO (josh11b): In eager mode, use one thread per device. id:1098
        # https://github.com/imdone/tensorflow/issues/1099
        index = {}
        i = 0
        for m in map_over:
            d = self._devices[i % len(self._devices)]
            with ops.device(d):
                l = index.get(d, [])
                l.append(
                    fn(m, *values.select_device_mirrored(d, args),
                       **values.select_device_mirrored(d, kwargs)))
                index[d] = l
            i += 1  # advance the round-robin device index
        # TODO (josh11b): Need a values.regroup equivalent that handles MapOutput id:1079
        # https://github.com/imdone/tensorflow/issues/1080
        # in addition to PerDevice data.
        return values.PerDevice(
            {k: values.MapOutput(v)
             for k, v in index.items()})
Example #13
    def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation,
                                   batch_reduce):
        devices = ["/cpu:0", "/gpu:0"]
        dense_shape = [5, 2]
        t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0])
        t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], dense_shape,
                                  devices[1])
        per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1})

        if batch_reduce:
            result = cross_tower_ops_instance.batch_reduce(
                aggregation, [(per_device, devices)])
        else:
            result = cross_tower_ops_instance.reduce(aggregation, per_device,
                                                     devices)

        total_indices_with_dups = [1, 1, 3]
        total_indices_without_dups = [1, 3]

        if aggregation == vs.VariableAggregation.SUM:
            total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
            total_values_without_dups = [[4., 6.], [5., 6.]]
        else:
            assert aggregation == vs.VariableAggregation.MEAN
            total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
            total_values_without_dups = [[2., 3.], [2.5, 3.]]

        total_mirrored_with_dups = _make_mirrored_indexed_slices(
            devices, total_values_with_dups, total_indices_with_dups,
            dense_shape)
        total_mirrored_without_dups = _make_mirrored_indexed_slices(
            devices, total_values_without_dups, total_indices_without_dups,
            dense_shape)

        # Test that the result is semantically equal to both the concatenated
        # IndexedSlices, as well as when the duplicate indices are summed up.
        if batch_reduce:
            total_mirrored_with_dups = [total_mirrored_with_dups]
            total_mirrored_without_dups = [total_mirrored_without_dups]

        self._assert_values_equal(total_mirrored_with_dups, result)
        self._assert_values_equal(total_mirrored_without_dups, result)
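
The MEAN expectations in this test are exactly the SUM expectations divided by the number of devices (two), which can be verified directly:

# The MEAN values above are the SUM values halved, since the reduction
# averages over the two devices (plain-Python check).
sum_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
mean_with_dups = [[v / 2 for v in row] for row in sum_with_dups]
assert mean_with_dups == [[0.5, 1.], [1.5, 2.], [2.5, 3.]]

sum_without_dups = [[4., 6.], [5., 6.]]
mean_without_dups = [[v / 2 for v in row] for row in sum_without_dups]
assert mean_without_dups == [[2., 3.], [2.5, 3.]]
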
Example #14
    def __init__(self, tpu_cluster_resolver, steps_per_run, num_cores=None):
        """Initializes the TPUStrategy object.

    Args:
      tpu_cluster_resolver: A tf.contrib.cluster_resolver.TPUClusterResolver,
          which provides information about the TPU cluster.
      steps_per_run: Number of steps to run on device before returning to the
          host. Note that this can have side-effects on performance, hooks,
          metrics, summaries etc.
          This parameter is only used when Distribution Strategy is used with
          estimator or keras.
      num_cores: Number of cores to use on the TPU. If None specified, then
          auto-detect the cores and topology of the TPU system.
    """
        # TODO(sourabhbajaj): OneDeviceStrategy should be initialized with the
        # master node fetched from the cluster resolver.
        super(TPUStrategy, self).__init__("/device:CPU:0")

        self._tpu_cluster_resolver = tpu_cluster_resolver
        self._tpu_metadata = get_tpu_system_metadata(
            self._tpu_cluster_resolver)
        # TODO(sourabhbajaj): Change this from num_cores to metadata_override
        self._num_cores_override = num_cores

        # TODO(jhseu): Switch to DeviceAssignment to support pods and model
        # parallelism.
        device_map = {
            d.name: i
            for i, d in enumerate(self._tpu_metadata.devices)
            if "device:TPU:" in d.name
        }
        self._device_index = values.PerDevice(device_map)
        self._tpu_devices = sorted(device_map.keys())
        # Only create variables for the number of towers we're running.
        self._tpu_devices = self._tpu_devices[:self.num_towers]

        # TODO(sourabhbajaj): Remove this once performance of running one step
        # at a time is comparable to multiple steps.
        self.steps_per_run = steps_per_run

        self._require_static_shapes = True
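
The `device_map` comprehension above keeps only TPU devices and keys each one to its index in the full device listing. With made-up device names standing in for `self._tpu_metadata.devices`, it behaves like this:

# Made-up device names; only the filtering/indexing of the comprehension
# above is illustrated here.
device_names = ["/job:tpu_worker/device:CPU:0",
                "/job:tpu_worker/device:TPU:0",
                "/job:tpu_worker/device:TPU:1"]
device_map = {name: i for i, name in enumerate(device_names)
              if "device:TPU:" in name}
assert device_map == {"/job:tpu_worker/device:TPU:0": 1,
                      "/job:tpu_worker/device:TPU:1": 2}
tpu_devices = sorted(device_map.keys())
assert tpu_devices == ["/job:tpu_worker/device:TPU:0",
                       "/job:tpu_worker/device:TPU:1"]
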
Example #15
    def __init__(self,
                 devices=None,
                 num_gpus=None,
                 cluster_spec=None,
                 cross_tower_ops=None,
                 prefetch_on_device=None):
        super(MirroredStrategy, self).__init__()

        if cluster_spec:
            if devices is not None:
                raise ValueError(
                    "Specifying devices when `cluster_spec` is also given "
                    "is not supported in MirroredStrategy.")

            # TODO(yuefengz): use the utility method to normalize cluster_spec.
            if isinstance(cluster_spec, (dict, cluster_pb2.ClusterDef)):
                cluster_spec = server_lib.ClusterSpec(cluster_spec)
            elif not isinstance(cluster_spec, server_lib.ClusterSpec):
                raise ValueError(
                    "`cluster_spec' should be dict or a `tf.train.ClusterSpec` or a "
                    "`tf.train.ClusterDef` object")
            self._cluster_spec = cluster_spec

            self._workers = []
            for job in sorted(cluster_spec.jobs):
                for task in range(cluster_spec.num_tasks(job)):
                    self._workers.append("/job:%s/task:%d" % (job, task))

            if num_gpus is None:
                raise ValueError(
                    "`num_gpus` is required if `cluster_spec` is given.")
            self._num_gpus = num_gpus
            if num_gpus > 0:
                self._worker_device_map = {
                    worker: [
                        device_util.canonicalize(worker +
                                                 "/device:GPU:%d" % gpu)
                        for gpu in range(num_gpus)
                    ]
                    for worker in self._workers
                }
            else:
                self._worker_device_map = {
                    worker:
                    [device_util.canonicalize(worker, "/device:CPU:0")]
                    for worker in self._workers
                }
            devices = nest.flatten(self._worker_device_map)

            # Setting `_default_device` will add a device scope in the
            # distribution.scope. We set the default device to the first worker. When
            # users specify device under distribution.scope by
            #   with tf.device("/cpu:0"):
            #     ...
            # their ops will end up on the cpu device of its first worker, e.g.
            # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
            self._default_device = self._workers[0]
        else:
            self._cluster_spec = None
            # Convert `num_gpus` into `devices`, shouldn't specify both.
            if devices is None:
                if num_gpus is None:
                    num_gpus = context.num_gpus()
                devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
            elif num_gpus is not None:
                raise ValueError(
                    "Must only specify one of `devices` and `num_gpus`.")
            # TODO(yuefengz): consider setting the default device.

        assert devices, "Must specify at least one device."
        assert len(set(devices)) == len(devices), (
            "No duplicates allowed in `devices` argument.")
        # TODO(josh11b): Require at least 2 devices?
        self._devices = [device_util.resolve(d) for d in devices]
        self._canonical_device_set = set(self._devices)
        self._device_index = values.PerDevice(
            {d: i
             for i, d in enumerate(devices)})
        self._cross_tower_ops = cross_tower_ops
        self._prefetch_on_device = prefetch_on_device