Example #1
    def assertDeviceEqual(self, device1, device2):
        """Asserts that the two given devices are the same.

    Args:
      device1: A string device name or TensorFlow `DeviceSpec` object.
      device2: A string device name or TensorFlow `DeviceSpec` object.
    """
        device1 = pydev.canonical_name(device1)
        device2 = pydev.canonical_name(device2)
        self.assertEqual(device1, device2, "Devices %s and %s are not equal" % (device1, device2))
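A minimal usage sketch, not taken from the listing above: assertDeviceEqual is the helper defined on TensorFlow's test base class, and canonical_name makes the legacy "/cpu:0" spelling compare equal to "/device:CPU:0". The test name and constant are illustrative, and graph-mode TensorFlow of the same era as the surrounding snippets is assumed.

import tensorflow as tf

class DevicePlacementTest(tf.test.TestCase):

  def testConstantOnCPU(self):
    with tf.Graph().as_default(), tf.device("/cpu:0"):
      x = tf.constant(1.0)
    # Both spellings canonicalize to "/device:CPU:0", so this passes.
    self.assertDeviceEqual(x.device, "/device:CPU:0")

if __name__ == "__main__":
  tf.test.main()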
Example #2
def all_gather(t, group_size, group_key, instance_key):
  """Accumulates tensors collectively, across devices, along first dimension.

  Args:
    t: the tensor to participate in the accumulation.
    group_size: the total number of tensors to be collectively accumulated.
      Each must reside on a different device.
    group_key: an integer identifying the group of devices.
    instance_key: an integer identifying the participating group of Ops.

  Returns:
    An Op implementing the distributed operation.

  Raises:
    ValueError: if any of the input parameter constraints are not met.
  """
  if not device.canonical_name(t.device):
    raise ValueError('Device assignment required for collective ops')
  if group_size <= 1:
    raise ValueError('Parameter group_size to all_gather must be at least 2.')
  dims = t.shape.as_list()
  output_shape = [dims[0] * group_size] + dims[1:]
  return gen_collective_ops.collective_gather(t,
                                              shape=output_shape,
                                              group_size=group_size,
                                              group_key=group_key,
                                              instance_key=instance_key)
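A graph-mode usage sketch for all_gather, not taken from any of the projects above. Assumptions: two local CPU devices stand in for the collective group, the group_key/instance_key values are arbitrary (they only need to match across participants), and the internal import paths below are those of the same TensorFlow era.

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops

config = config_pb2.ConfigProto(device_count={'CPU': 2})
with ops.Graph().as_default():
  gathered = []
  for i, value in enumerate([[1.], [2.]]):
    # Every participant uses the same group_size/group_key/instance_key.
    with ops.device('/CPU:%d' % i):
      t = constant_op.constant(value)
      gathered.append(all_gather(t, group_size=2, group_key=1, instance_key=7))
  with session.Session(config=config) as sess:
    # Each result is both inputs concatenated along axis 0: [1., 2.]
    print(sess.run(gathered))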
Example #3
def all_reduce(t, group_size, group_key, instance_key, merge_op, final_op,
               subdiv_offsets=(0,)):
  """Reduces tensors collectively, across devices.

  Args:
    t: the tensor to be reduced.
    group_size: the total number of tensors to be collectively reduced.
      Each must reside on a different device.
    group_key: an integer identifying the group of devices.
    instance_key: an integer identifying the participating group of Ops.
    merge_op: string naming the binary Op to be applied to compute each
      partial reduction.
    final_op: string naming the unary Op to be applied to each fully
      reduced value.  Can be 'Id' for no operation.
    subdiv_offsets: a list of integer offsets into the tensor at which each
      independent subdivision should begin.  Use [0] if no subdivision should
      be done.

  Returns:
    An Op implementing the distributed reduction.

  Raises:
    ValueError: if any of the input parameter constraints are not met.
  """
  if not device.canonical_name(t.device):
    raise ValueError('Device assignment required for collective ops')
  if group_size <= 1:
    raise ValueError('Parameter group_size to all_reduce must be at least 2.')
  return gen_collective_ops.collective_reduce(t,
                                              group_size=group_size,
                                              group_key=group_key,
                                              instance_key=instance_key,
                                              merge_op=merge_op,
                                              final_op=final_op,
                                              subdiv_offsets=subdiv_offsets)
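The same style of sketch as for all_gather above, again with two local CPU devices and assumed key values: each participant contributes a vector, and every output is the elementwise sum.

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import ops

config = config_pb2.ConfigProto(device_count={'CPU': 2})
with ops.Graph().as_default():
  reduced = []
  for i, value in enumerate([[1., 2.], [3., 4.]]):
    with ops.device('/CPU:%d' % i):
      t = constant_op.constant(value)
      reduced.append(all_reduce(t, group_size=2, group_key=1, instance_key=8,
                                merge_op='Add', final_op='Id'))
  with session.Session(config=config) as sess:
    # Both outputs are the elementwise sum: [4., 6.]
    print(sess.run(reduced))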
Example #4
  def _initialize_handle_and_devices(self):
    """Initialize handle and devices."""
    with self._initialize_lock:
      if self._context_handle is not None:
        return
      assert self._context_devices is None
      opts = pywrap_tensorflow.TFE_NewContextOptions()
      try:
        if self._config is not None:
          config_str = self._config.SerializeToString()
          pywrap_tensorflow.TFE_ContextOptionsSetConfig(opts, config_str)
        if self._device_policy is not None:
          pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
              opts, self._device_policy)
        if self._execution_mode == ASYNC:
          pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts)
      finally:
        pywrap_tensorflow.TFE_DeleteContextOptions(opts)
      # Store list of devices
      self._context_devices = []
      device_list = pywrap_tensorflow.TFE_ContextListDevices(
          self._context_handle)
      try:
        self._num_gpus = 0
        for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
          dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
          self._context_devices.append(pydev.canonical_name(dev_name))
          dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
          if dev_type == "GPU":
            self._num_gpus += 1

      finally:
        pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #5
  def _initialize_handle_and_devices(self):
    """Initialize handle and devices."""
    with self._initialize_lock:
      if self._context_handle is not None:
        return
      assert self._context_devices is None
      opts = pywrap_tensorflow.TF_NewSessionOptions(
          target=compat.as_bytes(""), config=self._config)
      with errors.raise_exception_on_not_ok_status() as status:
        self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
        pywrap_tensorflow.TF_DeleteSessionOptions(opts)
      # Store list of devices
      self._context_devices = []
      with errors.raise_exception_on_not_ok_status() as status:
        device_list = pywrap_tensorflow.TFE_ContextListDevices(
            self._context_handle, status)
      try:
        self._num_gpus = 0
        for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
          with errors.raise_exception_on_not_ok_status() as status:
            dev_name = pywrap_tensorflow.TF_DeviceListName(
                device_list, i, status)
          self._context_devices.append(pydev.canonical_name(dev_name))
          with errors.raise_exception_on_not_ok_status() as status:
            dev_type = pywrap_tensorflow.TF_DeviceListType(
                device_list, i, status)
          if dev_type == "GPU":
            self._num_gpus += 1

      finally:
        pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #6
def _apply_all_reduce(reduction_op, tensors):
  if not tensors:
    raise ValueError('Must pass >0 tensors to all reduce operations')
  shared_name = _get_shared_name()
  res = []
  for t in tensors:
    if not device.canonical_name(t.device):
      raise ValueError('Device assignment required for nccl collective ops')
    with ops.device(t.device):
      res.append(
          gen_nccl_ops.nccl_all_reduce(
              t,
              reduction=reduction_op,
              num_devices=len(tensors),
              shared_name=shared_name))
  return res
Example #7
def all_reduce(t,
               group_size,
               group_key,
               instance_key,
               merge_op,
               final_op,
               subdiv_offsets=(0, ),
               communication_hint='auto'):
    """Reduces tensors collectively, across devices.

  Args:
    t: the tensor to be reduced.
    group_size: the total number of tensors to be collectively reduced.
      Each must reside on a different device.
    group_key: an integer identifying the group of devices.
    instance_key: an integer identifying the participating group of Ops.
    merge_op: string naming the binary Op to be applied to compute each
      partial reduction.
    final_op: string naming the unary Op to be applied to each fully
      reduced value.  Can be 'Id' for no operation.
    subdiv_offsets: a list of integer offsets into the tensor at which each
      independent subdivision should begin.  Use [0] if no subdivision should
      be done.
    communication_hint: preferred collective communication.  The implementation
      may fall back to another mechanism.  Options include `auto`, `ring`, and
      `nccl`.

  Returns:
    An Op implementing the distributed reduction.

  Raises:
    ValueError: if any of the input parameter constraints are not met.
  """
    if not device.canonical_name(t.device):
        raise ValueError('Device assignment required for collective ops')
    if group_size <= 1:
        raise ValueError(
            'Parameter group_size to all_reduce must be at least 2.')
    return gen_collective_ops.collective_reduce(
        t,
        group_size=group_size,
        group_key=group_key,
        instance_key=instance_key,
        merge_op=merge_op,
        final_op=final_op,
        subdiv_offsets=subdiv_offsets,
        communication_hint=communication_hint.lower())
Example #8
    def _initialize_devices(self):
        """Helper to initialize devices."""
        # Store list of devices
        self._context_devices = []
        device_list = pywrap_tensorflow.TFE_ContextListDevices(
            self._context_handle)
        try:
            self._num_gpus = 0
            for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
                dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
                self._context_devices.append(pydev.canonical_name(dev_name))
                dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
                if dev_type == "GPU":
                    self._num_gpus += 1

        finally:
            pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #9
  def _initialize_devices(self):
    """Helper to initialize devices."""
    # Store list of devices
    self._context_devices = []
    device_list = pywrap_tensorflow.TFE_ContextListDevices(
        self._context_handle)
    try:
      self._num_gpus = 0
      for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
        dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i)
        self._context_devices.append(pydev.canonical_name(dev_name))
        dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i)
        if dev_type == "GPU":
          self._num_gpus += 1

    finally:
      pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #10
@contextlib.contextmanager
def device(name):
    """Context-manager to force placement of operations and Tensors on a device.

  For example:
  ```python
  with tfe.device('gpu:0'):
    with tfe.device('cpu:0'):
      shape = tfe.Tensor([], dtype=tf.int32)
    x = ops.truncated_normal(shape, tf.float32)
  ```
  will ensure that the `shape` Tensor is on CPU but the `truncated_normal`
  operation runs on GPU 0.

  Args:
    name: Name of the device (see get_default_context().devices()), or None to
      enable automatic placement.

  Yields:
    Nothing.

  Raises:
    ValueError: If name does not correspond to a valid device.
  """
    device_index = -1
    ctx = get_default_context()
    if name is not None:
        name = pydev.canonical_name(name)
        all_devices = ctx.devices()
        for i, d in enumerate(all_devices):
            # TODO(ashankar): This will change when we have distributed support.
            # At that point, should not look for a string suffix but be able to
            # do a full string comparison.
            if d.endswith(name):
                device_index = i
                break
        if device_index < 0:
            raise ValueError(
                "device {} does not match the available devices ({})".format(
                    name, all_devices))
    old_device_index = ctx._device_index  # pylint: disable=protected-access
    try:
        ctx._device_index = device_index  # pylint: disable=protected-access
        yield
    finally:
        ctx._device_index = old_device_index  # pylint: disable=protected-access
Example #11
def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
  """Broadcasts one tensor to a group of others, across devices.

  Args:
    t: the tensor to be sent.
    shape: the shape of the tensor being sent, which must agree with t.
    dtype: the type of the tensor being sent, which must agree with t.
    group_size: one plus the number of receiving tensors, i.e. the total
      number of devices participating.  Each tensor must reside on a
      different device.
    group_key: an integer identifying the group of devices.
    instance_key: an integer identifying the participating group of Ops.

  Returns:
    An Op implementing the distributed broadcast send.

  Raises:
    ValueError: if any of the input parameter constraints are not met.

  Note that the shape and dtype arguments appear redundant since they
  should be obtainable from t.  There are two reasons for including
  them.  First, the shape and type of tensors passed via broadcast must
  be known ahead of time in their most specific form so that the receive
  side can allocate memory for the operation and shape/type inference can
  carry forward from there.  Including the same declarations on the
  send side clarifies a commitment already made.  Secondly, having nearly
  identical use syntax for send and receive sides may simplify tool-driven
  generation of broadcast.
  """
  if not device.canonical_name(t.device):
    raise ValueError('Device assignment required for collective ops')
  if group_size <= 1:
    raise ValueError(
        'Parameter group_size to broadcast_send must be at least 2.')
  if t.shape != shape:
    raise ValueError(
        'Shape of broadcast_send tensor not equal to declared shape')
  if t.dtype != dtype:
    raise ValueError(
        'Type of broadcast_send tensor not equal to declared type')
  return gen_collective_ops.collective_bcast_send(t,
                                                  shape=shape,
                                                  group_size=group_size,
                                                  group_key=group_key,
                                                  instance_key=instance_key)
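A paired usage sketch for the send side above. The matching receive helper, broadcast_recv(shape, dtype, group_size, group_key, instance_key), is assumed to live alongside broadcast_send in the same module, and two local CPU devices plus the key values are again purely illustrative.

from tensorflow.core.protobuf import config_pb2
from tensorflow.python.client import session
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops

config = config_pb2.ConfigProto(device_count={'CPU': 2})
with ops.Graph().as_default():
  with ops.device('/CPU:0'):
    t = constant_op.constant([3., 5.])
    send = broadcast_send(t, t.shape, t.dtype,
                          group_size=2, group_key=1, instance_key=9)
  with ops.device('/CPU:1'):
    # Assumed counterpart to broadcast_send; shape and dtype must match.
    recv = broadcast_recv([2], dtypes.float32,
                          group_size=2, group_key=1, instance_key=9)
  with session.Session(config=config) as sess:
    print(sess.run([send, recv]))  # both evaluate to [3., 5.]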
Example #12
def broadcast_send(t, shape, dtype, group_size, group_key, instance_key):
    """Broadcasts one tensor to a group of others, across devices.

  Args:
    t: the tensor to be sent.
    shape: the shape of the tensor being sent, which must agree with t.
    dtype: the type of the tensor being sent, which must agree with t.
    group_size: one plus the number of receiving tensors, i.e. the total
      number of devices participating.  Each tensor must reside on a
      different device.
    group_key: an integer identifying the group of devices.
    instance_key: an integer identifying the participating group of Ops.

  Returns:
    An Op implementing the distributed broadcast send.

  Raises:
    ValueError: if any of the input parameter constraints are not met.

  Note that the shape and dtype arguments appear redundant since they
  should be obtainable from t.  There are two reasons for including
  them.  First, the shape and type of tensors passed via broadcast must
  be known ahead of time in their most specific form so that the receive
  side can allocate memory for the operation and shape/type inference can
  carry forward from there.  Including the same declarations on the
  send side clarifies a commitment already made.  Secondly, having nearly
  identical use syntax for send and receive sides may simplify tool-driven
  generation of broadcast.
  """
    if not device.canonical_name(t.device):
        raise ValueError('Device assignment required for collective ops')
    if group_size <= 1:
        raise ValueError(
            'Parameter group_size to broadcast_send must be at least 2.')
    if t.shape != shape:
        raise ValueError(
            'Shape of broadcast_send tensor not equal to declared shape')
    if t.dtype != dtype:
        raise ValueError(
            'Type of broadcast_send tensor not equal to declared type')
    return gen_collective_ops.collective_bcast_send(t,
                                                    shape=shape,
                                                    group_size=group_size,
                                                    group_key=group_key,
                                                    instance_key=instance_key)
Example #13
    def _initialize_handle_and_devices(self):
        """Initialize handle and devices."""
        with self._initialize_lock:
            if self._context_handle is not None:
                return
            assert self._context_devices is None
            opts = pywrap_tensorflow.TFE_NewContextOptions()
            try:
                with errors.raise_exception_on_not_ok_status() as status:
                    if self._config is not None:
                        config_str = self._config.SerializeToString()
                        pywrap_tensorflow.TFE_ContextOptionsSetConfig(
                            opts, config_str, len(config_str), status)
                    if self._device_policy is not None:
                        pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
                            opts, self._device_policy)
                    if self._execution_mode == ASYNC:
                        pywrap_tensorflow.TFE_ContextOptionsSetAsync(opts, True)
                    self._context_handle = pywrap_tensorflow.TFE_NewContext(
                        opts, status)
            finally:
                pywrap_tensorflow.TFE_DeleteContextOptions(opts)
            # Store list of devices
            self._context_devices = []
            with errors.raise_exception_on_not_ok_status() as status:
                device_list = pywrap_tensorflow.TFE_ContextListDevices(
                    self._context_handle, status)
            try:
                self._num_gpus = 0
                for i in range(
                        pywrap_tensorflow.TF_DeviceListCount(device_list)):
                    with errors.raise_exception_on_not_ok_status() as status:
                        dev_name = pywrap_tensorflow.TF_DeviceListName(
                            device_list, i, status)
                    self._context_devices.append(
                        pydev.canonical_name(dev_name))
                    with errors.raise_exception_on_not_ok_status() as status:
                        dev_type = pywrap_tensorflow.TF_DeviceListType(
                            device_list, i, status)
                    if dev_type == "GPU":
                        self._num_gpus += 1

            finally:
                pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #14
  def testCanonicalName(self):
    self.assertEqual("/job:foo/replica:0",
                     device.canonical_name("/job:foo/replica:0"))
    self.assertEqual("/job:foo/replica:0",
                     device.canonical_name("/replica:0/job:foo"))

    self.assertEqual("/job:foo/replica:0/task:0",
                     device.canonical_name("/job:foo/replica:0/task:0"))
    self.assertEqual("/job:foo/replica:0/task:0",
                     device.canonical_name("/job:foo/task:0/replica:0"))

    self.assertEqual("/device:CPU:0", device.canonical_name("/device:CPU:0"))
    self.assertEqual("/device:GPU:2", device.canonical_name("/device:GPU:2"))

    self.assertEqual(
        "/job:foo/replica:0/task:0/device:GPU:0",
        device.canonical_name("/job:foo/replica:0/task:0/device:GPU:0"))
    self.assertEqual(
        "/job:foo/replica:0/task:0/device:GPU:0",
        device.canonical_name("/device:GPU:0/task:0/replica:0/job:foo"))
Example #15
  def _GroupByDevices(self, vars_to_save):
    """Group Variable tensor slices per device.

    TODO(touts): Make sure that all the devices found are on different
    job/replica/task/cpu|gpu.  It would be bad if 2 were on the same device.
    It can happen if the devices are unspecified.

    Args:
      vars_to_save: A list of BaseSaverBuilder.VarToSave objects.

    Returns:
      A list of (device_name, BaseSaverBuilder.VarToSave) tuples.
      The list is sorted by ascending device_name.
    """
    per_device = collections.defaultdict(lambda: [])
    for var_to_save in vars_to_save:
      canonical_device = pydev.canonical_name(var_to_save.var.device)
      per_device[canonical_device].append(var_to_save)
    return sorted(per_device.items(), key=lambda t: t[0])
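A standalone sketch of the same grouping idea, using plain (name, device) pairs instead of BaseSaverBuilder.VarToSave objects; the pydev alias is the usual internal import of tensorflow.python.framework.device, and the names below are illustrative. Note how "/cpu:0" and "/device:CPU:0" land in the same bucket once canonicalized.

import collections
from tensorflow.python.framework import device as pydev

def group_by_device(named_devices):
  """Groups (name, device) pairs by canonical device name."""
  per_device = collections.defaultdict(list)
  for name, dev in named_devices:
    per_device[pydev.canonical_name(dev)].append(name)
  return sorted(per_device.items())

print(group_by_device([('v0', '/cpu:0'),
                       ('v1', '/device:CPU:0'),
                       ('v2', '/gpu:1')]))
# [('/device:CPU:0', ['v0', 'v1']), ('/device:GPU:1', ['v2'])]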
Example #16
    def _GroupByDevices(self, vars_to_save):
        """Group Variable tensor slices per device.

    TODO(touts): Make sure that all the devices found are on different
    job/replica/task/cpu|gpu.  It would be bad if 2 were on the same device.
    It can happen if the devices are unspecified.

    Args:
      vars_to_save: A list of BaseSaverBuilder.VarToSave objects.

    Returns:
      A list of (device_name, BaseSaverBuilder.VarToSave) tuples.
      The list is sorted by ascending device_name.
    """
        per_device = collections.defaultdict(lambda: [])
        for var_to_save in vars_to_save:
            canonical_device = pydev.canonical_name(var_to_save.var.device)
            per_device[canonical_device].append(var_to_save)
        return sorted(per_device.items(), key=lambda t: t[0])
Example #17
  def testCanonicalName(self):
    self.assertEqual("/job:foo/replica:0",
                     device.canonical_name("/job:foo/replica:0"))
    self.assertEqual("/job:foo/replica:0",
                     device.canonical_name("/replica:0/job:foo"))

    self.assertEqual("/job:foo/replica:0/task:0",
                     device.canonical_name("/job:foo/replica:0/task:0"))
    self.assertEqual("/job:foo/replica:0/task:0",
                     device.canonical_name("/job:foo/task:0/replica:0"))

    self.assertEqual("/device:CPU:0",
                     device.canonical_name("/device:CPU:0"))
    self.assertEqual("/device:GPU:2",
                     device.canonical_name("/device:GPU:2"))

    self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                     device.canonical_name(
                         "/job:foo/replica:0/task:0/gpu:0"))
    self.assertEqual("/job:foo/replica:0/task:0/device:GPU:0",
                     device.canonical_name(
                         "/gpu:0/task:0/replica:0/job:foo"))
Example #18
  def _initialize_handle_and_devices(self):
    """Initialize handle and devices."""
    with self._initialize_lock:
      if self._context_handle is not None:
        return
      assert self._context_devices is None
      opts = pywrap_tensorflow.TFE_NewContextOptions()
      try:
        with errors.raise_exception_on_not_ok_status() as status:
          if self._config is not None:
            config_str = self._config.SerializeToString()
            pywrap_tensorflow.TFE_ContextOptionsSetConfig(
                opts, config_str, len(config_str), status)
          if self._device_policy is not None:
            pywrap_tensorflow.TFE_ContextOptionsSetDevicePlacementPolicy(
                opts, self._device_policy)
          self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
      finally:
        pywrap_tensorflow.TFE_DeleteContextOptions(opts)
      # Store list of devices
      self._context_devices = []
      with errors.raise_exception_on_not_ok_status() as status:
        device_list = pywrap_tensorflow.TFE_ContextListDevices(
            self._context_handle, status)
      try:
        self._num_gpus = 0
        for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
          with errors.raise_exception_on_not_ok_status() as status:
            dev_name = pywrap_tensorflow.TF_DeviceListName(
                device_list, i, status)
          self._context_devices.append(pydev.canonical_name(dev_name))
          with errors.raise_exception_on_not_ok_status() as status:
            dev_type = pywrap_tensorflow.TF_DeviceListType(
                device_list, i, status)
          if dev_type == "GPU":
            self._num_gpus += 1

      finally:
        pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #19
    def __init__(self):
        self._eager_context = _EagerContext()
        # Create a handle
        opts = pywrap_tensorflow.TF_NewSessionOptions(
            target=compat.as_bytes(""), config=None)
        with errors.raise_exception_on_not_ok_status() as status:
            self._handle = pywrap_tensorflow.TFE_NewContext(opts, status)
            pywrap_tensorflow.TF_DeleteSessionOptions(opts)
        # Store list of devices
        self._devices = []
        with errors.raise_exception_on_not_ok_status() as status:
            device_list = pywrap_tensorflow.TFE_ContextListDevices(
                self._handle, status)
        try:
            for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
                with errors.raise_exception_on_not_ok_status() as status:
                    dev_name = pywrap_tensorflow.TF_DeviceListName(
                        device_list, i, status)
                self._devices.append(pydev.canonical_name(dev_name))
        finally:
            pywrap_tensorflow.TF_DeleteDeviceList(device_list)

        self._summary_writer_resource = None
Example #20
 def _initialize_handle_and_devices(self):
   """Initialize handle and devices."""
   with self._initialize_lock:
     if self._context_handle is not None:
       return
     assert self._context_devices is None
     opts = pywrap_tensorflow.TF_NewSessionOptions(
         target=compat.as_bytes(""), config=self._config)
     with errors.raise_exception_on_not_ok_status() as status:
       self._context_handle = pywrap_tensorflow.TFE_NewContext(opts, status)
       pywrap_tensorflow.TF_DeleteSessionOptions(opts)
     # Store list of devices
     self._context_devices = []
     with errors.raise_exception_on_not_ok_status() as status:
       device_list = pywrap_tensorflow.TFE_ContextListDevices(
           self._context_handle, status)
     try:
       for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)):
         with errors.raise_exception_on_not_ok_status() as status:
           dev_name = pywrap_tensorflow.TF_DeviceListName(
               device_list, i, status)
         self._context_devices.append(pydev.canonical_name(dev_name))
     finally:
       pywrap_tensorflow.TF_DeleteDeviceList(device_list)
Example #21
def _check_device(tensor, expected=None):
  if not device.canonical_name(tensor.device):
    raise ValueError('Device assignment required for nccl collective ops')
  if expected and expected != tensor.device:
    raise ValueError('Expected device %s, got %s' % (expected, tensor.device))
Example #22
 def _get_device_name(handle):
   """The device name encoded in the handle."""
   handle_str = compat.as_str_any(handle)
   return pydev.canonical_name(handle_str.split(";")[-1])
Example #23
def _check_device(tensor, expected=None):
    if not device.canonical_name(tensor.device):
        raise ValueError('Device assignment required for nccl collective ops')
    if expected and expected != tensor.device:
        raise ValueError('Expected device %s, got %s' %
                         (expected, tensor.device))
Example #24
 def _get_device_name(handle):
     """The device name encoded in the handle."""
     handle_str = compat.as_str_any(handle)
     return pydev.canonical_name(handle_str.split(";")[-1])
Example #25
def _check_device_assignment(tensor):
    if not device.canonical_name(tensor.device):
        raise ValueError('Device assignment required for nccl collective ops')
Example #26
def _check_device_assignment(tensor):
  if not device.canonical_name(tensor.device):
    raise ValueError('Device assignment required for nccl collective ops')