def testRaggedExpandDims(self,
                             rt_input,
                             axis,
                             expected,
                             ragged_rank=None,
                             expected_shape=None):
        rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
        expanded = ragged_array_ops.expand_dims(rt, axis=axis)
        self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
        if expected_shape is not None:
            self.assertEqual(expanded.shape.as_list(), expected_shape)

        self.assertAllEqual(expanded, expected)
  def testRaggedExpandDims(self,
                           rt_input,
                           axis,
                           expected,
                           ragged_rank=None,
                           expected_shape=None):
    rt = ragged_factory_ops.constant(rt_input, ragged_rank=ragged_rank)
    expanded = ragged_array_ops.expand_dims(rt, axis=axis)
    self.assertEqual(expanded.shape.ndims, rt.shape.ndims + 1)
    if expected_shape is not None:
      self.assertEqual(expanded.shape.as_list(), expected_shape)

    self.assertRaggedEqual(expanded, expected)
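
For readers skimming these test harnesses, a minimal standalone sketch of what `expand_dims` does to a ragged tensor, using the public TF 2.x API (the values here are illustrative, not taken from the test suite above):

```python
import tensorflow as tf

rt = tf.ragged.constant([[1, 2], [3]])    # shape (2, None)

# axis=0 wraps the whole tensor in a new outer dimension of size 1.
print(tf.expand_dims(rt, axis=0).shape)   # (1, 2, None)

# axis=-1 gives every scalar its own length-1 innermost dimension.
print(tf.expand_dims(rt, axis=-1).shape)  # (2, None, 1)
```
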
Example #3
def unicode_split(input,
                  input_encoding,
                  errors="replace",
                  replacement_char=0xFFFD,
                  name=None):
    r"""Splits each string in `input` into a sequence of Unicode code points.

  `result[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
  `j`th character, when decoded using `input_encoding`.

  Args:
    input: An `N` dimensional potentially ragged `string` tensor with shape
      `[D1...DN]`.  `N` must be statically known.
    input_encoding: String name for the unicode encoding that should be used to
      decode each string.
    errors: Specifies the response when an input string can't be converted
      using the indicated encoding. One of:
      * `'strict'`: Raise an exception for any illegal substrings.
      * `'replace'`: Replace illegal substrings with `replacement_char`.
      * `'ignore'`: Skip illegal substrings.
    replacement_char: The replacement codepoint to be used in place of invalid
      substrings in `input` when `errors='replace'`.
    name: A name for the operation (optional).

  Returns:
    An `N+1` dimensional `string` tensor with shape `[D1...DN, (num_chars)]`.
    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
    `tf.RaggedTensor` otherwise.

  #### Example:
    ```python
    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
    >>> tf.strings.unicode_split(input, 'UTF-8').to_list()
    [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd', b'n', b'i', b'g', b'h', b't'],
     [b'\xf0\x9f\x98\x8a']]
    ```
  """
    with ops.name_scope(name, "UnicodeSplit", [input]):
        codepoints = _unicode_decode(input,
                                     input_encoding,
                                     errors,
                                     replacement_char,
                                     False,
                                     with_offsets=False)
        return unicode_encode(ragged_array_ops.expand_dims(codepoints, -1),
                              output_encoding=input_encoding,
                              errors=errors,
                              replacement_char=replacement_char)
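
The implementation above leans on a small trick: decode each string to codepoints, wrap every codepoint in its own length-1 row with `expand_dims(..., -1)`, then re-encode so each row becomes a one-character string. A rough standalone equivalent using the public TF 2.x API (values illustrative):

```python
import tensorflow as tf

words = tf.constant([u'G\xf6\xf6d'.encode('utf-8')])

# Decode each string into a ragged row of Unicode codepoints: shape [1, (4)].
codepoints = tf.strings.unicode_decode(words, 'UTF-8')

# Give each codepoint its own length-1 innermost row: shape [1, (4), 1].
per_char = tf.expand_dims(codepoints, axis=-1)

# Re-encoding the innermost dimension yields one string per character.
chars = tf.strings.unicode_encode(per_char, 'UTF-8')
print(chars.to_list())  # [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd']]
```
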
Example #4
def unicode_split(input,
                  input_encoding,
                  errors="replace",
                  replacement_char=0xFFFD,
                  name=None):
  r"""Splits each string in `input` into a sequence of Unicode code points.

  `result[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
  `j`th character, when decoded using `input_encoding`.

  Args:
    input: An `N` dimensional potentially ragged `string` tensor with shape
      `[D1...DN]`.  `N` must be statically known.
    input_encoding: String name for the unicode encoding that should be used to
      decode each string.
    errors: Specifies the response when an input string can't be converted
      using the indicated encoding. One of:
      * `'strict'`: Raise an exception for any illegal substrings.
      * `'replace'`: Replace illegal substrings with `replacement_char`.
      * `'ignore'`: Skip illegal substrings.
    replacement_char: The replacement codepoint to be used in place of invalid
      substrings in `input` when `errors='replace'`.
    name: A name for the operation (optional).

  Returns:
    An `N+1` dimensional `string` tensor with shape `[D1...DN, (num_chars)]`.
    The returned tensor is a `tf.Tensor` if `input` is a scalar, or a
    `tf.RaggedTensor` otherwise.

  #### Example:
    ```python
    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
    >>> tf.strings.unicode_split(input, 'UTF-8').to_list()
    [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd', b'n', b'i', b'g', b'h', b't'],
     [b'\xf0\x9f\x98\x8a']]
    ```
  """
  with ops.name_scope(name, "UnicodeSplit", [input]):
    codepoints = _unicode_decode(input, input_encoding, errors,
                                 replacement_char, False, with_offsets=False)
    return unicode_encode(
        ragged_array_ops.expand_dims(codepoints, -1),
        output_encoding=input_encoding,
        errors=errors,
        replacement_char=replacement_char)
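
As a quick usage note (not part of the source above): invalid byte sequences are handled per the `errors` argument. With the default `'replace'` they decode to U+FFFD, whose UTF-8 encoding is `b'\xef\xbf\xbd'`:

```python
import tensorflow as tf

# b'\xff' is not valid UTF-8, so it is replaced by U+FFFD under 'replace'.
print(tf.strings.unicode_split(b'ab\xff', 'UTF-8', errors='replace').numpy())
# [b'a' b'b' b'\xef\xbf\xbd']

# 'ignore' drops the illegal substring instead.
print(tf.strings.unicode_split(b'ab\xff', 'UTF-8', errors='ignore').numpy())
# [b'a' b'b']
```
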
Example #5
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
    """Helper function to concatenate or stack ragged tensors.

  Args:
    rt_inputs: A list of RaggedTensors or Tensors to combine.
    axis: The axis along which to concatenate or stack.
    stack_values: A boolean -- if true, then stack values; otherwise,
      concatenate them.

  Returns:
    A RaggedTensor.
  Raises:
    ValueError: If rt_inputs is empty, or if axis is out of range.
  """
    # Validate parameters.
    if not rt_inputs:
        raise ValueError('rt_inputs may not be empty.')

    # Convert input tensors.
    rt_inputs = [
        ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input,
                                                         name='rt_input')
        for rt_input in rt_inputs
    ]
    row_splits_dtype, rt_inputs = ragged_tensor.match_row_splits_dtypes(
        *rt_inputs, return_dtype=True)
    rt_inputs = list(rt_inputs)

    # Special case: if there's only one input, then return it as-is.
    if len(rt_inputs) == 1:
        if stack_values:
            return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
        else:
            return rt_inputs[0]

    # Check the rank (number of dimensions) of the input tensors.
    ndims = None
    for rt in rt_inputs:
        if ndims is None:
            ndims = rt.shape.ndims
        else:
            rt.shape.assert_has_rank(ndims)

    out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
    axis = ragged_util.get_positive_axis(axis, out_ndims)

    # If all the inputs are Tensors, and we're combining the final dimension,
    # then we can delegate to the tf.stack/tf.concat operation, and return a
    # Tensor.
    if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
        if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
            if stack_values:
                return array_ops.stack(rt_inputs, axis)
            else:
                return array_ops.concat(rt_inputs, axis)

    # Convert any Tensor inputs to RaggedTensors.  This makes it
    # possible to concatenate Tensors and RaggedTensors together.
    for i in range(len(rt_inputs)):
        if not ragged_tensor.is_ragged(rt_inputs[i]):
            rt_inputs[i] = ragged_tensor.RaggedTensor.from_tensor(
                rt_inputs[i], ragged_rank=1, row_splits_dtype=row_splits_dtype)

    # Convert the input tensors to all have the same ragged_rank.
    ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
    rt_inputs = [
        _increase_ragged_rank_to(rt, ragged_rank, row_splits_dtype)
        for rt in rt_inputs
    ]

    if axis == 0:
        return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
    elif axis == 1:
        return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
    else:  # axis > 1: recurse.
        values = [rt.values for rt in rt_inputs]
        splits = [[rt_input.row_splits] for rt_input in rt_inputs]
        with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
            return ragged_tensor.RaggedTensor.from_row_splits(
                _ragged_stack_concat_helper(values, axis - 1, stack_values),
                splits[0][0],
                validate=False)
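
To see the axis handling above in action, here is a small usage sketch with the public wrappers, which route into this helper for ragged inputs in TF 2.x (values illustrative):

```python
import tensorflow as tf

a = tf.ragged.constant([[1, 2], [3]])
b = tf.ragged.constant([[4], [5, 6, 7]])

# axis=0 concatenates the lists of rows.
print(tf.concat([a, b], axis=0).to_list())        # [[1, 2], [3], [4], [5, 6, 7]]

# axis=1 concatenates corresponding rows element-wise.
print(tf.concat([a, b], axis=1).to_list())        # [[1, 2, 4], [3, 5, 6, 7]]

# Stacking adds a new dimension instead of merging an existing one.
print(tf.ragged.stack([a, b], axis=0).to_list())  # [[[1, 2], [3]], [[4], [5, 6, 7]]]
```
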
Example #6
def gather_nd(params, indices, batch_dims=0, name=None):
  """Gather slices from `params` using `n`-dimensional indices.

  This operation is similar to `gather`, but it uses the innermost dimension
  of `indices` to define a slice into `params`.  In particular, if:

  * `indices` has shape `[A1...AN, I]`
  * `params` has shape `[B1...BM]`

  Then:

  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
  * `result[a1...aN] = params[indices[a1...aN, :]]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BM]`.
    indices: A potentially ragged tensor with shape `[A1...AN, I]`.
    batch_dims: Must be zero.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.

  #### Examples:
    ```python
    >>> params = tf.compat.v1.ragged.constant_value(
    ...     [ [ ['000', '001'], ['010'              ]          ],
    ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
    ...       [ [            ], ['210'              ]          ] ])

    >>> # Gather 2D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2], [0]])
    [ [ [            ], ['210'] ]
      [ ['000', '001'], ['010'] ] ]

    >>> # Gather 1D slices from a 3D tensor
    >>> ragged.gather_nd(params, [[2, 1], [0, 0]])
    [['210'], ['000', '001']]

    >>> # Gather scalars from a 3D tensor
    >>> ragged.gather_nd(params, [[0, 0, 1], [1, 1, 2]])
    ['001', '112']
    ```
  """
  if not isinstance(batch_dims, int) or batch_dims != 0:
    raise ValueError('batch_dims != 0 is not supported for ragged gather yet.')
  if not (ragged_tensor.is_ragged(params) or ragged_tensor.is_ragged(indices)):
    return array_ops.gather_nd(params, indices, name)

  with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):

    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params')
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices')
    params, indices = ragged_tensor.match_row_splits_dtypes(params, indices)
    indices_shape = indices.shape
    indices_ndims = indices_shape.ndims
    if indices_ndims is None:
      raise ValueError('indices.rank must be statically known.')
    if indices_ndims == 0:
      raise ValueError('indices.rank must be at least 1.')
    if (ragged_tensor.is_ragged(indices) and
        indices_ndims == indices.ragged_rank + 1):
      raise ValueError('The innermost dimension of indices may not be ragged')

    # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
    # that each index slices into.
    index_size = tensor_shape.dimension_value(indices_shape[-1])
    if index_size is None:
      raise ValueError('indices.shape[-1] must be statically known.')

    # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
    # dense, then we convert it to ragged before recursing, and then convert
    # the result back to `dense` if appropriate.
    if indices_ndims > 2:
      indices_is_dense = not ragged_tensor.is_ragged(indices)
      if indices_is_dense:
        indices = ragged_tensor.RaggedTensor.from_tensor(
            indices, ragged_rank=indices_ndims - 2,
            row_splits_dtype=params.row_splits.dtype)
      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
      if (indices_is_dense and ragged_tensor.is_ragged(result) and
          result.ragged_rank == indices_ndims - 2):
        result = ragged_tensor.RaggedTensor.to_tensor(result)
      return result

    # indices_ndims <= 2, and the innermost dimension of indices may not be
    # ragged, so `indices` must not be ragged.
    assert not ragged_tensor.is_ragged(indices)
    assert ragged_tensor.is_ragged(params)

    # Handle corner case: An empty index tuple selects the entire `params`
    # value.  So if `index_size` is zero, then tile `params`.
    if index_size == 0:
      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
      for dim in range(indices_ndims - 1):
        params = ragged_array_ops.expand_dims(params, axis=0)
      multiples = array_ops.concat([
          array_ops.shape(indices)[:-1],
          array_ops.ones([params_ndims], dtypes.int32)
      ], axis=0)
      return ragged_array_ops.tile(params, multiples)

    # When index_size=1, we can just flatten the index tuples and use gather.
    elif index_size == 1:
      flattened_index_tuples = array_ops.reshape(indices, [-1])
      return gather(params, flattened_index_tuples)

    # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
    # Flatten both the index tuples and the params, such that the flattened
    # index tuples point to the correct values in the flattened params; and
    # then use ragged.gather on the flattened index tuples & params.
    else:
      indices = math_ops.cast(indices, params.row_splits.dtype)

      # Flatten the outermost 2 dimensions of the index tuples & params.
      flattened_index_tuples = array_ops.gather(params.row_splits,
                                                indices[..., 0])
      flattened_index_tuples += indices[..., 1]
      flattened_params = params.values

      # Flatten any remaining dimensions.
      for dim in range(2, index_size):
        if not ragged_tensor.is_ragged(flattened_params):
          flattened_index_tuples = array_ops.expand_dims(
              flattened_index_tuples, axis=1)
          flattened_index_tuples = array_ops.concat(
              [flattened_index_tuples, indices[..., dim:]], axis=1)
          return array_ops.gather_nd(flattened_params, flattened_index_tuples)

        flattened_index_tuples = array_ops.gather(
            flattened_params.row_starts(), flattened_index_tuples)
        flattened_index_tuples += indices[..., dim]
        flattened_params = flattened_params.values

      # Gather using the flattened index tuples and params.
      return gather(flattened_params, flattened_index_tuples)
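
A hand-rolled sketch of the `index_size == 2` flattening step above, for a `ragged_rank`-1 `params` (illustrative values): the index tuple `(i0, i1)` maps to `values[row_splits[i0] + i1]`.

```python
import tensorflow as tf

params = tf.ragged.constant([['a', 'b'], [], ['c', 'd', 'e']])
indices = tf.constant([[0, 1], [2, 0]], dtype=tf.int64)

# row_splits is [0, 2, 2, 5]; row i0 begins at values[row_splits[i0]].
flat = tf.gather(params.row_splits, indices[:, 0]) + indices[:, 1]

print(tf.gather(params.values, flat).numpy())  # [b'b' b'c']
```
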
Example #7
def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None):  # pylint: disable=redefined-builtin
    if dim is not None:
        axis = dim
    return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
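
This shim appears to back the TF1 endpoint, where the deprecated `dim` keyword is an alias for `axis`. A hedged usage sketch, assuming the ragged dispatch for `tf.compat.v1.expand_dims` is registered as above:

```python
import tensorflow as tf

rt = tf.ragged.constant([[1, 2], [3]])

# The deprecated `dim` keyword behaves exactly like `axis`.
a = tf.compat.v1.expand_dims(rt, dim=0)
b = tf.compat.v1.expand_dims(rt, axis=0)
print(a.shape, b.shape)  # (1, 2, None) (1, 2, None)
```
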
Example #8
def batch_gather_with_default(params, indices, default_value='', name=None):
    """Same as `batch_gather` but inserts `default_value` for invalid indices.

  This operation is similar to `batch_gather` except that it will substitute
  the value for invalid indices with `default_value` as the contents.
  See `batch_gather` for more details.


  Args:
    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
      `M>0`).
    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
    default_value: A value to be inserted in places where `indices` are out of
      bounds. Must be the same dtype as params and either a scalar or rank 1.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.

  #### Example:
    ```python
    >>> params = tf.ragged.constant([
          ['a', 'b', 'c'],
          ['d'],
          [],
          ['e']])
    >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]])
    >>> batch_gather_with_default(params, indices, 'FOO')
    [['b', 'c', 'FOO'], [], [], ['e', 'FOO']]
  ```
  """
    with ops.name_scope(name, 'RaggedBatchGatherWithDefault'):
        params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            params,
            name='params',
        )
        indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            indices,
            name='indices',
        )
        default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            default_value,
            name='default_value',
        )
        # TODO(hterry): lift this restriction and support default_values
        #               of rank > 1
        if (default_value.shape.ndims != 0
                and default_value.shape.ndims != 1):
            raise ValueError('"default_value" must be a scalar or vector')
        upper_bounds = None
        if indices.shape.ndims is None:
            raise ValueError('Indices must have a known rank.')
        if params.shape.ndims is None:
            raise ValueError('Params must have a known rank.')

        num_batch_dimensions = indices.shape.ndims - 1
        pad = None
        # The logic for this works as follows:
        # - create a padded params, where:
        #    padded_params[b1...bn, 0] = default_value
        #    padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0)
        # - create an `upper_bounds` Tensor that contains the number of elements
        #   in each innermost rank. Broadcast `upper_bounds` to be the same shape
        #   as `indices`.
        # - check to see which indices in `indices` are out of bounds and
        #   substitute them with the index containing `default_value` (the first).
        # - call batch_gather with the indices adjusted.
        with ops.control_dependencies([
                check_ops.assert_greater_equal(array_ops.rank(params),
                                               array_ops.rank(indices))
        ]):
            if ragged_tensor.is_ragged(params):
                row_lengths = ragged_array_ops.expand_dims(
                    params.row_lengths(axis=num_batch_dimensions), axis=-1)
                upper_bounds = math_ops.cast(row_lengths, indices.dtype)

                pad_shape = _get_pad_shape(params, indices)

                pad = ragged_tensor_shape.broadcast_to(default_value,
                                                       pad_shape)
            else:
                params_shape = array_ops.shape(params)
                pad_shape = array_ops.concat([
                    params_shape[:num_batch_dimensions], [1],
                    params_shape[num_batch_dimensions + 1:params.shape.ndims]
                ], 0)
                upper_bounds = params_shape[num_batch_dimensions]
                pad = array_ops.broadcast_to(default_value, pad_shape)

            # Add `default_value` as the first value in the innermost (ragged) rank.
            pad = math_ops.cast(pad, params.dtype)
            padded_params = array_ops.concat([pad, params],
                                             axis=num_batch_dimensions)

            # Adjust the indices by replacing out-of-bound indices with the
            # default-value index (which is the first element).
            shifted_indices = indices + 1
            is_out_of_bounds = (indices < 0) | (indices > upper_bounds)
            adjusted_indices = ragged_where_op.where(
                is_out_of_bounds,
                x=array_ops.zeros_like(indices),
                y=shifted_indices,
            )
            return array_ops.batch_gather(params=padded_params,
                                          indices=adjusted_indices,
                                          name=name)
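
The comment block above describes a pad-and-shift scheme; here is a dense-only sketch of the same idea (hypothetical values, using `tf.gather(..., batch_dims=1)` in place of `batch_gather`):

```python
import tensorflow as tf

params = tf.constant([['a', 'b', 'c'], ['d', 'e', 'f']])
indices = tf.constant([[1, 5], [-1, 0]])

# Prepend one default per row, so the default lives at column 0.
pad = tf.broadcast_to(tf.constant('FOO'), [2, 1])
padded = tf.concat([pad, params], axis=1)

# Shift valid indices by one; route invalid ones to the default column.
valid = (indices >= 0) & (indices < tf.shape(params)[1])
adjusted = tf.where(valid, indices + 1, tf.zeros_like(indices))

print(tf.gather(padded, adjusted, batch_dims=1).numpy())
# [[b'b' b'FOO']
#  [b'FOO' b'd']]
```
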
Example #9
def unicode_split_with_offsets(input,
                               input_encoding,
                               errors="replace",
                               replacement_char=0xFFFD,
                               name=None):
    r"""Splits each string into a sequence of code points with start offsets.

  This op is similar to `tf.strings.unicode_split(...)`, but it also returns the
  start offset for each character in its respective string.  This information
  can be used to align the characters with the original byte sequence.

  Returns a tuple `(chars, start_offsets)` where:

  * `chars[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
    `j`th character, when decoded using `input_encoding`.
  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
    character in `input[i1...iN]`, when decoded using `input_encoding`.

  Args:
    input: An `N` dimensional potentially ragged `string` tensor with shape
      `[D1...DN]`.  `N` must be statically known.
    input_encoding: String name for the unicode encoding that should be used to
      decode each string.
    errors: Specifies the response when an input string can't be converted
      using the indicated encoding. One of:
      * `'strict'`: Raise an exception for any illegal substrings.
      * `'replace'`: Replace illegal substrings with `replacement_char`.
      * `'ignore'`: Skip illegal substrings.
    replacement_char: The replacement codepoint to be used in place of invalid
      substrings in `input` when `errors='replace'`.
    name: A name for the operation (optional).

  Returns:
    A tuple of `N+1` dimensional tensors `(chars, start_offsets)`.

    * `chars` is a `string` tensor with shape `[D1...DN, (num_chars)]`.
    * `start_offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.

    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
    `tf.RaggedTensor`s otherwise.

  #### Example:

  >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
  >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8')
  >>> result[0].to_list()  # character substrings
  [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd', b'n', b'i', b'g', b'h', b't'],
   [b'\xf0\x9f\x98\x8a']]
  >>> result[1].to_list()  # offsets
  [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]

  """
    with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]):
        codepoints, offsets = _unicode_decode(input,
                                              input_encoding,
                                              errors,
                                              replacement_char,
                                              False,
                                              with_offsets=True)
        chars = unicode_encode(ragged_array_ops.expand_dims(codepoints, -1),
                               output_encoding=input_encoding,
                               errors=errors,
                               replacement_char=replacement_char)
        return chars, offsets
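
As a quick standalone check of the offsets contract (scalar input returns plain `tf.Tensor`s rather than ragged ones):

```python
import tensorflow as tf

s = u'G\xf6\xf6d'.encode('utf-8')   # b'G\xc3\xb6\xc3\xb6d'
chars, offsets = tf.strings.unicode_split_with_offsets(s, 'UTF-8')

print(chars.numpy())    # [b'G' b'\xc3\xb6' b'\xc3\xb6' b'd']
print(offsets.numpy())  # [0 1 3 5] -- byte offset where each character starts
```
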
def batch_gather_with_default(params,
                              indices,
                              default_value='',
                              name=None):
  """Same as `batch_gather` but inserts `default_value` for invalid indices.

  This operation is similar to `batch_gather` except that it will substitute
  the value for invalid indices with `default_value` as the contents.
  See `batch_gather` for more details.


  Args:
    params: A potentially ragged tensor with shape `[B1...BN, P1...PM]` (`N>=0`,
      `M>0`).
    indices: A potentially ragged tensor with shape `[B1...BN, I]` (`N>=0`).
    default_value: A value to be inserted in places where `indices` are out of
      bounds. Must be the same dtype as params and either a scalar or rank 1.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[B1...BN, I, P2...PM]`.
    `result.ragged_rank = max(indices.ragged_rank, params.ragged_rank)`.

  #### Example:
    ```python
    >>> params = tf.ragged.constant([
          ['a', 'b', 'c'],
          ['d'],
          [],
          ['e']])
    >>> indices = tf.ragged.constant([[1, 2, -1], [], [], [0, 10]])
    >>> batch_gather_with_default(params, indices, 'FOO')
    [['b', 'c', 'FOO'], [], [], ['e', 'FOO']]
  ```
  """
  with ops.name_scope(name, 'RaggedBatchGatherWithDefault'):
    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        params, name='params',
    )
    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        indices, name='indices',
    )
    default_value = ragged_tensor.convert_to_tensor_or_ragged_tensor(
        default_value, name='default_value',
    )
    # TODO(hterry): lift this restriction and support default_values
    #               of rank > 1
    if (default_value.shape.ndims != 0
        and default_value.shape.ndims != 1):
      raise ValueError('"default_value" must be a scalar or vector')
    upper_bounds = None
    if indices.shape.ndims is None:
      raise ValueError('Indices must have a known rank.')
    if params.shape.ndims is None:
      raise ValueError('Params must have a known rank.')

    num_batch_dimensions = indices.shape.ndims - 1
    pad = None
    # The logic for this works as follows:
    # - create a padded params, where:
    #    padded_params[b1...bn, 0] = default_value
    #    padded_params[b1...bn, i] = params[b1...bn, i-1] (i>0)
    # - create an `upper_bounds` Tensor that contains the number of elements
    #   in each innermost rank. Broadcast `upper_bounds` to be the same shape
    #   as `indices`.
    # - check to see which indices in `indices` are out of bounds and
    #   substitute them with the index containing `default_value` (the first).
    # - call batch_gather with the indices adjusted.
    with ops.control_dependencies([
        check_ops.assert_greater_equal(array_ops.rank(params),
                                       array_ops.rank(indices))]):
      if ragged_tensor.is_ragged(params):
        row_lengths = ragged_array_ops.expand_dims(
            params.row_lengths(axis=num_batch_dimensions),
            axis=-1)
        upper_bounds = math_ops.cast(row_lengths, indices.dtype)

        pad_shape = _get_pad_shape(params, indices)

        pad = ragged_tensor_shape.broadcast_to(
            default_value, pad_shape)
      else:
        params_shape = array_ops.shape(params)
        pad_shape = array_ops.concat([
            params_shape[:num_batch_dimensions],
            [1],
            params_shape[num_batch_dimensions + 1:params.shape.ndims]
        ], 0)
        upper_bounds = params_shape[num_batch_dimensions]
        pad = array_ops.broadcast_to(default_value, pad_shape)

      # Add `default_value` as the first value in the innermost (ragged) rank.
      pad = math_ops.cast(pad, params.dtype)
      padded_params = array_ops.concat(
          [pad, params], axis=num_batch_dimensions)

      # Adjust the indices by replacing out-of-bound indices with the
      # default-value index (which is the first element).
      shifted_indices = indices + 1
      is_out_of_bounds = (indices < 0) | (indices > upper_bounds)
      adjusted_indices = ragged_where_op.where(
          is_out_of_bounds,
          x=array_ops.zeros_like(indices), y=shifted_indices,
      )
      return array_ops.batch_gather(
          params=padded_params, indices=adjusted_indices, name=name)
Example #11
def _ragged_expand_dims_v1(input, axis=None, name=None, dim=None):  # pylint: disable=redefined-builtin
  if dim is not None:
    axis = dim
  return ragged_array_ops.expand_dims(input=input, axis=axis, name=name)
def gather_nd(params, indices, batch_dims=0, name=None):
    """Gather slices from `params` using `n`-dimensional indices.

  This operation is similar to `gather`, but it uses the innermost dimension
  of `indices` to define a slice into `params`.  In particular, if:

  * `indices` has shape `[A1...AN, I]`
  * `params` has shape `[B1...BM]`

  Then:

  * `result` has shape `[A1...AN, B_{I+1}...BM]`.
  * `result[a1...aN] = params[indices[a1...aN, :]]`

  Args:
    params: A potentially ragged tensor with shape `[B1...BM]`.
    indices: A potentially ragged tensor with shape `[A1...AN, I]`.
    batch_dims: Must be zero.
    name: A name for the operation (optional).

  Returns:
    A potentially ragged tensor with shape `[A1...AN, B_{I+1}...BM]`.

  #### Examples:

  >>> params = tf.ragged.constant(
  ...     [ [ ['000', '001'], ['010'              ]          ],
  ...       [ ['100'       ], ['110', '111', '112'], ['120'] ],
  ...       [ [            ], ['210'              ]          ] ])

  >>> # Gather 2D slices from a 3D tensor
  >>> tf.gather_nd(params, [[2], [0]])
  <tf.RaggedTensor [[[], [b'210']], [[b'000', b'001'], [b'010']]]>

  >>> # Gather 1D slices from a 3D tensor
  >>> tf.gather_nd(params, [[2, 1], [0, 0]])
  <tf.RaggedTensor [[b'210'], [b'000', b'001']]>

  >>> # Gather scalars from a 3D tensor
  >>> tf.gather_nd(params, [[0, 0, 1], [1, 1, 2]]).numpy()
  array([b'001', b'112'], dtype=object)
  """
    if not isinstance(batch_dims, int) or batch_dims != 0:
        raise ValueError(
            'batch_dims != 0 is not supported for ragged gather yet.')
    if not (ragged_tensor.is_ragged(params)
            or ragged_tensor.is_ragged(indices)):
        return array_ops.gather_nd(params, indices, name)

    with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):

        params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            params, name='params')
        indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
            indices, name='indices')
        params, indices = ragged_tensor.match_row_splits_dtypes(
            params, indices)
        indices_shape = indices.shape
        indices_ndims = indices_shape.ndims
        if indices_ndims is None:
            raise ValueError('indices.rank must be statically known.')
        if indices_ndims == 0:
            raise ValueError('indices.rank must be at least 1.')
        if (ragged_tensor.is_ragged(indices)
                and indices_ndims == indices.ragged_rank + 1):
            raise ValueError(
                'The innermost dimension of indices may not be ragged')

        # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
        # that each index slices into.
        index_size = tensor_shape.dimension_value(indices_shape[-1])
        if index_size is None:
            raise ValueError('indices.shape[-1] must be statically known.')

        # If `indices` has more than 2 dimensions, then recurse.  If `indices` is
        # dense, then we convert it to ragged before recursing, and then convert
        # the result back to `dense` if appropriate.
        if indices_ndims > 2:
            indices_is_dense = not ragged_tensor.is_ragged(indices)
            if indices_is_dense:
                indices = ragged_tensor.RaggedTensor.from_tensor(
                    indices,
                    ragged_rank=indices_ndims - 2,
                    row_splits_dtype=params.row_splits.dtype)
            result = indices.with_flat_values(
                gather_nd(params, indices.flat_values))
            if (indices_is_dense and ragged_tensor.is_ragged(result)
                    and result.ragged_rank == indices_ndims - 2):
                result = ragged_tensor.RaggedTensor.to_tensor(result)
            return result

        # indices_ndims <= 2, and the innermost dimension of indices may not be
        # ragged, so `indices` must not be ragged.
        assert not ragged_tensor.is_ragged(indices)
        assert ragged_tensor.is_ragged(params)

        # Handle corner case: An empty index tuple selects the entire `params`
        # value.  So if `index_size` is zero, then tile `params`.
        if index_size == 0:
            params_ndims = params.ragged_rank + array_ops.rank(
                params.flat_values)
            for dim in range(indices_ndims - 1):
                params = ragged_array_ops.expand_dims(params, axis=0)
            multiples = array_ops.concat([
                array_ops.shape(indices)[:-1],
                array_ops.ones([params_ndims], dtypes.int32)
            ], axis=0)
            return ragged_array_ops.tile(params, multiples)

        # When index_size=1, we can just flatten the index tuples and use gather.
        elif index_size == 1:
            flattened_index_tuples = array_ops.reshape(indices, [-1])
            return gather(params, flattened_index_tuples)

        # Otherwise, params is a RaggedTensor, and indices is a 1D or 2D Tensor.
        # Flatten both the index tuples and the params, such that the flattened
        # index tuples point to the correct values in the flattened params; and
        # then use ragged.gather on the flattened index tuples & params.
        else:
            indices = math_ops.cast(indices, params.row_splits.dtype)

            # Flatten the outermost 2 dimensions of the index tuples & params.
            flattened_index_tuples = array_ops.gather(params.row_splits,
                                                      indices[..., 0])
            flattened_index_tuples += indices[..., 1]
            flattened_params = params.values

            # Flatten any remaining dimensions.
            for dim in range(2, index_size):
                if not ragged_tensor.is_ragged(flattened_params):
                    flattened_index_tuples = array_ops.expand_dims(
                        flattened_index_tuples, axis=1)
                    flattened_index_tuples = array_ops.concat(
                        [flattened_index_tuples, indices[..., dim:]], axis=1)
                    return array_ops.gather_nd(flattened_params,
                                               flattened_index_tuples)

                flattened_index_tuples = array_ops.gather(
                    flattened_params.row_starts(), flattened_index_tuples)
                flattened_index_tuples += indices[..., dim]
                flattened_params = flattened_params.values

            # Gather using the flattened index tuples and params.
            return gather(flattened_params, flattened_index_tuples)
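
The `index_size == 0` corner case above is easy to miss: an empty index tuple selects all of `params`, so the result tiles `params` across the outer index dimensions. An illustrative sketch (assuming the ragged dispatch for `tf.gather_nd`):

```python
import tensorflow as tf

params = tf.ragged.constant([[1, 2], [3]])

# Three empty index tuples -> three full copies of `params`.
indices = tf.zeros([3, 0], dtype=tf.int32)
print(tf.gather_nd(params, indices).to_list())
# [[[1, 2], [3]], [[1, 2], [3]], [[1, 2], [3]]]
```
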
Example #13
def unicode_split_with_offsets(input,
                               input_encoding,
                               errors="replace",
                               replacement_char=0xFFFD,
                               name=None):
  r"""Splits each string into a sequence of code points with start offsets.

  This op is similar to `tf.strings.unicode_split(...)`, but it also returns the
  start offset for each character in its respective string.  This information
  can be used to align the characters with the original byte sequence.

  Returns a tuple `(chars, start_offsets)` where:

  * `chars[i1...iN, j]` is the substring of `input[i1...iN]` that encodes its
    `j`th character, when decoded using `input_encoding`.
  * `start_offsets[i1...iN, j]` is the start byte offset for the `j`th
    character in `input[i1...iN]`, when decoded using `input_encoding`.

  Args:
    input: An `N` dimensional potentially ragged `string` tensor with shape
      `[D1...DN]`.  `N` must be statically known.
    input_encoding: String name for the unicode encoding that should be used to
      decode each string.
    errors: Specifies the response when an input string can't be converted
      using the indicated encoding. One of:
      * `'strict'`: Raise an exception for any illegal substrings.
      * `'replace'`: Replace illegal substrings with `replacement_char`.
      * `'ignore'`: Skip illegal substrings.
    replacement_char: The replacement codepoint to be used in place of invalid
      substrings in `input` when `errors='replace'`.
    name: A name for the operation (optional).

  Returns:
    A tuple of `N+1` dimensional tensors `(chars, start_offsets)`.

    * `chars` is a `string` tensor with shape `[D1...DN, (num_chars)]`.
    * `start_offsets` is an `int64` tensor with shape `[D1...DN, (num_chars)]`.

    The returned tensors are `tf.Tensor`s if `input` is a scalar, or
    `tf.RaggedTensor`s otherwise.

  #### Example:
    ```python
    >>> input = [s.encode('utf8') for s in (u'G\xf6\xf6dnight', u'\U0001f60a')]
    >>> result = tf.strings.unicode_split_with_offsets(input, 'UTF-8')
    >>> result[0].to_list()  # character substrings
    [[b'G', b'\xc3\xb6', b'\xc3\xb6', b'd', b'n', b'i', b'g', b'h', b't'],
     [b'\xf0\x9f\x98\x8a']]
    >>> result[1].to_list()  # offsets
    [[0, 1, 3, 5, 6, 7, 8, 9, 10], [0]]
    ```
  """
  with ops.name_scope(name, "UnicodeSplitWithOffsets", [input]):
    codepoints, offsets = _unicode_decode(input, input_encoding, errors,
                                          replacement_char, False,
                                          with_offsets=True)
    chars = unicode_encode(
        ragged_array_ops.expand_dims(codepoints, -1),
        output_encoding=input_encoding,
        errors=errors,
        replacement_char=replacement_char)
    return chars, offsets
Example #14
def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
  """Helper function to concatenate or stack ragged tensors.

  Args:
    rt_inputs: A list of RaggedTensors or Tensors to combine.
    axis: The axis along which to concatenate or stack.
    stack_values: A boolean -- if true, then stack values; otherwise,
      concatenate them.

  Returns:
    A RaggedTensor.
  Raises:
    ValueError: If rt_inputs is empty, or if axis is out of range.
  """
  # Validate parameters.
  if not rt_inputs:
    raise ValueError('rt_inputs may not be empty.')

  # Convert input tensors.
  rt_inputs = [
      ragged_tensor.convert_to_tensor_or_ragged_tensor(
          rt_input, name='rt_input') for rt_input in rt_inputs
  ]
  row_splits_dtype, rt_inputs = ragged_tensor.match_row_splits_dtypes(
      *rt_inputs, return_dtype=True)
  rt_inputs = list(rt_inputs)

  # Special case: if there's only one input, then return it as-is.
  if len(rt_inputs) == 1:
    if stack_values:
      return ragged_array_ops.expand_dims(rt_inputs[0], axis=axis)
    else:
      return rt_inputs[0]

  # Check the rank (number of dimensions) of the input tensors.
  ndims = None
  for rt in rt_inputs:
    if ndims is None:
      ndims = rt.shape.ndims
    else:
      rt.shape.assert_has_rank(ndims)

  out_ndims = ndims if (ndims is None or not stack_values) else ndims + 1
  axis = ragged_util.get_positive_axis(axis, out_ndims)

  # If all the inputs are Tensors, and we're combining the final dimension,
  # then we can delegate to the tf.stack/tf.concat operation, and return a
  # Tensor.
  if all(not ragged_tensor.is_ragged(rt) for rt in rt_inputs):
    if ndims is not None and (axis == out_ndims - 1 or axis == ndims - 1):
      if stack_values:
        return array_ops.stack(rt_inputs, axis)
      else:
        return array_ops.concat(rt_inputs, axis)

  # Convert any Tensor inputs to RaggedTensors.  This makes it
  # possible to concatenate Tensors and RaggedTensors together.
  for i in range(len(rt_inputs)):
    if not ragged_tensor.is_ragged(rt_inputs[i]):
      rt_inputs[i] = ragged_tensor.RaggedTensor.from_tensor(
          rt_inputs[i], ragged_rank=1, row_splits_dtype=row_splits_dtype)

  # Convert the input tensors to all have the same ragged_rank.
  ragged_rank = max(max(rt.ragged_rank for rt in rt_inputs), 1)
  rt_inputs = [_increase_ragged_rank_to(rt, ragged_rank, row_splits_dtype)
               for rt in rt_inputs]

  if axis == 0:
    return _ragged_stack_concat_axis_0(rt_inputs, stack_values)
  elif axis == 1:
    return _ragged_stack_concat_axis_1(rt_inputs, stack_values)
  else:  # axis > 1: recurse.
    values = [rt.values for rt in rt_inputs]
    splits = [[rt_input.row_splits] for rt_input in rt_inputs]
    with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
      return ragged_tensor.RaggedTensor.from_row_splits(
          _ragged_stack_concat_helper(values, axis - 1, stack_values),
          splits[0][0], validate=False)
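
The `axis > 1` branch above recurses on `values` after asserting that the outer row splits match. The observable behavior for a rank-3 case, via the public wrapper (illustrative values):

```python
import tensorflow as tf

a = tf.ragged.constant([[[1], [2, 3]]])
b = tf.ragged.constant([[[4, 5], [6]]])

# axis=2: the outer row structures must match; innermost rows are concatenated.
print(tf.concat([a, b], axis=2).to_list())  # [[[1, 4, 5], [2, 3, 6]]]
```
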