def unicode_encode(input,
                   output_encoding,
                   errors="replace",
                   replacement_char=65533,
                   name=None):
    r"""Encodes each sequence of Unicode code points in `input` into a string.

  `result[i1...iN]` is the string formed by concatenating the Unicode
  codepoints `input[1...iN, :]`, encoded using `output_encoding`.

  Args:
    input: An `N+1` dimensional potentially ragged integer tensor with
        shape `[D1...DN, num_chars]`.
    output_encoding: Unicode encoding that should be used to encode each
      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
    errors: Specifies the response when an invalid codepoint is encountered
      (optional). One of:
            * `'replace'`: Replace invalid codepoint with the
              `replacement_char`. (default)
            * `'ignore'`: Skip invalid codepoints.
            * `'strict'`: Raise an exception for any invalid codepoint.
    replacement_char: The replacement character codepoint to be used in place of
      any invalid input when `errors='replace'`. Any valid unicode codepoint may
      be used. The default value is the default unicode replacement character
      which is 0xFFFD (U+65533).
    name: A name for the operation (optional).

  Returns:
    A `N` dimensional `string` tensor with shape `[D1...DN]`.

  #### Example:
    ```python
      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
      >>> unicode_encode(input, 'UTF8')
      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
    ```
  """
    with ops.name_scope(name, "UnicodeEncode", [input]):
        input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
            input)
        if input_tensor.shape.ndims is None:
            raise ValueError("Rank of input_tensor must be statically known.")
        if ragged_tensor.is_ragged(input_tensor):
            if input_tensor.inner_values.shape.ndims > 1:
                # If the inner_values of our ragged tensor is multi-dimensional, we can
                # process it separately and our output will have the same nested splits
                # as our input.
                return input_tensor.with_inner_values(
                    unicode_encode(input_tensor.inner_values, output_encoding,
                                   errors, replacement_char))
            elif input_tensor.ragged_rank > 1:
                # Recursively process the values of the ragged tensor.
                return input_tensor.with_values(
                    unicode_encode(input_tensor.values, output_encoding,
                                   errors, replacement_char))
            else:
                # Our ragged tensor is of the correct shape (rank 1 inner_values tensor
                # with ragged_rank of 1) so we can process it as normal.
                return gen_string_ops.unicode_encode(
                    input_values=input_tensor.values,
                    input_splits=input_tensor.row_splits,
                    output_encoding=output_encoding,
                    errors=errors,
                    replacement_char=replacement_char)
        else:
            if input_tensor.shape.ndims == 2:
                # The input tensor is of the correct 2-D shape, it's just not ragged.
                return unicode_encode(
                    ragged_conversion_ops.from_tensor(input_tensor),
                    output_encoding, errors, replacement_char)
            elif input_tensor.shape.ndims > 2:
                # We need to initially flatten the input tensor to 2-D, and then can
                # reshape the output of our processed flattened tensor.
                flat_input_tensor = array_ops.reshape(
                    input_tensor,
                    array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
                flat_output_tensor = unicode_encode(flat_input_tensor,
                                                    output_encoding, errors,
                                                    replacement_char)
                return array_ops.reshape(flat_output_tensor,
                                         input_tensor.shape[:-1])
            elif input_tensor.shape.ndims == 0:
                raise ValueError("input_tensor's rank must be at least 1.")
            else:
                # Our input tensor is rank 1, so we create a ragged tensor with an added
                # dimension to create the correct input shape & type, and then remove
                # the additional dimension from the output and return the string scalar.
                ragged_input_tensor = ragged_factory_ops.from_row_splits(
                    input_tensor,
                    array_ops.stack([
                        0,
                        array_ops.shape(input_tensor, out_type=dtypes.int64)[0]
                    ]))
                output_tensor = unicode_encode(ragged_input_tensor,
                                               output_encoding, errors,
                                               replacement_char)
                return array_ops.reshape(output_tensor, [])
Пример #2
0
def unicode_encode(input, output_encoding, errors="replace",
                   replacement_char=65533, name=None):
  r"""Encodes each sequence of Unicode code points in `input` into a string.

  `result[i1...iN]` is the string formed by concatenating the Unicode
  codepoints `input[1...iN, :]`, encoded using `output_encoding`.

  Args:
    input: An `N+1` dimensional potentially ragged integer tensor with
        shape `[D1...DN, num_chars]`.
    output_encoding: Unicode encoding that should be used to encode each
      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
    errors: Specifies the response when an invalid codepoint is encountered
      (optional). One of:
            * `'replace'`: Replace invalid codepoint with the
              `replacement_char`. (default)
            * `'ignore'`: Skip invalid codepoints.
            * `'strict'`: Raise an exception for any invalid codepoint.
    replacement_char: The replacement character codepoint to be used in place of
      any invalid input when `errors='replace'`. Any valid unicode codepoint may
      be used. The default value is the default unicode replacement character
      which is 0xFFFD (U+65533).
    name: A name for the operation (optional).

  Returns:
    A `N` dimensional `string` tensor with shape `[D1...DN]`.

  #### Example:
    ```python
      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
      >>> unicode_encode(input, 'UTF8')
      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
    ```
  """
  with ops.name_scope(name, "UnicodeEncode", [input]):
    input_tensor = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(input)
    if input_tensor.shape.ndims is None:
      raise ValueError("Rank of input_tensor must be statically known.")
    if ragged_tensor.is_ragged(input_tensor):
      if input_tensor.inner_values.shape.ndims > 1:
        # If the inner_values of our ragged tensor is multi-dimensional, we can
        # process it separately and our output will have the same nested splits
        # as our input.
        return input_tensor.with_inner_values(
            unicode_encode(input_tensor.inner_values, output_encoding, errors,
                           replacement_char))
      elif input_tensor.ragged_rank > 1:
        # Recursively process the values of the ragged tensor.
        return input_tensor.with_values(
            unicode_encode(input_tensor.values, output_encoding, errors,
                           replacement_char))
      else:
        # Our ragged tensor is of the correct shape (rank 1 inner_values tensor
        # with ragged_rank of 1) so we can process it as normal.
        return gen_string_ops.unicode_encode(
            input_values=input_tensor.values,
            input_splits=input_tensor.row_splits,
            output_encoding=output_encoding,
            errors=errors,
            replacement_char=replacement_char)
    else:
      if input_tensor.shape.ndims == 2:
        # The input tensor is of the correct 2-D shape, it's just not ragged.
        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
                              output_encoding, errors, replacement_char)
      elif input_tensor.shape.ndims > 2:
        # We need to initially flatten the input tensor to 2-D, and then can
        # reshape the output of our processed flattened tensor.
        flat_input_tensor = array_ops.reshape(
            input_tensor,
            array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
        flat_output_tensor = unicode_encode(flat_input_tensor, output_encoding,
                                            errors, replacement_char)
        return array_ops.reshape(flat_output_tensor, input_tensor.shape[:-1])
      elif input_tensor.shape.ndims == 0:
        raise ValueError("input_tensor's rank must be at least 1.")
      else:
        # Our input tensor is rank 1, so we create a ragged tensor with an added
        # dimension to create the correct input shape & type, and then remove
        # the additional dimension from the output and return the string scalar.
        ragged_input_tensor = ragged_factory_ops.from_row_splits(
            input_tensor,
            array_ops.stack([0, array_ops.shape(input_tensor,
                                                out_type=dtypes.int64)[0]]))
        output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
                                       errors, replacement_char)
        return array_ops.reshape(output_tensor, [])