Example #1
    def testConstruction(self):
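        # Each RaggedTensor factory method below should preserve the composite
        # WrappedTensor values type and round-trip its row partition.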
        tensor_values = constant_op.constant(
            ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])
        values = WrappedTensor(tensor_values)

        row_splits = constant_op.constant([0, 2, 2, 5, 6, 8], dtypes.int64)
        rt = RaggedTensor.from_row_splits(values, row_splits)
        self.assertIsInstance(rt.values, WrappedTensor)
        self.assertAllEqual(rt.values.value, tensor_values)
        self.assertAllEqual(rt.row_splits, row_splits)

        row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
        rt = RaggedTensor.from_row_starts(values, row_starts)
        self.assertIsInstance(rt.values, WrappedTensor)
        self.assertAllEqual(rt.values.value, tensor_values)
        self.assertAllEqual(rt.row_starts(), row_starts)

        row_limits = constant_op.constant([2, 2, 5, 6, 8], dtypes.int64)
        rt = RaggedTensor.from_row_limits(values, row_limits)
        self.assertIsInstance(rt.values, WrappedTensor)
        self.assertAllEqual(rt.values.value, tensor_values)
        self.assertAllEqual(rt.row_limits(), row_limits)

        row_lengths = constant_op.constant([2, 0, 3, 1, 2], dtypes.int64)
        rt = RaggedTensor.from_row_lengths(values, row_lengths)
        self.assertIsInstance(rt.values, WrappedTensor)
        self.assertAllEqual(rt.values.value, tensor_values)
        self.assertAllEqual(rt.row_lengths(), row_lengths)

        rt = RaggedTensor.from_uniform_row_length(values, 4)
        self.assertIsInstance(rt.values, WrappedTensor)
        self.assertAllEqual(rt.values.value, tensor_values)
        self.assertAllEqual(rt.uniform_row_length, 4)
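Note: the factory methods exercised above are just different encodings of the same row partition. A minimal sketch using the public `tf.ragged` API (assuming TensorFlow 2.x and plain `tf.Tensor` values rather than the test's `WrappedTensor`) showing that they describe the same ragged tensor:

```python
import tensorflow as tf

values = tf.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'])

# Four equivalent encodings of the same partition into rows of lengths [2, 0, 3, 1, 2].
from_splits = tf.RaggedTensor.from_row_splits(values, [0, 2, 2, 5, 6, 8])
from_starts = tf.RaggedTensor.from_row_starts(values, [0, 2, 2, 5, 6])
from_limits = tf.RaggedTensor.from_row_limits(values, [2, 2, 5, 6, 8])
from_lengths = tf.RaggedTensor.from_row_lengths(values, [2, 0, 3, 1, 2])

print(from_splits.to_list())
# [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g', b'h']]
assert from_starts.to_list() == from_splits.to_list()
assert from_limits.to_list() == from_splits.to_list()
assert from_lengths.to_list() == from_splits.to_list()

# A uniform row length of 4 instead yields two rows of four values each.
print(tf.RaggedTensor.from_uniform_row_length(values, 4).to_list())
# [[b'a', b'b', b'c', b'd'], [b'e', b'f', b'g', b'h']]
```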
Example #2
    def tokenize_with_offsets(self, input):  # pylint: disable=redefined-builtin
        """Tokenizes utf-8 encoded tokens into subword pieces based off of a vocab.

    ### Example:

    ```python
    >>> tokens = [["they're", "the", "greatest"]],
    >>> tokenizer = WordpieceTokenizer(vocab, token_out_type=tf.string)
    >>> result = tokenizer.tokenize_with_offsets(tokens)
    >>> result[0].to_list()  # subwords
    [[['they', "##'", '##re'], ['the'], ['great', '##est']]]
    >>> result[1].to_list()  # offset starts
    [[[0, 4, 5], [0], [0, 5]]]
    >>> result[2].to_list()  # offset limits
    [[[4, 5, 7], [3], [5, 8]]]
    ```

    Args:
      input: An N-dimensional `Tensor` or `RaggedTensor` of UTF-8 strings.

    Returns:
      A tuple of `RaggedTensor`s `tokens`, `start_offsets`, and `limit_offsets`
      where:
        * `tokens[i1...iN, j]` is the string contents, or ID in the
          vocab_lookup_table representing that string, of the `j`th token in
          `input[i1...iN]`
        * `start_offsets[i1...iN, j]` is the byte offset for the start of the
          `j`th token in `input[i1...iN]`
        * `limit_offsets[i1...iN, j]` is the byte offset for the end of the
          `j`th token in `input[i1...iN]`
    """
        name = None
        if not isinstance(self._vocab_lookup_table,
                          lookup_ops.LookupInterface):
            raise TypeError('vocab_lookup_table must be a LookupInterface')
        with ops.name_scope(
                name, 'WordpieceTokenizeWithOffsets',
            [input, self._vocab_lookup_table, self._suffix_indicator]):
            # Check that the types are expected and the ragged rank is appropriate.
            tokens = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
            rank = tokens.shape.ndims
            if rank is None:
                raise ValueError('input must have a known rank.')

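            # Scalar input: tokenize it as a batch of one, then strip the
            # added batch dimension from the results.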
            if rank == 0:
                wordpieces, starts, limits = self.tokenize_with_offsets(
                    array_ops.stack([tokens]))
                return wordpieces.values, starts.values, limits.values

            elif rank > 1:
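                # Higher-rank input: tokenize the flat values, then rebuild
                # the outer ragged structure around each result.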
                if not ragged_tensor.is_ragged(tokens):
                    tokens = ragged_tensor.RaggedTensor.from_tensor(
                        tokens, ragged_rank=rank - 1)
                wordpieces, starts, limits = self.tokenize_with_offsets(
                    tokens.flat_values)
                wordpieces = wordpieces.with_row_splits_dtype(
                    tokens.row_splits.dtype)
                starts = starts.with_row_splits_dtype(tokens.row_splits.dtype)
                limits = limits.with_row_splits_dtype(tokens.row_splits.dtype)
                return (tokens.with_flat_values(wordpieces),
                        tokens.with_flat_values(starts),
                        tokens.with_flat_values(limits))

            # Tokenize the tokens into subwords
            values, row_lengths, starts, limits = (
                gen_wordpiece_tokenizer.wordpiece_tokenize_with_offsets(
                    input_values=tokens,
                    vocab_lookup_table=(
                        self._vocab_lookup_table.resource_handle),
                    suffix_indicator=self._suffix_indicator,
                    use_unknown_token=self._use_unknown_token,
                    max_bytes_per_word=self._max_bytes_per_word,
                    unknown_token=self._unknown_token,
                ))

            # If ids are desired, look them up in the vocab table. Otherwise just
            # return the string values.
            if self._token_out_type == dtypes.int64:
                values = self._vocab_lookup_table.lookup(values)

            wordpieces = RaggedTensor.from_row_lengths(values, row_lengths)
            starts = RaggedTensor.from_row_lengths(starts, row_lengths)
            limits = RaggedTensor.from_row_lengths(limits, row_lengths)

            return wordpieces, starts, limits
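For context, a hedged end-to-end usage sketch of this method (the in-memory vocab, the `tf.lookup.StaticVocabularyTable` setup, and the inputs below are illustrative assumptions, not part of the original example):

```python
import tensorflow as tf
import tensorflow_text as tf_text

# Illustrative in-memory vocab; a real vocab would typically be loaded from a file.
vocab = ["[UNK]", "they", "##'", "##re", "the", "great", "##est"]
init = tf.lookup.KeyValueTensorInitializer(
    keys=vocab,
    values=tf.range(len(vocab), dtype=tf.int64),
    key_dtype=tf.string,
    value_dtype=tf.int64)
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)

tokenizer = tf_text.WordpieceTokenizer(vocab_table, token_out_type=tf.string)
subwords, starts, limits = tokenizer.tokenize_with_offsets(
    [["they're", "the", "greatest"]])

print(subwords.to_list())
# [[[b'they', b"##'", b'##re'], [b'the'], [b'great', b'##est']]]
print(starts.to_list())   # [[[0, 4, 5], [0], [0, 5]]]
print(limits.to_list())   # [[[4, 5, 7], [3], [5, 8]]]
```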