Example #1
0
 def testToWordsWithSpacer(self, tokens, expected):
     """Checks that spacer-marked subword tokens are merged back into words."""
     expected_tensor = tf.constant(expected)
     token_tensor = tf.constant(tokens)
     grouped = text.tokens_to_words(
         token_tensor, subword_token="▁", is_spacer=True)
     dense_words = grouped.to_tensor()
     actual, reference = self.evaluate([dense_words, expected_tensor])
     self.assertAllEqual(actual, reference)
Example #2
0
 def testToWordsWithJoiner(self, tokens, expected):
     """Checks word grouping with the default joiner-style subword marker."""
     expected_tensor = tf.constant(expected)
     token_tensor = tf.constant(tokens)
     dense_words = text.tokens_to_words(token_tensor).to_tensor()
     actual, reference = self.evaluate([dense_words, expected_tensor])
     self.assertAllEqual(actual, reference)
Example #3
0
    def __call__(self, tokens, sequence_length=None, keep_shape=False):
        """Applies noise on :obj:`tokens`.

        Recurses on tensor rank: the 1D case does the actual word-level
        noising; the 2D case maps the 1D case over the batch; higher ranks
        are flattened to 2D and reshaped back.

        Args:
          tokens: A string ``tf.Tensor`` or batch of string ``tf.Tensor``.
          sequence_length: When :obj:`tokens` is ND, the length of each
            sequence in the batch.
          keep_shape: Ensure that the shape is kept. Otherwise, fit the shape
            to the new lengths.

        Returns:
          A tuple with the noisy version of :obj:`tokens` and the new lengths.

        Raises:
          ValueError: If :obj:`tokens` has rank 2 or higher and
            :obj:`sequence_length` is not set.
        """
        rank = tokens.shape.ndims
        if rank == 1:
            # Base case: a single sequence of tokens.
            input_length = tf.shape(tokens)[0]
            if sequence_length is not None:
                tokens = tokens[:sequence_length]
            else:
                # NOTE(review): assumes padding tokens are empty strings and
                # contiguous at the end — count_nonzero on a string tensor
                # counts non-empty entries. Confirm against callers.
                tokens = tokens[:tf.math.count_nonzero(tokens)]
            # Group subword tokens into full words so each noise operates at
            # the word level rather than on subword pieces.
            words = text.tokens_to_words(tokens,
                                         subword_token=self.subword_token,
                                         is_spacer=self.is_spacer)
            words = words.to_tensor()
            for noise in self.noises:
                words = noise(words)
            # Flatten the (possibly padded) word matrix back to a single
            # sequence of tokens, dropping the "" padding entries.
            outputs = tf.RaggedTensor.from_tensor(words,
                                                  padding="").flat_values
            output_length = tf.shape(outputs)[0]
            if keep_shape:
                # Noises only remove or keep tokens here, so output_length
                # <= input_length and padding back up is safe.
                outputs = tf.pad(outputs, [[0, input_length - output_length]])
            return outputs, output_length
        elif rank == 2:
            if sequence_length is None:
                raise ValueError(
                    "sequence_length must be passed for 2D inputs")
            # Apply the 1D case to each sequence in the batch. keep_shape=True
            # is required inside map_fn so every output row has equal length.
            tokens, sequence_length = tf.map_fn(
                lambda arg: self(*arg, keep_shape=True),
                (tokens, sequence_length),
                back_prop=False)
            if not keep_shape:
                # Trim the batch to the longest noised sequence.
                tokens = tokens[:, :tf.reduce_max(sequence_length)]
            return tokens, sequence_length
        else:
            if sequence_length is None:
                raise ValueError(
                    "sequence_length must be passed for ND inputs")
            # Collapse all leading dimensions into a single batch dimension,
            # delegate to the 2D case, then restore the original shape.
            original_shape = misc.shape_list(tokens)
            tokens = tf.reshape(tokens, [-1, original_shape[-1]])
            sequence_length = tf.reshape(sequence_length, [-1])
            tokens, sequence_length = self(tokens,
                                           sequence_length,
                                           keep_shape=keep_shape)
            tokens = tf.reshape(tokens, original_shape[:-1] + [-1])
            sequence_length = tf.reshape(sequence_length, original_shape[:-1])
            return tokens, sequence_length
Example #4
0
 def _call(self, tokens, sequence_length, keep_shape):
     """Applies the configured noises on :obj:`tokens`.

     Recurses on tensor rank: the 1D case performs the actual word-level
     noising; the 2D case maps the 1D case over the batch; higher ranks are
     flattened to 2D and reshaped back.

     Args:
       tokens: A string ``tf.Tensor`` or batch of string ``tf.Tensor``.
       sequence_length: When :obj:`tokens` is ND, the length of each sequence
         in the batch; may be ``None`` for 1D inputs.
       keep_shape: Ensure that the shape is kept. Otherwise, fit the shape to
         the new lengths.

     Returns:
       A tuple with the noisy version of :obj:`tokens` and the new lengths.

     Raises:
       ValueError: If :obj:`tokens` has rank 2 or higher and
         :obj:`sequence_length` is not set.
     """
     rank = tokens.shape.ndims
     if rank == 1:
         # Base case: a single sequence of tokens.
         input_length = tf.shape(tokens)[0]
         if sequence_length is not None:
             tokens = tokens[:sequence_length]
         else:
             # NOTE(review): assumes padding tokens are empty strings and
             # contiguous at the end — count_nonzero on a string tensor
             # counts non-empty entries. Confirm against callers.
             tokens = tokens[:tf.math.count_nonzero(tokens)]
         # Group subword tokens into full words so each noise operates at
         # the word level rather than on subword pieces.
         words = text.tokens_to_words(tokens,
                                      subword_token=self.subword_token,
                                      is_spacer=self.is_spacer)
         words = words.to_tensor()
         for noise in self.noises:
             words = noise(words)
         # Flatten the (possibly padded) word matrix back to one sequence of
         # tokens, dropping the "" padding entries.
         outputs = tf.RaggedTensor.from_tensor(words,
                                               padding="").flat_values
         output_length = tf.shape(outputs)[0]
         if keep_shape:
             # Noises only remove or keep tokens here, so output_length
             # <= input_length and padding back up is safe.
             outputs = tf.pad(outputs, [[0, input_length - output_length]])
         return outputs, output_length
     elif rank == 2:
         if sequence_length is None:
             raise ValueError(
                 "sequence_length must be passed for 2D inputs")
         # Apply the 1D case to each sequence in the batch. keep_shape=True
         # is required inside map_fn so every output row has equal length.
         tokens, sequence_length = tf.map_fn(
             lambda arg: self._call(*arg, keep_shape=True),
             (tokens, sequence_length),
             back_prop=False)
         if not keep_shape:
             # Trim the batch to the longest noised sequence.
             tokens = tokens[:, :tf.reduce_max(sequence_length)]
         return tokens, sequence_length
     else:
         if sequence_length is None:
             raise ValueError(
                 "sequence_length must be passed for ND inputs")
         # Collapse all leading dimensions into a single batch dimension,
         # delegate to the 2D case, then restore the original shape.
         original_shape = misc.shape_list(tokens)
         tokens = tf.reshape(tokens, [-1, original_shape[-1]])
         sequence_length = tf.reshape(sequence_length, [-1])
         tokens, sequence_length = self._call(tokens,
                                              sequence_length,
                                              keep_shape=keep_shape)
         tokens = tf.reshape(tokens, original_shape[:-1] + [-1])
         sequence_length = tf.reshape(sequence_length, original_shape[:-1])
         return tokens, sequence_length
Example #5
0
 def testToWordsWithSpacer(self, tokens, expected):
     """Checks spacer-based word grouping against the expected byte words."""
     token_tensor = tf.constant(tokens)
     want = tf.nest.map_structure(tf.compat.as_bytes, expected)
     grouped = text.tokens_to_words(
         token_tensor, subword_token="▁", is_spacer=True)
     self.assertAllEqual(grouped.to_list(), want)
Example #6
0
 def testToWordsWithJoiner(self, tokens, expected):
     """Checks joiner-based word grouping against the expected byte words."""
     token_tensor = tf.constant(tokens)
     want = tf.nest.map_structure(tf.compat.as_bytes, expected)
     grouped = text.tokens_to_words(token_tensor)
     self.assertAllEqual(grouped.to_list(), want)