def _tokenize_tensor(self, text):
  """Tokenizes a tensor.

  When not overridden, this default implementation calls the string-based
  tokenization.

  Args:
    text: A 0-D string ``tf.Tensor``.

  Returns:
    A 1-D string ``tf.Tensor``.
  """
  if compat.tf_supports("py_function"):

    def _python_wrapper(string_t):
      string = tf.compat.as_text(string_t.numpy())
      tokens = self._tokenize_string(string)
      return tf.constant(tokens)

    tokens = tf.py_function(_python_wrapper, [text], tf.string)
    tokens.set_shape([None])
    return tokens
  text = tf.py_func(
      lambda x: tf.compat.as_bytes("\0".join(self.tokenize(x))),
      [text], tf.string)
  tokens = tf.string_split([text], delimiter="\0").values
  return tokens
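# Illustrative subclass (hypothetical, not part of the original module): a
# concrete tokenizer only needs the string-based hooks; the tensor method
# above wraps them with tf.py_function (or tf.py_func on older TensorFlow).
# The base class name ``Tokenizer`` is assumed here.
class _ExampleCommaTokenizer(Tokenizer):

  def _tokenize_string(self, text):
    return text.split(",")  # "a,b,c" -> ["a", "b", "c"]

  def _detokenize_string(self, tokens):
    return ",".join(tokens)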
def alignment_matrix_from_pharaoh(alignment_line,
                                  source_length,
                                  target_length,
                                  dtype=tf.float32):
  """Parse Pharaoh alignments into an alignment matrix.

  Args:
    alignment_line: A string ``tf.Tensor`` in the Pharaoh format.
    source_length: The length of the source sentence, without special symbols.
    target_length: The length of the target sentence, without special symbols.
    dtype: The output matrix dtype. Defaults to ``tf.float32`` for convenience
      when computing the guided alignment loss.

  Returns:
    The alignment matrix as a 2-D ``tf.Tensor`` of type :obj:`dtype` and shape
    ``[target_length, source_length]``, where ``[i, j] = 1`` if the ``i`` th
    target word is aligned with the ``j`` th source word.
  """
  if compat.tf_supports("strings.split"):
    align_pairs_str = tf.strings.split([alignment_line]).values
    align_pairs_flat_str = tf.strings.split(align_pairs_str, sep="-").values
  else:
    align_pairs_str = tf.string_split([alignment_line], delimiter=" ").values
    align_pairs_flat_str = tf.string_split(align_pairs_str, delimiter="-").values
  align_pairs_flat = compat.tf_compat(v2="strings.to_number", v1="string_to_number")(
      align_pairs_flat_str, out_type=tf.int64)
  sparse_indices = tf.reshape(align_pairs_flat, [-1, 2])
  sparse_values = tf.ones([tf.shape(sparse_indices)[0]], dtype=dtype)
  source_length = tf.cast(source_length, tf.int64)
  target_length = tf.cast(target_length, tf.int64)
  if compat.tf_supports("sparse.to_dense"):
    alignment_matrix_sparse = tf.sparse.SparseTensor(
        sparse_indices, sparse_values, [source_length, target_length])
    alignment_matrix = tf.sparse.to_dense(
        alignment_matrix_sparse, validate_indices=False)
  else:
    alignment_matrix = tf.sparse_to_dense(
        sparse_indices,
        [source_length, target_length],
        sparse_values,
        validate_indices=False)
  return tf.transpose(alignment_matrix)
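# Usage sketch (illustrative values, not from the original source): Pharaoh
# pairs are "source-target" indices, so "0-0 1-1 2-1" aligns source words 1
# and 2 to target word 1.
def _example_alignment_matrix():
  matrix = alignment_matrix_from_pharaoh(
      tf.constant("0-0 1-1 2-1"), source_length=3, target_length=2)
  # matrix has shape [target_length, source_length] = [2, 3]:
  # [[1., 0., 0.],
  #  [0., 1., 1.]]
  return matrix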
def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  outputs = tf.identity(inputs)
  if sequence_length is not None and compat.tf_supports("RaggedTensor"):
    # Drop padded positions so they do not contribute to the mean.
    inputs = tf.RaggedTensor.from_tensor(inputs, lengths=sequence_length)
  state = tf.reduce_mean(inputs, axis=1)
  return (outputs, state, sequence_length)
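# Usage sketch (hypothetical shapes): mean-pool a batch of embedded sequences.
# When RaggedTensor is supported, positions past each sequence length are
# excluded from the mean instead of diluting it with padding.
def _example_mean_encoding(encoder):
  inputs = tf.random.normal([4, 10, 8])  # [batch, max_time, depth]
  lengths = tf.constant([10, 7, 5, 9])
  outputs, state, lengths = encoder.encode(inputs, sequence_length=lengths)
  return state  # shape [4, 8]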
def _add_mixed_precision_wrapper(optimizer):
  # TODO: clean mixed precision API when TensorFlow requirement is updated to >=2.4.
  wrapper_class = None
  wrapper_kwargs = {}
  if compat.tf_supports("keras.mixed_precision.LossScaleOptimizer"):
    wrapper_class = tf.keras.mixed_precision.LossScaleOptimizer
  else:
    wrapper_class = tf.keras.mixed_precision.experimental.LossScaleOptimizer
    wrapper_kwargs = dict(loss_scale="dynamic")
  if not isinstance(optimizer, wrapper_class):
    optimizer = wrapper_class(optimizer, **wrapper_kwargs)
  return optimizer
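# Usage sketch: wrapping a Keras optimizer for mixed precision training. On
# TensorFlow >=2.4 the non-experimental wrapper defaults to dynamic loss
# scaling; older versions need the explicit loss_scale="dynamic" argument
# passed above.
def _example_wrap_optimizer():
  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
  return _add_mixed_precision_wrapper(optimizer)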
def get_padded_shapes(dataset):
  """Returns the padded shapes for ``tf.data.Dataset.padded_batch``.

  Args:
    dataset: The dataset that will be batched with padding.

  Returns:
    The same structure as ``dataset.output_shapes`` containing the padded
    shapes.
  """
  if compat.tf_supports("data.get_output_shapes"):
    output_shapes = tf.data.get_output_shapes(dataset)
  else:
    output_shapes = dataset.output_shapes
  return compat.nest.map_structure(lambda shape: shape.as_list(), output_shapes)
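# Usage sketch (hypothetical dataset): pairs get_padded_shapes with
# padded_batch so every None dimension is padded to the batch maximum.
def _example_padded_batch(dataset):
  # For a dataset of 1-D tensors of varying length (shape [None]), each
  # batch is padded to the length of its longest element.
  return dataset.padded_batch(32, padded_shapes=get_padded_shapes(dataset))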
def _detokenize_tensor(self, tokens):
  """Detokenizes tokens.

  When not overridden, this default implementation calls the string-based
  detokenization.

  Args:
    tokens: A 1-D ``tf.Tensor``.

  Returns:
    A 0-D string ``tf.Tensor``.
  """
  if compat.tf_supports("py_function"):

    def _python_wrapper(tokens_t):
      tokens = [tf.compat.as_text(s) for s in tokens_t.numpy()]
      string = self._detokenize_string(tokens)
      return tf.constant(string)

    text = tf.py_function(_python_wrapper, [tokens], tf.string)
    text.set_shape([])
    return text
  return tf.py_func(self.detokenize, [tokens], tf.string)
def skip_if_unsupported(symbol):
  return unittest.skipIf(
      not compat.tf_supports(symbol), "tf.%s is not supported" % symbol)
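# Usage sketch: the decorator skips a test when the installed TensorFlow
# lacks the symbol. The test case below is hypothetical.
class _ExampleCompatTest(unittest.TestCase):

  @skip_if_unsupported("RaggedTensor")
  def testRaggedTensorPath(self):
    ragged = tf.RaggedTensor.from_row_lengths([1, 2, 3], row_lengths=[2, 1])
    self.assertEqual(ragged.shape[0], 2)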
def _detokenize_tensor(self, tokens):
  if compat.tf_supports("strings.reduce_join"):
    text = tf.strings.reduce_join(tokens, axis=0)
    return tf.strings.regex_replace(text, "▁", " ")
  else:
    return super(CharacterTokenizer, self)._detokenize_tensor(tokens)
def _tokenize_tensor(self, text):
  if compat.tf_supports("strings.unicode_split"):
    text = tf.strings.regex_replace(text, " ", "▁")
    return tf.strings.unicode_split(text, "UTF-8")
  else:
    return super(CharacterTokenizer, self)._tokenize_tensor(text)
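# Round-trip sketch: spaces are first replaced by the "▁" placeholder so the
# split is reversible, then the text is split into individual characters.
# E.g. "ab c" -> ["a", "b", "▁", "c"] -> "ab c" after detokenization.
def _example_character_round_trip(tokenizer):
  chars = tokenizer._tokenize_tensor(tf.constant("ab c"))
  return tokenizer._detokenize_tensor(chars)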
def _tokenize_tensor(self, text):
  if compat.tf_supports("strings.split"):
    return tf.strings.split([text]).values
  else:
    return tf.string_split([text], delimiter=" ").values
def testTFSupports(self):
  self.assertTrue(compat.tf_supports("data"))
  self.assertTrue(compat.tf_supports("data.Dataset"))
  self.assertFalse(compat.tf_supports("data.UnknownClass"))
  self.assertFalse(compat.tf_supports("unknown_module"))
def _group_by_window(*args, **kwargs):
  # TODO: clean this API when TensorFlow requirement is updated to >=2.6.
  if compat.tf_supports("data.Dataset.group_by_window"):
    return lambda dataset: dataset.group_by_window(*args, **kwargs)
  else:
    return tf.data.experimental.group_by_window(*args, **kwargs)
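# Usage sketch (hypothetical bucketing over a dataset of 1-D tensors): group
# examples into length buckets and batch each bucket independently. Both
# branches above return a callable transformation, so dataset.apply works
# with either TensorFlow API.
def _example_bucket_by_length(dataset):
  return dataset.apply(_group_by_window(
      key_func=lambda x: tf.cast(tf.size(x) // 10, tf.int64),
      reduce_func=lambda _, window: window.padded_batch(16, padded_shapes=[None]),
      window_size=16))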