def _convert_features(
    self, ds: tf.data.Dataset,
    task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Fuse "inputs" and "targets" and convert the result into model features.

  For every example, "inputs" and "targets" are concatenated (in that order)
  into a new "targets" sequence. Two companion features of the same length as
  the concatenated sequence are emitted with it:

    * "inputs_width": every position holds the number of "inputs" tokens.
    * "inputs_width_add_pos": every position holds that number plus one,
      i.e. the inputs width extended by one extra position to the right.

  Both widths are forwarded to `_convert_example`. All three features are
  packed or padded to the sum of the lengths in `task_feature_lengths`.

  Args:
    ds: an input tf.data.Dataset to be converted.
    task_feature_lengths: a mapping from task feature name to its length.

  Returns:
    ds: the converted dataset.
  """

  def _merge_inputs_and_targets(example):
    source_tokens = example["inputs"]
    target_tokens = example["targets"]

    # Number of "inputs" tokens at the front of the concatenated sequence.
    num_source = tf.size(source_tokens)
    # Both width features span the whole concatenated sequence.
    total_shape = [tf.size(source_tokens) + tf.size(target_tokens)]

    return {
        "targets": tf.concat([source_tokens, target_tokens], axis=-1),
        "inputs_width": tf.fill(total_shape, num_source),
        "inputs_width_add_pos": tf.fill(total_shape, num_source + 1),
    }

  merged = ds.map(
      _merge_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Every derived feature shares the combined length of all task features.
  total_length = sum(task_feature_lengths.values())
  merged_lengths = dict.fromkeys(
      ("targets", "inputs_width", "inputs_width_add_pos"), total_length)

  packed = self._pack_or_pad(merged, merged_lengths)
  return packed.map(
      self._convert_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _repeat_batch(batch_sizes: Sequence[int],
                  ds: tf.data.Dataset,
                  repeat: int = 1) -> tf.data.Dataset:
  """Tiles the inner most batch dimension.

  The dataset is batched once per entry of `batch_sizes` (innermost batch
  first, at the reduced size `batch_sizes[-1] // repeat`), every tensor in
  the batched element is repeated `repeat` times along the innermost batch
  axis with `tf.repeat`, and the result is unbatched once per entry of
  `batch_sizes` again.

  Args:
    batch_sizes: the batch sizes, outermost first. The last entry must be
      divisible by `repeat`.
    ds: the dataset whose elements are to be repeated.
    repeat: how many times each element is repeated. Values <= 1 return `ds`
      unchanged.

  Returns:
    The dataset with repeated elements, or `ds` itself if `repeat <= 1`.

  Raises:
    ValueError: if `batch_sizes[-1]` is not divisible by `repeat`.
  """
  if repeat <= 1:
    return ds
  if batch_sizes[-1] % repeat != 0:
    raise ValueError(
        f'The last element of `batch_sizes` ({batch_sizes}) must '
        f'be divisible by `repeat` ({repeat}).')

  # Perform regular batching with reduced number of elements. Only the
  # innermost batch (i == 0) shrinks; tiling restores its size afterwards.
  for i, batch_size in enumerate(reversed(batch_sizes)):
    ds = ds.batch(batch_size // repeat if i == 0 else batch_size,
                  drop_remainder=True)

  # Repeat batch. After batching, the innermost batch dimension sits at axis
  # `len(batch_sizes) - 1` of every tensor.
  inner_axis = len(batch_sizes) - 1

  def repeat_inner_batch(example):
    # `jax.tree_util.tree_map` is the stable spelling; the top-level
    # `jax.tree_map` alias is deprecated.
    return jax.tree_util.tree_map(
        lambda x: tf.repeat(x, repeats=repeat, axis=inner_axis), example)

  ds = ds.map(repeat_inner_batch, num_parallel_calls=tf.data.AUTOTUNE)

  # Unbatch.
  for _ in batch_sizes:
    ds = ds.unbatch()
  return ds
def append_eos(
    dataset: tf.data.Dataset,
    output_features: OutputFeaturesType,
) -> tf.data.Dataset:
  """Adds an EOS token to the end of features that request it.

  A feature gets its vocabulary's `eos_id` appended only when it is listed in
  `output_features` and that entry has `add_eos` set. Every other feature is
  passed through untouched.

  Args:
    dataset: a tf.data.Dataset of tokenized examples to preprocess.
    output_features: a mapping of output feature names to Feature objects.

  Returns:
    a tf.data.Dataset of tokenized examples with EOS added to specified output
    features.
  """

  def _append_to_example(example):
    appended = {}
    for name, tokens in example.items():
      if name in output_features and output_features[name].add_eos:
        eos_id = output_features[name].vocabulary.eos_id
        tokens = tf.concat([tokens, [eos_id]], axis=0)
      appended[name] = tokens
    return appended

  return dataset.map(
      _append_to_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _validate_dataset(self,
                      dataset: tf.data.Dataset,
                      expected_output_type: tf.DType,
                      expected_output_rank: int,
                      error_label: str,
                      ensure_no_eos: bool = False) -> tf.data.Dataset:
  """Validates properties of a tf.data.Dataset, raising Exceptions if needed.

  The dtype and rank checks are static: they inspect `dataset.element_spec`
  and raise immediately. The optional EOS check is attached to the dataset as
  an assertion op and therefore only fires when the dataset is iterated.

  Args:
    dataset: a tf.data.Dataset to validate.
    expected_output_type: a tf.Dtype, the expected type of the model features.
    expected_output_rank: an int, the expected rank of the model features.
    error_label: a string, an identifier for the previous processing step to
      report in raised ValueErrors.
    ensure_no_eos: a bool, whether or not to verify that the model features
      contain no EOS tokens.

  Returns:
    a validated tf.data.Dataset.

  Raises:
    ValueError: if a required output feature is missing, or if a present
      output feature has the wrong dtype or rank.
  """
  element_spec = dataset.element_spec
  for feat in self.output_features:
    if feat not in element_spec:
      if self.output_features[feat].required:
        raise ValueError(
            "Task dataset is missing expected output feature after {label}: "
            "{feat}".format(label=error_label, feat=feat))
      else:
        # It's ok that this feature does not exist.
        continue
    if expected_output_type != element_spec[feat].dtype:
      raise ValueError(
          "Task dataset has incorrect type for feature '{feat}' after "
          "{label}: Got {actual}, expected {expected}".format(
              feat=feat, label=error_label,
              actual=element_spec[feat].dtype.name,
              expected=expected_output_type.name))
    if expected_output_rank != len(element_spec[feat].shape):
      raise ValueError(
          "Task dataset has incorrect rank for feature '{feat}' after "
          "{label}: Got {actual}, expected {expected}".format(
              feat=feat, label=error_label,
              actual=len(element_spec[feat].shape),
              expected=expected_output_rank))

  def _ensure_no_eos(feat, v):
    if feat not in self.output_features:
      return v
    with tf.control_dependencies([
        tf.debugging.assert_none_equal(
            # Build the EOS constant in the feature's own dtype. A hard-coded
            # int64 constant fails with a dtype mismatch whenever
            # `expected_output_type` is anything other than tf.int64.
            v, tf.constant(1, v.dtype),
            message="Feature '{feat}' unexpectedly contains EOS=1 token "
            "after {label}.".format(feat=feat, label=error_label))
    ]):
      return v

  if ensure_no_eos:
    dataset = dataset.map(
        lambda ex: {k: _ensure_no_eos(k, v) for k, v in ex.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

  return dataset
def _pack_with_custom_ops(
    dataset: tf.data.Dataset,
    feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Packs an already-batched dataset using the compiled packing op.

  See trim_and_pack_dataset()

  Depends on custom ops that need a custom compiled binary. Faster than
  _pack_with_tf_ops(), and produces denser packing.

  Args:
    dataset: a dataset containing padded batches of examples.
    feature_lengths: mapping from feature key to packed length. Must contain
      exactly one or two keys.

  Returns:
    a dataset.

  Raises:
    ValueError: if `feature_lengths` does not have exactly 1 or 2 keys.
  """
  # TODO(adarob): Move ops into this library and fix int64 issue.
  from tensor2tensor.data_generators.ops import pack_sequences_ops  # pylint: disable=g-import-not-at-top

  keys = list(feature_lengths)
  num_keys = len(keys)
  if num_keys == 1:
    # The op always takes two sequences; feed the single key in twice.
    k1 = k2 = keys[0]
  elif num_keys == 2:
    k1, k2 = keys
  else:
    raise ValueError(f"Packing op requires 1 or 2 keys. Got {len(keys)}")

  def custom_pack_batch(x):
    """Map-function."""
    # The custom op works on int64 inputs.
    (k1_packed, k1_segment_ids, k1_positions,
     k2_packed, k2_segment_ids, k2_positions) = (
         pack_sequences_ops.pack_sequences2(
             tf.cast(x[k1], tf.int64),
             tf.cast(x[k2], tf.int64),
             feature_lengths[k1],
             feature_lengths[k2]))

    entries = [
        (k1, k1_packed),
        (k1 + "_segment_ids", k1_segment_ids),
        (k1 + "_positions", k1_positions),
    ]
    if num_keys == 2:
      entries.extend([
          (k2, k2_packed),
          (k2 + "_segment_ids", k2_segment_ids),
          (k2 + "_positions", k2_positions),
      ])

    # Everything is handed back as int32.
    return {name: tf.cast(tensor, tf.int32) for name, tensor in entries}

  packed_batches = dataset.map(
      custom_pack_batch, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  return packed_batches.unbatch()
def _convert_features(
    self, ds: tf.data.Dataset,
    task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Pack or pad `ds`, then map `_convert_example` over every example.

  Args:
    ds: an input tf.data.Dataset to be converted.
    task_feature_lengths: a mapping from task feature name to its length.

  Returns:
    The converted dataset.
  """
  packed_or_padded = self._pack_or_pad(ds, task_feature_lengths)
  return packed_or_padded.map(
      self._convert_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
def trim_and_pad_dataset(
    dataset: tf.data.Dataset,
    feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Forces the first dimension of selected features to a fixed length.

  Features listed in `feature_lengths` are cut down to the requested length
  and then padded at the end (using `tf.pad`'s default fill value) up to that
  length. Trailing dimensions are left alone.

  Args:
    dataset: tf.data.Dataset, the dataset to trim/pad examples in.
    feature_lengths: map from feature key to final length. Other features will
      be returned unchanged.

  Returns:
    Trimmed/padded tf.data.Dataset.
  """

  def _trim_and_pad_example(example):
    fitted = {}
    for name, tensor in example.items():
      if name in feature_lengths:
        target_len = feature_lengths[name]
        trimmed = tensor[:target_len]
        missing = target_len - tf.shape(trimmed)[0]
        # Only the first axis is padded; every other axis gets (0, 0).
        trailing_dims = len(trimmed.shape) - 1
        paddings = [(0, missing)] + [(0, 0)] * trailing_dims
        tensor = tf.pad(trimmed, paddings)
        # Record the now-known static length of the first axis.
        tensor.set_shape([target_len] + trimmed.shape.as_list()[1:])
      fitted[name] = tensor
    return fitted

  return dataset.map(
      _trim_and_pad_example,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)
def tokenize(dataset: tf.data.Dataset,
             output_features: OutputFeaturesType,
             copy_pretokenized: bool = True,
             with_eos: bool = False) -> tf.data.Dataset:
  """Encodes output features with their configured vocabularies.

  Features that are not in `output_features` pass through unchanged. For
  encoded features, the original value can additionally be kept under the key
  "<name>_pretokenized".

  Args:
    dataset: a tf.data.Dataset of examples to tokenize.
    output_features: a dict of Feature objects; their vocabulary attribute will
      be used to tokenize the specified features.
    copy_pretokenized: bool, whether to pass through copies of original features
      with "_pretokenized" suffix added to the key.
    with_eos: bool, whether to append EOS to the end of the sequence. EOS is
      only appended to features whose `add_eos` is also set.

  Returns:
    a tf.data.Dataset
  """

  def _encode_example(example):
    encoded = {}
    for name, value in example.items():
      if name not in output_features:
        encoded[name] = value
        continue

      if copy_pretokenized:
        encoded[f'{name}_pretokenized'] = value

      feature = output_features[name]
      vocab = feature.vocabulary
      tokens = vocab.encode_tf(value)
      if with_eos and feature.add_eos:
        tokens = tf.concat([tokens, [vocab.eos_id]], axis=-1)
      encoded[name] = tokens
    return encoded

  return dataset.map(
      _encode_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _check_lengths(ds: tf.data.Dataset,
                   expected_lengths: Mapping[str, int],
                   strict: bool,
                   error_label: str) -> tf.data.Dataset:
  """Attaches length assertions for the features in `expected_lengths`.

  The comparison depends on `strict`:
    * strict = True: len(feature) == expected_lengths[feature]
    * strict = False: len(feature) <= expected_lengths[feature]

  Features of the input dataset may have [None] shape, so the assertion is
  evaluated at graph execution time, once the length is known. Features that
  are not in `expected_lengths` are not checked.

  Args:
    ds: a tf.data.Dataset to be checked.
    expected_lengths: a mapping from a feature name to an expected length.
    strict: if true, the length of each feature should exactly match the
      expected length whereas false condition allows the length to be less
      than or equal to the expected length.
    error_label: a label used to indicate the validation stage

  Returns:
    ds: the same dataset as the input but with the assertion ops attached.
  """

  def _assert_length(feat, v):
    if feat not in expected_lengths:
      return v

    if strict:
      compare = tf.debugging.assert_equal
      error_message = (
          f"Feature '{feat}' has length not equal to the expected length of "
          f"{expected_lengths[feat]} during {error_label} validation")
    else:
      compare = tf.debugging.assert_less_equal
      error_message = (
          f"Feature '{feat}' has length not less than or equal to the expected "
          f"length of {expected_lengths[feat]} during {error_label} validation")

    expected_length = tf.constant(expected_lengths[feat], dtype=tf.int64)
    # Assumes that v has rank of 1.
    actual_length = tf.size(v, out_type=tf.int64)
    compare(actual_length, expected_length, message=error_message)
    return v

  def _check_example(ex):
    return {name: _assert_length(name, value) for name, value in ex.items()}

  return ds.map(
      _check_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def pack_as_supervised_ds(
    ds: tf.data.Dataset,
    ds_info: DatasetInfo,
) -> tf.data.Dataset:
  """Turn an `(input, label)` dataset into `{'key0': input, 'key1': label}`.

  The conversion happens only when `ds_info.supervised_keys` is set and the
  dataset elements are 2-tuples; the dict keys are taken from
  `ds_info.supervised_keys`. Any other dataset is returned as-is.
  """
  is_supervised_pair = (
      ds_info.supervised_keys
      and isinstance(ds.element_spec, tuple)
      and len(ds.element_spec) == 2)
  if not is_supervised_pair:
    # Not a supervised tuple (input, label): nothing to pack.
    return ds

  x_key, y_key = ds_info.supervised_keys
  return ds.map(lambda x, y: {x_key: x, y_key: y})
def _rename_plaintext_to_pretokenized(
    dataset: tf.data.Dataset) -> tf.data.Dataset:
  """Switch feature keys ending in "_plaintext" to end in "_pretokenized".

  All other keys, and all values, are left unchanged.
  """

  def _to_new_key(key):
    if not key.endswith("_plaintext"):
      return key
    # Keep the underscore; swap only the word after it.
    return key[:-len("plaintext")] + "pretokenized"

  def _rename(inputs):
    return {_to_new_key(k): v for k, v in inputs.items()}

  return dataset.map(
      _rename, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def show_examples(
    ds: tf.data.Dataset,
    ds_info: dataset_info.DatasetInfo,
    **options_kwargs: Any
):
  """Visualize images (and labels) from an image classification dataset.

  Intended for interactive use (Colab, Jupyter). The first registered
  visualizer that matches `ds_info` renders the dataset and its result is
  returned.

  Usage:

  ```python
  ds, ds_info = tfds.load('cifar10', split='train', with_info=True)
  fig = tfds.show_examples(ds, ds_info)
  ```

  Args:
    ds: `tf.data.Dataset`. The tf.data.Dataset object to visualize. Examples
      should not be batched. Examples will be consumed in order until
      (rows * cols) are read or the dataset is consumed.
    ds_info: The dataset info object to which extract the label and features
      info. Available either through `tfds.load('mnist', with_info=True)` or
      `tfds.builder('mnist').info`
    **options_kwargs: Additional display options, specific to the dataset type
      to visualize. Are forwarded to `tfds.visualization.Visualizer.show`.
      See the `tfds.visualization` for a list of available visualizers.

  Returns:
    fig: The `matplotlib.Figure` object

  Raises:
    ValueError: if no visualizer supports the dataset.
  """
  if not isinstance(ds_info, dataset_info.DatasetInfo):
    # The caller used the legacy (info, ds) argument order: warn and swap.
    # `absl.logging` does not appear on Colab by default, so uses print instead.
    print('WARNING: For consistency with `tfds.load`, the `tfds.show_examples` '
          'signature has been modified from (info, ds) to (ds, info).\n'
          'The old signature is deprecated and will be removed. '
          'Please change your call to `tfds.show_examples(ds, info)`')
    ds, ds_info = ds_info, ds

  # `as_supervised=True` datasets yield (x, y) tuples; turn them into dicts.
  is_supervised_pair = (
      ds_info.supervised_keys
      and isinstance(ds.element_spec, tuple)
      and len(ds.element_spec) == 2
  )
  if is_supervised_pair:
    x_key, y_key = ds_info.supervised_keys
    ds = ds.map(lambda x, y: {x_key: x, y_key: y})

  chosen = next(
      (candidate for candidate in _ALL_VISUALIZERS
       if candidate.match(ds_info)),
      None,
  )
  if chosen is None:
    raise ValueError(
        'Visualisation not supported for dataset `{}`'.format(ds_info.name)
    )
  return chosen.show(ds, ds_info, **options_kwargs)
def _convert_features(
    self, ds: tf.data.Dataset,
    task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Convert the dataset to be fed to the encoder-decoder model.

  The conversion process involves two steps

  1. Each feature in the `task_feature_lengths` is trimmed/padded and
     optionally packed depending on the value of self.pack.
  2. "inputs" fields are mapped to the encoder input and "targets" are mapped
     to decoder input (after being shifted) and target.

  All the keys in the `task_feature_lengths` should be present in the input
  dataset, which may contain some extra features that are not in the
  `task_feature_lengths`. They will not be included in the output dataset.
  One common scenario is the "inputs_pretokenized" and "targets_pretokenized"
  fields.

  Args:
    ds: an input tf.data.Dataset to be converted.
    task_feature_lengths: a mapping from feature to its length.

  Returns:
    ds: the converted dataset.
  """

  def convert_example(
      features: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
    targets = features["targets"]
    # "targets_segment_ids" only exists when the dataset has been packed.
    shifted_targets = autoregressive_inputs(
        targets, sequence_id=features.get("targets_segment_ids", None))

    converted = {
        "encoder_input_tokens": features["inputs"],
        "decoder_target_tokens": targets,
        "decoder_input_tokens": shifted_targets,
        # Loss is computed for all but the padding positions.
        "decoder_loss_weights": non_padding_position(targets),
    }

    if self.pack:
      converted.update({
          "encoder_segment_ids": features["inputs_segment_ids"],
          "decoder_segment_ids": features["targets_segment_ids"],
          "encoder_positions": features["inputs_positions"],
          "decoder_positions": features["targets_positions"],
      })

    return converted

  prepared = self._pack_or_pad(ds, task_feature_lengths)
  return prepared.map(
      convert_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _trim_output_features(
    self,
    dataset: tf.data.Dataset,
    sequence_length: Optional[Mapping[str, int]]
) -> tf.data.Dataset:
  """Cut each output feature down to its length in `sequence_length`.

  Features outside `self.output_features` are never trimmed, and nothing is
  trimmed when `sequence_length` is empty or None.
  """

  def _trim_example(ex):
    trimmed = {}
    for name, value in ex.items():
      if sequence_length and name in self.output_features:
        value = value[:sequence_length[name]]
      trimmed[name] = value
    return trimmed

  return dataset.map(
      _trim_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _trim_and_ensure_eos(
    self,
    dataset: tf.data.Dataset,
    sequence_length: Mapping[str, int]
) -> tf.data.Dataset:
  """Trim model features and append the EOS=1 token where requested.

  For each feature in `self.output_features`:
    * with a sequence length and `add_eos`: trim to length - 1, then append 1;
    * with a sequence length only: trim to length;
    * with `add_eos` only: append 1 without trimming.
  Features outside `self.output_features` are left unchanged.
  """

  def _trim_and_append_eos(feat, v):
    if feat not in self.output_features:
      return v

    wants_eos = self.output_features[feat].add_eos
    if sequence_length:
      if wants_eos:
        # Leave one slot free so the appended EOS still fits.
        return tf.concat([v[:sequence_length[feat]-1], [1]], axis=0)
      return v[:sequence_length[feat]]
    if wants_eos:
      return tf.concat([v, [1]], axis=0)
    return v

  def _process_example(ex):
    return {name: _trim_and_append_eos(name, val) for name, val in ex.items()}

  return dataset.map(
      _process_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def append_eos_after_trim(
    dataset: tf.data.Dataset,
    sequence_length: Optional[SequenceLengthType],
    output_features: OutputFeaturesType,
) -> tf.data.Dataset:
  """Trims output feature token sequences and then appends EOS.

  Only features present in `output_features` with `add_eos` set are touched.
  When `sequence_length` is given, such a feature is first truncated to one
  token less than its max length so that it still fits once EOS is appended.
  When `sequence_length` is None, EOS is appended without any trimming.

  Note that sequences are automatically trimmed at the end of the Task
  pipeline, so unless you want the features to always end in EOS, use
  `append_eos` instead.

  Args:
    dataset: a tf.data.Dataset of tokenized examples to preprocess.
    sequence_length: a mapping from output feature names to max lengths.
      If provided, output feature sequences will be trimmed to ensure they are
      not longer than this length once EOS is added.
    output_features: a mapping of output feature names to Feature objects.

  Returns:
    a tf.data.Dataset of tokenized examples with EOS added to specified output
    features.
  """

  def _process_example(ex):
    processed = {}
    for key, value in ex.items():
      if key in output_features and output_features[key].add_eos:
        eos_id = output_features[key].vocabulary.eos_id
        if sequence_length is not None:
          # Reserve the final position for the EOS token.
          value = value[:sequence_length[key] - 1]
        value = tf.concat([value, [eos_id]], axis=0)
      processed[key] = value
    return processed

  return dataset.map(
      _process_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _pack_with_tf_ops(dataset: tf.data.Dataset,
                      feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
  """Helper-function for packing a dataset which has already been batched.

  See trim_and_pack_dataset()

  Uses tf.while_loop. Slow.

  Each incoming batch is processed example by example. Examples are appended
  to a "partial" packed example for as long as every feature still fits in
  its `feature_lengths` entry; when one does not fit, the partial example is
  padded, written out, and a fresh partial example is started.

  Args:
    dataset: a dataset containing padded batches of examples.
    feature_lengths: mapping from feature key to packed length.

  Returns:
    a dataset.
  """
  # Template for an empty partial example: for every feature, a zero-length
  # int32 token sequence and a zero-length "_positions" sequence.
  empty_example = {}
  for k in feature_lengths:
    for suff in ("", "_positions"):
      empty_example[k + suff] = tf.zeros([0], dtype=tf.int32)
      # Relax the static shape so the sequence may grow inside the loop.
      empty_example[k + suff].set_shape([None])
  keys_etc = empty_example.keys()

  def _write_packed_example(partial, outputs):
    """Pads `partial` to full length, appends it to `outputs`, and resets it.

    Returns a fresh empty partial example and the updated TensorArrays.
    """
    new_partial = empty_example.copy()
    new_outputs = {}
    for k in keys_etc:
      # `_strip_packed_feature_key` is defined elsewhere; presumably it maps
      # e.g. "<key>_positions" back to "<key>" — TODO confirm.
      new_outputs[k] = outputs[k].write(
          outputs[k].size(),
          tf.pad(partial[k], [[
              0, feature_lengths[_strip_packed_feature_key(k)] -
              tf.size(partial[k])
          ]]))
    return new_partial, new_outputs

  def pack_batch(x: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
    """Internal function to map over.

    Consumes a batch of input examples and produces a variable number of output
    examples.

    Args:
      x: a batch of examples; each value is indexed along its first axis to
        obtain one example.

    Returns:
      a mapping from key to the stacked packed examples, containing the
      packed features plus their "_positions" and "_segment_ids".
    """
    keys = list(feature_lengths)
    partial = empty_example.copy()
    first_key, *_ = keys
    # Number of examples in this batch, only known at run time.
    dynamic_batch_size = tf.shape(x[first_key])[0]
    outputs = {}
    for k in keys:
      outputs[k] = tf.TensorArray(
          tf.int32, size=0, dynamic_size=True,
          element_shape=[feature_lengths[k]])
      outputs[k + "_positions"] = tf.TensorArray(
          tf.int32, size=0, dynamic_size=True,
          element_shape=[feature_lengths[k]])

    for i in tf.range(0, dynamic_batch_size):
      # `partial` changes length from one iteration to the next, so its shape
      # has to be declared as variable for the loop.
      tf.autograph.experimental.set_loop_options(shape_invariants=[(
          partial, {k: tf.TensorShape([None]) for k in keys_etc}
      ), (outputs, {k: tf.TensorShape(None) for k in keys_etc})])

      can_append = True
      one_example = {}
      for k in keys:
        val = tf.cast(x[k][i], tf.int32)
        # Keep as many leading tokens as there are non-zero entries. This
        # removes the batch padding, assuming 0 only appears as trailing
        # padding.
        val = val[:tf.reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
        one_example[k] = val
      # The example can only be appended if every feature still fits.
      for k in keys:
        can_append = tf.logical_and(
            can_append,
            tf.less_equal(
                tf.size(partial[k]) + tf.size(one_example[k]),
                feature_lengths[k]))

      if not can_append:
        # Flush the current partial example and start a new one.
        partial, outputs = _write_packed_example(partial, outputs)

      new_partial = {}
      for k in keys:
        new_seq = one_example[k][:feature_lengths[k]]
        new_seq_len = tf.size(new_seq)
        new_partial[k] = tf.concat([partial[k], new_seq], 0)
        # Positions restart from 0 for every appended example.
        new_partial[k + "_positions"] = tf.concat([
            partial[k + "_positions"],
            tf.range(new_seq_len, dtype=tf.int32)
        ], 0)
      partial = new_partial

    # Flush whatever is left after the last example of the batch.
    partial, outputs = _write_packed_example(partial, outputs)
    packed = {k: outputs[k].stack() for k in keys_etc}
    for k in keys:
      # A position of 0 marks the start of a segment, so the running count of
      # zeros numbers the segments from 1; multiplying by the non-padding mask
      # sets the segment id of padding to 0.
      packed[k + "_segment_ids"] = (tf.cumsum(
          tf.cast(tf.equal(packed[k + "_positions"], 0), tf.int32), axis=1) *
                                    tf.cast(tf.not_equal(packed[k], 0),
                                            tf.int32))
    return packed

  dataset = dataset.map(pack_batch,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)
  # Each mapped element holds a stack of packed examples; split them apart.
  return dataset.unbatch()
def trim_and_pack_dataset(dataset: tf.data.Dataset,
                          feature_lengths: Mapping[str, int],
                          use_custom_ops: bool = False) -> tf.data.Dataset:
  """Creates a 'packed' version of a dataset on-the-fly.

  Modified from the tensor2tensor library.

  This is meant to replace the irritation of having to create a separate
  "packed" version of a dataset to train efficiently on TPU.

  Each example in the output dataset represents several examples in the
  input dataset.

  For each key in the input dataset that also exists in `feature_lengths`, two
  additional keys are created:
    <key>_segment_ids: an int32 tensor identifying the parts
       representing the original example.
    <key>_positions: an int32 tensor identifying the position within the
       original example.

  Features that are not in `feature_lengths` will be removed.

  Example:
    Two input examples get combined to form an output example.
    The input examples are:
    {"inputs": [8, 7, 1, 0], "targets":[4, 1, 0], "idx": 0}
    {"inputs": [2, 3, 4, 1], "targets":[5, 6, 1], "idx": 1}
    The output example is:
    {
                   "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
       "inputs_segment_ids": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
         "inputs_positions": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
                  "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
      "targets_segment_ids": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
        "targets_positions": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
    }

    0 represents padding in both the inputs and the outputs.

    Sequences in the incoming examples are truncated to length in
    `feature_lengths`, and the sequences in the output examples all have this
    fixed (padded) length. Features not in `feature_lengths` (i.e, "idx") are
    removed.

  Args:
    dataset: a tf.data.Dataset
    feature_lengths: map from feature key to final length. Other features will
      be discarded.
    use_custom_ops: a boolean - custom ops are faster but require a custom-built
      binary, which is not currently possible on cloud-tpu. Custom ops are
      only used when `feature_lengths` has at most two keys.

  Returns:
    a tf.data.Dataset

  Raises:
    ValueError: if a key of `feature_lengths` is missing from the dataset or
      is not one-dimensional.
  """
  element_spec = dataset.element_spec
  # Make sure that the dataset contains all keys in `feature_lengths`.
  for k in feature_lengths:
    if k not in element_spec:
      raise ValueError(
          f"Feature '{k}' not found in dataset. Available keys are "
          f"{list(element_spec.keys())}")
    if not element_spec[k].shape.is_compatible_with(tf.TensorShape([None])):
      raise ValueError(
          f"Features to be packed must be one-dimensional. '{k}' is not.'")

  # Warn if there are any additional keys that will be removed.
  additional_keys = set(element_spec) - set(feature_lengths)
  if additional_keys:
    # `logging.warn` is a deprecated alias; `warning` is the supported name.
    logging.warning(
        "Features not in `features_length` will be removed during packing: %s",
        additional_keys)

  # Keep only the requested features, each truncated to its final length.
  ds = dataset.map(
      lambda x: {k: x[k][:l] for k, l in feature_lengths.items()},
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Setting batch_size=length ensures that the concatenated sequences (if they
  # have length >=1) are sufficient to fill at least one packed example.
  batch_size = max(feature_lengths.values())
  ds = ds.padded_batch(
      batch_size, padded_shapes={k: [-1] for k in feature_lengths})

  if use_custom_ops and len(feature_lengths) <= 2:
    ds = _pack_with_custom_ops(ds, feature_lengths)
  else:
    ds = _pack_with_tf_ops(ds, feature_lengths)

  # Set the Tensor shapes correctly since they get lost in the process.
  def _set_shape(x):
    for k, v in x.items():
      v.set_shape([feature_lengths[_strip_packed_feature_key(k)]])
    return x

  return ds.map(_set_shape, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def _pack_with_tf_ops(dataset: tf.data.Dataset, keys: List[str],
                      key2length: Dict[str, int]) -> tf.data.Dataset:
  """Helper-function for packing a dataset which has already been batched.

  Helper for pack_dataset()  Uses tf.while_loop.

  Each incoming batch is processed example by example. Examples are appended
  to a "partial" packed example for as long as every feature still fits in
  its `key2length` entry; when one does not fit, the partial example is
  padded, written out, and a fresh partial example is started.

  Args:
    dataset: a dataset containing padded batches of examples.
    keys: a list of strings
    key2length: a dict from feature-key to integer. It is also indexed with
      the "<key>_position" keys, so those entries must be present too.

  Returns:
    a dataset.
  """
  # Template for an empty partial example: for every key, a zero-length int32
  # token sequence and a zero-length "_position" sequence.
  empty_example = {}
  for k in keys:
    empty_example[k] = tf.zeros([0], dtype=tf.int32)
    empty_example[k + '_position'] = tf.zeros([0], dtype=tf.int32)
  keys_etc = empty_example.keys()

  def write_packed_example(partial, outputs):
    """Pads `partial` to full length, appends it to `outputs`, and resets it.

    Returns a fresh empty partial example and the updated TensorArrays.
    """
    new_partial = empty_example.copy()
    new_outputs = {}
    for k in keys_etc:
      new_outputs[k] = outputs[k].write(
          outputs[k].size(),
          tf.pad(partial[k], [[0, key2length[k] - tf.size(partial[k])]]))
    return new_partial, new_outputs

  def map_fn(x):
    """Internal function to flat_map over.

    Consumes a batch of input examples and produces a variable number of output
    examples.

    Args:
      x: a batch of examples; each value is indexed along its first axis to
        obtain one example.

    Returns:
      a dict from key to the stacked packed examples, containing the packed
      features plus their "_position" and "_segmentation".
    """
    partial = empty_example.copy()
    i = tf.zeros([], dtype=tf.int32)
    # Number of examples in this batch, only known at run time.
    dynamic_batch_size = tf.shape(x[keys[0]])[0]
    outputs = {}
    for k in keys:
      outputs[k] = tf.TensorArray(
          tf.int32, size=0, dynamic_size=True, element_shape=[key2length[k]])
      outputs[k + '_position'] = tf.TensorArray(
          tf.int32, size=0, dynamic_size=True, element_shape=[key2length[k]])

    def body_fn(i, partial, outputs):
      """Body function for while_loop.

      Args:
        i: integer scalar
        partial: dictionary of Tensor (partially-constructed example)
        outputs: dictionary of TensorArray

      Returns:
        A triple containing the new values of the inputs.
      """
      can_append = True
      one_example = {}
      for k in keys:
        val = tf.cast(x[k][i], tf.int32)
        # Keep as many leading tokens as there are non-zero entries. This
        # removes the batch padding, assuming 0 only appears as trailing
        # padding.
        val = val[:tf.reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
        one_example[k] = val
      # The example can only be appended if every feature still fits.
      for k in keys:
        can_append = tf.logical_and(
            can_append,
            tf.less_equal(
                tf.size(partial[k]) + tf.size(one_example[k]), key2length[k]))

      def false_fn():
        # Does not fit: flush the partial example and start a new one.
        return write_packed_example(partial, outputs)

      def true_fn():
        # Fits: keep accumulating into the current partial example.
        return partial, outputs

      partial, outputs = tf.cond(can_append, true_fn, false_fn)
      new_partial = {}
      for k in keys:
        new_seq = one_example[k][:key2length[k]]
        new_seq_len = tf.size(new_seq)
        new_partial[k] = tf.concat([partial[k], new_seq], 0)
        # Positions restart from 0 for every appended example.
        new_partial[k + '_position'] = tf.concat(
            [partial[k + '_position'],
             tf.range(new_seq_len)], 0)
      partial = new_partial
      return i + 1, partial, outputs

    # For loop over all examples in the batch. The condition is always true;
    # the loop is bounded by `maximum_iterations` instead.
    i, partial, outputs = tf.while_loop(
        cond=lambda *_: True,
        body=body_fn,
        loop_vars=(i, partial, outputs),
        shape_invariants=(
            tf.TensorShape([]),
            {k: tf.TensorShape([None]) for k in keys_etc},
            {k: tf.TensorShape(None) for k in keys_etc},
        ),
        maximum_iterations=dynamic_batch_size)
    # Flush whatever is left after the last example of the batch.
    _, outputs = write_packed_example(partial, outputs)
    packed = {k: outputs[k].stack() for k in keys_etc}
    for k in keys:
      # A position of 0 marks the start of a segment, so the running count of
      # zeros numbers the segments from 1; multiplying by the non-padding mask
      # sets the segmentation of padding to 0.
      packed[k + '_segmentation'] = (
          tf.cumsum(
              tf.cast(tf.equal(packed[k + '_position'], 0), tf.int32), axis=1) *
          tf.cast(tf.not_equal(packed[k], 0), tf.int32))
    return packed

  dataset = dataset.map(map_fn, num_parallel_calls=AUTOTUNE)
  # Each mapped element holds a stack of packed examples; split them apart.
  return dataset.unbatch()
def pipeline(
    self,
    dataset: tf.data.Dataset,
    input_context: tf.distribute.InputContext = None
) -> tf.data.Dataset:
  """Build a pipeline fetching, shuffling, and preprocessing the dataset.

  Stages, in order: optional sharding across input pipelines, optional
  repeat, parallel TFRecord reading (only for the 'records' builder),
  prefetch, optional cache, shuffle + repeat when training, per-example
  preprocessing, batching, optional tf.data options, and a final prefetch.

  Args:
    dataset: A `tf.data.Dataset` that loads raw files.
    input_context: An optional context provided by `tf.distribute` for
      cross-replica training. This isn't necessary if using Keras
      compile/fit.

  Returns:
    A TensorFlow dataset outputting batched images and labels.
  """
  # Give each input pipeline its own shard of the data.
  if input_context and input_context.num_input_pipelines > 1:
    dataset = dataset.shard(input_context.num_input_pipelines,
                            input_context.input_pipeline_id)

  # NOTE(review): when training without cache the dataset is repeated here
  # and again after shuffling below — confirm this is intended.
  if self.is_training and not self.config.cache:
    dataset = dataset.repeat()

  if self.config.builder == 'records':
    # Read the data from disk in parallel
    buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
    dataset = dataset.interleave(
        lambda name: tf.data.TFRecordDataset(name, buffer_size=buffer_size),
        cycle_length=16,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

  dataset = dataset.prefetch(self.global_batch_size)

  if self.config.cache:
    dataset = dataset.cache()

  if self.is_training:
    dataset = dataset.shuffle(self.config.shuffle_buffer_size)
    dataset = dataset.repeat()

  # Parse, pre-process, and batch the data in parallel
  if self.config.builder == 'records':
    preprocess = self.parse_record
  else:
    preprocess = self.preprocess
  dataset = dataset.map(preprocess,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # The trailing partial batch is dropped only while training.
  dataset = dataset.batch(self.batch_size, drop_remainder=self.is_training)

  # Note: we could do image normalization here, but we defer it to the model
  # which can perform it much faster on a GPU/TPU
  # TODO(dankondratyuk): if we fix prefetching, we can do it here

  if self.is_training and self.config.deterministic_train is not None:
    # NOTE(review): several of these `experimental_*` options may be
    # deprecated or unavailable in newer TensorFlow releases (notably
    # `map_vectorization`) — verify against the TF version in use.
    options = tf.data.Options()
    options.experimental_deterministic = self.config.deterministic_train
    options.experimental_slack = self.config.use_slack
    options.experimental_optimization.parallel_batch = True
    options.experimental_optimization.map_fusion = True
    options.experimental_optimization.map_vectorization.enabled = True
    options.experimental_optimization.map_parallelization = True
    dataset = dataset.with_options(options)

  # Prefetch overlaps in-feed with training
  # Note: autotune here is not recommended, as this can lead to memory leaks.
  # Instead, use a constant prefetch size like the number of devices.
  dataset = dataset.prefetch(self.config.num_devices)

  return dataset
def pack_dataset(dataset: tf.data.Dataset,
                 key2length: Union[int, Dict[str, int]],
                 keys: Optional[List[str]] = None) -> tf.data.Dataset:
  """Creates a 'packed' version of a dataset on-the-fly.

  Adapted from the mesh-tf implementation.

  This is meant to replace the irritation of having to create a separate
  "packed" version of a dataset to train efficiently on TPU.
  Each example in the output dataset represents several examples in the
  input dataset.
  For each key in the input dataset, two additional keys are created:
  <key>_segmentation: an int32 tensor identifying the parts
     representing the original example.
  <key>_position: an int32 tensor identifying the position within the original
     example.

  Example:
  Two input examples get combined to form an output example.
  The input examples are:
  {"inputs": [8, 7, 1, 0], "targets":[4, 1, 0]}
  {"inputs": [2, 3, 4, 1], "targets":[5, 6, 1]}
  The output example is:
  {
                 "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
    "inputs_segmentation": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
        "inputs_position": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
                "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
   "targets_segmentation": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
       "targets_position": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
  }
  0 represents padding in both the inputs and the outputs.
  Sequences in the incoming examples are truncated to length "length", and the
  sequences in the output examples all have fixed (padded) length "length".

  Args:
    dataset: a tf.data.Dataset
    key2length: an integer, or a dict from feature-key to integer. A dict
      passed here is not modified.
    keys: a list of strings (e.g. ["inputs", "targets"]). Defaults to all keys
      of the dataset.

  Returns:
    a tf.data.Dataset

  Raises:
    ValueError: if a requested key is missing from the dataset or is not
      one-dimensional.
  """
  shapes = tf.nest.map_structure(lambda spec: spec.shape, dataset.element_spec)
  if keys is None:
    keys = list(shapes.keys())
  for k in keys:
    if k not in shapes:
      raise ValueError('Key %s not found in dataset. Available keys are %s' %
                       (k, shapes.keys()))
    if not shapes[k].is_compatible_with(tf.TensorShape([None])):
      raise ValueError('Tensors to be packed must be one-dimensional.')
  # make sure that the length dictionary contains all keys as well as the
  # keys suffixed by "_segmentation" and "_position"
  if isinstance(key2length, int):
    key2length = {k: key2length for k in keys}
  else:
    # Work on a copy: the suffixed entries added below must not leak into the
    # caller's dictionary.
    key2length = dict(key2length)
  for k in keys:
    for suffix in ['_segmentation', '_position']:
      key2length[k + suffix] = key2length[k]

  # trim to length
  dataset = dataset.map(
      lambda x: {k: x[k][:key2length[k]] for k in keys},
      num_parallel_calls=AUTOTUNE)
  # Setting batch_size=length ensures that the concatenated sequences (if they
  # have length >=1) are sufficient to fill at least one packed example.
  batch_size = max(key2length.values())
  dataset = dataset.padded_batch(
      batch_size, padded_shapes={k: [-1] for k in keys})
  dataset = _pack_with_tf_ops(dataset, keys, key2length)

  # Set the Tensor shapes correctly since they get lost in the process.
  def my_fn(x):
    return {k: tf.reshape(v, [key2length[k]]) for k, v in x.items()}

  return dataset.map(my_fn, num_parallel_calls=AUTOTUNE)
def _cast_output_features(self,
                          dataset: tf.data.Dataset) -> tf.data.Dataset:
  """Cast each output feature to its configured dtype.

  Features that are not listed in `self.output_features` are cast to their own
  dtype, i.e. they keep their current type.
  """
  target_dtypes = {
      name: feature.dtype for name, feature in self.output_features.items()
  }

  def _cast_example(example):
    return {
        name: tf.cast(value, target_dtypes.get(name, value.dtype))
        for name, value in example.items()
    }

  return dataset.map(
      _cast_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)