def tokens_to_batches(dataset, sequence_length, batch_size, output_features):
  """Convert a dataset of token sequences to batches of padded/masked examples.

  Args:
    dataset: tf.data.Dataset containing examples with token sequences.
    sequence_length: dict of int, a dict mapping feature name to length.
    batch_size: int, the number of padded sequences in each batch.
    output_features: list of str, features to include in the dataset.

  Returns:
    A generator that produces batches of numpy examples.
  """
  dataset = transformer_dataset.pack_or_pad(
      dataset,
      sequence_length,
      pack=False,
      feature_keys=output_features,
      ensure_eos=True,
  )

  def _map_fn(ex):
    # Add a "<key>_mask" feature marking the non-padding positions of each
    # output feature.
    for key in output_features:
      tensor = ex[key]
      mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
      ex[key + "_mask"] = mask
    return ex

  dataset = dataset.map(
      _map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  dataset = dataset.batch(batch_size, drop_remainder=False)
  return tfds.as_numpy(dataset)
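# Usage sketch for tokens_to_batches above, assuming the standard t5,
# tensorflow, and tensorflow_datasets imports are in scope. The task name
# "glue_cola_v002", the sequence lengths, and the batch size are illustrative
# choices, not requirements of the function.
task = t5.data.get_mixture_or_task("glue_cola_v002")
ds = task.get_dataset({"inputs": 64, "targets": 8}, split="train")
batches = tokens_to_batches(
    ds,
    sequence_length={"inputs": 64, "targets": 8},
    batch_size=32,
    output_features=["inputs", "targets"])
for batch in batches:
  # Each batch is a dict of numpy arrays, including the added "inputs_mask"
  # and "targets_mask" features.
  print(batch["inputs"].shape, batch["inputs_mask"].shape)
  break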
def _get_dataset_for_single_task(task, sequence_length):
  """Get a tensorflow.data.Dataset for the provided task."""
  if shuffle_eval_examples and seed is None:
    logging.warning("shuffle_eval_examples is True but no seed was "
                    "provided. Using a random seed.")
  ds = task.get_dataset(
      sequence_length,
      split=dataset_split,
      use_cached=use_cached,
      shuffle=shuffle_eval_examples,
      seed=seed,
  )
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  if sequence_length is None:
    logging.info(
        "Skipping packing/padding for '%s' since sequence length is None.",
        task.name)
  else:
    logging.info("%sing '%s' with sequence lengths: %s",
                 "Pack" if pack else "Padd", task.name, sequence_length)
    ds = transformer_dataset.pack_or_pad(
        ds,
        sequence_length,
        pack=pack,
        feature_keys=tuple(task.output_features),
        ensure_eos=eos_keys)
  if num_eval_examples is not None and num_eval_examples >= 0:
    ds = ds.take(num_eval_examples)
  return ds
def _get_dataset_for_single_task(task, sequence_length):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=False)
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  if sequence_length is None:
    tf.logging.info(
        "Skipping packing/padding for '%s' since sequence length is None.",
        task.name)
  else:
    tf.logging.info("%sing '%s' with sequence lengths: %s",
                    "Pack" if pack else "Padd", task.name, sequence_length)
    ds = transformer_dataset.pack_or_pad(
        ds,
        sequence_length,
        pack=pack,
        feature_keys=tuple(task.output_features),
        ensure_eos=eos_keys)
  ds = maybe_shuffle_and_subsample_dataset(
      ds, num_eval_examples, shuffle_eval_examples, shuffle_buffer_size)
  return ds
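# A hedged sketch of the `maybe_shuffle_and_subsample_dataset` helper called
# above, which is not defined in this snippet. Judging from the call site, it
# plausibly shuffles the dataset when requested and then truncates it; the
# real implementation may differ.
def maybe_shuffle_and_subsample_dataset(ds, num_examples, shuffle,
                                        shuffle_buffer_size):
  if shuffle:
    ds = ds.shuffle(shuffle_buffer_size)
  if num_examples is not None and num_examples >= 0:
    ds = ds.take(num_examples)
  return ds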
def mesh_train_dataset_fn(
    mixture_or_task_name,
    sequence_length,
    vocabulary,
    dataset_split=tfds.Split.TRAIN,
    use_cached=False):
  """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the maximum int sequence
      length for that feature.
    vocabulary: a SentencePieceVocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
  if not isinstance(vocabulary, data.SentencePieceVocabulary):
    raise ValueError("vocabulary must be a SentencePieceVocabulary")
  mixture_or_task = data.get_mixture_or_task(mixture_or_task_name)

  ds = mixture_or_task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=True)
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=True,
      feature_keys=tuple(mixture_or_task.output_features),
      ensure_eos=True)
  return ds
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary=None,
                          dataset_split=tfds.Split.TRAIN,
                          seed=None,
                          use_cached=False,
                          pack=True):
  """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the maximum int sequence
      length for that feature.
    vocabulary: unused argument, maintains compatibility with other
      dataset_fns.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    seed: tf.int64 scalar tf.Tensor (or None). Used as both the global seed
      and the shuffle seed for tf.data.
    use_cached: bool, whether to load the cached version of this dataset.
    pack: bool, whether to pack the dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
  del vocabulary
  mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

  ds = mixture_or_task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=True, seed=seed)

  # Select just the output features which are present in the dataset.
  feature_keys = tuple(k for k in mixture_or_task.output_features
                       if k in tf.data.get_output_shapes(ds))

  # Filtering feature keys is done in the pack_or_pad function. However, when
  # packing is turned off, input features aren't filtered, leading to training
  # problems due to strings showing up in the input example. Filtering features
  # ensures that we don't rely on pack_or_pad to filter features for training.
  def _filter_features(ex):
    return {k: ex[k] for k in feature_keys}

  ds = ds.map(
      _filter_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  ds = transformer_dataset.pack_or_pad(
      ds, sequence_length, pack=pack, feature_keys=feature_keys,
      ensure_eos=eos_keys)
  return ds
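# Direct-call sketch for mesh_train_dataset_fn above. In practice it is wired
# into utils.run as `train_dataset_fn` via gin; the task name, sequence
# lengths, and seed below are illustrative.
train_ds = mesh_train_dataset_fn(
    "glue_cola_v002",
    sequence_length={"inputs": 512, "targets": 87},
    dataset_split="train",
    seed=42,
    pack=True)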
def _get_dataset_for_single_task(task):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=False)
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=False,
      feature_keys=task.output_features,
      ensure_eos=True)
  if num_eval_examples is not None:
    ds = ds.take(num_eval_examples)
  return ds
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary,
                          dataset_split=tfds.Split.TRAIN,
                          seed=None,
                          use_cached=False):
  """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the maximum int sequence
      length for that feature.
    vocabulary: a t5.data.vocabularies.Vocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    seed: tf.int64 scalar tf.Tensor (or None). Used as both the global seed
      and the shuffle seed for tf.data.
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
  valid_vocabulary(vocabulary)
  mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

  ds = mixture_or_task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=True, seed=seed)

  # Select just the output features which are present in the dataset.
  feature_keys = tuple(k for k in mixture_or_task.output_features
                       if k in tf.data.get_output_shapes(ds))
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  ds = transformer_dataset.pack_or_pad(
      ds, sequence_length, pack=True, feature_keys=feature_keys,
      ensure_eos=eos_keys)
  return ds
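# A hedged sketch of the `valid_vocabulary` check called above, which is not
# defined in this snippet; assuming it only verifies the argument's type, as
# the docstring suggests.
def valid_vocabulary(vocabulary):
  if not isinstance(vocabulary, t5.data.vocabularies.Vocabulary):
    raise ValueError("vocabulary must be a t5.data.vocabularies.Vocabulary")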
def tokens_to_batches(dataset,
                      sequence_length,
                      batch_size,
                      output_features,
                      mixture_or_task=None):
  """Convert a dataset of token sequences to batches of padded/masked examples.

  Args:
    dataset: tf.data.Dataset containing examples with token sequences.
    sequence_length: dict of int, a dict mapping feature name to length.
    batch_size: int, the number of padded sequences in each batch.
    output_features: list of str, features to include in the dataset.
    mixture_or_task: a Task or Mixture object, used to correctly specify eos if
      provided. If None, eos is always added at the end of the sequence.

  Returns:
    A generator that produces batches of numpy examples.
  """
  if mixture_or_task:
    eos_keys = set(
        k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  else:
    eos_keys = True

  dataset = transformer_dataset.pack_or_pad(
      dataset,
      sequence_length,
      pack=False,
      feature_keys=output_features,
      ensure_eos=eos_keys,
  )

  def _map_fn(ex):
    # Add a "<key>_mask" feature marking the non-padding positions of each
    # output feature.
    for key in output_features:
      tensor = ex[key]
      mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
      ex[key + "_mask"] = mask
    return ex

  dataset = dataset.map(
      _map_fn,
      num_parallel_calls=tf.data.experimental.AUTOTUNE,
  )

  dataset = dataset.batch(batch_size, drop_remainder=False)
  return tfds.as_numpy(dataset)
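# Usage sketch for the mixture-aware variant above. Passing the Task object
# restricts ensure_eos to the features whose add_eos flag is set, rather than
# unconditionally appending EOS everywhere; the task name, split, and sizes
# are illustrative.
task = t5.data.get_mixture_or_task("glue_cola_v002")
batches = tokens_to_batches(
    task.get_dataset({"inputs": 64, "targets": 8}, split="validation"),
    sequence_length={"inputs": 64, "targets": 8},
    batch_size=32,
    output_features=list(task.output_features),
    mixture_or_task=task)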
def _get_dataset_for_single_task(task):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=False)
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=pack,
      feature_keys=tuple(task.output_features),
      ensure_eos=eos_keys)
  ds = maybe_shuffle_and_subsample_dataset(
      ds, num_eval_examples, shuffle_eval_examples, shuffle_buffer_size)
  return ds
def _get_dataset_for_single_task(task):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=False)
  if any(not f.add_eos for f in task.output_features.values()):
    warnings.warn(
        "pack_or_pad is being called with ensure_eos=True, but EOS is not "
        "being added to all features.")
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=False,
      feature_keys=tuple(task.output_features),
      ensure_eos=True)
  if num_eval_examples is not None:
    ds = ds.take(num_eval_examples)
  return ds
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary,
                          dataset_split=tfds.Split.TRAIN,
                          use_cached=False):
  """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the maximum int sequence
      length for that feature.
    vocabulary: a t5.data.vocabularies.Vocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
  valid_vocabulary(vocabulary)
  mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

  ds = mixture_or_task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=True)
  if any(not f.add_eos for f in mixture_or_task.output_features.values()):
    warnings.warn(
        "pack_or_pad is being called with ensure_eos=True, but EOS is not "
        "being added to all features.")
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=True,
      feature_keys=tuple(mixture_or_task.output_features),
      ensure_eos=True)
  return ds
def _get_dataset_for_single_task(task, sequence_length):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached,
      shuffle=shuffle, seed=seed)
  if "inputs" not in ds.element_spec:
    if not priming_sequence_length or priming_sequence_length <= 0:
      logging.warning("Priming sequence length not specified so priming "
                      "with the empty string.")
      ds = ds.map(_prepare_for_unprimed_inference)
    else:
      logging.info("Using the first %d tokens of each target as input.",
                   priming_sequence_length)
      ds = ds.map(_split_targets_for_primed_inference)
  elif priming_sequence_length is not None:
    raise ValueError(
        "Setting a priming sequence length only makes sense for decoder-only "
        "Tasks, which have `targets` but no `inputs`.")
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  logging.info(
      "Padding '%s' with sequence lengths: %s", task.name, sequence_length)
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=False,
      feature_keys=tuple(task.output_features),
      ensure_eos=eos_keys)
  if num_inference_examples is not None and num_inference_examples >= 0:
    ds = ds.take(num_inference_examples)
  return ds
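# Hedged sketches of the two map functions referenced above, which are not
# defined in this snippet; the real implementations may differ. For unprimed
# inference, "inputs" plausibly becomes an empty token sequence; for primed
# inference, the first `priming_sequence_length` target tokens become the
# inputs and the remainder stays in "targets".
def _prepare_for_unprimed_inference(ex):
  ex = dict(ex)
  ex["inputs"] = tf.zeros([0], dtype=ex["targets"].dtype)
  return ex


def _split_targets_for_primed_inference(ex):
  ex = dict(ex)
  ex["inputs"] = ex["targets"][:priming_sequence_length]
  ex["targets"] = ex["targets"][priming_sequence_length:]
  return ex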
#!/usr/bin/env python3
import itertools

import mesh_tensorflow.transformer.dataset as transformer_dataset
import t5
import tensorflow as tf

tf.enable_eager_execution()

task = t5.data.get_mixture_or_task("glue_cola_v002")
ds = task.get_dataset({"inputs": 64, "targets": 8}, "train")
ds = transformer_dataset.pack_or_pad(
    ds,
    {"inputs": 64, "targets": 8},
    pack=False,
    feature_keys=tuple(task.output_features),
    ensure_eos=True,
)


def add_attention_masks(ds, feature_keys):
  """Adds a "<key>_mask" feature marking non-padding positions of each key."""

  def _map_fn(ex):
    for key in feature_keys:
      tensor = ex[key]
      mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
      ex[key + "_mask"] = mask
    return ex

  return ds.map(_map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
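# Continuing the sketch above: apply the masking helper, batch, and inspect a
# single batch. tensorflow_datasets is used only for numpy conversion here,
# and the batch size of 8 is an arbitrary choice.
import tensorflow_datasets as tfds

ds = add_attention_masks(ds, tuple(task.output_features))
ds = ds.batch(8, drop_remainder=False)
for ex in itertools.islice(tfds.as_numpy(ds), 1):
  print({k: v.shape for k, v in ex.items()})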