def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4.

  Args:
    dataset: `tf.data.Dataset` of C4 examples containing a 'text' field.
    training: boolean, set to True if training, False otherwise.
    spm_path: None or str, path to a SentencePiece model to use for
      tokenization; by default uses the 32k vocabulary from T5.
    copy_plaintext: bool, if True retains the original plaintext of a tokenized
      field under '<key>_plaintext'.
    sequence_length: dict mapping 'inputs' and 'targets' to the maximum lengths
      used for the unsupervised preprocessors, truncation and padding.

  Returns:
    a `tf.data.Dataset` with tokenized, EOS-terminated, truncated and padded
    'inputs' and 'targets'.
  """
  # Set the target key to be equal to the text content.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_data.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the targets.
  keys = output_features

  def encode_string_features_fn(features):
    """Encodes all string features listed in `keys` and returns a dictionary.

    Args:
      features: a dictionary of a single example's features.

    Returns:
      a dictionary with string features replaced by their token-id encodings;
      if `copy_plaintext` is True, the original text is kept under
      '<key>_plaintext'.
    """
    ret = {}
    for k, v in features.items():
      if k in keys and v.dtype == tf.string:
        if copy_plaintext:
          ret['%s_plaintext' % k] = v
        v = tf.cast(output_features[k].vocabulary.encode_tf(v), tf.int64)
      ret[k] = v
    return ret

  dataset = dataset.map(
      encode_string_features_fn,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Preprocess the tokens -- the exact preprocessors are set via gin.
  dataset = t5_processors.unsupervised(
      dataset,
      sequence_length=sequence_length,
      output_features=output_features)

  # Add EOS.
  dataset = add_eos_to_output_features(dataset, training)

  # Truncate and then pad the examples -- all examples have the same shape.
  dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
  dataset = pad_dataset_to_length(dataset, training, sequence_length)

  return dataset
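
# Illustrative usage sketch (not part of the original module): building a bare
# C4 pretraining stream with `c4_bare_preprocess_fn`. The dataset name/version
# and the sequence lengths below are assumptions for illustration only, and
# TFDS C4 must already be prepared locally for this to run.
#
#   import tensorflow_datasets as tfds
#
#   raw_c4 = tfds.load('c4/en', split='train', shuffle_files=True)
#   train_stream = c4_bare_preprocess_fn(
#       raw_c4,
#       training=True,
#       sequence_length={'inputs': 512, 'targets': 512})
#   # Each element now has int64 'inputs'/'targets' padded to length 512.
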
def generic_text_dataset_preprocess_fn(dataset,
                                        training=True,
                                        text_preprocess_fns=None,
                                        token_preprocess_fns=None,
                                        spm_path=None,
                                        copy_plaintext=False,
                                        debug_print_examples=False,
                                        debug_print_examples_rate=0.01):
  """Pre-processes, tokenizes and post-processes a `tf.data.Dataset`.

  Args:
    dataset: `tf.data.Dataset` to process.
    training: boolean, set to True if training, False otherwise.
    text_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset`; these run before tokenization. Typically used to
      select which fields we want to learn over, or to change something into
      "text to text" form.
    token_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset`; these run after tokenization. Since they can see the
      tokenized fields, they can be used to filter on length etc.
    spm_path: None or str, path to a SentencePiece model to use for
      tokenization; by default uses the 32k vocabulary from T5.
    copy_plaintext: bool, if True retains the original fields after
      tokenization.
    debug_print_examples: bool, if True prints examples to the logging stream
      for inspection, both before and after tokenization.
    debug_print_examples_rate: float in [0, 1.0]; on average this fraction of
      dataset examples will be printed out in each phase, i.e. pre- and
      post-tokenization.

  Returns:
    a `tf.data.Dataset` with all the preprocessing and tokenization performed.
  """
  # The assumption is that `text_preprocess_fns` finally gives us a dataset
  # which has `inputs` and `targets`.
  if text_preprocess_fns is not None:
    for text_preprocess_fn in text_preprocess_fns:
      dataset = text_preprocess_fn(dataset, training)

  # Print debugging examples if needed before tokenization.
  if debug_print_examples:

    def print_examples(x):
      if np.random.uniform() < debug_print_examples_rate:
        tf.print(x, output_stream=logging.info)
      return x

    dataset = dataset.map(print_examples)

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_data.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the inputs and targets.
  dataset = t5_processors.tokenize(
      dataset, output_features, copy_plaintext=copy_plaintext)

  # Apply the token-preprocessors.
  if token_preprocess_fns is not None:
    for token_preprocess_fn in token_preprocess_fns:
      dataset = token_preprocess_fn(dataset, training)

  # Print debugging examples and token shapes if needed after tokenization.
  if debug_print_examples:

    def print_examples_and_shapes(x):
      if np.random.uniform() < debug_print_examples_rate:
        tf.print(
            {
                'inputs_shape': tf.size(x['inputs']),
                'targets_shape': tf.size(x['targets']),
                'inputs': x['inputs'],
                'targets': x['targets'],
            },
            output_stream=logging.info)
      return x

    dataset = dataset.map(print_examples_and_shapes)

  return dataset
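
# Illustrative usage sketch (not part of the original module): wiring the
# generic preprocessor with a text preprocessor that rekeys a question/answer
# dataset into 'inputs'/'targets'. The field names ('question', 'answer') and
# `raw_qa_dataset` are assumptions for illustration only.
#
#   def _qa_to_text_to_text(dataset, training):
#     del training  # The rekeying is identical for train and eval.
#     return t5_processors.rekey(
#         dataset, key_map={'inputs': 'question', 'targets': 'answer'})
#
#   processed = generic_text_dataset_preprocess_fn(
#       raw_qa_dataset,
#       training=True,
#       text_preprocess_fns=[_qa_to_text_to_text],
#       token_preprocess_fns=None,
#       copy_plaintext=True,
#       debug_print_examples=True,
#       debug_print_examples_rate=0.05)
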