def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Builds tokenized 'inputs' and 'targets' features from C4 examples.

    Args:
        dataset: dataset whose examples are rekeyed so that 'targets' takes
            the value stored under 'text'.
        training: forwarded to `add_eos_to_output_features`.
        spm_path: path to a SentencePiece model file; when falsy,
            `t5_utils.DEFAULT_SPM_PATH` is used instead.
        copy_plaintext: forwarded to `t5_utils.encode_string_features`.
        sequence_length: forwarded to `t5_processors.unsupervised`.

    Returns:
        The dataset after rekeying, tokenization, the unsupervised
        preprocessors and the EOS step.
    """
    # The text content becomes 'targets'; 'inputs' is mapped to None.
    text_to_targets = {'targets': 'text', 'inputs': None}
    dataset = t5_processors.rekey(dataset, key_map=text_to_targets)

    # A single vocabulary-backed feature is shared by both output keys.
    model_file = spm_path or t5_utils.DEFAULT_SPM_PATH
    spm_vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=model_file)
    shared_feature = t5_utils.Feature(spm_vocab)
    output_features = dict.fromkeys(('targets', 'inputs'), shared_feature)

    # Tokenize the string features.
    dataset = t5_utils.encode_string_features(
        dataset,
        output_features,
        keys=output_features,
        copy_plaintext=copy_plaintext)

    # Token-level preprocessing; the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(
        dataset,
        sequence_length=sequence_length,
        output_features=output_features)

    # Finish by adding EOS.
    return add_eos_to_output_features(dataset, training)
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Builds tokenized, truncated and padded 'inputs'/'targets' from C4.

    Args:
        dataset: dataset whose examples are rekeyed so that 'targets' takes
            the value stored under 'text'.
        training: forwarded to the EOS, truncation and padding helpers.
        spm_path: path to a SentencePiece model file; when falsy,
            `t5_utils.DEFAULT_SPM_PATH` is used instead.
        copy_plaintext: if True, the original string of every encoded
            feature is kept under '<key>_plaintext'.
        sequence_length: forwarded to `t5_processors.unsupervised` and to
            the truncation and padding helpers.

    Returns:
        The processed dataset.
    """
    # The text content becomes 'targets'; 'inputs' is mapped to None.
    text_to_targets = {'targets': 'text', 'inputs': None}
    dataset = t5_processors.rekey(dataset, key_map=text_to_targets)

    # A single vocabulary-backed feature is shared by both output keys.
    model_file = spm_path or t5_utils.DEFAULT_SPM_PATH
    spm_vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=model_file)
    shared_feature = t5_data.Feature(spm_vocab)
    output_features = dict.fromkeys(('targets', 'inputs'), shared_feature)

    def encode_string_features_fn(features):
        """Encodes the string-typed output features of one example.

        Args:
            features: a dictionary

        Returns:
            a dictionary holding every incoming entry, with the string
            entries named in `output_features` replaced by int64 token ids.
        """
        encoded = {}
        for name, tensor in features.items():
            if name in output_features and tensor.dtype == tf.string:
                if copy_plaintext:
                    encoded['%s_plaintext' % name] = tensor
                token_ids = output_features[name].vocabulary.encode_tf(tensor)
                tensor = tf.cast(token_ids, tf.int64)
            encoded[name] = tensor
        return encoded

    # Tokenize.
    dataset = dataset.map(
        encode_string_features_fn,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Token-level preprocessing; the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(
        dataset,
        sequence_length=sequence_length,
        output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    # Truncate, then pad, so that all examples end up with the same shape.
    dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
    return pad_dataset_to_length(dataset, training, sequence_length)
def _get_vocab(vocab_type='subword', vocab_file=None, vocab_dir=None): """Gets the vocabulary object for tokenization; see tokenize for details.""" if vocab_type not in ['char', 'subword', 'sentencepiece']: raise ValueError('vocab_type must be "subword", "char", or "sentencepiece" ' f'but got {vocab_type}') if vocab_type == 'char': # Note that we set num_reserved_ids=0 below. We could instead pass # the value n_reserved_ids from tokenize here -- ByteTextEncoder does # exactly the same thing as tokenize above, ie., adds num_reserved_ids. return text_encoder.ByteTextEncoder(num_reserved_ids=0) vocab_dir = vocab_dir or 'gs://trax-ml/vocabs/' path = os.path.join(vocab_dir, vocab_file) if vocab_type == 'subword': return text_encoder.SubwordTextEncoder(path) assert vocab_type == 'sentencepiece' return t5_spc_vocab.SentencePieceVocabulary(sentencepiece_model_file=path)
def generic_text_dataset_preprocess_fn(dataset,
                                       text_preprocess_fn=None,
                                       spm_path=None,
                                       copy_plaintext=False):
    """Runs an optional text preprocessor, then tokenizes the dataset.

    Args:
        dataset: the dataset to process.
        text_preprocess_fn: optional callable applied to `dataset` before
            tokenization. It is assumed to yield a dataset that has
            `inputs` and `targets`.
        spm_path: path to a SentencePiece model file; when falsy,
            `t5_utils.DEFAULT_SPM_PATH` is used instead.
        copy_plaintext: forwarded to `t5_utils.encode_string_features`.

    Returns:
        The tokenized dataset.
    """
    preprocessed = (
        dataset if text_preprocess_fn is None else text_preprocess_fn(dataset))

    # A single vocabulary-backed feature is shared by both output keys.
    model_file = spm_path or t5_utils.DEFAULT_SPM_PATH
    spm_vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=model_file)
    shared_feature = t5_utils.Feature(spm_vocab)
    output_features = dict.fromkeys(('targets', 'inputs'), shared_feature)

    # Tokenize the inputs and targets.
    return t5_utils.encode_string_features(
        preprocessed,
        output_features,
        keys=output_features,
        copy_plaintext=copy_plaintext)
def get_default_vocabulary():
    """Builds a SentencePieceVocabulary from the module-level defaults.

    Returns:
        A vocabulary constructed from `DEFAULT_SPM_PATH` and
        `DEFAULT_EXTRA_IDS`.
    """
    vocabulary_cls = sentencepiece_vocabulary.SentencePieceVocabulary
    return vocabulary_cls(DEFAULT_SPM_PATH, DEFAULT_EXTRA_IDS)
def generic_text_dataset_preprocess_fn(dataset,
                                       training=True,
                                       text_preprocess_fns=None,
                                       token_preprocess_fns=None,
                                       spm_path=None,
                                       copy_plaintext=False,
                                       debug_print_examples=False,
                                       debug_print_examples_rate=0.01):
    """Pre-processes, tokenizes and post-processes a `tf.data.Dataset`.

    Args:
        dataset: `tf.data.Dataset` to process.
        training: boolean, set to True if training, False otherwise.
        text_preprocess_fns: None or list of callables: `tf.data.Dataset`,
            bool -> `tf.data.Dataset`. These operate before tokenization and
            are typically used to select which fields we want to learn over
            or to change something into "text to text" form.
        token_preprocess_fns: None or list of callables: `tf.data.Dataset`,
            bool -> `tf.data.Dataset`. These operate after tokenization.
            Since they can view the tokenized fields, they can be used to
            filter on length etc.
        spm_path: None or str, path to a sentencepiece model to use for
            tokenization; when falsy, `t5_utils.DEFAULT_SPM_PATH` is used.
        copy_plaintext: bool, if True retains the original fields after
            tokenization.
        debug_print_examples: bool, if True this prints examples to the
            logging stream for inspection, both before and after
            tokenization.
        debug_print_examples_rate: float, [0, 1.0], on average this fraction
            of dataset examples will be printed out in each phase, i.e. pre
            and post tokenization.

    Returns:
        a `tf.data.Dataset` with all the preprocessing and tokenization
        performed.
    """
    # The assumption is that `text_preprocess_fns` finally gives us a dataset
    # which has `inputs` and `targets`.
    if text_preprocess_fns is not None:
        for text_preprocess_fn in text_preprocess_fns:
            dataset = text_preprocess_fn(dataset, training)

    # Print debugging examples if needed before tokenization.
    if debug_print_examples:

        def print_examples(x):
            # The sample must come from a TF op: tf.data traces this function
            # once, so a NumPy draw would be frozen at trace time and either
            # every example or no example would be printed.
            if tf.random.uniform([]) < debug_print_examples_rate:
                tf.print(x, output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples)

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the inputs and targets.
    dataset = t5_utils.encode_string_features(
        dataset,
        output_features,
        keys=output_features,
        copy_plaintext=copy_plaintext)

    # Apply the token-preprocessors.
    if token_preprocess_fns is not None:
        for token_preprocess_fn in token_preprocess_fns:
            dataset = token_preprocess_fn(dataset, training)

    # Print debugging examples if needed after tokenization.
    if debug_print_examples:

        def print_examples_and_shapes(x):
            # Per-example TF draw, for the same reason as in print_examples.
            if tf.random.uniform([]) < debug_print_examples_rate:
                tf.print(
                    {
                        'inputs_shape': tf.size(x['inputs']),
                        'targets_shape': tf.size(x['targets']),
                        'inputs': x['inputs'],
                        'targets': x['targets'],
                    },
                    output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples_and_shapes)

    return dataset
def sentencepiece_vocab(extra_ids=0):
    """Builds a SentencePieceVocabulary from the model under TEST_DATA_DIR.

    Args:
        extra_ids: forwarded as `extra_ids` to `SentencePieceVocabulary`.

    Returns:
        The vocabulary backed by
        `<TEST_DATA_DIR>/sentencepiece/sentencepiece.model`.
    """
    model_path = os.path.join(
        TEST_DATA_DIR, "sentencepiece", "sentencepiece.model")
    return sentencepiece_vocabulary.SentencePieceVocabulary(
        model_path, extra_ids=extra_ids)
# Module setup: imports, a Japanese ROUGE calculator, and the default
# SentencePiece vocabulary / output features, followed by the `rouge` metric.
# NOTE(review): DEFAULT_SPM_PATH is assigned twice; the second assignment (a
# local filesystem path) overrides the GCS path.
# NOTE(review): `t5.data.Feature` is referenced, but only `from t5... import`
# statements are visible here -- confirm `import t5` exists elsewhere.
from t5.data import sentencepiece_vocabulary from t5.evaluation import metrics from t5.data import preprocessors from t5.data import TaskRegistry from t5.data import TextLineTask import numpy as np import functools import tensorflow as tf from sumeval.metrics.rouge import RougeCalculator rouge_cal = RougeCalculator(stopwords=True, lang="ja") DEFAULT_SPM_PATH = "gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model" DEFAULT_SPM_PATH = "/home/katsumata/work/summarization/mt5/sentencepiece.model" DEFAULT_VOCAB = sentencepiece_vocabulary.SentencePieceVocabulary( DEFAULT_SPM_PATH) DEFAULT_OUTPUT_FEATURES = { "inputs": t5.data.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True, required=False), "targets": t5.data.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True) } # Use rouge-1, rouge-2 and rouge-l as the evaluation metrics def rouge(targets, predictions): predictions = [tf.compat.as_text(x) for x in predictions] if isinstance(targets[0], list): targets = [[tf.compat.as_text(x) for x in target] for target in targets]
def get_vocabulary(self):
    """Builds a SentencePieceVocabulary from the Task's model path.

    Returns:
        A `SentencePieceVocabulary` constructed from
        `self.sentencepiece_model_path`.
    """
    vocabulary_cls = sentencepiece_vocabulary.SentencePieceVocabulary
    return vocabulary_cls(self.sentencepiece_model_path)
def __init__(self,
             name,
             dataset_fn,
             splits,
             text_preprocessor,
             sentencepiece_model_path=None,
             metric_fns=None,
             postprocess_fn=None,
             token_preprocessor=None,
             output_features=None,
             num_input_examples=None,
             supports_caching=False):
    """Task constructor.

    Output feature attributes, including the vocabulary used for
    tokenization, are supplied through `output_features`. Features that are
    created here (because only names, or nothing, were given) use a
    `SentencePieceVocabulary` built from `sentencepiece_model_path`.

    Args:
        name: string, a unique name for the Task. A ValueError will be
            raised if another task with this name is already registered.
        dataset_fn: callable with the signature
            `dataset_fn(split, shuffle_files)` that returns a
            `tf.data.Dataset`.
        splits: list(string), the splits that may be requested from
            `dataset_fn`.
        text_preprocessor: a function (or list of functions) that (each)
            maps a tf.data.Dataset of string features to a tf.data.Dataset
            of string features. A list is executed sequentially. None is a
            no-op.
        sentencepiece_model_path: str or None, path to the SentencePiece
            model file used for features created here. Ignored for
            `Feature`s supplied in an `output_features` dict. If None,
            DEFAULT_SPM_PATH is used.
        metric_fns: list(callable), metric functions with the signature
            `metric_fn(targets, predictions)` to use during evaluation.
            None (the default) means an empty list, i.e. no evaluation on
            this task.
        postprocess_fn: function that takes decoded model outputs (strings)
            and returns a string ready for the metric functions in
            `metric_fns`. None is a no-op.
        token_preprocessor: an optional function (or list of functions)
            that (each) maps a tf.data.Dataset of token features to a
            tf.data.Dataset of token features. A list is executed
            sequentially. The functions are also passed `sequence_length`
            and `vocabulary` keyword arguments. None is a no-op.
        output_features: list(str) or dict. With a list of names, a
            `Feature` with default values is created per name. A dict must
            map feature names to `Feature` instances. With None (default),
            features for the default keys are created using the default
            values of the `Feature` class.
        num_input_examples: dict(string: int) or None, mapping each split
            to its size in number of input examples (before preprocessing).
            The `num_input_examples` method returns None if not provided.
        supports_caching: bool, whether or not this task supports offline
            caching.

    Raises:
        ValueError: if `name` does not match the valid task name regex, or
            if `output_features` is empty or is not a dict, list or None.
    """
    if not _VALID_TASK_NAME_REGEX.match(name):
        raise ValueError(
            "Task name '%s' contains invalid characters. "
            "Must match regex: %s" % (name, _VALID_TASK_NAME_REGEX.pattern))

    _validate_args(dataset_fn, ["split", "shuffle_files"])

    if not metric_fns:
        metric_fns = []
    for fn in metric_fns:
        _validate_args(fn, ["targets", "predictions"])

    self._name = name
    self._dataset_fn = dataset_fn

    # Missing preprocessors are stored as empty lists.
    if text_preprocessor is None:
        self._text_preprocessor = []
    else:
        self._text_preprocessor = text_preprocessor
    if token_preprocessor is None:
        self._token_preprocessor = []
    else:
        self._token_preprocessor = token_preprocessor

    self._metric_fns = metric_fns
    # Fall back to a pass-through when no postprocess_fn is given.
    self._postprocess_fn = postprocess_fn or (lambda x, **unused_kwargs: x)
    self._cache_dir = None
    self._stats = {}

    # Any sized-but-empty container is rejected outright.
    if hasattr(output_features, "__len__") and not output_features:
        raise ValueError("output_features must be non-empty.")

    if isinstance(output_features, dict):
        features = output_features
    elif output_features is None or isinstance(output_features, list):
        feature_names = output_features or _DEFAULT_FEATURE_KEYS
        default_vocabulary = sentencepiece_vocabulary.SentencePieceVocabulary(
            sentencepiece_model_file=sentencepiece_model_path
            or DEFAULT_SPM_PATH)
        features = {
            f: Feature(vocabulary=default_vocabulary) for f in feature_names
        }
    else:
        raise ValueError(
            "output_features must be a dict, list of str, or None")

    # Features are kept ordered by name.
    self._output_features = collections.OrderedDict(sorted(features.items()))

    self._splits = splits
    self._num_input_examples = num_input_examples
    self._supports_caching = supports_caching
def get_vocabulary():
    """Builds a SentencePieceVocabulary from `args.spiece_model_path`.

    Returns:
        The vocabulary, constructed with `extra_ids=100`.
    """
    vocabulary_cls = sentencepiece_vocabulary.SentencePieceVocabulary
    return vocabulary_cls(args.spiece_model_path, extra_ids=100)
def get_default_vocabulary():
    """Builds a SentencePieceVocabulary from `DEFAULT_SPM_PATH`."""
    # TODO update with latest t5 version
    vocabulary_cls = sentencepiece_vocabulary.SentencePieceVocabulary
    return vocabulary_cls(DEFAULT_SPM_PATH)