Example #1
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    return dataset
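A minimal usage sketch, assuming the raw C4 split is loaded with TensorFlow Datasets; the dataset name and the sequence lengths below are illustrative assumptions, not part of the example:

import tensorflow_datasets as tfds

# Hypothetical invocation; 'c4/en' and the lengths are placeholder choices.
raw_dataset = tfds.load('c4/en', split='train', shuffle_files=True)
train_dataset = c4_bare_preprocess_fn(
    raw_dataset,
    training=True,
    spm_path=None,  # falls back to t5_utils.DEFAULT_SPM_PATH
    sequence_length={'inputs': 512, 'targets': 512})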
Example #2
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_data.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    keys = output_features

    def encode_string_features_fn(features):
        """Encode all specified feature that are strings and return a dictionary.

    Args:
      features: a dictionary
    Returns:
      a dictionary
    """
        ret = {}
        for k, v in features.items():
            if k in keys and v.dtype == tf.string:
                if copy_plaintext:
                    ret['%s_plaintext' % k] = v
                v = tf.cast(output_features[k].vocabulary.encode_tf(v),
                            tf.int64)
            ret[k] = v
        return ret

    dataset = dataset.map(encode_string_features_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    # Truncate and then pad the examples so they all have the same shape.
    dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
    dataset = pad_dataset_to_length(dataset, training, sequence_length)

    return dataset
Example #3
def _get_vocab(vocab_type='subword', vocab_file=None, vocab_dir=None):
  """Gets the vocabulary object for tokenization; see tokenize for details."""
  if vocab_type not in ['char', 'subword', 'sentencepiece']:
    raise ValueError('vocab_type must be "subword", "char", or "sentencepiece" '
                     f'but got {vocab_type}')

  if vocab_type == 'char':
    # Note that we set num_reserved_ids=0 below. We could instead pass the
    # n_reserved_ids value from tokenize here -- ByteTextEncoder does exactly
    # the same thing as tokenize above, i.e., adds num_reserved_ids.
    return text_encoder.ByteTextEncoder(num_reserved_ids=0)

  vocab_dir = vocab_dir or 'gs://trax-ml/vocabs/'
  path = os.path.join(vocab_dir, vocab_file)

  if vocab_type == 'subword':
    return text_encoder.SubwordTextEncoder(path)

  assert vocab_type == 'sentencepiece'
  return t5_spc_vocab.SentencePieceVocabulary(sentencepiece_model_file=path)
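For illustration, a few hypothetical calls; the vocabulary file names below are placeholders, and the file is resolved relative to vocab_dir, which defaults to the gs://trax-ml/vocabs/ bucket:

# Hypothetical calls; the vocab file names are placeholders.
char_vocab = _get_vocab(vocab_type='char')  # no vocab file needed
subword_vocab = _get_vocab(vocab_type='subword', vocab_file='en_32k.subword')
spm_vocab = _get_vocab(vocab_type='sentencepiece',
                       vocab_file='my_model.model',
                       vocab_dir='gs://trax-ml/vocabs/')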
Example #4
def generic_text_dataset_preprocess_fn(dataset,
                                       text_preprocess_fn=None,
                                       spm_path=None,
                                       copy_plaintext=False):
    """Applies a text preprocess fn and tokenizes the dataset."""

    # The assumption is that `text_preprocess_fn` finally gives us a dataset
    # which has `inputs` and `targets`.
    if text_preprocess_fn is not None:
        dataset = text_preprocess_fn(dataset)

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the inputs and targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    return dataset
Example #5
def get_default_vocabulary():
  return sentencepiece_vocabulary.SentencePieceVocabulary(
      DEFAULT_SPM_PATH, DEFAULT_EXTRA_IDS)
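As a quick sanity check, the returned vocabulary can round-trip text (a sketch assuming DEFAULT_SPM_PATH and DEFAULT_EXTRA_IDS are defined as in t5.data):

# Sketch: encode and decode with the default vocabulary.
vocab = get_default_vocabulary()
token_ids = vocab.encode('The quick brown fox.')  # list of int token ids
round_trip = vocab.decode(token_ids)              # back to a string
print(vocab.vocab_size, token_ids, round_trip)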
Example #6
def generic_text_dataset_preprocess_fn(dataset,
                                       training=True,
                                       text_preprocess_fns=None,
                                       token_preprocess_fns=None,
                                       spm_path=None,
                                       copy_plaintext=False,
                                       debug_print_examples=False,
                                       debug_print_examples_rate=0.01):
    """Pre-processes, tokenizes and post-processes a `tf.data.Dataset`.

  Args:
    dataset: `tf.data.Dataset` to process.
    training: boolean, set to True if training, False otherwise.
    text_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset`; these operate before tokenization. Typically used to
      select which fields we want to learn over or to change something into
      "text to text" form.
    token_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset`, this operates after tokenization. Since this can view
      the tokenized fields, this can be used to filter on length etc.
    spm_path: None or str, path to a SentencePiece model to use for
      tokenization; by default, the 32k vocabulary from T5 is used.
    copy_plaintext: bool, if True retains the original fields after
      tokenization.
    debug_print_examples: bool, if True this prints examples to the logging
      stream for inspection, both before and after tokenization.
    debug_print_examples_rate: float in [0, 1.0]; on average this fraction of
      dataset examples will be printed out in each phase, i.e. pre- and
      post-tokenization.

  Returns:
    a `tf.data.Dataset` with all the preprocessing and tokenization performed.
  """

    # The assumption is that `text_preprocess_fns` finally gives us a dataset
    # which has `inputs` and `targets`.
    if text_preprocess_fns is not None:
        for text_preprocess_fn in text_preprocess_fns:
            dataset = text_preprocess_fn(dataset, training)

    # Print debugging examples if needed before tokenization.
    if debug_print_examples:

        def print_examples(x):
            if np.random.uniform() < debug_print_examples_rate:
                tf.print(x, output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples)

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the inputs and targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Apply the token-preprocessors.
    if token_preprocess_fns is not None:
        for token_preprocess_fn in token_preprocess_fns:
            dataset = token_preprocess_fn(dataset, training)

    if debug_print_examples:

        def print_examples_and_shapes(x):
            if np.random.uniform() < debug_print_examples_rate:
                tf.print(
                    {
                        'inputs_shape': tf.size(x['inputs']),
                        'targets_shape': tf.size(x['targets']),
                        'inputs': x['inputs'],
                        'targets': x['targets'],
                    },
                    output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples_and_shapes)

    return dataset
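A hedged end-to-end sketch of calling this function; the two preprocessors and field names below are placeholders (not from the example) that follow the documented (dataset, training) -> dataset contract:

# Hypothetical preprocessors, shown only to illustrate the call.
def select_fields(dataset, training):
    del training
    return dataset.map(
        lambda x: {'inputs': x['question'], 'targets': x['answer']})

def filter_long_targets(dataset, training):
    del training
    return dataset.filter(lambda x: tf.size(x['targets']) <= 512)

dataset = generic_text_dataset_preprocess_fn(
    dataset,
    training=True,
    text_preprocess_fns=[select_fields],
    token_preprocess_fns=[filter_long_targets],
    debug_print_examples=True,
    debug_print_examples_rate=0.02)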
def sentencepiece_vocab(extra_ids=0):
  return sentencepiece_vocabulary.SentencePieceVocabulary(
      os.path.join(TEST_DATA_DIR, "sentencepiece", "sentencepiece.model"),
      extra_ids=extra_ids)
from t5.data import sentencepiece_vocabulary
from t5.evaluation import metrics
from t5.data import preprocessors
from t5.data import TaskRegistry
from t5.data import TextLineTask

import numpy as np
import functools
import tensorflow as tf
import t5
from sumeval.metrics.rouge import RougeCalculator

rouge_cal = RougeCalculator(stopwords=True, lang="ja")

DEFAULT_SPM_PATH = "gs://t5-data/vocabs/mc4.250000.100extra/sentencepiece.model"
DEFAULT_SPM_PATH = "/home/katsumata/work/summarization/mt5/sentencepiece.model"
DEFAULT_VOCAB = sentencepiece_vocabulary.SentencePieceVocabulary(
    DEFAULT_SPM_PATH)
DEFAULT_OUTPUT_FEATURES = {
    "inputs":
    t5.data.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True, required=False),
    "targets":
    t5.data.Feature(vocabulary=DEFAULT_VOCAB, add_eos=True)
}


# Use ROUGE-1, ROUGE-2 and ROUGE-L as the evaluation metrics.
def rouge(targets, predictions):
    predictions = [tf.compat.as_text(x) for x in predictions]

    if isinstance(targets[0], list):
        targets = [[tf.compat.as_text(x) for x in target]
                   for target in targets]
    else:
        targets = [tf.compat.as_text(x) for x in targets]

    # Assumed completion: average sentence-level ROUGE scores computed
    # with sumeval's RougeCalculator.
    rouge_1 = np.mean([rouge_cal.rouge_n(summary=p, references=t, n=1)
                       for p, t in zip(predictions, targets)])
    rouge_2 = np.mean([rouge_cal.rouge_n(summary=p, references=t, n=2)
                       for p, t in zip(predictions, targets)])
    rouge_l = np.mean([rouge_cal.rouge_l(summary=p, references=t)
                       for p, t in zip(predictions, targets)])
    return {"rouge1": rouge_1, "rouge2": rouge_2, "rougeL": rouge_l}
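A plausible next step with the pieces above is registering a summarization task. A rough sketch under assumed paths and field names (only rouge, preprocessors, functools and DEFAULT_OUTPUT_FEATURES come from the code above; everything else is a placeholder):

# Hypothetical task registration; the task name, TSV paths and field names
# are placeholders, not part of the original example.
TaskRegistry.add(
    "ja_summarization_placeholder",
    TextLineTask,
    split_to_filepattern={
        "train": "/path/to/train.tsv",
        "validation": "/path/to/dev.tsv",
    },
    text_preprocessor=[
        functools.partial(
            preprocessors.parse_tsv, field_names=["inputs", "targets"]),
    ],
    metric_fns=[rouge],
    output_features=DEFAULT_OUTPUT_FEATURES,
)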
Example #9
    def get_vocabulary(self):
        """Returns a SentencePieceVocabulary object using the Task's model."""
        return sentencepiece_vocabulary.SentencePieceVocabulary(
            self.sentencepiece_model_path)

    def __init__(self,
                 name,
                 dataset_fn,
                 splits,
                 text_preprocessor,
                 sentencepiece_model_path=None,
                 metric_fns=None,
                 postprocess_fn=None,
                 token_preprocessor=None,
                 output_features=None,
                 num_input_examples=None,
                 supports_caching=False):
        """Task constructor.

    Attributes of output features, including the vocabulary used for
    tokenization, should be provided via the `output_features` argument. If a
    given feature does not have a vocabulary defined, it will use a
    `vocabularies.SentencePieceVocabulary` whose SentencePiece model path is
    given by the `sentencepiece_model_path` argument.

    Args:
      name: string, a unique name for the Task. A ValueError will be raised if
        another task with this name is already registered.
      dataset_fn: callable, a function with the signature
        `dataset_fn(split, shuffle_files)` that returns a `tf.data.Dataset`.
      splits: list(string), a list of allowable splits to request from the
        `dataset_fn`.
      text_preprocessor: a function (or list of functions) that (each) takes in
        a tf.data.Dataset of string features and returns a tf.data.Dataset of
        string features. Can be set to None as a no-op. If a list is given,
        they will be executed sequentially.
      sentencepiece_model_path: str or None, path to a SentencePiece model file
        to use for tokenization whenever an entry in `output_features` does not
        supply a `Feature` with its own vocabulary. For provided `Feature`s in
        `output_features`, this argument is ignored. If None, DEFAULT_SPM_PATH
        is used.
      metric_fns: list(callable), list of metric functions with the signature
        `metric_fn(targets, predictions)` to use during evaluation. By default
        (None), an empty list will be used, resulting in no evaluation on this
        task.
      postprocess_fn: function, a function that takes in decoded model outputs
        (strings) and returns a string which is ready for evaluation using the
        metric functions in `metric_fns`. Can be set to None as a no-op.
      token_preprocessor: an optional function (or list of functions) that
        (each) takes in a tf.data.Dataset of token features and returns a
        tf.data.Dataset of token features.
        Can be set to None as a no-op. If a list is given, they will be
        executed sequentially.
        The functions are also passed `sequence_length` and `vocabulary`
        keyword arguments.
      output_features: list(str) or dict. Output features of the Task. If
        list(str) is provided, a `Feature` class instance will be created for
        each provided feature name using the default values. If a dict is
        provided, it should map feature names to `Feature` class instances. When
        `output_features` is None (default), two output features for "inputs"
        and "targets" will be constructed using the default values for the
        `Feature` class.
      num_input_examples: dict(string: int) or None, a dictionary mapping split
        to its size in number of input examples (before preprocessing). The
        `num_input_examples` method will return None if not provided.
      supports_caching: bool, whether or not this task supports offline caching.
    """
        if not _VALID_TASK_NAME_REGEX.match(name):
            raise ValueError(
                "Task name '%s' contains invalid characters. Must match regex: %s"
                % (name, _VALID_TASK_NAME_REGEX.pattern))
        _validate_args(dataset_fn, ["split", "shuffle_files"])
        metric_fns = metric_fns or []
        for metric_fn in metric_fns:
            _validate_args(metric_fn, ["targets", "predictions"])

        self._name = name
        self._dataset_fn = dataset_fn
        self._text_preprocessor = ([] if text_preprocessor is None else
                                   text_preprocessor)
        self._token_preprocessor = ([] if token_preprocessor is None else
                                    token_preprocessor)
        self._metric_fns = metric_fns
        # Use a pass-through if postprocess_fn is not provided
        self._postprocess_fn = postprocess_fn or (lambda x, **unused_kwargs: x)
        self._cache_dir = None
        self._stats = {}

        if hasattr(output_features, "__len__") and not output_features:
            raise ValueError("output_features must be non-empty.")
        if isinstance(output_features, dict):
            self._output_features = output_features
        elif output_features is None or isinstance(output_features, list):
            output_features = output_features or _DEFAULT_FEATURE_KEYS
            default_vocabulary = sentencepiece_vocabulary.SentencePieceVocabulary(
                sentencepiece_model_file=sentencepiece_model_path
                or DEFAULT_SPM_PATH)
            self._output_features = {
                f: Feature(vocabulary=default_vocabulary)
                for f in output_features
            }
        else:
            raise ValueError(
                "output_features must be a dict, list of str, or None")
        self._output_features = collections.OrderedDict(
            sorted(list(self._output_features.items())))

        self._splits = splits
        self._num_input_examples = num_input_examples
        self._supports_caching = supports_caching
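Given the constructor documented above, a hedged instantiation sketch; the class name Task is assumed from the docstring, and my_dataset_fn, the file paths and the lowercasing preprocessor are placeholders that follow the documented signatures:

# Hypothetical dataset_fn matching the required (split, shuffle_files) signature.
def my_dataset_fn(split, shuffle_files):
    del shuffle_files
    return tf.data.TextLineDataset('/path/to/%s.txt' % split).map(
        lambda line: {'inputs': line, 'targets': line})

# Hypothetical text preprocessor: Dataset of string features -> same.
def lowercase_fields(dataset):
    return dataset.map(
        lambda x: {k: tf.strings.lower(v) for k, v in x.items()})

task = Task(  # assumed class name for the constructor above
    'my_placeholder_task',
    dataset_fn=my_dataset_fn,
    splits=['train', 'validation'],
    text_preprocessor=[lowercase_fields],
    metric_fns=[],
    sentencepiece_model_path=None)  # falls back to DEFAULT_SPM_PATH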
Example #11
def get_vocabulary():
    return sentencepiece_vocabulary.SentencePieceVocabulary(
        args.spiece_model_path, extra_ids=100)


def get_default_vocabulary():
    return sentencepiece_vocabulary.SentencePieceVocabulary(
        DEFAULT_SPM_PATH)  # TODO: update with latest t5 version