# Output features shared by the registrations below: "inputs" and "targets" are
# each built from the default vocabulary with add_eos=True.
DEFAULT_OUTPUT_FEATURES = {
    feature_key: Feature(vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_key in ("inputs", "targets")
}

# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]


def _register_c4_unsupervised(config_suffix):
    """Register the unsupervised C4 task for one TFDS config suffix.

    The task name replaces every "." in the suffix with "_"; the TFDS name
    uses the suffix unchanged.

    Args:
        config_suffix: one entry of `_c4_config_suffixes`, e.g. "" or
            ".noclean".
    """
    # presumably routes the TFDS "text" field to "targets" and leaves "inputs"
    # unset -- confirm against preprocessors.rekey.
    rekey_text_to_targets = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"},
    )
    TaskRegistry.add(
        "c4{name}_v020_unsupervised".format(
            name=config_suffix.replace(".", "_")),
        TfdsTask,
        tfds_name="c4/en{config}:2.2.0".format(config=config_suffix),
        text_preprocessor=rekey_text_to_targets,
        token_preprocessor=preprocessors.unsupervised,
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[],
    )


# The loop variable keeps its original module-level name.
for config_suffix in _c4_config_suffixes:
    _register_c4_unsupervised(config_suffix)

# NOTE(review): the Wikipedia registration is cut off in this view; the fragment
# is kept verbatim on the next line rather than completed by guesswork.
# ================================ Wikipedia =================================== TaskRegistry.add("wikipedia_20190301.en_v003_unsupervised", TfdsTask, tfds_name="wikipedia/20190301.en:1.0.0", text_preprocessor=functools.partial(preprocessors.rekey, key_map={ "inputs": None, "targets": "text"
# Output features shared by the registrations below: "inputs" and "targets" are
# each built from the default vocabulary with add_eos=True.
DEFAULT_OUTPUT_FEATURES = {
    feature_key: Feature(vocabulary=get_default_vocabulary(), add_eos=True)
    for feature_key in ("inputs", "targets")
}

# ==================================== C4 ======================================
# Configurable tasks used for comparisons in Raffel et al., 2019.
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]


def _register_c4_unsupervised(config_suffix):
    """Register the unsupervised C4 task for one TFDS config suffix.

    The task name replaces every "." in the suffix with "_"; the TFDS name
    uses the suffix unchanged.

    Args:
        config_suffix: one entry of `_c4_config_suffixes`, e.g. "" or
            ".noclean".
    """
    # presumably routes the TFDS "text" field to "targets" and leaves "inputs"
    # unset -- confirm against preprocessors.rekey.
    rekey_text_to_targets = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"},
    )
    TaskRegistry.add(
        "c4{name}_v020_unsupervised".format(
            name=config_suffix.replace(".", "_")),
        TfdsTask,
        tfds_name="c4/en{config}:2.2.0".format(config=config_suffix),
        text_preprocessor=rekey_text_to_targets,
        token_preprocessor=preprocessors.unsupervised,
        output_features=DEFAULT_OUTPUT_FEATURES,
        metric_fns=[],
    )


# The loop variable keeps its original module-level name because the fragment
# below still refers to `config_suffix` after the loop has finished.
for config_suffix in _c4_config_suffixes:
    _register_c4_unsupervised(config_suffix)

# NOTE(review): the "c4_v220_span_corruption" registration is cut off in this
# view; the fragment is kept verbatim on the next line rather than completed by
# guesswork.
# NOTE(review): in that fragment, "c4/en:2.2.0".format(config=config_suffix) has
# no "{config}" placeholder, so the .format() call is a no-op that only reads the
# loop variable left over from the loop above -- it looks like it should be the
# plain literal "c4/en:2.2.0"; confirm and simplify.
# Final pretraining task used in Raffel et al., 2019. TaskRegistry.add( "c4_v220_span_corruption", TfdsTask, tfds_name="c4/en:2.2.0".format(config=config_suffix), text_preprocessor=functools.partial(preprocessors.rekey, key_map={ "inputs": None,
from t5.data import preprocessors
from t5.data.utils import DEFAULT_SPM_PATH
from t5.data.utils import TaskRegistry
from t5.data.utils import TfdsTask
from t5.evaluation import metrics

# ==================================== C4 ======================================
_c4_config_suffixes = ["", ".noclean", ".realnewslike", ".webtextlike"]


def _register_c4_unsupervised(config_suffix):
    """Register the unsupervised C4 task for one TFDS config suffix.

    The task name replaces every "." in the suffix with "_"; the TFDS name
    uses the suffix unchanged.

    Args:
        config_suffix: one entry of `_c4_config_suffixes`, e.g. "" or
            ".noclean".
    """
    # presumably routes the TFDS "text" field to "targets" and leaves "inputs"
    # unset -- confirm against preprocessors.rekey.
    rekey_text_to_targets = functools.partial(
        preprocessors.rekey,
        key_map={"inputs": None, "targets": "text"},
    )
    TaskRegistry.add(
        "c4{name}_v020_unsupervised".format(
            name=config_suffix.replace(".", "_")),
        TfdsTask,
        tfds_name="c4/en{config}:1.0.0".format(config=config_suffix),
        text_preprocessor=rekey_text_to_targets,
        token_preprocessor=preprocessors.unsupervised,
        sentencepiece_model_path=DEFAULT_SPM_PATH,
        metric_fns=[],
    )


# The loop variable keeps its original module-level name.
for config_suffix in _c4_config_suffixes:
    _register_c4_unsupervised(config_suffix)

# NOTE(review): the Wikipedia registration is cut off in this view; the fragment
# is kept verbatim on the next line rather than completed by guesswork.
# ================================ Wikipedia =================================== TaskRegistry.add( "wikipedia_20190301.en_v003_unsupervised", TfdsTask, # 0.0.4 is identical to 0.0.3 except empty records removed. tfds_name="wikipedia/20190301.en:0.0.4", text_preprocessor=functools.partial(preprocessors.rekey, key_map={