class Wmt15Translate(wmt.WmtTranslate):
  """WMT 15 translation datasets for all {xx, "en"} language pairs.

  `BUILDER_CONFIGS` lists one plain config per pair in `_LANGUAGE_PAIRS`,
  followed by one "subwords8k" subword-encoded config per pair. Each config
  defaults to version 1.0.0 and also supports version 0.0.4, which is declared
  with the S3 experiment switched off.
  """

  # Plain-text configs, one per language pair.
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2015 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
          supported_versions=[
              tfds.core.Version(
                  "0.0.4", experiments={tfds.core.Experiment.S3: False}),
          ],
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]
  # Subword-encoded configs are appended after all plain configs.
  BUILDER_CONFIGS += [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2015 %s-%s translation task dataset with subword encoding."
              % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          text_encoder_config=tfds.features.text.TextEncoderConfig(
              encoder_cls=tfds.features.text.SubwordTextEncoder,
              name="subwords8k",
              vocab_size=2**13),
          version=tfds.core.Version("1.0.0"),
          supported_versions=[
              tfds.core.Version(
                  "0.0.4", experiments={tfds.core.Experiment.S3: False}),
          ],
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = [
        "europarl_v7",
        "europarl_v8_16",
        "commoncrawl",
        "multiun",
        "newscommentary_v10",
        "gigafren",
        "czeng_10",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
    ]
    validation_names = ["newsdev2015", "newsdiscussdev2015", "newstest2014"]
    test_names = ["newstest2015", "newsdiscusstest2015"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt17Translate(wmt.WmtTranslate):
  """WMT 17 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each at version 0.0.3
  with the S3 experiment switched off.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2017 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version(
              "0.0.3", experiments={tfds.core.Experiment.S3: False}),
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    # The training list is the fixed names below plus `wmt.CWMT_SUBSET_NAMES`.
    train_names = [
        "europarl_v7",
        "europarl_v8_16",
        "commoncrawl",
        "newscommentary_v12",
        "czeng_16",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
        "uncorpus_v1",
        "rapid_2016",
        "leta_v1",
        "dcep_v1",
        "onlinebooks_v1",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["newsdev2017", "newstest2016", "newstestB2016"]
    test_names = ["newstest2017", "newstestB2017"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt18Translate(wmt.WmtTranslate):
  """WMT 18 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each carrying the
  version string "0.0.2".
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2018 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version="0.0.2",
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    # The training list is the fixed names below plus `wmt.CWMT_SUBSET_NAMES`.
    train_names = [
        "europarl_v7",
        "europarl_v8_18",
        "paracrawl_v1",
        "commoncrawl",
        "newscommentary_v13",
        "czeng_17",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
        "uncorpus_v1",
        "rapid_2016",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = [
        "newsdev2014",
        "newsdev2015",
        "newsdev2016",
        "newsdev2017",
        "newsdev2018",
        "newsdiscussdev2015",
        "newsdiscusstest2015",
        "newssyscomb2009",
        "newstest2008",
        "newstest2009",
        "newstest2010",
        "newstest2011",
        "newstest2012",
        "newstest2013",
        "newstest2014",
        "newstest2015",
        "newstest2016",
        "newstestB2016",
        "newstest2017",
        "newstestB2017",
    ]
    test_names = ["newstest2018"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class WmtT2tTranslate(wmt.WmtTranslate):
  """The WMT EnDe Translate dataset used by the Tensor2Tensor library.

  A single ("de", "en") config is declared, at version 1.0.0.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(
          language_pair=("de", "en"),
          description="WMT T2T EnDe translation task dataset.",
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      ),
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = ["europarl_v7", "commoncrawl", "newscommentary_v13"]
    validation_names = ["newstest2013"]
    test_names = ["newstest2014"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt13Translate(wmt.WmtTranslate):
  """WMT 13 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each at version 1.0.0.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2013 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = [
        "europarl_v7",
        "commoncrawl",
        "multiun",
        "newscommentary_v8",
        "gigafren",
        "wikiheadlines_ru",
        "yandexcorpus",
        "czeng_10",
    ]
    validation_names = [
        "newstest2012",
        "newstest2011",
        "newstest2010",
        "newstest2009",
        "newstest2008",
        "newssyscomb2009",
    ]
    test_names = ["newstest2013"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt18Translate(wmt.WmtTranslate):
  """WMT 18 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each at version 1.0.0.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2018 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    # The training list is the fixed names below plus `wmt.CWMT_SUBSET_NAMES`.
    train_names = [
        "europarl_v7",
        "europarl_v8_18",
        "paracrawl_v1",
        "commoncrawl",
        "newscommentary_v13",
        "czeng_17",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
        "uncorpus_v1",
        "rapid_2016",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["newsdev2018", "newstest2017", "newstestB2017"]
    test_names = ["newstest2018"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt14Translate(wmt.WmtTranslate):
  """WMT 14 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`. Each defaults to
  version 1.0.0 and also supports version 0.0.3, which is declared with the S3
  experiment switched off.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  # 0.0.3: Initial version.
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2014 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
          supported_versions=[
              tfds.core.Version(
                  "0.0.3", experiments={tfds.core.Experiment.S3: False}),
          ],
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = [
        "europarl_v7",
        "commoncrawl",
        "multiun",
        "newscommentary_v9",
        "gigafren",
        "czeng_10",
        "yandexcorpus",
        "wikiheadlines_hi",
        "wikiheadlines_ru",
        "hindencorp_01",
    ]
    validation_names = ["newsdev2014", "newstest2013"]
    test_names = ["newstest2014"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt16Translate(wmt.WmtTranslate):
  """WMT 16 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each at version 1.0.0.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2016 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = [
        "europarl_v7",
        "europarl_v8_16",
        "commoncrawl",
        "newscommentary_v11",
        "czeng_16pre",
        "yandexcorpus",
        "wikiheadlines_fi",
        "wikiheadlines_ru",
        "setimes_2",
    ]
    validation_names = ["newsdev2016", "newstest2015"]
    test_names = ["newstest2016", "newstestB2016"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt14Translate(wmt.WmtTranslate):
  """WMT 14 translation datasets for all {xx, "en"} language pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each carrying the
  version string "0.0.1".
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2014 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version="0.0.1",
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = [
        "europarl_v7",
        "commoncrawl",
        "multiun",
        "newscommentary_v9",
        "gigafren",
        "czeng_10",
        "yandexcorpus",
        "wikiheadlines_hi",
        "wikiheadlines_ru",
        "hindencorp_01",
    ]
    validation_names = [
        "newsdev2014",
        "newssyscomb2009",
        "newstest2008",
        "newstest2009",
        "newstest2010",
        "newstest2011",
        "newstest2012",
        "newstest2013",
    ]
    test_names = ["newstest2014"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt19Translate(wmt.WmtTranslate):
  """WMT 19 translation datasets for {(xx, "en")} + ("fr", "de") pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`, each carrying the
  version string "0.0.3". Only train and validation subsets are listed; no
  test split is declared here.
  """

  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2019 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version="0.0.3",
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    # The training list is the fixed names below plus `wmt.CWMT_SUBSET_NAMES`.
    train_names = [
        "europarl_v9",
        "europarl_v7_frde",
        "paracrawl_v3",
        "paracrawl_v1_ru",
        "paracrawl_v3_frde",
        "commoncrawl",
        "commoncrawl_frde",
        "newscommentary_v14",
        "newscommentary_v14_frde",
        "czeng_17",
        "yandexcorpus",
        "wikititles_v1",
        "uncorpus_v1",
        "rapid_2016_ltfi",
        "rapid_2019",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["euelections_dev2019", "newsdev2019", "newstest2018"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
    }
class WmtT2tTranslate(wmt.WmtTranslate):
  """The WMT EnDe Translate dataset used by the Tensor2Tensor library.

  A single ("de", "en") config is declared. Its default version is 0.0.1 with
  the S3 experiment switched off; version 1.0.0 is listed as supported.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  # 0.0.1: Initial version.
  BUILDER_CONFIGS = [
      wmt.WmtConfig(
          language_pair=("de", "en"),
          description="WMT T2T EnDe translation task dataset.",
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version(
              "0.0.1", experiments={tfds.core.Experiment.S3: False}),
          supported_versions=[
              tfds.core.Version("1.0.0"),
          ],
      ),
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    train_names = ["europarl_v7", "commoncrawl", "newscommentary_v13"]
    validation_names = ["newstest2013"]
    test_names = ["newstest2014"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
        tfds.Split.TEST: test_names,
    }
class Wmt19Translate(wmt.WmtTranslate):
  """WMT 19 translation datasets for {(xx, "en")} + ("fr", "de") pairs.

  One config is declared per pair in `_LANGUAGE_PAIRS`. Each defaults to
  version 1.0.0 and also supports version 0.0.3, which is declared with the S3
  experiment switched off. Only train and validation subsets are listed; no
  test split is declared here.
  """

  # Version history:
  # 1.0.0: S3 (new shuffling, sharding and slicing mechanism).
  # 0.0.3: Initial version.
  BUILDER_CONFIGS = [
      wmt.WmtConfig(  # pylint:disable=g-complex-comprehension
          language_pair=(lang1, lang2),
          description=(
              "WMT 2019 %s-%s translation task dataset." % (lang1, lang2)),
          url=_URL,
          citation=_CITATION,
          version=tfds.core.Version("1.0.0"),
          supported_versions=[
              tfds.core.Version(
                  "0.0.3", experiments={tfds.core.Experiment.S3: False}),
          ],
      )
      for lang1, lang2 in _LANGUAGE_PAIRS
  ]

  @property
  def _subsets(self):
    """Returns a dict from `tfds.Split` to the subset names for that split."""
    # The training list is the fixed names below plus `wmt.CWMT_SUBSET_NAMES`.
    train_names = [
        "europarl_v9",
        "europarl_v7_frde",
        "paracrawl_v3",
        "paracrawl_v1_ru",
        "paracrawl_v3_frde",
        "commoncrawl",
        "commoncrawl_frde",
        "newscommentary_v14",
        "newscommentary_v14_frde",
        "czeng_17",
        "yandexcorpus",
        "wikititles_v1",
        "uncorpus_v1",
        "rapid_2016_ltfi",
        "rapid_2019",
    ] + wmt.CWMT_SUBSET_NAMES
    validation_names = ["euelections_dev2019", "newsdev2019", "newstest2018"]
    return {
        tfds.Split.TRAIN: train_names,
        tfds.Split.VALIDATION: validation_names,
    }
def setUpClass(cls):
  """Replaces `wmt.WmtTranslate.BUILDER_CONFIGS` with one custom config.

  The custom config is named "small", targets the ("cs", "en") pair, and
  lists its own train and validation subsets.
  """
  super(TranslateWmtCustomConfigTest, cls).setUpClass()
  custom_subsets = {
      "train": ["paracrawl_v3"],
      "validation": ["newstest2009", "newstest2010"],
  }
  small_config = wmt.WmtConfig(
      name="small",
      language_pair=("cs", "en"),
      description="Example of custom config",
      subsets=custom_subsets,
      version=tfds.core.Version("1.0.0"),
  )
  # Assigned on the base class itself, so the change is not limited to this
  # test class.
  wmt.WmtTranslate.BUILDER_CONFIGS = [small_config]