Example #1
def load(index: Index):
    # === ParaCrawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    cite = r"""@inproceedings{espla-etal-2019-paracrawl,
        title = "{P}ara{C}rawl: Web-scale parallel corpora for the languages of the {EU}",
        author = "Espl{\`a}, Miquel  and
          Forcada, Mikel  and
          Ram{\'\i}rez-S{\'a}nchez, Gema  and
          Hoang, Hieu",
        booktitle = "Proceedings of Machine Translation Summit XVII Volume 2: Translator, Project and User Tracks",
        month = aug,
        year = "2019",
        address = "Dublin, Ireland",
        publisher = "European Association for Machine Translation",
        url = "https://www.aclweb.org/anthology/W19-6721",
        pages = "118--119",
    }"""
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v3',
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus language pairs
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
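All of these loader functions follow the same pattern: build one Entry per language pair and register it on an Index. A minimal sketch, using hypothetical stand-in Index and Entry classes rather than the real mtdata API, of how a load(index) function like the one above could be driven:

from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class Entry:  # hypothetical stand-in, not mtdata's Entry
    langs: Tuple[str, str]
    name: str = ''
    url: str = ''
    cite: str = ''
    ext: str = ''

@dataclass
class Index:  # hypothetical stand-in, not mtdata's Index
    entries: List[Entry] = field(default_factory=list)

    def add_entry(self, entry: Entry):
        self.entries.append(entry)

index = Index()
load(index)  # the load() defined above
print(len(index.entries), index.entries[0].url)  # 30 entries (4 + 24 + 2) for this load()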
Example #2
def load_all(index: Index):

    cite = """@inproceedings{post-etal-2012-constructing,
    title = "Constructing Parallel Corpora for Six {I}ndian Languages via Crowdsourcing",
    author = "Post, Matt  and
      Callison-Burch, Chris  and
      Osborne, Miles",
    booktitle = "Proceedings of the Seventh Workshop on Statistical Machine Translation",
    month = jun,
    year = "2012",
    address = "Montr{\'e}al, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W12-3152",
    pages = "401--409",
}"""
    url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz'
    l2 = 'en'
    langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta']
    for l1 in langs:
        for split in ['training', 'dev', 'test', 'devtest', 'dict']:
            if l1 == 'hi' and split == 'dict':
                continue  # Hindi does not have a dict split
            f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}'
            f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}'
            if split not in ('training', 'dict'):
                f2 += '.0'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        name=f'JoshuaIndianCorpus_{split}',
                        filename='joshua-indian-parallel-corpora.tar.gz',
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
Example #3
File: data.py Project: kpu/mtdata
 def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
     path = self.cache.get_entry(entry)
     swap = entry.is_swap(self.langs)
     parser = Parser(path,
                     langs=self.langs,
                     ext=entry.in_ext or None,
                     ent=entry)
     langs = '_'.join(self.langs)
     l1 = (dir_path /
           f'{entry.name}-{langs}').with_suffix(f'.{self.langs[0]}')
     l2 = (dir_path /
           f'{entry.name}-{langs}').with_suffix(f'.{self.langs[1]}')
     mode = dict(mode='w', encoding='utf-8', errors='ignore')
     with l1.open(**mode) as f1, l2.open(**mode) as f2:
         count, skips, noise = 0, 0, 0
         for rec in parser.read_segs():
             rec = rec[:2]  # keep only the first two fields
             if len(rec) != 2:
                 skips += 1
                 continue
             if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                 skips += 1
                 noise += 1
                 continue
             sent1, sent2 = [s.strip() for s in rec]
             if not sent1 or not sent2:
                 skips += 1
                 continue
             if swap:
                 sent2, sent1 = sent1, sent2
             sent1 = sent1.replace('\n', ' ').replace('\t', ' ')
             sent2 = sent2.replace('\n', ' ').replace('\t', ' ')
             f1.write(f'{sent1}\n')
             f2.write(f'{sent2}\n')
             count += 1
         msg = f'Looks like an error: {count} segs are valid, {skips} are invalid: {entry}'
         assert count > 0, msg
         if skips > count:
             log.warning(msg)
         if noise > 0:
             log.info(
                 f"{entry}: Noise : {noise:,}/{count:,} => {100*noise/count:.4f}%"
             )
         log.info(f"wrote {count} lines to {l1} == {l2}")
     return count, skips
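The per-record handling above (keep the first two fields, drop empty or one-sided records, optionally swap sides, collapse newlines and tabs) can be shown in isolation. A minimal, self-contained sketch of that cleaning logic, independent of the Parser and cache machinery:

def clean_pairs(records, swap=False):
    # mirrors the basic filtering in add_part(); illustration only
    for rec in records:
        rec = rec[:2]                     # keep only the first two fields
        if len(rec) != 2:
            continue                      # not a pair: skip
        sent1, sent2 = (s.strip() for s in rec)
        if not sent1 or not sent2:
            continue                      # one side is empty: skip
        if swap:
            sent1, sent2 = sent2, sent1
        yield (sent1.replace('\n', ' ').replace('\t', ' '),
               sent2.replace('\n', ' ').replace('\t', ' '))

recs = [('Hello\tworld', 'Hallo Welt'), ('', 'leer'), ('only one field',)]
print(list(clean_pairs(recs)))            # [('Hello world', 'Hallo Welt')]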
Example #4
def load_all(index: Index):
    with open(REFS_FILE, encoding='utf-8') as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split('\t', maxsplit=8)
            dataset_name = short.lower().replace(':', '_').replace('__', '_').replace('__', '_')
            in_paths = in_paths.strip().split('\t')
            ent = Entry(did=DatasetId(group='ELRC', name=dataset_name, version='1', langs=(l1, l2)),
                    url=download, filename="ELRC_" + str(num) + ".zip", in_ext='tmx', in_paths=in_paths)
            index.add_entry(ent)
Example #5
def load_all(index: Index):
    URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz"
    cite = index.ref_db.get_bibtex('zhang-etal-2020-improving')
    cite += '\n\n' + index.ref_db.get_bibtex('tiedemann2012parallel')
    filename = 'opus-100-corpus-v1.0.tar.gz'
    code_map = dict(
        nb='nob',
        sh='hbs')  # these aren't obvious to the ISO lookup function, so map them explicitly
    group, name = 'OPUS', 'opus100'
    for pair in supervised_v1:
        l1, l2 = pair.split("-")
        l1 = code_map.get(l1, l1)
        l2 = code_map.get(l2, l2)
        splits = ['train', 'dev', 'test']
        if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}:
            splits = ['train']  # the release is missing dev/test sets for these pairs
        for split in splits:
            f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}'
            f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}'
            ent = Entry(did=DatasetId(group=group,
                                      name=f'{name}_{split}',
                                      version='1',
                                      langs=(l1, l2)),
                        url=URL,
                        filename=filename,
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
    for pair in zeroshot_v1:
        l1, l2 = pair.split("-")
        f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}'
        f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}'
        ent = Entry(did=DatasetId(group=group,
                                  name=f'{name}_test',
                                  version='1',
                                  langs=(l1, l2)),
                    url=URL,
                    filename=filename,
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite)
        index.add_entry(ent)
Example #6
def load_all(index: Index):
    url_ptn = 'https://www.dropbox.com/s/{uid}/wikititles-2014_{l1}{l2}.tgz?dl=1'
    rows = [row.split(',') for row in wiki_titles.splitlines()]
    for row in rows:
        uid, pair = row
        assert len(pair) == 4
        l1, l2 = pair[:2], pair[2:]
        url = url_ptn.format(uid=uid, l1=l1, l2=l2)
        in_file = f'wikititles-2014_{l1}{l2}'
        ent = Entry(did=DatasetId(group='LinguaTools', name='wikititles', version='2014', langs=(l1, l2)),
                    url=url, ext='tgz', in_ext='txt', in_paths=[f'{in_file}.{l1}', f'{in_file}.{l2}'])
        index.add_entry(ent)
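Each row here carries a Dropbox uid and a four-letter pair code that is sliced into two ISO codes. A small sketch with a hypothetical uid showing how the URL is assembled:

url_ptn = 'https://www.dropbox.com/s/{uid}/wikititles-2014_{l1}{l2}.tgz?dl=1'
uid, pair = 'abc123xyz', 'defr'          # hypothetical uid; pair 'defr' -> 'de', 'fr'
l1, l2 = pair[:2], pair[2:]
print(url_ptn.format(uid=uid, l1=l1, l2=l2))
# https://www.dropbox.com/s/abc123xyz/wikititles-2014_defr.tgz?dl=1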
Example #7
def load_all(index: Index):
    with open(REFS_FILE) as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split(
                '\t', maxsplit=8)
            in_paths = in_paths.strip().split('\t')
            ent = Entry(langs=(l1, l2),
                        url=download,
                        name="ELRC_" + short,
                        filename="ELRC_" + str(num) + ".zip",
                        in_ext='tmx',
                        in_paths=in_paths)
            index.add_entry(ent)
Example #8
def load_all(index: Index):
    # === ECDC ===
    # https://ec.europa.eu/jrc/en/language-technologies/ecdc-translation-memory
    cite = index.ref_db.get_bibtex('Steinberger2014')
    langs = 'en bg cs da de el es et fi fr ga hu is it lt lv mt nl no pl pt ro sk sl sv'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="http://optima.jrc.it/Resources/ECDC-TM/ECDC-TM.zip",
                        name="ECDC", in_ext='tmx', cite=cite, in_paths=["ECDC-TM/ECDC.tmx"])
            index.add_entry(ent)

    # === EAC ===
    # https://ec.europa.eu/jrc/en/language-technologies/eac-translation-memory
    # This corpus has two parts: Forms and Reference data
    langs = 'bg cs da de el en es et fi fr hu is it lt lv mt nb nl pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Forms", in_ext='tmx', cite=cite, in_paths=["EAC_FORMS.tmx"])
            index.add_entry(ent)
    langs = 'bg cs da de el en es et fi fr hr hu is it lt lv mt nl no pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Reference", in_ext='tmx', cite=cite, in_paths=["EAC_REFRENCE_DATA.tmx"])
            index.add_entry(ent)

    # === DCEP ===
    # https://ec.europa.eu/jrc/en/language-technologies/dcep
    # This was annoying to process so I ended up rehosting it.
    # Don't bother with TR; it doesn't have sentences anyway.
    cite = index.ref_db.get_bibtex('dcep')
    langs = 'BG CS DA DE EL EN ES ET FI FR GA HU IT LT LV MT NL PL PT RO SK SL SV'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url=f"http://data.statmt.org/DCEP/{l1}-{l2}.tsv.xz",
                        name="DCEP", in_ext='tsv', cite=cite)
            index.add_entry(ent)
Example #9
def load_all(index: Index):

    url_pat = 'https://object.pouta.csc.fi/OPUS-{corpus}/{version}/moses/{l1}-{l2}.txt.zip'
    group_id = 'OPUS'
    citation = index.ref_db.get_bibtex('tiedemann2012parallel')
    skip_counts = defaultdict(int)
    dupes = defaultdict(set)
    assert data_file.exists()
    assert data_file.stat().st_size > 0

    with data_file.open() as lines:
        for line in lines:
            line = line.strip()
            if not line:  # empty lines at the top and bottom
                continue
            assert len(line.split('\t')) == 4, line
            corpus, version, l1, l2 = line.split('\t')
            url = url_pat.format(corpus=corpus, version=version, l1=l1, l2=l2)
            iso_l1, iso_l2 = bcp47.try_parse(
                l1, default=None), bcp47.try_parse(l2, default=None)
            if not iso_l1 or not iso_l2:
                if not iso_l1:
                    skip_counts[str(l1)] += 1
                if not iso_l2:
                    skip_counts[str(l2)] += 1
                continue
            version_cln = version.replace('-', '').lower()
            corpus_cln = corpus.replace('-', '_').lower()

            data_id = DatasetId(group=group_id,
                                name=corpus_cln,
                                version=version_cln,
                                langs=(iso_l1, iso_l2))
            if data_id in index:
                dupes[corpus].add(f'{l1}-{l2}')
                continue
            entry = Entry(did=data_id,
                          url=url,
                          cite=citation,
                          in_paths=[f'*.{l1}', f'*.{l2}'],
                          in_ext='txt')
            index.add_entry(entry)
        if skip_counts:
            skip_counts = list(
                sorted(dict(skip_counts).items(),
                       key=lambda x: x[1],
                       reverse=True))
            log.info(f"Skipped lang counts: {skip_counts}")
        if dupes:
            log.info(f"Duplicates langs: {dupes}")
Example #10
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename='UNv1.0.testsets.tar.gz',
                        name=f'UNv1_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)
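The inner loop relies on itertools.combinations to enumerate each unordered language pair exactly once. A quick sketch of what it yields for the six UN languages:

import itertools

langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
pairs = list(itertools.combinations(langs, 2))
print(len(pairs))   # 15 unordered pairs, i.e. C(6, 2)
print(pairs[:3])    # [('en', 'ar'), ('en', 'fr'), ('en', 'es')]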
Example #11
def load_all(index: Index):

    cite = index.ref_db.get_bibtex('ramesh2021samanantar')
    pairs = ('en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi'
             ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa'
             ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or'
             ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te'
             ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te')
    BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs.strip().split(' '):
        l1, l2 = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        url = BASE_v0_2.format(dirname=dirname, pair=pair)
        ent = Entry(langs=(l1, l2), name='AI4B_Samananthar_v02', url=url, cite=cite,
              in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'], in_ext='txt')
        index.add_entry(ent)
Example #12
def load_all(index: Index):
    lines = data_file.read_text(encoding='utf-8').splitlines()
    langs = set('hi bn ta ml te kn mr pa gu as ur or'.split())  # other than en
    group_id = 'Anuvaad'
    cite_txt = index.ref_db.get_bibtex('project-anuvaad')
    for url in lines:
        url = url.strip()
        assert url.startswith('http') and url.endswith('.zip')
        file_name = url.split('/')[-1]
        file_name = file_name[:-4]  # .zip
        char_count = coll.Counter(list(file_name))
        n_hyps = char_count.get('-', 0)
        n_unders = char_count.get('_', 0)
        if n_hyps > n_unders:
            parts = file_name.split('-')
        else:
            assert '_' in file_name
            parts = file_name.split('_')
        name, version = '?', '?'
        l1, l2 = 'en', '?'
        if parts[-2] == l1 and parts[-1] in langs:
            l2 = parts[-1]
            version = parts[-3]
        elif parts[-3] == l1 and parts[-2] in langs:
            l2 = parts[-2]
            version = parts[-1]
        else:
            log.warning(f"Unable to parse {file_name} :: {parts}")
            continue
        name = '_'.join(parts[:-3])
        name = name.replace('-', '_')
        f1 = f'{l1}-{l2}/*.{l1}'
        f2 = f'{l1}-{l2}/*.{l2}'
        if name == 'wikipedia':
            f1 = f'{l1}-{l2}/{l1}.txt'
            f2 = f'{l1}-{l2}/{l2}.txt'

        ent = Entry(did=DatasetId(group=group_id,
                                  name=name,
                                  version=version,
                                  langs=(l1, l2)),
                    url=url,
                    ext='zip',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite_txt)
        index.add_entry(ent)
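The file-name parsing above expects archives named either <name>_<version>_en_<lang>.zip or <name>_en_<lang>_<version>.zip, with hyphens sometimes standing in for underscores. A minimal sketch on a hypothetical file name showing how the name, version, and non-English language are recovered:

file_name = 'legal_docs_v1_en_hi'        # hypothetical archive name, '.zip' removed
langs = {'hi', 'bn', 'ta', 'ml', 'te', 'kn', 'mr', 'pa', 'gu', 'as', 'ur', 'or'}
parts = file_name.split('_')             # underscores outnumber hyphens here
if parts[-2] == 'en' and parts[-1] in langs:
    l2, version = parts[-1], parts[-3]   # ..._<version>_en_<l2>
elif parts[-3] == 'en' and parts[-2] in langs:
    l2, version = parts[-2], parts[-1]   # ..._en_<l2>_<version>
name = '_'.join(parts[:-3])
print(name, version, l2)                 # legal_docs v1 hi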
Example #13
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    url = "https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx"  # they changed it!
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(did=DatasetId(group='UN',
                                      name=f'un_{split}',
                                      version='1',
                                      langs=(l1, l2)),
                        url=url,
                        filename='UNv1.0.testsets.tar.gz',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)
Example #14
def load_all(index: Index):

    cite = index.ref_db.get_bibtex(key='post-etal-2012-constructing')
    url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz'
    l2 = 'en'
    langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta']
    for l1 in langs:
        for split in ['training', 'dev', 'test', 'devtest', 'dict']:
            if l1 == 'hi' and split == 'dict':
                continue  # Hindi does not have a dict split
            f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}'
            f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}'
            if split not in ('training', 'dict'):
                f2 += '.0'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        name=f'JoshuaIndianCorpus_{split}',
                        filename='joshua-indian-parallel-corpora.tar.gz',
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
Example #15
    def get_stats(self, entry: Entry):
        path = self.get_entry(entry)
        parser = Parser(path, ext=entry.in_ext or None, ent=entry)
        count, skips, noise = 0, 0, 0
        toks = [0, 0]
        chars = [0, 0]
        for rec in parser.read_segs():
            if len(rec) < 2 or not rec[0] or not rec[1]:
                skips += 1
                continue
            if entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                noise += 1
                skips += 1
                continue
            count += 1
            s1, s2 = rec[:2]  # keep only the first two fields
            chars[0] += len(s1)
            chars[1] += len(s2)
            s1_tok, s2_tok = s1.split(), s2.split()
            toks[0] += len(s1_tok)
            toks[1] += len(s2_tok)

        l1, l2 = entry.did.langs
        l1, l2 = l1.lang, l2.lang
        assert count > 0, f'No valid records are found for {entry.did}'
        if l2 < l1:
            l1, l2 = l2, l1
            toks = toks[1], toks[0]
            chars = chars[1], chars[0]
        return {
            'id': str(entry.did),
            'segs': count,
            'segs_err': skips,
            'segs_noise': noise,
            f'{l1}_toks': toks[0],
            f'{l2}_toks': toks[1],
            f'{l1}_chars': chars[0],
            f'{l2}_chars': chars[1]
        }
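The token and character tallies above are plain whitespace splits and string lengths per side. A tiny self-contained sketch of the same counting on two toy segment pairs:

segs = [('Good morning', 'Guten Morgen'), ('Thank you', 'Danke')]
toks, chars = [0, 0], [0, 0]
for s1, s2 in segs:
    toks[0] += len(s1.split())
    toks[1] += len(s2.split())
    chars[0] += len(s1)
    chars[1] += len(s2)
print(toks, chars)   # [4, 3] [21, 17]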
Example #16
def load_all(index: Index):
    URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz"
    cite = """
@inproceedings{zhang-etal-2020-improving,
    title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation",
    author = "Zhang, Biao  and
      Williams, Philip  and
      Titov, Ivan  and
      Sennrich, Rico",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.acl-main.148",
    doi = "10.18653/v1/2020.acl-main.148",
    pages = "1628--1639",
}
@inproceedings{tiedemann2012parallel,
  title={Parallel Data, Tools and Interfaces in OPUS.},
  author={Tiedemann, J{\"o}rg},
  booktitle={Lrec},
  volume={2012},
  pages={2214--2218},
  year={2012}
}"""
    filename = 'opus-100-corpus-v1.0.tar.gz'
    code_map = dict(
        nb='nob',
        sh='hbs')  # these aren't obvious to the ISO lookup function, so map them explicitly
    for pair in supervised_v1:
        l1, l2 = pair.split("-")
        l1 = code_map.get(l1, l1)
        l2 = code_map.get(l2, l2)
        splits = ['train', 'dev', 'test']
        if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}:
            splits = ['train']  # the release is missing dev/test sets for these pairs
        for split in splits:
            f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}'
            f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=URL,
                        name=f'OPUS100v1_{split}',
                        filename=filename,
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
    for pair in zeroshot_v1:
        l1, l2 = pair.split("-")
        f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}'
        f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=URL,
                    name='OPUS100v1_test',
                    filename=filename,
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite)
        index.add_entry(ent)
Example #17
def load(index: Index):
    cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl')
    cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl')
    group_id = 'ParaCrawl'
    # === ParaCrawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='3',
                                langs=(l1, l2)),
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='6',
                                langs=(l1, l2)),
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus language pairs
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='6B',
                                langs=(l1, l2)),
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    l1 = 'en'
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz'
    for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split(
    ):
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='7.1',
                                langs=(l1, l2)),
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz'
    for pair in 'en-nb en-nn es-ca es-eu es-gl'.split():
        l1, l2 = pair.split('-')
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='7.1',
                                langs=(l1, l2)),
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz'
    for version, pairs in [
        ('v8.0', 'en-bg en-cs en-da en-de en-el'),
        ('v8.0-0001',
         'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl'
         ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl')
    ]:
        for pair in pairs.split():
            l1, l2 = pair.split('-')
            url = PARACRAWL_V8.format(version=version, pair=pair)
            ent = Entry(did=DatasetId(group=group_id,
                                      name=f'paracrawl',
                                      version='8',
                                      langs=(l1, l2)),
                        url=url,
                        cite=cite,
                        ext='tsv.gz')
            index.add_entry(ent)

    PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz'
    for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split(
    ):
        l1, l2 = pair.split('-')
        url = PARACRAWL_BONUS.format(pair=pair)
        ent = Entry(did=DatasetId(group=group_id,
                                  name=f'paracrawl',
                                  version='1_bonus',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)

    PARACRAWL_V9 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release9/{l1}-{l2}/{l1}-{l2}.txt.gz'
    for pair in (
            'en-bg en-cs en-da en-de en-el en-es en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv'
            ' en-mt en-nb en-nl en-nn en-pl en-pt en-ro en-sk en-sl en-sv es-ca es-eu es-gl'
    ).split():
        l1, l2 = pair.split('-')
        url = PARACRAWL_V9.format(l1=l1, l2=l2)
        ent = Entry(did=DatasetId(group=group_id,
                                  name=f'paracrawl',
                                  version='9',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)
    # this is a new addition in Sept 2021
    index.add_entry(
        Entry(
            did=DatasetId(group=group_id,
                          name=f'paracrawl',
                          version='1_bonus',
                          langs=('en', 'zh')),
            url=
            'http://web-language-models.s3-website-us-east-1.amazonaws.com/paracrawl/bonus/en-zh-v1.txt.gz',
            cite=cite,
            ext='tsv.gz'))

    # Japanese-English paracrawl (5.1) used by WMT20 and WMT21
    for version in ['2', '3']:
        ent = Entry(
            did=DatasetId(group='KECL',
                          name=f'paracrawl',
                          version=version,
                          langs=('eng', 'jpn')),
            in_paths=['en-ja/en-ja.bicleaner05.txt'],
            in_ext='tsv',
            cols=(2, 3),
            cite='',
            url=
            f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz'
        )
        index.add_entry(ent)
Example #18
def load(index: Index):
    WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    WMT14_CITE = """@proceedings{ws-2014-statistical,
        title = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
        editor = "Bojar, Ond{\v{r}}ej  and
          Buck, Christian  and
          Federmann, Christian  and
          Haddow, Barry  and
          Koehn, Philipp  and
          Monz, Christof  and
          Post, Matt  and
          Specia, Lucia",
        month = jun,
        year = "2014",
        address = "Baltimore, Maryland, USA",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W14-3300",
        doi = "10.3115/v1/W14-33",
    }"""
    for l1 in ['de', 'cs', 'fr', 'ru', 'es']:
        l2 = 'en'
        f1 = f'commoncrawl.{l1}-en.{l1}'
        f2 = f'commoncrawl.{l1}-en.en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name=f'wmt13_commoncrawl',
                  url=WMT13_CCRAWL,
                  filename='wmt13_parallel_commoncrawl.tgz',
                  in_paths=[f1, f2],
                  in_ext='txt',
                  cite=WMT14_CITE))

    # === WMT 13 release of europarl_v7 ===
    for l1 in ['cs', 'de', 'fr', 'es']:
        l2 = 'en'
        f1 = f'training/europarl-v7.{l1}-{l2}.{l1}'
        f2 = f'training/europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(
                langs=(l1, l2),
                name=f'wmt13_europarl_v7',
                url=
                "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
                filename="wmt13_europarl_v7.tgz",
                in_paths=[f1, f2],
                in_ext='txt',
                cite=WMT14_CITE))

    # ==== WMT 18  news commentary v13 ===
    for l1 in ['cs', 'de', 'ru', 'zh']:
        l2 = 'en'
        f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}'
        f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(
                langs=(l1, l2),
                name=f'wmt18_news_commentary_v13',
                url=
                "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
                filename="wmt18_news_commentary_v13.tgz",
                in_paths=[f1, f2],
                in_ext='txt',
                cite=WMT14_CITE))

    # === Europarl V9 corpus
    EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz'
    cite = r"""@inproceedings{koehn2005europarl,
      title={Europarl: A parallel corpus for statistical machine translation},
      author={Koehn, Philipp},
      booktitle={MT summit},
      volume={5},
      pages={79--86},
      year={2005},
      organization={Citeseer}
    }"""
    for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v9',
                  url=EUROPARL_v9 % (l1, l2),
                  cite=cite))

    # === Europarl V7 corpus
    EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz'
    cite = r"""@inproceedings{bojar-etal-2017-findings,
      title = "Findings of the 2017 Conference on Machine Translation ({WMT}17)",
      author = "Bojar, Ond{\v{r}}ej  and
        Chatterjee, Rajen  and
        Federmann, Christian  and
        Graham, Yvette  and
        Haddow, Barry  and
        Huang, Shujian  and
        Huck, Matthias  and
        Koehn, Philipp  and
        Liu, Qun  and
        Logacheva, Varvara  and
        Monz, Christof  and
        Negri, Matteo  and
        Post, Matt  and
        Rubino, Raphael  and
        Specia, Lucia  and
        Turchi, Marco",
      booktitle = "Proceedings of the Second Conference on Machine Translation",
      month = sep,
      year = "2017",
      address = "Copenhagen, Denmark",
      publisher = "Association for Computational Linguistics",
      url = "https://www.aclweb.org/anthology/W17-4717",
      doi = "10.18653/v1/W17-4717",
      pages = "169--214",
    }"""
    for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split(
    ):
        l2 = 'en'
        src = f'europarl-v7.{l1}-{l2}.{l1}'
        ref = f'europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v7',
                  in_paths=[src, ref],
                  url=EUROPARL_v7 % (l1, l2),
                  in_ext='txt',
                  cite=cite))

    # === Digital Corpus of European Parliament
    index.add_entry(
        Entry(
            langs=('lv', 'en'),
            name='wmt17_dcep_v1',
            in_paths=['*/*.lv', f'*/*.en'],
            cite=cite,
            url=
            'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz'))
    index.add_entry(
        Entry(
            langs=('lv', 'en'),
            name='wmt17_books_v1',
            in_paths=['*/*.lv', f'*/*.en'],
            cite=cite,
            url=
            'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz')
    )

    # === News Commentary v14
    NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz"
    cite = r"""@inproceedings{bojar-etal-2018-findings,
        title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",
        author = "Bojar, Ond{\v{r}}ej  and
          Federmann, Christian  and
          Fishel, Mark  and
          Graham, Yvette  and
          Haddow, Barry  and
          Koehn, Philipp  and
          Monz, Christof",
        booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
        month = oct,
        year = "2018",
        address = "Belgium, Brussels",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W18-6401",
        doi = "10.18653/v1/W18-6401",
        pages = "272--303"
    }"""
    for pair in [
            'ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id',
            'ar it', 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh',
            'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it',
            'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en',
            'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk',
            'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi',
            'en id', 'en it', 'en ja', 'en kk', 'en nl', 'en pt', 'en ru',
            'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja', 'es kk',
            'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it',
            'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id',
            'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk',
            'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt',
            'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru',
            'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='news_commentary_v14',
                  url=NEWSCOM_v14 % (l1, l2),
                  cite=cite))

    # ===== Wiki Titles V1
    WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz'
    cite = r"""@inproceedings{barrault-etal-2019-findings,
        title = "Findings of the 2019 Conference on Machine Translation ({WMT}19)",
        author = {Barrault, Lo{\"\i}c  and
          Bojar, Ond{\v{r}}ej  and
          Costa-juss{\`a}, Marta R.  and
          Federmann, Christian  and
          Fishel, Mark  and
          Graham, Yvette  and
          Haddow, Barry  and
          Huck, Matthias  and
          Koehn, Philipp  and
          Malmasi, Shervin  and
          Monz, Christof  and
          M{\"u}ller, Mathias  and
          Pal, Santanu  and
          Post, Matt  and
          Zampieri, Marcos},
        booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)",
        month = aug,
        year = "2019",
        address = "Florence, Italy",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W19-5301",
        doi = "10.18653/v1/W19-5301",
        pages = "1--61"
    }"""
    for pair in [
            'cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne',
            'kk en', 'lt en', 'ru en', 'zh en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='wiki_titles_v1',
                  url=WIKI_TITLES_v1 % (l1, l2),
                  cite=cite))

    # ===== Wiki Titles V2
    WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz'
    for pair in [
            'ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en',
            'pl en', 'ps en', 'ru en', 'ta en', 'zh en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='wiki_titles_v2',
                  url=WIKI_TITLES_v2 % (l1, l2),
                  cite=cite))

    # ==== WMT  Dev and Tests
    wmt_sets = {
        'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'),
                         ('ru', 'en'), ('hi', 'en')],
        'newsdev2015': [('fi', 'en'), ('en', 'fi')],
        'newstest2015':
        [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'),
         ('de', 'en'), ('ru', 'en'), ('en', 'fi')],
        'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'),
                        ('en', 'tr')],
        'newstest2016':
        [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 'fi'), ('ro', 'en'),
         ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'),
         ('en', 'tr'), ('en', 'cs')],
        'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'),
                        ('en', 'lv')],
        'newstest2017':
        [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'),
         ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'),
         ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newsdev2018': [('et', 'en'), ('en', 'et')],
        'newstest2018':
        [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'),
         ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'),
         ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')],
        'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'),
                        ('lt', 'en'), ('en', 'gu')],
        'newstest2019':
        [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'),
         ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'),
         ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'),
         ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')],
        'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'),
                        ('en', 'iu'), ('en', 'ja'), ('ja', 'en'), ('en', 'pl')]
    }
    for set_name, pairs in wmt_sets.items():
        for l1, l2 in pairs:
            src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
            ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
            name = f'{set_name}_{l1}{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[src, ref],
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                    cite=cite))
    # Multi parallel
    wmt_sets = {
        '2009': ['en', 'cs', 'de', 'es', 'fr'],
        '2010': ['en', 'cs', 'de', 'es', 'fr'],
        '2011': ['en', 'cs', 'de', 'es', 'fr'],
        '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
        '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
    }
    for year, langs in wmt_sets.items():
        for l1, l2 in itertools.combinations(langs, 2):
            name = f'newstest{year}'
            f1 = f'dev/{name}.{l1}'
            f2 = f'dev/{name}.{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite,
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz'
                ))

    for l1, l2 in [('ps', 'en'), ('km', 'en')]:
        for set_name in ['wikipedia.dev', 'wikipedia.devtest']:
            src = f'dev/{set_name}.{l1}-{l2}.{l1}'
            ref = f'dev/{set_name}.{l1}-{l2}.{l2}'
            name = f'{set_name.replace(".", "_")}_{l1}{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[src, ref],
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                    in_ext='txt',
                    cite=cite))

    # ==== TED Talks 2.0 ar-en
    index.add_entry(
        Entry(
            ('en', 'ar'),
            'tedtalks_v2_clean',
            ext='tsv.xz',
            url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz'))

    # ==== Europarl v10
    EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz"
    wmt20_cite = None  # TODO: update
    for pair in [
            'cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en',
            'lt en', 'pl en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v10',
                  url=EP_v10 % (l1, l2),
                  cite=wmt20_cite))

    # ==== PMIndia V1
    PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv"
    cite = r"""@ARTICLE{2020arXiv200109907H,
           author = {{Haddow}, Barry and {Kirefu}, Faheem},
            title = "{PMIndia -- A Collection of Parallel Corpora of Languages of India}",
          journal = {arXiv e-prints},
         keywords = {Computer Science - Computation and Language},
             year = "2020",
            month = "Jan",
              eid = {arXiv:2001.09907},
            pages = {arXiv:2001.09907},
    archivePrefix = {arXiv},
           eprint = {2001.09907}
    }"""
    for pair in [
            "as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en",
            "mr en", "or en", "pa en", "ta en", "te en", "ur en"
    ]:
        l1, l2 = pair.split()
        # Note: listed as xx-en in the URL but actually en-xx in the TSV; and it's not compressed!
        index.add_entry(
            Entry(langs=(l2, l1),
                  name='pmindia_v1',
                  url=PMINDIA_v1 % (l1, l2),
                  cite=cite))

    # Pashto - English  pseudo parallel dataset for alignment
    index.add_entry(
        Entry(
            langs=('en', 'ps'),
            name='wmt20_enps_aligntask',
            url=
            'http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz',
            cite=wmt20_cite,
            ext='tsv.xz'))

    # Pashto - English  mostly parallel dataset
    for name in [
            "GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps",
            "bible.en-ps.clean", "ted-wmt20.en-ps", "wikimedia.en-ps"
    ]:
        ps = f'ps-parallel/{name}.ps'
        en = f'ps-parallel/{name}.en'
        url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz'
        name = name.replace('.en-ps',
                            '').replace('.', '_').replace('-', '_').lower()
        entry = Entry(langs=('ps', 'en'),
                      name=name,
                      url=url,
                      cite=wmt20_cite,
                      in_paths=[ps, en],
                      filename='wmt20-psen-parallel.tgz',
                      in_ext='txt')
        index.add_entry(entry)
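The WMT dev/test block above builds SGM paths from a set name and a language pair. A small sketch of the path construction for one pair:

set_name, l1, l2 = 'newstest2014', 'de', 'en'
src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
print(src)   # dev/newstest2014-deen-src.de.sgm
print(ref)   # dev/newstest2014-deen-ref.en.sgm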
Example #19
def load(index: Index):
    cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl')
    cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl')
    # === ParaCrawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v3',
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus language pairs
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    l1 = 'en'
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz'
    for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split(
    ):
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v7_1',
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz'
    for pair in 'en-nb en-nn es-ca es-eu es-gl'.split():
        l1, l2 = pair.split('-')
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v7',
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz'
    for version, pairs in [
        ('v8.0', 'en-bg en-cs en-da en-de en-el'),
        ('v8.0-0001',
         'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl'
         ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl')
    ]:
        for pair in pairs.split():
            l1, l2 = pair.split('-')
            url = PARACRAWL_V8.format(version=version, pair=pair)
            ent = Entry(langs=(l1, l2),
                        name='paracrawl_v8',
                        url=url,
                        cite=cite,
                        ext='tsv.gz')
            index.add_entry(ent)

    PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz'
    for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split(
    ):
        l1, l2 = pair.split('-')
        url = PARACRAWL_BONUS.format(pair=pair)
        ent = Entry(langs=(l1, l2),
                    name='paracrawl_bonus',
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)
Example #20
File: other.py Project: kpu/mtdata
def load_all(index: Index):

    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb')
    l1, l2 = 'hi', 'en'
    for version, prefix in [
            #('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
        ('v1_5',
         'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download'
         )
    ]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    name=f'IITB{version}_train',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[
                        f'parallel/IITB.en-hi.{l1}',
                        f'parallel/IITB.en-hi.{l2}'
                    ])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        name=f'IITB{version}_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = index.ref_db.get_bibtex('neubig11kftt')
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename="kftt-data-1.0.tar.gz",
                    name=f'kftt_v1_{split}',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite)
        index.add_entry(ent)

    url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip"
    cite = index.ref_db.get_bibtex('ding2020a')
    for split in ['dev', 'test', 'train']:
        ent = Entry(langs=('my', 'en'),
                    url=url,
                    name=f'WAT2020_ALT_{split}',
                    in_ext='txt',
                    cite=cite,
                    filename='wat2020.my-en.zip',
                    in_paths=[
                        f'wat2020.my-en/alt/{split}.alt.my',
                        f'wat2020.my-en/alt/{split}.alt.en'
                    ])
        index.add_entry(ent)

    l1, l2 = 'iu', 'en'
    url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60"
    cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut')
    for split in ['dev', 'devtest', 'test', 'train']:
        path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}'
        if split != 'train':
            path_pref += '-dedup'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    name=f'NunavutHansard_v3_{split}',
                    in_ext='txt',
                    cite=cite,
                    filename='NunavutHansard_iuen_v3.tgz',
                    in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}'])
        index.add_entry(ent)

    # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122
    url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip"
    cite = index.ref_db.get_bibtex('Khresmoi')
    langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"]
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(
                langs=(l1, l2),
                url=url,
                name='Khresmoi_Summary_Test_v2',
                filename='khresmoi-summary-test-set-2.0.zip',
                cite=cite,
                in_paths=[
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}",
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}"
                ],
                in_ext='txt')
            index.add_entry(ent)
            ent = Entry(
                langs=(l1, l2),
                url=url,
                name='Khresmoi_Summary_Dev_v2',
                filename='khresmoi-summary-test-set-2.0.zip',
                cite=cite,
                in_paths=[
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}",
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}"
                ],
                in_ext='txt')
            index.add_entry(ent)
Example #21
def load(index: Index):
    group_id = 'Statmt'
    WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    WMT14_CITE = index.ref_db.get_bibtex('ws-2014-statistical')
    for l1 in ['de', 'cs', 'fr', 'ru', 'es']:
        l2 = 'en'
        f1 = f'commoncrawl.{l1}-en.{l1}'
        f2 = f'commoncrawl.{l1}-en.en'
        data_id = DatasetId(group=group_id, name='commoncrawl_wmt13', version='1', langs=(l1, l2))
        index.add_entry(Entry(did=data_id, url=WMT13_CCRAWL,
                              filename='wmt13_parallel_commoncrawl.tgz',
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # === WMT 13 release of europarl_v7 ===
    for l1 in ['cs', 'de', 'fr', 'es']:
        l2 = 'en'
        f1 = f'training/europarl-v7.{l1}-{l2}.{l1}'
        f2 = f'training/europarl-v7.{l1}-{l2}.{l2}'
        data_id = DatasetId(group=group_id, name='europarl_wmt13', version='7', langs=(l1, l2))
        index.add_entry(Entry(did=data_id, url="http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
                              filename="wmt13_europarl_v7.tgz",
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # ==== WMT 18  news commentary v13 ===
    for l1 in ['cs', 'de', 'ru', 'zh']:
        l2 = 'en'
        f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}'
        f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}'
        data_id = DatasetId(group=group_id, name='news_commentary_wmt18', version='13', langs=(l1, l2))
        index.add_entry(Entry(did=data_id,
                              url="http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
                              filename="wmt18_news_commentary_v13.tgz",
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # === Europarl V9 corpus
    EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz'
    cite = index.ref_db.get_bibtex('koehn2005europarl')
    for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='europarl', version='9', langs=(l1, l2)),
                              url=EUROPARL_v9 % (l1, l2), cite=cite))

    # === Europarl V7 corpus
    EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz'
    cite = index.ref_db.get_bibtex('bojar-etal-2017-findings')
    for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split():
        l2 = 'en'
        src = f'europarl-v7.{l1}-{l2}.{l1}'
        ref = f'europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='europarl', version='7', langs=(l1, l2)), in_paths=[src, ref],
                              url=EUROPARL_v7 % (l1, l2), in_ext='txt', cite=cite))

    # === Digital Corpus of European Parliament and Books (lv-en, WMT17)
    l1, l2 = 'lv', 'en'  # both corpora are Latvian-English
    index.add_entry(Entry(did=DatasetId(group=group_id, name='dcep_wmt17', version='1', langs=(l1, l2)),
                          in_paths=['*/*.lv', '*/*.en'], cite=cite, in_ext='txt',
                          url='http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz'))
    index.add_entry(Entry(did=DatasetId(group=group_id, name='books_wmt17', version='1', langs=(l1, l2)),
                          in_paths=['*/*.lv', '*/*.en'], cite=cite, in_ext='txt',
                          url='http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz'))

    # === News Commentary v14
    NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz"
    cite = index.ref_db.get_bibtex('bojar-etal-2018-findings')
    for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it',
                 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es',
                 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru',
                 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk',
                 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it',
                 'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id',
                 'es it', 'es ja', 'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id',
                 'fr it', 'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it',
                 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk', 'id nl', 'id pt', 'id ru',
                 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl',
                 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh']:
        l1, l2 = pair.split()
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='news_commentary', version='14', langs=(l1, l2)),
            url=NEWSCOM_v14 % (l1, l2), cite=cite))

    for v in [15, 16]:
        cite = index.ref_db.get_bibtex('barrault-etal-2020-findings')
        url = f"http://data.statmt.org/news-commentary/v{v}/training/news-commentary-v{v}.%s-%s.tsv.gz"
        for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it', 'ar ja', 'ar kk', 'ar nl',
                     'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja',
                     'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it',
                     'de ja', 'de kk', 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it',
                     'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja',
                     'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it', 'fr ja', 'fr kk', 'fr nl',
                     'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id ja',
                     'id kk', 'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja pt',
                     'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh',
                     'ru zh']:
            l1, l2 = pair.split()
            index.add_entry(Entry(did=DatasetId(group=group_id, name='news_commentary', version=f'{v}', langs=(l1, l2)),
                url=url % (l1, l2), cite=cite))


    # ===== Wiki Titles V1
    WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz'
    cite = index.ref_db.get_bibtex('barrault-etal-2019-findings')
    for pair in ['cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne', 'kk en', 'lt en',
                 'ru en', 'zh en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='1', langs=(l1, l2)),
                              url=WIKI_TITLES_v1 % (l1, l2), cite=cite))

    # ===== Wiki Titles V2
    WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz'
    for pair in ['ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en', 'pl en', 'ps en',
                 'ru en', 'ta en', 'zh en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='2', langs=(l1, l2)),
                              url=WIKI_TITLES_v2 % (l1, l2), cite=cite))

    WIKI_TITLES_v3 = 'http://data.statmt.org/wikititles/v3/wikititles-v3.{pair}.tsv'
    langs = 'bn-hi ca-es ca-pt ca-ro cs-en de-en de-fr es-pt es-ro ha-en ig-en is-en ja-en ps-en pt-ro ru-en xh-zu zh-en'
    for pair in langs.split():
        l1, l2 = pair.split('-')
        url = WIKI_TITLES_v3.format(pair=pair)
        ent = Entry(did=DatasetId(group=group_id, name=f'wikititles', version='3', langs=(l1, l2)), url=url, cite=cite)
        index.add_entry(ent)

    # ==== WMT  Dev and Tests
    wmt_sets = {
        'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'), ('ru', 'en'), ('hi', 'en')],
        'newsdev2015': [('fi', 'en'), ('en', 'fi')],
        'newstest2015': [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'),
                         ('de', 'en'), ('ru', 'en'), ('en', 'fi')],
        'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'), ('en', 'tr')],
        'newstest2016': [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 'fi'), ('ro', 'en'),
                         ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'),
                         ('en', 'tr'), ('en', 'cs')],
        'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newstest2017': [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'),
                         ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'),
                         ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newsdev2018': [('et', 'en'), ('en', 'et')],
        'newstest2018': [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'),
                         ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'),
                         ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')],
        'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'), ('lt', 'en'),
                        ('en', 'gu')],
        'newstest2019': [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'),
                         ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'),
                         ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'),
                         ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')],
        'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'), ('en', 'iu'),
                        ('en', 'ja'), ('ja', 'en'), ('en', 'pl')]
    }
    for set_name, pairs in wmt_sets.items():
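        # e.g. 'newstest2014' -> sub_name='newstest', year='2014'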
        sub_name, year = set_name[:-4], set_name[-4:]
        for l1, l2 in pairs:
            src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
            ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
            name = f'{sub_name}_{l1}{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)),
                                  filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='sgm',
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                                  cite=cite))

    # Multi parallel
    wmt_sets = {
        '2009': ['en', 'cs', 'de', 'es', 'fr'],
        '2010': ['en', 'cs', 'de', 'es', 'fr'],
        '2011': ['en', 'cs', 'de', 'es', 'fr'],
        '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
        '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
    }
    for year, langs in wmt_sets.items():
        name = 'newstest'
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'dev/{name}{year}.{l1}'
            f2 = f'dev/{name}{year}.{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)),
                                    filename='wmt20dev.tgz', in_paths=[f1, f2], in_ext='txt', cite=cite,
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz'))

    for l1, l2 in [('ps', 'en'), ('km', 'en')]:
        for set_name in ['wikipedia.dev', 'wikipedia.devtest']:
            src = f'dev/{set_name}.{l1}-{l2}.{l1}'
            ref = f'dev/{set_name}.{l1}-{l2}.{l2}'
            name = f'{set_name.replace(".", "_")}_{l1}{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version='1', langs=(l1, l2)),
                                 filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='txt', cite=cite,
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz'))

    #### WMT 20 Tests
    url = "http://data.statmt.org/wmt20/translation-task/test.tgz"
    wmt20_cite = index.ref_db.get_bibtex('barrault-etal-2020-findings')
    for _pref, pairs in {
        "": ["csen", "deen", "defr", "encs", "ende", "eniu", "enja", "enkm", "enpl", "enps",
             "enru", "enta", "enzh", "frde", "iuen", "jaen", "kmen", "plen", "psen", "ruen",
             "taen", "zhen"],
        "B": ["deen", "ende", "enzh", "ruen", "zhen"]}.items():
        year = "2020"
        name = f'newstest{_pref}'
        for pair in pairs:
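            # e.g. 'csen' -> l1='cs', l2='en'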
            l1, l2 = pair[:2], pair[2:]
            f1 = f'sgm/{name}{year}-{pair}-src.{l1}.sgm'
            f2 = f'sgm/{name}{year}-{pair}-ref.{l2}.sgm'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=f'{name}_{pair}'.lower(), version=year, langs=(l1, l2)),
                filename='wmt20tests.tgz', in_paths=[f1, f2], in_ext='sgm', cite=wmt20_cite, url=url))

    # WMT 21 Dev
    url = "http://data.statmt.org/wmt21/translation-task/dev.tgz"
    pairs = "en-ha en-is is-en ha-en".split()
    for pair in pairs:
        l1, l2 = pair.split('-')
        in_path = f'dev/xml/newsdev2021.{l1}-{l2}.xml'
        ent = Entry(did=DatasetId(group=group_id, name=f'newsdev_{l1}{l2}', version='2021', langs=(l1, l2)),
                    filename='wmt21dev.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url)
        index.add_entry(ent)

    url = "http://data.statmt.org/wmt21/translation-task/test.tgz"
    pairs = 'bn-hi hi-bn xh-zu zu-xh cs-en de-en de-fr en-cs en-de en-ha en-is en-ja en-ru en-zh fr-de ha-en is-en ja-en ru-en zh-en'.split()
    for pair in pairs:
        l1, l2 = pair.split('-')
        name = 'newstest'
        if pair in 'bn-hi hi-bn xh-zu zu-xh':
            name = 'florestest'
        in_path = f'test/{name}2021.{l1}-{l2}.xml'
        ent = Entry(did=DatasetId(group=group_id, name=f'{name}_{l1}{l2}', version='2021', langs=(l1, l2)),
                    filename='wmt21tests.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url)
        index.add_entry(ent)

    # ==== TED Talks 2.0 ar-en
    index.add_entry(Entry(did=DatasetId(group=group_id, name='tedtalks', version='2_clean', langs=('en', 'ar')),
                         ext='tsv.xz', url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz'))

    # ==== Europarl v10
    EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz"
    for pair in ['cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en', 'lt en', 'pl en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name=f'europarl', version='10', langs=(l1, l2)),
                url=EP_v10 % (l1, l2), cite=wmt20_cite))

    # ==== PMIndia V1
    PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv"
    cite = index.ref_db.get_bibtex('Haddow-etal-2020-PMIndia')
    for pair in ["as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en", "mr en", "or en",
                 "pa en", "ta en", "te en", "ur en"]:
        l1, l2 = pair.split()
        # Note: the URL lists the pair as xx-en, but the TSV is actually en-xx; also, it is not compressed!
        index.add_entry(Entry(did=DatasetId(group=group_id, name='pmindia', version='1', langs=(l2, l1)),
                              url=PMINDIA_v1 % (l1, l2), cite=cite))

    # Pashto - English  pseudo parallel dataset for alignment
    index.add_entry(Entry(did=DatasetId(group=group_id, name=f'wmt20_enps_aligntask', version='1', langs=('en', 'ps')),
                          url='http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz',
                          cite=wmt20_cite, ext='tsv.xz'))

    # Pashto - English  mostly parallel dataset
    for name in ["GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps", "bible.en-ps.clean",
                 "ted-wmt20.en-ps", "wikimedia.en-ps"]:
        ps = f'ps-parallel/{name}.ps'
        en = f'ps-parallel/{name}.en'
        url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz'
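        # normalize the dataset name, e.g. 'bible.en-ps.clean' -> 'bible_clean'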
        name = name.replace('.en-ps', '').replace('.', '_').replace('-', '_').lower()
        entry = Entry(did=DatasetId(group=group_id, name=name, version='1', langs=('ps', 'en')), url=url,
                      cite=wmt20_cite, in_paths=[ps, en], filename='wmt20-psen-parallel.tgz', in_ext='txt')
        index.add_entry(entry)

    for l2 in ['ps', 'km']:
        url = f"http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-{l2}.xz"
        entry = Entry(did=DatasetId(group=group_id, name='paracrawl', version='5.1', langs=('en', l2)),
                    url=url, cite=wmt20_cite, ext='tsv.xz', cols=(0, 1))
        index.add_entry(entry)

    # for ja-en only TED was available
    index.add_entry(Entry(url="http://data.statmt.org/wmt20/translation-task/ja-en/ted.en-ja.tgz",
                    did=DatasetId(group=group_id, name='ted', version='wmt20', langs=('en', 'ja')),
                    cite=wmt20_cite, ext='tgz', in_ext='txt',
                    in_paths=['en-ja/train.tags.en-ja.en', 'en-ja/train.tags.en-ja.ja']))

    ccalign_cite = index.ref_db.get_bibtex('chaudhary-EtAl:2019:WMT')
    CC_ALIGNED = 'http://www.statmt.org/cc-aligned/sentence-aligned/{src}-{tgt}.tsv.xz'
    tgts='es_XX et_EE fa_IR ff_NG fi_FI fr_XX gu_IN ha_NG he_IL hi_IN hr_HR ht_HT hu_HU hy_AM id_ID ig_NG is_IS it_IT ja_XX jv_ID ka_GE kg_AO kk_KZ km_KH kn_IN ko_KR ku_TR ky_KG lg_UG ln_CD lo_LA lt_LT lv_LV mg_MG mi_NZ mk_MK ml_IN mn_MN mr_IN ms_MY mt_MT my_MM ne_NP nl_XX no_XX ns_ZA ny_MW om_KE or_IN pa_IN pl_PL ps_AF pt_XX qa_MM qd_MM ro_RO ru_RU si_LK sk_SK sl_SI sn_ZW so_SO sq_AL sr_RS ss_SZ st_ZA su_ID sv_SE sw_KE sz_PL ta_IN te_IN tg_TJ th_TH ti_ET tl_XX tn_BW tr_TR ts_ZA tz_MA uk_UA ur_PK ve_ZA vi_VN wo_SN xh_ZA yo_NG zh_CN zh_TW zu_ZA zz_TR'.split()
    srcs = 'af_ZA ak_GH am_ET ar_AR as_IN ay_BO az_AZ az_IR be_BY bg_BG bm_ML bn_IN br_FR bs_BA ca_ES cb_IQ cs_CZ cx_PH cy_GB da_DK de_DE el_GR'.split()
    pairs = [('en_XX', tgt) for tgt in tgts] + [(src, 'en_XX') for src in srcs]
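    # CCAligned is English-centric: en_XX appears on one side of every pair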
    dont_know = {'qa', 'qd'}   # they look like Myanmar language codes, but it is unclear which languages they are.
    # They cannot be found in ISO 639-1: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    #                 nor in lingo: http://www.lingoes.net/en/translator/langcode.htm
    #                 nor in web-info: https://wp-info.org/tools/languagecodes.php
    # unsupported = {'zh_TW', 'az_IR'}
    # Country locales are not supported since they create conflicts; keeping the larger ones instead.
    for src, tgt in pairs:
        # l1, l2 = src.split('_')[0], tgt.split('_')[0]
        if src[:2] in dont_know or tgt[:2] in dont_know:   # skip the codes whose language is unknown
            continue
        url = CC_ALIGNED.format(src=src, tgt=tgt)
        entry = Entry(did=DatasetId(group=group_id, name='ccaligned', version='1', langs=(src, tgt)), url=url,
                      cite=ccalign_cite, ext='tsv.xz', cols=(0, 1))
        index.add_entry(entry)

    wmt21_cite = 'WMT21'  # unavailable at the time of adding
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'khamenei', version='wmt21', langs=('ha','en')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/ha-en/khamenei.v1.ha-en.tsv', ext='tsv', cols=(2, 3)))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'opus', version='wmt21', langs=('ha', 'en')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/ha-en/opus.ha-en.tsv', ext='tsv', cols=(1, 0)))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ha')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ha.bifixed.dedup.laser.filter-0.9.xz',
        ext='tsv.xz', cols=[1, 2]))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ru')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ru.bifixed.dedup.filter-1.1.xz',
        ext='tsv.xz', cols=[0, 1]))

    for pair in ['bn-hi', 'xh-zu']:
        l1, l2 = pair.split('-')
        url = f'http://data.statmt.org/wmt21/translation-task/cc-aligned/{pair}.tsv.xz'
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='ccaligned', version='wmt21', langs=(l1, l2)), cite=wmt21_cite,
            url=url, ext='tsv.xz', cols=(0, 1)))  # column order assumed to match the CCAligned TSVs above

    # https://data.statmt.org/wmt19/translation-task/fr-de/bitexts/de-fr.bicleaner07.de.gz
    # The 'commoncrawl' and 'europarl-v7' base file names are assumed from the directory's naming pattern.
    for cln_name, name in [('commoncrawl', 'commoncrawl'), ('paracrawl', 'de-fr.bicleaner07'),
                           ('europarl_v7', 'europarl-v7')]:
        l1, l2 = 'fr', 'de'
        prefix = 'https://data.statmt.org/wmt19/translation-task/fr-de/bitexts'
        index.add_entry(Entry(did=DatasetId(group=group_id, name=cln_name, version='wmt19', langs=(l1, l2)),
                              ext='txt.gz', url=(f'{prefix}/{name}.{l1}.gz', f'{prefix}/{name}.{l2}.gz')))

    # Back Translation
    prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/zh-en'
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_enzh', version='wmt20', langs=('en', 'zh')),
        ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.translatedto.zh.gz')))

    prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/ru-en'
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_enru', version='wmt20', langs=('en', 'ru')),
                      ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.en.translatedto.ru.gz')))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_ruen', version='wmt20', langs=('ru', 'en')),
        ext='txt.gz', url=(f'{prefix}/news.ru.gz', f'{prefix}/news.ru.translatedto.en.gz')))
Exemplo n.º 22
0
def load_all(index: Index):
    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb')
    l1, l2 = 'hi', 'en'
    for version, prefix in [
        # ('1.0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
        ('1.5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download')]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(did=DatasetId(group='IITB', name=f'hien_train', version=version, langs=(l1, l2)),
                    url=url, filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    in_ext='txt', cite=cite,
                    in_paths=[f'parallel/IITB.en-hi.{l1}',
                              f'parallel/IITB.en-hi.{l2}'])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(did=DatasetId(group='IITB', name=f'hien_{split}', version=version, langs=(l1, l2)),
                        url=url, filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        in_ext='txt', in_paths=[f1, f2], cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = index.ref_db.get_bibtex('neubig11kftt')
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(did=DatasetId(group='Phontron', name=f'kftt_{split}', version='1', langs=(l1, l2)),
                    url=url, filename="kftt-data-1.0.tar.gz", in_ext='txt', in_paths=[f1, f2], cite=cite)
        index.add_entry(ent)

    url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip"
    cite = index.ref_db.get_bibtex('ding2020a')
    for split in ['dev', 'test', 'train']:
        ent = Entry(did=DatasetId(group='WAT', name=f'alt_{split}', version='2020', langs=('my', 'en')),
                    url=url, in_ext='txt', cite=cite, filename='wat2020.my-en.zip',
                    in_paths=[f'wat2020.my-en/alt/{split}.alt.my', f'wat2020.my-en/alt/{split}.alt.en'])
        index.add_entry(ent)

    l1, l2 = 'iu', 'en'
    url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60"
    cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut')
    for split in ['dev', 'devtest', 'test', 'train']:
        path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}'
        if split != 'train':
            path_pref += '-dedup'
        ent = Entry(did=DatasetId(group='NRC_CA', name=f'nunavut_hansard_{split}', version='3', langs=(l1, l2)),
                    url=url, in_ext='txt', cite=cite, filename='NunavutHansard_iuen_v3.tgz',
                    in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}'])
        index.add_entry(ent)

    # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122
    url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip"
    cite = index.ref_db.get_bibtex('Khresmoi')
    langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"]
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_test', version='2', langs=(l1, l2)),
                        url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_ext='txt',
                        in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}",
                                  f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}"])
            index.add_entry(ent)
            ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_dev', version='2', langs=(l1, l2)),
                        url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_ext='txt',
                        in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}",
                                  f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}"])
            index.add_entry(ent)

    jesc_cite = index.ref_db.get_bibtex('pryzant_jesc_2018')
    for split in ['train', 'dev', 'test']:
        ent = Entry(url='https://nlp.stanford.edu/projects/jesc/data/split.tar.gz',
                    did=DatasetId(group='StanfordNLP', name=f'jesc_{split}', version='1', langs=('en', 'ja')),
                    filename='jesc-split.tar.gz', in_ext='tsv', in_paths=[f"split/{split}"], cite=jesc_cite)
        index.add_entry(ent)

    prefix = 'https://nlp.stanford.edu/projects/nmt/data'
    for name, subdir, src, tgt, cite_key in [
        ("wmt15_train", "wmt15.en-cs", "train.en", "train.cs", "luong2016acl_hybrid"),
        ("newstest2013", "wmt15.en-cs", "newstest2013.en", "newstest2013.cs", "luong2016acl_hybrid"),
        ("newstest2014", "wmt15.en-cs", "newstest2014.en", "newstest2014.cs", "luong2016acl_hybrid"),
        ("newstest2015", "wmt15.en-cs", "newstest2015.en", "newstest2015.cs", "luong2016acl_hybrid"),
        ("wmt14_train", "wmt14.en-de", "train.en", "train.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2012", "wmt14.en-de", "newstest2012.en", "newstest2012.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2013", "wmt14.en-de", "newstest2013.en", "newstest2013.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2014", "wmt14.en-de", "newstest2014.en", "newstest2014.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2015", "wmt14.en-de", "newstest2015.en", "newstest2015.de", "luong-pham-manning:2015:EMNLP"),
        ("iwslt15_train", "iwslt15.en-vi", "train.en", "train.vi", "Luong-Manning:iwslt15"),
        ("test2012", "iwslt15.en-vi", "tst2012.en", "tst2012.vi", "Luong-Manning:iwslt15"),
        ("test2013", "iwslt15.en-vi", "tst2013.en", "tst2013.vi", "Luong-Manning:iwslt15")]:
        l1, l2 = src.split(".")[-1], tgt.split(".")[-1]
        url1 = f"{prefix}/{subdir}/{src}"
        url2 = f"{prefix}/{subdir}/{tgt}"
        cite = index.ref_db.get_bibtex(cite_key)
        ent = Entry(did=DatasetId(group='StanfordNLP', name=name, version='1', langs=(l1, l2)),
                    ext='txt', url=(url1, url2), cite=cite)
        index.add_entry(ent)

    _url = 'https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip'
    cite = index.ref_db.get_bibtex('Barkarson-et-al-2020')
    for sub in ['eea train dev test', 'ema train dev test', 'opensubtitles dev test']:
        l1, l2 = 'en', 'is'
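        # e.g. 'eea train dev test' -> sub='eea', splits=['train', 'dev', 'test']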
        sub, *splits = sub.split()
        for split in splits:
            in_paths = [f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l1}.csv',
                        f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l2}.csv']
            if split == 'train' and sub == 'eea':
                in_paths = [in_paths[1], in_paths[0]]  # the en/is files are swapped upstream for this subset
            ent = Entry(did=DatasetId(group='ParIce', name=f'{sub}_{split}', version='20.05', langs=(l1, l2)),
                        url=_url, ext='zip', in_ext='txt', in_paths=in_paths, cite=cite,
                        filename='Parice_dev_test.20.05.zip')
            index.add_entry(ent)

    # https://github.com/bonaventuredossou/ffr-v1/tree/master/FFR-Dataset/FFR%20Dataset%20v2
    _url = 'https://raw.githubusercontent.com/bonaventuredossou/ffr-v1/master/FFR-Dataset/FFR%20Dataset%20v2/ffr_dataset_v2.txt'
    cite = index.ref_db.get_bibtex("emezue-dossou-2020-ffr")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'ffr', version='2', langs=('fon', 'fra')),
                url=_url, ext='tsv', cite=cite)
    index.add_entry(ent)

    # https://zenodo.org/record/4432712
    _url = 'https://zenodo.org/record/4432712/files/Fon_French_Parallel_Data_25377.csv?download=1'
    cite = index.ref_db.get_bibtex("dossou2021crowdsourced")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'daily_dialogues', version='1', langs=('fon', 'fra')),
                url=_url, ext='csvwithheader', cite=cite)
    index.add_entry(ent)
Exemplo n.º 23
0
def load_all(index: Index):
    group = 'AI4Bharath'
    cite = index.ref_db.get_bibtex('ramesh2021samanantar')
    pairs = (
        'en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi'
        ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa'
        ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or'
        ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te'
        ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te')
    BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs.strip().split(' '):
        l1, l2 = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        url = BASE_v0_2.format(dirname=dirname, pair=pair)

        ent = Entry(did=DatasetId(group=group,
                                  name=f'samananthar',
                                  version='0.2',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'],
                    in_ext='txt')
        index.add_entry(ent)

    URL = "https://storage.googleapis.com/samanantar-public/benchmarks.zip"
    filename = "samananthar-benchmarks.zip"
    for split in ('dev', 'test'):
        want20_langs = 'bn gu hi ml mr ta te'.split()
        for l2 in want20_langs:
            f1 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.en'
            f2 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.{l2}'
            ent = Entry(did=DatasetId(group=group,
                                      name=f'wat_{split}',
                                      version='2020',
                                      langs=('en', l2)),
                        filename=filename,
                        url=URL,
                        cite=cite,
                        in_paths=[f1, f2],
                        in_ext='txt')
            index.add_entry(ent)

        wat21_langs = 'bn en gu hi kn ml mr or pa ta te'.split()
        for i, l1 in enumerate(wat21_langs):
            for l2 in wat21_langs[i + 1:]:
                f1 = f'benchmarks/wat2021-devtest/{split}.{l1}'
                f2 = f'benchmarks/wat2021-devtest/{split}.{l2}'
                ent = Entry(did=DatasetId(group=group,
                                          name=f'wat_{split}',
                                          version='2021',
                                          langs=(l1, l2)),
                            filename=filename,
                            url=URL,
                            cite=cite,
                            in_paths=[f1, f2],
                            in_ext='txt')
                index.add_entry(ent)

        # PMI langs; en-as
        index.add_entry(
            Entry(did=DatasetId(group=group,
                                name=f'pmi_{split}',
                                version='2021',
                                langs=('en', 'as')),
                  filename=filename,
                  url=URL,
                  cite=cite,
                  in_ext='txt',
                  in_paths=[
                      f'benchmarks/pmi/en-as/{split}.en',
                      f'benchmarks/pmi/en-as/{split}.as'
                  ]))
Exemplo n.º 24
0
def load_all(index: Index):

    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = """@article{DBLP:journals/corr/abs-1710-02855,
  author    = {Anoop Kunchukuttan and
               Pratik Mehta and
               Pushpak Bhattacharyya},
  title     = {The {IIT} Bombay English-Hindi Parallel Corpus},
  journal   = {CoRR},
  volume    = {abs/1710.02855},
  year      = {2017},
  url       = {http://arxiv.org/abs/1710.02855},
  archivePrefix = {arXiv},
  eprint    = {1710.02855},
  timestamp = {Mon, 13 Aug 2018 16:48:50 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1710-02855.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}"""
    l1, l2 = 'hi', 'en'
    for version, prefix in [
            #('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
        ('v1_5',
         'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download'
         )
    ]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    name=f'IITB{version}_train',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[
                        f'parallel/IITB.en-hi.{l1}',
                        f'parallel/IITB.en-hi.{l2}'
                    ])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        name=f'IITB{version}_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = """@misc{neubig11kftt,
    author = {Graham Neubig},
    title = {The {Kyoto} Free Translation Task},
    howpublished = {http://www.phontron.com/kftt},
    year = {2011}
    }"""
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename="kftt-data-1.0.tar.gz",
                    name=f'kftt_v1_{split}',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite)
        index.add_entry(ent)
Exemplo n.º 25
0
def load_all(index: Index):
    data="""an-ca an-de an-en an-es an-fr an-gl an-it an-pl an-pt an-ru ar-arz ar-az ar-ba ar-be ar-bg ar-bn ar-br ar-bs ar-ca ar-ceb
 ar-cs ar-da ar-de ar-el ar-en ar-eo ar-es ar-et ar-eu ar-fa ar-fi ar-fr ar-gl ar-he ar-hi ar-hr ar-hu ar-id ar-is ar-it
 ar-ja ar-kk ar-ko ar-lt ar-mk ar-ml ar-mr ar-nds ar-ne ar-nl ar-no ar-pl ar-pt ar-ro ar-ru ar-sh ar-si ar-sk ar-sl ar-sq
 ar-sr ar-sv ar-sw ar-ta ar-te ar-tl ar-tr ar-tt ar-uk ar-vi arz-de arz-en arz-es arz-fr ar-zh arz-it arz-pt arz-ru as-de as-es
 as-fr as-it azb-fr az-bg az-ca az-cs az-da az-de az-el az-en az-es az-et az-fa az-fi az-fr az-gl az-he az-hr az-hu az-id
 az-it az-ja az-ko az-nl az-no az-pl az-pt az-ro az-ru az-sr az-sv az-ta az-tr az-uk az-vi az-zh ba-bg ba-ca ba-cs ba-da
 ba-de ba-el ba-en ba-es ba-fi ba-fr ba-gl ba-hr ba-hu ba-id ba-it ba-ja ba-nl ba-no ba-pl ba-pt bar-de bar-en bar-es bar-fr
 bar-it ba-ro bar-pt bar-ru ba-ru ba-sh ba-sk ba-sl ba-sr ba-sv ba-tr ba-uk ba-zh be-bg be-ca be-cs be-de be-en be-es be-fi
 be-fr be-he be-hu be-it be-ja be-nl be-no be-pl be-pt be-ro be-ru be-sr be-sv be-uk bg-bn bg-bs bg-ca bg-ceb bg-cs bg-da
 bg-de bg-el bg-en bg-eo bg-es bg-et bg-eu bg-fa bg-fi bg-fr bg-gl bg-he bg-hi bg-hr bg-hu bg-id bg-is bg-it bg-ja bg-kk
 bg-ko bg-lt bg-mk bg-ml bg-mr bg-nds bg-ne bg-nl bg-no bg-pl bg-pt bg-ro bg-ru bg-sh bg-si bg-sk bg-sl bg-sq bg-sr bg-sv
 bg-sw bg-ta bg-te bg-tl bg-tr bg-tt bg-uk bg-vi bg-zh bn-bs bn-ca bn-cs bn-da bn-de bn-el bn-en bn-eo bn-es bn-et bn-eu
 bn-fa bn-fi bn-fr bn-gl bn-he bn-hi bn-hr bn-hu bn-id bn-it bn-ja bn-ko bn-lt bn-mk bn-nl bn-no bn-pl bn-pt bn-ro bn-ru
 bn-sh bn-sk bn-sl bn-sq bn-sr bn-sv bn-ta bn-tr bn-uk bn-vi bn-zh br-de br-en br-es br-fr br-it br-pt br-ru br-uk bs-ca
 bs-cs bs-da bs-de bs-el bs-en bs-eo bs-es bs-et bs-eu bs-fa bs-fi bs-fr bs-gl bs-he bs-hi bs-hr bs-hu bs-id bs-is bs-it
 bs-ja bs-ko bs-lt bs-mk bs-ml bs-mr bs-nl bs-no bs-pl bs-pt bs-ro bs-ru bs-sh bs-si bs-sk bs-sl bs-sq bs-sr bs-sv bs-ta
 bs-te bs-tl bs-tr bs-uk bs-vi bs-zh ca-ceb ca-cs ca-da ca-de ca-el ca-en ca-eo ca-es ca-et ca-eu ca-fa ca-fi ca-fo ca-fr
 ca-fy ca-gl ca-he ca-hi ca-hr ca-hu ca-id ca-is ca-it ca-ja ca-ka ca-kk ca-ko ca-la ca-lb ca-lt ca-mk ca-ml ca-mr ca-nds
 ca-ne ca-nl ca-no ca-oc ca-pl ca-pt ca-ro ca-ru ca-sh ca-si ca-sk ca-sl ca-sq ca-sr ca-sv ca-sw ca-ta ca-te ca-tl ca-tr
 ca-tt ca-uk ca-vi ca-zh ceb-cs ceb-de ceb-en ceb-es ceb-fi ceb-fr ceb-hu ceb-it ceb-ja ceb-nl ceb-no ceb-pl ceb-pt ceb-ro ceb-ru ceb-sv
 ceb-uk cs-da cs-de cs-el cs-en cs-eo cs-es cs-et cs-eu cs-fa cs-fi cs-fr cs-fy cs-gl cs-he cs-hi cs-hr cs-hu cs-id cs-is
 cs-it cs-ja cs-ka cs-kk cs-ko cs-la cs-lt cs-mk cs-ml cs-mr cs-nds cs-ne cs-nl cs-no cs-oc cs-pl cs-pt cs-ro cs-ru cs-sh
 cs-si cs-sk cs-sl cs-sq cs-sr cs-sv cs-sw cs-ta cs-te cs-tl cs-tr cs-tt cs-uk cs-vi cs-zh da-de da-el da-en da-eo da-es
 da-et da-eu da-fa da-fi da-fo da-fr da-gl da-he da-hi da-hr da-hu da-id da-is da-it da-ja da-ko da-lt da-mk da-ml da-mr
 da-nds da-ne da-nl da-no da-pl da-pt da-ro da-ru da-sh da-si da-sk da-sl da-sq da-sr da-sv da-sw da-ta da-te da-tl da-tr
 da-tt da-uk da-vi da-zh de-el de-en de-eo de-es de-et de-eu de-fa de-fi de-fo de-fr de-fy de-gl de-gom de-he de-hi de-hr
 de-hu de-hy de-id de-is de-it de-ja de-ka de-kk de-ko de-la de-lb de-lt de-mk de-ml de-mr de-nds de-ne de-nl de-no de-oc
 de-pl de-pt de-rm de-ro de-ru de-sh de-si de-sk de-sl de-sq de-sr de-sv de-sw de-ta de-te de-tg de-tl de-tr de-tt de-uk
 de-vi de-wuu de-zh el-en el-eo el-es el-et el-eu el-fa el-fi el-fr el-gl el-he el-hi el-hr el-hu el-id el-is el-it el-ja
 el-ko el-lt el-mk el-ml el-mr el-nl el-no el-pl el-pt el-ro el-ru el-sh el-si el-sk el-sl el-sq el-sr el-sv el-sw el-ta
 el-te el-tl el-tr el-uk el-vi el-zh en-eo en-es en-et en-eu en-fa en-fi en-fo en-fr en-fy en-gl en-he en-hi en-hr en-hu
 en-id en-io en-is en-it en-ja en-jv en-ka en-kk en-ko en-la en-lb en-lmo en-lt en-mg en-mk en-ml en-mr en-mwl en-nds_nl en-nds
 en-ne en-nl en-no en-oc en-pl en-pt en-ro en-ru en-sh en-simple en-si en-sk en-sl en-sq en-sr en-sv en-sw en-ta en-te en-tg
 en-tl en-tr en-tt en-ug en-uk en-vi en-wuu en-zh eo-es eo-et eo-eu eo-fa eo-fi eo-fr eo-gl eo-he eo-hi eo-hr eo-hu eo-id
 eo-is eo-it eo-ja eo-ko eo-lt eo-mk eo-ml eo-mr eo-nds eo-nl eo-no eo-pl eo-pt eo-ro eo-ru eo-sh eo-si eo-sk eo-sl eo-sq
 eo-sr eo-sv eo-ta eo-te eo-tl eo-tr eo-uk eo-vi eo-zh es-et es-eu es-fa es-fi es-fo es-fr es-fy es-gl es-gom es-he es-hi
 es-hr es-hu es-hy es-id es-is es-it es-ja es-jv es-ka es-kk es-ko es-la es-lb es-lt es-mk es-ml es-mr es-nds es-ne es-nl
 es-no es-oc es-pl es-pt es-ro es-ru es-sh es-si es-sk es-sl es-sq es-sr es-sv es-sw es-ta es-te es-tl es-tr es-tt es-uk
 es-vi es-wuu es-zh et-eu et-fa et-fi et-fr et-gl et-he et-hi et-hr et-hu et-id et-is et-it et-ja et-ko et-lt et-mk et-ml
 et-mr et-nl et-no et-pl et-pt et-ro et-ru et-sh et-si et-sk et-sl et-sq et-sr et-sv et-ta et-te et-tl et-tr et-uk et-vi
 et-zh eu-fa eu-fi eu-fr eu-gl eu-he eu-hi eu-hr eu-hu eu-id eu-is eu-it eu-ja eu-ko eu-lt eu-mk eu-ml eu-mr eu-nl eu-no
 eu-pl eu-pt eu-ro eu-ru eu-sh eu-sk eu-sl eu-sq eu-sr eu-sv eu-ta eu-te eu-tr eu-uk eu-vi eu-zh fa-fi fa-fr fa-gl fa-he
 fa-hi fa-hr fa-hu fa-id fa-it fa-ja fa-ko fa-lt fa-mk fa-ml fa-mr fa-nl fa-no fa-pl fa-pt fa-ro fa-ru fa-sh fa-sk fa-sl
 fa-sq fa-sr fa-sv fa-ta fa-te fa-tr fa-uk fa-vi fa-zh fi-fr fi-gl fi-he fi-hi fi-hr fi-hu fi-id fi-is fi-it fi-ja fi-ko
 fi-lt fi-mk fi-ml fi-mr fi-nds fi-ne fi-nl fi-no fi-oc fi-pl fi-pt fi-ro fi-ru fi-sh fi-si fi-sk fi-sl fi-sq fi-sr fi-sv
 fi-sw fi-ta fi-te fi-tl fi-tr fi-tt fi-uk fi-vi fi-zh fo-fr fo-it fo-nl fo-pl fo-pt fo-ru fo-sv fr-fy fr-gl fr-gom fr-he
 fr-hi fr-hr fr-hu fr-hy fr-id fr-is fr-it fr-ja fr-jv fr-ka fr-kk fr-ko fr-la fr-lb fr-lt fr-mg fr-mk fr-ml fr-mr fr-nds
 fr-ne fr-nl fr-no fr-oc fr-pl fr-pt fr-ro fr-ru fr-sh fr-si fr-sk fr-sl fr-sq fr-sr fr-sv fr-sw fr-ta fr-te fr-tl fr-tr
 fr-tt fr-uk fr-vi fr-wuu fr-zh fy-it fy-nl fy-pl fy-pt fy-ru fy-sv gl-he gl-hi gl-hr gl-hu gl-id gl-is gl-it gl-ja gl-ko
 gl-lt gl-mk gl-ml gl-mr gl-nds gl-ne gl-nl gl-no gl-oc gl-pl gl-pt gl-ro gl-ru gl-sh gl-si gl-sk gl-sl gl-sq gl-sr gl-sv
 gl-ta gl-te gl-tl gl-tr gl-tt gl-uk gl-vi gl-zh gom-it gom-pt gom-ru he-hi he-hr he-hu he-id he-is he-it he-ja he-ko he-lt
 he-mk he-ml he-mr he-nl he-no he-pl he-pt he-ro he-ru he-sh he-si he-sk he-sl he-sq he-sr he-sv he-sw he-ta he-te he-tl
 he-tr he-uk he-vi he-zh hi-hr hi-hu hi-id hi-it hi-ja hi-ko hi-lt hi-mk hi-mr hi-ne hi-nl hi-no hi-pl hi-pt hi-ro hi-ru
 hi-sh hi-sk hi-sl hi-sq hi-sr hi-sv hi-ta hi-te hi-tr hi-uk hi-vi hi-zh hr-hu hr-id hr-is hr-it hr-ja hr-ko hr-lt hr-mk
 hr-ml hr-mr hr-ne hr-nl hr-no hr-pl hr-pt hr-ro hr-ru hr-sh hr-si hr-sk hr-sl hr-sq hr-sr hr-sv hr-ta hr-te hr-tl hr-tr
 hr-uk hr-vi hr-zh hu-id hu-is hu-it hu-ja hu-kk hu-ko hu-lt hu-mk hu-ml hu-mr hu-nds hu-ne hu-nl hu-no hu-oc hu-pl hu-pt
 hu-ro hu-ru hu-sh hu-si hu-sk hu-sl hu-sq hu-sr hu-sv hu-sw hu-ta hu-te hu-tl hu-tr hu-uk hu-vi hu-zh hy-it hy-pt hy-ru
 id-is id-it id-ja id-jv id-ko id-lt id-mk id-ml id-mr id-ne id-nl id-no id-pl id-pt id-ro id-ru id-sh id-si id-sk id-sl
 id-sq id-sr id-sv id-sw id-ta id-te id-tl id-tr id-tt id-uk id-vi id-zh is-it is-ja is-lt is-mk is-nl is-no is-pl is-pt
 is-ro is-ru is-sh is-sk is-sl is-sr is-sv is-tr is-uk is-vi is-zh it-ja it-jv it-ka it-kk it-ko it-la it-lb it-lmo it-lt
 it-mk it-ml it-mr it-nds it-ne it-nl it-no it-oc it-pl it-pt it-ro it-ru it-scn it-sh it-si it-sk it-sl it-sq it-sr it-sv
 it-sw it-ta it-te it-tl it-tr it-tt it-uk it-vi it-wuu it-zh ja-kk ja-ko ja-lt ja-mk ja-ml ja-mr ja-nds ja-nl ja-no ja-pl
 ja-pt ja-ro ja-ru ja-sh ja-si ja-sk ja-sl ja-sq ja-sr ja-sv ja-sw ja-ta ja-te ja-tl ja-tr ja-tt ja-uk ja-vi ja-zh jv-pt
 ka-nl ka-pl ka-pt ka-ru ka-sv kk-nl kk-no kk-pl kk-pt kk-ro kk-ru kk-sv kk-tr kk-uk ko-lt ko-mk ko-ml ko-mr ko-nl ko-no
 ko-pl ko-pt ko-ro ko-ru ko-sh ko-sk ko-sl ko-sq ko-sr ko-sv ko-ta ko-te ko-tr ko-uk ko-vi ko-zh la-nl la-pl la-pt la-ro
 la-ru la-sv lb-nl lb-pl lb-pt lb-ru lb-sv lt-mk lt-ml lt-mr lt-nl lt-no lt-pl lt-pt lt-ro lt-ru lt-sh lt-si lt-sk lt-sl
 lt-sq lt-sr lt-sv lt-ta lt-te lt-tl lt-tr lt-uk lt-vi lt-zh mk-ml mk-mr mk-nl mk-no mk-pl mk-pt mk-ro mk-ru mk-sh mk-si
 mk-sk mk-sl mk-sq mk-sr mk-sv mk-ta mk-te mk-tl mk-tr mk-uk mk-vi mk-zh ml-nl ml-no ml-pl ml-pt ml-ro ml-ru ml-sh ml-sk
 ml-sl ml-sq ml-sr ml-sv ml-tr ml-uk ml-vi ml-zh mr-nl mr-no mr-pl mr-pt mr-ro mr-ru mr-sh mr-sk mr-sl mr-sq mr-sr mr-sv
 mr-tr mr-uk mr-vi mr-zh mwl-pt nds_nl-nl nds-nl nds-no nds-pl nds-pt nds-ro nds-ru nds-sv nds-uk ne-nl ne-no ne-pl ne-pt ne-ro ne-ru
 ne-sh ne-sk ne-sl ne-sv ne-uk nl-no nl-oc nl-pl nl-pt nl-ro nl-ru nl-sh nl-si nl-sk nl-sl nl-sq nl-sr nl-sv nl-sw nl-ta
 nl-te nl-tl nl-tr nl-tt nl-uk nl-vi nl-zh no-pl no-pt no-ro no-ru no-sh no-si no-sk no-sl no-sq no-sr no-sv no-sw no-ta
 no-te no-tl no-tr no-tt no-uk no-vi no-zh oc-pl oc-pt oc-ro oc-ru oc-sv pl-pt pl-ro pl-ru pl-sh pl-si pl-sk pl-sl pl-sq
 pl-sr pl-sv pl-sw pl-ta pl-te pl-tl pl-tr pl-tt pl-uk pl-vi pl-zh pt-ro pt-ru pt-sh pt-si pt-sk pt-sl pt-sq pt-sr pt-sv
 pt-sw pt-ta pt-te pt-tl pt-tr pt-tt pt-uk pt-vi pt-wuu pt-zh ro-ru ro-sh ro-si ro-sk ro-sl ro-sq ro-sr ro-sv ro-sw ro-ta
 ro-te ro-tl ro-tr ro-tt ro-uk ro-vi ro-zh ru-sh ru-si ru-sk ru-sl ru-sq ru-sr ru-sv ru-sw ru-ta ru-te ru-tg ru-tl ru-tr
 ru-tt ru-uk ru-vi ru-wuu ru-zh sh-si sh-sk sh-sl sh-sq sh-sr sh-sv sh-ta sh-te sh-tl sh-tr sh-uk sh-vi sh-zh si-sk si-sl
 si-sq si-sr si-sv si-tr si-uk si-vi si-zh sk-sl sk-sq sk-sr sk-sv sk-ta sk-te sk-tl sk-tr sk-uk sk-vi sk-zh sl-sq sl-sr
 sl-sv sl-ta sl-te sl-tl sl-tr sl-uk sl-vi sl-zh sq-sr sq-sv sq-ta sq-te sq-tl sq-tr sq-uk sq-vi sq-zh sr-sv sr-ta sr-te
 sr-tl sr-tr sr-uk sr-vi sr-zh sv-sw sv-ta sv-te sv-tl sv-tr sv-tt sv-uk sv-vi sv-zh sw-tr sw-uk sw-vi sw-zh ta-tr ta-uk
 ta-vi ta-zh te-tr te-uk te-vi te-zh tl-tr tl-uk tl-vi tl-zh tr-tt tr-uk tr-vi tr-zh tt-uk tt-zh uk-vi uk-zh vi-zh wuu-zh"""
    cite = """@article{wikimatrix1,
    author    = {Holger Schwenk and Vishrav Chaudhary and Shuo Sun and Hongyu Gong and Francisco Guzm{\'{a}}n},
    title     = {WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia},
    journal   = {CoRR},
    volume    = {abs/1907.05791},
    year      = {2019},
    url       = {http://arxiv.org/abs/1907.05791},
    archivePrefix = {arXiv},
    eprint    = {1907.05791},
    timestamp = {Wed, 17 Jul 2019 10:27:36 +0200},
    biburl    = {https://dblp.org/rec/journals/corr/abs-1907-05791.bib},
    bibsource = {dblp computer science bibliography, https://dblp.org}}"""
    url_pat = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.%s-%s.tsv.gz"
    mapping = dict(sh='hbs')
    skips = {'nds_nl', 'simple'}
    for pair in data.split():
        l1, l2 = pair.split('-')
        if l1 in skips or l2 in skips:
            continue
        l1iso, l2iso = mapping.get(l1, l1), mapping.get(l2, l2)
        url = url_pat % (l1, l2)
        ent = Entry(langs=(l1iso, l2iso), url=url, name='WikiMatrix_v1', cols=(1, 2), cite=cite)
        index.add_entry(ent)
Exemplo n.º 26
0
def load_all(index: Index):
    data = """am-en am-fr ar-am ar-en ar-fr aym-am aym-ar aym-en aym-fr bg-ar bg-aym bg-en bg-fr
  bn-am bn-ar bn-aym bn-bg bn-en bn-fr ca-am ca-ar ca-aym ca-bg ca-bn ca-en ca-fr cs-ar cs-aym 
  cs-bg cs-bn cs-ca cs-en cs-fr da-am da-ar da-aym da-bg da-bn da-ca da-cs da-en da-fr de-am 
  de-ar de-aym de-bg de-bn de-ca de-cs de-da de-en de-fr el-am el-ar el-aym el-bg el-bn el-ca
  el-cs el-da el-de el-en el-fr eo-ar eo-aym eo-bg eo-bn eo-ca eo-cs eo-da eo-de eo-el eo-en
  eo-fr es-am es-ar es-aym es-bg es-bn es-ca es-cs es-da es-de es-el es-en es-eo es-fr fa-am
  fa-ar fa-aym fa-bg fa-bn fa-ca fa-cs fa-da fa-de fa-el fa-en fa-eo fa-es fa-fr fil-ar fil-aym
  fil-bg fil-bn fil-ca fil-cs fil-da fil-de fil-el fil-en fil-eo fil-es fil-fa fil-fr fr-en he-ar
   he-bn he-ca he-cs he-da he-de he-el he-en he-es he-fa he-fr hi-am hi-ar hi-bg hi-bn hi-cs hi-de
   hi-el hi-en hi-eo hi-es hi-fa hi-fr hu-am hu-ar hu-aym hu-bg hu-bn hu-ca hu-cs hu-da hu-de
   hu-el hu-en hu-eo hu-es hu-fa hu-fil hu-fr hu-hi id-am id-ar id-aym id-bg id-bn id-ca id-cs 
   id-da id-de id-el id-en id-eo id-es id-fa id-fil id-fr id-hi id-hu it-am it-ar it-aym it-bg it-bn 
   it-ca it-cs it-da it-de it-el it-en it-eo it-es it-fa it-fil it-fr it-he it-hi it-hu it-id jp-am 
   jp-ar jp-aym jp-bg jp-bn jp-ca jp-cs jp-da jp-de jp-el jp-en jp-eo jp-es jp-fa jp-fil jp-fr 
   jp-he jp-hi jp-hu jp-id jp-it km-ar km-aym km-bn km-ca km-da km-de km-el km-en km-es km-fa km-fil 
   km-fr km-hu km-it km-jp ko-am ko-ar ko-aym ko-bg ko-bn ko-ca ko-cs ko-da ko-de ko-el ko-en ko-eo 
   ko-es ko-fa ko-fil ko-fr ko-hu ko-id ko-it ko-jp ku-ar ku-el ku-en ku-es ku-fr ku-it ku-jp mg-am 
   mg-ar mg-aym mg-bg mg-bn mg-ca mg-cs mg-da mg-de mg-el mg-en mg-eo mg-es mg-fa mg-fil mg-fr 
   mg-he mg-hi mg-hu mg-id mg-it mg-jp mg-km mg-ko mg-ku mk-am mk-ar mk-aym mk-bg mk-bn mk-ca mk-cs 
   mk-da mk-de mk-el mk-en mk-eo mk-es mk-fa mk-fil mk-fr mk-he mk-hi mk-hu mk-id mk-it mk-jp mk-km 
   mk-ko mk-mg my-am my-ar my-aym my-bg my-bn my-ca my-cs my-da my-de my-el my-en my-es my-fa my-fil 
   my-fr my-he my-hi my-hu my-id my-it my-jp my-ko my-mg my-mk ne-ar ne-aym ne-bg ne-bn ne-ca ne-cs 
   ne-de ne-el ne-en ne-eo ne-es ne-fa ne-fr ne-hi ne-id ne-it ne-jp ne-ko ne-mg ne-mk nl-am nl-ar 
   nl-aym nl-bg nl-bn nl-ca nl-cs nl-da nl-de nl-el nl-en nl-eo nl-es nl-fa nl-fil nl-fr nl-he nl-hi 
   nl-hu nl-id nl-it nl-jp nl-km nl-ko nl-mg nl-mk nl-my nl-ne or-ar or-aym or-bn or-ca or-cs or-de 
   or-el or-en or-es or-fa or-fr or-hi or-it or-jp or-mg or-mk or-nl pa-ar pa-bn pa-ca pa-cs pa-de 
   pa-el pa-en pa-es pa-fr pa-hi pa-hu pa-it pa-jp pa-ko pa-mg pa-mk pa-ne pa-nl pl-am pl-ar pl-aym 
   pl-bg pl-bn pl-ca pl-cs pl-da pl-de pl-el pl-en pl-eo pl-es pl-fa pl-fil pl-fr pl-he pl-hi pl-hu 
   pl-id pl-it pl-jp pl-ko pl-ku pl-mg pl-mk pl-my pl-ne pl-nl pl-or pl-pa pt-am pt-ar pt-aym pt-bg 
   pt-bn pt-ca pt-cs pt-da pt-de pt-el pt-en pt-eo pt-es pt-fa pt-fil pt-fr pt-he pt-hi pt-hu pt-id 
   pt-it pt-jp pt-km pt-ko pt-ku pt-mg pt-mk pt-my pt-ne pt-nl pt-or pt-pa pt-pl ro-ar ro-aym ro-bg 
   ro-bn ro-ca ro-cs ro-de ro-el ro-en ro-eo ro-es ro-fa ro-fr ro-hu ro-id ro-it ro-jp ro-ko ro-ku 
   ro-mg ro-mk ro-my ro-ne ro-nl ro-pl ro-pt ru-am ru-ar ru-aym ru-bg ru-bn ru-ca ru-cs ru-da ru-de 
   ru-el ru-en ru-eo ru-es ru-fa ru-fil ru-fr ru-he ru-hi ru-hu ru-id ru-it ru-jp ru-km ru-ko ru-mg 
   ru-mk ru-my ru-ne ru-nl ru-or ru-pa ru-pl ru-pt ru-ro sq-am sq-ar sq-aym sq-bg sq-bn sq-ca sq-cs 
   sq-da sq-de sq-el sq-en sq-eo sq-es sq-fa sq-fil sq-fr sq-hi sq-hu sq-id sq-it sq-jp sq-ko sq-mg 
   sq-mk sq-my sq-nl sq-pl sq-pt sq-ru sr-am sr-ar sr-aym sr-bg sr-bn sr-ca sr-cs sr-da sr-de sr-el 
   sr-en sr-eo sr-es sr-fa sr-fil sr-fr sr-hi sr-hu sr-id sr-it sr-jp sr-km sr-ko sr-mg sr-mk sr-my 
   sr-ne sr-nl sr-pl sr-pt sr-ro sr-ru sr-sq sv-am sv-ar sv-aym sv-bg sv-bn sv-ca sv-cs sv-da sv-de 
   sv-el sv-en sv-eo sv-es sv-fa sv-fil sv-fr sv-he sv-hi sv-hu sv-id sv-it sv-jp sv-ko sv-mg sv-mk 
   sv-my sv-nl sv-pl sv-pt sv-ro sv-ru sv-sq sv-sr sw-am sw-ar sw-aym sw-bg sw-bn sw-ca sw-cs sw-da 
   sw-de sw-el sw-en sw-eo sw-es sw-fa sw-fil sw-fr sw-he sw-hi sw-hu sw-id sw-it sw-jp sw-km sw-ko 
   sw-mg sw-mk sw-my sw-ne sw-nl sw-pa sw-pl sw-pt sw-ro sw-ru sw-sq sw-sr sw-sv tet-ar tet-aym 
   tet-bn tet-cs tet-de tet-el tet-en tet-es tet-fr tet-id tet-it tet-mg tet-pt tet-ru tet-sw tr-am 
   tr-ar tr-aym tr-bg tr-bn tr-ca tr-cs tr-da tr-de tr-el tr-en tr-eo tr-es tr-fa tr-fil tr-fr tr-he 
   tr-hi tr-hu tr-id tr-it tr-jp tr-ko tr-mg tr-mk tr-my tr-ne tr-nl tr-pa tr-pl tr-pt tr-ro tr-ru 
   tr-sq tr-sr tr-sv tr-sw ur-am ur-ar ur-aym ur-bg ur-bn ur-ca ur-cs ur-da ur-de ur-el ur-en ur-eo 
   ur-es ur-fa ur-fil ur-fr ur-he ur-hi ur-hu ur-id ur-it ur-jp ur-ko ur-mg ur-mk ur-my ur-ne ur-nl 
   ur-or ur-pa ur-pl ur-pt ur-ro ur-ru ur-sq ur-sr ur-sv ur-sw ur-tr yo-ar yo-el yo-en yo-es yo-fr 
   yo-it yo-mg yo-pl yo-pt yo-ru yo-sw zhs-am zhs-ar zhs-aym zhs-bg zhs-bn zhs-ca zhs-cs zhs-da 
   zhs-de zhs-el zhs-en zhs-eo zhs-es zhs-fa zhs-fil zhs-fr zhs-he zhs-hi zhs-hu zhs-id zhs-it 
   zhs-jp zhs-km zhs-ko zhs-mg zhs-mk zhs-my zhs-ne zhs-nl zhs-pa zhs-pl zhs-pt zhs-ro zhs-ru 
   zhs-sq zhs-sr zhs-sv zhs-sw zhs-tr zhs-ur zht-am zht-ar zht-aym zht-bg zht-bn zht-ca zht-cs 
   zht-da zht-de zht-el zht-en zht-eo zht-es zht-fa zht-fil zht-fr zht-he zht-hi zht-hu zht-id 
   zht-it zht-jp zht-km zht-ko zht-mg zht-mk zht-my zht-ne zht-nl zht-pa zht-pl zht-pt zht-ro 
   zht-ru zht-sq zht-sr zht-sv zht-sw zht-tet zht-tr zht-ur zht-zhs"""
    url = 'http://casmacat.eu/corpus/global-voices-tar-balls/training.tgz'
    cite = """Philipp Koehn, "Global Voices Corpus" http://casmacat.eu/corpus/global-voices.html """

    # any hot fixes for lang id mapping specific to this source
    code_map = {
        'jp': 'jpn',   # there was never a 'jp' in ISO 639; the two-letter code for Japanese has always been 'ja'
        'zhs': 'zho',  # map Simplified Chinese to the generic Chinese code
    }
    code_map = code_map.get
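    # code_map is now bound to dict.get, so code_map(code, code) returns the mapped code or the original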
    for pair in data.split():
        if 'zht' in pair:
            continue  # skipping Traditional Chinese because I don't know the ISO code for it
        l1, l2 = pair.split('-')
        f1 = f'training/globalvoices.{l1}-{l2}.{l1}'
        f2 = f'training/globalvoices.{l1}-{l2}.{l2}'
        l1, l2 = code_map(l1, l1), code_map(l2, l2)  # map codes
        ent = Entry(langs=(l1, l2),
                    name='GlobalVoices_2018Q4',
                    url=url,
                    filename='GlobalVoices_2018Q4-training.tgz',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[f1, f2])
        index.add_entry(ent)