Example #1
File: tilde.py Project: masonreznov/mtdata
def load(index: Index):
    CONTENT = """EESC2017::bg-de bg-en bg-fr cs-de cs-en cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-pl en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    EMA2016::bg-de bg-en bg-fr cs-de cs-en cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-no de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-no fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    airbaltic::de-en de-et de-fi de-lt de-lv de-ru en-et en-fi en-lt en-lv en-ru et-fi et-lt et-lv et-ru fi-lt fi-lv fi-ru lt-lv lt-ru lv-ru
    czechtourism::de-en de-es de-fr de-it de-pl de-pt de-ru en-es en-fr en-it en-pl en-pt en-ru es-fr es-it es-pl es-pt es-ru fr-it fr-pl fr-pt fr-ru it-pl it-pt it-ru pl-pt pl-ru pt-ru
    ecb2017::bg-cs bg-da bg-de bg-el bg-en bg-es bg-et bg-fi bg-fr bg-hr bg-hu bg-it bg-lt bg-lv bg-mt bg-nl bg-pl bg-pt bg-ro bg-sk bg-sl bg-sv cs-da cs-de cs-el cs-en cs-es cs-et cs-fi cs-fr cs-hr cs-hu cs-it cs-lt cs-lv cs-mt cs-nl cs-pl cs-pt cs-ro cs-sk cs-sl cs-sv da-de da-el da-en da-es da-et da-fi da-fr da-hr da-hu da-it da-lt da-lv da-mt da-nl da-pl da-pt da-ro da-sk da-sl da-sv de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-it de-lt de-lv de-mt de-nl de-pl de-pt de-ro de-sk de-sl de-sv el-en el-es el-et el-fi el-fr el-hr el-hu el-it el-lt el-lv el-mt el-nl el-pl el-pt el-ro el-sk el-sl el-sv en-es en-et en-fi en-fr en-hr en-hu en-it en-lt en-lv en-mt en-nl en-pl en-pt en-ro en-sk en-sl en-sv es-et es-fi es-fr es-hr es-hu es-it es-lt es-lv es-mt es-nl es-pl es-pt es-ro es-sk es-sl es-sv et-fi et-fr et-hr et-hu et-it et-lt et-lv et-mt et-nl et-pl et-pt et-ro et-sk et-sl et-sv fi-fr fi-hr fi-hu fi-it fi-lt fi-lv fi-mt fi-nl fi-pl fi-pt fi-ro fi-sk fi-sl fi-sv fr-hr fr-hu fr-it fr-lt fr-lv fr-mt fr-nl fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv hr-hu hr-it hr-lt hr-lv hr-mt hr-nl hr-pl hr-pt hr-ro hr-sk hr-sl hr-sv hu-it hu-lt hu-lv hu-mt hu-nl hu-pl hu-pt hu-ro hu-sk hu-sl hu-sv it-lt it-lv it-mt it-nl it-pl it-pt it-ro it-sk it-sl it-sv lt-lv lt-mt lt-nl lt-pl lt-pt lt-ro lt-sk lt-sl lt-sv lv-mt lv-nl lv-pl lv-pt lv-ro lv-sk lv-sl lv-sv mt-nl mt-pl mt-pt mt-ro mt-sk mt-sl mt-sv nl-pl nl-pt nl-ro nl-sk nl-sl nl-sv pl-pt pl-ro pl-sk pl-sl pl-sv pt-ro pt-sk pt-sl pt-sv ro-sk ro-sl ro-sv sk-sl sk-sv sl-sv
    fold::en-lv
    rapid2016::bg-de bg-en bg-fr cs-de cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-no de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-no fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    rapid2019::cs-en de-en en-pl
    worldbank::en-es en-fr en-hr en-pl en-pt en-ro en-ru en-sq en-sr en-tr en-uk"""
    TILDE_CITE = """@inproceedings{rozis-skadins-2017-tilde,
    title = "Tilde {MODEL} - Multilingual Open Data for {EU} Languages",
    author = "Rozis, Roberts  and
      Skadi{\c{n}}{\v{s}}, Raivis",
    booktitle = "Proceedings of the 21st Nordic Conference on Computational Linguistics",
    month = may,
    year = "2017",
    address = "Gothenburg, Sweden",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W17-0235",
    pages = "263--265",
}"""
    TILDE = 'https://tilde-model.s3-eu-west-1.amazonaws.com/%s.%s-%s.tmx.zip'
    for line in CONTENT.splitlines():
        line = line.strip()
        name, pairs = line.split('::')
        for pair in pairs.split(' '):
            l1, l2 = pair.split('-')
            url = TILDE % (name, l1, l2)
            index.add_entry(
                Entry(langs=(l1, l2),
                      name=name,
                      url=url,
                      cite=TILDE_CITE,
                      in_paths=["*.tmx"]))
Example #2
def load_all(index: Index):

    cite = """@inproceedings{post-etal-2012-constructing,
    title = "Constructing Parallel Corpora for Six {I}ndian Languages via Crowdsourcing",
    author = "Post, Matt  and
      Callison-Burch, Chris  and
      Osborne, Miles",
    booktitle = "Proceedings of the Seventh Workshop on Statistical Machine Translation",
    month = jun,
    year = "2012",
    address = "Montr{\'e}al, Canada",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/W12-3152",
    pages = "401--409",
}"""
    url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz'
    l2 = 'en'
    langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta']
    for l1 in langs:
        for split in ['training', 'dev', 'test', 'devtest', 'dict']:
            if l1 == 'hi' and split == 'dict':
                continue  # Hindi does not have a dict split
            f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}'
            f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}'
            if split not in ('training', 'dict'):
                f2 += '.0'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        name=f'JoshuaIndianCorpus_{split}',
                        filename='joshua-indian-parallel-corpora.tar.gz',
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
Example #3
def load_all(index: Index):
    url = "http://phontron.com/data/ted_talks.tar.gz"
    cite = index.ref_db.get_bibtex('Ye2018WordEmbeddings')
    header = (
        "-,en,es,pt-br,fr,ru,he,ar,ko,zh-cn,it,ja,zh-tw,nl,ro,tr,de,vi,pl,pt,bg,el,fa,sr,hu,hr,"
        "uk,cs,id,th,sv,sk,sq,lt,da,calv-,my,sl,mk,fr-ca,fi,hy,hi,nor,ka,mn,et,ku,gl,mr,zh,ur,"
        "eo,ms,az,ta,bn,kk,be,eu,bs").split(',')
    col_idx = {lang: idx for idx, lang in enumerate(header)}

    # langs that we care about; exclude <lang>-<country> codes because the ISO 639-3 lookup has no way to map them
    langs = [x for x in header if '-' not in x]
    for split in ['train', 'test', 'dev']:
        for idx1, lang1 in enumerate(langs):
            col1 = col_idx[lang1]
            for lang2 in langs[idx1 + 1:]:
                col2 = col_idx[lang2]
                ent = NoisyEntry(
                    did=DatasetId(group='Neulab',
                                  name=f'tedtalks_{split}',
                                  version='1',
                                  langs=(lang1, lang2)),
                    filename='neulab_ted_talksv1.tar.gz',
                    url=url,
                    in_paths=[f"all_talks_{split}.tsv"],
                    in_ext='tsv',
                    cols=(col1, col2),
                    cite=cite,
                )
                index.add_entry(ent)
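A minimal sketch of the header-to-column mapping used above, with a shortened header for illustration; the loader pairs every two retained languages and records their column indices:

header = "-,en,es,pt-br,fr,ru".split(',')
col_idx = {lang: idx for idx, lang in enumerate(header)}
langs = [x for x in header if '-' not in x]   # ['en', 'es', 'fr', 'ru']
for i, lang1 in enumerate(langs):
    for lang2 in langs[i + 1:]:
        print(lang1, lang2, (col_idx[lang1], col_idx[lang2]))
# en es (1, 2)
# en fr (1, 4)
# en ru (1, 5)
# es fr (2, 4)
# es ru (2, 5)
# fr ru (4, 5)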
Example #4
def load_all(index: Index):
    url = "http://phontron.com/data/ted_talks.tar.gz"
    cite = """@inproceedings{Ye2018WordEmbeddings,
    author  = {Ye, Qi and Devendra, Sachan and Matthieu, Felix and Sarguna, Padmanabhan and Graham, Neubig},
    title   = {When and Why are pre-trained word embeddings useful for Neural Machine Translation},
    booktitle = {HLT-NAACL},
    year    = {2018},
    }"""
    header = (
        "-,en,es,pt-br,fr,ru,he,ar,ko,zh-cn,it,ja,zh-tw,nl,ro,tr,de,vi,pl,pt,bg,el,fa,sr,hu,hr,"
        "uk,cs,id,th,sv,sk,sq,lt,da,calv-,my,sl,mk,fr-ca,fi,hy,hi,nor,ka,mn,et,ku,gl,mr,zh,ur,"
        "eo,ms,az,ta,bn,kk,be,eu,bs").split(',')
    col_idx = {lang: idx for idx, lang in enumerate(header)}

    # langs that we care about; exclude <lang>-<country> codes because the ISO 639-3 lookup has no way to map them
    langs = [x for x in header if '-' not in x]
    for split in ['train', 'test', 'dev']:
        for idx1, lang1 in enumerate(langs):
            col1 = col_idx[lang1]
            for lang2 in langs[idx1 + 1:]:
                col2 = col_idx[lang2]
                ent = NoisyEntry(langs=(lang1, lang2),
                                 name=f"neulab_tedtalksv1_{split}",
                                 filename='neulab_ted_talksv1.tar.gz',
                                 url=url, in_paths=[f"all_talks_{split}.tsv"],
                                 cols=(col1, col2), cite=cite)
                index.add_entry(ent)
Example #5
def load(index: Index):
    content = """EESC2017::bg-de bg-en bg-fr cs-de cs-en cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-pl en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    EMA2016::bg-de bg-en bg-fr cs-de cs-en cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-no de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-no fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    airbaltic::de-en de-et de-fi de-lt de-lv de-ru en-et en-fi en-lt en-lv en-ru et-fi et-lt et-lv et-ru fi-lt fi-lv fi-ru lt-lv lt-ru lv-ru
    czechtourism::de-en de-es de-fr de-it de-pl de-pt de-ru en-es en-fr en-it en-pl en-pt en-ru es-fr es-it es-pl es-pt es-ru fr-it fr-pl fr-pt fr-ru it-pl it-pt it-ru pl-pt pl-ru pt-ru
    ecb2017::bg-cs bg-da bg-de bg-el bg-en bg-es bg-et bg-fi bg-fr bg-hr bg-hu bg-it bg-lt bg-lv bg-mt bg-nl bg-pl bg-pt bg-ro bg-sk bg-sl bg-sv cs-da cs-de cs-el cs-en cs-es cs-et cs-fi cs-fr cs-hr cs-hu cs-it cs-lt cs-lv cs-mt cs-nl cs-pl cs-pt cs-ro cs-sk cs-sl cs-sv da-de da-el da-en da-es da-et da-fi da-fr da-hr da-hu da-it da-lt da-lv da-mt da-nl da-pl da-pt da-ro da-sk da-sl da-sv de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-it de-lt de-lv de-mt de-nl de-pl de-pt de-ro de-sk de-sl de-sv el-en el-es el-et el-fi el-fr el-hr el-hu el-it el-lt el-lv el-mt el-nl el-pl el-pt el-ro el-sk el-sl el-sv en-es en-et en-fi en-fr en-hr en-hu en-it en-lt en-lv en-mt en-nl en-pl en-pt en-ro en-sk en-sl en-sv es-et es-fi es-fr es-hr es-hu es-it es-lt es-lv es-mt es-nl es-pl es-pt es-ro es-sk es-sl es-sv et-fi et-fr et-hr et-hu et-it et-lt et-lv et-mt et-nl et-pl et-pt et-ro et-sk et-sl et-sv fi-fr fi-hr fi-hu fi-it fi-lt fi-lv fi-mt fi-nl fi-pl fi-pt fi-ro fi-sk fi-sl fi-sv fr-hr fr-hu fr-it fr-lt fr-lv fr-mt fr-nl fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv hr-hu hr-it hr-lt hr-lv hr-mt hr-nl hr-pl hr-pt hr-ro hr-sk hr-sl hr-sv hu-it hu-lt hu-lv hu-mt hu-nl hu-pl hu-pt hu-ro hu-sk hu-sl hu-sv it-lt it-lv it-mt it-nl it-pl it-pt it-ro it-sk it-sl it-sv lt-lv lt-mt lt-nl lt-pl lt-pt lt-ro lt-sk lt-sl lt-sv lv-mt lv-nl lv-pl lv-pt lv-ro lv-sk lv-sl lv-sv mt-nl mt-pl mt-pt mt-ro mt-sk mt-sl mt-sv nl-pl nl-pt nl-ro nl-sk nl-sl nl-sv pl-pt pl-ro pl-sk pl-sl pl-sv pt-ro pt-sk pt-sl pt-sv ro-sk ro-sl ro-sv sk-sl sk-sv sl-sv
    fold::en-lv
    rapid2016::bg-de bg-en bg-fr cs-de cs-fr da-de da-en da-fr de-el de-en de-es de-et de-fi de-fr de-hr de-hu de-is de-it de-lt de-lv de-mt de-nl de-no de-pl de-pt de-ro de-sk de-sl de-sv el-en el-fr en-es en-et en-fi en-fr en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pt en-ro en-sk en-sl en-sv es-fr et-fr fi-fr fr-hr fr-hu fr-is fr-it fr-lt fr-lv fr-mt fr-nl fr-no fr-pl fr-pt fr-ro fr-sk fr-sl fr-sv
    rapid2019::cs-en de-en en-pl
    worldbank::en-es en-fr en-hr en-pl en-pt en-ro en-ru en-sq en-sr en-tr en-uk"""
    cite = index.ref_db.get_bibtex('rozis-skadins-2017-tilde')
    TILDE = 'https://tilde-model.s3-eu-west-1.amazonaws.com/%s.%s-%s.tmx.zip'
    for line in content.splitlines():
        line = line.strip()
        name, pairs = line.split('::')
        for pair in pairs.split(' '):
            l1, l2 = pair.split('-')
            url = TILDE % (name, l1, l2)
            index.add_entry(
                Entry(langs=(l1, l2),
                      name=name,
                      url=url,
                      cite=cite,
                      in_paths=["*.tmx"],
                      in_ext='tmx'))
Example #6
def load_all(index: Index):
    with open(REFS_FILE, encoding='utf-8') as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split('\t', maxsplit=8)
            dataset_name = short.lower().replace(':', '_').replace('__', '_').replace('__', '_')
            in_paths = in_paths.strip().split('\t')
            ent = Entry(did=DatasetId(group='ELRC', name=dataset_name, version='1', langs=(l1, l2)),
                    url=download, filename="ELRC_" + str(num) + ".zip", in_ext='tmx', in_paths=in_paths)
            index.add_entry(ent)
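A minimal sketch of the name normalisation applied to the `short` field above; the value here is hypothetical, chosen only to show why the `'__'` collapse is needed after replacing ':' with '_':

short = 'ELRC::Legal_Acts'   # hypothetical value from one TSV row
dataset_name = short.lower().replace(':', '_').replace('__', '_').replace('__', '_')
print(dataset_name)          # elrc_legal_acts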
Example #7
def load_all(index: Index):
    url_ptn = 'https://www.dropbox.com/s/{uid}/wikititles-2014_{l1}{l2}.tgz?dl=1'
    rows = [row.split(',') for row in wiki_titles.splitlines()]
    for row in rows:
        uid, pair = row
        assert len(pair) == 4
        l1, l2 = pair[:2], pair[2:]
        url = url_ptn.format(uid=uid, l1=l1, l2=l2)
        in_file = f'wikititles-2014_{l1}{l2}'
        ent = Entry(did=DatasetId(group='LinguaTools', name=f'wikititles', version='2014', langs=(l1, l2)),
                    url=url, ext='tgz', in_ext='txt', in_paths=[f'{in_file}.{l1}', f'{in_file}.{l2}'])
        index.add_entry(ent)
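A minimal sketch of how a 4-letter pair code from `wiki_titles` is split and substituted into the URL template; the `uid` here is a placeholder, not a real Dropbox id:

url_ptn = 'https://www.dropbox.com/s/{uid}/wikititles-2014_{l1}{l2}.tgz?dl=1'
uid, pair = 'abc123xyz', 'deen'   # placeholder uid, hypothetical row
l1, l2 = pair[:2], pair[2:]       # ('de', 'en')
print(url_ptn.format(uid=uid, l1=l1, l2=l2))
# https://www.dropbox.com/s/abc123xyz/wikititles-2014_deen.tgz?dl=1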
Example #8
def load_all(index: Index):
    with open(REFS_FILE) as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split(
                '\t', maxsplit=8)
            in_paths = in_paths.strip().split('\t')
            ent = Entry(langs=(l1, l2),
                        url=download,
                        name="ELRC_" + short,
                        filename="ELRC_" + str(num) + ".zip",
                        in_ext='tmx',
                        in_paths=in_paths)
            index.add_entry(ent)
Example #9
def load_all(index: Index):

    url_pat = 'https://object.pouta.csc.fi/OPUS-{corpus}/{version}/moses/{l1}-{l2}.txt.zip'
    group_id = 'OPUS'
    citation = index.ref_db.get_bibtex('tiedemann2012parallel')
    skip_counts = defaultdict(int)
    dupes = defaultdict(set)
    assert data_file.exists()
    assert data_file.stat().st_size > 0

    with data_file.open() as lines:
        for line in lines:
            line = line.strip()
            if not line:  # empty lines in the top and bottom
                continue
            assert len(line.split('\t')) == 4, line
            corpus, version, l1, l2 = line.split('\t')
            url = url_pat.format(corpus=corpus, version=version, l1=l1, l2=l2)
            iso_l1, iso_l2 = bcp47.try_parse(
                l1, default=None), bcp47.try_parse(l2, default=None)
            if not iso_l1 or not iso_l2:
                if not iso_l1:
                    skip_counts[str(l1)] += 1
                if not iso_l2:
                    skip_counts[str(l2)] += 1
                continue
            version_cln = version.replace('-', '').lower()
            corpus_cln = corpus.replace('-', '_').lower()

            data_id = DatasetId(group=group_id,
                                name=corpus_cln,
                                version=version_cln,
                                langs=(iso_l1, iso_l2))
            if data_id in index:
                dupes[corpus].add(f'{l1}-{l2}')
                continue
            entry = Entry(did=data_id,
                          url=url,
                          cite=citation,
                          in_paths=[f'*.{l1}', f'*.{l2}'],
                          in_ext='txt')
            index.add_entry(entry)
        if skip_counts:
            skip_counts = list(
                sorted(dict(skip_counts).items(),
                       key=lambda x: x[1],
                       reverse=True))
            log.info(f"Skipped lang counts: {skip_counts}")
        if dupes:
            log.info(f"Duplicates langs: {dupes}")
Example #10
def load_all(index: Index):

    cite = index.ref_db.get_bibtex('ramesh2021samanantar')
    pairs = ('en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi'
             ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa'
             ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or'
             ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te'
             ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te')
    BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs.strip().split(' '):
        l1, l2 = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        url = BASE_v0_2.format(dirname=dirname, pair=pair)
        ent = Entry(langs=(l1, l2), name='AI4B_Samananthar_v02', url=url, cite=cite,
              in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'], in_ext='txt')
        index.add_entry(ent)
Example #11
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename='UNv1.0.testsets.tar.gz',
                        name=f'UNv1_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)
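The pairing above relies on itertools.combinations, which yields each unordered language pair exactly once; a minimal sketch:

import itertools

langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
pairs = list(itertools.combinations(langs, 2))
print(len(pairs))   # 15
print(pairs[:3])    # [('en', 'ar'), ('en', 'fr'), ('en', 'es')]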
Example #12
def load_all(index: Index):
    lines = data_file.read_text(encoding='utf-8').splitlines()
    langs = set('hi bn ta ml te kn mr pa gu as ur or'.split())  # other than en
    group_id = 'Anuvaad'
    cite_txt = index.ref_db.get_bibtex('project-anuvaad')
    for url in lines:
        url = url.strip()
        assert url.startswith('http') and url.endswith('.zip')
        file_name = url.split('/')[-1]
        file_name = file_name[:-4]  # .zip
        char_count = coll.Counter(list(file_name))
        n_hyps = char_count.get('-', 0)
        n_unders = char_count.get('_', 0)
        if n_hyps > n_unders:
            parts = file_name.split('-')
        else:
            assert '_' in file_name
            parts = file_name.split('_')
        name, version = '?', '?'
        l1, l2 = 'en', '?'
        if parts[-2] == l1 and parts[-1] in langs:
            l2 = parts[-1]
            version = parts[-3]
        elif parts[-3] == l1 and parts[-2] in langs:
            l2 = parts[-2]
            version = parts[-1]
        else:
            log.warn(f"Unable to parse {file_name} :: {parts}")
            continue
        name = '_'.join(parts[:-3])
        name = name.replace('-', '_')
        f1 = f'{l1}-{l2}/*.{l1}'
        f2 = f'{l1}-{l2}/*.{l2}'
        if name == 'wikipedia':
            f1 = f'{l1}-{l2}/{l1}.txt'
            f2 = f'{l1}-{l2}/{l2}.txt'

        ent = Entry(did=DatasetId(group=group_id,
                                  name=name,
                                  version=version,
                                  langs=(l1, l2)),
                    url=url,
                    ext='zip',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite_txt)
        index.add_entry(ent)
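A minimal standalone sketch of the separator heuristic above: whichever of '-' or '_' occurs more often in the file name is taken as the field separator, and the trailing fields are interpreted as version and language codes. The file name below is hypothetical:

import collections as coll

file_name = 'wikipedia-v1-en-hi'   # hypothetical, as if taken from a .zip URL
char_count = coll.Counter(file_name)
sep = '-' if char_count.get('-', 0) > char_count.get('_', 0) else '_'
parts = file_name.split(sep)       # ['wikipedia', 'v1', 'en', 'hi']
# parts[-2] == 'en' and parts[-1] == 'hi', so l2 = 'hi', version = 'v1',
# and name = '_'.join(parts[:-3]) = 'wikipedia'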
Example #13
def load_all(index: Index):
    URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz"
    cite = index.ref_db.get_bibtex('zhang-etal-2020-improving')
    cite += '\n\n' + index.ref_db.get_bibtex('tiedemann2012parallel')
    filename = 'opus-100-corpus-v1.0.tar.gz'
    code_map = dict(
        nb='nob',
        sh='hbs')  # these are not obvious to the ISO lookup function, so map them explicitly
    group, name = 'OPUS', 'opus100'
    for pair in supervised_v1:
        l1, l2 = pair.split("-")
        l1 = code_map.get(l1, l1)
        l2 = code_map.get(l2, l2)
        splits = ['train', 'dev', 'test']
        if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}:
            splits = ['train']  # the release is missing test sets for these pairs
        for split in splits:
            f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}'
            f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}'
            ent = Entry(did=DatasetId(group=group,
                                      name=f'{name}_{split}',
                                      version='1',
                                      langs=(l1, l2)),
                        url=URL,
                        filename=filename,
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
    for pair in zeroshot_v1:
        l1, l2 = pair.split("-")
        f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}'
        f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}'
        ent = Entry(did=DatasetId(group=group,
                                  name=f'{name}_test',
                                  version='1',
                                  langs=(l1, l2)),
                    url=URL,
                    filename=filename,
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite)
        index.add_entry(ent)
Example #14
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    url = "https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx"  # they changed it!
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(did=DatasetId(group='UN',
                                      name=f'un_{split}',
                                      version='1',
                                      langs=(l1, l2)),
                        url=url,
                        filename='UNv1.0.testsets.tar.gz',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)
Example #15
File: joshua_indian.py Project: kpu/mtdata
def load_all(index: Index):

    cite = index.ref_db.get_bibtex(key='post-etal-2012-constructing')
    url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz'
    l2 = 'en'
    langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta']
    for l1 in langs:
        for split in ['training', 'dev', 'test', 'devtest', 'dict']:
            if l1 == 'hi' and split == 'dict':
                continue  # Hindi does not have a dict split
            f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}'
            f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}'
            if split not in ('training', 'dict'):
                f2 += '.0'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        name=f'JoshuaIndianCorpus_{split}',
                        filename='joshua-indian-parallel-corpora.tar.gz',
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
Example #16
def load(index: Index):
    # === Para crawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    cite = r"""@inproceedings{espla-etal-2019-paracrawl,
        title = "{P}ara{C}rawl: Web-scale parallel corpora for the languages of the {EU}",
        author = "Espl{\`a}, Miquel  and
          Forcada, Mikel  and
          Ram{\'\i}rez-S{\'a}nchez, Gema  and
          Hoang, Hieu",
        booktitle = "Proceedings of Machine Translation Summit XVII Volume 2: Translator, Project and User Tracks",
        month = aug,
        year = "2019",
        address = "Dublin, Ireland",
        publisher = "European Association for Machine Translation",
        url = "https://www.aclweb.org/anthology/W19-6721",
        pages = "118--119",
    }"""
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v3',
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
Example #17
def load_all(index: Index):
    # === ECDC ===
    # https://ec.europa.eu/jrc/en/language-technologies/ecdc-translation-memory
    cite = index.ref_db.get_bibtex('Steinberger2014')
    langs = 'en bg cs da de el es et fi fr ga hu is it lt lv mt nl no pl pt ro sk sl sv'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="http://optima.jrc.it/Resources/ECDC-TM/ECDC-TM.zip",
                        name="ECDC", in_ext='tmx', cite=cite, in_paths=["ECDC-TM/ECDC.tmx"])
            index.add_entry(ent)

    # === EAC ===
    # https://ec.europa.eu/jrc/en/language-technologies/eac-translation-memory
    # This corpus has two parts: EAC Forms and EAC Reference data
    langs = 'bg cs da de el en es et fi fr hu is it lt lv mt nb nl pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Forms", in_ext='tmx', cite=cite, in_paths=["EAC_FORMS.tmx"])
            index.add_entry(ent)
    langs = 'bg cs da de el en es et fi fr hr hu is it lt lv mt nl no pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Reference", in_ext='tmx', cite=cite, in_paths=["EAC_REFRENCE_DATA.tmx"])
            index.add_entry(ent)

    # === DCEP ===
    # https://ec.europa.eu/jrc/en/language-technologies/dcep
    # This was annoying to process so I ended up rehosting it.
    # Don't bother with TR; it doesn't have sentences anyway.
    cite = index.ref_db.get_bibtex('dcep')
    langs = 'BG CS DA DE EL EN ES ET FI FR GA HU IT LT LV MT NL PL PT RO SK SL SV'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i+1:]:
            ent = Entry(langs=(l1, l2), url=f"http://data.statmt.org/DCEP/{l1}-{l2}.tsv.xz",
                        name="DCEP", in_ext='tsv', cite=cite)
            index.add_entry(ent)
Example #18
def load(index: Index):
    cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl')
    cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl')
    group_id = 'ParaCrawl'
    # === Para crawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='3',
                                langs=(l1, l2)),
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='6',
                                langs=(l1, l2)),
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='6B',
                                langs=(l1, l2)),
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    l1 = 'en'
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz'
    for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split(
    ):
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='7.1',
                                langs=(l1, l2)),
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz'
    for pair in 'en-nb en-nn es-ca es-eu es-gl'.split():
        l1, l2 = pair.split('-')
        index.add_entry(
            Entry(did=DatasetId(group=group_id,
                                name=f'paracrawl',
                                version='7.1',
                                langs=(l1, l2)),
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz'
    for version, pairs in [
        ('v8.0', 'en-bg en-cs en-da en-de en-el'),
        ('v8.0-0001',
         'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl'
         ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl')
    ]:
        for pair in pairs.split():
            l1, l2 = pair.split('-')
            url = PARACRAWL_V8.format(version=version, pair=pair)
            ent = Entry(did=DatasetId(group=group_id,
                                      name=f'paracrawl',
                                      version='8',
                                      langs=(l1, l2)),
                        url=url,
                        cite=cite,
                        ext='tsv.gz')
            index.add_entry(ent)

    PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz'
    for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split(
    ):
        l1, l2 = pair.split('-')
        url = PARACRAWL_BONUS.format(pair=pair)
        ent = Entry(did=DatasetId(group=group_id,
                                  name=f'paracrawl',
                                  version='1_bonus',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)

    PARACRAWL_V9 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release9/{l1}-{l2}/{l1}-{l2}.txt.gz'
    for pair in (
            'en-bg en-cs en-da en-de en-el en-es en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv'
            ' en-mt en-nb en-nl en-nn en-pl en-pt en-ro en-sk en-sl en-sv es-ca es-eu es-gl'
    ).split():
        l1, l2 = pair.split('-')
        url = PARACRAWL_V9.format(l1=l1, l2=l2)
        ent = Entry(did=DatasetId(group=group_id,
                                  name=f'paracrawl',
                                  version='9',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)
    # this is a new addition in Sept 2021
    index.add_entry(
        Entry(
            did=DatasetId(group=group_id,
                          name=f'paracrawl',
                          version='1_bonus',
                          langs=('en', 'zh')),
            url=
            'http://web-language-models.s3-website-us-east-1.amazonaws.com/paracrawl/bonus/en-zh-v1.txt.gz',
            cite=cite,
            ext='tsv.gz'))

    # Japanese-English paracrawl (5.1) used by WMT20 and WMT21
    for version in ['2', '3']:
        ent = Entry(
            did=DatasetId(group='KECL',
                          name=f'paracrawl',
                          version=version,
                          langs=('eng', 'jpn')),
            in_paths=['en-ja/en-ja.bicleaner05.txt'],
            in_ext='tsv',
            cols=(2, 3),
            cite='',
            url=
            f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz'
        )
        index.add_entry(ent)
Example #19
File: opus100.py Project: kpu/mtdata
def load_all(index: Index):
    URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz"
    cite = """
@inproceedings{zhang-etal-2020-improving,
    title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation",
    author = "Zhang, Biao  and
      Williams, Philip  and
      Titov, Ivan  and
      Sennrich, Rico",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://www.aclweb.org/anthology/2020.acl-main.148",
    doi = "10.18653/v1/2020.acl-main.148",
    pages = "1628--1639",
}
@inproceedings{tiedemann2012parallel,
  title={Parallel Data, Tools and Interfaces in OPUS.},
  author={Tiedemann, J{\"o}rg},
  booktitle={Lrec},
  volume={2012},
  pages={2214--2218},
  year={2012}
}"""
    filename = 'opus-100-corpus-v1.0.tar.gz'
    code_map = dict(
        nb='nob',
        sh='hbs')  # these are not obvious to the ISO lookup function, so map them explicitly
    for pair in supervised_v1:
        l1, l2 = pair.split("-")
        l1 = code_map.get(l1, l1)
        l2 = code_map.get(l2, l2)
        splits = ['train', 'dev', 'test']
        if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}:
            splits = ['train']  # the release is missing test sets for these pairs
        for split in splits:
            f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}'
            f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=URL,
                        name=f'OPUS100v1_{split}',
                        filename=filename,
                        in_paths=[f1, f2],
                        in_ext='txt',
                        cite=cite)
            index.add_entry(ent)
    for pair in zeroshot_v1:
        l1, l2 = pair.split("-")
        f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}'
        f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=URL,
                    name=f'OPUS100v1_test',
                    filename=filename,
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite)
        index.add_entry(ent)
Example #20
def load(index: Index):
    cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl')
    cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl')
    # === Para crawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v3',
                  url=PARACRAWL_v3 % (l1, l2),
                  cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in [
            'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el',
            'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl',
            'es', 'sv'
    ]:
        l1 = 'en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    # these are bonus
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v6',
                  url=PARACRAWL_v6_B % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    l1 = 'en'
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz'
    for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split(
    ):
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v7_1',
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))
    PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz'
    for pair in 'en-nb en-nn es-ca es-eu es-gl'.split():
        l1, l2 = pair.split('-')
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='paracrawl_v7',
                  url=PARACRAWL_v7_1 % (l1, l2),
                  cite=cite,
                  ext='tsv.gz'))

    PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz'
    for version, pairs in [
        ('v8.0', 'en-bg en-cs en-da en-de en-el'),
        ('v8.0-0001',
         'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl'
         ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl')
    ]:
        for pair in pairs.split():
            l1, l2 = pair.split('-')
            url = PARACRAWL_V8.format(version=version, pair=pair)
            ent = Entry(langs=(l1, l2),
                        name='paracrawl_v8',
                        url=url,
                        cite=cite,
                        ext='tsv.gz')
            index.add_entry(ent)

    PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz'
    for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split(
    ):
        l1, l2 = pair.split('-')
        url = PARACRAWL_BONUS.format(pair=pair)
        ent = Entry(langs=(l1, l2),
                    name='paracrawl_bonus',
                    url=url,
                    cite=cite,
                    ext='tsv.gz')
        index.add_entry(ent)
Example #21
def load(index: Index):
    WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    WMT14_CITE = """@proceedings{ws-2014-statistical,
        title = "Proceedings of the Ninth Workshop on Statistical Machine Translation",
        editor = "Bojar, Ond{\v{r}}ej  and
          Buck, Christian  and
          Federmann, Christian  and
          Haddow, Barry  and
          Koehn, Philipp  and
          Monz, Christof  and
          Post, Matt  and
          Specia, Lucia",
        month = jun,
        year = "2014",
        address = "Baltimore, Maryland, USA",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W14-3300",
        doi = "10.3115/v1/W14-33",
    }"""
    for l1 in ['de', 'cs', 'fr', 'ru', 'es']:
        l2 = 'en'
        f1 = f'commoncrawl.{l1}-en.{l1}'
        f2 = f'commoncrawl.{l1}-en.en'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name=f'wmt13_commoncrawl',
                  url=WMT13_CCRAWL,
                  filename='wmt13_parallel_commoncrawl.tgz',
                  in_paths=[f1, f2],
                  in_ext='txt',
                  cite=WMT14_CITE))

    # === WMT 13 release of europarl_v7 ===
    for l1 in ['cs', 'de', 'fr', 'es']:
        l2 = 'en'
        f1 = f'training/europarl-v7.{l1}-{l2}.{l1}'
        f2 = f'training/europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(
                langs=(l1, l2),
                name=f'wmt13_europarl_v7',
                url=
                "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
                filename="wmt13_europarl_v7.tgz",
                in_paths=[f1, f2],
                in_ext='txt',
                cite=WMT14_CITE))

    # ==== WMT 18  news commentary v13 ===
    for l1 in ['cs', 'de', 'ru', 'zh']:
        l2 = 'en'
        f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}'
        f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(
                langs=(l1, l2),
                name=f'wmt18_news_commentary_v13',
                url=
                "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
                filename="wmt18_news_commentary_v13.tgz",
                in_paths=[f1, f2],
                in_ext='txt',
                cite=WMT14_CITE))

    # === Europarl V9 corpus
    EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz'
    cite = r"""@inproceedings{koehn2005europarl,
      title={Europarl: A parallel corpus for statistical machine translation},
      author={Koehn, Philipp},
      booktitle={MT summit},
      volume={5},
      pages={79--86},
      year={2005},
      organization={Citeseer}
    }"""
    for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v9',
                  url=EUROPARL_v9 % (l1, l2),
                  cite=cite))

    # === Europarl V7 corpus
    EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz'
    cite = r"""@inproceedings{bojar-etal-2017-findings,
      title = "Findings of the 2017 Conference on Machine Translation ({WMT}17)",
      author = "Bojar, Ond{\v{r}}ej  and
        Chatterjee, Rajen  and
        Federmann, Christian  and
        Graham, Yvette  and
        Haddow, Barry  and
        Huang, Shujian  and
        Huck, Matthias  and
        Koehn, Philipp  and
        Liu, Qun  and
        Logacheva, Varvara  and
        Monz, Christof  and
        Negri, Matteo  and
        Post, Matt  and
        Rubino, Raphael  and
        Specia, Lucia  and
        Turchi, Marco",
      booktitle = "Proceedings of the Second Conference on Machine Translation",
      month = sep,
      year = "2017",
      address = "Copenhagen, Denmark",
      publisher = "Association for Computational Linguistics",
      url = "https://www.aclweb.org/anthology/W17-4717",
      doi = "10.18653/v1/W17-4717",
      pages = "169--214",
    }"""
    for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split(
    ):
        l2 = 'en'
        src = f'europarl-v7.{l1}-{l2}.{l1}'
        ref = f'europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v7',
                  in_paths=[src, ref],
                  url=EUROPARL_v7 % (l1, l2),
                  in_ext='txt',
                  cite=cite))

    # === Digital Corpus of European Parliament
    index.add_entry(
        Entry(
            langs=('lv', 'en'),
            name='wmt17_dcep_v1',
            in_paths=['*/*.lv', f'*/*.en'],
            cite=cite,
            url=
            'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz'))
    index.add_entry(
        Entry(
            langs=('lv', 'en'),
            name='wmt17_books_v1',
            in_paths=['*/*.lv', f'*/*.en'],
            cite=cite,
            url=
            'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz')
    )

    # === News Commentary v14
    NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz"
    cite = r"""@inproceedings{bojar-etal-2018-findings,
        title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)",
        author = "Bojar, Ond{\v{r}}ej  and
          Federmann, Christian  and
          Fishel, Mark  and
          Graham, Yvette  and
          Haddow, Barry  and
          Koehn, Philipp  and
          Monz, Christof",
        booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers",
        month = oct,
        year = "2018",
        address = "Belgium, Brussels",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W18-6401",
        doi = "10.18653/v1/W18-6401",
        pages = "272--303"
    }"""
    for pair in [
            'ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id',
            'ar it', 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh',
            'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it',
            'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en',
            'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk',
            'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi',
            'en id', 'en it', 'en ja', 'en kk', 'en nl', 'en pt', 'en ru',
            'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja', 'es kk',
            'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it',
            'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id',
            'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk',
            'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt',
            'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru',
            'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='news_commentary_v14',
                  url=NEWSCOM_v14 % (l1, l2),
                  cite=cite))

    # ===== Wiki Titles V1
    WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz'
    cite = r"""@inproceedings{barrault-etal-2019-findings,
        title = "Findings of the 2019 Conference on Machine Translation ({WMT}19)",
        author = {Barrault, Lo{\"\i}c  and
          Bojar, Ond{\v{r}}ej  and
          Costa-juss{\`a}, Marta R.  and
          Federmann, Christian  and
          Fishel, Mark  and
          Graham, Yvette  and
          Haddow, Barry  and
          Huck, Matthias  and
          Koehn, Philipp  and
          Malmasi, Shervin  and
          Monz, Christof  and
          M{\"u}ller, Mathias  and
          Pal, Santanu  and
          Post, Matt  and
          Zampieri, Marcos},
        booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)",
        month = aug,
        year = "2019",
        address = "Florence, Italy",
        publisher = "Association for Computational Linguistics",
        url = "https://www.aclweb.org/anthology/W19-5301",
        doi = "10.18653/v1/W19-5301",
        pages = "1--61"
    }"""
    for pair in [
            'cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne',
            'kk en', 'lt en', 'ru en', 'zh en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='wiki_titles_v1',
                  url=WIKI_TITLES_v1 % (l1, l2),
                  cite=cite))

    # ===== Wiki Titles V2
    WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz'
    for pair in [
            'ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en',
            'pl en', 'ps en', 'ru en', 'ta en', 'zh en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='wiki_titles_v2',
                  url=WIKI_TITLES_v2 % (l1, l2),
                  cite=cite))

    # ==== WMT  Dev and Tests
    wmt_sets = {
        'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'),
                         ('ru', 'en'), ('hi', 'en')],
        'newsdev2015': [('fi', 'en'), ('en', 'fi')],
        'newstest2015':
        [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'),
         ('de', 'en'), ('ru', 'en'), ('en', 'fi')],
        'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'),
                        ('en', 'tr')],
        'newstest2016':
        [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 'fi'), ('ro', 'en'),
         ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'),
         ('en', 'tr'), ('en', 'cs')],
        'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'),
                        ('en', 'lv')],
        'newstest2017':
        [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'),
         ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'),
         ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newsdev2018': [('et', 'en'), ('en', 'et')],
        'newstest2018':
        [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'),
         ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'),
         ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')],
        'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'),
                        ('lt', 'en'), ('en', 'gu')],
        'newstest2019':
        [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'),
         ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'),
         ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'),
         ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')],
        'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'),
                        ('en', 'iu'), ('en', 'ja'), ('ja', 'en'), ('en', 'pl')]
    }
    for set_name, pairs in wmt_sets.items():
        for l1, l2 in pairs:
            src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
            ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
            name = f'{set_name}_{l1}{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[src, ref],
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                    cite=cite))
    # Multi parallel
    wmt_sets = {
        '2009': ['en', 'cs', 'de', 'es', 'fr'],
        '2010': ['en', 'cs', 'de', 'es', 'fr'],
        '2011': ['en', 'cs', 'de', 'es', 'fr'],
        '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
        '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
    }
    for year, langs in wmt_sets.items():
        for l1, l2 in itertools.combinations(langs, 2):
            name = f'newstest{year}'
            f1 = f'dev/{name}.{l1}'
            f2 = f'dev/{name}.{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[f1, f2],
                    in_ext='txt',
                    cite=cite,
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz'
                ))

    for l1, l2 in [('ps', 'en'), ('km', 'en')]:
        for set_name in ['wikipedia.dev', 'wikipedia.devtest']:
            src = f'dev/{set_name}.{l1}-{l2}.{l1}'
            ref = f'dev/{set_name}.{l1}-{l2}.{l2}'
            name = f'{set_name.replace(".", "_")}_{l1}{l2}'
            index.add_entry(
                Entry(
                    (l1, l2),
                    name=name,
                    filename='wmt20dev.tgz',
                    in_paths=[src, ref],
                    url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                    in_ext='txt',
                    cite=cite))

    # ==== TED Talks 2.0 ar-en
    index.add_entry(
        Entry(
            ('en', 'ar'),
            'tedtalks_v2_clean',
            ext='tsv.xz',
            url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz'))

    # ==== Europarl v10
    EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz"
    wmt20_cite = None  # TODO: update
    for pair in [
            'cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en',
            'lt en', 'pl en'
    ]:
        l1, l2 = pair.split()
        index.add_entry(
            Entry(langs=(l1, l2),
                  name='europarl_v10',
                  url=EP_v10 % (l1, l2),
                  cite=wmt20_cite))

    # ==== PMIndia V1
    PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv"
    cite = r"""@ARTICLE{2020arXiv200109907H,
           author = {{Haddow}, Barry and {Kirefu}, Faheem},
            title = "{PMIndia -- A Collection of Parallel Corpora of Languages of India}",
          journal = {arXiv e-prints},
         keywords = {Computer Science - Computation and Language},
             year = "2020",
            month = "Jan",
              eid = {arXiv:2001.09907},
            pages = {arXiv:2001.09907},
    archivePrefix = {arXiv},
           eprint = {2001.09907}
    }"""
    for pair in [
            "as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en",
            "mr en", "or en", "pa en", "ta en", "te en", "ur en"
    ]:
        l1, l2 = pair.split()
        # Note: listed as xx-en in the URL but actually en-xx in the TSV; and it's not compressed!
        index.add_entry(
            Entry(langs=(l2, l1),
                  name='pmindia_v1',
                  url=PMINDIA_v1 % (l1, l2),
                  cite=cite))

    # Pashto - English  pseudo parallel dataset for alignment
    index.add_entry(
        Entry(
            langs=('en', 'ps'),
            name='wmt20_enps_aligntask',
            url=
            'http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz',
            cite=wmt20_cite,
            ext='tsv.xz'))

    # Pashto - English  mostly parallel dataset
    for name in [
            "GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps",
            "bible.en-ps.clean", "ted-wmt20.en-ps", "wikimedia.en-ps"
    ]:
        ps = f'ps-parallel/{name}.ps'
        en = f'ps-parallel/{name}.en'
        url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz'
        name = name.replace('.en-ps',
                            '').replace('.', '_').replace('-', '_').lower()
        entry = Entry(langs=('ps', 'en'),
                      name=name,
                      url=url,
                      cite=wmt20_cite,
                      in_paths=[ps, en],
                      filename='wmt20-psen-parallel.tgz',
                      in_ext='txt')
        index.add_entry(entry)
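A minimal sketch of how one (set_name, pair) entry in wmt_sets above expands into the source/reference SGML paths inside the wmt20dev.tgz archive:

set_name, l1, l2 = 'newstest2019', 'de', 'en'
src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
print(src)   # dev/newstest2019-deen-src.de.sgm
print(ref)   # dev/newstest2019-deen-ref.en.sgm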
Example #22
File: other.py Project: kpu/mtdata
def load_all(index: Index):

    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb')
    l1, l2 = 'hi', 'en'
    for version, prefix in [
            # ('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
            ('v1_5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download'),
    ]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    name=f'IITB{version}_train',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[
                        f'parallel/IITB.en-hi.{l1}',
                        f'parallel/IITB.en-hi.{l2}'
                    ])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        name=f'IITB{version}_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = index.ref_db.get_bibtex('neubig11kftt')
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename="kftt-data-1.0.tar.gz",
                    name=f'kftt_v1_{split}',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite)
        index.add_entry(ent)

    url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip"
    cite = index.ref_db.get_bibtex('ding2020a')
    for split in ['dev', 'test', 'train']:
        ent = Entry(langs=('my', 'en'),
                    url=url,
                    name=f'WAT2020_ALT_{split}',
                    in_ext='txt',
                    cite=cite,
                    filename='wat2020.my-en.zip',
                    in_paths=[
                        f'wat2020.my-en/alt/{split}.alt.my',
                        f'wat2020.my-en/alt/{split}.alt.en'
                    ])
        index.add_entry(ent)

    l1, l2 = 'iu', 'en'
    url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60"
    cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut')
    for split in ['dev', 'devtest', 'test', 'train']:
        path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}'
        if split != 'train':
            path_pref += '-dedup'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    name=f'NunavutHansard_v3_{split}',
                    in_ext='txt',
                    cite=cite,
                    filename='NunavutHansard_iuen_v3.tgz',
                    in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}'])
        index.add_entry(ent)

    # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122
    url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip"
    cite = index.ref_db.get_bibtex('Khresmoi')
    langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"]
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(
                langs=(l1, l2),
                url=url,
                name='Khresmoi_Summary_Test_v2',
                filename='khresmoi-summary-test-set-2.0.zip',
                cite=cite,
                in_paths=[
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}",
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}"
                ],
                in_ext='txt')
            index.add_entry(ent)
            ent = Entry(
                langs=(l1, l2),
                url=url,
                name='Khresmoi_Summary_Dev_v2',
                filename='khresmoi-summary-test-set-2.0.zip',
                cite=cite,
                in_paths=[
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}",
                    f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}"
                ],
                in_ext='txt')
            index.add_entry(ent)
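The Khresmoi block above walks every unordered language pair with enumerate and a slice; itertools.combinations yields exactly the same pairs. A standalone sketch:

import itertools

langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"]
pairs = list(itertools.combinations(langs, 2))              # same (l1, l2) pairs as the nested loop
assert len(pairs) == 28                                     # 8 choose 2
assert ("cs", "de") in pairs and ("de", "cs") not in pairs  # each pair appears exactly once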
Example #23
0
def load(index: Index):
    group_id = 'Statmt'
    WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
    WMT14_CITE = index.ref_db.get_bibtex('ws-2014-statistical')
    for l1 in ['de', 'cs', 'fr', 'ru', 'es']:
        l2 = 'en'
        f1 = f'commoncrawl.{l1}-en.{l1}'
        f2 = f'commoncrawl.{l1}-en.en'
        data_id = DatasetId(group=group_id, name='commoncrawl_wmt13', version='1', langs=(l1, l2))
        index.add_entry(Entry(did=data_id, url=WMT13_CCRAWL,
                              filename='wmt13_parallel_commoncrawl.tgz',
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # === WMT 13 release of europarl_v7 ===
    for l1 in ['cs', 'de', 'fr', 'es']:
        l2 = 'en'
        f1 = f'training/europarl-v7.{l1}-{l2}.{l1}'
        f2 = f'training/europarl-v7.{l1}-{l2}.{l2}'
        data_id = DatasetId(group=group_id, name='europarl_wmt13', version='7', langs=(l1, l2))
        index.add_entry(Entry(did=data_id, url="http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz",
                              filename="wmt13_europarl_v7.tgz",
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # ==== WMT 18  news commentary v13 ===
    for l1 in ['cs', 'de', 'ru', 'zh']:
        l2 = 'en'
        f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}'
        f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}'
        data_id = DatasetId(group=group_id, name='news_commentary_wmt18', version='13', langs=(l1, l2))
        index.add_entry(Entry(did=data_id,
                              url="http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz",
                              filename="wmt18_news_commentary_v13.tgz",
                              in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE))

    # === Europarl V9 corpus
    EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz'
    cite = index.ref_db.get_bibtex('koehn2005europarl')
    for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='europarl', version='9', langs=(l1, l2)),
                              url=EUROPARL_v9 % (l1, l2), cite=cite))

    # === Europarl V7 corpus
    EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz'
    cite = index.ref_db.get_bibtex('bojar-etal-2017-findings')
    for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split():
        l2 = 'en'
        src = f'europarl-v7.{l1}-{l2}.{l1}'
        ref = f'europarl-v7.{l1}-{l2}.{l2}'
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='europarl', version='7', langs=(l1, l2)), in_paths=[src, ref],
                              url=EUROPARL_v7 % (l1, l2), in_ext='txt', cite=cite))

    # === Digital Corpus of European Parliament and Books (Latvian-English, WMT17)
    l1, l2 = 'lv', 'en'
    index.add_entry(Entry(did=DatasetId(group=group_id, name='dcep_wmt17', version='1', langs=(l1, l2)),
                          in_paths=['*/*.lv', '*/*.en'], cite=cite, in_ext='txt',
                          url='http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz'))
    index.add_entry(Entry(did=DatasetId(group=group_id, name='books_wmt17', version='1', langs=(l1, l2)),
                          in_paths=['*/*.lv', '*/*.en'], cite=cite, in_ext='txt',
                          url='http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz'))

    # === News Commentary v14
    NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz"
    cite = index.ref_db.get_bibtex('bojar-etal-2018-findings')
    for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it',
                 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es',
                 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru',
                 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk',
                 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it',
                 'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id',
                 'es it', 'es ja', 'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id',
                 'fr it', 'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it',
                 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk', 'id nl', 'id pt', 'id ru',
                 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl',
                 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh']:
        l1, l2 = pair.split()
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='news_commentary', version='14', langs=(l1, l2)),
            url=NEWSCOM_v14 % (l1, l2), cite=cite))

    for v in [15, 16]:
        cite = index.ref_db.get_bibtex('barrault-etal-2020-findings')
        url = f"http://data.statmt.org/news-commentary/v{v}/training/news-commentary-v{v}.%s-%s.tsv.gz"
        for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it', 'ar ja', 'ar kk', 'ar nl',
                     'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja',
                     'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it',
                     'de ja', 'de kk', 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it',
                     'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja',
                     'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it', 'fr ja', 'fr kk', 'fr nl',
                     'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id ja',
                     'id kk', 'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja pt',
                     'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh',
                     'ru zh']:
            l1, l2 = pair.split()
            index.add_entry(Entry(did=DatasetId(group=group_id, name='news_commentary', version=f'{v}', langs=(l1, l2)),
                url=url % (l1, l2), cite=cite))


    # ===== Wiki Titles V1
    WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz'
    cite = index.ref_db.get_bibtex('barrault-etal-2019-findings')
    for pair in ['cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne', 'kk en', 'lt en',
                 'ru en', 'zh en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='1', langs=(l1, l2)),
                              url=WIKI_TITLES_v1 % (l1, l2), cite=cite))

    # ===== Wiki Titles V2
    WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz'
    for pair in ['ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en', 'pl en', 'ps en',
                 'ru en', 'ta en', 'zh en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='2', langs=(l1, l2)),
                              url=WIKI_TITLES_v2 % (l1, l2), cite=cite))

    WIKI_TITLES_v3 = 'http://data.statmt.org/wikititles/v3/wikititles-v3.{pair}.tsv'
    langs = 'bn-hi ca-es ca-pt ca-ro cs-en de-en de-fr es-pt es-ro ha-en ig-en is-en ja-en ps-en pt-ro ru-en xh-zu zh-en'
    for pair in langs.split():
        l1, l2 = pair.split('-')
        url = WIKI_TITLES_v3.format(pair=pair)
        ent = Entry(did=DatasetId(group=group_id, name='wikititles', version='3', langs=(l1, l2)), url=url, cite=cite)
        index.add_entry(ent)

    # ==== WMT  Dev and Tests
    wmt_sets = {
        'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'), ('ru', 'en'), ('hi', 'en')],
        'newsdev2015': [('fi', 'en'), ('en', 'fi')],
        'newstest2015': [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'),
                         ('de', 'en'), ('ru', 'en'), ('en', 'fi')],
        'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'), ('en', 'tr')],
        'newstest2016': [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 'fi'), ('ro', 'en'),
                         ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'),
                         ('en', 'tr'), ('en', 'cs')],
        'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newstest2017': [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'),
                         ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'),
                         ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')],
        'newsdev2018': [('et', 'en'), ('en', 'et')],
        'newstest2018': [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'),
                         ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'),
                         ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')],
        'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'), ('lt', 'en'),
                        ('en', 'gu')],
        'newstest2019': [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'),
                         ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'),
                         ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'),
                         ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')],
        'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'), ('en', 'iu'),
                        ('en', 'ja'), ('ja', 'en'), ('en', 'pl')]
    }
    for set_name, pairs in wmt_sets.items():
        sub_name, year = set_name[:-4], set_name[-4:]
        for l1, l2 in pairs:
            src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm'
            ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm'
            name = f'{sub_name}_{l1}{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)),
                                  filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='sgm',
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz',
                                  cite=cite))

    # Multi parallel
    wmt_sets = {
        '2009': ['en', 'cs', 'de', 'es', 'fr'],
        '2010': ['en', 'cs', 'de', 'es', 'fr'],
        '2011': ['en', 'cs', 'de', 'es', 'fr'],
        '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
        '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'],
    }
    for year, langs in wmt_sets.items():
        name = 'newstest'
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'dev/{name}{year}.{l1}'
            f2 = f'dev/{name}{year}.{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)),
                                    filename='wmt20dev.tgz', in_paths=[f1, f2], in_ext='txt', cite=cite,
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz'))

    for l1, l2 in [('ps', 'en'), ('km', 'en')]:
        for set_name in ['wikipedia.dev', 'wikipedia.devtest']:
            src = f'dev/{set_name}.{l1}-{l2}.{l1}'
            ref = f'dev/{set_name}.{l1}-{l2}.{l2}'
            name = f'{set_name.replace(".", "_")}_{l1}{l2}'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version='1', langs=(l1, l2)),
                                 filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='txt', cite=cite,
                                  url='http://data.statmt.org/wmt20/translation-task/dev.tgz'))

    # ==== WMT 20 Tests
    url = "http://data.statmt.org/wmt20/translation-task/test.tgz"
    wmt20_cite = index.ref_db.get_bibtex('barrault-etal-2020-findings')
    for _pref, pairs in {
        "": ["csen", "deen", "defr", "encs", "ende", "eniu", "enja", "enkm", "enpl", "enps",
             "enru", "enta", "enzh", "frde", "iuen", "jaen", "kmen", "plen", "psen", "ruen",
             "taen", "zhen"],
        "B": ["deen", "ende", "enzh", "ruen", "zhen"]}.items():
        year = "2020"
        name = f'newstest{_pref}'
        for pair in pairs:
            l1, l2 = pair[:2], pair[2:]
            f1 = f'sgm/{name}{year}-{pair}-src.{l1}.sgm'
            f2 = f'sgm/{name}{year}-{pair}-ref.{l2}.sgm'
            index.add_entry(Entry(did=DatasetId(group=group_id, name=f'{name}_{pair}'.lower(), version=year, langs=(l1, l2)),
                filename='wmt20tests.tgz', in_paths=[f1, f2], in_ext='sgm', cite=wmt20_cite, url=url))

    # WMT 21 Dev
    url = "http://data.statmt.org/wmt21/translation-task/dev.tgz"
    pairs = "en-ha en-is is-en ha-en".split()
    for pair in pairs:
        l1, l2 = pair.split('-')
        in_path = f'dev/xml/newsdev2021.{l1}-{l2}.xml'
        ent = Entry(did=DatasetId(group=group_id, name=f'newsdev_{l1}{l2}', version='2021', langs=(l1, l2)),
                    filename='wmt21dev.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url)
        index.add_entry(ent)

    url = "http://data.statmt.org/wmt21/translation-task/test.tgz"
    pairs = 'bn-hi hi-bn xh-zu zu-xh cs-en de-en de-fr en-cs en-de en-ha en-is en-ja en-ru en-zh fr-de ha-en is-en ja-en ru-en zh-en'.split()
    for pair in pairs:
        l1, l2 = pair.split('-')
        name = 'newstest'
        if pair in 'bn-hi hi-bn xh-zu zu-xh':
            name = 'florestest'
        in_path = f'test/{name}2021.{l1}-{l2}.xml'
        ent = Entry(did=DatasetId(group=group_id, name=f'{name}_{l1}{l2}', version='2021', langs=(l1, l2)),
                    filename='wmt21tests.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url)
        index.add_entry(ent)

    # ==== TED Talks 2.0 ar-en
    index.add_entry(Entry(did=DatasetId(group=group_id, name='tedtalks', version='2_clean', langs=('en', 'ar')),
                         ext='tsv.xz', url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz'))

    # ==== Europarl v10
    EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz"
    for pair in ['cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en', 'lt en', 'pl en']:
        l1, l2 = pair.split()
        index.add_entry(Entry(did=DatasetId(group=group_id, name=f'europarl', version='10', langs=(l1, l2)),
                url=EP_v10 % (l1, l2), cite=wmt20_cite))

    # ==== PMIndia V1
    PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv"
    cite = index.ref_db.get_bibtex('Haddow-etal-2020-PMIndia')
    for pair in ["as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en", "mr en", "or en",
                 "pa en", "ta en", "te en", "ur en"]:
        l1, l2 = pair.split()
        # Note: listed as xx-en in the URL but actually en-xx in the TSV; and it's not compressed!
        index.add_entry(Entry(did=DatasetId(group=group_id, name=f'pmindia', version='1', langs=(l2, l1)),
                              url=PMINDIA_v1 % (l1, l2), cite=cite))

    # Pashto - English  pseudo parallel dataset for alignment
    index.add_entry(Entry(did=DatasetId(group=group_id, name=f'wmt20_enps_aligntask', version='1', langs=('en', 'ps')),
                          url='http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz',
                          cite=wmt20_cite, ext='tsv.xz'))

    # Pashto - English  mostly parallel dataset
    for name in ["GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps", "bible.en-ps.clean",
                 "ted-wmt20.en-ps", "wikimedia.en-ps"]:
        ps = f'ps-parallel/{name}.ps'
        en = f'ps-parallel/{name}.en'
        url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz'
        name = name.replace('.en-ps', '').replace('.', '_').replace('-', '_').lower()
        entry = Entry(did=DatasetId(group=group_id, name=name, version='1', langs=('ps', 'en')), url=url,
                      cite=wmt20_cite, in_paths=[ps, en], filename='wmt20-psen-parallel.tgz', in_ext='txt')
        index.add_entry(entry)

    for l2 in ['ps', 'km']:
        url = f"http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-{l2}.xz"
        entry = Entry(did=DatasetId(group=group_id, name='paracrawl', version='5.1', langs=('en', l2)),
                    url=url, cite=wmt20_cite, ext='tsv.xz', cols=(0, 1))
        index.add_entry(entry)

    # for ja-en only TED was available
    index.add_entry(Entry(url="http://data.statmt.org/wmt20/translation-task/ja-en/ted.en-ja.tgz",
                    did=DatasetId(group=group_id, name='ted', version='wmt20', langs=('en', 'ja')),
                    cite=wmt20_cite, ext='tgz', in_ext='txt',
                    in_paths=['en-ja/train.tags.en-ja.en', 'en-ja/train.tags.en-ja.ja']))

    ccalign_cite = index.ref_db.get_bibtex('chaudhary-EtAl:2019:WMT')
    CC_ALIGNED = 'http://www.statmt.org/cc-aligned/sentence-aligned/{src}-{tgt}.tsv.xz'
    tgts='es_XX et_EE fa_IR ff_NG fi_FI fr_XX gu_IN ha_NG he_IL hi_IN hr_HR ht_HT hu_HU hy_AM id_ID ig_NG is_IS it_IT ja_XX jv_ID ka_GE kg_AO kk_KZ km_KH kn_IN ko_KR ku_TR ky_KG lg_UG ln_CD lo_LA lt_LT lv_LV mg_MG mi_NZ mk_MK ml_IN mn_MN mr_IN ms_MY mt_MT my_MM ne_NP nl_XX no_XX ns_ZA ny_MW om_KE or_IN pa_IN pl_PL ps_AF pt_XX qa_MM qd_MM ro_RO ru_RU si_LK sk_SK sl_SI sn_ZW so_SO sq_AL sr_RS ss_SZ st_ZA su_ID sv_SE sw_KE sz_PL ta_IN te_IN tg_TJ th_TH ti_ET tl_XX tn_BW tr_TR ts_ZA tz_MA uk_UA ur_PK ve_ZA vi_VN wo_SN xh_ZA yo_NG zh_CN zh_TW zu_ZA zz_TR'.split()
    srcs = 'af_ZA ak_GH am_ET ar_AR as_IN ay_BO az_AZ az_IR be_BY bg_BG bm_ML bn_IN br_FR bs_BA ca_ES cb_IQ cs_CZ cx_PH cy_GB da_DK de_DE el_GR'.split()
    pairs = [('en_XX', tgt) for tgt in tgts] + [(src, 'en_XX') for src in srcs]
    dont_know = {'qa', 'qd'}   # look like some Myanmar languages, but not sure which ones
    # Can't find them in ISO 639-1: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
    #   nor in Lingoes: http://www.lingoes.net/en/translator/langcode.htm
    #   nor in web-info: https://wp-info.org/tools/languagecodes.php
    # unsupported = {'zh_TW', 'az_IR'}
    # country locales are not supported; they create conflicts. keeping the large ones instead
    for src, tgt in pairs:
        # l1, l2 = src.split('_')[0], tgt.split('_')[0]
        if src[:2] in dont_know or tgt[:2] in dont_know:   # I don't know what languages these are; skip
            continue
        url = CC_ALIGNED.format(src=src, tgt=tgt)
        entry = Entry(did=DatasetId(group=group_id, name='ccaligned', version='1', langs=(src, tgt)), url=url,
                      cite=ccalign_cite, ext='tsv.xz', cols=(0, 1))
        index.add_entry(entry)

    wmt21_cite = 'WMT21'  # unavailable at the time of adding
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'khamenei', version='wmt21', langs=('ha','en')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/ha-en/khamenei.v1.ha-en.tsv', ext='tsv', cols=(2, 3)))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'opus', version='wmt21', langs=('ha', 'en')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/ha-en/opus.ha-en.tsv', ext='tsv', cols=(1, 0)))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ha')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ha.bifixed.dedup.laser.filter-0.9.xz',
        ext='tsv.xz', cols=[1, 2]))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ru')), cite=wmt21_cite,
        url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ru.bifixed.dedup.filter-1.1.xz',
        ext='tsv.xz', cols=[0, 1]))

    for pair in ['bn-hi', 'xh-zu']:
        l1, l2 = pair.split('-')
        url = f'http://data.statmt.org/wmt21/translation-task/cc-aligned/{pair}.tsv.xz'
        index.add_entry(Entry(
            did=DatasetId(group=group_id, name='ccaligned', version='wmt21', langs=(l1, l2)), cite=wmt21_cite,
            url=url, ext='tsv.xz', cols=(0, 1)))  # same column layout as the cc-aligned entries above

    # https://data.statmt.org/wmt19/translation-task/fr-de/bitexts/de-fr.bicleaner07.de.gz
    for cln_name, name in [('commoncrawl', ''), ('paracrawl', 'de-fr.bicleaner07'), ('europarl_v7', '')]:
        l1, l2 = 'fr', 'de'
        prefix = 'https://data.statmt.org/wmt19/translation-task/fr-de/bitexts'
        index.add_entry(Entry(did=DatasetId(group=group_id, name=cln_name or name, version='wmt19', langs=(l1, l2)),
                              ext='txt.gz', url=(f'{prefix}/{name}.{l1}.gz', f'{prefix}/{name}.{l2}.gz')))

    # Back Translation
    prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/zh-en'
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_enzh', version='wmt20', langs=('en', 'zh')),
        ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.translatedto.zh.gz')))

    prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/ru-en'
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_enru', version='wmt20', langs=('en', 'ru')),
                      ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.en.translatedto.ru.gz')))
    index.add_entry(Entry(
        did=DatasetId(group=group_id, name='backtrans_ruen', version='wmt20', langs=('ru', 'en')),
        ext='txt.gz', url=(f'{prefix}/news.ru.gz', f'{prefix}/news.ru.translatedto.en.gz')))
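Two small string conventions recur throughout this loader: dev-set names carry the year in their last four characters, and most training URLs are printf-style templates filled with the language pair. A standalone sketch of both:

set_name = 'newstest2014'
sub_name, year = set_name[:-4], set_name[-4:]
assert (sub_name, year) == ('newstest', '2014')

EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz'
assert EUROPARL_v9 % ('de', 'en') == \
    'http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz'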
Example #24
0
def load_all(index: Index):
    group = 'AI4Bharath'
    cite = index.ref_db.get_bibtex('ramesh2021samanantar')
    pairs = (
        'en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi'
        ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa'
        ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or'
        ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te'
        ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te')
    BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs.strip().split(' '):
        l1, l2 = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        url = BASE_v0_2.format(dirname=dirname, pair=pair)

        ent = Entry(did=DatasetId(group=group,
                                  name=f'samananthar',
                                  version='0.2',
                                  langs=(l1, l2)),
                    url=url,
                    cite=cite,
                    in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'],
                    in_ext='txt')
        index.add_entry(ent)

    URL = "https://storage.googleapis.com/samanantar-public/benchmarks.zip"
    filename = "samananthar-benchmarks.zip"
    for split in ('dev', 'test'):
        want20_langs = 'bn gu hi ml mr ta te'.split()
        for l2 in want20_langs:
            f1 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.en'
            f2 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.{l2}'
            ent = Entry(did=DatasetId(group=group,
                                      name=f'wat_{split}',
                                      version='2020',
                                      langs=('en', l2)),
                        filename=filename,
                        url=URL,
                        cite=cite,
                        in_paths=[f1, f2],
                        in_ext='txt')
            index.add_entry(ent)

        wat21_langs = 'bn en gu hi kn ml mr or pa ta te'.split()
        for i, l1 in enumerate(wat21_langs):
            for l2 in wat21_langs[i + 1:]:
                f1 = f'benchmarks/wat2021-devtest/{split}.{l1}'
                f2 = f'benchmarks/wat2021-devtest/{split}.{l2}'
                ent = Entry(did=DatasetId(group=group,
                                          name=f'wat_{split}',
                                          version='2021',
                                          langs=(l1, l2)),
                            filename=filename,
                            url=URL,
                            cite=cite,
                            in_paths=[f1, f2],
                            in_ext='txt')
                index.add_entry(ent)

        # PMI langs; en-as
        index.add_entry(
            Entry(did=DatasetId(group=group,
                                name=f'pmi_{split}',
                                version='2021',
                                langs=('en', 'as')),
                  filename=filename,
                  url=URL,
                  cite=cite,
                  in_ext='txt',
                  in_paths=[
                      f'benchmarks/pmi/en-as/{split}.en',
                      f'benchmarks/pmi/en-as/{split}.as'
                  ]))
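The Samanantar URLs above come from a single template: English-centric pairs live under en2indic and Indic-Indic pairs under indic2indic. A standalone sketch of that choice:

BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
for pair in ('en-hi', 'hi-ta'):
    dirname = 'en2indic' if pair.startswith('en-') else 'indic2indic'
    print(BASE_v0_2.format(dirname=dirname, pair=pair))
# .../V0.2/data/en2indic/en-hi.zip
# .../V0.2/data/indic2indic/hi-ta.zip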
Example #25
0
File: other.py Project: masonreznov/mtdata
def load_all(index: Index):

    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = """@article{DBLP:journals/corr/abs-1710-02855,
  author    = {Anoop Kunchukuttan and
               Pratik Mehta and
               Pushpak Bhattacharyya},
  title     = {The {IIT} Bombay English-Hindi Parallel Corpus},
  journal   = {CoRR},
  volume    = {abs/1710.02855},
  year      = {2017},
  url       = {http://arxiv.org/abs/1710.02855},
  archivePrefix = {arXiv},
  eprint    = {1710.02855},
  timestamp = {Mon, 13 Aug 2018 16:48:50 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1710-02855.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}"""
    l1, l2 = 'hi', 'en'
    for version, prefix in [
            # ('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
            ('v1_5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download'),
    ]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    name=f'IITB{version}_train',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[
                        f'parallel/IITB.en-hi.{l1}',
                        f'parallel/IITB.en-hi.{l2}'
                    ])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(langs=(l1, l2),
                        url=url,
                        filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        name=f'IITB{version}_{split}',
                        in_ext='txt',
                        in_paths=[f1, f2],
                        cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = """@misc{neubig11kftt,
    author = {Graham Neubig},
    title = {The {Kyoto} Free Translation Task},
    howpublished = {http://www.phontron.com/kftt},
    year = {2011}
    }"""
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(langs=(l1, l2),
                    url=url,
                    filename="kftt-data-1.0.tar.gz",
                    name=f'kftt_v1_{split}',
                    in_ext='txt',
                    in_paths=[f1, f2],
                    cite=cite)
        index.add_entry(ent)
Example #26
0
def load_all(index: Index):
    # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/
    cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb')
    l1, l2 = 'hi', 'en'
    for version, prefix in [
        # ('1.0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'),
        ('1.5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download')]:
        # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/
        # version is not explicit, but guessed from file modification time and description
        url = prefix + "/parallel.tgz"
        ent = Entry(did=DatasetId(group='IITB', name=f'hien_train', version=version, langs=(l1, l2)),
                    url=url, filename=f'IITB{version}-hin_eng-parallel.tar.gz',
                    in_ext='txt', cite=cite,
                    in_paths=[f'parallel/IITB.en-hi.{l1}',
                              f'parallel/IITB.en-hi.{l2}'])
        index.add_entry(ent)

        url = prefix + "/dev_test.tgz"
        for split in ['dev', 'test']:
            f1 = f'dev_test/{split}.{l1}'
            f2 = f'dev_test/{split}.{l2}'
            ent = Entry(did=DatasetId(group='IITB', name=f'hien_{split}', version=version, langs=(l1, l2)),
                        url=url, filename=f'IITB{version}-hin_eng-dev_test.tar.gz',
                        in_ext='txt', in_paths=[f1, f2], cite=cite)
            index.add_entry(ent)

    # == Japanese ==
    cite = index.ref_db.get_bibtex('neubig11kftt')
    url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz"
    l1, l2 = 'en', 'ja'
    for split in ['train', 'test', 'dev', 'tune']:
        f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}'
        f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}'
        ent = Entry(did=DatasetId(group='Phontron', name=f'kftt_{split}', version='1', langs=(l1, l2)),
                    url=url, filename="kftt-data-1.0.tar.gz", in_ext='txt', in_paths=[f1, f2], cite=cite)
        index.add_entry(ent)

    url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip"
    cite = index.ref_db.get_bibtex('ding2020a')
    for split in ['dev', 'test', 'train']:
        ent = Entry(did=DatasetId(group='WAT', name=f'alt_{split}', version='2020', langs=('my', 'en')),
                    url=url, in_ext='txt', cite=cite, filename='wat2020.my-en.zip',
                    in_paths=[f'wat2020.my-en/alt/{split}.alt.my', f'wat2020.my-en/alt/{split}.alt.en'])
        index.add_entry(ent)

    l1, l2 = 'iu', 'en'
    url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60"
    cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut')
    for split in ['dev', 'devtest', 'test', 'train']:
        path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}'
        if split != 'train':
            path_pref += '-dedup'
        ent = Entry(did=DatasetId(group='NRC_CA', name=f'nunavut_hansard_{split}', version='3', langs=(l1, l2)),
                    url=url, in_ext='txt', cite=cite, filename='NunavutHansard_iuen_v3.tgz',
                    in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}'])
        index.add_entry(ent)

    # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122
    url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip"
    cite = index.ref_db.get_bibtex('Khresmoi')
    langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"]
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_test', version='2', langs=(l1, l2)),
                        url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_ext='txt',
                        in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}",
                                  f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}"])
            index.add_entry(ent)
            ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_dev', version='2', langs=(l1, l2)),
                        url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_ext='txt',
                        in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}",
                                  f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}"])
            index.add_entry(ent)

    jesc_cite = index.ref_db.get_bibtex('pryzant_jesc_2018')
    for split in ['train', 'dev', 'test']:
        ent = Entry(url='https://nlp.stanford.edu/projects/jesc/data/split.tar.gz',
                    did=DatasetId(group='StanfordNLP', name=f'jesc_{split}', version='1', langs=('en', 'ja')),
                    filename='jesc-split.tar.gz', in_ext='tsv', in_paths=[f"split/{split}"], cite=jesc_cite)
        index.add_entry(ent)

    prefix = 'https://nlp.stanford.edu/projects/nmt/data'
    for name, subdir, src, tgt, cite_key in [
        ("wmt15_train", "wmt15.en-cs", "train.en", "train.cs", "luong2016acl_hybrid"),
        ("newstest2013", "wmt15.en-cs", "newstest2013.en", "newstest2013.cs", "luong2016acl_hybrid"),
        ("newstest2014", "wmt15.en-cs", "newstest2014.en", "newstest2014.cs", "luong2016acl_hybrid"),
        ("newstest2015", "wmt15.en-cs", "newstest2015.en", "newstest2015.cs", "luong2016acl_hybrid"),
        ("wmt14_train", "wmt14.en-de", "train.en", "train.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2012", "wmt14.en-de", "newstest2012.en", "newstest2012.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2013", "wmt14.en-de", "newstest2013.en", "newstest2013.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2014", "wmt14.en-de", "newstest2014.en", "newstest2014.de", "luong-pham-manning:2015:EMNLP"),
        ("newstest2015", "wmt14.en-de", "newstest2015.en", "newstest2015.de", "luong-pham-manning:2015:EMNLP"),
        ("iwslt15_train", "iwslt15.en-vi", "train.en", "train.vi", "Luong-Manning:iwslt15"),
        ("test2012", "iwslt15.en-vi", "tst2012.en", "tst2012.vi", "Luong-Manning:iwslt15"),
        ("test2013", "iwslt15.en-vi", "tst2013.en", "tst2013.vi", "Luong-Manning:iwslt15")]:
        l1, l2 = src.split(".")[-1], tgt.split(".")[-1]
        url1 = f"{prefix}/{subdir}/{src}"
        url2 = f"{prefix}/{subdir}/{tgt}"
        cite = index.ref_db.get_bibtex(cite_key)
        ent = Entry(did=DatasetId(group='StanfordNLP', name=name, version='1', langs=(l1, l2)),
                    ext='txt', url=(url1, url2), cite=cite)
        index.add_entry(ent)

    _url = 'https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip'
    cite = index.ref_db.get_bibtex('Barkarson-et-al-2020')
    for sub in ['eea train dev test', 'ema train dev test', 'opensubtitles dev test']:
        l1, l2 = 'en', 'is'
        sub, *splits = sub.split()
        for split in splits:
            in_paths = [f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l1}.csv',
                        f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l2}.csv']
            if split == 'train' and sub == 'eea':
                in_paths = [in_paths[1], in_paths[0]]  # source files for this split are swapped upstream; swap them back
            ent = Entry(did=DatasetId(group='ParIce', name=f'{sub}_{split}', version='20.05', langs=(l1, l2)),
                        url=_url, ext='zip', in_ext='txt', in_paths=in_paths, cite=cite,
                        filename='Parice_dev_test.20.05.zip')
            index.add_entry(ent)

    # https://github.com/bonaventuredossou/ffr-v1/tree/master/FFR-Dataset/FFR%20Dataset%20v2
    _url = 'https://raw.githubusercontent.com/bonaventuredossou/ffr-v1/master/FFR-Dataset/FFR%20Dataset%20v2/ffr_dataset_v2.txt'
    cite = index.ref_db.get_bibtex("emezue-dossou-2020-ffr")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'ffr', version='2', langs=('fon', 'fra')),
                url=_url, ext='tsv', cite=cite)
    index.add_entry(ent)

    # https://zenodo.org/record/4432712
    _url = 'https://zenodo.org/record/4432712/files/Fon_French_Parallel_Data_25377.csv?download=1'
    cite = index.ref_db.get_bibtex("dossou2021crowdsourced")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'daily_dialogues', version='1', langs=('fon', 'fra')),
                url=_url, ext='csvwithheader', cite=cite)
    index.add_entry(ent)
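Unlike the archive-based entries, the Stanford NMT datasets above point at two plain-text URLs, one per side of the bitext, and read the language codes off the file suffixes. A standalone sketch:

prefix = 'https://nlp.stanford.edu/projects/nmt/data'
subdir, src, tgt = 'iwslt15.en-vi', 'train.en', 'train.vi'
url1, url2 = f'{prefix}/{subdir}/{src}', f'{prefix}/{subdir}/{tgt}'
l1, l2 = src.split('.')[-1], tgt.split('.')[-1]
assert (l1, l2) == ('en', 'vi')
assert url1.endswith('iwslt15.en-vi/train.en') and url2.endswith('iwslt15.en-vi/train.vi')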
Example #27
0
def load_all(index: Index):
    data="""an-ca an-de an-en an-es an-fr an-gl an-it an-pl an-pt an-ru ar-arz ar-az ar-ba ar-be ar-bg ar-bn ar-br ar-bs ar-ca ar-ceb
 ar-cs ar-da ar-de ar-el ar-en ar-eo ar-es ar-et ar-eu ar-fa ar-fi ar-fr ar-gl ar-he ar-hi ar-hr ar-hu ar-id ar-is ar-it
 ar-ja ar-kk ar-ko ar-lt ar-mk ar-ml ar-mr ar-nds ar-ne ar-nl ar-no ar-pl ar-pt ar-ro ar-ru ar-sh ar-si ar-sk ar-sl ar-sq
 ar-sr ar-sv ar-sw ar-ta ar-te ar-tl ar-tr ar-tt ar-uk ar-vi arz-de arz-en arz-es arz-fr ar-zh arz-it arz-pt arz-ru as-de as-es
 as-fr as-it azb-fr az-bg az-ca az-cs az-da az-de az-el az-en az-es az-et az-fa az-fi az-fr az-gl az-he az-hr az-hu az-id
 az-it az-ja az-ko az-nl az-no az-pl az-pt az-ro az-ru az-sr az-sv az-ta az-tr az-uk az-vi az-zh ba-bg ba-ca ba-cs ba-da
 ba-de ba-el ba-en ba-es ba-fi ba-fr ba-gl ba-hr ba-hu ba-id ba-it ba-ja ba-nl ba-no ba-pl ba-pt bar-de bar-en bar-es bar-fr
 bar-it ba-ro bar-pt bar-ru ba-ru ba-sh ba-sk ba-sl ba-sr ba-sv ba-tr ba-uk ba-zh be-bg be-ca be-cs be-de be-en be-es be-fi
 be-fr be-he be-hu be-it be-ja be-nl be-no be-pl be-pt be-ro be-ru be-sr be-sv be-uk bg-bn bg-bs bg-ca bg-ceb bg-cs bg-da
 bg-de bg-el bg-en bg-eo bg-es bg-et bg-eu bg-fa bg-fi bg-fr bg-gl bg-he bg-hi bg-hr bg-hu bg-id bg-is bg-it bg-ja bg-kk
 bg-ko bg-lt bg-mk bg-ml bg-mr bg-nds bg-ne bg-nl bg-no bg-pl bg-pt bg-ro bg-ru bg-sh bg-si bg-sk bg-sl bg-sq bg-sr bg-sv
 bg-sw bg-ta bg-te bg-tl bg-tr bg-tt bg-uk bg-vi bg-zh bn-bs bn-ca bn-cs bn-da bn-de bn-el bn-en bn-eo bn-es bn-et bn-eu
 bn-fa bn-fi bn-fr bn-gl bn-he bn-hi bn-hr bn-hu bn-id bn-it bn-ja bn-ko bn-lt bn-mk bn-nl bn-no bn-pl bn-pt bn-ro bn-ru
 bn-sh bn-sk bn-sl bn-sq bn-sr bn-sv bn-ta bn-tr bn-uk bn-vi bn-zh br-de br-en br-es br-fr br-it br-pt br-ru br-uk bs-ca
 bs-cs bs-da bs-de bs-el bs-en bs-eo bs-es bs-et bs-eu bs-fa bs-fi bs-fr bs-gl bs-he bs-hi bs-hr bs-hu bs-id bs-is bs-it
 bs-ja bs-ko bs-lt bs-mk bs-ml bs-mr bs-nl bs-no bs-pl bs-pt bs-ro bs-ru bs-sh bs-si bs-sk bs-sl bs-sq bs-sr bs-sv bs-ta
 bs-te bs-tl bs-tr bs-uk bs-vi bs-zh ca-ceb ca-cs ca-da ca-de ca-el ca-en ca-eo ca-es ca-et ca-eu ca-fa ca-fi ca-fo ca-fr
 ca-fy ca-gl ca-he ca-hi ca-hr ca-hu ca-id ca-is ca-it ca-ja ca-ka ca-kk ca-ko ca-la ca-lb ca-lt ca-mk ca-ml ca-mr ca-nds
 ca-ne ca-nl ca-no ca-oc ca-pl ca-pt ca-ro ca-ru ca-sh ca-si ca-sk ca-sl ca-sq ca-sr ca-sv ca-sw ca-ta ca-te ca-tl ca-tr
 ca-tt ca-uk ca-vi ca-zh ceb-cs ceb-de ceb-en ceb-es ceb-fi ceb-fr ceb-hu ceb-it ceb-ja ceb-nl ceb-no ceb-pl ceb-pt ceb-ro ceb-ru ceb-sv
 ceb-uk cs-da cs-de cs-el cs-en cs-eo cs-es cs-et cs-eu cs-fa cs-fi cs-fr cs-fy cs-gl cs-he cs-hi cs-hr cs-hu cs-id cs-is
 cs-it cs-ja cs-ka cs-kk cs-ko cs-la cs-lt cs-mk cs-ml cs-mr cs-nds cs-ne cs-nl cs-no cs-oc cs-pl cs-pt cs-ro cs-ru cs-sh
 cs-si cs-sk cs-sl cs-sq cs-sr cs-sv cs-sw cs-ta cs-te cs-tl cs-tr cs-tt cs-uk cs-vi cs-zh da-de da-el da-en da-eo da-es
 da-et da-eu da-fa da-fi da-fo da-fr da-gl da-he da-hi da-hr da-hu da-id da-is da-it da-ja da-ko da-lt da-mk da-ml da-mr
 da-nds da-ne da-nl da-no da-pl da-pt da-ro da-ru da-sh da-si da-sk da-sl da-sq da-sr da-sv da-sw da-ta da-te da-tl da-tr
 da-tt da-uk da-vi da-zh de-el de-en de-eo de-es de-et de-eu de-fa de-fi de-fo de-fr de-fy de-gl de-gom de-he de-hi de-hr
 de-hu de-hy de-id de-is de-it de-ja de-ka de-kk de-ko de-la de-lb de-lt de-mk de-ml de-mr de-nds de-ne de-nl de-no de-oc
 de-pl de-pt de-rm de-ro de-ru de-sh de-si de-sk de-sl de-sq de-sr de-sv de-sw de-ta de-te de-tg de-tl de-tr de-tt de-uk
 de-vi de-wuu de-zh el-en el-eo el-es el-et el-eu el-fa el-fi el-fr el-gl el-he el-hi el-hr el-hu el-id el-is el-it el-ja
 el-ko el-lt el-mk el-ml el-mr el-nl el-no el-pl el-pt el-ro el-ru el-sh el-si el-sk el-sl el-sq el-sr el-sv el-sw el-ta
 el-te el-tl el-tr el-uk el-vi el-zh en-eo en-es en-et en-eu en-fa en-fi en-fo en-fr en-fy en-gl en-he en-hi en-hr en-hu
 en-id en-io en-is en-it en-ja en-jv en-ka en-kk en-ko en-la en-lb en-lmo en-lt en-mg en-mk en-ml en-mr en-mwl en-nds_nl en-nds
 en-ne en-nl en-no en-oc en-pl en-pt en-ro en-ru en-sh en-simple en-si en-sk en-sl en-sq en-sr en-sv en-sw en-ta en-te en-tg
 en-tl en-tr en-tt en-ug en-uk en-vi en-wuu en-zh eo-es eo-et eo-eu eo-fa eo-fi eo-fr eo-gl eo-he eo-hi eo-hr eo-hu eo-id
 eo-is eo-it eo-ja eo-ko eo-lt eo-mk eo-ml eo-mr eo-nds eo-nl eo-no eo-pl eo-pt eo-ro eo-ru eo-sh eo-si eo-sk eo-sl eo-sq
 eo-sr eo-sv eo-ta eo-te eo-tl eo-tr eo-uk eo-vi eo-zh es-et es-eu es-fa es-fi es-fo es-fr es-fy es-gl es-gom es-he es-hi
 es-hr es-hu es-hy es-id es-is es-it es-ja es-jv es-ka es-kk es-ko es-la es-lb es-lt es-mk es-ml es-mr es-nds es-ne es-nl
 es-no es-oc es-pl es-pt es-ro es-ru es-sh es-si es-sk es-sl es-sq es-sr es-sv es-sw es-ta es-te es-tl es-tr es-tt es-uk
 es-vi es-wuu es-zh et-eu et-fa et-fi et-fr et-gl et-he et-hi et-hr et-hu et-id et-is et-it et-ja et-ko et-lt et-mk et-ml
 et-mr et-nl et-no et-pl et-pt et-ro et-ru et-sh et-si et-sk et-sl et-sq et-sr et-sv et-ta et-te et-tl et-tr et-uk et-vi
 et-zh eu-fa eu-fi eu-fr eu-gl eu-he eu-hi eu-hr eu-hu eu-id eu-is eu-it eu-ja eu-ko eu-lt eu-mk eu-ml eu-mr eu-nl eu-no
 eu-pl eu-pt eu-ro eu-ru eu-sh eu-sk eu-sl eu-sq eu-sr eu-sv eu-ta eu-te eu-tr eu-uk eu-vi eu-zh fa-fi fa-fr fa-gl fa-he
 fa-hi fa-hr fa-hu fa-id fa-it fa-ja fa-ko fa-lt fa-mk fa-ml fa-mr fa-nl fa-no fa-pl fa-pt fa-ro fa-ru fa-sh fa-sk fa-sl
 fa-sq fa-sr fa-sv fa-ta fa-te fa-tr fa-uk fa-vi fa-zh fi-fr fi-gl fi-he fi-hi fi-hr fi-hu fi-id fi-is fi-it fi-ja fi-ko
 fi-lt fi-mk fi-ml fi-mr fi-nds fi-ne fi-nl fi-no fi-oc fi-pl fi-pt fi-ro fi-ru fi-sh fi-si fi-sk fi-sl fi-sq fi-sr fi-sv
 fi-sw fi-ta fi-te fi-tl fi-tr fi-tt fi-uk fi-vi fi-zh fo-fr fo-it fo-nl fo-pl fo-pt fo-ru fo-sv fr-fy fr-gl fr-gom fr-he
 fr-hi fr-hr fr-hu fr-hy fr-id fr-is fr-it fr-ja fr-jv fr-ka fr-kk fr-ko fr-la fr-lb fr-lt fr-mg fr-mk fr-ml fr-mr fr-nds
 fr-ne fr-nl fr-no fr-oc fr-pl fr-pt fr-ro fr-ru fr-sh fr-si fr-sk fr-sl fr-sq fr-sr fr-sv fr-sw fr-ta fr-te fr-tl fr-tr
 fr-tt fr-uk fr-vi fr-wuu fr-zh fy-it fy-nl fy-pl fy-pt fy-ru fy-sv gl-he gl-hi gl-hr gl-hu gl-id gl-is gl-it gl-ja gl-ko
 gl-lt gl-mk gl-ml gl-mr gl-nds gl-ne gl-nl gl-no gl-oc gl-pl gl-pt gl-ro gl-ru gl-sh gl-si gl-sk gl-sl gl-sq gl-sr gl-sv
 gl-ta gl-te gl-tl gl-tr gl-tt gl-uk gl-vi gl-zh gom-it gom-pt gom-ru he-hi he-hr he-hu he-id he-is he-it he-ja he-ko he-lt
 he-mk he-ml he-mr he-nl he-no he-pl he-pt he-ro he-ru he-sh he-si he-sk he-sl he-sq he-sr he-sv he-sw he-ta he-te he-tl
 he-tr he-uk he-vi he-zh hi-hr hi-hu hi-id hi-it hi-ja hi-ko hi-lt hi-mk hi-mr hi-ne hi-nl hi-no hi-pl hi-pt hi-ro hi-ru
 hi-sh hi-sk hi-sl hi-sq hi-sr hi-sv hi-ta hi-te hi-tr hi-uk hi-vi hi-zh hr-hu hr-id hr-is hr-it hr-ja hr-ko hr-lt hr-mk
 hr-ml hr-mr hr-ne hr-nl hr-no hr-pl hr-pt hr-ro hr-ru hr-sh hr-si hr-sk hr-sl hr-sq hr-sr hr-sv hr-ta hr-te hr-tl hr-tr
 hr-uk hr-vi hr-zh hu-id hu-is hu-it hu-ja hu-kk hu-ko hu-lt hu-mk hu-ml hu-mr hu-nds hu-ne hu-nl hu-no hu-oc hu-pl hu-pt
 hu-ro hu-ru hu-sh hu-si hu-sk hu-sl hu-sq hu-sr hu-sv hu-sw hu-ta hu-te hu-tl hu-tr hu-uk hu-vi hu-zh hy-it hy-pt hy-ru
 id-is id-it id-ja id-jv id-ko id-lt id-mk id-ml id-mr id-ne id-nl id-no id-pl id-pt id-ro id-ru id-sh id-si id-sk id-sl
 id-sq id-sr id-sv id-sw id-ta id-te id-tl id-tr id-tt id-uk id-vi id-zh is-it is-ja is-lt is-mk is-nl is-no is-pl is-pt
 is-ro is-ru is-sh is-sk is-sl is-sr is-sv is-tr is-uk is-vi is-zh it-ja it-jv it-ka it-kk it-ko it-la it-lb it-lmo it-lt
 it-mk it-ml it-mr it-nds it-ne it-nl it-no it-oc it-pl it-pt it-ro it-ru it-scn it-sh it-si it-sk it-sl it-sq it-sr it-sv
 it-sw it-ta it-te it-tl it-tr it-tt it-uk it-vi it-wuu it-zh ja-kk ja-ko ja-lt ja-mk ja-ml ja-mr ja-nds ja-nl ja-no ja-pl
 ja-pt ja-ro ja-ru ja-sh ja-si ja-sk ja-sl ja-sq ja-sr ja-sv ja-sw ja-ta ja-te ja-tl ja-tr ja-tt ja-uk ja-vi ja-zh jv-pt
 ka-nl ka-pl ka-pt ka-ru ka-sv kk-nl kk-no kk-pl kk-pt kk-ro kk-ru kk-sv kk-tr kk-uk ko-lt ko-mk ko-ml ko-mr ko-nl ko-no
 ko-pl ko-pt ko-ro ko-ru ko-sh ko-sk ko-sl ko-sq ko-sr ko-sv ko-ta ko-te ko-tr ko-uk ko-vi ko-zh la-nl la-pl la-pt la-ro
 la-ru la-sv lb-nl lb-pl lb-pt lb-ru lb-sv lt-mk lt-ml lt-mr lt-nl lt-no lt-pl lt-pt lt-ro lt-ru lt-sh lt-si lt-sk lt-sl
 lt-sq lt-sr lt-sv lt-ta lt-te lt-tl lt-tr lt-uk lt-vi lt-zh mk-ml mk-mr mk-nl mk-no mk-pl mk-pt mk-ro mk-ru mk-sh mk-si
 mk-sk mk-sl mk-sq mk-sr mk-sv mk-ta mk-te mk-tl mk-tr mk-uk mk-vi mk-zh ml-nl ml-no ml-pl ml-pt ml-ro ml-ru ml-sh ml-sk
 ml-sl ml-sq ml-sr ml-sv ml-tr ml-uk ml-vi ml-zh mr-nl mr-no mr-pl mr-pt mr-ro mr-ru mr-sh mr-sk mr-sl mr-sq mr-sr mr-sv
 mr-tr mr-uk mr-vi mr-zh mwl-pt nds_nl-nl nds-nl nds-no nds-pl nds-pt nds-ro nds-ru nds-sv nds-uk ne-nl ne-no ne-pl ne-pt ne-ro ne-ru
 ne-sh ne-sk ne-sl ne-sv ne-uk nl-no nl-oc nl-pl nl-pt nl-ro nl-ru nl-sh nl-si nl-sk nl-sl nl-sq nl-sr nl-sv nl-sw nl-ta
 nl-te nl-tl nl-tr nl-tt nl-uk nl-vi nl-zh no-pl no-pt no-ro no-ru no-sh no-si no-sk no-sl no-sq no-sr no-sv no-sw no-ta
 no-te no-tl no-tr no-tt no-uk no-vi no-zh oc-pl oc-pt oc-ro oc-ru oc-sv pl-pt pl-ro pl-ru pl-sh pl-si pl-sk pl-sl pl-sq
 pl-sr pl-sv pl-sw pl-ta pl-te pl-tl pl-tr pl-tt pl-uk pl-vi pl-zh pt-ro pt-ru pt-sh pt-si pt-sk pt-sl pt-sq pt-sr pt-sv
 pt-sw pt-ta pt-te pt-tl pt-tr pt-tt pt-uk pt-vi pt-wuu pt-zh ro-ru ro-sh ro-si ro-sk ro-sl ro-sq ro-sr ro-sv ro-sw ro-ta
 ro-te ro-tl ro-tr ro-tt ro-uk ro-vi ro-zh ru-sh ru-si ru-sk ru-sl ru-sq ru-sr ru-sv ru-sw ru-ta ru-te ru-tg ru-tl ru-tr
 ru-tt ru-uk ru-vi ru-wuu ru-zh sh-si sh-sk sh-sl sh-sq sh-sr sh-sv sh-ta sh-te sh-tl sh-tr sh-uk sh-vi sh-zh si-sk si-sl
 si-sq si-sr si-sv si-tr si-uk si-vi si-zh sk-sl sk-sq sk-sr sk-sv sk-ta sk-te sk-tl sk-tr sk-uk sk-vi sk-zh sl-sq sl-sr
 sl-sv sl-ta sl-te sl-tl sl-tr sl-uk sl-vi sl-zh sq-sr sq-sv sq-ta sq-te sq-tl sq-tr sq-uk sq-vi sq-zh sr-sv sr-ta sr-te
 sr-tl sr-tr sr-uk sr-vi sr-zh sv-sw sv-ta sv-te sv-tl sv-tr sv-tt sv-uk sv-vi sv-zh sw-tr sw-uk sw-vi sw-zh ta-tr ta-uk
 ta-vi ta-zh te-tr te-uk te-vi te-zh tl-tr tl-uk tl-vi tl-zh tr-tt tr-uk tr-vi tr-zh tt-uk tt-zh uk-vi uk-zh vi-zh wuu-zh"""
    cite = """@article{wikimatrix1,
    author    = {Holger Schwenk and Vishrav Chaudhary and Shuo Sun and Hongyu Gong and Francisco Guzm{\'{a}}n},
    title     = {WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia},
    journal   = {CoRR},
    volume    = {abs/1907.05791},
    year      = {2019},
    url       = {http://arxiv.org/abs/1907.05791},
    archivePrefix = {arXiv},
    eprint    = {1907.05791},
    timestamp = {Wed, 17 Jul 2019 10:27:36 +0200},
    biburl    = {https://dblp.org/rec/journals/corr/abs-1907-05791.bib},
    bibsource = {dblp computer science bibliography, https://dblp.org}}"""
    url_pat = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.%s-%s.tsv.gz"
    mapping = dict(sh='hbs')
    skips = {'nds_nl', 'simple'}
    for pair in data.split():
        l1, l2 = pair.split('-')
        if l1 in skips or l2 in skips:
            continue
        l1iso, l2iso = mapping.get(l1, l1), mapping.get(l2, l2)
        url = url_pat % (l1, l2)
        ent = Entry(langs=(l1iso, l2iso), url=url, name='WikiMatrix_v1', cols=(1, 2), cite=cite)
        index.add_entry(ent)
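The WikiMatrix loader above drops unsupported codes and remaps 'sh' to the ISO 639-3 code 'hbs' before building each entry. A standalone sketch of that skip-and-remap step:

mapping = dict(sh='hbs')
skips = {'nds_nl', 'simple'}
for pair in ['sh-sr', 'en-simple', 'de-en']:
    l1, l2 = pair.split('-')
    if l1 in skips or l2 in skips:
        continue                                     # 'en-simple' is dropped here
    print(mapping.get(l1, l1), mapping.get(l2, l2))
# hbs sr
# de en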
Example #28
0
def load_all(index: Index):
    data = """am-en am-fr ar-am ar-en ar-fr aym-am aym-ar aym-en aym-fr bg-ar bg-aym bg-en bg-fr
  bn-am bn-ar bn-aym bn-bg bn-en bn-fr ca-am ca-ar ca-aym ca-bg ca-bn ca-en ca-fr cs-ar cs-aym 
  cs-bg cs-bn cs-ca cs-en cs-fr da-am da-ar da-aym da-bg da-bn da-ca da-cs da-en da-fr de-am 
  de-ar de-aym de-bg de-bn de-ca de-cs de-da de-en de-fr el-am el-ar el-aym el-bg el-bn el-ca
  el-cs el-da el-de el-en el-fr eo-ar eo-aym eo-bg eo-bn eo-ca eo-cs eo-da eo-de eo-el eo-en
  eo-fr es-am es-ar es-aym es-bg es-bn es-ca es-cs es-da es-de es-el es-en es-eo es-fr fa-am
  fa-ar fa-aym fa-bg fa-bn fa-ca fa-cs fa-da fa-de fa-el fa-en fa-eo fa-es fa-fr fil-ar fil-aym
  fil-bg fil-bn fil-ca fil-cs fil-da fil-de fil-el fil-en fil-eo fil-es fil-fa fil-fr fr-en he-ar
   he-bn he-ca he-cs he-da he-de he-el he-en he-es he-fa he-fr hi-am hi-ar hi-bg hi-bn hi-cs hi-de
   hi-el hi-en hi-eo hi-es hi-fa hi-fr hu-am hu-ar hu-aym hu-bg hu-bn hu-ca hu-cs hu-da hu-de
   hu-el hu-en hu-eo hu-es hu-fa hu-fil hu-fr hu-hi id-am id-ar id-aym id-bg id-bn id-ca id-cs 
   id-da id-de id-el id-en id-eo id-es id-fa id-fil id-fr id-hi id-hu it-am it-ar it-aym it-bg it-bn 
   it-ca it-cs it-da it-de it-el it-en it-eo it-es it-fa it-fil it-fr it-he it-hi it-hu it-id jp-am 
   jp-ar jp-aym jp-bg jp-bn jp-ca jp-cs jp-da jp-de jp-el jp-en jp-eo jp-es jp-fa jp-fil jp-fr 
   jp-he jp-hi jp-hu jp-id jp-it km-ar km-aym km-bn km-ca km-da km-de km-el km-en km-es km-fa km-fil 
   km-fr km-hu km-it km-jp ko-am ko-ar ko-aym ko-bg ko-bn ko-ca ko-cs ko-da ko-de ko-el ko-en ko-eo 
   ko-es ko-fa ko-fil ko-fr ko-hu ko-id ko-it ko-jp ku-ar ku-el ku-en ku-es ku-fr ku-it ku-jp mg-am 
   mg-ar mg-aym mg-bg mg-bn mg-ca mg-cs mg-da mg-de mg-el mg-en mg-eo mg-es mg-fa mg-fil mg-fr 
   mg-he mg-hi mg-hu mg-id mg-it mg-jp mg-km mg-ko mg-ku mk-am mk-ar mk-aym mk-bg mk-bn mk-ca mk-cs 
   mk-da mk-de mk-el mk-en mk-eo mk-es mk-fa mk-fil mk-fr mk-he mk-hi mk-hu mk-id mk-it mk-jp mk-km 
   mk-ko mk-mg my-am my-ar my-aym my-bg my-bn my-ca my-cs my-da my-de my-el my-en my-es my-fa my-fil 
   my-fr my-he my-hi my-hu my-id my-it my-jp my-ko my-mg my-mk ne-ar ne-aym ne-bg ne-bn ne-ca ne-cs 
   ne-de ne-el ne-en ne-eo ne-es ne-fa ne-fr ne-hi ne-id ne-it ne-jp ne-ko ne-mg ne-mk nl-am nl-ar 
   nl-aym nl-bg nl-bn nl-ca nl-cs nl-da nl-de nl-el nl-en nl-eo nl-es nl-fa nl-fil nl-fr nl-he nl-hi 
   nl-hu nl-id nl-it nl-jp nl-km nl-ko nl-mg nl-mk nl-my nl-ne or-ar or-aym or-bn or-ca or-cs or-de 
   or-el or-en or-es or-fa or-fr or-hi or-it or-jp or-mg or-mk or-nl pa-ar pa-bn pa-ca pa-cs pa-de 
   pa-el pa-en pa-es pa-fr pa-hi pa-hu pa-it pa-jp pa-ko pa-mg pa-mk pa-ne pa-nl pl-am pl-ar pl-aym 
   pl-bg pl-bn pl-ca pl-cs pl-da pl-de pl-el pl-en pl-eo pl-es pl-fa pl-fil pl-fr pl-he pl-hi pl-hu 
   pl-id pl-it pl-jp pl-ko pl-ku pl-mg pl-mk pl-my pl-ne pl-nl pl-or pl-pa pt-am pt-ar pt-aym pt-bg 
   pt-bn pt-ca pt-cs pt-da pt-de pt-el pt-en pt-eo pt-es pt-fa pt-fil pt-fr pt-he pt-hi pt-hu pt-id 
   pt-it pt-jp pt-km pt-ko pt-ku pt-mg pt-mk pt-my pt-ne pt-nl pt-or pt-pa pt-pl ro-ar ro-aym ro-bg 
   ro-bn ro-ca ro-cs ro-de ro-el ro-en ro-eo ro-es ro-fa ro-fr ro-hu ro-id ro-it ro-jp ro-ko ro-ku 
   ro-mg ro-mk ro-my ro-ne ro-nl ro-pl ro-pt ru-am ru-ar ru-aym ru-bg ru-bn ru-ca ru-cs ru-da ru-de 
   ru-el ru-en ru-eo ru-es ru-fa ru-fil ru-fr ru-he ru-hi ru-hu ru-id ru-it ru-jp ru-km ru-ko ru-mg 
   ru-mk ru-my ru-ne ru-nl ru-or ru-pa ru-pl ru-pt ru-ro sq-am sq-ar sq-aym sq-bg sq-bn sq-ca sq-cs 
   sq-da sq-de sq-el sq-en sq-eo sq-es sq-fa sq-fil sq-fr sq-hi sq-hu sq-id sq-it sq-jp sq-ko sq-mg 
   sq-mk sq-my sq-nl sq-pl sq-pt sq-ru sr-am sr-ar sr-aym sr-bg sr-bn sr-ca sr-cs sr-da sr-de sr-el 
   sr-en sr-eo sr-es sr-fa sr-fil sr-fr sr-hi sr-hu sr-id sr-it sr-jp sr-km sr-ko sr-mg sr-mk sr-my 
   sr-ne sr-nl sr-pl sr-pt sr-ro sr-ru sr-sq sv-am sv-ar sv-aym sv-bg sv-bn sv-ca sv-cs sv-da sv-de 
   sv-el sv-en sv-eo sv-es sv-fa sv-fil sv-fr sv-he sv-hi sv-hu sv-id sv-it sv-jp sv-ko sv-mg sv-mk 
   sv-my sv-nl sv-pl sv-pt sv-ro sv-ru sv-sq sv-sr sw-am sw-ar sw-aym sw-bg sw-bn sw-ca sw-cs sw-da 
   sw-de sw-el sw-en sw-eo sw-es sw-fa sw-fil sw-fr sw-he sw-hi sw-hu sw-id sw-it sw-jp sw-km sw-ko 
   sw-mg sw-mk sw-my sw-ne sw-nl sw-pa sw-pl sw-pt sw-ro sw-ru sw-sq sw-sr sw-sv tet-ar tet-aym 
   tet-bn tet-cs tet-de tet-el tet-en tet-es tet-fr tet-id tet-it tet-mg tet-pt tet-ru tet-sw tr-am 
   tr-ar tr-aym tr-bg tr-bn tr-ca tr-cs tr-da tr-de tr-el tr-en tr-eo tr-es tr-fa tr-fil tr-fr tr-he 
   tr-hi tr-hu tr-id tr-it tr-jp tr-ko tr-mg tr-mk tr-my tr-ne tr-nl tr-pa tr-pl tr-pt tr-ro tr-ru 
   tr-sq tr-sr tr-sv tr-sw ur-am ur-ar ur-aym ur-bg ur-bn ur-ca ur-cs ur-da ur-de ur-el ur-en ur-eo 
   ur-es ur-fa ur-fil ur-fr ur-he ur-hi ur-hu ur-id ur-it ur-jp ur-ko ur-mg ur-mk ur-my ur-ne ur-nl 
   ur-or ur-pa ur-pl ur-pt ur-ro ur-ru ur-sq ur-sr ur-sv ur-sw ur-tr yo-ar yo-el yo-en yo-es yo-fr 
   yo-it yo-mg yo-pl yo-pt yo-ru yo-sw zhs-am zhs-ar zhs-aym zhs-bg zhs-bn zhs-ca zhs-cs zhs-da 
   zhs-de zhs-el zhs-en zhs-eo zhs-es zhs-fa zhs-fil zhs-fr zhs-he zhs-hi zhs-hu zhs-id zhs-it 
   zhs-jp zhs-km zhs-ko zhs-mg zhs-mk zhs-my zhs-ne zhs-nl zhs-pa zhs-pl zhs-pt zhs-ro zhs-ru 
   zhs-sq zhs-sr zhs-sv zhs-sw zhs-tr zhs-ur zht-am zht-ar zht-aym zht-bg zht-bn zht-ca zht-cs 
   zht-da zht-de zht-el zht-en zht-eo zht-es zht-fa zht-fil zht-fr zht-he zht-hi zht-hu zht-id 
   zht-it zht-jp zht-km zht-ko zht-mg zht-mk zht-my zht-ne zht-nl zht-pa zht-pl zht-pt zht-ro 
   zht-ru zht-sq zht-sr zht-sv zht-sw zht-tet zht-tr zht-ur zht-zhs"""
    url = 'http://casmacat.eu/corpus/global-voices-tar-balls/training.tgz'
    cite = """Philipp Koehn, "Global Voices Corpus" http://casmacat.eu/corpus/global-voices.html """

    # any hot fixes for language-id mapping specific to this source
    code_map = {
        'jp': 'jpn',   # 'jp' was never an ISO 639 code; the two-letter code for Japanese is 'ja'
        'zhs': 'zho',  # map Simplified Chinese to 'zho'
    }
    code_map = code_map.get
    for pair in data.split():
        if 'zht' in pair:
            continue  # skipping traditional Chinese because I don't know the ISO code for it
        l1, l2 = pair.split('-')
        f1 = f'training/globalvoices.{l1}-{l2}.{l1}'
        f2 = f'training/globalvoices.{l1}-{l2}.{l2}'
        l1, l2 = code_map(l1, l1), code_map(l2, l2)  # map codes
        ent = Entry(langs=(l1, l2),
                    name='GlobalVoices_2018Q4',
                    url=url,
                    filename='GlobalVoices_2018Q4-training.tgz',
                    in_ext='txt',
                    cite=cite,
                    in_paths=[f1, f2])
        index.add_entry(ent)
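The GlobalVoices loader binds code_map to the dict's .get method, so every lookup falls back to the original code when no remapping exists. A standalone sketch of that idiom:

code_map = {'jp': 'jpn', 'zhs': 'zho'}.get
assert code_map('jp', 'jp') == 'jpn'   # remapped
assert code_map('fr', 'fr') == 'fr'    # unknown codes pass through unchanged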