def load(index: Index):
    # === Para crawl corpus
    PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz'
    cite = r"""@inproceedings{espla-etal-2019-paracrawl,
    title = "{P}ara{C}rawl: Web-scale parallel corpora for the languages of the {EU}",
    author = "Espl{\`a}, Miquel and Forcada, Mikel and Ram{\'\i}rez-S{\'a}nchez, Gema and Hoang, Hieu",
    booktitle = "Proceedings of Machine Translation Summit XVII Volume 2: Translator, Project and User Tracks",
    month = aug,
    year = "2019",
    address = "Dublin, Ireland",
    publisher = "European Association for Machine Translation",
    url = "https://www.aclweb.org/anthology/W19-6721",
    pages = "118--119",
}"""
    for pair in ['en cs', 'en de', 'en fi', 'en lt']:
        l1, l2 = pair.split()
        index.add_entry(Entry(langs=(l1, l2), name='paracrawl_v3',
                              url=PARACRAWL_v3 % (l1, l2), cite=cite))

    # === Paracrawl V6
    PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz'
    for l2 in ['is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga',
               'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv']:
        l1 = 'en'
        index.add_entry(Entry(langs=(l1, l2), name='paracrawl_v6',
                              url=PARACRAWL_v6 % (l1, l2), cite=cite, ext='tsv.gz'))

    # these are bonus
    PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz'
    for l1, l2 in [('nl', 'fr'), ('pl', 'de')]:
        index.add_entry(Entry(langs=(l1, l2), name='paracrawl_v6',
                              url=PARACRAWL_v6_B % (l1, l2), cite=cite, ext='tsv.gz'))
def load_all(index: Index): cite = """@inproceedings{post-etal-2012-constructing, title = "Constructing Parallel Corpora for Six {I}ndian Languages via Crowdsourcing", author = "Post, Matt and Callison-Burch, Chris and Osborne, Miles", booktitle = "Proceedings of the Seventh Workshop on Statistical Machine Translation", month = jun, year = "2012", address = "Montr{\'e}al, Canada", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W12-3152", pages = "401--409", }""" url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz' l2 = 'en' langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta'] for l1 in langs: for split in ['training', 'dev', 'test', 'devtest', 'dict']: if l1 == 'hi' and split == 'dict': continue # hindi dont have dict f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}' f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}' if split not in ('training', 'dict'): f2 += '.0' ent = Entry(langs=(l1, l2), url=url, name=f'JoshuaIndianCorpus_{split}', filename='joshua-indian-parallel-corpora.tar.gz', in_paths=[f1, f2], in_ext='txt', cite=cite) index.add_entry(ent)
def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
    path = self.cache.get_entry(entry)
    swap = entry.is_swap(self.langs)
    parser = Parser(path, langs=self.langs, ext=entry.in_ext or None, ent=entry)
    langs = '_'.join(self.langs)
    l1 = (dir_path / f'{entry.name}-{langs}').with_suffix(f'.{self.langs[0]}')
    l2 = (dir_path / f'{entry.name}-{langs}').with_suffix(f'.{self.langs[1]}')
    mode = dict(mode='w', encoding='utf-8', errors='ignore')
    with l1.open(**mode) as f1, l2.open(**mode) as f2:
        count, skips, noise = 0, 0, 0
        for rec in parser.read_segs():
            rec = rec[:2]  # keep only the first two fields
            if len(rec) != 2:
                skips += 1
                continue
            if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                skips += 1
                noise += 1
                continue
            sent1, sent2 = [s.strip() for s in rec]
            if not sent1 or not sent2:
                skips += 1
                continue
            if swap:
                sent2, sent1 = sent1, sent2
            sent1 = sent1.replace('\n', ' ').replace('\t', ' ')
            sent2 = sent2.replace('\n', ' ').replace('\t', ' ')
            f1.write(f'{sent1}\n')
            f2.write(f'{sent2}\n')
            count += 1
        msg = f'Looks like an error. {count} segs are valid; {skips} are invalid: {entry}'
        assert count > 0, msg
        if skips > count:
            log.warning(msg)
        if noise > 0:
            log.info(f"{entry}: Noise : {noise:,}/{count:,} => {100 * noise / count:.4f}%")
    log.info(f"wrote {count} lines to {l1} == {l2}")
    return count, skips
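# Example (not part of the original module): a minimal sketch of how add_part() might be
# driven when preparing training files. `ds` is assumed to be a dataset-like object that
# exposes the add_part() method above, and `entries` an iterable of selected Entry objects;
# both names (and write_train_parts itself) are hypothetical, for illustration only.
def write_train_parts(ds, entries, out_dir: Path, drop_noise=True):
    out_dir.mkdir(parents=True, exist_ok=True)
    total_ok, total_skips = 0, 0
    for entry in entries:
        count, skips = ds.add_part(dir_path=out_dir, entry=entry, drop_noise=drop_noise)
        total_ok += count
        total_skips += skips
    log.info(f"prepared {total_ok:,} segments; skipped {total_skips:,}")
    return total_ok, total_skips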
def load_all(index: Index):
    with open(REFS_FILE, encoding='utf-8') as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split('\t', maxsplit=8)
            dataset_name = short.lower().replace(':', '_').replace('__', '_').replace('__', '_')
            in_paths = in_paths.strip().split('\t')
            ent = Entry(did=DatasetId(group='ELRC', name=dataset_name, version='1', langs=(l1, l2)),
                        url=download, filename="ELRC_" + str(num) + ".zip",
                        in_ext='tmx', in_paths=in_paths)
            index.add_entry(ent)
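# Illustration (assumed layout, inferred from the split() above): each line of REFS_FILE is
# expected to be tab-separated with at least nine columns,
#   <l1>\t<l2>\t<num>\t<short>\t<name>\t<info>\t<download-url>\t<licenses>\t<in_path>[\t<in_path>...]
# where everything from the ninth column onward is an in-archive path. A small sanity check
# over such a file (helper name is hypothetical) could be:
def check_refs_file(path):
    with open(path, encoding='utf-8') as lines:
        for i, line in enumerate(lines, start=1):
            cols = line.rstrip('\n').split('\t')
            assert len(cols) >= 9, f'line {i}: expected >= 9 tab-separated columns, found {len(cols)}'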
def load_all(index: Index):
    URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz"
    cite = index.ref_db.get_bibtex('zhang-etal-2020-improving')
    cite += '\n\n' + index.ref_db.get_bibtex('tiedemann2012parallel')
    filename = 'opus-100-corpus-v1.0.tar.gz'
    # these codes aren't obvious to the ISO lookup function, so help it along
    code_map = dict(nb='nob', sh='hbs')
    group, name = 'OPUS', 'opus100'
    for pair in supervised_v1:
        l1, l2 = pair.split("-")
        l1 = code_map.get(l1, l1)
        l2 = code_map.get(l2, l2)
        splits = ['train', 'dev', 'test']
        if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}:
            splits = ['train']  # the release is missing dev/test sets for these pairs
        for split in splits:
            f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}'
            f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}'
            ent = Entry(did=DatasetId(group=group, name=f'{name}_{split}', version='1', langs=(l1, l2)),
                        url=URL, filename=filename, in_paths=[f1, f2], in_ext='txt', cite=cite)
            index.add_entry(ent)
    for pair in zeroshot_v1:
        l1, l2 = pair.split("-")
        f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}'
        f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}'
        ent = Entry(did=DatasetId(group=group, name=f'{name}_test', version='1', langs=(l1, l2)),
                    url=URL, filename=filename, in_paths=[f1, f2], in_ext='txt', cite=cite)
        index.add_entry(ent)
def load_all(index: Index):
    url_ptn = 'https://www.dropbox.com/s/{uid}/wikititles-2014_{l1}{l2}.tgz?dl=1'
    rows = [row.split(',') for row in wiki_titles.splitlines()]
    for row in rows:
        uid, pair = row
        assert len(pair) == 4
        l1, l2 = pair[:2], pair[2:]
        url = url_ptn.format(uid=uid, l1=l1, l2=l2)
        in_file = f'wikititles-2014_{l1}{l2}'
        ent = Entry(did=DatasetId(group='LinguaTools', name='wikititles', version='2014', langs=(l1, l2)),
                    url=url, ext='tgz', in_ext='txt',
                    in_paths=[f'{in_file}.{l1}', f'{in_file}.{l2}'])
        index.add_entry(ent)
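# Illustration (hypothetical values): each row of `wiki_titles` above is expected to be
# "<dropbox-uid>,<l1><l2>"; for example a row "abcd1234,ende" would expand to
#   https://www.dropbox.com/s/abcd1234/wikititles-2014_ende.tgz?dl=1
# with in-archive files wikititles-2014_ende.en and wikititles-2014_ende.de.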
def load_all(index: Index):
    with open(REFS_FILE) as data:
        for line in data:
            l1, l2, num, short, name, info, download, licenses, in_paths = line.split('\t', maxsplit=8)
            in_paths = in_paths.strip().split('\t')
            ent = Entry(langs=(l1, l2), url=download, name="ELRC_" + short,
                        filename="ELRC_" + str(num) + ".zip", in_ext='tmx', in_paths=in_paths)
            index.add_entry(ent)
def load_all(index: Index):
    # === ECDC ===
    # https://ec.europa.eu/jrc/en/language-technologies/ecdc-translation-memory
    cite = index.ref_db.get_bibtex('Steinberger2014')
    langs = 'en bg cs da de el es et fi fr ga hu is it lt lv mt nl no pl pt ro sk sl sv'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(langs=(l1, l2), url="http://optima.jrc.it/Resources/ECDC-TM/ECDC-TM.zip",
                        name="ECDC", in_ext='tmx', cite=cite, in_paths=["ECDC-TM/ECDC.tmx"])
            index.add_entry(ent)

    # === EAC ===
    # https://ec.europa.eu/jrc/en/language-technologies/eac-translation-memory
    # This corpus has two parts: forms and reference data
    langs = 'bg cs da de el en es et fi fr hu is it lt lv mt nb nl pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Forms", in_ext='tmx', cite=cite, in_paths=["EAC_FORMS.tmx"])
            index.add_entry(ent)

    langs = 'bg cs da de el en es et fi fr hr hu is it lt lv mt nl no pl pt ro sk sl sv tr'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(langs=(l1, l2), url="https://wt-public.emm4u.eu/Resources/EAC-TM/EAC-TM-all.zip",
                        name="EAC_Reference", in_ext='tmx', cite=cite, in_paths=["EAC_REFRENCE_DATA.tmx"])
            index.add_entry(ent)

    # === DCEP ===
    # https://ec.europa.eu/jrc/en/language-technologies/dcep
    # This was annoying to process, so I ended up rehosting it.
    # Don't bother with TR; it doesn't have sentences anyway.
    cite = index.ref_db.get_bibtex('dcep')
    langs = 'BG CS DA DE EL EN ES ET FI FR GA HU IT LT LV MT NL PL PT RO SK SL SV'.split()
    for i, l1 in enumerate(langs):
        for l2 in langs[i + 1:]:
            ent = Entry(langs=(l1, l2), url=f"http://data.statmt.org/DCEP/{l1}-{l2}.tsv.xz",
                        name="DCEP", in_ext='tsv', cite=cite)
            index.add_entry(ent)
def load_all(index: Index):
    url_pat = 'https://object.pouta.csc.fi/OPUS-{corpus}/{version}/moses/{l1}-{l2}.txt.zip'
    group_id = 'OPUS'
    citation = index.ref_db.get_bibtex('tiedemann2012parallel')
    skip_counts = defaultdict(int)
    dupes = defaultdict(set)
    assert data_file.exists()
    assert data_file.stat().st_size > 0
    with data_file.open() as lines:
        for line in lines:
            line = line.strip()
            if not line:  # skip empty lines at the top and bottom
                continue
            assert len(line.split('\t')) == 4, line
            corpus, version, l1, l2 = line.split('\t')
            url = url_pat.format(corpus=corpus, version=version, l1=l1, l2=l2)
            iso_l1, iso_l2 = bcp47.try_parse(l1, default=None), bcp47.try_parse(l2, default=None)
            if not iso_l1 or not iso_l2:
                if not iso_l1:
                    skip_counts[str(l1)] += 1
                if not iso_l2:
                    skip_counts[str(l2)] += 1
                continue
            version_cln = version.replace('-', '').lower()
            corpus_cln = corpus.replace('-', '_').lower()
            data_id = DatasetId(group=group_id, name=corpus_cln, version=version_cln, langs=(iso_l1, iso_l2))
            if data_id in index:
                dupes[corpus].add(f'{l1}-{l2}')
                continue
            entry = Entry(did=data_id, url=url, cite=citation, in_paths=[f'*.{l1}', f'*.{l2}'], in_ext='txt')
            index.add_entry(entry)
    if skip_counts:
        skip_counts = list(sorted(dict(skip_counts).items(), key=lambda x: x[1], reverse=True))
        log.info(f"Skipped lang counts: {skip_counts}")
    if dupes:
        log.info(f"Duplicate langs: {dupes}")
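# Illustration (hypothetical rows, not the real listing): `data_file` is expected to hold one
# OPUS corpus per line as "<corpus>\t<version>\t<l1>\t<l2>", for example
#   GNOME    v1             en    fr
#   Tatoeba  v2021-03-10    de    eo
# Rows whose language codes bcp47.try_parse() cannot resolve are tallied in skip_counts and
# skipped; (corpus, pair) combinations already present in the index are reported as dupes.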
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(langs=(l1, l2), url=url, filename='UNv1.0.testsets.tar.gz',
                        name=f'UNv1_{split}', in_ext='txt', in_paths=[f1, f2], cite=cite)
            index.add_entry(ent)
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ramesh2021samanantar')
    pairs = ('en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi'
             ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa'
             ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or'
             ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te'
             ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te')
    BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs.strip().split(' '):
        l1, l2 = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        url = BASE_v0_2.format(dirname=dirname, pair=pair)
        ent = Entry(langs=(l1, l2), name='AI4B_Samananthar_v02', url=url, cite=cite,
                    in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'], in_ext='txt')
        index.add_entry(ent)
def load_all(index: Index):
    lines = data_file.read_text(encoding='utf-8').splitlines()
    langs = set('hi bn ta ml te kn mr pa gu as ur or'.split())  # languages other than en
    group_id = 'Anuvaad'
    cite_txt = index.ref_db.get_bibtex('project-anuvaad')
    for url in lines:
        url = url.strip()
        assert url.startswith('http') and url.endswith('.zip')
        file_name = url.split('/')[-1]
        file_name = file_name[:-4]  # drop the .zip extension
        char_count = coll.Counter(list(file_name))
        n_hyps = char_count.get('-', 0)
        n_unders = char_count.get('_', 0)
        if n_hyps > n_unders:
            parts = file_name.split('-')
        else:
            assert '_' in file_name
            parts = file_name.split('_')
        name, version = '?', '?'
        l1, l2 = 'en', '?'
        if parts[-2] == l1 and parts[-1] in langs:
            l2 = parts[-1]
            version = parts[-3]
        elif parts[-3] == l1 and parts[-2] in langs:
            l2 = parts[-2]
            version = parts[-1]
        else:
            log.warning(f"Unable to parse {file_name} :: {parts}")
            continue
        name = '_'.join(parts[:-3])
        name = name.replace('-', '_')
        f1 = f'{l1}-{l2}/*.{l1}'
        f2 = f'{l1}-{l2}/*.{l2}'
        if name == 'wikipedia':
            f1 = f'{l1}-{l2}/{l1}.txt'
            f2 = f'{l1}-{l2}/{l2}.txt'
        ent = Entry(did=DatasetId(group=group_id, name=name, version=version, langs=(l1, l2)),
                    url=url, ext='zip', in_ext='txt', in_paths=[f1, f2], cite=cite_txt)
        index.add_entry(ent)
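# Worked example (hypothetical file name) for the parsing above: a URL ending in
# "anuvaad_legal_terminologies_20210329_en_ta.zip" has more underscores than hyphens, so it
# splits into parts = ['anuvaad', 'legal', 'terminologies', '20210329', 'en', 'ta']; the first
# branch matches (parts[-2] == 'en', parts[-1] == 'ta'), giving l2='ta', version='20210329',
# and name='anuvaad_legal_terminologies'.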
def load_all(index: Index):
    cite = index.ref_db.get_bibtex('ziemski-etal-2016-united')
    url = "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.testsets.tar.gz"
    url = "https://drive.google.com/uc?export=download&id=13GI1F1hvwpMUGBSa0QC6ov4eE57GC_Zx"  # they changed it!
    langs = ['en', 'ar', 'fr', 'es', 'ru', 'zh']
    for split in ['dev', 'test']:
        for l1, l2 in itertools.combinations(langs, 2):
            f1 = f'testsets/{split}set/UNv1.0.{split}set.{l1}'
            f2 = f'testsets/{split}set/UNv1.0.{split}set.{l2}'
            ent = Entry(did=DatasetId(group='UN', name=f'un_{split}', version='1', langs=(l1, l2)),
                        url=url, filename='UNv1.0.testsets.tar.gz', in_ext='txt',
                        in_paths=[f1, f2], cite=cite)
            index.add_entry(ent)
def load_all(index: Index):
    cite = index.ref_db.get_bibtex(key='post-etal-2012-constructing')
    url = 'https://github.com/joshua-decoder/indian-parallel-corpora/archive/a2cd1a99.tar.gz'
    l2 = 'en'
    langs = ['ml', 'hi', 'ur', 'bn', 'te', 'ta']
    for l1 in langs:
        for split in ['training', 'dev', 'test', 'devtest', 'dict']:
            if l1 == 'hi' and split == 'dict':
                continue  # Hindi does not have a dict split
            f1 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l1}'
            f2 = f'*/{l1}-{l2}/{split}.{l1}-{l2}.{l2}'
            if split not in ('training', 'dict'):
                f2 += '.0'
            ent = Entry(langs=(l1, l2), url=url, name=f'JoshuaIndianCorpus_{split}',
                        filename='joshua-indian-parallel-corpora.tar.gz',
                        in_paths=[f1, f2], in_ext='txt', cite=cite)
            index.add_entry(ent)
def get_stats(self, entry: Entry):
    path = self.get_entry(entry)
    parser = Parser(path, ext=entry.in_ext or None, ent=entry)
    count, skips, noise = 0, 0, 0
    toks = [0, 0]
    chars = [0, 0]
    for rec in parser.read_segs():
        if len(rec) < 2 or not rec[0] or not rec[1]:
            skips += 1
            continue
        if entry.is_noisy(seg1=rec[0], seg2=rec[1]):
            noise += 1
            skips += 1
            continue
        count += 1
        s1, s2 = rec[:2]  # take only the first two fields
        chars[0] += len(s1)
        chars[1] += len(s2)
        s1_tok, s2_tok = s1.split(), s2.split()
        toks[0] += len(s1_tok)
        toks[1] += len(s2_tok)
    l1, l2 = entry.did.langs
    l1, l2 = l1.lang, l2.lang
    assert count > 0, f'No valid records are found for {entry.did}'
    if l2 < l1:  # report stats keyed by the lexicographically smaller language first
        l1, l2 = l2, l1
        toks = toks[1], toks[0]
        chars = chars[1], chars[0]
    return {
        'id': str(entry.did),
        'segs': count,
        'segs_err': skips,
        'segs_noise': noise,
        f'{l1}_toks': toks[0],
        f'{l2}_toks': toks[1],
        f'{l1}_chars': chars[0],
        f'{l2}_chars': chars[1],
    }
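# Illustration (made-up numbers, hypothetical dataset id): for a deu-eng entry the dict
# returned by get_stats() looks roughly like
#   {'id': 'OPUS-gnome-v1-deu-eng', 'segs': 1000, 'segs_err': 12, 'segs_noise': 3,
#    'deu_toks': 8200, 'eng_toks': 9100, 'deu_chars': 52000, 'eng_chars': 45000}
# with the *_toks / *_chars keys ordered by the lexicographically smaller language code.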
def load_all(index: Index): URL = "https://object.pouta.csc.fi/OPUS-100/v1.0/opus-100-corpus-v1.0.tar.gz" cite = """ @inproceedings{zhang-etal-2020-improving, title = "Improving Massively Multilingual Neural Machine Translation and Zero-Shot Translation", author = "Zhang, Biao and Williams, Philip and Titov, Ivan and Sennrich, Rico", booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", month = jul, year = "2020", address = "Online", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/2020.acl-main.148", doi = "10.18653/v1/2020.acl-main.148", pages = "1628--1639", } @inproceedings{tiedemann2012parallel, title={Parallel Data, Tools and Interfaces in OPUS.}, author={Tiedemann, J{\"o}rg}, booktitle={Lrec}, volume={2012}, pages={2214--2218}, year={2012} }""" filename = 'opus-100-corpus-v1.0.tar.gz' code_map = dict( nb='nob', sh='hbs') # these arent obvious to iso lookup function, so helping for pair in supervised_v1: l1, l2 = pair.split("-") l1 = code_map.get(l1, l1) l2 = code_map.get(l2, l2) splits = ['train', 'dev', 'test'] if pair in {'an-en', 'en-yo', 'dz-en', 'en-hy', 'en-mn'}: splits = ['train' ] # somehow they forgot to include test sets for these for split in splits: f1 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l1}' f2 = f'opus-100-corpus/v1.0/supervised/{l1}-{l2}/opus.{l1}-{l2}-{split}.{l2}' ent = Entry(langs=(l1, l2), url=URL, name=f'OPUS100v1_{split}', filename=filename, in_paths=[f1, f2], in_ext='txt', cite=cite) index.add_entry(ent) for pair in zeroshot_v1: l1, l2 = pair.split("-") f1 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l1}' f2 = f'opus-100-corpus/v1.0/zero-shot/{l1}-{l2}/opus.{l1}-{l2}-test.{l2}' ent = Entry(langs=(l1, l2), url=URL, name=f'OPUS100v1_test', filename=filename, in_paths=[f1, f2], in_ext='txt', cite=cite) index.add_entry(ent)
def load(index: Index): cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl') cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl') group_id = 'ParaCrawl' # === Para crawl corpus PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz' for pair in ['en cs', 'en de', 'en fi', 'en lt']: l1, l2 = pair.split() index.add_entry( Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='3', langs=(l1, l2)), url=PARACRAWL_v3 % (l1, l2), cite=cite)) # === Paracrawl V6 PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz' for l2 in [ 'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv' ]: l1 = 'en' index.add_entry( Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='6', langs=(l1, l2)), url=PARACRAWL_v6 % (l1, l2), cite=cite, ext='tsv.gz')) # these are bonus PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz' for l1, l2 in [('nl', 'fr'), ('pl', 'de')]: index.add_entry( Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='6B', langs=(l1, l2)), url=PARACRAWL_v6_B % (l1, l2), cite=cite, ext='tsv.gz')) l1 = 'en' PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz' for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split( ): index.add_entry( Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='7.1', langs=(l1, l2)), url=PARACRAWL_v7_1 % (l1, l2), cite=cite, ext='tsv.gz')) PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz' for pair in 'en-nb en-nn es-ca es-eu es-gl'.split(): l1, l2 = pair.split('-') index.add_entry( Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='7.1', langs=(l1, l2)), url=PARACRAWL_v7_1 % (l1, l2), cite=cite, ext='tsv.gz')) PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz' for version, pairs in [ ('v8.0', 'en-bg en-cs en-da en-de en-el'), ('v8.0-0001', 'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl' ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl') ]: for pair in pairs.split(): l1, l2 = pair.split('-') url = PARACRAWL_V8.format(version=version, pair=pair) ent = Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='8', langs=(l1, l2)), url=url, cite=cite, ext='tsv.gz') index.add_entry(ent) PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz' for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split( ): l1, l2 = pair.split('-') url = PARACRAWL_BONUS.format(pair=pair) ent = Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='1_bonus', langs=(l1, l2)), url=url, cite=cite, ext='tsv.gz') index.add_entry(ent) PARACRAWL_V9 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release9/{l1}-{l2}/{l1}-{l2}.txt.gz' for pair in ( 'en-bg en-cs en-da en-de en-el en-es en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv' ' en-mt en-nb en-nl en-nn en-pl en-pt en-ro en-sk en-sl en-sv es-ca es-eu es-gl' ).split(): l1, l2 = pair.split('-') url = PARACRAWL_V9.format(l1=l1, l2=l2) ent = Entry(did=DatasetId(group=group_id, name=f'paracrawl', version='9', langs=(l1, l2)), url=url, cite=cite, ext='tsv.gz') index.add_entry(ent) # this is a new addition in Sept 2021 index.add_entry( Entry( 
did=DatasetId(group=group_id, name=f'paracrawl', version='1_bonus', langs=('en', 'zh')), url= 'http://web-language-models.s3-website-us-east-1.amazonaws.com/paracrawl/bonus/en-zh-v1.txt.gz', cite=cite, ext='tsv.gz')) # Japanese-English paracrawl (5.1) used by WMT20 and WMT21 for version in ['2', '3']: ent = Entry( did=DatasetId(group='KECL', name=f'paracrawl', version=version, langs=('eng', 'jpn')), in_paths=['en-ja/en-ja.bicleaner05.txt'], in_ext='tsv', cols=(2, 3), cite='', url= f'http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/{version}.0/bitext/en-ja.tar.gz' ) index.add_entry(ent)
def load(index: Index): WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz" WMT14_CITE = """@proceedings{ws-2014-statistical, title = "Proceedings of the Ninth Workshop on Statistical Machine Translation", editor = "Bojar, Ond{\v{r}}ej and Buck, Christian and Federmann, Christian and Haddow, Barry and Koehn, Philipp and Monz, Christof and Post, Matt and Specia, Lucia", month = jun, year = "2014", address = "Baltimore, Maryland, USA", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W14-3300", doi = "10.3115/v1/W14-33", }""" for l1 in ['de', 'cs', 'fr', 'ru', 'es']: l2 = 'en' f1 = f'commoncrawl.{l1}-en.{l1}' f2 = f'commoncrawl.{l1}-en.en' index.add_entry( Entry(langs=(l1, l2), name=f'wmt13_commoncrawl', url=WMT13_CCRAWL, filename='wmt13_parallel_commoncrawl.tgz', in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # === WMT 13 release of europarl_v7 === for l1 in ['cs', 'de', 'fr', 'es']: l2 = 'en' f1 = f'training/europarl-v7.{l1}-{l2}.{l1}' f2 = f'training/europarl-v7.{l1}-{l2}.{l2}' index.add_entry( Entry( langs=(l1, l2), name=f'wmt13_europarl_v7', url= "http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", filename="wmt13_europarl_v7.tgz", in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # ==== WMT 18 news commentary v13 === for l1 in ['cs', 'de', 'ru', 'zh']: l2 = 'en' f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}' f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}' index.add_entry( Entry( langs=(l1, l2), name=f'wmt18_news_commentary_v13', url= "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz", filename="wmt18_news_commentary_v13.tgz", in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # === Europarl V9 corpus EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz' cite = r"""@inproceedings{koehn2005europarl, title={Europarl: A parallel corpus for statistical machine translation}, author={Koehn, Philipp}, booktitle={MT summit}, volume={5}, pages={79--86}, year={2005}, organization={Citeseer} }""" for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='europarl_v9', url=EUROPARL_v9 % (l1, l2), cite=cite)) # === Europarl V7 corpus EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz' cite = r"""@inproceedings{bojar-etal-2017-findings, title = "Findings of the 2017 Conference on Machine Translation ({WMT}17)", author = "Bojar, Ond{\v{r}}ej and Chatterjee, Rajen and Federmann, Christian and Graham, Yvette and Haddow, Barry and Huang, Shujian and Huck, Matthias and Koehn, Philipp and Liu, Qun and Logacheva, Varvara and Monz, Christof and Negri, Matteo and Post, Matt and Rubino, Raphael and Specia, Lucia and Turchi, Marco", booktitle = "Proceedings of the Second Conference on Machine Translation", month = sep, year = "2017", address = "Copenhagen, Denmark", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W17-4717", doi = "10.18653/v1/W17-4717", pages = "169--214", }""" for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split( ): l2 = 'en' src = f'europarl-v7.{l1}-{l2}.{l1}' ref = f'europarl-v7.{l1}-{l2}.{l2}' index.add_entry( Entry(langs=(l1, l2), name='europarl_v7', in_paths=[src, ref], url=EUROPARL_v7 % (l1, l2), in_ext='txt', cite=cite)) # === Digital Corpus of European Parliament index.add_entry( Entry( langs=('lv', 'en'), name='wmt17_dcep_v1', 
in_paths=['*/*.lv', f'*/*.en'], cite=cite, url= 'http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz')) index.add_entry( Entry( langs=('lv', 'en'), name='wmt17_books_v1', in_paths=['*/*.lv', f'*/*.en'], cite=cite, url= 'http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz') ) # === News Commentary v14 NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz" cite = r"""@inproceedings{bojar-etal-2018-findings, title = "Findings of the 2018 Conference on Machine Translation ({WMT}18)", author = "Bojar, Ond{\v{r}}ej and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Koehn, Philipp and Monz, Christof", booktitle = "Proceedings of the Third Conference on Machine Translation: Shared Task Papers", month = oct, year = "2018", address = "Belgium, Brussels", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W18-6401", doi = "10.18653/v1/W18-6401", pages = "272--303" }""" for pair in [ 'ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it', 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk', 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it', 'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja', 'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it', 'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk', 'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh' ]: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='news_commentary_v14', url=NEWSCOM_v14 % (l1, l2), cite=cite)) # ===== Wiki Titles V1 WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz' cite = r"""@inproceedings{barrault-etal-2019-findings, title = "Findings of the 2019 Conference on Machine Translation ({WMT}19)", author = {Barrault, Lo{\"\i}c and Bojar, Ond{\v{r}}ej and Costa-juss{\`a}, Marta R. 
and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Malmasi, Shervin and Monz, Christof and M{\"u}ller, Mathias and Pal, Santanu and Post, Matt and Zampieri, Marcos}, booktitle = "Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)", month = aug, year = "2019", address = "Florence, Italy", publisher = "Association for Computational Linguistics", url = "https://www.aclweb.org/anthology/W19-5301", doi = "10.18653/v1/W19-5301", pages = "1--61" }""" for pair in [ 'cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne', 'kk en', 'lt en', 'ru en', 'zh en' ]: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='wiki_titles_v1', url=WIKI_TITLES_v1 % (l1, l2), cite=cite)) # ===== Wiki Titles V2 WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz' for pair in [ 'ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en', 'pl en', 'ps en', 'ru en', 'ta en', 'zh en' ]: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='wiki_titles_v2', url=WIKI_TITLES_v2 % (l1, l2), cite=cite)) # ==== WMT Dev and Tests wmt_sets = { 'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'), ('ru', 'en'), ('hi', 'en')], 'newsdev2015': [('fi', 'en'), ('en', 'fi')], 'newstest2015': [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'), ('de', 'en'), ('ru', 'en'), ('en', 'fi')], 'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'), ('en', 'tr')], 'newstest2016': [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 'fi'), ('ro', 'en'), ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'), ('en', 'tr'), ('en', 'cs')], 'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'), ('en', 'lv')], 'newstest2017': [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'), ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'), ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')], 'newsdev2018': [('et', 'en'), ('en', 'et')], 'newstest2018': [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'), ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'), ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')], 'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'), ('lt', 'en'), ('en', 'gu')], 'newstest2019': [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'), ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'), ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'), ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')], 'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'), ('en', 'iu'), ('en', 'ja'), ('ja', 'en'), ('en', 'pl')] } for set_name, pairs in wmt_sets.items(): for l1, l2 in pairs: src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm' ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm' name = f'{set_name}_{l1}{l2}' index.add_entry( Entry( (l1, l2), name=name, filename='wmt20dev.tgz', in_paths=[src, ref], url='http://data.statmt.org/wmt20/translation-task/dev.tgz', cite=cite)) # Multi parallel wmt_sets = { '2009': ['en', 'cs', 'de', 'es', 'fr'], '2010': ['en', 'cs', 'de', 'es', 'fr'], '2011': ['en', 'cs', 'de', 'es', 'fr'], '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'], '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'], } for year, langs in wmt_sets.items(): for l1, l2 in itertools.combinations(langs, 2): name = f'newstest{year}' f1 = f'dev/{name}.{l1}' f2 = f'dev/{name}.{l2}' index.add_entry( 
Entry( (l1, l2), name=name, filename='wmt20dev.tgz', in_paths=[f1, f2], in_ext='txt', cite=cite, url='http://data.statmt.org/wmt20/translation-task/dev.tgz' )) for l1, l2 in [('ps', 'en'), ('km', 'en')]: for set_name in ['wikipedia.dev', 'wikipedia.devtest']: src = f'dev/{set_name}.{l1}-{l2}.{l1}' ref = f'dev/{set_name}.{l1}-{l2}.{l2}' name = f'{set_name.replace(".", "_")}_{l1}{l2}' index.add_entry( Entry( (l1, l2), name=name, filename='wmt20dev.tgz', in_paths=[src, ref], url='http://data.statmt.org/wmt20/translation-task/dev.tgz', in_ext='txt', cite=cite)) # ==== TED Talks 2.0 ar-en index.add_entry( Entry( ('en', 'ar'), 'tedtalks_v2_clean', ext='tsv.xz', url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz')) # ==== Europarl v10 EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz" wmt20_cite = None # TODO: update for pair in [ 'cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en', 'lt en', 'pl en' ]: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='europarl_v10', url=EP_v10 % (l1, l2), cite=wmt20_cite)) # ==== PMIndia V1 PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv" cite = r"""@ARTICLE{2020arXiv200109907H, author = {{Haddow}, Barry and {Kirefu}, Faheem}, title = "{PMIndia -- A Collection of Parallel Corpora of Languages of India}", journal = {arXiv e-prints}, keywords = {Computer Science - Computation and Language}, year = "2020", month = "Jan", eid = {arXiv:2001.09907}, pages = {arXiv:2001.09907}, archivePrefix = {arXiv}, eprint = {2001.09907} }""" for pair in [ "as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en", "mr en", "or en", "pa en", "ta en", "te en", "ur en" ]: l1, l2 = pair.split() # Note: listed as xx-en in URL but actually en-xx in the tsv; and its not compressed! index.add_entry( Entry(langs=(l2, l1), name='pmindia_v1', url=PMINDIA_v1 % (l1, l2), cite=cite)) # Pashto - English pseudo parallel dataset for alignment index.add_entry( Entry( langs=('en', 'ps'), name='wmt20_enps_aligntask', url= 'http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz', cite=wmt20_cite, ext='tsv.xz')) # Pashto - English mostly parallel dataset for name in [ "GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps", "bible.en-ps.clean", "ted-wmt20.en-ps", "wikimedia.en-ps" ]: ps = f'ps-parallel/{name}.ps' en = f'ps-parallel/{name}.en' url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz' name = name.replace('.en-ps', '').replace('.', '_').replace('-', '_').lower() entry = Entry(langs=('ps', 'en'), name=name, url=url, cite=wmt20_cite, in_paths=[ps, en], filename='wmt20-psen-parallel.tgz', in_ext='txt') index.add_entry(entry)
def load(index: Index): cite = index.ref_db.get_bibtex('espla-etal-2019-paracrawl') cite += '\n' + index.ref_db.get_bibtex('banon-etal-2020-paracrawl') # === Para crawl corpus PARACRAWL_v3 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/%s-%s.bicleaner07.tmx.gz' for pair in ['en cs', 'en de', 'en fi', 'en lt']: l1, l2 = pair.split() index.add_entry( Entry(langs=(l1, l2), name='paracrawl_v3', url=PARACRAWL_v3 % (l1, l2), cite=cite)) # === Paracrawl V6 PARACRAWL_v6 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.txt.gz' for l2 in [ 'is', 'bg', 'hr', 'cs', 'da', 'nl', 'et', 'fi', 'fr', 'de', 'el', 'hu', 'ga', 'it', 'lv', 'lt', 'mt', 'pl', 'pt', 'ro', 'sk', 'sl', 'es', 'sv' ]: l1 = 'en' index.add_entry( Entry(langs=(l1, l2), name='paracrawl_v6', url=PARACRAWL_v6 % (l1, l2), cite=cite, ext='tsv.gz')) # these are bonus PARACRAWL_v6_B = 'https://s3.amazonaws.com/web-language-models/paracrawl/release6/%s-%s.bicleaner07.txt.gz' for l1, l2 in [('nl', 'fr'), ('pl', 'de')]: index.add_entry( Entry(langs=(l1, l2), name='paracrawl_v6', url=PARACRAWL_v6_B % (l1, l2), cite=cite, ext='tsv.gz')) l1 = 'en' PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7.1/%s-%s.txt.gz' for l2 in 'bg cs da de el es et fi fr ga hr hu is it lt lv mt nl pl pt ro sk sl sv'.split( ): index.add_entry( Entry(langs=(l1, l2), name='paracrawl_v7_1', url=PARACRAWL_v7_1 % (l1, l2), cite=cite, ext='tsv.gz')) PARACRAWL_v7_1 = 'https://s3.amazonaws.com/web-language-models/paracrawl/release7/%s-%s.txt.gz' for pair in 'en-nb en-nn es-ca es-eu es-gl'.split(): l1, l2 = pair.split('-') index.add_entry( Entry(langs=(l1, l2), name='paracrawl_v7', url=PARACRAWL_v7_1 % (l1, l2), cite=cite, ext='tsv.gz')) PARACRAWL_V8 = 'https://archive.org/download/ParaCrawl-{version}/{pair}.txt.gz' for version, pairs in [ ('v8.0', 'en-bg en-cs en-da en-de en-el'), ('v8.0-0001', 'en-et en-fi en-fr en-ga en-hr en-hu en-is en-it en-lt en-lv en-mt en-nl en-no en-pl en-pt en-ro en-sk en-sl' ), ('v8.0-0002', 'en-sv es-eu'), ('v8.1-0000', 'es-ca es-gl') ]: for pair in pairs.split(): l1, l2 = pair.split('-') url = PARACRAWL_V8.format(version=version, pair=pair) ent = Entry(langs=(l1, l2), name='paracrawl_v8', url=url, cite=cite, ext='tsv.gz') index.add_entry(ent) PARACRAWL_BONUS = 'https://s3.amazonaws.com/web-language-models/paracrawl/bonus/{pair}.txt.gz' for pair in 'en-km en-my en-ne en-ps en-si en-so en-sw en-tl en-ru en-ko'.split( ): l1, l2 = pair.split('-') url = PARACRAWL_BONUS.format(pair=pair) ent = Entry(langs=(l1, l2), name='paracrawl_bonus', url=url, cite=cite, ext='tsv.gz') index.add_entry(ent)
def load_all(index: Index): # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/ cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb') l1, l2 = 'hi', 'en' for version, prefix in [ #('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'), ('v1_5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download' ) ]: # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/ # version is not explicit, but guessed from file modification time and description url = prefix + "/parallel.tgz" ent = Entry(langs=(l1, l2), url=url, filename=f'IITB{version}-hin_eng-parallel.tar.gz', name=f'IITB{version}_train', in_ext='txt', cite=cite, in_paths=[ f'parallel/IITB.en-hi.{l1}', f'parallel/IITB.en-hi.{l2}' ]) index.add_entry(ent) url = prefix + "/dev_test.tgz" for split in ['dev', 'test']: f1 = f'dev_test/{split}.{l1}' f2 = f'dev_test/{split}.{l2}' ent = Entry(langs=(l1, l2), url=url, filename=f'IITB{version}-hin_eng-dev_test.tar.gz', name=f'IITB{version}_{split}', in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent) # == Japanese == cite = index.ref_db.get_bibtex('neubig11kftt') url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" l1, l2 = 'en', 'ja' for split in ['train', 'test', 'dev', 'tune']: f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}' f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}' ent = Entry(langs=(l1, l2), url=url, filename="kftt-data-1.0.tar.gz", name=f'kftt_v1_{split}', in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent) url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip" cite = index.ref_db.get_bibtex('ding2020a') for split in ['dev', 'test', 'train']: ent = Entry(langs=('my', 'en'), url=url, name=f'WAT2020_ALT_{split}', in_ext='txt', cite=cite, filename='wat2020.my-en.zip', in_paths=[ f'wat2020.my-en/alt/{split}.alt.my', f'wat2020.my-en/alt/{split}.alt.en' ]) index.add_entry(ent) l1, l2 = 'iu', 'en' url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut') for split in ['dev', 'devtest', 'test', 'train']: path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}' if split != 'train': path_pref += '-dedup' ent = Entry(langs=(l1, l2), url=url, name=f'NunavutHansard_v3_{split}', in_ext='txt', cite=cite, filename='NunavutHansard_iuen_v3.tgz', in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}']) index.add_entry(ent) # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122 url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip" cite = index.ref_db.get_bibtex('Khresmoi') langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"] for i, l1 in enumerate(langs): for l2 in langs[i + 1:]: ent = Entry( langs=(l1, l2), url=url, name='Khresmoi_Summary_Test_v2', filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_paths=[ f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}", f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}" ], in_ext='txt') index.add_entry(ent) ent = Entry( langs=(l1, l2), url=url, name='Khresmoi_Summary_Dev_v2', filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_paths=[ f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}", f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}" ], in_ext='txt') index.add_entry(ent)
def load(index: Index): group_id = 'Statmt' WMT13_CCRAWL = "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz" WMT14_CITE = index.ref_db.get_bibtex('ws-2014-statistical') for l1 in ['de', 'cs', 'fr', 'ru', 'es']: l2 = 'en' f1 = f'commoncrawl.{l1}-en.{l1}' f2 = f'commoncrawl.{l1}-en.en' data_id = DatasetId(group=group_id, name='commoncrawl_wmt13', version='1', langs=(l1, l2)) index.add_entry(Entry(did=data_id, url=WMT13_CCRAWL, filename='wmt13_parallel_commoncrawl.tgz', in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # === WMT 13 release of europarl_v7 === for l1 in ['cs', 'de', 'fr', 'es']: l2 = 'en' f1 = f'training/europarl-v7.{l1}-{l2}.{l1}' f2 = f'training/europarl-v7.{l1}-{l2}.{l2}' data_id = DatasetId(group=group_id, name='europarl_wmt13', version='7', langs=(l1, l2)) index.add_entry(Entry(did=data_id, url="http://www.statmt.org/wmt13/training-parallel-europarl-v7.tgz", filename="wmt13_europarl_v7.tgz", in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # ==== WMT 18 news commentary v13 === for l1 in ['cs', 'de', 'ru', 'zh']: l2 = 'en' f1 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l1}' f2 = f'training-parallel-nc-v13/news-commentary-v13.{l1}-{l2}.{l2}' data_id = DatasetId(group=group_id, name='news_commentary_wmt18', version='13', langs=(l1, l2)) index.add_entry(Entry(did=data_id, url="http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz", filename="wmt18_news_commentary_v13.tgz", in_paths=[f1, f2], in_ext='txt', cite=WMT14_CITE)) # === Europarl V9 corpus EUROPARL_v9 = 'http://www.statmt.org/europarl/v9/training/europarl-v9.%s-%s.tsv.gz' cite = index.ref_db.get_bibtex('koehn2005europarl') for pair in ['de en', 'cs en', 'cs pl', 'es pt', 'fi en', 'lt en']: l1, l2 = pair.split() index.add_entry(Entry(did=DatasetId(group=group_id, name='europarl', version='9', langs=(l1, l2)), url=EUROPARL_v9 % (l1, l2), cite=cite)) # === Europarl V7 corpus EUROPARL_v7 = 'http://www.statmt.org/europarl/v7/%s-%s.tgz' cite = index.ref_db.get_bibtex('bojar-etal-2017-findings') for l1 in 'bg cs da de el es et fi fr hu it lt lv nl pl pt ro sk sl sv'.split(): l2 = 'en' src = f'europarl-v7.{l1}-{l2}.{l1}' ref = f'europarl-v7.{l1}-{l2}.{l2}' index.add_entry(Entry( did=DatasetId(group=group_id, name='europarl', version='7', langs=(l1, l2)), in_paths=[src, ref], url=EUROPARL_v7 % (l1, l2), in_ext='txt', cite=cite)) # === Digital Corpus of European Parliament index.add_entry(Entry(did=DatasetId(group=group_id, name='dcep_wmt17', version='1', langs=(l1, l2)), in_paths=['*/*.lv', f'*/*.en'], cite=cite, in_ext='txt', url='http://data.statmt.org/wmt17/translation-task/dcep.lv-en.v1.tgz')) index.add_entry(Entry(did=DatasetId(group=group_id, name='books_wmt17', version='1', langs=(l1, l2)), in_paths=['*/*.lv', f'*/*.en'], cite=cite, in_ext='txt', url='http://data.statmt.org/wmt17/translation-task/books.lv-en.v1.tgz')) # === News Commentary v14 NEWSCOM_v14 = "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.%s-%s.tsv.gz" cite = index.ref_db.get_bibtex('bojar-etal-2018-findings') for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it', 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk', 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it', 'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es 
hi', 'es id', 'es it', 'es ja', 'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it', 'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id kk', 'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh']: l1, l2 = pair.split() index.add_entry(Entry( did=DatasetId(group=group_id, name='news_commentary', version='14', langs=(l1, l2)), url=NEWSCOM_v14 % (l1, l2), cite=cite)) for v in [15, 16]: cite = index.ref_db.get_bibtex('barrault-etal-2020-findings') url = f"http://data.statmt.org/news-commentary/v{v}/training/news-commentary-v{v}.%s-%s.tsv.gz" for pair in ['ar cs', 'ar de', 'ar en', 'ar es', 'ar fr', 'ar hi', 'ar id', 'ar it', 'ar ja', 'ar kk', 'ar nl', 'ar pt', 'ar ru', 'ar zh', 'cs de', 'cs en', 'cs es', 'cs fr', 'cs hi', 'cs id', 'cs it', 'cs ja', 'cs kk', 'cs nl', 'cs pt', 'cs ru', 'cs zh', 'de en', 'de es', 'de fr', 'de hi', 'de id', 'de it', 'de ja', 'de kk', 'de nl', 'de pt', 'de ru', 'de zh', 'en es', 'en fr', 'en hi', 'en id', 'en it', 'en ja', 'en kk', 'en nl', 'en pt', 'en ru', 'en zh', 'es fr', 'es hi', 'es id', 'es it', 'es ja', 'es kk', 'es nl', 'es pt', 'es ru', 'es zh', 'fr hi', 'fr id', 'fr it', 'fr ja', 'fr kk', 'fr nl', 'fr pt', 'fr ru', 'fr zh', 'hi id', 'hi it', 'hi nl', 'hi pt', 'hi ru', 'hi zh', 'id it', 'id ja', 'id kk', 'id nl', 'id pt', 'id ru', 'id zh', 'it kk', 'it nl', 'it pt', 'it ru', 'it zh', 'ja pt', 'ja ru', 'ja zh', 'kk nl', 'kk pt', 'kk ru', 'kk zh', 'nl pt', 'nl ru', 'nl zh', 'pt ru', 'pt zh', 'ru zh']: l1, l2 = pair.split() index.add_entry(Entry(did=DatasetId(group=group_id, name='news_commentary', version=f'{v}', langs=(l1, l2)), url=url % (l1, l2), cite=cite)) # ===== Wiki Titles V1 WIKI_TITLES_v1 = 'http://data.statmt.org/wikititles/v1/wikititles-v1.%s-%s.tsv.gz' cite = index.ref_db.get_bibtex('barrault-etal-2019-findings') for pair in ['cs en', 'cs pl', 'de en', 'es pt', 'fi en', 'gu en', 'hi ne', 'kk en', 'lt en', 'ru en', 'zh en']: l1, l2 = pair.split() index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='1', langs=(l1, l2)), url=WIKI_TITLES_v1 % (l1, l2), cite=cite)) # ===== Wiki Titles V2 WIKI_TITLES_v2 = 'http://data.statmt.org/wikititles/v2/wikititles-v2.%s-%s.tsv.gz' for pair in ['ca es', 'cs en', 'de en', 'de fr', 'es pt', 'iu en', 'ja en', 'pl en', 'ps en', 'ru en', 'ta en', 'zh en']: l1, l2 = pair.split() index.add_entry(Entry(did=DatasetId(group=group_id, name='wiki_titles', version='2', langs=(l1, l2)), url=WIKI_TITLES_v2 % (l1, l2), cite=cite)) WIKI_TITLES_v3 = 'http://data.statmt.org/wikititles/v3/wikititles-v3.{pair}.tsv' langs = 'bn-hi ca-es ca-pt ca-ro cs-en de-en de-fr es-pt es-ro ha-en ig-en is-en ja-en ps-en pt-ro ru-en xh-zu zh-en' for pair in langs.split(): l1, l2 = pair.split('-') url = WIKI_TITLES_v3.format(pair=pair) ent = Entry(did=DatasetId(group=group_id, name=f'wikititles', version='3', langs=(l1, l2)), url=url, cite=cite) index.add_entry(ent) # ==== WMT Dev and Tests wmt_sets = { 'newstest2014': [('de', 'en'), ('cs', 'en'), ('fr', 'en'), ('ru', 'en'), ('hi', 'en')], 'newsdev2015': [('fi', 'en'), ('en', 'fi')], 'newstest2015': [('fi', 'en'), ('en', 'cs'), ('cs', 'en'), ('en', 'ru'), ('en', 'de'), ('de', 'en'), ('ru', 'en'), ('en', 'fi')], 'newsdev2016': [('en', 'ro'), ('ro', 'en'), ('tr', 'en'), ('en', 'tr')], 'newstest2016': [('de', 'en'), ('en', 'de'), ('en', 'ro'), ('en', 
'fi'), ('ro', 'en'), ('ru', 'en'), ('fi', 'en'), ('en', 'ru'), ('tr', 'en'), ('cs', 'en'), ('en', 'tr'), ('en', 'cs')], 'newsdev2017': [('zh', 'en'), ('lv', 'en'), ('en', 'zh'), ('en', 'lv')], 'newstest2017': [('zh', 'en'), ('ru', 'en'), ('en', 'fi'), ('lv', 'en'), ('en', 'de'), ('de', 'en'), ('cs', 'en'), ('en', 'cs'), ('en', 'tr'), ('en', 'ru'), ('tr', 'en'), ('fi', 'en'), ('en', 'zh'), ('en', 'lv')], 'newsdev2018': [('et', 'en'), ('en', 'et')], 'newstest2018': [('ru', 'en'), ('zh', 'en'), ('et', 'en'), ('en', 'fi'), ('en', 'de'), ('de', 'en'), ('en', 'cs'), ('en', 'tr'), ('cs', 'en'), ('tr', 'en'), ('en', 'ru'), ('en', 'et'), ('fi', 'en'), ('en', 'zh')], 'newsdev2019': [('gu', 'en'), ('kk', 'en'), ('en', 'lt'), ('en', 'kk'), ('lt', 'en'), ('en', 'gu')], 'newstest2019': [('de', 'en'), ('de', 'fr'), ('kk', 'en'), ('en', 'de'), ('en', 'fi'), ('ru', 'en'), ('zh', 'en'), ('gu', 'en'), ('en', 'kk'), ('en', 'zh'), ('cs', 'de'), ('fi', 'en'), ('en', 'gu'), ('lt', 'en'), ('de', 'cs'), ('en', 'lt'), ('en', 'ru'), ('en', 'cs'), ('fr', 'de')], 'newsdev2020': [('iu', 'en'), ('en', 'ta'), ('ta', 'en'), ('pl', 'en'), ('en', 'iu'), ('en', 'ja'), ('ja', 'en'), ('en', 'pl')] } for set_name, pairs in wmt_sets.items(): sub_name, year = set_name[:-4], set_name[-4:] for l1, l2 in pairs: src = f'dev/{set_name}-{l1}{l2}-src.{l1}.sgm' ref = f'dev/{set_name}-{l1}{l2}-ref.{l2}.sgm' name = f'{sub_name}_{l1}{l2}' index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)), filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='sgm', url='http://data.statmt.org/wmt20/translation-task/dev.tgz', cite=cite)) # Multi parallel wmt_sets = { '2009': ['en', 'cs', 'de', 'es', 'fr'], '2010': ['en', 'cs', 'de', 'es', 'fr'], '2011': ['en', 'cs', 'de', 'es', 'fr'], '2012': ['en', 'cs', 'de', 'es', 'fr', 'ru'], '2013': ['en', 'cs', 'de', 'es', 'fr', 'ru'], } for year, langs in wmt_sets.items(): name = 'newstest' for l1, l2 in itertools.combinations(langs, 2): f1 = f'dev/{name}{year}.{l1}' f2 = f'dev/{name}{year}.{l2}' index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version=year, langs=(l1, l2)), filename='wmt20dev.tgz', in_paths=[f1, f2], in_ext='txt', cite=cite, url='http://data.statmt.org/wmt20/translation-task/dev.tgz')) for l1, l2 in [('ps', 'en'), ('km', 'en')]: for set_name in ['wikipedia.dev', 'wikipedia.devtest']: src = f'dev/{set_name}.{l1}-{l2}.{l1}' ref = f'dev/{set_name}.{l1}-{l2}.{l2}' name = f'{set_name.replace(".", "_")}_{l1}{l2}' index.add_entry(Entry(did=DatasetId(group=group_id, name=name, version='1', langs=(l1, l2)), filename='wmt20dev.tgz', in_paths=[src, ref], in_ext='txt', cite=cite, url='http://data.statmt.org/wmt20/translation-task/dev.tgz')) #### WMT 20 Tests url = "http://data.statmt.org/wmt20/translation-task/test.tgz" wmt20_cite = index.ref_db.get_bibtex('barrault-etal-2020-findings') for _pref, pairs in { "": ["csen", "deen", "defr", "encs", "ende", "eniu", "enja", "enkm", "enpl", "enps", "enru", "enta", "enzh", "frde", "iuen", "jaen", "kmen", "plen", "psen", "ruen", "taen", "zhen"], "B": ["deen", "ende", "enzh", "ruen", "zhen"]}.items(): year = "2020" name = f'newstest{_pref}' for pair in pairs: l1, l2 = pair[:2], pair[2:] f1 = f'sgm/{name}{year}-{pair}-src.{l1}.sgm' f2 = f'sgm/{name}{year}-{pair}-ref.{l2}.sgm' index.add_entry(Entry(did=DatasetId(group=group_id, name=f'{name}_{pair}'.lower(), version=year, langs=(l1, l2)), filename='wmt20tests.tgz', in_paths=[f1, f2], in_ext='sgm', cite=wmt20_cite, url=url)) # WMT 21 Dev url = 
"http://data.statmt.org/wmt21/translation-task/dev.tgz" pairs = "en-ha en-is is-en ha-en".split() for pair in pairs: l1, l2 = pair.split('-') in_path = f'dev/xml/newsdev2021.{l1}-{l2}.xml' ent = Entry(did=DatasetId(group=group_id, name=f'newsdev_{l1}{l2}', version='2021', langs=(l1, l2)), filename='wmt21dev.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url) index.add_entry(ent) url = "http://data.statmt.org/wmt21/translation-task/test.tgz" pairs = 'bn-hi hi-bn xh-zu zu-xh cs-en de-en de-fr en-cs en-de en-ha en-is en-ja en-ru en-zh fr-de ha-en is-en ja-en ru-en zh-en'.split() for pair in pairs: l1, l2 = pair.split('-') name = 'newstest' if pair in 'bn-hi hi-bn xh-zu zu-xh': name = 'florestest' in_path = f'test/{name}2021.{l1}-{l2}.xml' ent = Entry(did=DatasetId(group=group_id, name=f'{name}_{l1}{l2}', version='2021', langs=(l1, l2)), filename='wmt21tests.tgz', in_paths=[in_path], in_ext='wmt21xml', cite=wmt20_cite, url=url) index.add_entry(ent) # ==== TED Talks 2.0 ar-en index.add_entry(Entry(did=DatasetId(group=group_id, name='tedtalks', version='2_clean', langs=('en', 'ar')), ext='tsv.xz', url='http://data.statmt.org/ted-talks/en-ar.v2.aligned.clean.xz')) # ==== Europarl v10 EP_v10 = "http://www.statmt.org/europarl/v10/training/europarl-v10.%s-%s.tsv.gz" for pair in ['cs en', 'cs pl', 'de en', 'de fr', 'es pt', 'fi en', 'fr en', 'lt en', 'pl en']: l1, l2 = pair.split() index.add_entry(Entry(did=DatasetId(group=group_id, name=f'europarl', version='10', langs=(l1, l2)), url=EP_v10 % (l1, l2), cite=wmt20_cite)) # ==== PMIndia V1 PMINDIA_v1 = "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.%s-%s.tsv" cite = index.ref_db.get_bibtex('Haddow-etal-2020-PMIndia') for pair in ["as en", "bn en", "gu en", "hi en", "kn en", "ml en", "mni en", "mr en", "or en", "pa en", "ta en", "te en", "ur en"]: l1, l2 = pair.split() # Note: listed as xx-en in URL but actually en-xx in the tsv; and its not compressed! 
index.add_entry(Entry(did=DatasetId(group=group_id, name=f'pmindia', version='1', langs=(l2, l1)), url=PMINDIA_v1 % (l1, l2), cite=cite)) # Pashto - English pseudo parallel dataset for alignment index.add_entry(Entry(did=DatasetId(group=group_id, name=f'wmt20_enps_aligntask', version='1', langs=('en', 'ps')), url='http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz', cite=wmt20_cite, ext='tsv.xz')) # Pashto - English mostly parallel dataset for name in ["GNOME.en-ps", "KDE4.en-ps", "Tatoeba.en-ps", "Ubuntu.en-ps", "bible.en-ps.clean", "ted-wmt20.en-ps", "wikimedia.en-ps"]: ps = f'ps-parallel/{name}.ps' en = f'ps-parallel/{name}.en' url = 'http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz' name = name.replace('.en-ps', '').replace('.', '_').replace('-', '_').lower() entry = Entry(did=DatasetId(group=group_id, name=name, version='1', langs=('ps', 'en')), url=url, cite=wmt20_cite, in_paths=[ps, en], filename='wmt20-psen-parallel.tgz', in_ext='txt') index.add_entry(entry) for l2 in ['ps', 'km']: url = f"http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-{l2}.xz" entry = Entry(did=DatasetId(group=group_id, name='paracrawl', version='5.1', langs=('en', l2)), url=url, cite=wmt20_cite, ext='tsv.xz', cols=(0, 1)) index.add_entry(entry) # for ja-en only TED was available index.add_entry(Entry(url="http://data.statmt.org/wmt20/translation-task/ja-en/ted.en-ja.tgz", did=DatasetId(group=group_id, name='ted', version='wmt20', langs=('en', 'ja')), cite=wmt20_cite, ext='tgz', in_ext='txt', in_paths=['en-ja/train.tags.en-ja.en', 'en-ja/train.tags.en-ja.ja'])) ccalign_cite = index.ref_db.get_bibtex('chaudhary-EtAl:2019:WMT') CC_ALIGNED = 'http://www.statmt.org/cc-aligned/sentence-aligned/{src}-{tgt}.tsv.xz' tgts='es_XX et_EE fa_IR ff_NG fi_FI fr_XX gu_IN ha_NG he_IL hi_IN hr_HR ht_HT hu_HU hy_AM id_ID ig_NG is_IS it_IT ja_XX jv_ID ka_GE kg_AO kk_KZ km_KH kn_IN ko_KR ku_TR ky_KG lg_UG ln_CD lo_LA lt_LT lv_LV mg_MG mi_NZ mk_MK ml_IN mn_MN mr_IN ms_MY mt_MT my_MM ne_NP nl_XX no_XX ns_ZA ny_MW om_KE or_IN pa_IN pl_PL ps_AF pt_XX qa_MM qd_MM ro_RO ru_RU si_LK sk_SK sl_SI sn_ZW so_SO sq_AL sr_RS ss_SZ st_ZA su_ID sv_SE sw_KE sz_PL ta_IN te_IN tg_TJ th_TH ti_ET tl_XX tn_BW tr_TR ts_ZA tz_MA uk_UA ur_PK ve_ZA vi_VN wo_SN xh_ZA yo_NG zh_CN zh_TW zu_ZA zz_TR'.split() srcs = 'af_ZA ak_GH am_ET ar_AR as_IN ay_BO az_AZ az_IR be_BY bg_BG bm_ML bn_IN br_FR bs_BA ca_ES cb_IQ cs_CZ cx_PH cy_GB da_DK de_DE el_GR'.split() pairs = [('en_XX', tgt) for tgt in tgts] + [(src, 'en_XX') for src in srcs] dont_know = {'qa', 'qd'} # looks like some Myanmar languages, but not sure which one. # Cant find them in ISO 639-1: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes # and lingo http://www.lingoes.net/en/translator/langcode.htm # and web-info https://wp-info.org/tools/languagecodes.php #unsupported = {'zh_TW', 'az_IR'} # country locales are not supported; they create conflicts. 
keeping large ones instead for src, tgt in pairs: # l1, l2 = src.split('_')[0], tgt.split('_')[0] if src[:2] in dont_know or tgt[:2] in dont_know: # I dont know what language these are continue url = CC_ALIGNED.format(src=src, tgt=tgt) entry = Entry(did=DatasetId(group=group_id, name='ccaligned', version='1', langs=(src, tgt)), url=url, cite=ccalign_cite, ext='tsv.xz', cols=(0, 1)) index.add_entry(entry) wmt21_cite = 'WMT21' # unavailable at the time of adding index.add_entry(Entry( did=DatasetId(group=group_id, name=f'khamenei', version='wmt21', langs=('ha','en')), cite=wmt21_cite, url='http://data.statmt.org/wmt21/translation-task/ha-en/khamenei.v1.ha-en.tsv', ext='tsv', cols=(2, 3))) index.add_entry(Entry( did=DatasetId(group=group_id, name=f'opus', version='wmt21', langs=('ha', 'en')), cite=wmt21_cite, url='http://data.statmt.org/wmt21/translation-task/ha-en/opus.ha-en.tsv', ext='tsv', cols=(1, 0))) index.add_entry(Entry( did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ha')), cite=wmt21_cite, url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ha.bifixed.dedup.laser.filter-0.9.xz', ext='tsv.xz', cols=[1, 2])) index.add_entry(Entry( did=DatasetId(group=group_id, name=f'paracrawl', version='8.wmt21', langs=('en', 'ru')), cite=wmt21_cite, url='http://data.statmt.org/wmt21/translation-task/paracrawl8/paracrawl-release8.en-ru.bifixed.dedup.filter-1.1.xz', ext='tsv.xz', cols=[0, 1])) for pair in ['bn-hi', 'xh-zu']: l1, l2 = pair.split('-') url = f'http://data.statmt.org/wmt21/translation-task/cc-aligned/{pair}.tsv.xz' index.add_entry(Entry( did=DatasetId(group=group_id, name=f'ccaligned', version='wmt21', langs=(l1, l2)), cite=wmt21_cite, url='http://data.statmt.org/wmt21/translation-task/ha-en/opus.ha-en.tsv', ext='tsv', cols=(1, 0))) # https://data.statmt.org/wmt19/translation-task/fr-de/bitexts/de-fr.bicleaner07.de.gz for cln_name, name in [('commoncrawl', ''), ('paracrawl', 'de-fr.bicleaner07'), ('europarl_v7', '')]: l1, l2 = 'fr', 'de' prefix = 'https://data.statmt.org/wmt19/translation-task/fr-de/bitexts' index.add_entry(Entry(did=DatasetId(group=group_id, name=cln_name or name, version='wmt19', langs=(l1, l2)), ext='txt.gz', url=(f'{prefix}/{name}.{l1}.gz', f'{prefix}/{name}.{l2}.gz'))) # Back Translation prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/zh-en' index.add_entry(Entry( did=DatasetId(group=group_id, name='backtrans_enzh', version='wmt20', langs=('en', 'zh')), ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.translatedto.zh.gz'))) prefix = 'https://data.statmt.org/wmt20/translation-task/back-translation/ru-en' index.add_entry(Entry( did=DatasetId(group=group_id, name='backtrans_enru', version='wmt20', langs=('en', 'ru')), ext='txt.gz', url=(f'{prefix}/news.en.gz', f'{prefix}/news.en.translatedto.ru.gz'))) index.add_entry(Entry( did=DatasetId(group=group_id, name='backtrans_ruen', version='wmt20', langs=('ru', 'en')), ext='txt.gz', url=(f'{prefix}/news.ru.gz', f'{prefix}/news.ru.translatedto.en.gz')))
def load_all(index: Index): # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/ cite = index.ref_db.get_bibtex('Kunchukuttan-etal-iitb') l1, l2 = 'hi', 'en' for version, prefix in [ # ('1.0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'), ('1.5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download')]: # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/ # version is not explicit, but guessed from file modification time and description url = prefix + "/parallel.tgz" ent = Entry(did=DatasetId(group='IITB', name=f'hien_train', version=version, langs=(l1, l2)), url=url, filename=f'IITB{version}-hin_eng-parallel.tar.gz', in_ext='txt', cite=cite, in_paths=[f'parallel/IITB.en-hi.{l1}', f'parallel/IITB.en-hi.{l2}']) index.add_entry(ent) url = prefix + "/dev_test.tgz" for split in ['dev', 'test']: f1 = f'dev_test/{split}.{l1}' f2 = f'dev_test/{split}.{l2}' ent = Entry(did=DatasetId(group='IITB', name=f'hien_{split}', version=version, langs=(l1, l2)), url=url, filename=f'IITB{version}-hin_eng-dev_test.tar.gz', in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent) # == Japanese == cite = index.ref_db.get_bibtex('neubig11kftt') url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" l1, l2 = 'en', 'ja' for split in ['train', 'test', 'dev', 'tune']: f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}' f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}' ent = Entry(did=DatasetId(group='Phontron', name=f'kftt_{split}', version='1', langs=(l1, l2)), url=url, filename="kftt-data-1.0.tar.gz", in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent) url = "http://lotus.kuee.kyoto-u.ac.jp/WAT/my-en-data/wat2020.my-en.zip" cite = index.ref_db.get_bibtex('ding2020a') for split in ['dev', 'test', 'train']: ent = Entry(did=DatasetId(group='WAT', name=f'alt_{split}', version='2020', langs=('my', 'en')), url=url, in_ext='txt', cite=cite, filename='wat2020.my-en.zip', in_paths=[f'wat2020.my-en/alt/{split}.alt.my', f'wat2020.my-en/alt/{split}.alt.en']) index.add_entry(ent) l1, l2 = 'iu', 'en' url = "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" cite = index.ref_db.get_bibtex('joanis-etal-2020-nunavut') for split in ['dev', 'devtest', 'test', 'train']: path_pref = f'Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/split/{split}' if split != 'train': path_pref += '-dedup' ent = Entry(did=DatasetId(group='NRC_CA', name=f'nunavut_hansard_{split}', version='3', langs=(l1, l2)), url=url, in_ext='txt', cite=cite, filename='NunavutHansard_iuen_v3.tgz', in_paths=[f'{path_pref}.{l1}', f'{path_pref}.{l2}']) index.add_entry(ent) # https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-2122 url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-2122/khresmoi-summary-test-set-2.0.zip" cite = index.ref_db.get_bibtex('Khresmoi') langs = ["cs", "de", "en", "es", "fr", "hu", "pl", "sv"] for i, l1 in enumerate(langs): for l2 in langs[i + 1:]: ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_test', version='2', langs=(l1, l2)), url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, in_ext='txt', in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l1}", f"khresmoi-summary-test-set-2.0/khresmoi-summary-test.{l2}"]) index.add_entry(ent) ent = Entry(did=DatasetId(group='Lindat', name=f'khresmoi_summary_dev', version='2', langs=(l1, l2)), url=url, filename='khresmoi-summary-test-set-2.0.zip', cite=cite, 
in_ext='txt', in_paths=[f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l1}", f"khresmoi-summary-test-set-2.0/khresmoi-summary-dev.{l2}"]) index.add_entry(ent) jesc_cite = index.ref_db.get_bibtex('pryzant_jesc_2018') for split in ['train', 'dev', 'test']: ent = Entry(url='https://nlp.stanford.edu/projects/jesc/data/split.tar.gz', did=DatasetId(group='StanfordNLP', name=f'jesc_{split}', version='1', langs=('en', 'ja')), filename='jesc-split.tar.gz', in_ext='tsv', in_paths=[f"split/{split}"], cite=jesc_cite) index.add_entry(ent) prefix = 'https://nlp.stanford.edu/projects/nmt/data' for name, subdir, src, tgt, cite_key in [ ("wmt15_train", "wmt15.en-cs", "train.en", "train.cs", "luong2016acl_hybrid"), ("newstest2013", "wmt15.en-cs", "newstest2013.en", "newstest2013.cs", "luong2016acl_hybrid"), ("newstest2014", "wmt15.en-cs", "newstest2014.en", "newstest2014.cs", "luong2016acl_hybrid"), ("newstest2015", "wmt15.en-cs", "newstest2015.en", "newstest2015.cs", "luong2016acl_hybrid"), ("wmt14_train", "wmt14.en-de", "train.en", "train.de", "luong-pham-manning:2015:EMNLP"), ("newstest2012", "wmt14.en-de", "newstest2012.en", "newstest2012.de", "luong-pham-manning:2015:EMNLP"), ("newstest2013", "wmt14.en-de", "newstest2013.en", "newstest2013.de", "luong-pham-manning:2015:EMNLP"), ("newstest2014", "wmt14.en-de", "newstest2014.en", "newstest2014.de", "luong-pham-manning:2015:EMNLP"), ("newstest2015", "wmt14.en-de", "newstest2015.en", "newstest2015.de", "luong-pham-manning:2015:EMNLP"), ("iwslt15_train", "iwslt15.en-vi", "train.en", "train.vi", "Luong-Manning:iwslt15"), ("test2012", "iwslt15.en-vi", "tst2012.en", "tst2012.vi", "Luong-Manning:iwslt15"), ("test2013", "iwslt15.en-vi", "tst2013.en", "tst2013.vi", "Luong-Manning:iwslt15")]: l1, l2 = src.split(".")[-1], tgt.split(".")[-1] url1 = f"{prefix}/{subdir}/{src}" url2 = f"{prefix}/{subdir}/{tgt}" cite = index.ref_db.get_bibtex(cite_key) ent = Entry(did=DatasetId(group='StanfordNLP', name=name, version='1', langs=(l1, l2)), ext='txt', url=(url1, url2), cite=cite) index.add_entry(ent) _url = 'https://repository.clarin.is/repository/xmlui/bitstream/handle/20.500.12537/24/Parice_dev_test.20.05.zip' cite = index.ref_db.get_bibtex('Barkarson-et-al-2020') for sub in ['eea train dev test', 'ema train dev test', 'opensubtitles dev test']: l1, l2 = 'en', 'is' sub, *splits = sub.split() for split in splits: in_paths = [f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l1}.csv', f'Parice_dev_test.20.05/csv/{sub}/{sub}_{split}_{l2}.csv'] if split == 'train' and sub == 'eea': in_paths = [in_paths[1], in_paths[0]] # aha! 
                # they have swapped it
            ent = Entry(did=DatasetId(group='ParIce', name=f'{sub}_{split}', version='20.05', langs=(l1, l2)),
                        url=_url, ext='zip', in_ext='txt', in_paths=in_paths, cite=cite,
                        filename='Parice_dev_test.20.05.zip')
            index.add_entry(ent)

    # https://github.com/bonaventuredossou/ffr-v1/tree/master/FFR-Dataset/FFR%20Dataset%20v2
    _url = 'https://raw.githubusercontent.com/bonaventuredossou/ffr-v1/master/FFR-Dataset/FFR%20Dataset%20v2/ffr_dataset_v2.txt'
    cite = index.ref_db.get_bibtex("emezue-dossou-2020-ffr")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'ffr', version='2', langs=('fon', 'fra')),
                url=_url, ext='tsv', cite=cite)
    index.add_entry(ent)

    # https://zenodo.org/record/4432712
    _url = 'https://zenodo.org/record/4432712/files/Fon_French_Parallel_Data_25377.csv?download=1'
    cite = index.ref_db.get_bibtex("dossou2021crowdsourced")
    ent = Entry(did=DatasetId(group='Masakhane', name=f'daily_dialogues', version='1', langs=('fon', 'fra')),
                url=_url, ext='csvwithheader', cite=cite)
    index.add_entry(ent)
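

# A minimal sketch, not part of the index: the 'daily_dialogues' entry above is tagged
# ext='csvwithheader'. Assuming the sentence pair sits in the first two columns (a guess for
# illustration, not something the entry specifies), a plain csv read looks like this.
def read_headered_csv_pairs(path, limit=5):
    """Yield up to `limit` (col0, col1) pairs from a CSV file, skipping its header row."""
    import csv
    with open(path, newline='', encoding='utf-8', errors='ignore') as f:
        reader = csv.reader(f)
        next(reader, None)  # drop the header row
        for i, row in enumerate(reader):
            if i >= limit:
                break
            if len(row) >= 2:
                yield row[0], row[1]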
def load_all(index: Index): group = 'AI4Bharath' cite = index.ref_db.get_bibtex('ramesh2021samanantar') pairs = ( 'en-as en-bn en-gu en-hi en-kn en-ml en-mr en-or en-pa en-ta en-te as-bn as-gu as-hi' ' as-kn as-ml as-mr as-or as-pa as-ta as-te bn-gu bn-hi bn-kn bn-ml bn-mr bn-or bn-pa' ' bn-ta bn-te gu-hi gu-kn gu-ml gu-mr gu-or gu-pa gu-ta gu-te hi-kn hi-ml hi-mr hi-or' ' hi-pa hi-ta hi-te kn-ml kn-mr kn-or kn-pa kn-ta kn-te ml-mr ml-or ml-pa ml-ta ml-te' ' mr-or mr-pa mr-ta mr-te or-pa or-ta or-te pa-ta pa-te ta-te') BASE_v0_2 = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip' for pair in pairs.strip().split(' '): l1, l2 = pair.split('-') dirname = 'en2indic' if l1 == 'en' else 'indic2indic' url = BASE_v0_2.format(dirname=dirname, pair=pair) ent = Entry(did=DatasetId(group=group, name=f'samananthar', version='0.2', langs=(l1, l2)), url=url, cite=cite, in_paths=[f'{pair}/train.{l1}', f'{pair}/train.{l2}'], in_ext='txt') index.add_entry(ent) URL = "https://storage.googleapis.com/samanantar-public/benchmarks.zip" filename = "samananthar-benchmarks.zip" for split in ('dev', 'test'): want20_langs = 'bn gu hi ml mr ta te'.split() for l2 in want20_langs: f1 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.en' f2 = f'benchmarks/wat2020-devtest/en-{l2}/{split}.{l2}' ent = Entry(did=DatasetId(group=group, name=f'wat_{split}', version='2020', langs=('en', l2)), filename=filename, url=URL, cite=cite, in_paths=[f1, f2], in_ext='txt') index.add_entry(ent) wat21_langs = 'bn en gu hi kn ml mr or pa ta te'.split() for i, l1 in enumerate(wat21_langs): for l2 in wat21_langs[i + 1:]: f1 = f'benchmarks/wat2021-devtest/{split}.{l1}' f2 = f'benchmarks/wat2021-devtest/{split}.{l2}' ent = Entry(did=DatasetId(group=group, name=f'wat_{split}', version='2021', langs=(l1, l2)), filename=filename, url=URL, cite=cite, in_paths=[f1, f2], in_ext='txt') index.add_entry(ent) # PMI langs; en-as index.add_entry( Entry(did=DatasetId(group=group, name=f'pmi_{split}', version='2021', langs=('en', 'as')), filename=filename, url=URL, cite=cite, in_ext='txt', in_paths=[ f'benchmarks/pmi/en-as/{split}.en', f'benchmarks/pmi/en-as/{split}.as' ]))
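

# A minimal sketch, not part of the index: how the Samanantar v0.2 pair list above expands to
# download URLs. Pairs involving English go under en2indic/ and the rest under indic2indic/,
# following the BASE_v0_2 pattern used in load_all(); the example pairs are arbitrary.
def samanantar_urls(pairs=('en-hi', 'bn-ta')):
    base = 'https://storage.googleapis.com/samanantar-public/V0.2/data/{dirname}/{pair}.zip'
    for pair in pairs:
        l1, _ = pair.split('-')
        dirname = 'en2indic' if l1 == 'en' else 'indic2indic'
        yield base.format(dirname=dirname, pair=pair)
# list(samanantar_urls()) -> one zip URL per pair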
def load_all(index: Index): # === IITB hin eng http://www.cfilt.iitb.ac.in/iitb_parallel/ cite = """@article{DBLP:journals/corr/abs-1710-02855, author = {Anoop Kunchukuttan and Pratik Mehta and Pushpak Bhattacharyya}, title = {The {IIT} Bombay English-Hindi Parallel Corpus}, journal = {CoRR}, volume = {abs/1710.02855}, year = {2017}, url = {http://arxiv.org/abs/1710.02855}, archivePrefix = {arXiv}, eprint = {1710.02855}, timestamp = {Mon, 13 Aug 2018 16:48:50 +0200}, biburl = {https://dblp.org/rec/journals/corr/abs-1710-02855.bib}, bibsource = {dblp computer science bibliography, https://dblp.org} }""" l1, l2 = 'hi', 'en' for version, prefix in [ #('v1_0', 'http://www.cfilt.iitb.ac.in/iitb_parallel/iitb_corpus_download'), ('v1_5', 'http://www.cfilt.iitb.ac.in/~moses/iitb_en_hi_parallel/iitb_corpus_download' ) ]: # they also have v2, but the link is broken http://www.cfilt.iitb.ac.in/iitb_parallel/ # version is not explicit, but guessed from file modification time and description url = prefix + "/parallel.tgz" ent = Entry(langs=(l1, l2), url=url, filename=f'IITB{version}-hin_eng-parallel.tar.gz', name=f'IITB{version}_train', in_ext='txt', cite=cite, in_paths=[ f'parallel/IITB.en-hi.{l1}', f'parallel/IITB.en-hi.{l2}' ]) index.add_entry(ent) url = prefix + "/dev_test.tgz" for split in ['dev', 'test']: f1 = f'dev_test/{split}.{l1}' f2 = f'dev_test/{split}.{l2}' ent = Entry(langs=(l1, l2), url=url, filename=f'IITB{version}-hin_eng-dev_test.tar.gz', name=f'IITB{version}_{split}', in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent) # == Japanese == cite = """@misc{neubig11kftt, author = {Graham Neubig}, title = {The {Kyoto} Free Translation Task}, howpublished = {http://www.phontron.com/kftt}, year = {2011} }""" url = "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" l1, l2 = 'en', 'ja' for split in ['train', 'test', 'dev', 'tune']: f1 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l1}' f2 = f'kftt-data-1.0/data/orig/kyoto-{split}.{l2}' ent = Entry(langs=(l1, l2), url=url, filename="kftt-data-1.0.tar.gz", name=f'kftt_v1_{split}', in_ext='txt', in_paths=[f1, f2], cite=cite) index.add_entry(ent)
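

# A minimal sketch, not part of the index: entries with in_paths (such as the IITB and KFTT ones
# above) refer to members inside a downloaded tar archive. Reading one member with the standard
# tarfile module looks roughly like this; the local archive name is a placeholder.
def read_tar_member(archive_path, member):
    """Return the decoded text of one member of a .tar.gz archive (None if it is not a regular file)."""
    import tarfile
    with tarfile.open(archive_path, 'r:gz') as tar:
        f = tar.extractfile(member)
        return f.read().decode('utf-8', errors='ignore') if f else None
# e.g. read_tar_member('IITBv1_5-hin_eng-parallel.tar.gz', 'parallel/IITB.en-hi.hi')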
def load_all(index: Index): data="""an-ca an-de an-en an-es an-fr an-gl an-it an-pl an-pt an-ru ar-arz ar-az ar-ba ar-be ar-bg ar-bn ar-br ar-bs ar-ca ar-ceb ar-cs ar-da ar-de ar-el ar-en ar-eo ar-es ar-et ar-eu ar-fa ar-fi ar-fr ar-gl ar-he ar-hi ar-hr ar-hu ar-id ar-is ar-it ar-ja ar-kk ar-ko ar-lt ar-mk ar-ml ar-mr ar-nds ar-ne ar-nl ar-no ar-pl ar-pt ar-ro ar-ru ar-sh ar-si ar-sk ar-sl ar-sq ar-sr ar-sv ar-sw ar-ta ar-te ar-tl ar-tr ar-tt ar-uk ar-vi arz-de arz-en arz-es arz-fr ar-zh arz-it arz-pt arz-ru as-de as-es as-fr as-it azb-fr az-bg az-ca az-cs az-da az-de az-el az-en az-es az-et az-fa az-fi az-fr az-gl az-he az-hr az-hu az-id az-it az-ja az-ko az-nl az-no az-pl az-pt az-ro az-ru az-sr az-sv az-ta az-tr az-uk az-vi az-zh ba-bg ba-ca ba-cs ba-da ba-de ba-el ba-en ba-es ba-fi ba-fr ba-gl ba-hr ba-hu ba-id ba-it ba-ja ba-nl ba-no ba-pl ba-pt bar-de bar-en bar-es bar-fr bar-it ba-ro bar-pt bar-ru ba-ru ba-sh ba-sk ba-sl ba-sr ba-sv ba-tr ba-uk ba-zh be-bg be-ca be-cs be-de be-en be-es be-fi be-fr be-he be-hu be-it be-ja be-nl be-no be-pl be-pt be-ro be-ru be-sr be-sv be-uk bg-bn bg-bs bg-ca bg-ceb bg-cs bg-da bg-de bg-el bg-en bg-eo bg-es bg-et bg-eu bg-fa bg-fi bg-fr bg-gl bg-he bg-hi bg-hr bg-hu bg-id bg-is bg-it bg-ja bg-kk bg-ko bg-lt bg-mk bg-ml bg-mr bg-nds bg-ne bg-nl bg-no bg-pl bg-pt bg-ro bg-ru bg-sh bg-si bg-sk bg-sl bg-sq bg-sr bg-sv bg-sw bg-ta bg-te bg-tl bg-tr bg-tt bg-uk bg-vi bg-zh bn-bs bn-ca bn-cs bn-da bn-de bn-el bn-en bn-eo bn-es bn-et bn-eu bn-fa bn-fi bn-fr bn-gl bn-he bn-hi bn-hr bn-hu bn-id bn-it bn-ja bn-ko bn-lt bn-mk bn-nl bn-no bn-pl bn-pt bn-ro bn-ru bn-sh bn-sk bn-sl bn-sq bn-sr bn-sv bn-ta bn-tr bn-uk bn-vi bn-zh br-de br-en br-es br-fr br-it br-pt br-ru br-uk bs-ca bs-cs bs-da bs-de bs-el bs-en bs-eo bs-es bs-et bs-eu bs-fa bs-fi bs-fr bs-gl bs-he bs-hi bs-hr bs-hu bs-id bs-is bs-it bs-ja bs-ko bs-lt bs-mk bs-ml bs-mr bs-nl bs-no bs-pl bs-pt bs-ro bs-ru bs-sh bs-si bs-sk bs-sl bs-sq bs-sr bs-sv bs-ta bs-te bs-tl bs-tr bs-uk bs-vi bs-zh ca-ceb ca-cs ca-da ca-de ca-el ca-en ca-eo ca-es ca-et ca-eu ca-fa ca-fi ca-fo ca-fr ca-fy ca-gl ca-he ca-hi ca-hr ca-hu ca-id ca-is ca-it ca-ja ca-ka ca-kk ca-ko ca-la ca-lb ca-lt ca-mk ca-ml ca-mr ca-nds ca-ne ca-nl ca-no ca-oc ca-pl ca-pt ca-ro ca-ru ca-sh ca-si ca-sk ca-sl ca-sq ca-sr ca-sv ca-sw ca-ta ca-te ca-tl ca-tr ca-tt ca-uk ca-vi ca-zh ceb-cs ceb-de ceb-en ceb-es ceb-fi ceb-fr ceb-hu ceb-it ceb-ja ceb-nl ceb-no ceb-pl ceb-pt ceb-ro ceb-ru ceb-sv ceb-uk cs-da cs-de cs-el cs-en cs-eo cs-es cs-et cs-eu cs-fa cs-fi cs-fr cs-fy cs-gl cs-he cs-hi cs-hr cs-hu cs-id cs-is cs-it cs-ja cs-ka cs-kk cs-ko cs-la cs-lt cs-mk cs-ml cs-mr cs-nds cs-ne cs-nl cs-no cs-oc cs-pl cs-pt cs-ro cs-ru cs-sh cs-si cs-sk cs-sl cs-sq cs-sr cs-sv cs-sw cs-ta cs-te cs-tl cs-tr cs-tt cs-uk cs-vi cs-zh da-de da-el da-en da-eo da-es da-et da-eu da-fa da-fi da-fo da-fr da-gl da-he da-hi da-hr da-hu da-id da-is da-it da-ja da-ko da-lt da-mk da-ml da-mr da-nds da-ne da-nl da-no da-pl da-pt da-ro da-ru da-sh da-si da-sk da-sl da-sq da-sr da-sv da-sw da-ta da-te da-tl da-tr da-tt da-uk da-vi da-zh de-el de-en de-eo de-es de-et de-eu de-fa de-fi de-fo de-fr de-fy de-gl de-gom de-he de-hi de-hr de-hu de-hy de-id de-is de-it de-ja de-ka de-kk de-ko de-la de-lb de-lt de-mk de-ml de-mr de-nds de-ne de-nl de-no de-oc de-pl de-pt de-rm de-ro de-ru de-sh de-si de-sk de-sl de-sq de-sr de-sv de-sw de-ta de-te de-tg de-tl de-tr de-tt de-uk de-vi de-wuu de-zh el-en el-eo el-es el-et el-eu el-fa el-fi el-fr el-gl el-he el-hi el-hr el-hu el-id el-is el-it 
el-ja el-ko el-lt el-mk el-ml el-mr el-nl el-no el-pl el-pt el-ro el-ru el-sh el-si el-sk el-sl el-sq el-sr el-sv el-sw el-ta el-te el-tl el-tr el-uk el-vi el-zh en-eo en-es en-et en-eu en-fa en-fi en-fo en-fr en-fy en-gl en-he en-hi en-hr en-hu en-id en-io en-is en-it en-ja en-jv en-ka en-kk en-ko en-la en-lb en-lmo en-lt en-mg en-mk en-ml en-mr en-mwl en-nds_nl en-nds en-ne en-nl en-no en-oc en-pl en-pt en-ro en-ru en-sh en-simple en-si en-sk en-sl en-sq en-sr en-sv en-sw en-ta en-te en-tg en-tl en-tr en-tt en-ug en-uk en-vi en-wuu en-zh eo-es eo-et eo-eu eo-fa eo-fi eo-fr eo-gl eo-he eo-hi eo-hr eo-hu eo-id eo-is eo-it eo-ja eo-ko eo-lt eo-mk eo-ml eo-mr eo-nds eo-nl eo-no eo-pl eo-pt eo-ro eo-ru eo-sh eo-si eo-sk eo-sl eo-sq eo-sr eo-sv eo-ta eo-te eo-tl eo-tr eo-uk eo-vi eo-zh es-et es-eu es-fa es-fi es-fo es-fr es-fy es-gl es-gom es-he es-hi es-hr es-hu es-hy es-id es-is es-it es-ja es-jv es-ka es-kk es-ko es-la es-lb es-lt es-mk es-ml es-mr es-nds es-ne es-nl es-no es-oc es-pl es-pt es-ro es-ru es-sh es-si es-sk es-sl es-sq es-sr es-sv es-sw es-ta es-te es-tl es-tr es-tt es-uk es-vi es-wuu es-zh et-eu et-fa et-fi et-fr et-gl et-he et-hi et-hr et-hu et-id et-is et-it et-ja et-ko et-lt et-mk et-ml et-mr et-nl et-no et-pl et-pt et-ro et-ru et-sh et-si et-sk et-sl et-sq et-sr et-sv et-ta et-te et-tl et-tr et-uk et-vi et-zh eu-fa eu-fi eu-fr eu-gl eu-he eu-hi eu-hr eu-hu eu-id eu-is eu-it eu-ja eu-ko eu-lt eu-mk eu-ml eu-mr eu-nl eu-no eu-pl eu-pt eu-ro eu-ru eu-sh eu-sk eu-sl eu-sq eu-sr eu-sv eu-ta eu-te eu-tr eu-uk eu-vi eu-zh fa-fi fa-fr fa-gl fa-he fa-hi fa-hr fa-hu fa-id fa-it fa-ja fa-ko fa-lt fa-mk fa-ml fa-mr fa-nl fa-no fa-pl fa-pt fa-ro fa-ru fa-sh fa-sk fa-sl fa-sq fa-sr fa-sv fa-ta fa-te fa-tr fa-uk fa-vi fa-zh fi-fr fi-gl fi-he fi-hi fi-hr fi-hu fi-id fi-is fi-it fi-ja fi-ko fi-lt fi-mk fi-ml fi-mr fi-nds fi-ne fi-nl fi-no fi-oc fi-pl fi-pt fi-ro fi-ru fi-sh fi-si fi-sk fi-sl fi-sq fi-sr fi-sv fi-sw fi-ta fi-te fi-tl fi-tr fi-tt fi-uk fi-vi fi-zh fo-fr fo-it fo-nl fo-pl fo-pt fo-ru fo-sv fr-fy fr-gl fr-gom fr-he fr-hi fr-hr fr-hu fr-hy fr-id fr-is fr-it fr-ja fr-jv fr-ka fr-kk fr-ko fr-la fr-lb fr-lt fr-mg fr-mk fr-ml fr-mr fr-nds fr-ne fr-nl fr-no fr-oc fr-pl fr-pt fr-ro fr-ru fr-sh fr-si fr-sk fr-sl fr-sq fr-sr fr-sv fr-sw fr-ta fr-te fr-tl fr-tr fr-tt fr-uk fr-vi fr-wuu fr-zh fy-it fy-nl fy-pl fy-pt fy-ru fy-sv gl-he gl-hi gl-hr gl-hu gl-id gl-is gl-it gl-ja gl-ko gl-lt gl-mk gl-ml gl-mr gl-nds gl-ne gl-nl gl-no gl-oc gl-pl gl-pt gl-ro gl-ru gl-sh gl-si gl-sk gl-sl gl-sq gl-sr gl-sv gl-ta gl-te gl-tl gl-tr gl-tt gl-uk gl-vi gl-zh gom-it gom-pt gom-ru he-hi he-hr he-hu he-id he-is he-it he-ja he-ko he-lt he-mk he-ml he-mr he-nl he-no he-pl he-pt he-ro he-ru he-sh he-si he-sk he-sl he-sq he-sr he-sv he-sw he-ta he-te he-tl he-tr he-uk he-vi he-zh hi-hr hi-hu hi-id hi-it hi-ja hi-ko hi-lt hi-mk hi-mr hi-ne hi-nl hi-no hi-pl hi-pt hi-ro hi-ru hi-sh hi-sk hi-sl hi-sq hi-sr hi-sv hi-ta hi-te hi-tr hi-uk hi-vi hi-zh hr-hu hr-id hr-is hr-it hr-ja hr-ko hr-lt hr-mk hr-ml hr-mr hr-ne hr-nl hr-no hr-pl hr-pt hr-ro hr-ru hr-sh hr-si hr-sk hr-sl hr-sq hr-sr hr-sv hr-ta hr-te hr-tl hr-tr hr-uk hr-vi hr-zh hu-id hu-is hu-it hu-ja hu-kk hu-ko hu-lt hu-mk hu-ml hu-mr hu-nds hu-ne hu-nl hu-no hu-oc hu-pl hu-pt hu-ro hu-ru hu-sh hu-si hu-sk hu-sl hu-sq hu-sr hu-sv hu-sw hu-ta hu-te hu-tl hu-tr hu-uk hu-vi hu-zh hy-it hy-pt hy-ru id-is id-it id-ja id-jv id-ko id-lt id-mk id-ml id-mr id-ne id-nl id-no id-pl id-pt id-ro id-ru id-sh id-si id-sk id-sl id-sq id-sr id-sv id-sw id-ta id-te id-tl 
id-tr id-tt id-uk id-vi id-zh is-it is-ja is-lt is-mk is-nl is-no is-pl is-pt is-ro is-ru is-sh is-sk is-sl is-sr is-sv is-tr is-uk is-vi is-zh it-ja it-jv it-ka it-kk it-ko it-la it-lb it-lmo it-lt it-mk it-ml it-mr it-nds it-ne it-nl it-no it-oc it-pl it-pt it-ro it-ru it-scn it-sh it-si it-sk it-sl it-sq it-sr it-sv it-sw it-ta it-te it-tl it-tr it-tt it-uk it-vi it-wuu it-zh ja-kk ja-ko ja-lt ja-mk ja-ml ja-mr ja-nds ja-nl ja-no ja-pl ja-pt ja-ro ja-ru ja-sh ja-si ja-sk ja-sl ja-sq ja-sr ja-sv ja-sw ja-ta ja-te ja-tl ja-tr ja-tt ja-uk ja-vi ja-zh jv-pt ka-nl ka-pl ka-pt ka-ru ka-sv kk-nl kk-no kk-pl kk-pt kk-ro kk-ru kk-sv kk-tr kk-uk ko-lt ko-mk ko-ml ko-mr ko-nl ko-no ko-pl ko-pt ko-ro ko-ru ko-sh ko-sk ko-sl ko-sq ko-sr ko-sv ko-ta ko-te ko-tr ko-uk ko-vi ko-zh la-nl la-pl la-pt la-ro la-ru la-sv lb-nl lb-pl lb-pt lb-ru lb-sv lt-mk lt-ml lt-mr lt-nl lt-no lt-pl lt-pt lt-ro lt-ru lt-sh lt-si lt-sk lt-sl lt-sq lt-sr lt-sv lt-ta lt-te lt-tl lt-tr lt-uk lt-vi lt-zh mk-ml mk-mr mk-nl mk-no mk-pl mk-pt mk-ro mk-ru mk-sh mk-si mk-sk mk-sl mk-sq mk-sr mk-sv mk-ta mk-te mk-tl mk-tr mk-uk mk-vi mk-zh ml-nl ml-no ml-pl ml-pt ml-ro ml-ru ml-sh ml-sk ml-sl ml-sq ml-sr ml-sv ml-tr ml-uk ml-vi ml-zh mr-nl mr-no mr-pl mr-pt mr-ro mr-ru mr-sh mr-sk mr-sl mr-sq mr-sr mr-sv mr-tr mr-uk mr-vi mr-zh mwl-pt nds_nl-nl nds-nl nds-no nds-pl nds-pt nds-ro nds-ru nds-sv nds-uk ne-nl ne-no ne-pl ne-pt ne-ro ne-ru ne-sh ne-sk ne-sl ne-sv ne-uk nl-no nl-oc nl-pl nl-pt nl-ro nl-ru nl-sh nl-si nl-sk nl-sl nl-sq nl-sr nl-sv nl-sw nl-ta nl-te nl-tl nl-tr nl-tt nl-uk nl-vi nl-zh no-pl no-pt no-ro no-ru no-sh no-si no-sk no-sl no-sq no-sr no-sv no-sw no-ta no-te no-tl no-tr no-tt no-uk no-vi no-zh oc-pl oc-pt oc-ro oc-ru oc-sv pl-pt pl-ro pl-ru pl-sh pl-si pl-sk pl-sl pl-sq pl-sr pl-sv pl-sw pl-ta pl-te pl-tl pl-tr pl-tt pl-uk pl-vi pl-zh pt-ro pt-ru pt-sh pt-si pt-sk pt-sl pt-sq pt-sr pt-sv pt-sw pt-ta pt-te pt-tl pt-tr pt-tt pt-uk pt-vi pt-wuu pt-zh ro-ru ro-sh ro-si ro-sk ro-sl ro-sq ro-sr ro-sv ro-sw ro-ta ro-te ro-tl ro-tr ro-tt ro-uk ro-vi ro-zh ru-sh ru-si ru-sk ru-sl ru-sq ru-sr ru-sv ru-sw ru-ta ru-te ru-tg ru-tl ru-tr ru-tt ru-uk ru-vi ru-wuu ru-zh sh-si sh-sk sh-sl sh-sq sh-sr sh-sv sh-ta sh-te sh-tl sh-tr sh-uk sh-vi sh-zh si-sk si-sl si-sq si-sr si-sv si-tr si-uk si-vi si-zh sk-sl sk-sq sk-sr sk-sv sk-ta sk-te sk-tl sk-tr sk-uk sk-vi sk-zh sl-sq sl-sr sl-sv sl-ta sl-te sl-tl sl-tr sl-uk sl-vi sl-zh sq-sr sq-sv sq-ta sq-te sq-tl sq-tr sq-uk sq-vi sq-zh sr-sv sr-ta sr-te sr-tl sr-tr sr-uk sr-vi sr-zh sv-sw sv-ta sv-te sv-tl sv-tr sv-tt sv-uk sv-vi sv-zh sw-tr sw-uk sw-vi sw-zh ta-tr ta-uk ta-vi ta-zh te-tr te-uk te-vi te-zh tl-tr tl-uk tl-vi tl-zh tr-tt tr-uk tr-vi tr-zh tt-uk tt-zh uk-vi uk-zh vi-zh wuu-zh""" cite = """@article{wikimatrix1, author = {Holger Schwenk and Vishrav Chaudhary and Shuo Sun and Hongyu Gong and Francisco Guzm{\'{a}}n}, title = {WikiMatrix: Mining 135M Parallel Sentences in 1620 Language Pairs from Wikipedia}, journal = {CoRR}, volume = {abs/1907.05791}, year = {2019}, url = {http://arxiv.org/abs/1907.05791}, archivePrefix = {arXiv}, eprint = {1907.05791}, timestamp = {Wed, 17 Jul 2019 10:27:36 +0200}, biburl = {https://dblp.org/rec/journals/corr/abs-1907-05791.bib}, bibsource = {dblp computer science bibliography, https://dblp.org}}""" url_pat = "https://dl.fbaipublicfiles.com/laser/WikiMatrix/v1/WikiMatrix.%s-%s.tsv.gz" mapping = dict(sh='hbs') skips = {'nds_nl', 'simple'} for pair in data.split(): l1, l2 = pair.split('-') if l1 in skips or l2 in skips: continue l1iso, l2iso = 
mapping.get(l1, l1), mapping.get(l2, l2)
        url = url_pat % (l1, l2)
        ent = Entry(langs=(l1iso, l2iso), url=url, name='WikiMatrix_v1', cols=(1, 2), cite=cite)
        index.add_entry(ent)
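

# A minimal sketch, not part of the index: the WikiMatrix entries above use cols=(1, 2), i.e. the
# sentence pair lives in the second and third TSV columns, with the first column holding the
# alignment (margin) score in the released files. The 1.04 cut-off below is only an illustrative
# value, not something this index applies.
def read_wikimatrix(path, min_score=1.04):
    """Yield (sent1, sent2) pairs from a WikiMatrix .tsv.gz file whose score passes min_score."""
    import gzip
    with gzip.open(path, 'rt', encoding='utf-8', errors='ignore') as f:
        for line in f:
            cols = line.rstrip('\n').split('\t')
            if len(cols) >= 3 and float(cols[0]) >= min_score:
                yield cols[1], cols[2]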
def load_all(index: Index): data = """am-en am-fr ar-am ar-en ar-fr aym-am aym-ar aym-en aym-fr bg-ar bg-aym bg-en bg-fr bn-am bn-ar bn-aym bn-bg bn-en bn-fr ca-am ca-ar ca-aym ca-bg ca-bn ca-en ca-fr cs-ar cs-aym cs-bg cs-bn cs-ca cs-en cs-fr da-am da-ar da-aym da-bg da-bn da-ca da-cs da-en da-fr de-am de-ar de-aym de-bg de-bn de-ca de-cs de-da de-en de-fr el-am el-ar el-aym el-bg el-bn el-ca el-cs el-da el-de el-en el-fr eo-ar eo-aym eo-bg eo-bn eo-ca eo-cs eo-da eo-de eo-el eo-en eo-fr es-am es-ar es-aym es-bg es-bn es-ca es-cs es-da es-de es-el es-en es-eo es-fr fa-am fa-ar fa-aym fa-bg fa-bn fa-ca fa-cs fa-da fa-de fa-el fa-en fa-eo fa-es fa-fr fil-ar fil-aym fil-bg fil-bn fil-ca fil-cs fil-da fil-de fil-el fil-en fil-eo fil-es fil-fa fil-fr fr-en he-ar he-bn he-ca he-cs he-da he-de he-el he-en he-es he-fa he-fr hi-am hi-ar hi-bg hi-bn hi-cs hi-de hi-el hi-en hi-eo hi-es hi-fa hi-fr hu-am hu-ar hu-aym hu-bg hu-bn hu-ca hu-cs hu-da hu-de hu-el hu-en hu-eo hu-es hu-fa hu-fil hu-fr hu-hi id-am id-ar id-aym id-bg id-bn id-ca id-cs id-da id-de id-el id-en id-eo id-es id-fa id-fil id-fr id-hi id-hu it-am it-ar it-aym it-bg it-bn it-ca it-cs it-da it-de it-el it-en it-eo it-es it-fa it-fil it-fr it-he it-hi it-hu it-id jp-am jp-ar jp-aym jp-bg jp-bn jp-ca jp-cs jp-da jp-de jp-el jp-en jp-eo jp-es jp-fa jp-fil jp-fr jp-he jp-hi jp-hu jp-id jp-it km-ar km-aym km-bn km-ca km-da km-de km-el km-en km-es km-fa km-fil km-fr km-hu km-it km-jp ko-am ko-ar ko-aym ko-bg ko-bn ko-ca ko-cs ko-da ko-de ko-el ko-en ko-eo ko-es ko-fa ko-fil ko-fr ko-hu ko-id ko-it ko-jp ku-ar ku-el ku-en ku-es ku-fr ku-it ku-jp mg-am mg-ar mg-aym mg-bg mg-bn mg-ca mg-cs mg-da mg-de mg-el mg-en mg-eo mg-es mg-fa mg-fil mg-fr mg-he mg-hi mg-hu mg-id mg-it mg-jp mg-km mg-ko mg-ku mk-am mk-ar mk-aym mk-bg mk-bn mk-ca mk-cs mk-da mk-de mk-el mk-en mk-eo mk-es mk-fa mk-fil mk-fr mk-he mk-hi mk-hu mk-id mk-it mk-jp mk-km mk-ko mk-mg my-am my-ar my-aym my-bg my-bn my-ca my-cs my-da my-de my-el my-en my-es my-fa my-fil my-fr my-he my-hi my-hu my-id my-it my-jp my-ko my-mg my-mk ne-ar ne-aym ne-bg ne-bn ne-ca ne-cs ne-de ne-el ne-en ne-eo ne-es ne-fa ne-fr ne-hi ne-id ne-it ne-jp ne-ko ne-mg ne-mk nl-am nl-ar nl-aym nl-bg nl-bn nl-ca nl-cs nl-da nl-de nl-el nl-en nl-eo nl-es nl-fa nl-fil nl-fr nl-he nl-hi nl-hu nl-id nl-it nl-jp nl-km nl-ko nl-mg nl-mk nl-my nl-ne or-ar or-aym or-bn or-ca or-cs or-de or-el or-en or-es or-fa or-fr or-hi or-it or-jp or-mg or-mk or-nl pa-ar pa-bn pa-ca pa-cs pa-de pa-el pa-en pa-es pa-fr pa-hi pa-hu pa-it pa-jp pa-ko pa-mg pa-mk pa-ne pa-nl pl-am pl-ar pl-aym pl-bg pl-bn pl-ca pl-cs pl-da pl-de pl-el pl-en pl-eo pl-es pl-fa pl-fil pl-fr pl-he pl-hi pl-hu pl-id pl-it pl-jp pl-ko pl-ku pl-mg pl-mk pl-my pl-ne pl-nl pl-or pl-pa pt-am pt-ar pt-aym pt-bg pt-bn pt-ca pt-cs pt-da pt-de pt-el pt-en pt-eo pt-es pt-fa pt-fil pt-fr pt-he pt-hi pt-hu pt-id pt-it pt-jp pt-km pt-ko pt-ku pt-mg pt-mk pt-my pt-ne pt-nl pt-or pt-pa pt-pl ro-ar ro-aym ro-bg ro-bn ro-ca ro-cs ro-de ro-el ro-en ro-eo ro-es ro-fa ro-fr ro-hu ro-id ro-it ro-jp ro-ko ro-ku ro-mg ro-mk ro-my ro-ne ro-nl ro-pl ro-pt ru-am ru-ar ru-aym ru-bg ru-bn ru-ca ru-cs ru-da ru-de ru-el ru-en ru-eo ru-es ru-fa ru-fil ru-fr ru-he ru-hi ru-hu ru-id ru-it ru-jp ru-km ru-ko ru-mg ru-mk ru-my ru-ne ru-nl ru-or ru-pa ru-pl ru-pt ru-ro sq-am sq-ar sq-aym sq-bg sq-bn sq-ca sq-cs sq-da sq-de sq-el sq-en sq-eo sq-es sq-fa sq-fil sq-fr sq-hi sq-hu sq-id sq-it sq-jp sq-ko sq-mg sq-mk sq-my sq-nl sq-pl sq-pt sq-ru sr-am sr-ar sr-aym sr-bg sr-bn sr-ca sr-cs sr-da sr-de 
sr-el sr-en sr-eo sr-es sr-fa sr-fil sr-fr sr-hi sr-hu sr-id sr-it sr-jp sr-km sr-ko sr-mg sr-mk sr-my sr-ne sr-nl sr-pl sr-pt sr-ro sr-ru sr-sq sv-am sv-ar sv-aym sv-bg sv-bn sv-ca sv-cs sv-da sv-de sv-el sv-en sv-eo sv-es sv-fa sv-fil sv-fr sv-he sv-hi sv-hu sv-id sv-it sv-jp sv-ko sv-mg sv-mk sv-my sv-nl sv-pl sv-pt sv-ro sv-ru sv-sq sv-sr sw-am sw-ar sw-aym sw-bg sw-bn sw-ca sw-cs sw-da sw-de sw-el sw-en sw-eo sw-es sw-fa sw-fil sw-fr sw-he sw-hi sw-hu sw-id sw-it sw-jp sw-km sw-ko sw-mg sw-mk sw-my sw-ne sw-nl sw-pa sw-pl sw-pt sw-ro sw-ru sw-sq sw-sr sw-sv tet-ar tet-aym tet-bn tet-cs tet-de tet-el tet-en tet-es tet-fr tet-id tet-it tet-mg tet-pt tet-ru tet-sw tr-am tr-ar tr-aym tr-bg tr-bn tr-ca tr-cs tr-da tr-de tr-el tr-en tr-eo tr-es tr-fa tr-fil tr-fr tr-he tr-hi tr-hu tr-id tr-it tr-jp tr-ko tr-mg tr-mk tr-my tr-ne tr-nl tr-pa tr-pl tr-pt tr-ro tr-ru tr-sq tr-sr tr-sv tr-sw ur-am ur-ar ur-aym ur-bg ur-bn ur-ca ur-cs ur-da ur-de ur-el ur-en ur-eo ur-es ur-fa ur-fil ur-fr ur-he ur-hi ur-hu ur-id ur-it ur-jp ur-ko ur-mg ur-mk ur-my ur-ne ur-nl ur-or ur-pa ur-pl ur-pt ur-ro ur-ru ur-sq ur-sr ur-sv ur-sw ur-tr yo-ar yo-el yo-en yo-es yo-fr yo-it yo-mg yo-pl yo-pt yo-ru yo-sw zhs-am zhs-ar zhs-aym zhs-bg zhs-bn zhs-ca zhs-cs zhs-da zhs-de zhs-el zhs-en zhs-eo zhs-es zhs-fa zhs-fil zhs-fr zhs-he zhs-hi zhs-hu zhs-id zhs-it zhs-jp zhs-km zhs-ko zhs-mg zhs-mk zhs-my zhs-ne zhs-nl zhs-pa zhs-pl zhs-pt zhs-ro zhs-ru zhs-sq zhs-sr zhs-sv zhs-sw zhs-tr zhs-ur zht-am zht-ar zht-aym zht-bg zht-bn zht-ca zht-cs zht-da zht-de zht-el zht-en zht-eo zht-es zht-fa zht-fil zht-fr zht-he zht-hi zht-hu zht-id zht-it zht-jp zht-km zht-ko zht-mg zht-mk zht-my zht-ne zht-nl zht-pa zht-pl zht-pt zht-ro zht-ru zht-sq zht-sr zht-sv zht-sw zht-tet zht-tr zht-ur zht-zhs""" url = 'http://casmacat.eu/corpus/global-voices-tar-balls/training.tgz' cite = """Philipp Koehn, "Global Voices Corpus" http://casmacat.eu/corpus/global-voices.html """ # any hot fixes for lang id mapping specific to this source code_map = { 'jp': 'jpn', # there was never a jp in ISO 693, it was always a 'ja' not 'jp' 'zhs': 'zho' # map simplified to chinese } code_map = code_map.get for pair in data.split(): if 'zht' in pair: continue #skipping traditional chinese because I dont know the ISO code for it l1, l2 = pair.split('-') f1 = f'training/globalvoices.{l1}-{l2}.{l1}' f2 = f'training/globalvoices.{l1}-{l2}.{l2}' l1, l2 = code_map(l1, l1), code_map(l2, l2) # map codes ent = Entry(langs=(l1, l2), name='GlobalVoices_2018Q4', url=url, filename='GlobalVoices_2018Q4-training.tgz', in_ext='txt', cite=cite, in_paths=[f1, f2]) index.add_entry(ent)
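

# A minimal sketch, not part of the index: the long whitespace-separated pair strings in this file
# (WikiMatrix, GlobalVoices, ...) are easy to corrupt when edited by hand; a small self-check like
# this flags malformed or duplicated 'xx-yy' pairs before they turn into broken entries.
def check_pair_list(data):
    """Return (malformed, duplicates) found in a whitespace-separated 'xx-yy' pair string."""
    seen, malformed, dups = set(), [], []
    for pair in data.split():
        parts = pair.split('-')
        if len(parts) != 2 or not all(parts):
            malformed.append(pair)
        if pair in seen:
            dups.append(pair)
        seen.add(pair)
    return malformed, dups
# e.g. check_pair_list('am-en am-fr ar-am') -> ([], [])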