def resolve_entries(cls, langs, names):
    """Look up entries for ``names`` and insist that every name was found.

    Args:
        cls: Accepted for the call signature; not used in this body.
        langs: Forwarded unchanged to ``get_entries`` as ``langs``.
        names: Collection of names to look up; must contain no duplicates.

    Returns:
        Whatever ``get_entries`` returned, when the set of ``.name`` values
        on the returned entries equals the set of requested names.

    Raises:
        AssertionError: If ``names`` contains duplicates, or if the two name
            sets differ without any requested name actually being missing.
        Exception: If at least one requested name has no matching entry.
    """
    requested = set(names)
    assert len(requested) == len(names), f'{names} are not unique.'

    # ``get_entries`` is resolved from the enclosing module scope.
    entries = get_entries(langs=langs, names=requested)
    found = {entry.name for entry in entries}

    # Happy path: requested and returned names coincide exactly.
    if requested == found:
        return entries

    missed = requested - found
    # Sanity check: a mismatch is expected to mean something was missed.
    assert missed
    raise Exception(f'Could not find: {missed} for languages: {langs}')
def generate_report(langs, names, not_names=None, format='plain'):
    """Print how many matching entries exist per language, name and group.

    Entries are fetched with ``get_entries(langs, names, not_names)`` and
    tallied by ``lang_str``, ``did.name`` and ``did.group``. Three sections
    ("Languages:", "Names:", "Groups:") are written to stdout, one
    tab-separated ``key<TAB>count`` line per distinct key, in first-seen
    order.

    Args:
        langs: Forwarded to ``get_entries``.
        names: Forwarded to ``get_entries``.
        not_names: Forwarded to ``get_entries``.
        format: Not read by this function; kept for interface compatibility.
    """
    from mtdata.index import get_entries

    matched = get_entries(langs, names, not_names)

    by_lang = defaultdict(int)
    by_name = defaultdict(int)
    by_group = defaultdict(int)
    for entry in matched:
        by_lang[entry.lang_str] += 1
        by_name[entry.did.name] += 1
        by_group[entry.did.group] += 1

    # Later headings carry a leading newline so sections are separated
    # by a blank line.
    sections = (
        ("Languages:", by_lang),
        ("\nNames:", by_name),
        ("\nGroups:", by_group),
    )
    for heading, counts in sections:
        print(heading)
        for key, count in counts.items():
            print(f'{key}\t{count:,}')
def list_data(langs, names, not_names=None, full=False, groups=None, not_groups=None, id_only=False):
    """Print the entries matching the given filters, one per line.

    Args:
        langs: Forwarded to ``get_entries``.
        names: Forwarded to ``get_entries``.
        not_names: Forwarded to ``get_entries``.
        full: When true, each entry is followed by its ``cite`` value (or
            the literal ``CITATION_NOT_LISTED`` when that is falsy) and a
            blank line.
        groups: Forwarded to ``get_entries`` as ``groups``.
        not_groups: Forwarded to ``get_entries`` as ``not_groups``.
        id_only: When true, print only ``ent.did``; otherwise print
            ``ent.format(delim='\\t')``.

    The total number of entries is logged at INFO level once listing is done.
    """
    from mtdata.index import get_entries

    matches = get_entries(
        langs,
        names,
        not_names,
        groups=groups,
        not_groups=not_groups,
        fuzzy_match=True,
    )
    for entry in matches:
        print(entry.did if id_only else entry.format(delim='\t'))
        if full:
            print(entry.cite or "CITATION_NOT_LISTED", end='\n\n')
    log.info(f"Total {len(matches)} entries")
).json() names = [f'opus_{d["corpus"]}/{d["version"]}' for d in datasets['corpora']] elif type == 'sacrebleu': import sacrebleu names = [ f'sacrebleu_{name}' for name, meta in sacrebleu.DATASETS.items() if f'{source}-{target}' in meta or f'{target}-{source}' in meta ] elif type == 'mtdata': from mtdata.entry import LangPair, lang_pair from mtdata.index import get_entries from mtdata.iso import iso3_code source_tricode = iso3_code(source, fail_error=True) target_tricode = iso3_code(target, fail_error=True) exclude += ['opus', 'newstest', 'UNv1'] entries = sorted(get_entries( lang_pair(source_tricode + '-' + target_tricode), None, None, True), key=lambda entry: entry.did.group) names = [ f'mtdata_{entry.did.group}-{entry.did.name}-{entry.did.version}-{entry.did.lang_str}' for entry in entries ] else: print( f'Importer type {type} is unsupported. Supported importers: opus, mtdata, sacrebleu' ) cleaned = set() for name in names: filter = False for ex in exclude: if ex.lower() in name.lower():