Example #1
File: tmx.py Project: kpu/mtdata
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes, e.g. (de, en); when None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages, but provided with {lang_seg.keys()} in TMX {path}"
                    )
            if langs[0] in lang_seg and langs[1] in lang_seg:
                yield lang_seg[langs[0]], lang_seg[langs[1]]
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}--{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
Example #2
def parse_tmx(data, n_langs=2, log_every=DEF_PROGRESS):
    context = ET.iterparse(data, events=['end'])
    tus = (el for event, el in context if el.tag == 'tu')
    count, skips = 0, 0
    st = t = time.time()
    for tu in tus:
        langs, segs = [], []
        for tuv in tu.findall('tuv'):
            lang = [v for k, v in tuv.attrib.items() if k.endswith('lang')]
            if lang:
                langs.append(lang[0])
            seg = tuv.findtext('seg')
            if seg:
                segs.append(unescape(seg.strip()))
        if n_langs and len(segs) == len(langs) == n_langs:
            count += 1
            yield list(zip(langs, segs))
        else:
            skips += 1
            log.warning(
                f"Skipped: langs {langs} segs {len(segs)} ; Parsed count {count}"
            )
        if log_every and (time.time() - t) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - st))
            log.info(f"{elapsed} :: Parsed: {count:,} Skipped:{skips:,}")
            t = time.time()
        tu.clear()
    log.info(f"Skipped ={skips}; parsed: {count}")
Example #3
File: tmx.py Project: kpu/mtdata
def parse_tmx(data, log_every=DEF_PROGRESS):
    context = ET.iterparse(data, events=['end'])
    tus = (el for event, el in context if el.tag == 'tu')
    count = 0
    st = t = time.time()
    for tu in tus:
        lang_seg = {}
        for tuv in tu.findall('tuv'):
            lang = [v for k, v in tuv.attrib.items() if k.endswith('lang')]
            seg = tuv.findtext('seg')
            if lang and seg:
                lang = iso3_code(lang[0], fail_error=True)
                seg = unescape(seg.strip()).replace('\n', ' ').replace('\t', ' ')
                if lang in lang_seg:
                    log.warning(
                        f"Language {lang} appears twice in same translation unit."
                    )
                lang_seg[lang] = seg
        yield lang_seg
        count += 1
        if log_every and (time.time() - t) > log_every:
            elapsed = datetime.timedelta(seconds=round(time.time() - st))
            log.info(f"{elapsed} :: Parsed: {count:,}")
            t = time.time()
        tu.clear()
Example #4
def get_recipe(recipe_id,
               out_dir: Path,
               compress=False,
               drop_dupes=False,
               drop_tests=False,
               fail_on_error=False,
               n_jobs=DEF_N_JOBS,
               merge_train=True,
               **kwargs):
    if kwargs:
        log.warning(f"Args are ignored: {kwargs}")
    from mtdata.recipe import RECIPES
    recipe = RECIPES.get(recipe_id)
    if not recipe:
        raise ValueError(
            f'recipe {recipe_id} not found. See "mtdata list-recipe"')

    get_data(langs=recipe.langs,
             train_dids=recipe.train,
             dev_dids=recipe.dev,
             test_dids=recipe.test,
             merge_train=merge_train,
             out_dir=out_dir,
             compress=compress,
             drop_dupes=drop_dupes,
             drop_tests=drop_tests,
             fail_on_error=fail_on_error,
             n_jobs=n_jobs)
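A minimal usage sketch for get_recipe (not from the project): the recipe id below is a hypothetical placeholder; as the error message above notes, "mtdata list-recipe" shows the ids that actually exist.

from pathlib import Path

# 'some-recipe-id' is a hypothetical placeholder, not a real recipe id.
get_recipe('some-recipe-id', out_dir=Path('recipe-out'), compress=True, drop_dupes=True)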
Example #5
File: tmx.py Project: kpu/mtdata
def main(inp, out, langs):
    recs = read_tmx(inp, langs=langs)
    with IO.writer(out) as out:
        count = 0
        for rec in recs:
            rec = [l.replace('\t', ' ') for l in rec]
            out.write('\t'.join(rec) + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
Example #6
def main(inp, out):
    segs = read_sgm(inp)
    with IO.writer(out) as out:
        count = 0
        for seg in segs:
            seg = seg.replace('\t', ' ')
            out.write(seg + '\n')
            count += 1
        log.warning(f"Wrote {count} lines to {out}")
Example #7
def LangPair(string):
    parts = string.split('-')
    if len(parts) != 2:
        msg = f'expected value of form "xx-yy" eg "de-en"; given {string}'
        raise argparse.ArgumentTypeError(msg)
    iso_codes = [iso3_code(part, fail_error=True) for part in parts]
    if iso_codes != parts:
        log.warning(
            f"Suggestion: Use ISO 639_3 codes {'-'.join(iso_codes)} instead of {string}."
            f" Let's make a little space for all 7000+ languages of our planet 😢."
        )
    return tuple(iso_codes)
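A minimal sketch of plugging this type function into argparse; the flag names below are assumptions for illustration, not necessarily the ones mtdata's CLI uses.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-l', '--langs', type=LangPair, help='language pair, e.g. deu-eng')
args = parser.parse_args(['-l', 'de-en'])
print(args.langs)  # ('deu', 'eng'); the warning above suggests writing deu-eng directly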
Example #8
 def add_part(self,
              dir_path: Path,
              entry: Entry,
              drop_noise=False,
              compress=False):
     flag_file = dir_path / f'.valid.{entry.did}'
     if flag_file.exists():
         log.info(f"{flag_file} exits. Skipping")
         return -1, -1
     path = self.cache.get_entry(entry)
     # swap = entry.is_swap(self.langs)
     parser = Parser(path, ext=entry.in_ext or None, ent=entry)
     # langs = '_'.join(str(lang) for lang in self.langs)
     # Check that files are written in correct order
     l1, l2 = self.get_paths(dir_path, entry, compress=compress)
     io_args = dict(encoding='utf-8', errors='ignore')
     with IO.writer(l1, **io_args) as f1, IO.writer(l2, **io_args) as f2:
         count, skips, noise = 0, 0, 0
         for rec in parser.read_segs():
             rec = rec[:2]  # get the first two recs
             if len(rec) != 2:
                 skips += 1
                 continue
             if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                 skips += 1
                 noise += 1
                 continue
             sent1, sent2 = [s.strip() for s in rec]
             if not sent1 or not sent2:
                 skips += 1
                 continue
             sent1 = sent1.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
             sent2 = sent2.replace('\n', ' ').replace('\t', ' ').replace('\r', ' ')
             f1.write(f'{sent1}\n')
             f2.write(f'{sent2}\n')
             count += 1
         msg = f'Looks like an error. {count} segs are valid {skips} are invalid: {entry}'
         assert count > 0, msg
         if skips > count:
             log.warning(msg)
         if noise > 0:
             log.info(
                 f"{entry}: Noise : {noise:,}/{count:,} => {100 * noise / count:.4f}%"
             )
         log.info(f"wrote {count} lines to {l1} == {l2}")
     flag_file.touch()
     return count, skips
Example #9
    def __post_init__(self):
        if not isinstance(self.paths, list):
            self.paths = [self.paths]
        assert 1 <= len(self.paths) <= 2
        for p in self.paths:
            assert p.exists(), f'{p} does not exist'

        if not self.ext:
            exts = [detect_extension(p.name) for p in self.paths]
            if len(exts) == 2 and set(exts) == set(self.langs):
                log.warning(
                    f"Treating {' .'.join(exts)} as plain text. To override: in_ext=<ext>"
                )
                exts = ['txt']  # treat that as plain text
            assert len(set(exts)) == 1, f'Expected a single extension type, but found: {exts}'
            self.ext = exts[0]
Example #10
def read_tmx(path: Union[Path, str], langs=None):
    """
    reads a TMX file as records
    :param path: path to .tmx file
    :param langs: (lang1, lang2) codes, e.g. (de, en); when None, the code tries to auto-detect
    :return: stream of (text1, text2)
    """
    passes = 0
    fails = 0
    if langs:
        assert len(langs) == 2
        langs = [bcp47(lang) for lang in langs]
        assert not BCP47Tag.are_compatible(
            *langs), f'{langs} expected to be different (/unambiguous)'
    with IO.reader(path) as data:
        recs = parse_tmx(data)
        for lang_seg in recs:
            if langs is None:
                log.warning(
                    "langs not set; this could result in language mismatch")
                if len(lang_seg) == 2:
                    langs = tuple(lang_seg.keys())
                else:
                    raise Exception(
                        f"Language autodetect for TMX only supports 2 languages,"
                        f" but provided with {lang_seg.keys()} in TMX {path}")
            seg1, seg2 = None, None
            for lang, seg in lang_seg.items():
                if BCP47Tag.are_compatible(langs[0], lang):
                    seg1 = seg
                elif BCP47Tag.are_compatible(langs[1], lang):
                    seg2 = seg
                # else ignore
            if seg1 and seg2:  # both segs are found
                yield seg1, seg2
                passes += 1
            else:
                fails += 1
    if passes == 0:
        if fails == 0:
            raise Exception(f"Empty TMX {path}")
        raise Exception(f"Nothing for {langs[0]}-{langs[1]} in TMX {path}")
    if fails != 0:
        log.warning(
            f"Skipped {fails} entries due to language mismatch in TMX {path}")
    log.info(f"Extracted {passes} pairs from TMX {path}")
Example #11
File: data.py Project: kpu/mtdata
 def add_part(self, dir_path: Path, entry: Entry, drop_noise=False):
     path = self.cache.get_entry(entry)
     swap = entry.is_swap(self.langs)
     parser = Parser(path,
                     langs=self.langs,
                     ext=entry.in_ext or None,
                     ent=entry)
     langs = '_'.join(self.langs)
     l1 = (dir_path /
           f'{entry.name}-{langs}').with_suffix(f'.{self.langs[0]}')
     l2 = (dir_path /
           f'{entry.name}-{langs}').with_suffix(f'.{self.langs[1]}')
     mode = dict(mode='w', encoding='utf-8', errors='ignore')
     with l1.open(**mode) as f1, l2.open(**mode) as f2:
         count, skips, noise = 0, 0, 0
         for rec in parser.read_segs():
             rec = rec[:2]  # get the first two recs
             if len(rec) != 2:
                 skips += 1
                 continue
             if drop_noise and entry.is_noisy(seg1=rec[0], seg2=rec[1]):
                 skips += 1
                 noise += 1
                 continue
             sent1, sent2 = [s.strip() for s in rec]
             if not sent1 or not sent2:
                 skips += 1
                 continue
             if swap:
                 sent2, sent1 = sent1, sent2
             sent1 = sent1.replace('\n', ' ').replace('\t', ' ')
             sent2 = sent2.replace('\n', ' ').replace('\t', ' ')
             f1.write(f'{sent1}\n')
             f2.write(f'{sent2}\n')
             count += 1
         msg = f'Looks like an error. {count} segs are valid {skips} are invalid: {entry}'
         assert count > 0, msg
         if skips > count:
             log.warning(msg)
         if noise > 0:
             log.info(
                 f"{entry}: Noise : {noise:,}/{count:,} => {100*noise/count:.4f}%"
             )
         log.info(f"wrote {count} lines to {l1} == {l2}")
     return count, skips
Example #12
    def __post_init__(self):
        if not isinstance(self.paths, list):
            self.paths = [self.paths]
        for p in self.paths:
            assert p.exists(), f'{p} does not exist'

        if not self.ext:
            exts = [detect_extension(p.name) for p in self.paths]
            if len(exts) == 2:
                log.warning(
                    f"Treating {' .'.join(exts)} as plain text. To override: in_ext=<ext>"
                )
                exts = ['txt']  # treat that as plain text
            assert len(set(exts)) == 1, f'Expected a single extension type, but found: {exts}'
            self.ext = exts[0]
        assert 1 <= len(self.paths)
        # tsv and tmx just concatenate all of them
        assert len(self.paths) <= 3 or self.ext == 'tmx' or self.ext == 'tsv'
Example #13
def get_data(langs,
             out_dir,
             train_dids=None,
             test_dids=None,
             dev_dids=None,
             merge_train=False,
             compress=False,
             drop_dupes=False,
             drop_tests=False,
             fail_on_error=False,
             n_jobs=DEF_N_JOBS,
             **kwargs):
    if kwargs:
        log.warning(f"Args are ignored: {kwargs}")
    from mtdata.data import Dataset
    assert train_dids or test_dids, 'Required --train or --test or both'
    dataset = Dataset.prepare(langs,
                              train_dids=train_dids,
                              test_dids=test_dids,
                              out_dir=out_dir,
                              dev_dids=dev_dids,
                              cache_dir=CACHE_DIR,
                              merge_train=merge_train,
                              compress=compress,
                              drop_dupes=drop_dupes,
                              drop_tests=drop_tests,
                              fail_on_error=fail_on_error,
                              n_jobs=n_jobs)
    cli_sig = f'-l {"-".join(str(l) for l in langs)}'
    for flag, dids in [('-tr', train_dids), ('-ts', test_dids),
                       ('-dv', dev_dids)]:
        if dids:
            cli_sig += f' {flag} {" ".join(map(str, dids))}'
    for flag, val in [('--merge', merge_train), ('--compress', compress),
                      ('-dd', drop_dupes), ('-dt', drop_tests)]:
        if val:
            cli_sig += ' ' + flag
    sig = f'mtdata get {cli_sig} -o <out-dir>\nmtdata version {mtdata.__version__}\n'
    log.info(f'Dataset is ready at {dataset.dir}')
    log.info(f'mtdata args for reproducing this dataset:\n {sig}')
    with IO.writer(out_dir / 'mtdata.signature.txt', append=True) as w:
        w.write(sig)
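The cli_sig assembled above is also written to mtdata.signature.txt so the prepared dataset can be re-created later from the command line. With hypothetical dataset ids in place of <train-did> and <test-did>, the recorded signature looks roughly like:

mtdata get -l deu-eng -tr <train-did> -ts <test-did> --merge -o <out-dir>
mtdata version <version>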