Пример #1
0
def handle(srcfs, srtsf_base, srttf_base, srtsf, srttf, tgtf):
    """Write to ``tgtf`` the looked-up counterpart of every line of ``srcfs``.

    A lookup table is filled from two pairs of line-aligned files, first
    (``srtsf_base``, ``srttf_base``) and then (``srtsf``, ``srttf``), so an
    entry from the second pair replaces a same-key entry from the first.
    Pairs in which either stripped line is empty are ignored. Keys and
    values are UTF-8 decoded and passed through ``clean_str``.

    For each line of ``srcfs`` one line is written to ``tgtf``: the table
    value for the cleaned line, or an empty line when the source line is
    blank or has no entry.
    """

    table = {}

    def _collect(keyf, valf):
        # Merge one aligned (key file, value file) pair into ``table``.
        with open(keyf, "rb") as fk, open(valf, "rb") as fv:
            for kraw, vraw in zip(fk, fv):
                kraw, vraw = kraw.strip(), vraw.strip()
                if not (kraw and vraw):
                    continue
                key = clean_str(kraw.decode("utf-8"))
                table[key] = clean_str(vraw.decode("utf-8"))

    # Order matters: the later pair overrides the base pair.
    _collect(srtsf_base, srttf_base)
    _collect(srtsf, srttf)

    newline = "\n".encode("utf-8")

    with open(srcfs, "rb") as fin, open(tgtf, "wb") as fout:
        for raw in fin:
            raw = raw.strip()
            if raw:
                hit = table.get(clean_str(raw.decode("utf-8")), "")
                fout.write(hit.encode("utf-8"))
            # A newline is emitted for every input line to keep alignment.
            fout.write(newline)
Пример #2
0
def handle(srcfs, srcft, srcfg, tgtfs, tgtft, tgtfg):
    """Copy aligned triples whose second and third lines differ.

    Reads ``srcfs``, ``srcft`` and ``srcfg`` in lockstep. Triples in which
    any stripped line is empty are skipped and not counted. The remaining
    lines are UTF-8 decoded and passed through ``clean_str``; a triple is
    written to ``tgtfs`` / ``tgtft`` / ``tgtfg`` only when the cleaned second
    and third lines are not equal. A summary with the kept/total counts and
    the percentage kept is printed at the end.
    """

    newline = "\n".encode("utf-8")

    with open(srcfs, "rb") as fs, open(srcft, "rb") as ft, \
            open(srcfg, "rb") as fg, open(tgtfs, "wb") as fsw, \
            open(tgtft, "wb") as ftw, open(tgtfg, "wb") as fgw:
        kept = 0
        seen = 0
        for row in zip(fs, ft, fg):
            raw_first, raw_second, raw_third = (item.strip() for item in row)
            if not (raw_first and raw_second and raw_third):
                continue
            first = clean_str(raw_first.decode("utf-8"))
            second = clean_str(raw_second.decode("utf-8"))
            third = clean_str(raw_third.decode("utf-8"))
            if second != third:
                for sink, text in ((fsw, first), (ftw, second), (fgw, third)):
                    sink.write(text.encode("utf-8"))
                    sink.write(newline)
                kept += 1
            seen += 1
        # Guard against division by zero when no complete triple was read.
        ratio = float(kept) / float(seen) * 100.0 if seen > 0 else 0.0
        print("%d in %d data keeped with ratio %.2f" % (kept, seen, ratio))
Пример #3
0
def handle(srcf, tgtf, rankf, rssf, rstf, threshold):
    """Keep aligned source/target lines whose score is at most ``threshold``.

    Reads ``srcf``, ``tgtf`` and ``rankf`` in lockstep. Rows where any
    stripped line is empty are skipped and not counted. The score line is
    UTF-8 decoded and parsed with ``float``; the text lines are decoded and
    passed through ``clean_str``. Rows with ``score <= threshold`` are
    written to ``rssf`` (source) and ``rstf`` (target). A summary with the
    kept/total counts and the percentage kept is printed at the end.
    """

    with open(srcf, "rb") as frs, open(tgtf, "rb") as frt, \
            open(rankf, "rb") as fsc, open(rssf, "wb") as fws, \
            open(rstf, "wb") as fwt:

        newline = "\n".encode("utf-8")
        seen = 0
        kept = 0

        for row in zip(frs, frt, fsc):
            src_raw, tgt_raw, score_raw = (item.strip() for item in row)
            if not (src_raw and tgt_raw and score_raw):
                continue
            src_text = clean_str(src_raw.decode("utf-8"))
            tgt_text = clean_str(tgt_raw.decode("utf-8"))
            value = float(score_raw.decode("utf-8"))
            # Keep the comparison in this exact form so NaN scores are dropped.
            if value <= threshold:
                for sink, text in ((fws, src_text), (fwt, tgt_text)):
                    sink.write(text.encode("utf-8"))
                    sink.write(newline)
                kept += 1
            seen += 1

        # Guard against division by zero when no complete row was read.
        ratio = float(kept) / float(seen) * 100.0 if seen > 0 else 0.0
        print("%d in %d data keeped with ratio %.2f" % (kept, seen, ratio))
Пример #4
0
    def __call__(self, paragraphs):
        """Run ``self.flow`` over the text in ``paragraphs`` and return a string.

        ``paragraphs`` is stripped and split on newlines; each non-empty
        piece is stripped and passed through ``clean_str``. When
        ``self.sent_split`` is ``None`` every paragraph is used as one unit;
        otherwise each paragraph is split with ``self.sent_split`` and the
        cleaned pieces are filtered through ``clean_list``. A ``"\\n"``
        marker is appended after each paragraph's units.

        The unit list is passed to ``sorti``, each callable in ``self.flow``
        is applied in order to the result, and ``restore`` maps the outputs
        back onto the original unit list.
        NOTE(review): ``sorti``, ``restore`` and ``clean_list`` are defined
        elsewhere; their exact semantics should be confirmed there.

        Returns the restored units joined by spaces, with the spaces next to
        newline characters removed.
        """

        _paras = [
            clean_str(tmpu.strip()) for tmpu in paragraphs.strip().split("\n")
            if tmpu
        ]

        _tmp = []
        if self.sent_split is None:
            # Fixed: this loop used to iterate the undefined name ``paras``,
            # raising NameError whenever no sentence splitter was configured.
            for _tmpu in _paras:
                _tmp.append(_tmpu)
                _tmp.append("\n")
        else:
            for _tmpu in _paras:
                _tmp.extend(
                    clean_list([
                        clean_str(_tmps) for _tmps in self.sent_split(_tmpu)
                    ]))
                _tmp.append("\n")
        # ``_tmpi`` keeps the flow input so ``restore`` can pair it with the
        # flow output.
        _tmp_o = _tmpi = sorti(_tmp)

        for pu in self.flow:
            _tmp_o = pu(_tmp_o)

        _tmp = restore(_tmp, _tmpi, _tmp_o)

        return " ".join(_tmp).replace(" \n", "\n").replace("\n ", "\n")
Пример #5
0
def handle(srcfs, srcfm, srtsf, srtmf, srttf, tgtf):
    """Write to ``tgtf`` the value looked up for each (``srcfs``, ``srcfm``) line pair.

    A table keyed by a 2-tuple is built from the line-aligned files
    ``srtsf``, ``srtmf`` and ``srttf``: rows whose stripped first or third
    line is empty are skipped (the middle line may be empty). All three
    lines are UTF-8 decoded and passed through ``clean_str``; the first two
    form the key and the third is the value.

    ``srcfs`` and ``srcfm`` are then read in lockstep. For each pair one
    line is written to ``tgtf``: the table value for the cleaned pair, or an
    empty line when the ``srcfs`` line is blank or the pair has no entry.
    """

    table = {}

    with open(srtsf, "rb") as fs, open(srtmf, "rb") as fm, \
            open(srttf, "rb") as ft:
        for row in zip(fs, fm, ft):
            first, middle, last = (item.strip() for item in row)
            if not (first and last):
                continue
            first = clean_str(first.decode("utf-8"))
            middle = clean_str(middle.decode("utf-8"))
            last = clean_str(last.decode("utf-8"))
            table[(first, middle)] = last

    newline = "\n".encode("utf-8")

    with open(srcfs, "rb") as fs, open(srcfm, "rb") as fm, \
            open(tgtf, "wb") as fout:
        for row in zip(fs, fm):
            first, middle = (item.strip() for item in row)
            if first:
                first = clean_str(first.decode("utf-8"))
                middle = clean_str(middle.decode("utf-8"))
                hit = table.get((first, middle), "")
                fout.write(hit.encode("utf-8"))
            # A newline is emitted for every input pair to keep alignment.
            fout.write(newline)
Пример #6
0
def restore(src, tsrc, trs):
    """Map lines of ``src`` through a ``tsrc`` -> ``trs`` table and regroup them.

    The table pairs each stripped ``tsrc`` item with the ``clean_str`` of the
    stripped ``trs`` item at the same position; pairs with an empty side are
    skipped.

    ``src`` is then scanned in order. A line whose ``clean_str`` form has a
    non-empty table value contributes that value to the current group. Any
    other line (blank, or without a usable table value) closes the current
    group, which is appended to the result joined by single spaces. A blank
    line seen while no group is open appends an empty string. A group still
    open at the end is appended as well.

    Returns the list of grouped strings.
    """

    mapping = {}
    for key, value in zip(tsrc, trs):
        key, value = key.strip(), value.strip()
        if key and value:
            mapping[key] = clean_str(value)

    rs = []
    pending = []
    for line in src:
        stripped = line.strip()
        if stripped:
            found = mapping.get(clean_str(stripped), "").strip()
        else:
            found = ""
        if found:
            pending.append(found)
        elif pending:
            # Any miss or blank line flushes the group collected so far.
            rs.append(" ".join(pending))
            pending = []
        elif not stripped:
            # Blank line with nothing pending: keep it as an empty entry.
            rs.append("")
    if pending:
        rs.append(" ".join(pending))

    return rs
Пример #7
0
    def __call__(self, paragraph):
        """Run ``self.flow`` over ``paragraph`` and return the resulting text.

        ``paragraph`` is stripped, split on newlines, and each line stripped.

        * Without a sentence splitter (``self.sent_split is None``) every
          line is passed through ``clean_str`` and fed to the flow.
        * With a splitter and more than one line, each non-empty line is
          split with ``self.sent_split`` and an empty string is inserted
          between consecutive lines; the list is passed to ``sorti`` before
          entering the flow.
        * With a splitter and a single line, that line is passed through
          ``clean_str`` and fed to the flow as a one-element list.

        Each callable in ``self.flow`` is applied in order. If more than one
        sentence entry was collected, the result is mapped back with
        ``restore`` and joined by newlines; otherwise the flow output is
        joined by spaces.
        NOTE(review): ``sorti`` and ``restore`` are defined elsewhere; their
        exact semantics should be confirmed there.
        """

        lines = [seg.strip() for seg in paragraph.strip().split("\n")]
        sentences = []
        sorted_src = None
        if self.sent_split is None:
            work = [clean_str(seg) for seg in lines]
        elif len(lines) > 1:
            last = len(lines) - 1
            for idx, seg in enumerate(lines):
                if seg:
                    sentences.extend(self.sent_split(seg))
                # Empty entry marks the boundary between input lines.
                if idx < last:
                    sentences.append("")
            sorted_src = sorti(sentences)
            work = sorted_src
        else:
            work = [clean_str(lines[0])]

        for unit in self.flow:
            work = unit(work)

        if len(sentences) > 1:
            return "\n".join(restore(sentences, sorted_src, work))

        return " ".join(work)
Пример #8
0
def restore(src, tsrc, trs):
	"""Replace each line of ``src`` by its table value, if one exists.

	The table pairs each stripped ``tsrc`` item with the ``clean_str`` of the
	stripped ``trs`` item at the same position; pairs with an empty side are
	skipped. Each ``src`` line is looked up by ``clean_str`` of its stripped
	form, and lines without an entry are returned unchanged.

	Returns a list with one entry per line of ``src``.
	"""

	table = {}
	for key, value in zip(tsrc, trs):
		key = key.strip()
		value = value.strip()
		if not (key and value):
			continue
		table[key] = clean_str(value)

	restored = []
	for line in src:
		lookup = clean_str(line.strip())
		# Fall back to the untouched input line on a miss.
		restored.append(table.get(lookup, line))
	return restored
Пример #9
0
def handle(srcfl, srtfl, tgtf):
    """Write to ``tgtf`` the value looked up for each row read via ``srcfl``.

    Rows are read in lockstep from ``FileList(srtfl, "rb")``. A row is used
    only if every stripped line is non-empty; its lines are UTF-8 decoded
    and passed through ``clean_str``. All but the last line form the tuple
    key, and the last line, re-encoded as UTF-8, is the value.

    Rows from ``FileList(srcfl, "rb")`` are processed the same way and looked
    up as a whole tuple. One line is written to ``tgtf`` per row: the stored
    value when the row is complete and present in the table, otherwise an
    empty line.
    NOTE(review): ``FileList`` is defined elsewhere; presumably it opens
    every path in the list and yields the file objects -- confirm.
    """

    table = {}

    with FileList(srtfl, "rb") as fs:
        for row in zip(*fs):
            row = tuple(item.strip() for item in row)
            if not all(row):
                continue
            row = tuple(clean_str(item.decode("utf-8")) for item in row)
            *key, value = row
            table[tuple(key)] = value.encode("utf-8")

    newline = "\n".encode("utf-8")
    with FileList(srcfl, "rb") as fs, open(tgtf, "wb") as fout:
        for row in zip(*fs):
            row = tuple(item.strip() for item in row)
            if all(row):
                row = tuple(clean_str(item.decode("utf-8")) for item in row)
                hit = table.get(row)
                if hit is not None:
                    fout.write(hit)
            # A newline is emitted for every input row to keep alignment.
            fout.write(newline)
Пример #10
0
def handle(srcfl, rsfl):
    """Shuffle line-aligned files row-wise and write them to ``rsfl``.

    Rows are read in lockstep from ``FileList(srcfl, "rb")``; every line is
    stripped, UTF-8 decoded and passed through ``clean_str``. The rows are
    reordered with ``shuffle`` and then transposed back into columns, each
    column being written to the path at the same position in ``rsfl`` as
    newline-joined UTF-8 text followed by a final newline.
    NOTE(review): ``FileList`` and ``shuffle`` are defined or imported
    elsewhere; ``shuffle`` is presumably ``random.shuffle`` -- confirm.
    """

    with FileList(srcfl, "rb") as files:
        rows = [
            [clean_str(cell.strip().decode("utf-8")) for cell in row]
            for row in zip(*files)
        ]

    shuffle(rows)

    newline = "\n".encode("utf-8")
    columns = zip(*rows)
    for column, rsf in zip(columns, rsfl):
        with open(rsf, "wb") as fwrt:
            payload = "\n".join(column)
            fwrt.write(payload.encode("utf-8"))
            fwrt.write(newline)
Пример #11
0
def handle(srcfl, rsfl):
	"""Shuffle line-aligned files row-wise and write them to ``rsfl``.

	The files in ``srcfl`` are read in lockstep; every line is stripped,
	UTF-8 decoded and passed through ``clean_str``. The rows are reordered
	with ``shuffle`` and then transposed back into columns, each column
	being written to the path at the same position in ``rsfl`` as
	newline-joined UTF-8 text followed by a final newline.

	All paths in ``rsfl`` are opened (and therefore created or truncated)
	before writing starts, as before. Every opened file is now closed even
	if reading or writing raises, and output files that receive no column
	(for example when the input is empty) are closed as well.
	NOTE(review): ``shuffle`` is imported elsewhere; presumably
	``random.shuffle`` -- confirm.
	"""

	files = []
	try:
		# Open one by one so a failure part-way still closes earlier files.
		for srcf in srcfl:
			files.append(open(srcf, "rb"))
		data = [
			[clean_str(tmpu.strip().decode("utf-8")) for tmpu in lines]
			for lines in zip(*files)
		]
	finally:
		for frd in files:
			frd.close()

	shuffle(data)

	ens = "\n".encode("utf-8")
	files = []
	try:
		for rsf in rsfl:
			files.append(open(rsf, "wb"))
		for du, fwrt in zip(zip(*data), files):
			fwrt.write("\n".join(du).encode("utf-8"))
			fwrt.write(ens)
	finally:
		# Close every output file, including ones zip() never paired with
		# a column.
		for fwrt in files:
			fwrt.close()