예제 #1
0
def pretokenize(args, in_path: str, out_path: str, src: str, tgt: str):
    """Moses-tokenize ``in_path`` line by line and write the result to ``out_path``.

    When ``out_path`` already exists, a notice is printed and the function
    returns without reading or writing anything.

    Args:
        args: Object exposing ``source_lang`` and ``target_lang``; both are
            forwarded to ``MosesTokenizer``.
        in_path: Path of the text file to read.
        out_path: Path of the file that receives the tokenized lines.
        src: Not used by this function; kept so existing callers keep working.
        tgt: Not used by this function; kept so existing callers keep working.
    """
    if os.path.exists(out_path):
        print(f"{out_path} exists. skipping it.")
        return
    Args = namedtuple(
        "Args",
        [
            "source_lang",
            "target_lang",
            "moses_no_dash_splits",
            "moses_no_escape",
        ],
    )
    _args = Args(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        moses_no_dash_splits=False,
        moses_no_escape=False,
    )
    pretokenizer = MosesTokenizer(_args)
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            # Each line is stripped of surrounding whitespace before encoding,
            # then terminated with exactly one newline in the output.
            f_o.write(pretokenizer.encode(s.strip()) + "\n")
예제 #2
0
def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
    """Pass every line of ``in_path`` through ``MosesTokenizer.encode``.

    The tokenizer is configured with ``src`` and ``tgt`` as its Moses
    source/target languages and with both ``moses_no_dash_splits`` and
    ``moses_no_escape`` set to ``False``. Each input line is stripped of
    surrounding whitespace, encoded, and written to ``out_path`` followed by
    a single newline. ``out_path`` is opened in write mode, so any existing
    content is replaced.
    """
    # Insertion order of this dict defines the namedtuple field order.
    settings = {
        'moses_source_lang': src,
        'moses_target_lang': tgt,
        'moses_no_dash_splits': False,
        'moses_no_escape': False,
    }
    Args = namedtuple('Args', list(settings))
    moses = MosesTokenizer(Args(**settings))
    with open(in_path) as reader, open(out_path, 'w') as writer:
        writer.writelines(moses.encode(line.strip()) + '\n' for line in reader)
예제 #3
0
        def tokenizer(sentence):
            """Apply BPE to ``sentence`` and encode the result with the dictionary.

            A fresh ``Args`` object, Moses tokenizer, BPE encoder and
            dictionary (loaded from ``dict.en.txt``) are created on every
            call. Returns whatever ``Dictionary.encode_line`` yields for the
            BPE-encoded sentence, with ``add_if_not_exist=False``.
            """
            config = Args()

            # NOTE(review): the Moses tokenizer is constructed but never applied
            # to ``sentence``; presumably Moses tokenization before BPE was
            # intended -- confirm. Construction is kept so behavior is unchanged.
            moses = MosesTokenizer(config)
            subword = SubwordNMTBPE(config)
            vocab = Dictionary.load('dict.en.txt')

            pieces = subword.encode(sentence)
            return vocab.encode_line(pieces, add_if_not_exist=False)
예제 #4
0
def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
    """Write a Moses-tokenized copy of ``in_path`` to ``out_path``.

    Args:
        in_path: Path of the text file to read, processed one line at a time.
        out_path: Path of the output file; opened in write mode.
        src: Value passed to the tokenizer as ``moses_source_lang``.
        tgt: Value passed to the tokenizer as ``moses_target_lang``.
    """
    field_names = [
        "moses_source_lang",
        "moses_target_lang",
        "moses_no_dash_splits",
        "moses_no_escape",
    ]
    Args = namedtuple("Args", field_names)
    # Positional values follow the order of ``field_names``; both boolean
    # Moses switches are left at False.
    tokenizer_args = Args(src, tgt, False, False)
    encode = MosesTokenizer(tokenizer_args).encode
    with open(in_path) as source, open(out_path, "w") as sink:
        for raw_line in source:
            tokenized = encode(raw_line.strip())
            sink.write(tokenized + "\n")