def pretokenize(args, in_path: str, out_path: str, src: str, tgt: str):
    """Moses-pretokenize each line of ``in_path`` and write the result to ``out_path``.

    Skips all work when ``out_path`` already exists (idempotent re-runs).

    NOTE(review): ``src`` and ``tgt`` are unused; the languages are taken from
    ``args.source_lang`` / ``args.target_lang`` instead — confirm against callers.
    """
    if os.path.exists(out_path):
        print(f"{out_path} exists. skipping it.")
        return
    # MosesTokenizer expects an argparse-like namespace; build a minimal one.
    Args = namedtuple(
        "Args",
        [
            "source_lang",
            "target_lang",
            "moses_no_dash_splits",
            "moses_no_escape",
        ],
    )
    _args = Args(
        source_lang=args.source_lang,
        target_lang=args.target_lang,
        moses_no_dash_splits=False,
        moses_no_escape=False,
    )
    pretokenizer = MosesTokenizer(_args)
    # Stream line by line: tokenize each stripped input line and append a newline.
    with open(in_path) as f, open(out_path, "w") as f_o:
        for s in f:
            f_o.write(pretokenizer.encode(s.strip()) + "\n")
def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
    """Run the Moses pretokenizer over every line of *in_path*, writing to *out_path*."""
    # Minimal argparse-style namespace that MosesTokenizer expects.
    MosesConfig = namedtuple('Args', ['moses_source_lang', 'moses_target_lang', 'moses_no_dash_splits', 'moses_no_escape'])
    config = MosesConfig(moses_source_lang=src, moses_target_lang=tgt, moses_no_dash_splits=False, moses_no_escape=False)
    moses = MosesTokenizer(config)
    with open(in_path) as fin, open(out_path, 'w') as fout:
        # One tokenized line out per stripped line in.
        fout.writelines(moses.encode(line.strip()) + '\n' for line in fin)
def tokenizer(sentence):
    """Encode *sentence* into fairseq dictionary indices: BPE-encode, then map
    tokens through ``dict.en.txt`` (unknown tokens are not added).

    NOTE(review): a MosesTokenizer is constructed but never applied — the
    original likely intended ``bpe.encode(moses.encode(sentence))``; confirm
    before changing behavior.
    """
    attrs = Args()
    # Renamed from `tokenizer`: the original local shadowed this function's own name.
    moses = MosesTokenizer(attrs)
    bpe = SubwordNMTBPE(attrs)
    dictionary = Dictionary.load('dict.en.txt')
    return dictionary.encode_line(bpe.encode(sentence), add_if_not_exist=False)
def pretokenize(in_path: str, out_path: str, src: str, tgt: str):
    """Pretokenize *in_path* line by line with the Moses tokenizer into *out_path*."""
    # Build the argparse-like namespace MosesTokenizer expects (positional construction).
    fields = (
        "moses_source_lang",
        "moses_target_lang",
        "moses_no_dash_splits",
        "moses_no_escape",
    )
    Args = namedtuple("Args", fields)
    moses_args = Args(src, tgt, False, False)
    moses = MosesTokenizer(moses_args)
    with open(in_path) as source_file, open(out_path, "w") as target_file:
        for line in source_file:
            # Strip trailing newline/whitespace, tokenize, re-terminate the line.
            target_file.write(moses.encode(line.strip()) + "\n")