def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()
    text = "foo-bar"
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text
def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = (
        "By the mid 1990s a version of the game became a Latvian television series "
        "(with a parliamentary setting, and played by Latvian celebrities)."
    )
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_mixed_cjk_tokenization(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = u"Japan is 日本 in Japanese."
    assert tokenizer.tokenize(text) == [
        u"Japan",
        u"is",
        u"日",
        u"本",
        u"in",
        u"Japanese",
        u".",
    ]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_moses_detokenize(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()
    text = u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    expected_tokens = mt.tokenize(text)
    expected_detokens = (
        u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    )
    assert md.detokenize(expected_tokens) == expected_detokens

    text = (
        "This ain't funny. It's actually hillarious, yet double Ls. "
        "| [] < > [ ] & You're gonna shake it off? Don't?"
    )
    expected_tokens = [
        u"This", u"ain", u"'t", u"funny", u".", u"It", u"'s", u"actually",
        u"hillarious", u",", u"yet", u"double", u"Ls", u".", u"|", u"[",
        u"]", u"<", u">", u"[", u"]", u"&", u"You", u"'re", u"gonna",
        u"shake", u"it", u"off", u"?", u"Don", u"'t", u"?",
    ]
    expected_detokens = (
        "This ain't funny. It's actually hillarious, yet double Ls. "
        "| [] < > [] & You're gonna shake it off? Don't?"
    )
    assert mt.tokenize(text) == expected_tokens
    assert md.detokenize(expected_tokens) == expected_detokens
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-i",
        "--input-pe-file",
        required=True,
        help="output of the PE system to be post-processed",
    )
    parser.add_argument(
        "-tlang",
        "--target-language",
        dest="target_language",
        help="required for detokenization",
    )
    parser.add_argument(
        "-detok",
        "--detokenize",
        action="store_true",
        help="do detokenization (-tlang is necessary)",
    )
    parser.add_argument(
        "-c",
        "--codes-file",
        required=True,
        help="codes file used to post-process the output",
    )
    args = parser.parse_args()

    detokenizer = None
    if args.detokenize:
        assert (
            args.target_language is not None
        ), "--target-language is required for detokenization"
        detokenizer = MosesDetokenizer(lang=args.target_language)

    postprocess(
        ape_file=args.input_pe_file,
        codes_file=args.codes_file,
        detokenizer=detokenizer,
    )
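For reference, a minimal sketch of exercising this entry point in-process; the module name (postprocess.py) and the file names (ape.out, codes.bpe) are hypothetical, and the flags simply mirror the parser defined above:

# Hypothetical in-process invocation, equivalent to:
#   python postprocess.py -i ape.out -c codes.bpe -detok -tlang de
import sys

sys.argv = ["postprocess.py", "-i", "ape.out",
            "-c", "codes.bpe", "-detok", "-tlang", "de"]
main()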
def detokenize_file(
    iterator,
    language,
    processes,
    quiet,
    xml_unescape,
):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    return parallel_or_not(
        list(map(str.split, iterator)), moses_detokenize, processes, quiet
    )
def detokenize_file(language, processes, xml_unescape, encoding):
    moses = MosesDetokenizer(lang=language)
    moses_detokenize = partial(moses.detokenize, return_str=True, unescape=xml_unescape)
    with click.get_text_stream("stdin", encoding=encoding) as fin:
        with click.get_text_stream("stdout", encoding=encoding) as fout:
            # With a single process, joblib parallelization is slower,
            # so just process line by line normally.
            if processes == 1:
                for line in tqdm(fin.readlines()):
                    print(moses_detokenize(str.split(line)), end="\n", file=fout)
            else:
                document_iterator = map(str.split, fin.readlines())
                for outline in parallelize_preprocess(
                    moses_detokenize, document_iterator, processes, progress_bar=True
                ):
                    print(outline, end="\n", file=fout)
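Both detokenize_file variants wrap the same detokenize call via partial(); a minimal standalone sketch of that call, with a made-up sample line:

# Minimal sketch: detokenize a pre-split line back into a string,
# unescaping XML entities as the CLI does when xml_unescape is set.
from sacremoses import MosesDetokenizer

md = MosesDetokenizer(lang="en")
tokens = "Hello , world ! &amp; goodbye .".split()
print(md.detokenize(tokens, return_str=True, unescape=True))
# expected output: Hello, world! & goodbye.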
def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")
    text = u"세계 에서 가장 강력한."
    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")
    text = u"L'amitié nous a fait forts d'esprit"
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
import logging

import flask
from flask_cors import CORS

from sacremoses.tokenize import MosesTokenizer, MosesDetokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = flask.Flask(__name__)
CORS(app)

MODEL_PATH = None
PSEUDO_SERVICE_URL = True
MODEL = None
KEYWORD_PROCESSOR = None
TRAINING_TAGS = None
MOSES_TOKENIZER = MosesTokenizer(lang="fr")
MOSES_DETOKENIZER = MosesDetokenizer(lang="fr")
# pattern = r"\@-\@|\w+['´`]|\w+|\S+"
# regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)


def load_names_processor():
    # :: Load vocabulary for is_name features ::
    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor

    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded French proper names...")
    return KEYWORD_PROCESSOR
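A minimal sketch of how the module-level tokenizer globals above might be exposed over HTTP; the /detokenize route and its JSON payload shape are assumptions for illustration, not part of the original module:

# Hypothetical route exposing MOSES_DETOKENIZER; the endpoint name and the
# {"tokens": [...]} payload shape are assumed for this sketch.
@app.route("/detokenize", methods=["POST"])
def detokenize():
    tokens = flask.request.get_json()["tokens"]
    return flask.jsonify({"text": MOSES_DETOKENIZER.detokenize(tokens)})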