def test_chinese_tokenization(self):
    tokenizer = MosesTokenizer(lang="zh")
    text = u"记者 应谦 美国"
    assert tokenizer.tokenize(text) == [u"记者", u"应谦", u"美国"]
def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")
    text = u"세계 에서 가장 강력한."
    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")
    text = u"L'amitié nous a fait forts d'esprit"
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_trailing_dot_apostrophe(self):
    moses = MosesTokenizer()
    text = "'Hello.'"
    expected_tokens = "'Hello . '".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)
def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()
    text = "foo-bar"
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text
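# A minimal sketch (not part of the test above) of what aggressive dash
# splitting produces: sacremoses replaces an in-word hyphen with the @-@
# placeholder token, which the detokenizer then folds back into "foo-bar".
mt = MosesTokenizer()
assert mt.tokenize("foo-bar", aggressive_dash_splits=True) == ["foo", "@-@", "bar"]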
def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """Pre-process input files before post-editing.

    Splits at <br>, removes <i> tags and music symbols, and stores the
    removed markup as codes in a codes file in output_dir.

    Args:
        src_file: source file of the translation to be preprocessed
        mt_file: output file of the MT system to be preprocessed
        output_dir: directory for the preprocessed files and the codes file
    """
    punct_normalizer = MosesPunctNormalizer()

    # Set up the tokenizer, if a language was given.
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir + '/codes.' + os.path.basename(mt_file)
    src_out_file = output_dir + '/' + os.path.basename(src_file) + '.pre'
    mt_out_file = output_dir + '/' + os.path.basename(mt_file) + '.pre'

    with open(src_out_file, 'w') as fosrc, open(mt_out_file, 'w') as fomt, \
            open(code_file, 'w') as fcodes, \
            open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # Standardize <br> tags.
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)

            # If src and mt contain the same number of <br>, split them and
            # save the parts as multiple lines; otherwise do not split.
            src_split = re.split(r'\s*<br>\s*', src)
            mt_split = re.split(r'\s*<br>\s*', mt)
            if len(src_split) != len(mt_split):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # Check whether either side starts with a hyphen.
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()
                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # Check whether either side contains a music symbol.
                music_syms = ('♫', '♬', '♪')
                music_re = r'\s*[{}]\s*'.format(''.join(music_syms))
                has_music = False
                if re.search(music_re, src_part):
                    has_music = True
                    src_part = re.sub(music_re, '', src_part)
                if re.search(music_re, mt_part):
                    has_music = True
                    mt_part = re.sub(music_re, '', mt_part)

                # Check for enclosing italics tags; otherwise leave as is.
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True
                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True

                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip() + '\n')
                fomt.write(mt_part.strip() + '\n')
                fcodes.write("{}\n".format(code))
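# A hypothetical invocation of preprocess(); the paths below are placeholders,
# not files from this project. With these arguments it would write
# episode1.src.pre, episode1.mt.pre and codes.episode1.mt into out/.
if __name__ == "__main__":
    preprocess("data/episode1.src", "data/episode1.mt", "out", tokenize_lang="fr")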
def test_moses_tokenize(self):
    moses = MosesTokenizer()

    # Tokenize a sentence.
    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    tokenized_text = moses.tokenize(text, return_str=True)
    assert tokenized_text == expected_tokens

    # The nonbreaking prefixes should tokenize the final fullstop.
    assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

    # The nonbreaking prefixes should handle the situation where a
    # numeric-only prefix is the last token: below, "pp" is the last
    # element and no digit follows it.
    assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

    # Test the escape option: with escape=True, XML special characters
    # are replaced by HTML entities.
    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens_with_xmlescape = [
        "This", "ain", "&apos;t", "funny", ".",
        "It", "&apos;s", "actually", "hillarious", ",",
        "yet", "double", "Ls", ".",
        "&#124;", "&#91;", "&#93;", "&lt;", "&gt;", "&#91;", "&#93;", "&amp;",
        "You", "&apos;re", "gonna", "shake", "it", "off", "?",
        "Don", "&apos;t", "?",
    ]
    expected_tokens_wo_xmlescape = [
        "This", "ain", "'t", "funny", ".",
        "It", "'s", "actually", "hillarious", ",",
        "yet", "double", "Ls", ".",
        "|", "[", "]", "<", ">", "[", "]", "&",
        "You", "'re", "gonna", "shake", "it", "off", "?",
        "Don", "'t", "?",
    ]
    assert moses.tokenize(text, escape=True) == expected_tokens_with_xmlescape
    assert moses.tokenize(text, escape=False) == expected_tokens_wo_xmlescape

    # Test to check https://github.com/alvations/sacremoses/issues/19
    text = "this 'is' the thing"
    expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
    assert moses.tokenize(text, escape=True) == expected_tokens
def test_expected_num_only_prefixes(self):
    """Testing that the NUMERIC_ONLY_PREFIXES parsing behaves the same
    without the ReDoS-able regex."""
    expected_prefixes = {
        'as': [],
        'bn': [],
        'ca': [],
        'cs': [],
        'de': [],
        'el': [],
        'en': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
               ('pp', 'pp #NUMERIC_ONLY#')],
        'es': [],
        'et': [],
        'fi': [],
        'fr': [],
        'ga': [('lch', 'lch #NUMERIC_ONLY#'), ('lgh', 'lgh #NUMERIC_ONLY#'),
               ('uimh', 'uimh #NUMERIC_ONLY#')],
        'gu': [],
        'hi': [],
        'hu': [('jan', 'jan #NUMERIC_ONLY#'), ('Jan', 'Jan #NUMERIC_ONLY#'),
               ('Feb', 'Feb #NUMERIC_ONLY#'), ('feb', 'feb #NUMERIC_ONLY#'),
               ('márc', 'márc #NUMERIC_ONLY#'), ('Márc', 'Márc #NUMERIC_ONLY#'),
               ('ápr', 'ápr #NUMERIC_ONLY#'), ('Ápr', 'Ápr #NUMERIC_ONLY#'),
               ('máj', 'máj #NUMERIC_ONLY#'), ('Máj', 'Máj #NUMERIC_ONLY#'),
               ('jún', 'jún #NUMERIC_ONLY#'), ('Jún', 'Jún #NUMERIC_ONLY#'),
               ('Júl', 'Júl #NUMERIC_ONLY#'), ('júl', 'júl #NUMERIC_ONLY#'),
               ('aug', 'aug #NUMERIC_ONLY#'), ('Aug', 'Aug #NUMERIC_ONLY#'),
               ('Szept', 'Szept #NUMERIC_ONLY#'), ('szept', 'szept #NUMERIC_ONLY#'),
               ('okt', 'okt #NUMERIC_ONLY#'), ('Okt', 'Okt #NUMERIC_ONLY#'),
               ('nov', 'nov #NUMERIC_ONLY#'), ('Nov', 'Nov #NUMERIC_ONLY#'),
               ('dec', 'dec #NUMERIC_ONLY#'), ('Dec', 'Dec #NUMERIC_ONLY#'),
               ('tel', 'tel #NUMERIC_ONLY#'), ('Tel', 'Tel #NUMERIC_ONLY#'),
               ('Fax', 'Fax #NUMERIC_ONLY#'), ('fax', 'fax #NUMERIC_ONLY#')],
        'is': [('no', 'no #NUMERIC_ONLY#'), ('No', 'No #NUMERIC_ONLY#'),
               ('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
               ('nR', 'nR #NUMERIC_ONLY#'), ('NR', 'NR #NUMERIC_ONLY#')],
        'it': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
               ('pp', 'pp #NUMERIC_ONLY#')],
        'kn': [],
        'lt': [('No', 'No #NUMERIC_ONLY#')],
        'lv': [('Nr', 'Nr #NUMERIC_ONLY#')],
        'ml': [],
        'mni': [],
        'mr': [],
        'nl': [('Nr', 'Nr #NUMERIC_ONLY#'), ('nr', 'nr #NUMERIC_ONLY#')],
        'or': [],
        'pa': [],
        'pl': [('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
               ('pkt', 'pkt #NUMERIC_ONLY#'), ('str', 'str #NUMERIC_ONLY#'),
               ('tab', 'tab #NUMERIC_ONLY#'), ('Tab', 'Tab #NUMERIC_ONLY#'),
               ('ust', 'ust #NUMERIC_ONLY#'), ('par', 'par #NUMERIC_ONLY#'),
               ('r', 'r #NUMERIC_ONLY#'), ('l', 'l #NUMERIC_ONLY#'),
               ('s', 's #NUMERIC_ONLY#')],
        'pt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
               ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
        'ro': [],
        'ru': [],
        'sk': [],
        'sl': [('št', 'št #NUMERIC_ONLY#'), ('Št', 'Št #NUMERIC_ONLY#')],
        'sv': [],
        'ta': [],
        'te': [],
        'tdt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
        'yue': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')],
        'zh': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')],
    }

    nonbreaking_prefixes = NonbreakingPrefixes()
    moses = MosesTokenizer()
    lang2numonlyprefix = defaultdict(list)
    for lang in nonbreaking_prefixes.available_langs.values():
        lang2numonlyprefix[lang] = [
            (w.rpartition(" ")[0], w)
            for w in nonbreaking_prefixes.words(lang)
            if moses.has_numeric_only(w)
        ]
    assert lang2numonlyprefix == expected_prefixes
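# A small illustrative check (not part of the test suite), assuming that
# has_numeric_only() simply flags nonbreaking-prefix lines that carry the
# #NUMERIC_ONLY# marker, as the test above relies on:
moses = MosesTokenizer()
assert moses.has_numeric_only("No #NUMERIC_ONLY#")
assert not moses.has_numeric_only("No")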
def test_dot_splitting(self):
    moses = MosesTokenizer()
    text = "The meeting will take place at 11:00 a.m. Tuesday."
    expected_tokens = "The meeting will take place at 11 : 00 a.m. Tuesday .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)
import logging
import re

import numpy as np
import pandas as pd
from argopt import argopt
from helpers import pre_treat_text, tokenize_text_parallel
from nltk.tokenize.regexp import RegexpTokenizer
from sacremoses.tokenize import MosesTokenizer
from tqdm import tqdm

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

pattern = r"\@-\@|\w+['´`]|\w+|\S+"
regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)
moses_tokenizer = MosesTokenizer(lang="fr")

MONTHS = [
    "janvier", "février", "mars", "avril", "mai", "juin",
    "juillet", "août", "septembre", "octobre", "novembre", "décembre",
]
NAMES_TOKENIZER = re.compile(r"(?:\@-\@)|\s")


def _load_names(filter_n=10):
    df_names = pd.read_csv("resources/names/prenom.csv")
    df_last_names = pd.read_csv("resources/names/patronymes.csv")
    # Keep only first names that occur at least filter_n times.
    df_names = df_names[df_names["sum"] >= filter_n]
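# A quick illustrative comparison (not part of the original script) of the
# regex and Moses tokenizers on a French sample; output is printed rather
# than asserted, since the exact token lists are not verified here.
if __name__ == "__main__":
    sample = "L'année dernière, en janvier."
    print(regex_tokenizer.tokenize(sample))
    print(moses_tokenizer.tokenize(sample, escape=False))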
def test_japanese_tokenization(self):
    tokenizer = MosesTokenizer(lang="ja")
    text = u"電話でんわの邪魔じゃまをしないでください"
    assert tokenizer.tokenize(text) == [text]
import logging
import re

import flask
from flask_cors import CORS
from sacremoses.tokenize import MosesTokenizer, MosesDetokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = flask.Flask(__name__)
CORS(app)

MODEL_PATH = None
PSEUDO_SERVICE_URL = True
MODEL = None
KEYWORD_PROCESSOR = None
TRAINING_TAGS = None
MOSES_TOKENIZER = MosesTokenizer(lang="fr")
MOSES_DETOKENIZER = MosesDetokenizer(lang="fr")
# pattern = r"\@-\@|\w+['´`]|\w+|\S+"
# regex_tokenizer = RegexpTokenizer(pattern, flags=re.UNICODE | re.IGNORECASE)


def load_names_processor():
    # :: Load vocabulary for is_name features ::
    global KEYWORD_PROCESSOR
    from flashtext import KeywordProcessor
    KEYWORD_PROCESSOR = KeywordProcessor()
    KEYWORD_PROCESSOR.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))
    logging.info("Loaded French proper names...")
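# A minimal sketch of how the module-level tokenizer pair might be exposed
# over HTTP; the /tokenize route and its JSON schema are hypothetical and
# not part of the original service.
@app.route("/tokenize", methods=["POST"])
def tokenize_endpoint():
    text = flask.request.get_json().get("text", "")
    tokens = MOSES_TOKENIZER.tokenize(text, escape=False)
    return flask.jsonify({
        "tokens": tokens,
        "detokenized": MOSES_DETOKENIZER.detokenize(tokens),
    })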