def test_protect_patterns(self):
    moses = MosesTokenizer()
    text = "this is a webpage https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl that kicks ass"
    expected_tokens = [
        "this",
        "is",
        "a",
        "webpage",
        "https://stackoverflow.com/questions/6181381/how-to-print-variables-in-perl",
        "that",
        "kicks",
        "ass",
    ]
    assert (
        moses.tokenize(text, protected_patterns=moses.BASIC_PROTECTED_PATTERNS)
        == expected_tokens
    )
    # Testing against pattern from https://github.com/alvations/sacremoses/issues/35
    noe_patterns = [
        r"(?:http|ftp)s?://"  # http:// or https://
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?))"
        r"(?::\d+)?"  # optional port
        r"(?:/\w+)*"
        r"(?:(?:\.[a-z]+)|/?)"
    ]
    assert moses.tokenize(text, protected_patterns=noe_patterns) == expected_tokens

def test_aggressive_split(self):
    moses = MosesTokenizer()
    expected_tokens_wo_aggressive_dash_split = ['foo-bar']
    expected_tokens_with_aggressive_dash_split = ['foo', '@-@', 'bar']
    assert moses.tokenize('foo-bar') == expected_tokens_wo_aggressive_dash_split
    assert (
        moses.tokenize('foo-bar', aggressive_dash_splits=True)
        == expected_tokens_with_aggressive_dash_split
    )

def test_moses_detokenize(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = mt.tokenize(text)
    expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    assert md.detokenize(expected_tokens) == expected_detokens

    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens = [
        u"This", u"ain", u"'t", u"funny", u".", u"It", u"'s", u"actually",
        u"hillarious", u",", u"yet", u"double", u"Ls", u".", u"|", u"[",
        u"]", u"<", u">", u"[", u"]", u"&", u"You", u"'re", u"gonna",
        u"shake", u"it", u"off", u"?", u"Don", u"'t", u"?",
    ]
    expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
    assert mt.tokenize(text) == expected_tokens
    assert md.detokenize(expected_tokens) == expected_detokens

def test_mixed_cjk_tokenization(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = u"Japan is 日本 in Japanese."
    assert tokenizer.tokenize(text) == [
        u"Japan",
        u"is",
        u"日",
        u"本",
        u"in",
        u"Japanese",
        u".",
    ]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()
    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()
    text = 'foo-bar'
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text

def test_dot_splitting(self):
    moses = MosesTokenizer()
    text = "The meeting will take place at 11:00 a.m. Tuesday."
    expected_tokens = (
        "The meeting will take place at 11 : 00 a.m. Tuesday .".split()
    )
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_final_comma_split_after_number(self):
    moses = MosesTokenizer()
    text = u"Sie sollten vor dem Upgrade eine Sicherung dieser Daten erstellen (wie unter Abschnitt 4.1.1, „Sichern aller Daten und Konfigurationsinformationen“ beschrieben). "
    expected_tokens = [
        "Sie", "sollten", "vor", "dem", "Upgrade", "eine", "Sicherung",
        "dieser", "Daten", "erstellen", "(", "wie", "unter", "Abschnitt",
        "4.1.1", ",", u"„", "Sichern", "aller", "Daten", "und",
        "Konfigurationsinformationen", u"“", "beschrieben", ")", ".",
    ]
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_opening_brackets(self):
    moses = MosesTokenizer()
    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."
    # echo By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities). | perl mosesdecoder\scripts\tokenizer\tokenizer.perl en
    expected_tokens = "By the mid 1990s a version of the game became a Latvian television series ( with a parliamentary setting , and played by Latvian celebrities ) .".split()
    assert moses.tokenize(text) == expected_tokens

def test_final_dot_unconditionally(self):
    # Make sure that it works for examples on
    # https://github.com/moses-smt/mosesdecoder/pull/204
    moses = MosesTokenizer()
    text = "'So am I."
    expected_tokens = "'So am I .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="fr")
    text = "Des gens admirent une œuvre d'art."
    expected_tokens = "Des gens admirent une œuvre d' art .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="de")
    text = "...schwer wie ein iPhone 5."
    expected_tokens = "... schwer wie ein iPhone 5 .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

    moses = MosesTokenizer(lang="cz")
    text = "Dvě děti, které běží bez bot."
    expected_tokens = "Dvě děti , které běží bez bot .".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

def test_japanese_tokenization(self):
    tokenizer = MosesTokenizer(lang="ja")
    text = u"電話でんわの邪魔じゃまをしないでください"
    assert tokenizer.tokenize(text) == [text]

def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")
    text = u"세계 에서 가장 강력한."
    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_chinese_tokenization(self):
    tokenizer = MosesTokenizer(lang="zh")
    text = u"记者 应谦 美国"
    assert tokenizer.tokenize(text) == [u"记者", u"应谦", u"美国"]

def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")
    text = u"L'amitié nous a fait forts d'esprit"
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text

def test_trailing_dot_apostrophe(self):
    moses = MosesTokenizer()
    text = "'Hello.'"
    expected_tokens = "'Hello . '".split()
    self.assertEqual(moses.tokenize(text), expected_tokens)

def preprocess(src_file, mt_file, output_dir, tokenize_lang=None):
    """
    Pre-process input files before post-editing:
    split at <br>, remove <i> tags and music symbols,
    and store everything in a codes file in output_dir.

    Args:
        src_file: source side of the translation to be preprocessed
        mt_file: output of the MT system to be preprocessed
        output_dir: directory for the preprocessed files and the codes file
    """
    punct_normalizer = MosesPunctNormalizer()

    # Set up the tokenizer, if a language was given.
    tokenizer = None
    if tokenize_lang:
        tokenizer = MosesTokenizer(lang=tokenize_lang)

    code_file = output_dir + '/codes.' + os.path.basename(mt_file)
    src_out_file = output_dir + '/' + os.path.basename(src_file) + '.pre'
    mt_out_file = output_dir + '/' + os.path.basename(mt_file) + '.pre'

    with open(src_out_file, 'w') as fosrc, open(mt_out_file, 'w') as fomt, \
            open(code_file, 'w') as fcodes, open(src_file) as fsrc, open(mt_file) as fmt:
        idx = 0
        for src, mt in zip(fsrc, fmt):
            src, mt = src.strip(), mt.strip()
            idx += 1

            # Standardize <br> tags.
            src = re.sub(r'<\s*br\s*/*>', '<br>', src, flags=re.IGNORECASE)
            mt = re.sub(r'<\s*br\s*/*>', '<br>', mt, flags=re.IGNORECASE)

            # If the number of <br> is the same, split and save as multiple lines.
            src_split = re.split(r'\s*<br>\s*', src)
            mt_split = re.split(r'\s*<br>\s*', mt)

            # If src and mt do not have the same number of <br>, do not split.
            if not (len(src_split) == len(mt_split)):
                src_split = [src]
                mt_split = [mt]

            for src_part, mt_part in zip(src_split, mt_split):
                code = "{}\t".format(idx)

                # Check if the segments start with a hyphen.
                has_hyphen = False
                if src_part.startswith('-'):
                    has_hyphen = True
                    src_part = src_part[1:].lstrip()
                if mt_part.startswith('-'):
                    has_hyphen = True
                    mt_part = mt_part[1:].lstrip()

                # Check if the segments contain a music symbol.
                music_syms = ('♫', '♬', '♪')
                has_music = False
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), src_part):
                    has_music = True
                    src_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', src_part)
                if re.search(r'\s*[{}]\s*'.format(''.join(music_syms)), mt_part):
                    has_music = True
                    mt_part = re.sub(r'\s*[{}]\s*'.format(''.join(music_syms)), '', mt_part)

                # Check for enclosing italics tags; otherwise leave the text as is.
                itag = '<i>'
                eitag = '</i>'
                has_itag = False
                if src_part.startswith(itag) or src_part.endswith(eitag):
                    has_itag = True
                if mt_part.startswith(itag) or mt_part.endswith(eitag):
                    has_itag = True

                if has_hyphen:
                    code += 'HYPHENBEGIN\t'
                if has_music:
                    code += 'MUSIC\t'
                if has_itag:
                    code += 'ITALICTAGS\t'

                src_part = punct_normalizer.normalize(cleanup(src_part))
                mt_part = punct_normalizer.normalize(cleanup(mt_part))

                if tokenizer:
                    src_part = " ".join(tokenizer.tokenize(src_part, escape=False))
                    mt_part = " ".join(tokenizer.tokenize(mt_part, escape=False))

                fosrc.write(src_part.strip() + '\n')
                fomt.write(mt_part.strip() + '\n')
                fcodes.write("{}\n".format(code))

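# A minimal usage sketch for preprocess() above. The file paths here are
# hypothetical, and `cleanup` is assumed to be defined elsewhere in the
# project (it is referenced but not shown in this snippet):
#
#   preprocess(
#       src_file="data/train.src",
#       mt_file="data/train.mt",
#       output_dir="out",
#       tokenize_lang="en",
#   )
#
# This writes out/train.src.pre, out/train.mt.pre, and out/codes.train.mt,
# with one code line (e.g. "3\tHYPHENBEGIN\tMUSIC\t") per <br>-split segment.
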
# write to file
for src, mt, pe in zip(src_split, mt_split, pe_split):
    src = cleanup(src, args)
    mt = cleanup(mt, args)
    pe = cleanup(pe, args)
    if len(src) < args.min_l or len(mt) < args.min_l or len(pe) < args.min_l:
        continue
    if len(src) > args.max_l or len(mt) > args.max_l or len(pe) > args.max_l:
        continue
    if tokenizer:
        src = " ".join(tokenizer.tokenize(src, escape=False))
        mt = " ".join(tokenizer.tokenize(mt, escape=False))
        pe = " ".join(tokenizer.tokenize(pe, escape=False))
    if not args.output_stdout:
        # remove all tags including <br> and write to file
        fos[0].write(src + '\n')
        fos[1].write(mt + '\n')
        fos[2].write(pe + '\n')
    else:
        print("{}\t{}\t{}".format(src, mt, pe))

if not args.output_stdout:
    for fo in fos:
        fo.close()

def test_moses_tokenize(self):
    moses = MosesTokenizer()

    # Tokenize a sentence.
    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = u"This , is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    tokenized_text = moses.tokenize(text, return_str=True)
    assert tokenized_text == expected_tokens

    # The nonbreaking prefixes should tokenize the final fullstop.
    assert moses.tokenize("abc def.") == [u"abc", u"def", u"."]

    # The nonbreaking prefixes should handle the situation where a
    # numeric-only prefix is the last token. In the example below, "pp" is
    # the last element, and there is no digit after it.
    assert moses.tokenize("2016, pp.") == [u"2016", u",", u"pp", u"."]

    # Test escape_xml.
    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens_with_xmlescape = [
        "This", "ain", "&apos;t", "funny", ".", "It", "&apos;s", "actually",
        "hillarious", ",", "yet", "double", "Ls", ".", "&#124;", "&#91;",
        "&#93;", "&lt;", "&gt;", "&#91;", "&#93;", "&amp;", "You",
        "&apos;re", "gonna", "shake", "it", "off", "?", "Don", "&apos;t", "?",
    ]
    expected_tokens_wo_xmlescape = [
        "This", "ain", "'t", "funny", ".", "It", "'s", "actually",
        "hillarious", ",", "yet", "double", "Ls", ".", "|", "[", "]", "<",
        ">", "[", "]", "&", "You", "'re", "gonna", "shake", "it", "off",
        "?", "Don", "'t", "?",
    ]
    assert moses.tokenize(text, escape=True) == expected_tokens_with_xmlescape
    assert moses.tokenize(text, escape=False) == expected_tokens_wo_xmlescape

    # Test to check https://github.com/alvations/sacremoses/issues/19
    text = "this 'is' the thing"
    expected_tokens = ["this", "&apos;", "is", "&apos;", "the", "thing"]
    assert moses.tokenize(text, escape=True) == expected_tokens
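
# For reference: escape=True applies Moses-style XML escaping to the output
# tokens. A sketch of the character-to-entity mapping exercised by the
# assertions above (as implemented in sacremoses):
#
#   &  -> &amp;     |  -> &#124;    <  -> &lt;     >  -> &gt;
#   '  -> &apos;    "  -> &quot;    [  -> &#91;    ]  -> &#93;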