def test_moses_detokenize(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    # Detokenizing the tokenizer's own output should keep the non-ASCII symbols
    # (\xbb, \u2026, \xbf) as space-separated tokens while reattaching the comma.
    text = (
        u"This, is a sentence with weird\xbb symbols\u2026 appearing everywhere\xbf"
    )
    expected_tokens = mt.tokenize(text)
    expected_detokens = u"This, is a sentence with weird \xbb symbols \u2026 appearing everywhere \xbf"
    assert md.detokenize(expected_tokens) == expected_detokens

    # Contractions, punctuation, and brackets: check the exact token sequence and
    # that detokenization reattaches apostrophes and closes "[ ]" into "[]".
    text = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [ ] & You're gonna shake it off? Don't?"
    expected_tokens = [
        u"This",
        u"ain",
        u"'t",
        u"funny",
        u".",
        u"It",
        u"'s",
        u"actually",
        u"hillarious",
        u",",
        u"yet",
        u"double",
        u"Ls",
        u".",
        u"|",
        u"[",
        u"]",
        u"<",
        u">",
        u"[",
        u"]",
        u"&",
        u"You",
        u"'re",
        u"gonna",
        u"shake",
        u"it",
        u"off",
        u"?",
        u"Don",
        u"'t",
        u"?",
    ]
    expected_detokens = "This ain't funny. It's actually hillarious, yet double Ls. | [] < > [] & You're gonna shake it off? Don't?"
    assert mt.tokenize(text) == expected_tokens
    assert md.detokenize(expected_tokens) == expected_detokens
def test_detokenize_with_aggressive_split(self):
    mt = MosesTokenizer()
    md = MosesDetokenizer()

    text = "foo-bar"
    assert md.detokenize(mt.tokenize(text, aggressive_dash_splits=True)) == text
def test_opening_brackets(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    text = "By the mid 1990s a version of the game became a Latvian television series (with a parliamentary setting, and played by Latvian celebrities)."

    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_mixed_cjk_tokenization(self):
    tokenizer = MosesTokenizer()
    detokenizer = MosesDetokenizer()

    text = u"Japan is 日本 in Japanese."

    # CJK characters are split into individual tokens, but the detokenizer
    # should still reproduce the original mixed-script sentence.
    assert tokenizer.tokenize(text) == [
        u"Japan",
        u"is",
        u"日",
        u"本",
        u"in",
        u"Japanese",
        u".",
    ]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_korean_tokenization(self):
    tokenizer = MosesTokenizer(lang="ko")
    detokenizer = MosesDetokenizer(lang="ko")

    text = u"세계 에서 가장 강력한."

    assert tokenizer.tokenize(text) == [u"세계", u"에서", u"가장", u"강력한", u"."]
    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text
def test_french_apostrophes(self):
    tokenizer = MosesTokenizer(lang="fr")
    detokenizer = MosesDetokenizer(lang="fr")

    text = u"L'amitié nous a fait forts d'esprit"

    assert detokenizer.detokenize(tokenizer.tokenize(text)) == text