    def test_tokenizer_convert_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        # The concatenation rules are widened to any printable ASCII ([ -~]),
        # so punctuation-heavy ASCII runs such as URLs are kept together by
        # texts_to_words instead of being split on ':', '/' and '.'.
        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words))

        words_en = tokenizer._texts_to_words_en(
            "http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words_en)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_en))

        words_jp = tokenizer._texts_to_words_jp(
            "http://192.168.1.10/index.html")
        self.assertEqual([
            "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
            "index", ".", "html"
        ], words_jp)
        self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                         tokenizer.words_to_texts(words_jp))

        words_mix = tokenizer.texts_to_words(
            "URLはhttp://192.168.1.10/index.html")
        self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                         words_mix)
        self.assertEqual("URLはhttp://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_mix))
    def test_tokenizer_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello,", "he", "is", "Mr.A", "(No", "name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
    def test_tokenizer_texts_to_words_en_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello", "he", "is", "Mr.A", "No", "name"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
    def test_tokenizer_template_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello, he is Mr.A (No name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
    def test_tokenizer_template_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual(["こんにちはhappyですか"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは happy ですか"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは(happy)ですか"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
    def test_tokenizer_texts_to_words_mix_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは「happy]ですか"))
        self.assertEqual(["こんにちは", "happy", "unhappy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy, unhappy ですか"))
    def test_tokenizer_template_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual(["こんにちは「良い天気」ですね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
    def test_tokenizer_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy"],
                         tokenizer.texts_to_words("こんにちはhappy"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "(happy)", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
        self.assertEqual(["Hello", "ハッピー"],
                         tokenizer.texts_to_words("Hello ハッピー"))
        self.assertEqual(["Hello", "ハッピー", "です", "か"],
                         tokenizer.texts_to_words("Helloハッピーですか"))
        self.assertEqual(["Hello", "(", "ハッピー", ")", "です", "か"],
                         tokenizer.texts_to_words("Hello (ハッピー)ですか"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Helloハッピーyou"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Hello ハッピー you"))
    def test_tokenizer_normal_texts(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(["Hello"], tokenizer.texts_to_words("Hello"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words("Hello World"))
        self.assertEqual(["Hello", "World"],
                         tokenizer.texts_to_words(" Hello World "))
        self.assertEqual(["こんにちは"], tokenizer.texts_to_words("こんにちは"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは良い天気ですね"))
        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words(" こんにちは 良い天気ですね "))

        self.assertEqual("", tokenizer.words_to_texts([]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts(["Hello", "", "World"]))
        self.assertEqual("Hello World",
                         tokenizer.words_to_texts([" Hello ", " World "]))
        self.assertEqual("Hello", tokenizer.words_to_texts(["Hello"]))
        self.assertEqual("こんにちは", tokenizer.words_to_texts(["こんにちは"]))
        self.assertEqual(
            "こんにちは 良い天気ですね",
            tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
        self.assertEqual(
            "こんにちは 良い 天気 です ね",
            tokenizer.words_to_texts(
                [" こんにちは ", " 良い ", " 天気 ", " です ", " ね "]))
    def test_tokenizer_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "「", "良い", "天気", "」", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
    def test_tokenizer_texts_to_words_jp_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)

        self.assertEqual(["こんにちは", "良い", "天気", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
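    # ----------------------------------------------------------------------
    # Usage sketch (illustration only, not one of the original tests): how
    # the TokenizerJP API exercised above is typically called directly.
    # The import path below is an assumption and may differ in this code
    # base; the example results are copied from assertions in these tests.
    #
    #     from programy.dialog.tokenizer.tokenizer_jp import TokenizerJP  # assumed path
    #
    #     tokenizer = TokenizerJP()
    #     tokenizer.texts_to_words("こんにちはhappyですか")
    #     # -> ["こんにちは", "happy", "です", "か"]
    #     tokenizer.words_to_texts(["Hello", "World"])
    #     # -> "Hello World"
    # ----------------------------------------------------------------------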