Example #1
    def test_tokenizer_convert_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

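        # The before/after concatenation rules are regexes over printable ASCII
        # (" " to "~"); they appear to control when a space is inserted between
        # adjacent words as a word list is joined back into text.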
        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

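        # The combined tokenizer keeps the all-ASCII URL as a single word, and
        # joining the words reproduces the original text unchanged.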
        words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words))

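        # The English-only splitter likewise returns the whitespace-free URL as
        # a single word.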
        words_en = tokenizer._texts_to_words_en(
            "http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words_en)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_en))

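        # The Japanese-only splitter breaks the URL apart at punctuation, so
        # joining the pieces puts spaces between the ASCII fragments.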
        words_jp = tokenizer._texts_to_words_jp(
            "http://192.168.1.10/index.html")
        self.assertEqual([
            "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
            "index", ".", "html"
        ], words_jp)
        self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                         tokenizer.words_to_texts(words_jp))

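        # Mixed Japanese/ASCII input: the particle "は" is split off, the URL
        # survives as one word, and the round trip restores the original string.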
        words_mix = tokenizer.texts_to_words(
            "URLはhttp://192.168.1.10/index.html")
        self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                         words_mix)
        self.assertEqual("URLはhttp://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_mix))
Example #2
    def test_tokenizer_no_text(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

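        # Each word-splitting path should return an empty list for None or
        # empty-string input.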
        words = tokenizer._texts_to_words_en(None)
        self.assertEqual(0, len(words))
        words = tokenizer._texts_to_words_en('')
        self.assertEqual(0, len(words))

        words = tokenizer._texts_to_words_jp(None)
        self.assertEqual(0, len(words))
        words = tokenizer._texts_to_words_jp('')
        self.assertEqual(0, len(words))

        words = tokenizer._template_texts_to_words_jp(None)
        self.assertEqual(0, len(words))
        words = tokenizer._template_texts_to_words_jp('')
        self.assertEqual(0, len(words))

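        # Joining None or an empty word list should produce an empty string.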
        texts = tokenizer._words_to_texts(None)
        self.assertEqual("", texts)
        texts = tokenizer._words_to_texts('')
        self.assertEqual("", texts)