Example #1
    def test_tokenizer_template_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True
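        # In template mode the text is not split into words; texts_to_words
        # returns the whole string as a single element.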

        self.assertEqual(["こんにちは「良い天気」ですね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
Example #2
    def test_tokenizer_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello,", "he", "is", "Mr.A", "(No", "name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #3
    def test_tokenizer_template_texts_to_words_en(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello, he is Mr.A (No name)"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #4
    def test_tokenizer_texts_to_words_en_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
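        # Any character listed in punctuation_chars is stripped from the
        # resulting words.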
        self.assertIsNotNone(tokenizer)

        self.assertEqual([], tokenizer.texts_to_words(""))
        self.assertEqual(
            ["Hello", "he", "is", "Mr.A", "No", "name"],
            tokenizer.texts_to_words("Hello, he is Mr.A (No name)"))
Example #5
    def test_reload_jp(self):
        storage_factory = StorageFactory()
        tokenizer = TokenizerJP()

        file_store_config = FileStorageConfiguration()
        file_store_config._normal_storage = FileStoreConfiguration(
            file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep +
            "normal_jp.txt",
            format="text",
            extension="txt",
            encoding="utf-8",
            delete_on_start=False)
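        # The normalisation mappings used below (e.g. ① -> 丸1) are read from
        # test_files/normal_jp.txt via the file storage engine.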

        storage_engine = FileStorageEngine(file_store_config)

        storage_factory._storage_engines[
            StorageFactory.NORMAL] = storage_engine
        storage_factory._store_to_engine_map[
            StorageFactory.NORMAL] = storage_engine

        collection = NormalCollection()
        self.assertIsNotNone(collection)

        collection.load(storage_factory)

        self.assertEqual("丸1の回答",
                         collection.normalise_string(tokenizer, "①の回答"))

        collection.reload(storage_factory)

        self.assertEqual("丸1の回答",
                         collection.normalise_string(tokenizer, "①の回答"))
Example #6
    def test_reload_jp(self):
        storage_factory = StorageFactory()
        tokenizer = TokenizerJP()

        file_store_config = FileStorageConfiguration()
        file_store_config._gender_storage = FileStoreConfiguration(
            file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep +
            "gender_jp.txt",
            format="text",
            extension="txt",
            encoding="utf-8",
            delete_on_start=False)

        storage_engine = FileStorageEngine(file_store_config)

        storage_factory._storage_engines[
            StorageFactory.GENDER] = storage_engine
        storage_factory._store_to_engine_map[
            StorageFactory.GENDER] = storage_engine

        collection = GenderCollection()
        self.assertIsNotNone(collection)

        collection.load(storage_factory)

        self.assertEqual(collection.gender("彼"), '彼女')
        self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"),
                         "彼女が来た")

        collection.reload(storage_factory)

        self.assertEqual(collection.gender("彼"), '彼女')
        self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"),
                         "彼女が来た")
Example #7
    def test_reload_jp(self):
        storage_factory = StorageFactory()
        tokenizer = TokenizerJP()

        file_store_config = FileStorageConfiguration()
        file_store_config._person2_storage = FileStoreConfiguration(
            file=os.path.dirname(__file__) + os.sep + "test_files" + os.sep +
            "person2_jp.txt",
            format="text",
            extension="txt",
            encoding="utf-8",
            delete_on_start=False)

        storage_engine = FileStorageEngine(file_store_config)

        storage_factory._storage_engines[
            StorageFactory.PERSON2] = storage_engine
        storage_factory._store_to_engine_map[
            StorageFactory.PERSON2] = storage_engine

        collection = Person2Collection()
        self.assertIsNotNone(collection)

        collection.load(storage_factory)

        self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
        self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"),
                         "私か私が来た")

        collection.reload(storage_factory)

        self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
        self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"),
                         "私か私が来た")
Example #8
    def test_collection_duplicate_jp(self):
        collection = NormalCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("①", '丸1')
        collection.add_to_lookup("①", '丸2')

        tokenizer = TokenizerJP()
        self.assertEqual("丸1の回答",
                         collection.normalise_string(tokenizer, "①の回答"))
Example #9
    def test_tokenizer_words_to_texts_with_quote(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')
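        # [ -~] matches printable ASCII, so a space is kept between two words
        # only when both sides of the join are ASCII; Japanese words are
        # joined without one.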

        self.assertEqual(
            'Hello " very good " World',
            tokenizer.words_to_texts(
                ["Hello", '"', "very", "good", '"', "World"]))
        self.assertEqual(
            'Hello "very good" World',
            tokenizer.words_to_texts(["Hello", '"very', 'good"', "World"]))
        self.assertEqual(
            'こんにちは"良い天気"ですね',
            tokenizer.words_to_texts(["こんにちは", '"', "良い天気", '"', "です", "ね"]))
        self.assertEqual(
            'こんにちは"良い天気"ですね',
            tokenizer.words_to_texts(["こんにちは", '"良い天気"', "です", "ね"]))
Example #10
    def test_collection_operations_JP(self):
        collection = NormalCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("①", '丸1')
        tokenizer = TokenizerJP()

        self.assertTrue(collection.has_keyVal("①"))
        self.assertEqual('丸1', collection.value("①"))

        self.assertEqual("丸1の回答",
                         collection.normalise_string(tokenizer, "①の回答"))
Example #11
    def test_collection_invalid_jp(self):
        collection = NormalCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("彼岸", 'お彼岸')

        self.assertFalse(collection.has_keyVal("彼氏"))
        self.assertIsNone(collection.value("彼氏"))

        tokenizer = TokenizerJP()
        self.assertIsNone(collection.normalise("彼氏"))
        self.assertEqual("彼氏の回答",
                         collection.normalise_string(tokenizer, "彼氏の回答"))
Example #12
    def test_collection_invalid_JP(self):
        collection = GenderCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("彼", '彼女')

        self.assertFalse(collection.has_keyVal("彼氏"))
        self.assertIsNone(collection.value("彼氏"))

        tokenizer = TokenizerJP()
        self.assertIsNone(collection.gender("彼氏"))
        self.assertEqual(collection.genderise_string(tokenizer, "彼氏が来た"),
                         "彼氏が来た")
Example #13
    def test_tokenizer_words_to_texts_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual(
            "http :// 192.168.1.10 / index.html",
            tokenizer.words_to_texts(
                ["http", "://", "192.168.1.10", "/", "index.html"]))
Example #14
    def test_collection_invalid_JP(self):
        collection = Person2Collection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("私", "彼か彼女")

        self.assertFalse(collection.has_keyVal("彼"))
        self.assertIsNone(collection.value("彼"))

        tokenizer = TokenizerJP()
        self.assertIsNone(collection.person("彼"))
        self.assertEqual(collection.personalise_string(tokenizer, "彼が来た"),
                         "彼が来た")
Example #15
    def test_collection_invalid_jp(self):
        collection = DenormalCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("丸1", "①")

        self.assertFalse(collection.has_keyVal("丸"))
        self.assertIsNone(collection.value("丸"))

        tokenizer = TokenizerJP()
        self.assertIsNone(collection.denormalise("丸"))
        self.assertEqual(collection.denormalise_string(tokenizer, "丸の回答"),
                         "丸の回答")
Example #16
    def test_collection_operations_JP(self):
        collection = GenderCollection()
        self.assertIsNotNone(collection)

        collection.add_to_lookup("彼", '彼女')
        tokenizer = TokenizerJP()

        self.assertTrue(collection.has_keyVal("彼"))
        self.assertEqual('彼女', collection.value("彼"))

        self.assertEqual(collection.gender("彼"), '彼女')
        self.assertEqual(collection.genderise_string(tokenizer, "彼が来た"),
                         "彼女が来た")
Example #17
    def test_tokenizer_words_to_texts_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        self.assertEqual(
            "こんにちは10日はHappy dayですね",
            tokenizer.words_to_texts(
                ["こんにちは", "10", "日", "は", "Happy", "day", "です", "ね"]))
        self.assertEqual(
            "=こんにちは10日はHappy dayですね=",
            tokenizer.words_to_texts(
                ["=", "こんにちは", "10", "日", "は", "Happy", "day", "です", "ね",
                 "="]))
        self.assertEqual(
            "pen lightはありますか",
            tokenizer.words_to_texts(['pen', 'light', 'は', 'あり', 'ます', 'か']))
Example #18
    def test_tokenizer_no_test(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
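        # Every conversion helper should return an empty result for None or
        # empty-string input.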

        words = tokenizer._texts_to_words_en(None)
        self.assertEqual(0, len(words))
        words = tokenizer._texts_to_words_en('')
        self.assertEqual(0, len(words))

        words = tokenizer._texts_to_words_jp(None)
        self.assertEqual(0, len(words))
        words = tokenizer._texts_to_words_jp('')
        self.assertEqual(0, len(words))

        words = tokenizer._template_texts_to_words_jp(None)
        self.assertEqual(0, len(words))
        words = tokenizer._template_texts_to_words_jp('')
        self.assertEqual(0, len(words))

        texts = tokenizer._words_to_texts(None)
        self.assertEqual("", texts)
        texts = tokenizer._words_to_texts('')
        self.assertEqual("", texts)
Example #19
    def test_tokenizer_template_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)
        tokenizer.is_template = True

        self.assertEqual(["こんにちはhappyですか"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは happy ですか"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは(happy)ですか"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
Example #20
    def test_tokenizer_texts_to_words_mix_with_punctation(self):
        punctations = ';\'",!()[]:’”;、。!()「」'
        tokenizer = TokenizerJP(punctuation_chars=punctations)
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは「happy]ですか"))
        self.assertEqual(["こんにちは", "happy", "unhappy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy, unhappy ですか"))
Example #21
    def test_collection_operations_JP(self):
        person2_text = """
        "私","彼か彼女"
        "彼","私"
        "彼女","私"
        """

        collection = Person2Collection()
        self.assertIsNotNone(collection)

        collection.load_from_text(person2_text)
        tokenizer = TokenizerJP()

        self.assertEqual(collection.personalise_string(tokenizer, "私"), "彼か彼女")
        self.assertEqual(collection.personalise_string(tokenizer, "彼か彼女が来た"),
                         "私か私が来た")

        pattern = collection.person("私")
        self.assertIsNotNone(pattern)
        self.assertEqual("彼か彼女", pattern)
Example #22
    def test_collection_operations_jp(self):
        person_text = """
        "貴方","私"
        "私","貴方"
        "あなた","わたし"
        "わたし","あなた"
        """

        collection = PersonCollection()
        self.assertIsNotNone(collection)

        collection.load_from_text(person_text)
        tokenizer = TokenizerJP()

        self.assertEqual(collection.personalise_string(tokenizer, "私が正しい"),
                         "貴方が正しい")
        self.assertEqual(collection.personalise_string(tokenizer, "あなたは変"),
                         "わたしは変")

        pattern = collection.person("貴方")
        self.assertIsNotNone(pattern)
        self.assertEqual("私", pattern)
Example #23
    def test_tokenizer_words_to_texts_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(
            "こんにちは 良い天気ですね",
            tokenizer.words_to_texts(["こんにちは", "", "良い", "天気", "です", "ね"]))
        self.assertEqual(
            "こんにちは<良い天気>ですね",
            tokenizer.words_to_texts(
                ["こんにちは", "<", "良い", "天気", ">", "です", "ね"]))
        self.assertEqual(
            "<こんにちは良い天気ですね>",
            tokenizer.words_to_texts(
                ["<", "こんにちは", "良い", "天気", "です", "ね", ">"]))
Example #24
    def test_tokenizer_convert_url(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words = tokenizer.texts_to_words("http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words))

        words_en = tokenizer._texts_to_words_en(
            "http://192.168.1.10/index.html")
        self.assertEqual(["http://192.168.1.10/index.html"], words_en)
        self.assertEqual("http://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_en))

        words_jp = tokenizer._texts_to_words_jp(
            "http://192.168.1.10/index.html")
        self.assertEqual([
            "http", "://", "192", ".", "168", ".", "1", ".", "10", "/",
            "index", ".", "html"
        ], words_jp)
        self.assertEqual("http :// 192 . 168 . 1 . 10 / index . html",
                         tokenizer.words_to_texts(words_jp))

        words_mix = tokenizer.texts_to_words(
            "URLはhttp://192.168.1.10/index.html")
        self.assertEqual(["URL", "は", "http://192.168.1.10/index.html"],
                         words_mix)
        self.assertEqual("URLはhttp://192.168.1.10/index.html",
                         tokenizer.words_to_texts(words_mix))
Example #25
    def test_tokenizer_words_to_texts_json_tag(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'
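        # \uF010 and \uF011 are private-use code points marking where embedded
        # JSON content starts and ends in the word stream.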

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = [JSON_CHILD_IN, "json", "data", JSON_CHILD_OUT]
        self.assertEqual("\uF010json data\uF011",
                         tokenizer.words_to_texts(words1))

        words2 = [JSON_CHILD_IN, "データ", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010データ設定\uF011", tokenizer.words_to_texts(words2))

        words1 = [JSON_CHILD_IN, "json", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010json設定\uF011",
                         tokenizer.words_to_texts(words1))

        words2 = [JSON_CHILD_IN, "データ", "json", JSON_CHILD_OUT]
        self.assertEqual("\uF010データjson\uF011",
                         tokenizer.words_to_texts(words2))

        words1 = [JSON_CHILD_IN, "json", "設定", "data", JSON_CHILD_OUT]
        self.assertEqual("\uF010json設定data\uF011",
                         tokenizer.words_to_texts(words1))

        words2 = [JSON_CHILD_IN, "データ", "json", "設定", JSON_CHILD_OUT]
        self.assertEqual("\uF010データjson設定\uF011",
                         tokenizer.words_to_texts(words2))
Example #26
    def test_tokenizer_words_to_texts_with_text_jp_json_en(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = [
            "こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"
        ]
        self.assertEqual("こんにちは\uF010json-data\uF011ですね",
                         tokenizer.words_to_texts(words1))

        words2 = [
            "こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"',
            "です", "ね"
        ]
        self.assertEqual('こんにちは"\uF010json-data\uF011"ですね',
                         tokenizer.words_to_texts(words2))

        words3 = ["こんにちは", JSON_CHILD_IN, "json-data", JSON_CHILD_OUT]
        self.assertEqual('こんにちは\uF010json-data\uF011',
                         tokenizer.words_to_texts(words3))

        words4 = [
            "こんにちは", '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"'
        ]
        self.assertEqual('こんにちは"\uF010json-data\uF011"',
                         tokenizer.words_to_texts(words4))

        words5 = [JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, "です", "ね"]
        self.assertEqual('\uF010json-data\uF011ですね',
                         tokenizer.words_to_texts(words5))

        words6 = [
            '"', JSON_CHILD_IN, "json-data", JSON_CHILD_OUT, '"', "です", "ね"
        ]
        self.assertEqual('"\uF010json-data\uF011"ですね',
                         tokenizer.words_to_texts(words6))
Example #27
    def test_tokenizer_words_to_texts_with_text_en_json_jp(self):
        JSON_CHILD_IN = '\uF010'
        JSON_CHILD_OUT = '\uF011'

        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        words1 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
        self.assertEqual("Hello\uF010データ\uF011you",
                         tokenizer.words_to_texts(words1))

        words2 = [
            "Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"
        ]
        self.assertEqual('Hello "\uF010データ\uF011" you',
                         tokenizer.words_to_texts(words2))

        words3 = ["Hello", JSON_CHILD_IN, "データ", JSON_CHILD_OUT]
        self.assertEqual('Hello\uF010データ\uF011',
                         tokenizer.words_to_texts(words3))

        words4 = ["Hello", '"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"']
        self.assertEqual('Hello "\uF010データ\uF011"',
                         tokenizer.words_to_texts(words4))

        words5 = [JSON_CHILD_IN, "データ", JSON_CHILD_OUT, "you"]
        self.assertEqual('\uF010データ\uF011you',
                         tokenizer.words_to_texts(words5))

        words6 = ['"', JSON_CHILD_IN, "データ", JSON_CHILD_OUT, '"', "you"]
        self.assertEqual('"\uF010データ\uF011" you',
                         tokenizer.words_to_texts(words6))
Example #28
    def test_tokenizer_words_from_current_pos_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        tokenizer.set_configuration_before_concatenation_rule('.*[ -~]')
        tokenizer.set_configuration_after_concatenation_rule('[ -~].*')

        with self.assertRaises(Exception):
            tokenizer.words_from_current_pos(None, 0)

        words = ["Yes", "か", "No"]
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, 0))
        self.assertEqual("かNo", tokenizer.words_from_current_pos(words, 1))
        self.assertEqual("No", tokenizer.words_from_current_pos(words, 2))
        self.assertEqual("", tokenizer.words_from_current_pos(words, 3))
        self.assertEqual("No", tokenizer.words_from_current_pos(words, -1))
        self.assertEqual("かNo", tokenizer.words_from_current_pos(words, -2))
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -3))
        self.assertEqual("YesかNo", tokenizer.words_from_current_pos(words, -4))
Example #29
    def test_tokenizer_texts_to_words_jp(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "「", "良い", "天気", "」", "です", "ね"],
                         tokenizer.texts_to_words("こんにちは「良い天気」ですね"))
Example #30
    def test_tokenizer_texts_to_words_mix(self):
        tokenizer = TokenizerJP()
        self.assertIsNotNone(tokenizer)

        self.assertEqual(["こんにちは", "happy"],
                         tokenizer.texts_to_words("こんにちはhappy"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちはhappyですか"))
        self.assertEqual(["こんにちは", "happy", "です", "か"],
                         tokenizer.texts_to_words("こんにちは happy ですか"))
        self.assertEqual(["こんにちは", "(happy)", "です", "か"],
                         tokenizer.texts_to_words("こんにちは(happy)ですか"))

        self.assertEqual(["Hello", "ハッピー"],
                         tokenizer.texts_to_words("Hello ハッピー"))
        self.assertEqual(["Hello", "ハッピー", "です", "か"],
                         tokenizer.texts_to_words("Helloハッピーですか"))
        self.assertEqual(["Hello", "(", "ハッピー", ")", "です", "か"],
                         tokenizer.texts_to_words("Hello (ハッピー)ですか"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Helloハッピーyou"))
        self.assertEqual(["Hello", "ハッピー", "you"],
                         tokenizer.texts_to_words("Hello ハッピー you"))