예제 #1
0
    def test_enrich_utterance(self):
        # Checks that _enrich_utterance yields the utterance tokens with each
        # entity match replaced by its upper-cased entity kind.

        # Given
        utterance = text_to_utterance("a b c d e f")
        # "e" covers characters [8, 9) of the text
        builtin_entities = [
            {
                "entity_kind": "the_snips_e_entity",
                "value": "e",
                "resolved_value": "e",
                "range": {"start": 8, "end": 9},
            }
        ]
        # "c" covers characters [4, 5) of the text
        custom_entities = [
            {
                "entity_kind": "the_c_entity",
                "value": "c",
                "resolved_value": "c",
                "range": {"start": 4, "end": 5},
            }
        ]

        cooccurrence_vectorizer = CooccurrenceVectorizer()
        # fit() is not called in this test, so the language is set by hand
        cooccurrence_vectorizer._language = "en"

        # When
        enriched_tokens = cooccurrence_vectorizer._enrich_utterance(
            utterance, builtin_entities, custom_entities)

        # Then
        expected_tokens = [
            "a",
            "b",
            "THE_C_ENTITY",
            "d",
            "THE_SNIPS_E_ENTITY",
            "f",
        ]
        self.assertSequenceEqual(expected_tokens, enriched_tokens)
예제 #2
0
    def test_cooccurrence_vectorizer_should_persist(self):
        # Persists a fitted vectorizer and checks the content of the
        # metadata.json and vectorizer.json files found in the target
        # directory.

        # Given
        utterances = [text_to_utterance("yoo yoo")]
        empty_dataset = get_empty_dataset("en")
        shared_data = self.get_shared_data(empty_dataset)
        vectorizer = CooccurrenceVectorizer(**shared_data)
        vectorizer = vectorizer.fit(utterances, empty_dataset)
        # Overridden after fitting so the persisted scope is a known value
        vectorizer.builtin_entity_scope = {"snips/entity"}

        # When
        vectorizer.persist(self.tmp_file_path)

        # Then
        metadata_json_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(
            metadata_json_path, {"unit_name": "cooccurrence_vectorizer"})

        vectorizer_json_path = self.tmp_file_path / "vectorizer.json"
        # The scope was assigned as a set and is expected back as a list,
        # and the word pair index is expected as a string key.
        expected_vectorizer_json = {
            "builtin_entity_scope": ["snips/entity"],
            "config": vectorizer.config.to_dict(),
            "language_code": "en",
            "word_pairs": {"0": ["yoo", "yoo"]},
        }
        self.assertJsonContent(vectorizer_json_path, expected_vectorizer_json)
예제 #3
0
    def test_limit_vocabulary_should_raise(self):
        # limit_word_pairs must raise a ValueError when asked to keep a pair
        # that is not among the vectorizer's word pairs.

        # Given
        known_pairs = [("a", "b"), ("a", "c"), ("a", "d"), ("a", "e")]
        vectorizer = CooccurrenceVectorizer()
        # Each known pair is mapped to its position in the list above
        vectorizer._word_pairs = {
            pair: position for position, pair in enumerate(known_pairs)
        }

        # When / Then
        unknown_pair = ("a", "f")
        with self.assertRaises(ValueError):
            vectorizer.limit_word_pairs([unknown_pair])
예제 #4
0
    def test_limit_vocabulary(self):
        # After limit_word_pairs, only the kept pairs must remain and the
        # transform output must equal the matching columns of the output
        # computed before the limitation.

        # Given
        vectorizer = CooccurrenceVectorizer(
            config=CooccurrenceVectorizerConfig(filter_stop_words=False))
        training_utterances = list(
            map(text_to_utterance, ("a b", "a c", "a d", "a e")))

        utterances = list(map(text_to_utterance, ("a c e", "a d e")))
        vectorizer.fit(training_utterances, get_empty_dataset("en"))
        features_before = vectorizer.transform(utterances)
        all_pairs = {
            ("a", "b"): 0,
            ("a", "c"): 1,
            ("a", "d"): 2,
            ("a", "e"): 3,
        }
        pairs_to_keep = [("a", "b"), ("a", "c"), ("a", "d")]
        self.assertDictEqual(all_pairs, vectorizer.word_pairs)

        # When
        # Column indexes of the kept pairs in the pre-limitation matrix
        kept_columns = [all_pairs[pair] for pair in pairs_to_keep]
        vectorizer.limit_word_pairs(pairs_to_keep)

        # Then
        remaining_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
        self.assertDictEqual(remaining_pairs, vectorizer.word_pairs)
        features_after = vectorizer.transform(utterances)
        expected_rows = features_before[:, kept_columns].todense().tolist()
        self.assertListEqual(expected_rows,
                             features_after.todense().tolist())
예제 #5
0
    def test_cooccurrence_vectorizer_should_load(self):
        # Writes a vectorizer.json by hand and checks that from_path gives
        # back the config, language, word pairs and builtin entity scope.

        # Given
        config = CooccurrenceVectorizerConfig()

        # Pairs as expected on the loaded vectorizer: (word, word) -> index
        expected_word_pairs = {("a", "b"): 0, ("a", 'c'): 12}

        # The same pairs in the written layout: index -> [word, word]
        written_word_pairs = {0: ["a", "b"], 12: ["a", "c"]}

        written_vectorizer = {
            "unit_name": "cooccurrence_vectorizer",
            "language_code": "en",
            "word_pairs": written_word_pairs,
            "builtin_entity_scope": ["snips/datetime"],
            "config": config.to_dict(),
        }

        # The directory is created here, before the JSON file is written
        self.tmp_file_path.mkdir()
        self.writeJsonContent(
            self.tmp_file_path / "vectorizer.json", written_vectorizer)

        # When
        loaded = CooccurrenceVectorizer.from_path(self.tmp_file_path)

        # Then
        self.assertDictEqual(config.to_dict(), loaded.config.to_dict())
        self.assertEqual("en", loaded.language)
        self.assertDictEqual(loaded.word_pairs, expected_word_pairs)
        # The scope was written as a list and is expected back as a set
        self.assertEqual({"snips/datetime"}, loaded.builtin_entity_scope)
    def test_fit_unordered(self, mocked_preprocess):
        # Fits with keep_order=False and checks the resulting word pairs.
        # NOTE(review): mocked_preprocess is presumably injected by a patch
        # decorator that is not visible in this excerpt - confirm.

        # Given
        text = "a b c d e f"
        utterance = text_to_utterance(text)
        # "e" covers characters [8, 9) of the text
        builtin_entities = [{
            "entity_kind": "the_snips_e_entity",
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 8, "end": 9},
        }]
        # "c" covers characters [4, 5) of the text
        custom_entities = [{
            "entity_kind": "the_c_entity",
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 4, "end": 5},
        }]
        # One entry per utterance in each of the three returned lists
        mocked_preprocess.return_value = (
            [utterance], [builtin_entities], [custom_entities])

        config = CooccurrenceVectorizerConfig(
            keep_order=False,
            filter_stop_words=False,
            unknown_words_replacement_string="b",
            window_size=3,
        )
        dataset = get_empty_dataset("en")
        shared_data = self.get_shared_data(dataset)

        # Expected pairs, each mapped to its position in this list; note that
        # "b" (the unknown words replacement string) appears in none of them.
        ordered_pairs = [
            ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"),
            ("THE_C_ENTITY", "a"),
            ("THE_C_ENTITY", "d"),
            ("THE_C_ENTITY", "f"),
            ("THE_SNIPS_E_ENTITY", "a"),
            ("THE_SNIPS_E_ENTITY", "d"),
            ("THE_SNIPS_E_ENTITY", "f"),
            ("a", "d"),
            ("d", "f"),
        ]
        expected_pairs = {
            pair: position for position, pair in enumerate(ordered_pairs)
        }

        # When
        vectorizer = CooccurrenceVectorizer(config, **shared_data)
        vectorizer = vectorizer.fit([utterance], dataset)

        # Then
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
    def test_fit_transform(self, mocked_preprocess):
        # fit(...).transform(...) and fit_transform(...) must produce the
        # same matrix on identically configured vectorizers.
        # NOTE(review): mocked_preprocess is presumably injected by a patch
        # decorator that is not visible in this excerpt - confirm.

        # Given
        text = "a b c d e f"
        utterance = text_to_utterance(text)
        # "e" covers characters [8, 9) of the text
        builtin_entities = [{
            "entity_kind": "the_snips_e_entity",
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 8, "end": 9},
        }]
        # "c" covers characters [4, 5) of the text
        custom_entities = [{
            "entity_kind": "the_c_entity",
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 4, "end": 5},
        }]
        # One entry per utterance in each of the three returned lists
        mocked_preprocess.return_value = (
            [utterance], [builtin_entities], [custom_entities])

        config = CooccurrenceVectorizerConfig(
            filter_stop_words=False,
            unknown_words_replacement_string="b",
            window_size=3,
        )

        dataset = get_empty_dataset("en")

        # Both vectorizers share the same parsers and resources
        shared_kwargs = {
            "builtin_entity_parser": EntityParserMock({text: builtin_entities}),
            "custom_entity_parser": EntityParserMock({text: custom_entities}),
            "resources": {STOP_WORDS: set()},
        }
        two_step_vectorizer = CooccurrenceVectorizer(config, **shared_kwargs)
        one_step_vectorizer = CooccurrenceVectorizer(config, **shared_kwargs)

        # When
        utterances = [utterance]
        fitted = two_step_vectorizer.fit(utterances, dataset)
        two_step_rows = fitted.transform(utterances).todense().tolist()
        one_step_matrix = one_step_vectorizer.fit_transform(
            utterances, dataset)
        one_step_rows = one_step_matrix.todense().tolist()

        # Then
        self.assertListEqual(two_step_rows, one_step_rows)
예제 #8
0
    def test_transform(self):
        # Transforms two utterances with hand-set word pairs and compares the
        # dense output with the expected rows (one row per utterance).

        # Given
        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="d",
            filter_stop_words=True)

        long_text = "yo a b c d e f yo"
        short_text = "yo a b c d e"
        long_utterance = text_to_utterance(long_text)
        short_utterance = text_to_utterance(short_text)

        resources = {STOP_WORDS: {"b"}}

        # "e" covers characters [11, 12) in both texts
        builtin_entities = [{
            "entity_kind": "the_snips_e_entity",
            "value": "e",
            "resolved_value": "e",
            "range": {"start": 11, "end": 12},
        }]
        # "c" covers characters [7, 8) in both texts
        custom_entities = [{
            "entity_kind": "the_c_entity",
            "value": "c",
            "resolved_value": "c",
            "range": {"start": 7, "end": 8},
        }]

        # Both texts map to the very same entity list in each mocked parser
        texts = (long_text, short_text)
        builtin_parser = EntityParserMock(
            dict.fromkeys(texts, builtin_entities))
        custom_parser = EntityParserMock(dict.fromkeys(texts, custom_entities))

        vectorizer = CooccurrenceVectorizer(
            config,
            builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser,
            resources=resources)

        # fit() is not called: language and word pairs are set by hand
        vectorizer._language = "en"
        vectorizer._word_pairs = {
            ("THE_SNIPS_E_ENTITY", "f"): 0,
            ("a", "THE_C_ENTITY"): 1,
            ("a", "THE_SNIPS_E_ENTITY"): 2,
            ("b", "THE_SNIPS_E_ENTITY"): 3,
            ("yo", "yo"): 4,
            ("d", "THE_SNIPS_E_ENTITY"): 5,
        }

        # When
        features = vectorizer.transform([long_utterance, short_utterance])

        # Then
        expected_rows = [
            [1, 1, 1, 0, 0, 0],
            [0, 1, 1, 0, 0, 0],
        ]
        self.assertEqual(expected_rows, features.todense().tolist())
예제 #9
0
    def test_training_should_be_reproducible(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        x = [text_to_utterance("please make me two hots cups of tea")]
        shared = self.get_shared_data(dataset)
        shared["random_state"] = 42

        # When
        vectorizer1 = CooccurrenceVectorizer(**shared)
        vectorizer1.fit(x, dataset)

        vectorizer2 = CooccurrenceVectorizer(**shared)
        vectorizer2.fit(x, dataset)

        # Then
        with temp_dir() as tmp_dir:
            dir_vectorizer1 = tmp_dir / "vectorizer1"
            dir_vectorizer2 = tmp_dir / "vectorizer2"
            vectorizer1.persist(dir_vectorizer1)
            vectorizer2.persist(dir_vectorizer2)
            hash1 = dirhash(str(dir_vectorizer1), 'sha256')
            hash2 = dirhash(str(dir_vectorizer2), 'sha256')
            self.assertEqual(hash1, hash2)
예제 #10
0
    def test_preprocess(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synononyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synononyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
    """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        u_0 = text_to_utterance("hÉllo wOrld Éntity_2")
        u_1 = text_to_utterance("beauTiful World entity 1")
        u_2 = text_to_utterance("Bird bïrdy")
        u_3 = text_to_utterance("Bird birdy")
        utterances = [u_0, u_1, u_2, u_3]

        vectorizer = CooccurrenceVectorizer(
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)

        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances)
        processed_data = list(zip(*processed_data))

        # Then
        ent_0 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "entity 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            }
        }

        expected_data = [(u_0, [num_0], [ent_0]),
                         (u_1, [num_1], [ent_11, ent_12]), (u_2, [], []),
                         (u_3, [], [])]

        self.assertSequenceEqual(expected_data, processed_data)
    def test_preprocess_for_training(self):
        # Given
        language = LANGUAGE_EN
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synononyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synononyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
    """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITHOUT_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": " yo "
            }, {
                "text": " yo "
            }, {
                "text": "yo "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }, {
                "text": " "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird birdy"
            }]
        }]

        vectorizer = CooccurrenceVectorizer(
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances, training=True)
        processed_data = list(zip(*processed_data))

        # Then
        ent_00 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 23,
                "end": 31
            }
        }
        ent_01 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 32,
                "end": 40
            }
        }
        ent_1 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }

        expected_data = [(utterances[0], [], [ent_00, ent_01]),
                         (utterances[1], [], [ent_1]), (utterances[2], [], []),
                         (utterances[3], [], [])]

        self.assertSequenceEqual(expected_data, processed_data)