示例#1
0
    def test_limit_vocabulary(self):
        """Limiting the vocabulary should keep only the requested unigrams,
        re-index them contiguously, and keep the matching entries of the
        idf diagonal in the same order."""
        # Given
        vectorizer = TfidfVectorizer()
        dataset = get_empty_dataset("en")

        utterances = [
            text_to_utterance("5 55 6 66 666"),
            text_to_utterance("55 66")
        ]

        initial_vocab = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
        unigrams_to_keep = ["5", "6", "666"]
        vectorizer.fit(utterances, dataset)
        # Sanity check: fitting produced the expected full vocabulary
        self.assertDictEqual(initial_vocab, vectorizer.vocabulary)
        initial_diag = vectorizer.idf_diag.copy()

        # When
        vectorizer.limit_vocabulary(unigrams_to_keep)

        # Then
        expected_vocab = {"5": 0, "6": 1, "666": 2}
        self.assertDictEqual(expected_vocab, vectorizer.vocabulary)

        # The kept idf values must be the original ones, in kept order
        kept_rows = [initial_vocab[unigram] for unigram in unigrams_to_keep]
        expected_diag = initial_diag[kept_rows].tolist()
        self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())
示例#2
0
    def test_limit_vocabulary_should_raise(self):
        """Limiting the vocabulary to unigrams never seen during fitting
        must raise a ValueError."""
        # Given
        vectorizer = TfidfVectorizer()
        dataset = {"language": "en", "entities": {}, "intents": {}}
        utterances = [text_to_utterance("5 55 6 66 666")]

        vectorizer.fit(utterances, dataset)

        # When / Then
        # Neither "7" nor "8" appears in the fitted vocabulary
        unknown_unigrams = ["7", "8"]
        with self.assertRaises(ValueError):
            vectorizer.limit_vocabulary(unknown_unigrams)
示例#3
0
    def test_training_should_be_reproducible(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        x = [text_to_utterance("please make me two hots cups of tea")]
        shared = self.get_shared_data(dataset)
        shared["random_state"] = 42

        # When
        vectorizer1 = TfidfVectorizer(**shared)
        vectorizer1.fit(x, dataset)

        vectorizer2 = TfidfVectorizer(**shared)
        vectorizer2.fit(x, dataset)

        # Then
        with temp_dir() as tmp_dir:
            dir_vectorizer1 = tmp_dir / "vectorizer1"
            dir_vectorizer2 = tmp_dir / "vectorizer2"
            vectorizer1.persist(dir_vectorizer1)
            vectorizer2.persist(dir_vectorizer2)
            hash1 = dirhash(str(dir_vectorizer1), 'sha256')
            hash2 = dirhash(str(dir_vectorizer2), 'sha256')
            self.assertEqual(hash1, hash2)
示例#4
0
    def test_preprocess(self):
        """Inference-time preprocessing should normalize and stem the input
        utterances and return, per utterance, the builtin entities, custom
        entities and word clusters detected in it."""
        # Given
        language = LANGUAGE_EN
        # Minimal resources: a stems table, one word-clusters table, no
        # stop words
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")

        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
            text_to_utterance("Bird birdy"),
        ]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        # fit() is bypassed in this test, so set the attributes it would
        # normally initialize
        vectorizer._language = language
        # presumably restricts builtin parsing to snips/number — confirm
        vectorizer.builtin_entity_scope = {"snips/number"}

        # When
        processed_data = vectorizer._preprocess(utterances)
        # _preprocess returns parallel sequences; transpose into one tuple
        # (utterance, builtin_entities, custom_entities, clusters) per input
        processed_data = list(zip(*processed_data))

        # Then
        # Expected lowercased + stemmed utterances
        u_0 = {"data": [{"text": "hello world entity_2"}]}

        u_1 = {"data": [{"text": "beauty world ent 1"}]}

        u_2 = {"data": [{"text": "bird bird"}]}

        u_3 = {"data": [{"text": "bird bird"}]}

        # NOTE(review): the ranges below appear to refer to different
        # normalization stages for builtin vs custom matches (e.g. num_1
        # at 23-24 does not fit the stemmed text) — confirm against
        # _preprocess before relying on them
        ent_0 = {
            "entity_kind": "entity_2",
            "value": "entity_2",
            "resolved_value": "Éntity 2",
            "range": {
                "start": 12,
                "end": 20
            }
        }
        num_0 = {
            "entity_kind": "snips/number",
            "value": "2",
            "resolved_value": {
                "value": 2.0,
                "kind": "Number"
            },
            "range": {
                "start": 19,
                "end": 20
            }
        }
        ent_11 = {
            "entity_kind": "entity_1",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        ent_12 = {
            "entity_kind": "entity_2",
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 13,
                "end": 18
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "1",
            "range": {
                "start": 23,
                "end": 24
            },
            "resolved_value": {
                "value": 1.0,
                "kind": "Number"
            },
        }

        expected_data = [(u_0, [num_0], [ent_0], []),
                         (u_1, [num_1], [ent_11,
                                         ent_12], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_3, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)
示例#5
0
    def test_enrich_utterance(self):
        """_enrich_utterance should build, per utterance, one feature string
        combining the utterance text, one placeholder token per detected
        entity, and the word-cluster names."""
        # Given
        utterances = [
            {
                "data": [
                    {
                        "text": "one",
                        "entity": "snips/number"
                    },
                    {
                        "text": "beauty world",
                    },
                    {
                        "text": "ent 1",
                        "entity": "dummy_entity_1"
                    },
                ]
            },
            text_to_utterance("one beauty world ent 1"),
            text_to_utterance("hello world entity_2"),
            text_to_utterance("bird bird"),
        ]

        # One list of builtin entity matches per utterance, parallel to
        # `utterances` (last utterance has none)
        builtin_ents = [[{
            "value": "one",
            "resolved_value": 1,
            "range": {
                "start": 0,
                "end": 3
            },
            "entity_kind": "snips/number"
        }],
                        [{
                            "value": "one",
                            "resolved_value": 1,
                            "range": {
                                "start": 0,
                                "end": 3
                            },
                            "entity_kind": "snips/number"
                        }, {
                            "value": "1",
                            "resolved_value": 1,
                            "range": {
                                "start": 27,
                                "end": 28
                            },
                            "entity_kind": "snips/number"
                        }],
                        [{
                            "value": "2",
                            "resolved_value": 2,
                            "range": {
                                "start": 19,
                                "end": 20
                            },
                            "entity_kind": "snips/number"
                        }], []]

        # One list of custom entity matches per utterance, same layout
        custom_ents = [[{
            "value": "ent 1",
            "resolved_value": "entity 1",
            "range": {
                "start": 20,
                "end": 28
            },
            "entity_kind": "dummy_entity_1"
        }],
                       [{
                           "value": "ent 1",
                           "resolved_value": "entity 1",
                           "range": {
                               "start": 20,
                               "end": 28
                           },
                           "entity_kind": "dummy_entity_1"
                       }],
                       [{
                           "value": "entity_2",
                           "resolved_value": "Éntity_2",
                           "range": {
                               "start": 12,
                               "end": 20
                           },
                           "entity_kind": "dummy_entity_2"
                       }], []]

        # One list of word-cluster names per utterance
        w_clusters = [["111", "112"], ["111", "112"], [], []]

        vectorizer = TfidfVectorizer()
        # Set directly since fit() is bypassed in this test
        vectorizer._language = "en"

        # When
        enriched_utterances = [
            vectorizer._enrich_utterance(*data)
            for data in zip(utterances, builtin_ents, custom_ents, w_clusters)
        ]

        # Then
        # NOTE(review): chunks tagged with an entity (e.g. "one" in the
        # first utterance) appear to be dropped from the text and
        # represented only by their entity feature token — confirm
        expected_u0 = "beauty world ent 1 " \
                      "builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_1 111 112"

        expected_u1 = "one beauty world ent 1 " \
                      "builtinentityfeaturesnipsnumber " \
                      "builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_1 111 112"

        expected_u2 = "hello world entity_2 builtinentityfeaturesnipsnumber " \
                      "entityfeaturedummy_entity_2"

        expected_u3 = "bird bird"

        expected_utterances = [
            expected_u0, expected_u1, expected_u2, expected_u3
        ]

        self.assertEqual(expected_utterances, enriched_utterances)
    def test_preprocess_for_training(self):
        """Training-time preprocessing (training=True) should normalize and
        stem each utterance chunk in place and return, per utterance, the
        builtin entities, custom entities and word clusters."""
        # Given
        language = LANGUAGE_EN
        # Minimal resources: a stems table, one word-clusters table, no
        # stop words
        resources = {
            STEMS: {
                "beautiful": "beauty",
                "birdy": "bird",
                "entity": "ent"
            },
            WORD_CLUSTERS: {
                "my_word_clusters": {
                    "beautiful": "cluster_1",
                    "birdy": "cluster_2",
                    "entity": "cluster_3"
                }
            },
            STOP_WORDS: set()
        }

        # NOTE(review): "use_synononyms" below looks like a typo for
        # "use_synonyms" — confirm whether the parser silently ignores
        # the unknown key
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synononyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synononyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        custom_entity_parser = CustomEntityParser.build(
            dataset, CustomEntityParserUsage.WITH_STEMS, resources)

        builtin_entity_parser = BuiltinEntityParser.build(dataset, language)
        # Training utterances are chunked, with some chunks tagged with an
        # entity and deliberately noisy whitespace around the texts
        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": " yo "
            }, {
                "text": " yo "
            }, {
                "text": "yo "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }, {
                "text": " "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }, {
                "text": " "
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird birdy"
            }]
        }]

        config = TfidfVectorizerConfig(use_stemming=True,
                                       word_clusters_name="my_word_clusters")
        vectorizer = TfidfVectorizer(
            config=config,
            custom_entity_parser=custom_entity_parser,
            builtin_entity_parser=builtin_entity_parser,
            resources=resources)
        # Set directly since fit() is bypassed in this test
        vectorizer._language = language

        # When
        processed_data = vectorizer._preprocess(utterances, training=True)
        # _preprocess returns parallel sequences; transpose into one tuple
        # (utterance, builtin_entities, custom_entities, clusters) per input
        processed_data = list(zip(*processed_data))

        # Then
        # Expected utterances: chunk structure preserved, texts lowercased,
        # stemmed and whitespace-trimmed
        u_0 = {
            "data": [{
                "text": "hello world"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "yo"
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }, {
                "text": ""
            }, {
                "text": "entity_2",
                "entity": "entity_2"
            }]
        }
        u_1 = {
            "data": [{
                "text": "beauty world"
            }, {
                "text": "ent 1",
                "entity": "entity_1"
            }, {
                "text": ""
            }, {
                "text": "2",
                "entity": "snips/number"
            }]
        }
        # Both "Bird bïrdy" and "Bird birdy" normalize to the same text, so
        # u_2 is reused for the last two utterances below
        u_2 = {"data": [{"text": "bird bird"}]}

        # Expected entity matches; in the training path they carry no
        # resolved_value, unlike the inference path
        ent_00 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 23,
                "end": 31
            }
        }
        ent_01 = {
            "entity_kind": "entity_2",
            "value": "Éntity_2",
            "range": {
                "start": 32,
                "end": 40
            }
        }

        ent_1 = {
            "entity_kind": "entity_1",
            "value": "entity 1",
            "range": {
                "start": 16,
                "end": 24
            }
        }
        num_1 = {
            "entity_kind": "snips/number",
            "value": "2",
            "range": {
                "start": 25,
                "end": 26
            }
        }

        expected_data = [(u_0, [], [ent_00, ent_01], []),
                         (u_1, [num_1], [ent_1], ["cluster_1", "cluster_3"]),
                         (u_2, [], [], []), (u_2, [], [], ["cluster_2"])]

        self.assertSequenceEqual(expected_data, processed_data)