예제 #1
0
    def test_limit_vocabulary(self):
        """Check that ``limit_word_pairs`` prunes the fitted word pairs.

        After limiting, ``word_pairs`` must hold only the kept pairs and
        ``transform`` must return the same values as the corresponding
        columns of the matrix computed before limiting.
        """
        # Given
        vectorizer = CooccurrenceVectorizer(
            config=CooccurrenceVectorizerConfig(filter_stop_words=False))
        training_texts = ("a b", "a c", "a d", "a e")
        training_utterances = [
            text_to_utterance(text) for text in training_texts]
        utterances = [
            text_to_utterance(text) for text in ("a c e", "a d e")]

        vectorizer.fit(training_utterances, get_empty_dataset("en"))
        # Reference features, computed while all four pairs are still known
        features_before = vectorizer.transform(utterances)

        all_pairs = {
            ("a", "b"): 0,
            ("a", "c"): 1,
            ("a", "d"): 2,
            ("a", "e"): 3,
        }
        self.assertDictEqual(all_pairs, vectorizer.word_pairs)
        retained_pairs = [("a", "b"), ("a", "c"), ("a", "d")]

        # When
        # Column positions of the retained pairs in the pre-limit matrix
        retained_columns = [all_pairs[pair] for pair in retained_pairs]
        vectorizer.limit_word_pairs(retained_pairs)

        # Then
        self.assertDictEqual(
            {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2},
            vectorizer.word_pairs)
        features_after = vectorizer.transform(utterances)
        expected_rows = features_before[:, retained_columns].todense().tolist()
        self.assertListEqual(expected_rows, features_after.todense().tolist())
예제 #2
0
    def test_transform(self):
        """Check ``transform`` output against a hand-set word-pair index.

        The vectorizer is built with mocked builtin and custom entity
        parsers, a stop-word resource and a window size of 3; its language
        and word pairs are assigned directly instead of being fitted.
        """

        def make_entity(value, start, kind):
            # Single-character entity match starting at offset ``start``
            return {
                "value": value,
                "resolved_value": value,
                "range": {
                    "start": start,
                    "end": start + 1
                },
                "entity_kind": kind
            }

        # Given
        config = CooccurrenceVectorizerConfig(
            filter_stop_words=True,
            window_size=3,
            unknown_words_replacement_string="d")

        texts = ("yo a b c d e f yo", "yo a b c d e")
        data = [text_to_utterance(text) for text in texts]

        resources = {STOP_WORDS: {"b"}}

        # In both texts "e" spans characters 11-12 and "c" spans 7-8
        builtin_entities = [make_entity("e", 11, "the_snips_e_entity")]
        custom_entities = [make_entity("c", 7, "the_c_entity")]

        # Both mocked parsers return the same entities for either text
        builtin_parser = EntityParserMock(
            {text: builtin_entities for text in texts})
        custom_parser = EntityParserMock(
            {text: custom_entities for text in texts})

        vectorizer = CooccurrenceVectorizer(
            config,
            builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser,
            resources=resources)

        vectorizer._language = "en"
        # Each pair is mapped to its position in this list (0 through 5)
        ordered_pairs = [
            ("THE_SNIPS_E_ENTITY", "f"),
            ("a", "THE_C_ENTITY"),
            ("a", "THE_SNIPS_E_ENTITY"),
            ("b", "THE_SNIPS_E_ENTITY"),
            ("yo", "yo"),
            ("d", "THE_SNIPS_E_ENTITY"),
        ]
        vectorizer._word_pairs = {
            pair: index for index, pair in enumerate(ordered_pairs)}

        # When
        features = vectorizer.transform(data)

        # Then
        expected = [
            [1, 1, 1, 0, 0, 0],
            [0, 1, 1, 0, 0, 0],
        ]
        self.assertEqual(expected, features.todense().tolist())