Example No. 1
    def test_cooccurrence_vectorizer_should_persist(self):
        # Given
        x = [text_to_utterance("yoo yoo")]
        dataset = get_empty_dataset("en")
        shared = self.get_shared_data(dataset)
        vectorizer = CooccurrenceVectorizer(**shared).fit(x, dataset)
        vectorizer.builtin_entity_scope = {"snips/entity"}

        # When
        vectorizer.persist(self.tmp_file_path)

        # Then
        metadata_path = self.tmp_file_path / "metadata.json"
        expected_metadata = {"unit_name": "cooccurrence_vectorizer"}
        self.assertJsonContent(metadata_path, expected_metadata)

        vectorizer_path = self.tmp_file_path / "vectorizer.json"
        expected_vectorizer = {
            "word_pairs": {
                "0": ["yoo", "yoo"]
            },
            "language_code": "en",
            "config": vectorizer.config.to_dict(),
            "builtin_entity_scope": ["snips/entity"]
        }
        self.assertJsonContent(vectorizer_path, expected_vectorizer)
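For reference, the persisted layout asserted above can be inspected with nothing but the standard library. A minimal sketch (the helper name is hypothetical; the two file names come from the assertions above, not from the snips-nlu source):

    import json
    from pathlib import Path

    def read_persisted_vectorizer(unit_dir):
        # Hypothetical helper: load the two JSON files the test asserts on.
        unit_dir = Path(unit_dir)
        with (unit_dir / "metadata.json").open(encoding="utf-8") as f:
            metadata = json.load(f)  # {"unit_name": "cooccurrence_vectorizer"}
        with (unit_dir / "vectorizer.json").open(encoding="utf-8") as f:
            state = json.load(f)     # word_pairs, language_code, config, scope
        return metadata, state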
Example No. 2
    def test_limit_vocabulary(self):
        # Given
        config = CooccurrenceVectorizerConfig(filter_stop_words=False)
        vectorizer = CooccurrenceVectorizer(config=config)
        train_data = [
            text_to_utterance(t) for t in ("a b", "a c", "a d", "a e")
        ]

        data = [text_to_utterance(t) for t in ("a c e", "a d e")]
        vectorizer.fit(train_data, get_empty_dataset("en"))
        x_0 = vectorizer.transform(data)
        pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2, ("a", "e"): 3}
        kept_pairs = [("a", "b"), ("a", "c"), ("a", "d")]
        self.assertDictEqual(pairs, vectorizer.word_pairs)

        # When
        kept_pairs_indexes = [pairs[p] for p in kept_pairs]
        vectorizer.limit_word_pairs(kept_pairs)

        # Then
        expected_pairs = {("a", "b"): 0, ("a", "c"): 1, ("a", "d"): 2}
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
        x_1 = vectorizer.transform(data)
        self.assertListEqual(x_0[:, kept_pairs_indexes].todense().tolist(),
                             x_1.todense().tolist())
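The final assertion relies on a simple invariant: limiting the word pairs and re-transforming is equivalent to slicing the kept columns out of the original matrix. A toy scipy sketch of the column-slicing side (illustrative data only, not snips-nlu code):

    import numpy as np
    from scipy.sparse import csr_matrix

    # Rows are utterances, columns are word-pair features.
    x = csr_matrix(np.array([[1, 0, 2, 1],
                             [0, 1, 0, 1]]))
    kept_pairs_indexes = [0, 1, 2]          # features surviving the limit
    x_limited = x[:, kept_pairs_indexes]    # what re-transforming should yield
    expected = x.todense()[:, kept_pairs_indexes].tolist()
    assert x_limited.todense().tolist() == expected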
Example No. 3
    def test_limit_vocabulary(self):
        # Given
        vectorizer = TfidfVectorizer()
        dataset = get_empty_dataset("en")

        utterances = [
            text_to_utterance("5 55 6 66 666"),
            text_to_utterance("55 66")
        ]

        voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
        kept_unigrams = ["5", "6", "666"]
        vectorizer.fit(utterances, dataset)
        self.assertDictEqual(voca, vectorizer.vocabulary)
        diag = vectorizer.idf_diag.copy()

        # When
        vectorizer.limit_vocabulary(kept_unigrams)

        # Then
        expected_voca = {"5": 0, "6": 1, "666": 2}
        self.assertDictEqual(expected_voca, vectorizer.vocabulary)

        expected_diag = diag[[voca[u] for u in kept_unigrams]].tolist()
        self.assertListEqual(expected_diag, vectorizer.idf_diag.tolist())
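The idf_diag check follows the same pattern: each vocabulary entry owns one idf weight, and limiting the vocabulary keeps exactly the weights of the surviving unigrams, reindexed in order. A small numpy sketch with made-up weights:

    import numpy as np

    idf_diag = np.array([1.2, 0.9, 1.5, 0.9, 1.8])  # one weight per entry
    voca = {"5": 0, "55": 1, "6": 2, "66": 3, "666": 4}
    kept_unigrams = ["5", "6", "666"]
    kept_diag = idf_diag[[voca[u] for u in kept_unigrams]]
    assert kept_diag.tolist() == [1.2, 1.5, 1.8]    # kept order preserved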
Example No. 4
    def test_should_handle_empty_dataset(self):
        # Given
        dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
        engine = SnipsNLUEngine().fit(dataset)

        # When
        result = engine.parse("hello world")

        # Then
        self.assertEqual(empty_result("hello world"), result)
Example No. 6
    def test_should_handle_empty_dataset(self):
        # Given
        dataset = get_empty_dataset(LANGUAGE_EN)
        shared = self.get_shared_data(dataset)
        engine = SnipsNLUEngine(**shared).fit(dataset)

        # When
        result = engine.parse("hello world")

        # Then
        self.assertEqual(empty_result("hello world", 1.0), result)
Example No. 7
    def test_fit_with_no_utterance_should_raise(self):
        # Given
        utterances = []
        classes = []
        dataset = get_empty_dataset("en")

        # When/Then
        with self.assertRaises(_EmptyDatasetUtterancesError) as ctx:
            Featurizer().fit_transform(dataset, utterances, classes, None)

        self.assertEqual("Tokenized utterances are empty", str(ctx.exception))
Example No. 8
    def test_should_get_none_if_empty_dataset(self):
        # Given
        dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
        classifier = LogRegIntentClassifier().fit(dataset)
        text = "this is a dummy query"

        # When
        intent = classifier.get_intent(text)

        # Then
        expected_intent = None
        self.assertEqual(intent, expected_intent)
Example No. 9
    def test_should_get_intents_when_empty_dataset(self):
        # Given
        dataset = get_empty_dataset(LANGUAGE_EN)
        classifier = LogRegIntentClassifier().fit(dataset)
        text = "this is a dummy query"

        # When
        results = classifier.get_intents(text)

        # Then
        expected_results = [{RES_INTENT_NAME: None, RES_PROBA: 1.0}]
        self.assertEqual(expected_results, results)
Example No. 10
    def test_should_get_none_intent_when_empty_dataset(self):
        # Given
        dataset = get_empty_dataset(LANGUAGE_EN)
        classifier = LogRegIntentClassifier().fit(dataset)
        text = "this is a dummy query"

        # When
        intent = classifier.get_intent(text)

        # Then
        expected_intent = intent_classification_result(None, 1.0)
        self.assertEqual(intent, expected_intent)
Example No. 12
    # In the original test class this method carries a mock-patch decorator
    # (stripped in extraction), so `mocked_preprocess` stands in for the
    # vectorizer's preprocessing step.
    def test_fit_transform(self, mocked_preprocess):
        # Given
        t = "a b c d e f"
        u = text_to_utterance(t)
        builtin_ents = [
            {
                "value": "e",
                "resolved_value": "e",
                "range": {
                    "start": 8,
                    "end": 9
                },
                "entity_kind": "the_snips_e_entity"
            }
        ]
        custom_ents = [
            {
                "value": "c",
                "resolved_value": "c",
                "range": {
                    "start": 4,
                    "end": 5
                },
                "entity_kind": "the_c_entity"
            }
        ]
        mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False
        )

        dataset = get_empty_dataset("en")

        builtin_parser = EntityParserMock({t: builtin_ents})
        custom_parser = EntityParserMock({t: custom_ents})
        resources = {STOP_WORDS: set()}
        vectorizer1 = CooccurrenceVectorizer(
            config, builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser, resources=resources)
        vectorizer2 = CooccurrenceVectorizer(
            config, builtin_entity_parser=builtin_parser,
            custom_entity_parser=custom_parser, resources=resources)

        # When
        x = [u]
        x_0 = vectorizer1.fit(x, dataset).transform(x).todense().tolist()
        x_1 = vectorizer2.fit_transform(x, dataset).todense().tolist()

        # Then
        self.assertListEqual(x_0, x_1)
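The contract asserted here is the usual scikit-learn-style equivalence: fit_transform(x) must yield the same matrix as fit(x) followed by transform(x). The same invariant can be checked against scikit-learn's own CountVectorizer (an analogy, not the snips-nlu class):

    from sklearn.feature_extraction.text import CountVectorizer

    texts = ["hello world hello", "world foo"]
    v1, v2 = CountVectorizer(), CountVectorizer()
    x_0 = v1.fit(texts).transform(texts).todense().tolist()
    x_1 = v2.fit_transform(texts).todense().tolist()
    assert x_0 == x_1  # identical either way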
Example No. 13
    # Like Example No. 12, this method is mock-patched in the original test
    # class so that `mocked_preprocess` replaces the preprocessing step.
    def test_fit_unordered(self, mocked_preprocess):
        # Given
        t = "a b c d e f"
        u = text_to_utterance(t)
        builtin_ents = [
            {
                "value": "e",
                "resolved_value": "e",
                "range": {
                    "start": 8,
                    "end": 9
                },
                "entity_kind": "the_snips_e_entity"
            }
        ]
        custom_ents = [
            {
                "value": "c",
                "resolved_value": "c",
                "range": {
                    "start": 4,
                    "end": 5
                },
                "entity_kind": "the_c_entity"
            }
        ]
        mocked_preprocess.return_value = [u], [builtin_ents], [custom_ents]

        config = CooccurrenceVectorizerConfig(
            window_size=3,
            unknown_words_replacement_string="b",
            filter_stop_words=False,
            keep_order=False,
        )
        dataset = get_empty_dataset("en")
        shared = self.get_shared_data(dataset)

        # When
        expected_pairs = {
            ("THE_C_ENTITY", "THE_SNIPS_E_ENTITY"): 0,
            ("THE_C_ENTITY", "a"): 1,
            ("THE_C_ENTITY", "d"): 2,
            ("THE_C_ENTITY", "f"): 3,
            ("THE_SNIPS_E_ENTITY", "a"): 4,
            ("THE_SNIPS_E_ENTITY", "d"): 5,
            ("THE_SNIPS_E_ENTITY", "f"): 6,
            ("a", "d"): 7,
            ("d", "f"): 8,
        }
        vectorizer = CooccurrenceVectorizer(config, **shared).fit([u], dataset)

        # Then
        self.assertDictEqual(expected_pairs, vectorizer.word_pairs)
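With keep_order=False every co-occurring pair is stored in sorted order, so ('THE_C_ENTITY', 'a') and ('a', 'THE_C_ENTITY') collapse into one feature. A plausible standalone reconstruction of the windowed pair enumeration (an assumption about the algorithm inferred from the expected pairs above, not the library's actual code):

    # Token sequence after preprocessing: entity values are replaced by their
    # entity kinds, and "b" (the unknown-words replacement string) is dropped.
    tokens = ["a", "THE_C_ENTITY", "d", "THE_SNIPS_E_ENTITY", "f"]
    window_size = 3
    pairs = set()
    for i, left in enumerate(tokens):
        for right in tokens[i + 1:i + 1 + window_size]:
            pairs.add(tuple(sorted((left, right))))
    word_pairs = {p: j for j, p in enumerate(sorted(pairs))}
    # word_pairs now matches expected_pairs above (uppercase sorts first).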
Example No. 14
    def test_should_build_training_data_with_no_data(self):
        # Given
        language = LANGUAGE_EN
        dataset = validate_and_format_dataset(get_empty_dataset(language))
        random_state = np.random.RandomState(1)

        # When
        data_augmentation_config = LogRegIntentClassifierConfig() \
            .data_augmentation_config
        utterances, _, intent_mapping = build_training_data(
            dataset, language, data_augmentation_config, random_state)

        # Then
        expected_utterances = []
        expected_intent_mapping = []
        self.assertListEqual(utterances, expected_utterances)
        self.assertListEqual(intent_mapping, expected_intent_mapping)