Пример #1
0
    def test_should_be_deserializable(self, mocked_cooccurrence_load,
                                      mocked_tfidf_load):
        # Given
        mocked_tfidf_load.return_value = "tfidf_vectorizer"
        mocked_cooccurrence_load.return_value = "cooccurrence_vectorizer"

        language = LANGUAGE_EN
        config = FeaturizerConfig()

        featurizer_dict = {
            "language_code": language,
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }

        self.tmp_file_path.mkdir()
        featurizer_path = self.tmp_file_path / "featurizer.json"
        with featurizer_path.open("w", encoding="utf-8") as f:
            f.write(json_string(featurizer_dict))

        # When
        featurizer = Featurizer.from_path(self.tmp_file_path)

        # Then
        self.assertEqual(language, featurizer.language)
        self.assertEqual("tfidf_vectorizer", featurizer.tfidf_vectorizer)
        self.assertEqual("cooccurrence_vectorizer",
                         featurizer.cooccurrence_vectorizer)
        self.assertDictEqual(config.to_dict(), featurizer.config.to_dict())
Пример #2
0
    def test_should_be_serializable_before_fit(self):
        # Given
        pvalue_threshold = 0.42
        config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                  added_cooccurrence_feature_ratio=0.2)
        featurizer = Featurizer(config=config)

        # When
        featurizer.persist(self.tmp_file_path)

        # Then
        expected_featurizer_dict = {
            "language_code": None,
            "tfidf_vectorizer": None,
            "cooccurrence_vectorizer": None,
            "config": config.to_dict()
        }
        featurizer_dict_path = self.tmp_file_path / "featurizer.json"
        self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

        expected_metadata = {"unit_name": "featurizer"}
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, expected_metadata)

        tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
        self.assertFalse(tfidf_vectorizer_path.exists())

        cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
        self.assertFalse(cooc_vectorizer_path.exists())
Пример #3
0
    def test_featurizer_should_exclude_replacement_string(self):
        # Given
        language = LANGUAGE_EN
        dataset = {
            "entities": {
                "dummy1": {
                    "utterances": {
                        "unknownword": "unknownword",
                        "what": "what"
                    }
                }
            }
        }
        replacement_string = "unknownword"
        featurizer = Featurizer(
            language,
            unknown_words_replacement_string=replacement_string,
            config=FeaturizerConfig())
        utterances = [text_to_utterance("hello dude")]
        y = np.array([1])

        # When
        featurizer.fit(dataset, utterances, y)

        # Then
        self.assertNotIn(replacement_string,
                         featurizer.entity_utterances_to_feature_names)
Пример #4
0
    def from_dict(cls, obj_dict):
        """Creates a :class:`Featurizer` instance from a :obj:`dict`

        The dict must have been generated with :func:`~Featurizer.to_dict`
        """
        language = obj_dict['language_code']
        config = FeaturizerConfig.from_dict(obj_dict["config"])
        tfidf_vectorizer = _deserialize_tfidf_vectorizer(
            obj_dict["tfidf_vectorizer"], language, config)
        entity_utterances_to_entity_names = {
            k: set(v) for k, v in
            iteritems(obj_dict['entity_utterances_to_feature_names'])
        }
        self = cls(
            language=language,
            tfidf_vectorizer=tfidf_vectorizer,
            pvalue_threshold=obj_dict['pvalue_threshold'],
            entity_utterances_to_feature_names=
            entity_utterances_to_entity_names,
            best_features=obj_dict['best_features'],
            config=config,
            unknown_words_replacement_string=obj_dict[
                "unknown_words_replacement_string"]
        )
        return self
Пример #5
0
    def test_fit_cooccurrence_vectorizer_feature_selection(self, mocked_chi2):
        # Given
        vectorizer_config = CooccurrenceVectorizerConfig(
            filter_stop_words=False)
        config = FeaturizerConfig(
            added_cooccurrence_feature_ratio=.3,
            cooccurrence_vectorizer_config=vectorizer_config)
        featurizer = Featurizer(config)
        mocked_dataset = {"language": "fr", "entities": {}, "intents": {}}
        utterances = [
            text_to_utterance("a b c d e"),
            text_to_utterance("f g h i j"),
            text_to_utterance("none"),
        ]

        mocked_vectorizer = MagicMock()
        mocked_vectorizer.idf_diag = range(10)

        featurizer.tfidf_vectorizer = mocked_vectorizer
        classes = [0, 0, 1]

        # When
        mocked_chi2.return_value = (None, [0.1, 1.0, 0.2, 1.0, 0.3, 1.0] +
                                    [1.0 for _ in range(100)])
        featurizer._fit_cooccurrence_vectorizer(utterances, classes, 1,
                                                mocked_dataset)

        # Then
        expected_pairs = {("a", "b"): 0, ("a", "d"): 1, ("b", "c"): 2}
        self.assertDictEqual(expected_pairs,
                             featurizer.cooccurrence_vectorizer.word_pairs)
Пример #6
0
    def test_should_be_serializable(self):
        # Given

        dataset_stream = io.StringIO("""
---
type: intent
name: dummy_intent
utterances:
  - this is the number [number:snips/number](one)
""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        pvalue_threshold = 0.42
        config = FeaturizerConfig(pvalue_threshold=pvalue_threshold,
                                  added_cooccurrence_feature_ratio=0.2)
        shared = self.get_shared_data(dataset)
        featurizer = Featurizer(config=config, **shared)
        utterances = [
            text_to_utterance("this is the number"),
            text_to_utterance("yo")
        ]
        classes = np.array([0, 1])
        featurizer.fit(dataset, utterances, classes, max(classes))

        # When
        featurizer.persist(self.tmp_file_path)

        # Then
        expected_featurizer_dict = {
            "language_code": "en",
            "tfidf_vectorizer": "tfidf_vectorizer",
            "cooccurrence_vectorizer": "cooccurrence_vectorizer",
            "config": config.to_dict()
        }
        featurizer_dict_path = self.tmp_file_path / "featurizer.json"
        self.assertJsonContent(featurizer_dict_path, expected_featurizer_dict)

        expected_metadata = {"unit_name": "featurizer"}
        metadata_path = self.tmp_file_path / "metadata.json"
        self.assertJsonContent(metadata_path, expected_metadata)

        tfidf_vectorizer_path = self.tmp_file_path / "tfidf_vectorizer"
        self.assertTrue(tfidf_vectorizer_path.exists())

        cooc_vectorizer_path = self.tmp_file_path / "cooccurrence_vectorizer"
        self.assertTrue(cooc_vectorizer_path.exists())
Пример #7
0
    def test_featurizer_config(self):
        # Given
        config_dict = {
            "sublinear_tf": True,
        }

        # When
        config = FeaturizerConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Пример #8
0
    def test_featurizer_config(self):
        # Given
        config_dict = {
            "sublinear_tf": True,
            "pvalue_threshold": 0.4,
            "word_clusters_name": None
        }

        # When
        config = FeaturizerConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Пример #9
0
    def test_intent_classifier_config(self):
        # Given
        config_dict = {
            "unit_name": LogRegIntentClassifier.unit_name,
            "data_augmentation_config":
                IntentClassifierDataAugmentationConfig().to_dict(),
            "featurizer_config": FeaturizerConfig().to_dict(),
            "random_seed": 42
        }

        # When
        config = LogRegIntentClassifierConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Пример #10
0
    def __init__(self, language, unknown_words_replacement_string,
                 config=FeaturizerConfig(), tfidf_vectorizer=None,
                 best_features=None, builtin_entity_parser=None,
                 custom_entity_parser=None):
        self.config = config
        self.language = language
        if tfidf_vectorizer is None:
            tfidf_vectorizer = _get_tfidf_vectorizer(
                self.language, sublinear_tf=self.config.sublinear_tf)
        self.tfidf_vectorizer = tfidf_vectorizer
        self.best_features = best_features
        self.unknown_words_replacement_string = \
            unknown_words_replacement_string

        self.builtin_entity_parser = builtin_entity_parser
        self.custom_entity_parser = custom_entity_parser
    def test_should_be_deserializable(self):
        # Given
        language = LANGUAGE_EN
        idf_diag = [1.52, 1.21, 1.04]
        vocabulary = {"hello": 0, "beautiful": 1, "world": 2}

        best_features = [0, 1]
        pvalue_threshold = 0.4
        entity_utterances_to_feature_names = {
            "entity_1": ["entityfeatureentity_1"]
        }

        featurizer_dict = {
            "config": FeaturizerConfig().to_dict(),
            "language_code": language,
            "tfidf_vectorizer": {"idf_diag": idf_diag, "vocab": vocabulary},
            "best_features": best_features,
            "pvalue_threshold": pvalue_threshold,
            "entity_utterances_to_feature_names":
                entity_utterances_to_feature_names,
            "unknown_words_replacement_string": None
        }

        # When
        featurizer = Featurizer.from_dict(featurizer_dict)

        # Then
        self.assertEqual(featurizer.language, language)
        # pylint: disable=W0212
        self.assertListEqual(
            featurizer.tfidf_vectorizer._tfidf._idf_diag.data.tolist(),
            idf_diag)
        # pylint: enable=W0212
        self.assertDictEqual(featurizer.tfidf_vectorizer.vocabulary_,
                             vocabulary)
        self.assertListEqual(featurizer.best_features, best_features)
        self.assertEqual(featurizer.pvalue_threshold, pvalue_threshold)

        self.assertDictEqual(
            featurizer.entity_utterances_to_feature_names,
            {
                k: set(v) for k, v
                in iteritems(entity_utterances_to_feature_names)
            })
Пример #12
0
    def test_featurizer_config(self):
        # Given
        tfid_vectorizer_config = TfidfVectorizerConfig()
        cooccurrence_vectorizer_config = CooccurrenceVectorizerConfig()
        config_dict = {
            "unit_name": "featurizer",
            "pvalue_threshold": 0.2,
            "added_cooccurrence_feature_ratio": 0.2,
            "tfidf_vectorizer_config": tfid_vectorizer_config.to_dict(),
            "cooccurrence_vectorizer_config":
                cooccurrence_vectorizer_config.to_dict()
        }

        # When
        config = FeaturizerConfig.from_dict(config_dict)
        serialized_config = config.to_dict()

        # Then
        self.assertDictEqual(config_dict, serialized_config)
Пример #13
0
    def __init__(self,
                 language,
                 unknown_words_replacement_string,
                 config=FeaturizerConfig(),
                 tfidf_vectorizer=None,
                 best_features=None,
                 entity_utterances_to_feature_names=None):
        self.config = config
        self.language = language
        if tfidf_vectorizer is None:
            tfidf_vectorizer = _get_tfidf_vectorizer(
                self.language, sublinear_tf=self.config.sublinear_tf)
        self.tfidf_vectorizer = tfidf_vectorizer
        self.best_features = best_features
        self.entity_utterances_to_feature_names = \
            entity_utterances_to_feature_names

        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
Пример #14
0
    def test_feature_index_to_feature_name(self):
        # Given
        config = FeaturizerConfig(added_cooccurrence_feature_ratio=.75)
        featurizer = Featurizer(config=config)

        # When
        mocked_cooccurrence_vectorizer = MagicMock()
        mocked_cooccurrence_vectorizer.word_pairs = {("a", "b"): 0}

        mocked_tfidf_vectorizer = MagicMock()
        mocked_tfidf_vectorizer.vocabulary = {"a": 0}

        featurizer.cooccurrence_vectorizer = mocked_cooccurrence_vectorizer
        featurizer.tfidf_vectorizer = mocked_tfidf_vectorizer

        # Then
        expected = {0: "ngram:a", 1: "pair:a+b"}
        self.assertDictEqual(expected,
                             featurizer.feature_index_to_feature_name)
Пример #15
0
    def from_dict(cls, obj_dict, **shared):
        """Creates a :class:`Featurizer` instance from a :obj:`dict`

        The dict must have been generated with :func:`~Featurizer.to_dict`
        """
        language = obj_dict["language_code"]
        config = FeaturizerConfig.from_dict(obj_dict["config"])
        tfidf_vectorizer = _deserialize_tfidf_vectorizer(
            obj_dict["tfidf_vectorizer"], language, config.sublinear_tf)
        self = cls(
            language=language,
            tfidf_vectorizer=tfidf_vectorizer,
            best_features=obj_dict["best_features"],
            config=config,
            unknown_words_replacement_string=obj_dict[
                "unknown_words_replacement_string"],
            builtin_entity_parser=shared.get(BUILTIN_ENTITY_PARSER),
            custom_entity_parser=shared.get(CUSTOM_ENTITY_PARSER)
        )
        return self
Пример #16
0
    def __init__(self,
                 language,
                 unknown_words_replacement_string,
                 config=FeaturizerConfig(),
                 tfidf_vectorizer=None,
                 best_features=None,
                 entity_utterances_to_feature_names=None,
                 pvalue_threshold=0.4):
        self.config = config
        self.language = language
        if tfidf_vectorizer is None:
            tfidf_vectorizer = _get_tfidf_vectorizer(self.language,
                                                     self.config.to_dict())
        self.tfidf_vectorizer = tfidf_vectorizer
        self.best_features = best_features
        self.pvalue_threshold = pvalue_threshold
        self.entity_utterances_to_feature_names = \
            entity_utterances_to_feature_names

        self.unknown_words_replacement_string = \
            unknown_words_replacement_string
Пример #17
0
    def from_dict(cls, obj_dict):
        """Creates a :class:`Featurizer` instance from a :obj:`dict`

        The dict must have been generated with :func:`~Featurizer.to_dict`
        """
        language = obj_dict['language_code']
        config = FeaturizerConfig.from_dict(obj_dict["config"])
        tfidf_vectorizer = _deserialize_tfidf_vectorizer(
            obj_dict["tfidf_vectorizer"], language, config.sublinear_tf)
        entity_utterances_to_entity_names = {
            k: set(v)
            for k, v in iteritems(
                obj_dict['entity_utterances_to_feature_names'])
        }
        self = cls(language=language,
                   tfidf_vectorizer=tfidf_vectorizer,
                   entity_utterances_to_feature_names=
                   entity_utterances_to_entity_names,
                   best_features=obj_dict['best_features'],
                   config=config,
                   unknown_words_replacement_string=obj_dict[
                       "unknown_words_replacement_string"])
        return self
Пример #18
0
    def test_should_be_serializable(self):
        # Given
        language = LANGUAGE_EN
        tfidf_vectorizer = _get_tfidf_vectorizer(language)

        pvalue_threshold = 0.42
        featurizer = Featurizer(language,
                                config=FeaturizerConfig(
                                    pvalue_threshold=pvalue_threshold,
                                    word_clusters_name="brown_clusters"),
                                unknown_words_replacement_string=None,
                                tfidf_vectorizer=tfidf_vectorizer)
        dataset = {
            "entities": {
                "entity2": {
                    "data": [{
                        "value": "entity1",
                        "synonyms": ["entity1"]
                    }],
                    "use_synonyms": True,
                    "automatically_extensible": True
                }
            },
            "intents": {},
            "language": "en"
        }
        dataset = validate_and_format_dataset(dataset)

        utterances = [
            "hello world", "beautiful world", "hello here", "bird birdy",
            "beautiful bird"
        ]
        utterances = [text_to_utterance(u) for u in utterances]
        classes = np.array([0, 0, 0, 1, 1])

        featurizer.fit(dataset, utterances, classes)

        # When
        serialized_featurizer = featurizer.to_dict()

        # Then
        msg = "Featurizer dict should be json serializable to utf8."
        with self.fail_if_exception(msg):
            dumped = json_string(serialized_featurizer)

        msg = "SnipsNLUEngine should be deserializable from dict with unicode" \
              " values"
        with self.fail_if_exception(msg):
            _ = Featurizer.from_dict(json.loads(dumped))

        vocabulary = tfidf_vectorizer.vocabulary_
        # pylint: disable=W0212
        idf_diag = tfidf_vectorizer._tfidf._idf_diag.data.tolist()
        # pylint: enable=W0212

        best_features = featurizer.best_features
        entity_utterances_to_feature_names = {
            "entity1": ["entityfeatureentity2"]
        }

        expected_serialized = {
            "config": {
                'sublinear_tf': False,
                'pvalue_threshold': pvalue_threshold,
                'word_clusters_name': "brown_clusters"
            },
            "language_code": "en",
            "tfidf_vectorizer": {
                "idf_diag": idf_diag,
                "vocab": vocabulary
            },
            "best_features": best_features,
            "entity_utterances_to_feature_names":
            entity_utterances_to_feature_names,
            "unknown_words_replacement_string": None
        }
        self.assertDictEqual(expected_serialized, serialized_featurizer)
Пример #19
0
    def test_preprocess_utterances(self, mocked_stem, mocked_word_cluster):
        # Given
        language = LANGUAGE_EN

        def _stem(t):
            if t == "beautiful":
                s = "beauty"
            elif t == "birdy":
                s = "bird"
            elif t == "entity":
                s = "ent"
            else:
                s = t
            return s

        def stem_function(text, language):
            return get_default_sep(language).join(
                [_stem(t) for t in tokenize_light(text, language)])

        mocked_word_cluster.return_value = {
            "beautiful": "cluster_1",
            "birdy": "cluster_2",
            "entity": "cluster_3"
        }

        mocked_stem.side_effect = stem_function

        dataset = {
            "intents": {
                "intent1": {
                    "utterances": []
                }
            },
            "entities": {
                "entity_1": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": ["alternative entity 1"]
                    }, {
                        "value": "éntity 1",
                        "synonyms": ["alternative entity 1"]
                    }],
                    "use_synonyms":
                    False,
                    "automatically_extensible":
                    False
                },
                "entity_2": {
                    "data": [{
                        "value": "entity 1",
                        "synonyms": []
                    }, {
                        "value": "Éntity 2",
                        "synonyms": ["Éntity_2", "Alternative entity 2"]
                    }],
                    "use_synonyms":
                    True,
                    "automatically_extensible":
                    False
                },
                "snips/number": {}
            },
            "language": "en",
        }

        dataset = validate_and_format_dataset(dataset)

        utterances = [
            text_to_utterance("hÉllo wOrld Éntity_2"),
            text_to_utterance("beauTiful World entity 1"),
            text_to_utterance("Bird bïrdy"),
        ]

        labeled_utterance = {
            DATA: [{
                TEXT: "beauTiful éntity "
            }, {
                TEXT: "1",
                ENTITY: "snips/number",
                SLOT_NAME: "number"
            }, {
                TEXT: " bIrd Éntity_2"
            }]
        }
        utterances.append(labeled_utterance)
        labels = np.array([0, 0, 1, 1])

        featurizer = Featurizer(
            language,
            None,
            config=FeaturizerConfig(word_clusters_name="brown_clusters")).fit(
                dataset, utterances, labels)

        # When
        utterances = featurizer.preprocess_utterances(utterances)

        # Then
        expected_utterances = [
            "hello world entity_2 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_2",
            "beauty world ent 1 builtinentityfeaturesnipsnumber "
            "entityfeatureentity_1 entityfeatureentity_2 "
            "cluster_1 cluster_3", "bird bird",
            "beauty ent bird entity_2 builtinentityfeaturesnipsnumber "
            "builtinentityfeaturesnipsnumber entityfeatureentity_1 "
            "entityfeatureentity_2 entityfeatureentity_2 cluster_1"
        ]

        self.assertListEqual(utterances, expected_utterances)
Пример #20
0
    def test_fit_transform_should_be_consistent_with_transform(self):
        # Here we mainly test that the output of fit_transform is
        # the same as the result of fit and then transform.
        # We're trying to avoid that for some reason indexes of features
        # get mixed up after feature selection

        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
    - dummy utterance

---
type: entity
name: entity_1
automatically_extensible: false
use_synononyms: false
matching_strictness: 1.0
values:
  - [entity 1, alternative entity 1]
  - [éntity 1, alternative entity 1]

---
type: entity
name: entity_2
automatically_extensible: false
use_synononyms: true
matching_strictness: 1.0
values:
  - entity 1
  - [Éntity 2, Éntity_2, Alternative entity 2]
        """)
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        config = FeaturizerConfig(added_cooccurrence_feature_ratio=.5)
        shared = self.get_shared_data(dataset)
        featurizer = Featurizer(config=config, **shared)

        utterances = [{
            "data": [{
                "text": "hÉllo wOrld "
            }, {
                "text": "Éntity_2",
                "entity": "entity_2"
            }]
        }, {
            "data": [{
                "text": "beauTiful World "
            }, {
                "text": "entity 1",
                "entity": "entity_1"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }, {
            "data": [{
                "text": "Bird bïrdy"
            }]
        }]

        classes = [0, 0, 1, 1]

        # When
        x_0 = featurizer.fit_transform(dataset, utterances, classes,
                                       max(classes))
        x_1 = featurizer.transform(utterances)

        # Then
        self.assertListEqual(x_0.todense().tolist(), x_1.todense().tolist())