Exemplo n.º 1
0
    def test_should_get_intents(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - yala yili

---
type: intent
name: intent2
utterances:
  - yala yili yulu

---
type: intent
name: intent3
utterances:
  - yili yulu yele""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
        parser_config = ProbabilisticIntentParserConfig(classifier_config)
        parser = ProbabilisticIntentParser(parser_config).fit(dataset)
        text = "yala yili yulu"

        # When
        results = parser.get_intents(text)
        intents = [res[RES_INTENT_NAME] for res in results]

        # Then
        expected_intents = ["intent2", "intent1", "intent3", None]

        self.assertEqual(expected_intents, intents)
    def test_should_get_intent(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up

---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = LogRegIntentClassifierConfig(random_seed=42)
        classifier = LogRegIntentClassifier(config).fit(dataset)
        text = "hey how are you doing ?"

        # When
        res = classifier.get_intent(text)
        intent = res[RES_INTENT_NAME]

        # Then
        self.assertEqual("my_first_intent", intent)
    def test_should_be_serializable(self, mock_to_dict):
        """Check the JSON files written by ``LogRegIntentClassifier.persist``.

        ``mock_to_dict`` is a mock supplied from outside this method
        (presumably by a ``patch`` decorator on the featurizer's ``to_dict``;
        the decorator is not visible here -- confirm). Its return value is
        expected to appear under the ``"featurizer"`` key of the persisted
        classifier.
        """
        # Given
        mocked_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}

        mock_to_dict.return_value = mocked_dict

        dataset = validate_and_format_dataset(SAMPLE_DATASET)

        intent_classifier = LogRegIntentClassifier().fit(dataset)
        coeffs = intent_classifier.classifier.coef_.tolist()
        intercept = intent_classifier.classifier.intercept_.tolist()
        # Read the SGD step counter from the fitted model instead of pinning
        # a magic number: its value depends on the dataset size and on the
        # number of training iterations, so a literal would break as soon as
        # either changes.
        t_ = intent_classifier.classifier.t_

        # When
        intent_classifier.persist(self.tmp_file_path)

        # Then
        # The persisted intent list is expected to be the sorted dataset
        # intents followed by a trailing None entry.
        intent_list = sorted(SAMPLE_DATASET[INTENTS])
        intent_list.append(None)
        expected_dict = {
            "unit_name": "log_reg_intent_classifier",
            "config": LogRegIntentClassifierConfig().to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": intent_list,
            "featurizer": mocked_dict
        }
        metadata = {"unit_name": "log_reg_intent_classifier"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_classifier.json",
                               expected_dict)
Exemplo n.º 4
0
    def test_fitting_should_be_reproducible_after_serialization(self):
        """Two parsers rebuilt from one dict learn identical CRF weights.

        An unfitted, seeded parser is serialized with ``to_dict``; two
        parsers are rebuilt from that dict and fitted on the same dataset,
        then the "MakeTea" slot filler state features are compared.
        """
        # Given
        training_data = validate_and_format_dataset(BEVERAGE_DATASET)

        classifier_seed = 666
        slot_filler_seed = 42
        seeded_config = ProbabilisticIntentParserConfig(
            intent_classifier_config=LogRegIntentClassifierConfig(
                random_seed=classifier_seed),
            slot_filler_config=CRFSlotFillerConfig(
                random_seed=slot_filler_seed))
        serialized_parser = ProbabilisticIntentParser(seeded_config).to_dict()

        # When
        first_parser = ProbabilisticIntentParser.from_dict(serialized_parser)
        first_parser = first_parser.fit(training_data)

        second_parser = ProbabilisticIntentParser.from_dict(serialized_parser)
        second_parser = second_parser.fit(training_data)

        # Then
        first_weights = \
            first_parser.slot_fillers["MakeTea"].crf_model.state_features_
        second_weights = \
            second_parser.slot_fillers["MakeTea"].crf_model.state_features_
        self.assertEqual(first_weights, second_weights)
Exemplo n.º 5
0
 def __init__(self, config=None):
     """Create an unfitted LogReg intent classifier.

     Args:
         config: optional :class:`.LogRegIntentClassifierConfig`; a default
             one is created when omitted.
     """
     effective_config = (
         LogRegIntentClassifierConfig() if config is None else config)
     super(LogRegIntentClassifier, self).__init__(effective_config)
     # Nothing is trained yet: all model parts start out as None.
     self.classifier = None
     self.intent_list = None
     self.featurizer = None
Exemplo n.º 6
0
 def __init__(self, intent_classifier_config=None, slot_filler_config=None):
     """Store the intent classifier and slot filler configurations.

     Each argument falls back to a default config
     (``LogRegIntentClassifierConfig`` / ``CRFSlotFillerConfig``) when it is
     ``None``; both are then passed through ``get_processing_unit_config``.
     """
     classifier_cfg = intent_classifier_config
     if classifier_cfg is None:
         # Imported only when the default is needed.
         from snips_nlu.pipeline.configs import LogRegIntentClassifierConfig
         classifier_cfg = LogRegIntentClassifierConfig()

     filler_cfg = slot_filler_config
     if filler_cfg is None:
         # Imported only when the default is needed.
         from snips_nlu.pipeline.configs import CRFSlotFillerConfig
         filler_cfg = CRFSlotFillerConfig()

     self.intent_classifier_config = get_processing_unit_config(
         classifier_cfg)
     self.slot_filler_config = get_processing_unit_config(filler_cfg)
Exemplo n.º 7
0
    def test_probabilistic_intent_parser_config(self):
        """``from_dict`` followed by ``to_dict`` returns the original dict."""
        # Given
        classifier_part = LogRegIntentClassifierConfig().to_dict()
        slot_filler_part = CRFSlotFillerConfig().to_dict()
        original = {
            "unit_name": "probabilistic_intent_parser",
            "intent_classifier_config": classifier_part,
            "slot_filler_config": slot_filler_part,
        }

        # When
        round_tripped = \
            ProbabilisticIntentParserConfig.from_dict(original).to_dict()

        # Then
        self.assertDictEqual(original, round_tripped)
Exemplo n.º 8
0
    def test_intent_classifier_config(self):
        """``from_dict`` followed by ``to_dict`` returns the original dict."""
        # Given
        augmentation_part = IntentClassifierDataAugmentationConfig().to_dict()
        featurizer_part = FeaturizerConfig().to_dict()
        original = {
            "unit_name": LogRegIntentClassifier.unit_name,
            "data_augmentation_config": augmentation_part,
            "featurizer_config": featurizer_part,
            "random_seed": 42
        }

        # When
        round_tripped = \
            LogRegIntentClassifierConfig.from_dict(original).to_dict()

        # Then
        self.assertDictEqual(original, round_tripped)
    def test_should_be_deserializable(self, mock_from_dict):
        """``from_path`` restores the classifier from hand-written JSON files.

        ``mock_from_dict`` is a mock supplied from outside this method
        (presumably by a ``patch`` decorator that is not visible here --
        confirm); it is set up to return a bare ``Featurizer``.
        """
        # Given
        stub_featurizer = Featurizer(LANGUAGE_EN, None)
        mock_from_dict.return_value = stub_featurizer

        expected_intents = ["MakeCoffee", "MakeTea", None]
        expected_coeffs = [[1.23, 4.5], [6.7, 8.90], [1.01, 2.345]]
        expected_intercept = [0.34, 0.41, -0.98]
        step_counter = 701.
        expected_config = LogRegIntentClassifierConfig().to_dict()

        serialized_classifier = {
            "coeffs": expected_coeffs,
            "intercept": expected_intercept,
            "t_": step_counter,
            "intent_list": expected_intents,
            "config": expected_config,
            "featurizer": stub_featurizer.to_dict(),
        }
        # Lay out the directory the way the loader reads it.
        self.tmp_file_path.mkdir()
        self.writeJsonContent(
            self.tmp_file_path / "metadata.json",
            {"unit_name": "log_reg_intent_classifier"})
        self.writeJsonContent(
            self.tmp_file_path / "intent_classifier.json",
            serialized_classifier)

        # When
        loaded = LogRegIntentClassifier.from_path(self.tmp_file_path)

        # Then
        self.assertEqual(loaded.intent_list, expected_intents)
        self.assertIsNotNone(loaded.featurizer)
        sgd_model = loaded.classifier
        self.assertListEqual(sgd_model.coef_.tolist(), expected_coeffs)
        self.assertListEqual(
            sgd_model.intercept_.tolist(), expected_intercept)
        self.assertDictEqual(loaded.config.to_dict(), expected_config)
Exemplo n.º 10
0
    def test_should_build_training_data_with_no_data(self):
        """An empty dataset yields no utterances and no intent mapping."""
        # Given
        lang = LANGUAGE_EN
        empty_dataset = validate_and_format_dataset(get_empty_dataset(lang))
        rng = np.random.RandomState(1)

        # When
        augmentation_config = \
            LogRegIntentClassifierConfig().data_augmentation_config
        training_output = build_training_data(
            empty_dataset, lang, augmentation_config, rng)
        # The middle element of the returned triple is not checked here.
        built_utterances, _, built_mapping = training_output

        # Then
        self.assertListEqual(built_utterances, [])
        self.assertListEqual(built_mapping, [])
Exemplo n.º 11
0
    def from_path(cls, path, **shared):
        """Builds a :class:`LogRegIntentClassifier` from a persisted directory

        The directory at *path* must have been written by
        :func:`~LogRegIntentClassifier.persist`; the model is read from its
        ``intent_classifier.json`` file.

        Args:
            path: directory holding the persisted classifier
            **shared: extra keyword arguments forwarded to the classifier
                constructor and to ``Featurizer.from_path``

        Raises:
            LoadingError: when ``intent_classifier.json`` is missing
        """
        import numpy as np
        from sklearn.linear_model import SGDClassifier

        root = Path(path)
        json_path = root / "intent_classifier.json"
        if not json_path.exists():
            raise LoadingError("Missing intent classifier model file: %s"
                               % json_path.name)

        with json_path.open(encoding="utf8") as stream:
            serialized = json.load(stream)

        # Rebuild the classifier shell from its stored configuration
        stored_config = LogRegIntentClassifierConfig.from_dict(
            serialized["config"])
        loaded = cls(config=stored_config, **shared)
        loaded.intent_list = serialized['intent_list']

        # Restore the underlying SGD model, if one was persisted
        weights = serialized['coeffs']
        bias = serialized['intercept']
        step_counter = serialized["t_"]
        if weights is None or bias is None:
            loaded.classifier = None
        else:
            sgd_model = SGDClassifier(**LOG_REG_ARGS)
            sgd_model.coef_ = np.array(weights)
            sgd_model.intercept_ = np.array(bias)
            sgd_model.t_ = step_counter
            loaded.classifier = sgd_model

        # The stored featurizer entry is a path relative to the directory
        featurizer_entry = serialized['featurizer']
        if featurizer_entry is not None:
            loaded.featurizer = Featurizer.from_path(
                root / featurizer_entry, **shared)

        return loaded
Exemplo n.º 12
0
    def test_should_be_serializable(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo bar

---
type: intent
name: intent2
utterances:
  - lorem ipsum""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        intent_classifier = LogRegIntentClassifier(
            random_state=42).fit(dataset)
        coeffs = intent_classifier.classifier.coef_.tolist()
        intercept = intent_classifier.classifier.intercept_.tolist()
        t_ = intent_classifier.classifier.t_

        # When
        intent_classifier.persist(self.tmp_file_path)

        # Then
        intent_list = ["intent1", "intent2", None]
        expected_dict = {
            "config": LogRegIntentClassifierConfig().to_dict(),
            "coeffs": coeffs,
            "intercept": intercept,
            "t_": t_,
            "intent_list": intent_list,
            "featurizer": "featurizer"
        }
        metadata = {"unit_name": "log_reg_intent_classifier"}
        self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
        self.assertJsonContent(self.tmp_file_path / "intent_classifier.json",
                               expected_dict)
        featurizer_path = self.tmp_file_path / "featurizer"
        self.assertTrue(featurizer_path.exists())
        self.assertTrue(featurizer_path.is_dir())
Exemplo n.º 13
0
    def test_fitting_should_be_reproducible_after_serialization(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

        seed1 = 666
        seed2 = 42
        config = ProbabilisticIntentParserConfig(
            intent_classifier_config=LogRegIntentClassifierConfig(
                random_seed=seed1),
            slot_filler_config=CRFSlotFillerConfig(random_seed=seed2))
        shared = self.get_shared_data(dataset)
        parser = ProbabilisticIntentParser(config, **shared)
        parser.persist(self.tmp_file_path)

        # When
        fitted_parser_1 = ProbabilisticIntentParser.from_path(
            self.tmp_file_path, **shared).fit(dataset)

        fitted_parser_2 = ProbabilisticIntentParser.from_path(
            self.tmp_file_path, **shared).fit(dataset)

        # Then
        feature_weights_1 = fitted_parser_1.slot_fillers[
            "MakeTea"].crf_model.state_features_
        feature_weights_2 = fitted_parser_2.slot_fillers[
            "MakeTea"].crf_model.state_features_
        self.assertEqual(feature_weights_1, feature_weights_2)
Exemplo n.º 14
0
    def test_should_be_serializable_before_fitting(self):
        """An unfitted parser persists its config and an empty filler list."""
        # Given
        unfitted = ProbabilisticIntentParser()

        # When
        unfitted.persist(self.tmp_file_path)

        # Then
        expected_config = {
            "unit_name": "probabilistic_intent_parser",
            "slot_filler_config": CRFSlotFillerConfig().to_dict(),
            "intent_classifier_config":
                LogRegIntentClassifierConfig().to_dict()
        }
        expected_parser = {"config": expected_config, "slot_fillers": []}
        self.assertJsonContent(
            self.tmp_file_path / "metadata.json",
            {"unit_name": "probabilistic_intent_parser"})
        self.assertJsonContent(
            self.tmp_file_path / "intent_parser.json", expected_parser)
Exemplo n.º 15
0
    def test_should_parse_top_intents(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [entity3](baz)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
        parser_config = ProbabilisticIntentParserConfig(
            classifier_config, slot_filler_config)
        parser = ProbabilisticIntentParser(parser_config)
        parser.fit(dataset)
        text = "foo bar baz"

        # When
        results = parser.parse(text, top_n=2)
        intents = [res[RES_INTENT][RES_INTENT_NAME] for res in results]
        entities = [[s[RES_VALUE] for s in res[RES_SLOTS]] for res in results]

        # Then
        expected_intents = ["intent2", "intent1"]
        expected_entities = [["baz"], ["foo"]]

        self.assertListEqual(expected_intents, intents)
        self.assertListEqual(expected_entities, entities)
Exemplo n.º 16
0
    def test_should_be_serializable_before_fitting(self):
        """``to_dict`` on an unfitted parser has no classifier or fillers."""
        # Given
        unfitted = ProbabilisticIntentParser()

        # When
        serialized = unfitted.to_dict()

        # Then
        expected_config = {
            "unit_name": "probabilistic_intent_parser",
            "slot_filler_config": CRFSlotFillerConfig().to_dict(),
            "intent_classifier_config":
                LogRegIntentClassifierConfig().to_dict()
        }
        expected = {
            "unit_name": "probabilistic_intent_parser",
            "config": expected_config,
            "intent_classifier": None,
            "slot_fillers": {},
        }
        self.assertDictEqual(serialized, expected)
Exemplo n.º 17
0
    def test_should_be_deserializable(self, mock_from_dict):
        """``from_dict`` restores the classifier from a hand-written dict.

        ``mock_from_dict`` is a mock supplied from outside this method
        (presumably by a ``patch`` decorator that is not visible here --
        confirm); it is set up to return a bare ``Featurizer``.
        """
        # Given
        stub_featurizer = Featurizer(LANGUAGE_EN, None)
        mock_from_dict.return_value = stub_featurizer

        expected_intents = ["MakeCoffee", "MakeTea", None]
        expected_coeffs = [[1.23, 4.5], [6.7, 8.90], [1.01, 2.345]]
        expected_intercept = [0.34, 0.41, -0.98]
        step_counter = 701.
        expected_config = LogRegIntentClassifierConfig().to_dict()

        serialized_classifier = {
            "coeffs": expected_coeffs,
            "intercept": expected_intercept,
            "t_": step_counter,
            "intent_list": expected_intents,
            "config": expected_config,
            "featurizer": stub_featurizer.to_dict(),
        }

        # When
        loaded = LogRegIntentClassifier.from_dict(serialized_classifier)

        # Then
        self.assertEqual(loaded.intent_list, expected_intents)
        self.assertIsNotNone(loaded.featurizer)
        sgd_model = loaded.classifier
        self.assertListEqual(sgd_model.coef_.tolist(), expected_coeffs)
        self.assertListEqual(
            sgd_model.intercept_.tolist(), expected_intercept)
        self.assertDictEqual(loaded.config.to_dict(), expected_config)
Exemplo n.º 18
0
    def from_dict(cls, unit_dict):
        """Builds a :class:`LogRegIntentClassifier` instance from a dict

        The dict is expected to come from
        :func:`~LogRegIntentClassifier.to_dict`. The SGD model is restored
        only when both ``coeffs`` and ``intercept`` are present, and the
        featurizer only when its entry is not ``None``.
        """
        stored_config = LogRegIntentClassifierConfig.from_dict(
            unit_dict["config"])
        restored = cls(config=stored_config)

        weights = unit_dict['coeffs']
        bias = unit_dict['intercept']
        step_counter = unit_dict["t_"]
        if weights is None or bias is None:
            restored.classifier = None
        else:
            sgd_model = SGDClassifier(**LOG_REG_ARGS)
            sgd_model.coef_ = np.array(weights)
            sgd_model.intercept_ = np.array(bias)
            sgd_model.t_ = step_counter
            restored.classifier = sgd_model

        restored.intent_list = unit_dict['intent_list']

        featurizer_dict = unit_dict['featurizer']
        if featurizer_dict is not None:
            restored.featurizer = Featurizer.from_dict(featurizer_dict)
        return restored
Exemplo n.º 19
0
    def test_should_parse_with_filter(self):
        dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[slot1:entity1](foo) bar"

---
type: intent
name: intent2
utterances:
  - foo bar [slot2:entity2](baz)

---
type: intent
name: intent3
utterances:
  - foz for [slot3:entity3](baz)""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        classifier_config = LogRegIntentClassifierConfig(random_seed=42)
        slot_filler_config = CRFSlotFillerConfig(random_seed=42)
        parser_config = ProbabilisticIntentParserConfig(
            classifier_config, slot_filler_config)
        parser = ProbabilisticIntentParser(parser_config)
        parser.fit(dataset)
        text = "foo bar baz"

        # When
        result = parser.parse(text, intents=["intent1", "intent3"])

        # Then
        expected_slots = [unresolved_slot((0, 3), "foo", "entity1", "slot1")]

        self.assertEqual("intent1", result[RES_INTENT][RES_INTENT_NAME])
        self.assertEqual(expected_slots, result[RES_SLOTS])
    def test_should_get_intent_when_filter(self):
        # Given
        dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a cup of tea
- i want two cups of tea please
- can you prepare one cup of tea ?

---
type: intent
name: MakeCoffee
utterances:
- make me a cup of coffee please
- brew two cups of coffee
- can you prepare one cup of coffee""")
        dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
        config = LogRegIntentClassifierConfig(random_seed=42)
        classifier = LogRegIntentClassifier(config).fit(dataset)

        # When
        text1 = "Make me two cups of tea"
        res1 = classifier.get_intent(text1, ["MakeCoffee", "MakeTea"])

        text2 = "Make me two cups of tea"
        res2 = classifier.get_intent(text2, ["MakeCoffee"])

        text3 = "bla bla bla"
        res3 = classifier.get_intent(text3, ["MakeCoffee"])

        # Then
        self.assertEqual("MakeTea", res1[RES_INTENT_NAME])
        self.assertEqual("MakeCoffee", res2[RES_INTENT_NAME])
        self.assertEqual(None, res3[RES_INTENT_NAME])