def test_should_get_intents(self):
    # Checks the intent names returned by get_intents, in order, for a
    # text identical to the single training utterance of intent2.
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - yala yili
---
type: intent
name: intent2
utterances:
  - yala yili yulu
---
type: intent
name: intent3
utterances:
  - yili yulu yele""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    config = ProbabilisticIntentParserConfig(
        LogRegIntentClassifierConfig(random_seed=42))
    parser = ProbabilisticIntentParser(config).fit(dataset)

    # When
    ranked = parser.get_intents("yala yili yulu")
    intent_names = [item[RES_INTENT_NAME] for item in ranked]

    # Then
    # The trailing None entry stands for the "no intent" result.
    self.assertEqual(["intent2", "intent1", "intent3", None], intent_names)
def test_should_get_intent(self):
    # Fits a classifier on two small intents and checks that a greeting
    # close to the first intent's utterances is classified as that intent.
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
  - how are you
  - hello how are you?
  - what's up
---
type: intent
name: my_second_intent
utterances:
  - what is the weather today ?
  - does it rain
  - will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    classifier = LogRegIntentClassifier(
        LogRegIntentClassifierConfig(random_seed=42))
    classifier = classifier.fit(dataset)

    # When
    result = classifier.get_intent("hey how are you doing ?")

    # Then
    self.assertEqual("my_first_intent", result[RES_INTENT_NAME])
def test_should_be_serializable(self, mock_to_dict):
    # Persists a fitted classifier and compares the two JSON files it
    # writes (metadata.json and intent_classifier.json) with expectations.
    # NOTE(review): mock_to_dict is presumably injected by a patch
    # decorator that is not part of this block -- confirm its target.
    # Given
    featurizer_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}
    mock_to_dict.return_value = featurizer_dict
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    sgd_model = intent_classifier.classifier
    coeffs = sgd_model.coef_.tolist()
    intercept = sgd_model.intercept_.tolist()

    # When
    intent_classifier.persist(self.tmp_file_path)

    # Then
    # Expected intents: the dataset's intent names sorted, then None.
    expected_intents = sorted(SAMPLE_DATASET[INTENTS]) + [None]
    expected_model = {
        "unit_name": "log_reg_intent_classifier",
        "config": LogRegIntentClassifierConfig().to_dict(),
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": 701.0,
        "intent_list": expected_intents,
        "featurizer": featurizer_dict,
    }
    expected_metadata = {"unit_name": "log_reg_intent_classifier"}
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json", expected_metadata)
    self.assertJsonContent(
        self.tmp_file_path / "intent_classifier.json", expected_model)
def test_fitting_should_be_reproducible_after_serialization(self):
    # Two parsers rebuilt from the same serialized dict and fitted on the
    # same dataset must end up with equal CRF state features for "MakeTea".
    # Given
    validated_dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    classifier_seed = 666
    slot_filler_seed = 42
    classifier_config = LogRegIntentClassifierConfig(
        random_seed=classifier_seed)
    slot_filler_config = CRFSlotFillerConfig(random_seed=slot_filler_seed)
    config = ProbabilisticIntentParserConfig(
        intent_classifier_config=classifier_config,
        slot_filler_config=slot_filler_config)
    parser_dict = ProbabilisticIntentParser(config).to_dict()

    # When
    first_parser = ProbabilisticIntentParser.from_dict(parser_dict)
    first_parser = first_parser.fit(validated_dataset)
    second_parser = ProbabilisticIntentParser.from_dict(parser_dict)
    second_parser = second_parser.fit(validated_dataset)

    # Then
    first_weights = \
        first_parser.slot_fillers["MakeTea"].crf_model.state_features_
    second_weights = \
        second_parser.slot_fillers["MakeTea"].crf_model.state_features_
    self.assertEqual(first_weights, second_weights)
def __init__(self, config=None):
    """Initializes the classifier from an optional
    :class:`.LogRegIntentClassifierConfig`

    When *config* is None a default-constructed config is used. The
    ``classifier``, ``intent_list`` and ``featurizer`` attributes all
    start out as None.
    """
    effective_config = config
    if effective_config is None:
        effective_config = LogRegIntentClassifierConfig()
    super(LogRegIntentClassifier, self).__init__(effective_config)
    self.classifier = self.intent_list = self.featurizer = None
def __init__(self, intent_classifier_config=None, slot_filler_config=None):
    """Stores the intent classifier and slot filler configurations

    Each argument left as None is replaced by a default-constructed
    ``LogRegIntentClassifierConfig`` / ``CRFSlotFillerConfig``. Both values
    are then passed through ``get_processing_unit_config`` before being
    stored on the instance.
    """
    classifier_cfg = intent_classifier_config
    if classifier_cfg is None:
        # Imported at the point of use, as in the rest of this method
        from snips_nlu.pipeline.configs import LogRegIntentClassifierConfig
        classifier_cfg = LogRegIntentClassifierConfig()

    filler_cfg = slot_filler_config
    if filler_cfg is None:
        from snips_nlu.pipeline.configs import CRFSlotFillerConfig
        filler_cfg = CRFSlotFillerConfig()

    self.intent_classifier_config = get_processing_unit_config(
        classifier_cfg)
    self.slot_filler_config = get_processing_unit_config(filler_cfg)
def test_probabilistic_intent_parser_config(self):
    # Round trip: from_dict followed by to_dict must give back the input.
    # Given
    classifier_config_dict = LogRegIntentClassifierConfig().to_dict()
    slot_filler_config_dict = CRFSlotFillerConfig().to_dict()
    original_dict = {
        "unit_name": "probabilistic_intent_parser",
        "intent_classifier_config": classifier_config_dict,
        "slot_filler_config": slot_filler_config_dict,
    }

    # When
    round_tripped = ProbabilisticIntentParserConfig.from_dict(
        original_dict).to_dict()

    # Then
    self.assertDictEqual(original_dict, round_tripped)
def test_intent_classifier_config(self):
    # Round trip: from_dict followed by to_dict must give back the input.
    # Given
    augmentation_dict = IntentClassifierDataAugmentationConfig().to_dict()
    featurizer_dict = FeaturizerConfig().to_dict()
    original_dict = {
        "unit_name": LogRegIntentClassifier.unit_name,
        "data_augmentation_config": augmentation_dict,
        "featurizer_config": featurizer_dict,
        "random_seed": 42,
    }

    # When
    round_tripped = LogRegIntentClassifierConfig.from_dict(
        original_dict).to_dict()

    # Then
    self.assertDictEqual(original_dict, round_tripped)
def test_should_be_deserializable(self, mock_from_dict):
    # Writes the model files by hand, loads them with from_path and checks
    # the restored intent list, SGD parameters and config.
    # NOTE(review): mock_from_dict is presumably injected by a patch
    # decorator that is not part of this block -- confirm its target.
    # Given
    featurizer = Featurizer(LANGUAGE_EN, None)
    mock_from_dict.return_value = featurizer
    expected_intents = ["MakeCoffee", "MakeTea", None]
    expected_coeffs = [
        [1.23, 4.5],
        [6.7, 8.90],
        [1.01, 2.345],
    ]
    expected_intercept = [0.34, 0.41, -0.98]
    expected_config = LogRegIntentClassifierConfig().to_dict()
    model = {
        "coeffs": expected_coeffs,
        "intercept": expected_intercept,
        "t_": 701.,
        "intent_list": expected_intents,
        "config": expected_config,
        "featurizer": featurizer.to_dict(),
    }
    self.tmp_file_path.mkdir()
    self.writeJsonContent(
        self.tmp_file_path / "metadata.json",
        {"unit_name": "log_reg_intent_classifier"})
    self.writeJsonContent(
        self.tmp_file_path / "intent_classifier.json", model)

    # When
    loaded = LogRegIntentClassifier.from_path(self.tmp_file_path)

    # Then
    self.assertEqual(loaded.intent_list, expected_intents)
    self.assertIsNotNone(loaded.featurizer)
    sgd_model = loaded.classifier
    self.assertListEqual(sgd_model.coef_.tolist(), expected_coeffs)
    self.assertListEqual(sgd_model.intercept_.tolist(), expected_intercept)
    self.assertDictEqual(loaded.config.to_dict(), expected_config)
def test_should_build_training_data_with_no_data(self):
    # With an empty dataset, build_training_data is expected to return no
    # utterances and an empty intent mapping.
    # Given
    language = LANGUAGE_EN
    empty_dataset = validate_and_format_dataset(get_empty_dataset(language))
    rng = np.random.RandomState(1)

    # When
    augmentation_config = \
        LogRegIntentClassifierConfig().data_augmentation_config
    built = build_training_data(
        empty_dataset, language, augmentation_config, rng)
    # The middle element of the returned triple is not checked here.
    utterances, _, intent_mapping = built

    # Then
    self.assertListEqual(utterances, [])
    self.assertListEqual(intent_mapping, [])
def from_path(cls, path, **shared):
    """Builds a :class:`LogRegIntentClassifier` from a persisted directory

    The directory at *path* must contain the ``intent_classifier.json``
    file produced by :func:`~LogRegIntentClassifier.persist`. Extra keyword
    arguments in *shared* are forwarded both to the class constructor and
    to ``Featurizer.from_path``.

    Raises:
        LoadingError: when ``intent_classifier.json`` is missing
    """
    import numpy as np
    from sklearn.linear_model import SGDClassifier

    root = Path(path)
    json_path = root / "intent_classifier.json"
    if not json_path.exists():
        raise LoadingError("Missing intent classifier model file: %s"
                           % json_path.name)

    with json_path.open(encoding="utf8") as json_file:
        serialized = json.load(json_file)

    # Rebuild the unit itself from its stored config
    loaded_config = LogRegIntentClassifierConfig.from_dict(
        serialized["config"])
    instance = cls(config=loaded_config, **shared)
    instance.intent_list = serialized['intent_list']

    # Rebuild the SGD model; all three fields are read up front so that a
    # missing key fails regardless of whether a model was stored
    stored_coeffs = serialized['coeffs']
    stored_intercept = serialized['intercept']
    stored_t = serialized["t_"]
    sgd_model = None
    if not (stored_coeffs is None or stored_intercept is None):
        sgd_model = SGDClassifier(**LOG_REG_ARGS)
        sgd_model.coef_ = np.array(stored_coeffs)
        sgd_model.intercept_ = np.array(stored_intercept)
        sgd_model.t_ = stored_t
    instance.classifier = sgd_model

    # The "featurizer" entry is a path component relative to the root
    featurizer_name = serialized['featurizer']
    if featurizer_name is not None:
        instance.featurizer = Featurizer.from_path(
            root / featurizer_name, **shared)
    return instance
def test_should_be_serializable(self):
    # Persists a fitted classifier and checks the written JSON files as
    # well as the presence of a "featurizer" sub-directory.
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - foo bar
---
type: intent
name: intent2
utterances:
  - lorem ipsum""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    intent_classifier = LogRegIntentClassifier(random_state=42)
    intent_classifier = intent_classifier.fit(dataset)
    sgd_model = intent_classifier.classifier
    coeffs = sgd_model.coef_.tolist()
    intercept = sgd_model.intercept_.tolist()
    t_ = sgd_model.t_

    # When
    intent_classifier.persist(self.tmp_file_path)

    # Then
    expected_model = {
        "config": LogRegIntentClassifierConfig().to_dict(),
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": t_,
        "intent_list": ["intent1", "intent2", None],
        "featurizer": "featurizer",
    }
    expected_metadata = {"unit_name": "log_reg_intent_classifier"}
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json", expected_metadata)
    self.assertJsonContent(
        self.tmp_file_path / "intent_classifier.json", expected_model)
    featurizer_dir = self.tmp_file_path / "featurizer"
    self.assertTrue(featurizer_dir.exists())
    self.assertTrue(featurizer_dir.is_dir())
def test_fitting_should_be_reproducible_after_serialization(self):
    # Two parsers loaded from the same persisted directory and fitted on
    # the same dataset must end up with equal CRF state features for
    # "MakeTea".
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a [beverage_temperature:Temperature](hot) cup of tea
  - make me [number_of_cups:snips/number](five) tea cups
---
type: intent
name: MakeCoffee
utterances:
  - make me [number_of_cups:snips/number](one) cup of coffee please
  - brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    classifier_seed = 666
    slot_filler_seed = 42
    classifier_config = LogRegIntentClassifierConfig(
        random_seed=classifier_seed)
    slot_filler_config = CRFSlotFillerConfig(random_seed=slot_filler_seed)
    config = ProbabilisticIntentParserConfig(
        intent_classifier_config=classifier_config,
        slot_filler_config=slot_filler_config)
    shared = self.get_shared_data(dataset)
    ProbabilisticIntentParser(config, **shared).persist(self.tmp_file_path)

    # When
    first_parser = ProbabilisticIntentParser.from_path(
        self.tmp_file_path, **shared)
    first_parser = first_parser.fit(dataset)
    second_parser = ProbabilisticIntentParser.from_path(
        self.tmp_file_path, **shared)
    second_parser = second_parser.fit(dataset)

    # Then
    first_weights = \
        first_parser.slot_fillers["MakeTea"].crf_model.state_features_
    second_weights = \
        second_parser.slot_fillers["MakeTea"].crf_model.state_features_
    self.assertEqual(first_weights, second_weights)
def test_should_be_serializable_before_fitting(self):
    # Persists a parser that was never fitted and checks the JSON files
    # written to the target directory.
    # Given
    unfitted_parser = ProbabilisticIntentParser()

    # When
    unfitted_parser.persist(self.tmp_file_path)

    # Then
    expected_config = {
        "unit_name": "probabilistic_intent_parser",
        "slot_filler_config": CRFSlotFillerConfig().to_dict(),
        "intent_classifier_config":
            LogRegIntentClassifierConfig().to_dict(),
    }
    expected_parser = {"config": expected_config, "slot_fillers": []}
    expected_metadata = {"unit_name": "probabilistic_intent_parser"}
    self.assertJsonContent(
        self.tmp_file_path / "metadata.json", expected_metadata)
    self.assertJsonContent(
        self.tmp_file_path / "intent_parser.json", expected_parser)
def test_should_parse_top_intents(self):
    # Parses with top_n=2 and checks both the returned intent names and
    # the slot values extracted for each of them, in order.
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[entity1](foo) bar"
---
type: intent
name: intent2
utterances:
  - foo bar [entity2](baz)
---
type: intent
name: intent3
utterances:
  - foz for [entity3](baz)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    config = ProbabilisticIntentParserConfig(
        LogRegIntentClassifierConfig(random_seed=42),
        CRFSlotFillerConfig(random_seed=42))
    parser = ProbabilisticIntentParser(config)
    parser.fit(dataset)

    # When
    results = parser.parse("foo bar baz", top_n=2)
    intent_names = []
    slot_values = []
    for result in results:
        intent_names.append(result[RES_INTENT][RES_INTENT_NAME])
        slot_values.append([slot[RES_VALUE] for slot in result[RES_SLOTS]])

    # Then
    self.assertListEqual(["intent2", "intent1"], intent_names)
    self.assertListEqual([["baz"], ["foo"]], slot_values)
def test_should_be_serializable_before_fitting(self):
    # Serializes a parser that was never fitted with to_dict and checks
    # the resulting dict.
    # Given
    unfitted_parser = ProbabilisticIntentParser()

    # When
    serialized = unfitted_parser.to_dict()

    # Then
    expected_config = {
        "unit_name": "probabilistic_intent_parser",
        "slot_filler_config": CRFSlotFillerConfig().to_dict(),
        "intent_classifier_config":
            LogRegIntentClassifierConfig().to_dict(),
    }
    expected = {
        "unit_name": "probabilistic_intent_parser",
        "config": expected_config,
        "intent_classifier": None,
        "slot_fillers": {},
    }
    self.assertDictEqual(serialized, expected)
def test_should_be_deserializable(self, mock_from_dict):
    # Builds a classifier dict by hand, loads it with from_dict and checks
    # the restored intent list, SGD parameters and config.
    # NOTE(review): mock_from_dict is presumably injected by a patch
    # decorator that is not part of this block -- confirm its target.
    # Given
    featurizer = Featurizer(LANGUAGE_EN, None)
    mock_from_dict.return_value = featurizer
    expected_intents = ["MakeCoffee", "MakeTea", None]
    expected_coeffs = [
        [1.23, 4.5],
        [6.7, 8.90],
        [1.01, 2.345],
    ]
    expected_intercept = [0.34, 0.41, -0.98]
    expected_config = LogRegIntentClassifierConfig().to_dict()
    serialized = {
        "coeffs": expected_coeffs,
        "intercept": expected_intercept,
        "t_": 701.,
        "intent_list": expected_intents,
        "config": expected_config,
        "featurizer": featurizer.to_dict(),
    }

    # When
    loaded = LogRegIntentClassifier.from_dict(serialized)

    # Then
    self.assertEqual(loaded.intent_list, expected_intents)
    self.assertIsNotNone(loaded.featurizer)
    sgd_model = loaded.classifier
    self.assertListEqual(sgd_model.coef_.tolist(), expected_coeffs)
    self.assertListEqual(sgd_model.intercept_.tolist(), expected_intercept)
    self.assertDictEqual(loaded.config.to_dict(), expected_config)
def from_dict(cls, unit_dict):
    """Builds a :class:`LogRegIntentClassifier` from its dict form

    *unit_dict* must have been produced by
    :func:`~LogRegIntentClassifier.to_dict`. The SGD model is only rebuilt
    when both the coefficients and the intercept are present; otherwise
    the ``classifier`` attribute is None.
    """
    loaded_config = LogRegIntentClassifierConfig.from_dict(
        unit_dict["config"])
    instance = cls(config=loaded_config)

    # All three fields are read up front so that a missing key fails
    # regardless of whether a model was stored
    stored_coeffs = unit_dict['coeffs']
    stored_intercept = unit_dict['intercept']
    stored_t = unit_dict["t_"]
    sgd_model = None
    if not (stored_coeffs is None or stored_intercept is None):
        sgd_model = SGDClassifier(**LOG_REG_ARGS)
        sgd_model.coef_ = np.array(stored_coeffs)
        sgd_model.intercept_ = np.array(stored_intercept)
        sgd_model.t_ = stored_t
    instance.classifier = sgd_model
    instance.intent_list = unit_dict['intent_list']

    featurizer_dict = unit_dict['featurizer']
    if featurizer_dict is not None:
        instance.featurizer = Featurizer.from_dict(featurizer_dict)
    return instance
def test_should_parse_with_filter(self):
    # Parses with the candidate intents restricted to intent1 and intent3
    # and checks the selected intent and its extracted slot.
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
  - "[slot1:entity1](foo) bar"
---
type: intent
name: intent2
utterances:
  - foo bar [slot2:entity2](baz)
---
type: intent
name: intent3
utterances:
  - foz for [slot3:entity3](baz)""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    config = ProbabilisticIntentParserConfig(
        LogRegIntentClassifierConfig(random_seed=42),
        CRFSlotFillerConfig(random_seed=42))
    parser = ProbabilisticIntentParser(config)
    parser.fit(dataset)

    # When
    parsed = parser.parse("foo bar baz", intents=["intent1", "intent3"])

    # Then
    foo_slot = unresolved_slot((0, 3), "foo", "entity1", "slot1")
    self.assertEqual("intent1", parsed[RES_INTENT][RES_INTENT_NAME])
    self.assertEqual([foo_slot], parsed[RES_SLOTS])
def test_should_get_intent_when_filter(self):
    # Checks get_intent when its second argument restricts the candidate
    # intents:
    #   - both intents allowed: the tea utterance yields "MakeTea"
    #   - only "MakeCoffee" allowed: the same utterance yields "MakeCoffee"
    #   - only "MakeCoffee" allowed, unrelated text: no intent (None)
    # Given
    yaml_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
  - make me a cup of tea
  - i want two cups of tea please
  - can you prepare one cup of tea ?
---
type: intent
name: MakeCoffee
utterances:
  - make me a cup of coffee please
  - brew two cups of coffee
  - can you prepare one cup of coffee""")
    dataset = Dataset.from_yaml_files("en", [yaml_stream]).json
    config = LogRegIntentClassifierConfig(random_seed=42)
    classifier = LogRegIntentClassifier(config).fit(dataset)

    # When
    # The first two calls use the same utterance on purpose: only the
    # intent filter differs between them.
    tea_text = "Make me two cups of tea"
    res1 = classifier.get_intent(tea_text, ["MakeCoffee", "MakeTea"])
    res2 = classifier.get_intent(tea_text, ["MakeCoffee"])
    res3 = classifier.get_intent("bla bla bla", ["MakeCoffee"])

    # Then
    self.assertEqual("MakeTea", res1[RES_INTENT_NAME])
    self.assertEqual("MakeCoffee", res2[RES_INTENT_NAME])
    # assertIsNone checks identity and gives a clearer failure message
    # than assertEqual(None, ...).
    self.assertIsNone(res3[RES_INTENT_NAME])