def test_missing_intent_key_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "unknown entity",
                             "entity": "unknown_entity"}
                        ]
                    }
                ]
            }
        },
        "entities": {},
        "language": "en",
        "snips_nlu_version": "1.1.1"
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(str(ctx.exception.args[0]),
                     "Expected chunk to have key: 'slot_name'")

def test_should_not_require_data_for_builtin_entities(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "10p.m",
                             "entity": SNIPS_DATETIME,
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {SNIPS_DATETIME: {}},
        "language": "en",
    }

    # When / Then
    with self.fail_if_exception("Could not validate dataset"):
        validate_and_format_dataset(dataset)

def test_missing_intent_key_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "unknown entity",
                             "entity": "unknown_entity"}
                        ]
                    }
                ]
            }
        },
        "entities": {},
        "language": "en",
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual("Expected chunk to have key: 'slot_name'",
                     str(ctx.exception.args[0]))

def test_unknown_entity_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "unknown entity",
                             "entity": "unknown_entity",
                             "slot_name": "unknown_entity_slot"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": True,
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "1.1.1"
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(str(ctx.exception.args[0]),
                     "Expected entities to have key: 'unknown_entity'")

def test_unknown_entity_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "unknown entity",
                             "entity": "unknown_entity",
                             "slot_name": "unknown_entity_slot"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": True,
                "automatically_extensible": False
            }
        },
        "language": "en",
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual("Expected entities to have key: 'unknown_entity'",
                     str(ctx.exception.args[0]))

def test_should_not_require_data_for_builtin_entities(self):
    # Given
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "10p.m",
                             "entity": SNIPS_DATETIME,
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {SNIPS_DATETIME: {}},
        "language": "en",
        "snips_nlu_version": "0.1.0"
    }

    # When / Then
    with self.fail_if_exception("Could not validate dataset"):
        validate_and_format_dataset(dataset)

def test_should_generate_dataset_from_yaml_files(self, mock_io):
    # Given
    intent_file_1 = "whoIsGame.yaml"
    intent_file_2 = "getWeather.yaml"
    entity_file_1 = "location.yaml"

    who_is_game_yaml = """
# whoIsGame Intent
---
type: intent
name: whoIsGame
utterances:
- who is the [role](president) of [country](France)
- who is the [role](CEO) of [company](Google) please
"""

    get_weather_yaml = """
# getWeather Intent
---
type: intent
name: getWeather
utterances:
- what is the weather in [weatherLocation:location](Paris)?
- is it raining in [weatherLocation] [weatherDate:snips/datetime]
"""

    location_yaml = """
# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
- [new york, big apple]
- london
"""

    # pylint:disable=unused-argument
    def mock_open(filename, **kwargs):
        if filename == intent_file_1:
            return io.StringIO(who_is_game_yaml)
        if filename == intent_file_2:
            return io.StringIO(get_weather_yaml)
        if filename == entity_file_1:
            return io.StringIO(location_yaml)
        return None
    # pylint:enable=unused-argument

    mock_io.open.side_effect = mock_open
    dataset_files = [intent_file_1, intent_file_2, entity_file_1]

    # When
    dataset = Dataset.from_yaml_files("en", dataset_files)
    dataset_dict = dataset.json

    # Then
    validate_and_format_dataset(dataset_dict)
    self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict)

def test_should_generate_dataset_from_file(self):
    # Given
    dataset_path_1 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                  "examples", "whoIsGame.txt")
    dataset_path_2 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                  "examples", "getWeather.txt")
    dataset = AssistantDataset.from_files(
        "en", [dataset_path_1, dataset_path_2])
    dataset_dict = dataset.json

    # When / Then
    validate_and_format_dataset(dataset_dict)

def test_invalid_language_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {},
        "entities": {},
        "language": "eng",
    }

    # When/Then
    with self.assertRaises(ValueError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual("Unknown language: 'eng'",
                     str(ctx.exception.args[0]))

def test_invalid_language_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {},
        "entities": {},
        "language": "eng",
        "snips_nlu_version": "1.1.1"
    }

    # When/Then
    with self.assertRaises(ValueError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(str(ctx.exception.args[0]),
                     "Unknown language: 'eng'")

def fit(self, dataset, force_retrain=True): """Fits the intent parser with a valid Snips dataset""" logger.info("Fitting lookup intent parser...") dataset = validate_and_format_dataset(dataset) self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) self.language = dataset[LANGUAGE] self._entity_scopes = _get_entity_scopes(dataset) self._map = dict() self._stop_words_whitelist = get_stop_words_whitelist( dataset, self._stop_words) entity_placeholders = _get_entity_placeholders(dataset, self.language) ambiguous_keys = set() for (key, val) in self._generate_io_mapping(dataset[INTENTS], entity_placeholders): key = hash_str(key) # handle key collisions -*- flag ambiguous entries -*- if key in self._map and self._map[key] != val: ambiguous_keys.add(key) else: self._map[key] = val # delete ambiguous keys for key in ambiguous_keys: self._map.pop(key) return self
def test_should_support_int_or_float_for_matching_strictness(self):
    # Given
    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [],
                "automatically_extensible": False,
                "use_synonyms": True,
                "matching_strictness": 0.5
            },
            "entity2": {
                "data": [],
                "automatically_extensible": False,
                "use_synonyms": True,
                "matching_strictness": 1
            }
        },
        "language": "en",
    }

    # When/Then
    dataset = validate_and_format_dataset(dataset)
    self.assertEqual(
        0.5, dataset["entities"]["entity1"].get("matching_strictness"))
    self.assertEqual(
        1, dataset["entities"]["entity2"].get("matching_strictness"))

def test_engine_should_fit_with_builtins_entities(self):
    # Given
    dataset = validate_and_format_dataset({
        "intents": {
            "dummy": {
                "utterances": [
                    {
                        "data": [
                            {"text": "10p.m.",
                             "entity": "snips/datetime",
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {"snips/datetime": {}},
        "language": "en",
        "snips_nlu_version": "0.0.1"
    })

    # When / Then
    SnipsNLUEngine().fit(dataset)  # This should not raise any error

def test_should_not_build_builtin_parser_when_provided(self):
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups

---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    dataset = validate_and_format_dataset(dataset)
    builtin_entity_parser = BuiltinEntityParser.build(language="en")

    # When
    with patch("snips_nlu.entity_parser.builtin_entity_parser"
               ".BuiltinEntityParser.build") as mocked_build_parser:
        engine = SnipsNLUEngine(
            builtin_entity_parser=builtin_entity_parser)
        engine.fit(dataset)

    # Then
    mocked_build_parser.assert_not_called()

def test_should_be_serializable(self, mock_to_dict):
    # Given
    mocked_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}
    mock_to_dict.return_value = mocked_dict

    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    coeffs = intent_classifier.classifier.coef_.tolist()
    intercept = intent_classifier.classifier.intercept_.tolist()

    # When
    intent_classifier.persist(self.tmp_file_path)

    # Then
    intent_list = sorted(SAMPLE_DATASET[INTENTS])
    intent_list.append(None)
    expected_dict = {
        "unit_name": "log_reg_intent_classifier",
        "config": LogRegIntentClassifierConfig().to_dict(),
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": 701.0,
        "intent_list": intent_list,
        "featurizer": mocked_dict
    }
    metadata = {"unit_name": "log_reg_intent_classifier"}
    self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_classifier.json",
                           expected_dict)

def test_should_be_serializable(self, mocked_generate_regexes):
    # Given
    # pylint: disable=unused-argument
    def mock_generate_patterns(utterances, joined_entity_utterances,
                               group_names_to_slot_names, language):
        patterns = ["mocked_regex_%s" % i for i in range(len(utterances))]
        group_to_slot = {"group_0": "dummy slot name"}
        return patterns, group_to_slot
    # pylint: enable=unused-argument

    mocked_generate_regexes.side_effect = mock_generate_patterns
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=100)
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 100
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group_0": "dummy slot name"
        },
        "patterns": {
            "dummy_intent_1": [
                "mocked_regex_0",
                "mocked_regex_1",
                "mocked_regex_2",
                "mocked_regex_3"
            ],
            "dummy_intent_2": [
                "mocked_regex_0"
            ]
        },
        "slot_names_to_entities": {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name3": "dummy_entity_2",
                "dummy_slot_name2": "dummy_entity_2"
            },
            "dummy_intent_2": {
                "dummy slot nàme": "dummy_entity_1"
            }
        }
    }
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_dict)

def test_should_be_serializable(self, mock_to_dict):
    # Given
    mocked_dict = {"mocked_featurizer_key": "mocked_featurizer_value"}
    mock_to_dict.return_value = mocked_dict

    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    coeffs = intent_classifier.classifier.coef_.tolist()
    intercept = intent_classifier.classifier.intercept_.tolist()

    # When
    classifier_dict = intent_classifier.to_dict()

    # Then
    intent_list = sorted(SAMPLE_DATASET[INTENTS])
    intent_list.append(None)
    expected_dict = {
        "unit_name": "log_reg_intent_classifier",
        "config": LogRegIntentClassifierConfig().to_dict(),
        "coeffs": coeffs,
        "intercept": intercept,
        "t_": 701.0,
        "intent_list": intent_list,
        "featurizer": mocked_dict
    }
    self.assertEqual(expected_dict, classifier_dict)

def build(cls, dataset, parser_usage):
    from snips_nlu.dataset import validate_and_format_dataset

    dataset = validate_and_format_dataset(dataset)
    language = dataset[LANGUAGE]
    custom_entities = {
        entity_name: deepcopy(entity)
        for entity_name, entity in iteritems(dataset[ENTITIES])
        if not is_builtin_entity(entity_name)
    }
    if parser_usage == CustomEntityParserUsage.WITH_AND_WITHOUT_STEMS:
        for ent in viewvalues(custom_entities):
            stemmed_utterances = _stem_entity_utterances(
                ent[UTTERANCES], language)
            ent[UTTERANCES] = _merge_entity_utterances(
                ent[UTTERANCES], stemmed_utterances)
    elif parser_usage == CustomEntityParserUsage.WITH_STEMS:
        for ent in viewvalues(custom_entities):
            ent[UTTERANCES] = _stem_entity_utterances(
                ent[UTTERANCES], language)
    elif parser_usage is None:
        raise ValueError("A parser usage must be defined in order to fit "
                         "a CustomEntityParser")
    configuration = _create_custom_entity_parser_configuration(
        custom_entities)
    parser = GazetteerEntityParser.build(configuration)
    return cls(parser, language, parser_usage)

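# Illustrative usage sketch for the build classmethod above, not taken from
# this repo's tests. Assumptions: CustomEntityParser and
# CustomEntityParserUsage are importable from snips_nlu.entity_parser, and
# WITHOUT_STEMS is a valid usage value (it falls through the stemming branches
# above, so no stemmed variants are added to the gazetteer).
def _example_custom_entity_parser(dataset):
    from snips_nlu.entity_parser import (CustomEntityParser,
                                         CustomEntityParserUsage)

    parser = CustomEntityParser.build(
        dataset, CustomEntityParserUsage.WITHOUT_STEMS)
    # Returns gazetteer matches of the dataset's custom entity values
    return parser.parse("set the lights to blue")
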
def fit(self, dataset, intent):
    """Fit the slot filler

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): The specific intent of the dataset to train
            the slot filler on

    Returns:
        :class:`CRFSlotFiller`: The same instance, trained
    """
    logger.debug("Fitting %s slot filler...", intent)
    dataset = validate_and_format_dataset(dataset)
    self.fit_builtin_entity_parser_if_needed(dataset)
    self.fit_custom_entity_parser_if_needed(dataset)
    self.language = dataset[LANGUAGE]
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)

    if not self.slot_name_mapping:
        # No need to train the CRF if the intent has no slots
        return self

    random_state = check_random_state(self.config.random_seed)
    augmented_intent_utterances = augment_utterances(
        dataset, self.intent, language=self.language,
        random_state=random_state,
        **self.config.data_augmentation_config.to_dict())

    crf_samples = [
        utterance_to_sample(u[DATA], self.config.tagging_scheme,
                            self.language)
        for u in augmented_intent_utterances
    ]

    for factory in self.features_factories:
        factory.fit(dataset, intent)

    # Ensure that X, Y are safe and that the OUTSIDE label is learnt to
    # avoid segfault at inference time
    # pylint: disable=C0103
    X = [
        self.compute_features(sample[TOKENS], drop_out=True)
        for sample in crf_samples
    ]
    Y = [[tag for tag in sample[TAGS]] for sample in crf_samples]
    X, Y = _ensure_safe(X, Y)
    # ensure ascii tags
    Y = [[_encode_tag(tag) for tag in y] for y in Y]
    # pylint: enable=C0103

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(X, Y)

    logger.debug("Most relevant features for %s:\n%s", self.intent,
                 DifferedLoggingMessage(self.log_weights))
    return self

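# Usage sketch mirroring the builtin-slots test elsewhere in this section:
# fit the CRF on a single intent of a validated dataset, then tag a sentence.
# "SearchWeatherForecast" and the utterance come from that test; using the
# default config (no argument) is an assumption for brevity.
def _example_crf_slot_filler(dataset):
    from snips_nlu.slot_filler import CRFSlotFiller

    slot_filler = CRFSlotFiller().fit(dataset,
                                      intent="SearchWeatherForecast")
    return slot_filler.get_slots("Give me the weather at 9p.m. in Paris")
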
def fit(self, dataset, force_retrain=True): """Fit the slot filler Args: dataset (dict): A valid Snips dataset force_retrain (bool, optional): If *False*, will not retrain intent classifier and slot fillers when they are already fitted. Default to *True*. Returns: :class:`ProbabilisticIntentParser`: The same instance, trained """ dataset = validate_and_format_dataset(dataset) intents = list(dataset[INTENTS]) if self.intent_classifier is None: self.intent_classifier = build_processing_unit( self.config.intent_classifier_config) if force_retrain or not self.intent_classifier.fitted: self.intent_classifier.fit(dataset) if self.slot_fillers is None: self.slot_fillers = dict() for intent_name in intents: # We need to copy the slot filler config as it may be mutated if self.slot_fillers.get(intent_name) is None: slot_filler_config = deepcopy(self.config.slot_filler_config) self.slot_fillers[intent_name] = build_processing_unit( slot_filler_config) if force_retrain or not self.slot_fillers[intent_name].fitted: self.slot_fillers[intent_name].fit(dataset, intent_name) return self
def fit(self, dataset, force_retrain=True): """Fit the NLU engine Args: dataset (dict): A valid Snips dataset force_retrain (bool, optional): If *False*, will not retrain intent parsers when they are already fitted. Default to *True*. Returns: The same object, trained. """ logger.info("Fitting NLU engine...") dataset = validate_and_format_dataset(dataset) self._dataset_metadata = _get_dataset_metadata(dataset) if self.config is None: language = self._dataset_metadata["language_code"] self.config = self.config_type.from_dict(DEFAULT_CONFIGS[language]) parsers = [] for parser_config in self.config.intent_parsers_configs: # Re-use existing parsers to allow pre-training recycled_parser = None for parser in self.intent_parsers: if parser.unit_name == parser_config.unit_name: recycled_parser = parser break if recycled_parser is None: recycled_parser = build_processing_unit(parser_config) if force_retrain or not recycled_parser.fitted: recycled_parser.fit(dataset, force_retrain) parsers.append(recycled_parser) self.intent_parsers = parsers return self
def fit(self, dataset, force_retrain=True): """Fit the NLU engine Args: dataset (dict): A valid Snips dataset force_retrain (bool, optional): If *False*, will not retrain intent parsers when they are already fitted. Default to *True*. Returns: The same object, trained. """ dataset = validate_and_format_dataset(dataset) self._dataset_metadata = _get_dataset_metadata(dataset) if self.config is None: language = self._dataset_metadata["language_code"] self.config = self.config_type.from_dict(DEFAULT_CONFIGS[language]) parsers = [] for parser_config in self.config.intent_parsers_configs: # Re-use existing parsers to allow pre-training recycled_parser = None for parser in self.intent_parsers: if parser.unit_name == parser_config.unit_name: recycled_parser = parser break if recycled_parser is None: recycled_parser = build_processing_unit(parser_config) if force_retrain or not recycled_parser.fitted: recycled_parser.fit(dataset, force_retrain) parsers.append(recycled_parser) self.intent_parsers = parsers return self
def test_should_normalize_synonyms(self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation.lower(), variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "ëNtity",
                             "entity": "entity1",
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "language": "en",
    }

    expected_dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "ëNtity",
                             "entity": "entity1",
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "utterances": {
                    "ëntity": "ëNtity",
                    "Ëntity": "ëNtity",
                    "ëNtity": "ëNtity"
                },
                "automatically_extensible": True,
                "capitalize": False
            }
        },
        "language": "en",
        "validated": True
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertDictEqual(expected_dataset, dataset)

def test_should_get_builtin_slots(self):
    # Given
    dataset = validate_and_format_dataset(WEATHER_DATASET)
    config = CRFSlotFillerConfig(random_seed=42)
    intent = "SearchWeatherForecast"
    slot_filler = CRFSlotFiller(config)
    slot_filler.fit(dataset, intent)

    # When
    slots = slot_filler.get_slots("Give me the weather at 9p.m. in Paris")

    # Then
    expected_slots = [
        unresolved_slot(match_range={START: 20, END: 28},
                        value='at 9p.m.',
                        entity='snips/datetime',
                        slot_name='datetime'),
        unresolved_slot(match_range={START: 32, END: 37},
                        value='Paris',
                        entity='weather_location',
                        slot_name='location')
    ]
    self.assertListEqual(expected_slots, slots)

def fit(self, dataset, force_retrain=True): """Fit the slot filler Args: dataset (dict): A valid Snips dataset force_retrain (bool, optional): If *False*, will not retrain intent classifier and slot fillers when they are already fitted. Default to *True*. Returns: :class:`ProbabilisticIntentParser`: The same instance, trained """ logger.info("Fitting probabilistic intent parser...") dataset = validate_and_format_dataset(dataset) intents = list(dataset[INTENTS]) if self.intent_classifier is None: self.intent_classifier = build_processing_unit( self.config.intent_classifier_config) if force_retrain or not self.intent_classifier.fitted: self.intent_classifier.fit(dataset) if self.slot_fillers is None: self.slot_fillers = dict() slot_fillers_start = datetime.now() for intent_name in intents: # We need to copy the slot filler config as it may be mutated if self.slot_fillers.get(intent_name) is None: slot_filler_config = deepcopy(self.config.slot_filler_config) self.slot_fillers[intent_name] = build_processing_unit( slot_filler_config) if force_retrain or not self.slot_fillers[intent_name].fitted: self.slot_fillers[intent_name].fit(dataset, intent_name) logger.debug("Fitted slot fillers in %s", elapsed_since(slot_fillers_start)) return self
def fit(self, dataset): """Fit the intent classifier with a valid Snips dataset Returns: :class:`LogRegIntentClassifier`: The same instance, trained """ logger.debug("Fitting LogRegIntentClassifier...") dataset = validate_and_format_dataset(dataset) language = dataset[LANGUAGE] random_state = check_random_state(self.config.random_seed) data_augmentation_config = self.config.data_augmentation_config utterances, classes, intent_list = build_training_data( dataset, language, data_augmentation_config, random_state) self.intent_list = intent_list if len(self.intent_list) <= 1: return self self.featurizer = Featurizer( language, data_augmentation_config.unknown_words_replacement_string, self.config.featurizer_config) self.featurizer = self.featurizer.fit(dataset, utterances, classes) if self.featurizer is None: return self X = self.featurizer.transform(utterances) # pylint: disable=C0103 alpha = get_regularization_factor(dataset) self.classifier = SGDClassifier(random_state=random_state, alpha=alpha, **LOG_REG_ARGS) self.classifier.fit(X, classes) logger.debug("%s", DifferedLoggingMessage(self.log_best_features)) return self
def test_should_compute_features(self):
    # Given
    features_factories = [
        {
            "factory_name": NgramFactory.name,
            "args": {
                "n": 1,
                "use_stemming": False,
                "common_words_gazetteer_name": None
            },
            "offsets": [0],
            "drop_out": 0.3
        },
    ]
    slot_filler_config = CRFSlotFillerConfig(
        feature_factory_configs=features_factories, random_seed=40)
    slot_filler = CRFSlotFiller(slot_filler_config)

    tokens = tokenize("foo hello world bar", LANGUAGE_EN)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    slot_filler.fit(dataset, intent="dummy_intent_1")

    # When
    features_with_drop_out = slot_filler.compute_features(tokens, True)

    # Then
    expected_features = [
        {"ngram_1": "foo"},
        {},
        {"ngram_1": "world"},
        {},
    ]
    self.assertListEqual(expected_features, features_with_drop_out)

def fit(self, dataset, force_retrain=True): """Fits the intent parser with a valid Snips dataset""" logger.info("Fitting deterministic parser...") dataset = validate_and_format_dataset(dataset) self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) self.language = dataset[LANGUAGE] self.regexes_per_intent = dict() entity_placeholders = _get_entity_placeholders(dataset, self.language) self.slot_names_to_entities = get_slot_name_mappings(dataset) self.group_names_to_slot_names = _get_group_names_to_slot_names( self.slot_names_to_entities) # Do not use ambiguous patterns that appear in more than one intent all_patterns = set() ambiguous_patterns = set() intent_patterns = dict() for intent_name, intent in iteritems(dataset[INTENTS]): patterns = self._generate_patterns(intent[UTTERANCES], entity_placeholders) patterns = [ p for p in patterns if len(p) < self.config.max_pattern_length ] existing_patterns = {p for p in patterns if p in all_patterns} ambiguous_patterns.update(existing_patterns) all_patterns.update(set(patterns)) intent_patterns[intent_name] = patterns for intent_name, patterns in iteritems(intent_patterns): patterns = [p for p in patterns if p not in ambiguous_patterns] patterns = patterns[:self.config.max_queries] regexes = [re.compile(p, re.IGNORECASE) for p in patterns] self.regexes_per_intent[intent_name] = regexes return self
def test_fitting_should_be_reproducible_after_serialization(self):
    # Given
    dataset = BEVERAGE_DATASET
    validated_dataset = validate_and_format_dataset(dataset)

    seed1 = 666
    seed2 = 42
    config = ProbabilisticIntentParserConfig(
        intent_classifier_config=LogRegIntentClassifierConfig(
            random_seed=seed1),
        slot_filler_config=CRFSlotFillerConfig(random_seed=seed2))
    parser = ProbabilisticIntentParser(config)
    parser_dict = parser.to_dict()

    # When
    fitted_parser_1 = ProbabilisticIntentParser.from_dict(
        parser_dict).fit(validated_dataset)
    fitted_parser_2 = ProbabilisticIntentParser.from_dict(
        parser_dict).fit(validated_dataset)

    # Then
    feature_weights_1 = fitted_parser_1.slot_fillers[
        "MakeTea"].crf_model.state_features_
    feature_weights_2 = fitted_parser_2.slot_fillers[
        "MakeTea"].crf_model.state_features_
    self.assertEqual(feature_weights_1, feature_weights_2)

def fit(self, dataset): """Fits the intent classifier with a valid Snips dataset Returns: :class:`LogRegIntentClassifier`: The same instance, trained """ from sklearn.linear_model import SGDClassifier from sklearn.utils import compute_class_weight logger.info("Fitting LogRegIntentClassifier...") dataset = validate_and_format_dataset(dataset) self.load_resources_if_needed(dataset[LANGUAGE]) self.fit_builtin_entity_parser_if_needed(dataset) self.fit_custom_entity_parser_if_needed(dataset) language = dataset[LANGUAGE] data_augmentation_config = self.config.data_augmentation_config utterances, classes, intent_list = build_training_data( dataset, language, data_augmentation_config, self.resources, self.random_state) self.intent_list = intent_list if len(self.intent_list) <= 1: return self self.featurizer = Featurizer( config=self.config.featurizer_config, builtin_entity_parser=self.builtin_entity_parser, custom_entity_parser=self.custom_entity_parser, resources=self.resources, random_state=self.random_state, ) self.featurizer.language = language none_class = max(classes) try: x = self.featurizer.fit_transform(dataset, utterances, classes, none_class) except _EmptyDatasetUtterancesError: logger.warning("No (non-empty) utterances found in dataset") self.featurizer = None return self alpha = get_regularization_factor(dataset) class_weights_arr = compute_class_weight("balanced", range(none_class + 1), classes) # Re-weight the noise class class_weights_arr[-1] *= self.config.noise_reweight_factor class_weight = {idx: w for idx, w in enumerate(class_weights_arr)} self.classifier = SGDClassifier(random_state=self.random_state, alpha=alpha, class_weight=class_weight, **LOG_REG_ARGS) self.classifier.fit(x, classes) logger.debug("%s", DifferedLoggingMessage(self.log_best_features)) return self
def test_should_generate_dataset_from_file(self):
    # Given
    dataset_path_1 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                  "examples", "whoIsGame.txt")
    dataset_path_2 = os.path.join(ROOT_PATH, "snips_nlu_dataset",
                                  "examples", "getWeather.txt")
    dataset = AssistantDataset.from_files(
        "en", [dataset_path_1, dataset_path_2])
    dataset_dict = dataset.json

    # When / Then
    validate_and_format_dataset(dataset_dict)
    expected_intents = {"getWeather", "whoIsGame"}
    self.assertEqual(expected_intents, set(dataset_dict[INTENTS]))
    expected_entities = {
        "location", "snips/datetime", "role", "country", "company"
    }
    self.assertEqual(expected_entities, set(dataset_dict[ENTITIES]))

def test_should_be_serializable(self, mock_serialize_crf_model):
    # Given
    mock_serialize_crf_model.return_value = "mocked_crf_model_data"
    features_factories = [
        {
            "factory_name": ShapeNgramFactory.name,
            "args": {"n": 1},
            "offsets": [0]
        },
        {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }
    ]
    config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=features_factories)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    slot_filler = CRFSlotFiller(config)
    intent = "dummy_intent_1"
    slot_filler.fit(dataset, intent=intent)

    # When
    actual_slot_filler_dict = slot_filler.to_dict()

    # Then
    expected_feature_factories = [
        {
            "factory_name": ShapeNgramFactory.name,
            "args": {"n": 1, "language_code": "en"},
            "offsets": [0]
        },
        {
            "factory_name": IsDigitFactory.name,
            "args": {},
            "offsets": [-1, 0]
        }
    ]
    expected_config = CRFSlotFillerConfig(
        tagging_scheme=TaggingScheme.BILOU,
        feature_factory_configs=expected_feature_factories)
    expected_slot_filler_dict = {
        "unit_name": "crf_slot_filler",
        "crf_model_data": "mocked_crf_model_data",
        "language_code": "en",
        "config": expected_config.to_dict(),
        "intent": intent,
        "slot_name_mapping": {
            "dummy_slot_name": "dummy_entity_1",
            "dummy_slot_name2": "dummy_entity_2",
            "dummy_slot_name3": "dummy_entity_2",
        }
    }
    self.assertDictEqual(actual_slot_filler_dict,
                         expected_slot_filler_dict)

def test_missing_entity_key_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [],
                "automatically_extensible": False
            }
        },
        "language": "en",
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual("Expected entity to have key: 'use_synonyms'",
                     str(ctx.exception.args[0]))

def test_should_generate_dataset_from_files(self):
    # Given
    intent_file_1 = "intent_whoIsGame.txt"
    intent_file_2 = "intent_getWeather.txt"
    entity_file_1 = "entity_location.txt"

    who_is_game_txt = """
who is the [role:role](president) of [country:country](France)
who is the [role:role](CEO) of [company:company](Google) please
"""

    get_weather_txt = """
what is the weather in [weatherLocation:location](Paris)?
is it raining in [weatherLocation] [weatherDate:snips/datetime]
"""

    location_txt = """
new york,big apple
london
"""

    # pylint:disable=unused-argument
    def mock_open(self_, *args, **kwargs):
        if str(self_) == intent_file_1:
            return io.StringIO(who_is_game_txt)
        if str(self_) == intent_file_2:
            return io.StringIO(get_weather_txt)
        if str(self_) == entity_file_1:
            return io.StringIO(location_txt)
        return None
    # pylint:enable=unused-argument

    dataset_files = [intent_file_1, intent_file_2, entity_file_1]

    # When
    with patch("pathlib.io") as mock_io:
        mock_io.open.side_effect = mock_open
        dataset = Dataset.from_files("en", dataset_files)
    dataset_dict = dataset.json

    # Then
    validate_and_format_dataset(dataset_dict)
    self.assertDictEqual(EXPECTED_DATASET_DICT, dataset_dict)

def test_should_generate_dataset_from_yaml_files(self):
    # Given
    who_is_game_yaml = io.StringIO("""
# whoIsGame Intent
---
type: intent
name: whoIsGame
utterances:
- who is the [role](president) of [country](France)
- who is the [role](CEO) of [company](Google) please
""")

    get_weather_yaml = io.StringIO("""
# getWeather Intent
---
type: intent
name: getWeather
utterances:
- what is the weather in [weatherLocation:location](Paris)?
- is it raining in [weatherLocation] [weatherDate:snips/datetime]
""")

    location_yaml = io.StringIO("""
# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
- [new york, big apple]
- london
""")

    dataset_files = [who_is_game_yaml, get_weather_yaml, location_yaml]

    # When
    with mock.patch("snips_nlu_parsers.get_builtin_entity_examples",
                    return_value=["Today"]):
        dataset = Dataset.from_yaml_files("en", dataset_files)

    # Then
    validate_and_format_dataset(dataset)
    self.assertDictEqual(EXPECTED_DATASET_DICT, dataset.json)

def test_should_handle_empty_dataset(self):
    # Given
    dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
    engine = SnipsNLUEngine().fit(dataset)

    # When
    result = engine.parse("hello world")

    # Then
    self.assertEqual(empty_result("hello world"), result)

def test_missing_entity_key_should_raise_exception(self):
    # Given
    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [],
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "1.1.1"
    }

    # When/Then
    with self.assertRaises(KeyError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(str(ctx.exception.args[0]),
                     "Expected entity to have key: 'use_synonyms'")

def test_should_extract_entity_values(self):
    # Given
    set_light_color_yaml = io.StringIO("""
---
type: intent
name: setLightColor
utterances:
- set the lights to [color](blue)
- change the light to [color](yellow) in the [room](bedroom)""")

    turn_light_on_yaml = io.StringIO("""
---
type: intent
name: turnLightOn
utterances:
- turn the light on in the [room](kitchen)
- turn the [room](bathroom)'s lights on""")

    color_yaml = io.StringIO("""
type: entity
name: color
values:
- [blue, cyan]
- red""")

    room_yaml = io.StringIO("""
type: entity
name: room
values:
- garage
- [living room, main room]""")

    dataset_files = [
        set_light_color_yaml, turn_light_on_yaml, color_yaml, room_yaml
    ]
    dataset = Dataset.from_yaml_files("en", dataset_files).json
    dataset = validate_and_format_dataset(dataset)

    # When
    entity_values = extract_entity_values(dataset,
                                          apply_normalization=True)

    # Then
    expected_values = {
        "setLightColor": {
            "blue", "yellow", "cyan", "red", "bedroom", "garage",
            "living room", "main room", "kitchen", "bathroom"
        },
        "turnLightOn": {
            "bedroom", "garage", "living room", "main room", "kitchen",
            "bathroom"
        }
    }
    self.assertDictEqual(expected_values, entity_values)

def test_should_get_none_if_empty_dataset(self):
    # Given
    dataset = validate_and_format_dataset(get_empty_dataset(LANGUAGE_EN))
    classifier = LogRegIntentClassifier().fit(dataset)
    text = "this is a dummy query"

    # When
    intent = classifier.get_intent(text)

    # Then
    expected_intent = None
    self.assertEqual(intent, expected_intent)

def test_should_get_intent_after_deserialization(self):
    # Given
    dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    classifier = LogRegIntentClassifier().fit(dataset)
    classifier_dict = classifier.to_dict()

    # When
    loaded_classifier = LogRegIntentClassifier.from_dict(classifier_dict)
    result = loaded_classifier.get_intent("Make me two cups of tea")

    # Then
    expected_intent = "MakeTea"
    self.assertEqual(expected_intent, result[RES_INTENT_NAME])

def test_intent_classifier_should_get_intent(self):
    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    classifier = LogRegIntentClassifier().fit(dataset)
    text = "This is a dummy_3 query from another intent"

    # When
    res = classifier.get_intent(text)
    intent = res[RES_INTENT_NAME]

    # Then
    expected_intent = "dummy_intent_2"
    self.assertEqual(intent, expected_intent)

def test_entity_match_factory(self):
    # Given
    config = {
        "factory_name": "entity_match",
        "args": {
            "tagging_scheme_code": TaggingScheme.BILOU.value,
            "use_stemming": False
        },
        "offsets": [0]
    }
    tokens = tokenize("2 dummy a and dummy_c", LANGUAGE_EN)
    cache = [{TOKEN_NAME: token} for token in tokens]
    factory = get_feature_factory(config)
    dataset = deepcopy(SAMPLE_DATASET)
    dataset = validate_and_format_dataset(dataset)
    factory.fit(dataset, "dummy_intent_1")

    # When
    features = factory.build_features()
    features = sorted(features, key=lambda f: f.base_name)
    res0 = features[0].compute(0, cache)
    res1 = features[0].compute(1, cache)
    res2 = features[0].compute(2, cache)
    res3 = features[0].compute(3, cache)
    res4 = features[0].compute(4, cache)
    res5 = features[1].compute(0, cache)
    res6 = features[1].compute(1, cache)
    res7 = features[1].compute(2, cache)
    res8 = features[1].compute(3, cache)
    res9 = features[1].compute(4, cache)

    # Then
    self.assertIsInstance(factory, EntityMatchFactory)
    self.assertEqual(len(features), 2)
    self.assertEqual(features[0].base_name, "entity_match_dummy_entity_1")
    self.assertEqual(features[1].base_name, "entity_match_dummy_entity_2")
    self.assertEqual(res0, BEGINNING_PREFIX)
    self.assertEqual(res1, INSIDE_PREFIX)
    self.assertEqual(res2, LAST_PREFIX)
    self.assertEqual(res3, None)
    self.assertEqual(res4, None)
    self.assertEqual(res5, None)
    self.assertEqual(res6, None)
    self.assertEqual(res7, None)
    self.assertEqual(res8, None)
    self.assertEqual(res9, UNIT_PREFIX)

def test_should_format_dataset_by_adding_synonyms(
        self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation.lower(), variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [
                    {"value": "Entity_1", "synonyms": ["entity 2"]}
                ],
                "use_synonyms": True,
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "1.1.1"
    }

    expected_dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "utterances": {
                    "Entity_1": "Entity_1",
                    "entity_1": "Entity_1",
                    "entity 2": "Entity_1",
                    "Entity 2": "Entity_1",
                },
                "automatically_extensible": False,
                "capitalize": False
            }
        },
        "language": "en",
        "snips_nlu_version": "1.1.1",
        "validated": True
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertDictEqual(dataset, expected_dataset)

def test_empty_vocabulary_should_fit_and_return_none_intent(
        self, mocked_build_training):
    # Given
    language = LANGUAGE_EN
    dataset = {
        "snips_nlu_version": "0.0.1",
        "entities": {
            "dummy_entity_1": {
                "automatically_extensible": True,
                "use_synonyms": False,
                "data": [
                    {"value": "...", "synonyms": []}
                ]
            }
        },
        "intents": {
            "dummy_intent_1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "...",
                             "slot_name": "dummy_slot_name",
                             "entity": "dummy_entity_1"}
                        ]
                    }
                ]
            }
        },
        "language": language
    }
    dataset = validate_and_format_dataset(dataset)

    text = " "
    noise_size = 6
    utterances = [text] + [text] * noise_size
    labels = [1] + [None] * noise_size
    intent_list = ["dummy_intent_1", None]
    mocked_build_training.return_value = utterances, labels, intent_list

    # When / Then
    intent_classifier = LogRegIntentClassifier().fit(dataset)
    intent = intent_classifier.get_intent("no intent there")
    self.assertEqual(intent, None)

def test_should_build_training_data_with_no_data(self):
    # Given
    language = LANGUAGE_EN
    dataset = validate_and_format_dataset(get_empty_dataset(language))
    random_state = np.random.RandomState(1)

    # When
    data_augmentation_config = \
        LogRegIntentClassifierConfig().data_augmentation_config
    utterances, _, intent_mapping = build_training_data(
        dataset, language, data_augmentation_config, random_state)

    # Then
    expected_utterances = []
    expected_intent_mapping = []
    self.assertListEqual(utterances, expected_utterances)
    self.assertListEqual(intent_mapping, expected_intent_mapping)

def fit(self, dataset, intent, verbose=False):
    """Fit the slot filler

    Args:
        dataset (dict): A valid Snips dataset
        intent (str): The specific intent of the dataset to train
            the slot filler on
        verbose (bool, optional): If *True*, it will print the weights
            of the CRF once the training is done

    Returns:
        :class:`CRFSlotFiller`: The same instance, trained
    """
    dataset = validate_and_format_dataset(dataset)
    self.intent = intent
    self.slot_name_mapping = get_slot_name_mapping(dataset, intent)
    self.language = dataset[LANGUAGE]
    random_state = check_random_state(self.config.random_seed)
    augmented_intent_utterances = augment_utterances(
        dataset, self.intent, language=self.language,
        random_state=random_state,
        **self.config.data_augmentation_config.to_dict())

    crf_samples = [
        utterance_to_sample(u[DATA], self.config.tagging_scheme,
                            self.language)
        for u in augmented_intent_utterances]

    for factory in self.features_factories:
        factory.fit(dataset, intent)

    # pylint: disable=C0103
    X = [self.compute_features(sample[TOKENS], drop_out=True)
         for sample in crf_samples]
    # ensure ascii tags
    Y = [[_encode_tag(tag) for tag in sample[TAGS]]
         for sample in crf_samples]
    # pylint: enable=C0103

    self.crf_model = _get_crf_model(self.config.crf_args)
    self.crf_model.fit(X, Y)
    if verbose:
        self.print_weights()
    return self

def test_intent_classifier_should_get_intent_when_filter(self):
    # Given
    dataset = validate_and_format_dataset(BEVERAGE_DATASET)
    classifier = LogRegIntentClassifier().fit(dataset)

    # When
    text1 = "Make me two cups of tea"
    res1 = classifier.get_intent(text1, ["MakeCoffee", "MakeTea"])

    text2 = "Make me two cups of tea"
    res2 = classifier.get_intent(text2, ["MakeCoffee"])

    text3 = "bla bla bla"
    res3 = classifier.get_intent(text3, ["MakeCoffee"])

    # Then
    self.assertEqual("MakeTea", res1[RES_INTENT_NAME])
    self.assertEqual("MakeCoffee", res2[RES_INTENT_NAME])
    self.assertEqual(None, res3)

def test_dataset_should_handle_synonyms(
        self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation.lower(), variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [
                    {"value": "Ëntity 1", "synonyms": ["entity 2"]}
                ],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "language": "en",
        "snips_nlu_version": "1.1.1"
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    expected_entities = {
        "entity1": {
            AUTOMATICALLY_EXTENSIBLE: True,
            UTTERANCES: {
                "Ëntity 1": "Ëntity 1",
                "ëntity 1": "Ëntity 1",
                "entity 2": "Ëntity 1",
                "Entity 2": "Ëntity 1",
            },
            CAPITALIZE: False
        }
    }
    self.assertDictEqual(dataset[ENTITIES], expected_entities)

def fit(self, dataset, force_retrain=True): """Fit the intent parser with a valid Snips dataset""" dataset = validate_and_format_dataset(dataset) self.language = dataset[LANGUAGE] self.regexes_per_intent = dict() self.group_names_to_slot_names = dict() joined_entity_utterances = _get_joined_entity_utterances( dataset, self.language) self.slot_names_to_entities = _get_slot_names_mapping(dataset) for intent_name, intent in iteritems(dataset[INTENTS]): if not self._is_trainable(intent, dataset): self.regexes_per_intent[intent_name] = [] continue utterances = [_preprocess_builtin_entities(u, self.language) for u in intent[UTTERANCES]] regexes, self.group_names_to_slot_names = _generate_regexes( utterances, joined_entity_utterances, self.group_names_to_slot_names, self.language) self.regexes_per_intent[intent_name] = regexes return self
def test_should_add_capitalize_field(
        self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation, variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "My entity1",
                             "entity": "entity1",
                             "slot_name": "slot0"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot2"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot2"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot3"},
                            {"text": "My entity2",
                             "entity": "entity2",
                             "slot_name": "slot1"},
                            {"text": "myentity2",
                             "entity": "entity2",
                             "slot_name": "slot1"},
                            {"text": "m_entity3",
                             "entity": "entity3",
                             "slot_name": "slot1"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": False,
                "automatically_extensible": True
            },
            "entity2": {
                "data": [],
                "use_synonyms": False,
                "automatically_extensible": True
            },
            "entity3": {
                "data": [
                    {"value": "Entity3", "synonyms": ["entity3"]}
                ],
                "use_synonyms": False,
                "automatically_extensible": True
            }
        },
        "language": "en",
        "snips_nlu_version": "0.0.1"
    }

    expected_dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "My entity1",
                             "entity": "entity1",
                             "slot_name": "slot0"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot2"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot2"},
                            {"text": "entity1",
                             "entity": "entity1",
                             "slot_name": "slot3"},
                            {"text": "My entity2",
                             "entity": "entity2",
                             "slot_name": "slot1"},
                            {"text": "myentity2",
                             "entity": "entity2",
                             "slot_name": "slot1"},
                            {"text": "m_entity3",
                             "entity": "entity3",
                             "slot_name": "slot1"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "utterances": {
                    "My entity1": "My entity1",
                    "My Entity1": "My entity1",
                    "entity1": "entity1",
                    "Entity1": "entity1",
                },
                "automatically_extensible": True,
                "capitalize": True
            },
            "entity2": {
                "utterances": {
                    "My entity2": "My entity2",
                    "My Entity2": "My entity2",
                    "myentity2": "myentity2",
                    "Myentity2": "myentity2"
                },
                "automatically_extensible": True,
                "capitalize": True
            },
            "entity3": {
                "utterances": {
                    "Entity3": "Entity3",
                    "m_entity3": "m_entity3",
                    "M_Entity3": "m_entity3"
                },
                "automatically_extensible": True,
                "capitalize": False
            }
        },
        "language": "en",
        "snips_nlu_version": "0.0.1",
        "validated": True
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertDictEqual(dataset, expected_dataset)

def test_should_remove_empty_entities_value_and_empty_synonyms(
        self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation, variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "",
                             "entity": "entity1",
                             "slot_name": "slot1"}
                        ]
                    },
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "entity 1",
                             "entity": "entity1",
                             "slot_name": "slot1"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [
                    {"value": "entity 1", "synonyms": [""]},
                    {"value": "", "synonyms": []}
                ],
                "use_synonyms": False,
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "0.0.1"
    }

    expected_dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "",
                             "entity": "entity1",
                             "slot_name": "slot1"}
                        ]
                    },
                    {
                        "data": [
                            {"text": "this is "},
                            {"text": "entity 1",
                             "entity": "entity1",
                             "slot_name": "slot1"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "utterances": {
                    "entity 1": "entity 1",
                    "Entity 1": "entity 1",
                },
                "capitalize": False,
                "automatically_extensible": False
            }
        },
        "language": "en",
        "snips_nlu_version": "0.0.1",
        "validated": True
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertEqual(dataset, expected_dataset)

def test_should_normalize_synonyms(
        self, mocked_get_string_variations):
    # Given
    def mock_get_string_variations(variation, language):
        return {variation.lower(), variation.title()}

    mocked_get_string_variations.side_effect = mock_get_string_variations

    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "ëNtity",
                             "entity": "entity1",
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": True,
                "automatically_extensible": True
            }
        },
        "language": "en",
        "snips_nlu_version": "0.1.0"
    }

    expected_dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "ëNtity",
                             "entity": "entity1",
                             "slot_name": "startTime"}
                        ]
                    }
                ]
            }
        },
        "entities": {
            "entity1": {
                "utterances": {
                    "ëntity": "ëNtity",
                    "Ëntity": "ëNtity",
                },
                "automatically_extensible": True,
                "capitalize": False
            }
        },
        "language": "en",
        "snips_nlu_version": "0.1.0",
        "validated": True
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then
    self.assertDictEqual(dataset, expected_dataset)