def test_should_be_serializable(self, mocked_generate_regexes):
    """Persisting a fitted parser writes the expected JSON artifacts."""
    # Given
    # pylint: disable=unused-argument
    def mock_generate_patterns(utterances, joined_entity_utterances,
                               group_names_to_slot_names, language):
        # One deterministic fake pattern per utterance + fixed group map
        fake_patterns = ["mocked_regex_%s" % idx
                         for idx, _ in enumerate(utterances)]
        return fake_patterns, {"group_0": "dummy slot name"}

    # pylint: enable=unused-argument
    mocked_generate_regexes.side_effect = mock_generate_patterns
    parser_config = DeterministicIntentParserConfig(max_queries=42,
                                                    max_pattern_length=100)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_metadata = {"unit_name": "deterministic_intent_parser"}
    expected_parser_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 100
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group_0": "dummy slot name"
        },
        "patterns": {
            "dummy_intent_1": [
                "mocked_regex_0",
                "mocked_regex_1",
                "mocked_regex_2",
                "mocked_regex_3"
            ],
            "dummy_intent_2": [
                "mocked_regex_0"
            ]
        },
        "slot_names_to_entities": {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name3": "dummy_entity_2",
                "dummy_slot_name2": "dummy_entity_2"
            },
            "dummy_intent_2": {
                "dummy slot nàme": "dummy_entity_1"
            }
        }
    }
    self.assertJsonContent(self.tmp_file_path / "metadata.json",
                           expected_metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_parser_dict)
def test_should_limit_nb_queries(self):
    """max_queries must cap the number of regexes built per intent."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](my second entity)
- this is [slot3:entity3](my third entity)

---
type: intent
name: my_second_intent
utterances:
- this is [slot4:entity4](my fourth entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser_config = DeterministicIntentParserConfig(max_queries=2,
                                                    max_pattern_length=1000)

    # When
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # Then: first intent is capped at 2, second keeps its single pattern
    self.assertEqual(len(parser.regexes_per_intent["my_first_intent"]), 2)
    self.assertEqual(len(parser.regexes_per_intent["my_second_intent"]), 1)
def test_should_be_serializable_before_fitting(self):
    """An unfitted parser persists with all learned attributes null."""
    # Given
    parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43,
                                               ignore_stop_words=True))

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_metadata = {"unit_name": "deterministic_intent_parser"}
    expected_parser_dict = {
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 43,
            "ignore_stop_words": True
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None,
        "stop_words_whitelist": None
    }
    self.assertJsonContent(self.tmp_file_path / "metadata.json",
                           expected_metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_parser_dict)
def test_should_be_deserializable_before_fitting_with_whitelist(self):
    """Loading an unfitted parser dict (with whitelist key) round-trips."""
    # Given
    stored_parser = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None,
        "stop_words_whitelist": None
    }
    stored_metadata = {"unit_name": "deterministic_intent_parser"}
    self.tmp_file_path.mkdir()
    self.writeJsonContent(self.tmp_file_path / "metadata.json",
                          stored_metadata)
    self.writeJsonContent(self.tmp_file_path / "intent_parser.json",
                          stored_parser)

    # When
    parser = DeterministicIntentParser.from_path(self.tmp_file_path)

    # Then
    expected_parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43))
    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_limit_patterns_length(self):
    """Utterances producing patterns longer than max_pattern_length drop."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up

---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser_config = DeterministicIntentParserConfig(max_queries=1000,
                                                    max_pattern_length=25,
                                                    ignore_stop_words=False)

    # When
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # Then: only the short-enough utterances survive per intent
    self.assertEqual(2, len(parser.regexes_per_intent["my_first_intent"]))
    self.assertEqual(1, len(parser.regexes_per_intent["my_second_intent"]))
def test_should_be_deserializable_without_stop_words(self):
    """A legacy artifact lacking the whitelist key still deserializes."""
    # Given
    stored_parser = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        },
        "patterns": {
            "my_intent": ["(?P<hello_group>hello?)",
                          "(?P<world_group>world$)"]
        },
        "slot_names_to_entities": {
            "my_intent": {
                "hello_slot": "hello_entity",
                "world_slot": "world_entity"
            }
        }
    }
    stored_metadata = {"unit_name": "deterministic_intent_parser"}
    self.tmp_file_path.mkdir()
    self.writeJsonContent(self.tmp_file_path / "metadata.json",
                          stored_metadata)
    self.writeJsonContent(self.tmp_file_path / "intent_parser.json",
                          stored_parser)

    # When
    parser = DeterministicIntentParser.from_path(self.tmp_file_path)

    # Then: rebuild the reference parser attribute by attribute
    expected_parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43))
    expected_parser.language = LANGUAGE_EN
    expected_parser.group_names_to_slot_names = {
        "hello_group": "hello_slot",
        "world_group": "world_slot"
    }
    expected_parser.slot_names_to_entities = {
        "my_intent": {
            "hello_slot": "hello_entity",
            "world_slot": "world_entity"
        }
    }
    expected_parser.patterns = {
        "my_intent": ["(?P<hello_group>hello?)", "(?P<world_group>world$)"]
    }
    # pylint:disable=protected-access
    # Missing whitelist in the artifact must default to an empty dict
    expected_parser._stop_words_whitelist = dict()
    # pylint:enable=protected-access
    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_limit_patterns_length(self):
    """With a generous max_pattern_length, all sample patterns are kept."""
    # Given
    parser_config = DeterministicIntentParserConfig(max_queries=1000,
                                                    max_pattern_length=300)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)

    # When
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # Then
    self.assertEqual(4, len(parser.regexes_per_intent["dummy_intent_1"]))
    self.assertEqual(1, len(parser.regexes_per_intent["dummy_intent_2"]))
def __init__(self, intent_parsers_configs=None):
    """Build the engine config, defaulting to the deterministic parser
    followed by the probabilistic one when no configs are supplied."""
    if intent_parsers_configs is None:
        # Imported lazily to avoid a circular import at module load time
        from snips_nlu.pipeline.configs import (
            ProbabilisticIntentParserConfig, DeterministicIntentParserConfig)
        intent_parsers_configs = [
            DeterministicIntentParserConfig(),
            ProbabilisticIntentParserConfig()
        ]
    # Normalize every entry into a processing-unit config object
    self.intent_parsers_configs = [
        get_processing_unit_config(conf) for conf in intent_parsers_configs
    ]
def test_should_be_serializable(self, mocked_generate_regexes):
    """to_dict() must render compiled regexes back as plain strings."""
    # Given
    # pylint: disable=unused-argument
    def mock_generate_regexes(utterances, joined_entity_utterances,
                              group_names_to_slot_names, language):
        # Deterministic fake compiled regexes, one per utterance
        fake_regexes = [re.compile(r"mocked_regex_%s" % idx)
                        for idx, _ in enumerate(utterances)]
        return fake_regexes, {"group_0": "dummy slot name"}

    # pylint: enable=unused-argument
    mocked_generate_regexes.side_effect = mock_generate_regexes
    parser_config = DeterministicIntentParserConfig(max_queries=42,
                                                    max_entities=100)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # When
    actual_dict = parser.to_dict()

    # Then
    expected_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_entities": 100
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group_0": "dummy slot name"
        },
        "patterns": {
            "dummy_intent_1": [
                "mocked_regex_0",
                "mocked_regex_1",
                "mocked_regex_2",
                "mocked_regex_3"
            ],
            "dummy_intent_2": ["mocked_regex_0"]
        },
        "slot_names_to_entities": {
            "dummy_slot_name": "dummy_entity_1",
            "dummy slot nàme": "dummy_entity_1",
            "dummy_slot_name3": "dummy_entity_2",
            "dummy_slot_name2": "dummy_entity_2"
        }
    }
    self.assertDictEqual(actual_dict, expected_dict)
def test_should_be_deserializable(self):
    """from_dict() must restore language, groups, patterns and slots."""
    # Given
    stored_parser = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        },
        "patterns": {
            "intent_name": [
                "(?P<hello_group>hello?)",
                "(?P<world_group>world$)"
            ]
        },
        "slot_names_to_entities": {
            "hello_slot": "hello_entity",
            "world_slot": "world_entity"
        }
    }

    # When
    parser = DeterministicIntentParser.from_dict(stored_parser)

    # Then: rebuild the reference parser attribute by attribute
    expected_parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43))
    expected_parser.language = LANGUAGE_EN
    expected_parser.group_names_to_slot_names = {
        "hello_group": "hello_slot",
        "world_group": "world_slot"
    }
    expected_parser.slot_names_to_entities = {
        "hello_slot": "hello_entity",
        "world_slot": "world_entity"
    }
    expected_parser.patterns = {
        "intent_name": [
            "(?P<hello_group>hello?)",
            "(?P<world_group>world$)"
        ]
    }
    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_deterministic_parser_config(self):
    """from_dict followed by to_dict must be an exact round-trip."""
    # Given
    original_dict = {
        "unit_name": "deterministic_intent_parser",
        "max_queries": 666,
        "max_entities": 333
    }

    # When
    round_tripped = DeterministicIntentParserConfig.from_dict(
        original_dict).to_dict()

    # Then
    self.assertDictEqual(original_dict, round_tripped)
def __init__(self, intent_parsers_configs=None):
    """Build the engine config, defaulting to the deterministic parser
    followed by the probabilistic one when no configs are supplied."""
    from snips_nlu.intent_parser import IntentParser
    if intent_parsers_configs is None:
        # Imported lazily to avoid a circular import at module load time
        from snips_nlu.pipeline.configs import (
            ProbabilisticIntentParserConfig, DeterministicIntentParserConfig)
        intent_parsers_configs = [
            DeterministicIntentParserConfig(),
            ProbabilisticIntentParserConfig()
        ]
    # Normalize every entry into an IntentParser config object
    self.intent_parsers_configs = list(
        map(IntentParser.get_config, intent_parsers_configs))
def test_should_not_train_intents_too_big(self):
    """Intents exceeding max_queries must be skipped entirely at fit."""
    # Given
    parser_config = DeterministicIntentParserConfig(max_queries=2,
                                                    max_entities=200)
    dataset = validate_and_format_dataset(SAMPLE_DATASET)

    # When
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # Then: the small intent trains, the large one yields no regexes
    self.assertGreater(len(parser.regexes_per_intent["dummy_intent_2"]), 0)
    self.assertListEqual(parser.regexes_per_intent["dummy_intent_1"], [])
def test_should_parse_stop_words_slots(self):
    """Stop words that appear in entity values must still match as slots."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: search
utterances:
- search
- search [search_object](this)
- search [search_object](a cat)

---
type: entity
name: search_object
values:
- [this thing, that]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    # Make "this"/"that" stop words so they would normally be ignored
    resources = self.get_resources("en")
    resources[STOP_WORDS] = {"a", "this", "that"}
    parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(ignore_stop_words=True),
        resources=resources)
    parser.fit(dataset)

    # When
    res_1 = parser.parse("search this")
    res_2 = parser.parse("search that")

    # Then
    expected_intent = intent_classification_result(intent_name="search",
                                                   probability=1.0)
    expected_slots_1 = [
        unresolved_slot(match_range=(7, 11), value="this",
                        entity="search_object", slot_name="search_object")
    ]
    expected_slots_2 = [
        unresolved_slot(match_range=(7, 11), value="that",
                        entity="search_object", slot_name="search_object")
    ]
    self.assertEqual(expected_intent, res_1[RES_INTENT])
    self.assertEqual(expected_intent, res_2[RES_INTENT])
    self.assertListEqual(expected_slots_1, res_1[RES_SLOTS])
    self.assertListEqual(expected_slots_2, res_2[RES_SLOTS])
def test_deterministic_parser_config(self):
    """from_dict followed by to_dict must be an exact round-trip."""
    # Given
    original_dict = {
        "unit_name": "deterministic_intent_parser",
        "max_queries": 666,
        "max_pattern_length": 333,
        "ignore_stop_words": True
    }

    # When
    round_tripped = DeterministicIntentParserConfig.from_dict(
        original_dict).to_dict()

    # Then
    self.assertDictEqual(original_dict, round_tripped)
def test_nlu_config_from_dict(self):
    """NLUEngineConfig must round-trip through from_dict/to_dict."""
    # Given
    original_dict = {
        "unit_name": "nlu_engine",
        "intent_parsers_configs": [
            DeterministicIntentParserConfig().to_dict(),
            ProbabilisticIntentParserConfig().to_dict()
        ]
    }

    # When
    round_tripped = NLUEngineConfig.from_dict(original_dict).to_dict()

    # Then
    self.assertDictEqual(original_dict, round_tripped)
def test_should_parse_intent_with_stop_words(self):
    """The intent must still be recognized when the query contains stop
    words ("hey", "a") and ignore_stop_words is enabled.

    Note: the mock decorator argument is kept in the signature by the
    enclosing test class; the stop-word set is stubbed via the mock.
    """
    # Given
    mock_get_stop_words.return_value = {"a", "hey"}
    dataset = self.slots_dataset
    config = DeterministicIntentParserConfig(ignore_stop_words=True)
    # Consistency fix: pass the config by keyword (config=config) like
    # every other test in this suite, instead of positionally.
    parser = DeterministicIntentParser(config=config).fit(dataset)
    text = "Hey this is dummy_a query with another dummy_c at 10p.m. or " \
           "at 12p.m."

    # When
    parsing = parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="dummy_intent_1", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])
def test_should_be_deserializable_before_fitting(self):
    """An all-null parser dict deserializes to an unfitted parser."""
    # Given
    stored_parser = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None
    }

    # When
    parser = DeterministicIntentParser.from_dict(stored_parser)

    # Then
    expected_parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43))
    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_be_serializable_before_fitting(self):
    """to_dict() on an unfitted parser exposes null learned attributes."""
    # Given
    parser = DeterministicIntentParser(
        config=DeterministicIntentParserConfig(max_queries=42,
                                               max_pattern_length=43))

    # When
    actual_dict = parser.to_dict()

    # Then
    expected_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None
    }
    self.assertDictEqual(actual_dict, expected_dict)
def test_should_be_serializable(self, mock_get_stop_words):
    """Persisting a fitted parser with ignore_stop_words drops the stop
    words from the generated patterns."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: searchFlight
slots:
  - name: origin
    entity: city
  - name: destination
    entity: city
utterances:
  - find me a flight from [origin](Paris) to [destination](New York)
  - I need a flight to [destination](Berlin)

---
type: entity
name: city
values:
  - london
  - [new york, big apple]
  - [paris, city of lights]
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    # "a" and "me" are stop words: they must vanish from the patterns
    mock_get_stop_words.return_value = {"a", "me"}
    parser_config = DeterministicIntentParserConfig(max_queries=42,
                                                    max_pattern_length=100,
                                                    ignore_stop_words=True)
    parser = DeterministicIntentParser(config=parser_config).fit(dataset)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_metadata = {"unit_name": "deterministic_intent_parser"}
    expected_parser_dict = {
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 100,
            "ignore_stop_words": True
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group0": "destination",
            "group1": "origin",
        },
        "patterns": {
            "searchFlight": [
                "^\\s*find\\s*flight\\s*from\\s*(?P<group1>%CITY%)\\s*to"
                "\\s*(?P<group0>%CITY%)\\s*$",
                "^\\s*i\\s*need\\s*flight\\s*to\\s*(?P<group0>%CITY%)"
                "\\s*$",
            ]
        },
        "slot_names_to_entities": {
            "searchFlight": {
                "destination": "city",
                "origin": "city",
            }
        },
        "stop_words_whitelist": dict()
    }
    self.assertJsonContent(self.tmp_file_path / "metadata.json",
                           expected_metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_parser_dict)