def test_training_should_be_reproducible(self):
    """Two parsers fitted with the same random_state on the same dataset
    must persist byte-identical artifacts (same directory hash)."""
    # Given
    random_state = 42
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me a [beverage_temperature:Temperature](hot) cup of tea
- make me [number_of_cups:snips/number](five) tea cups
---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](one) cup of coffee please
- brew [number_of_cups] cups of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json

    # When
    parser1 = DeterministicIntentParser(random_state=random_state)
    parser1.fit(dataset)
    parser2 = DeterministicIntentParser(random_state=random_state)
    parser2.fit(dataset)

    # Then: compare content hashes of the two persisted directories
    with temp_dir() as tmp_dir:
        dir_parser1 = tmp_dir / "parser1"
        dir_parser2 = tmp_dir / "parser2"
        parser1.persist(dir_parser1)
        parser2.persist(dir_parser2)
        hash1 = dirhash(str(dir_parser1), 'sha256')
        hash2 = dirhash(str(dir_parser2), 'sha256')
        self.assertEqual(hash1, hash2)
def test_should_be_deserializable_before_fitting_with_whitelist(self):
    """An unfitted parser dict (all state fields None, including the stop
    words whitelist) must deserialize into an equivalent unfitted parser."""
    # Given
    parser_dict = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None,
        "stop_words_whitelist": None
    }
    self.tmp_file_path.mkdir()
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.writeJsonContent(self.tmp_file_path / "intent_parser.json",
                          parser_dict)
    self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)

    # When
    parser = DeterministicIntentParser.from_path(self.tmp_file_path)

    # Then
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=43)
    expected_parser = DeterministicIntentParser(config=config)
    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_fit_with_naughty_strings_no_tags(self):
    """Fitting on adversarial utterance text must not raise.

    NOTE(review): a test with this exact name appears again later in this
    file; within a single class the later definition shadows this one —
    confirm both are meant to exist.
    """
    # Given
    naughty_strings_path = os.path.join(TEST_PATH, "resources",
                                        "naughty_strings.txt")
    with io.open(naughty_strings_path, encoding='utf8') as f:
        # Iterate the file object directly instead of f.readlines():
        # same lines, no intermediate list.
        naughty_strings = [line.strip("\n") for line in f]

    utterances = [{
        DATA: [{
            TEXT: naughty_string
        }]
    } for naughty_string in naughty_strings]

    # When
    naughty_dataset = {
        "intents": {
            "naughty_intent": {
                "utterances": utterances
            }
        },
        "entities": dict(),
        "language": "en",
        "snips_nlu_version": "0.0.1"
    }

    # Then
    with self.fail_if_exception("Exception raised"):
        DeterministicIntentParser().fit(naughty_dataset)
def test_should_ignore_very_ambiguous_utterances(self):
    """Input matching two intents equally well must yield an empty result
    with probability 1.0 rather than an arbitrary intent."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent_1
utterances:
- "[event_type](meeting) tomorrow"
---
type: intent
name: intent_2
utterances:
- call [time:snips/datetime](today)
---
type: entity
name: event_type
values:
- call
- diner""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "call tomorrow"

    # When
    res = parser.parse(text)

    # Then
    self.assertEqual(empty_result(text, 1.0), res)
def test_should_parse_intent(self):
    """An exact utterance match must be classified with probability 1.0."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- foo bar baz
---
type: intent
name: intent2
utterances:
- foo bar ban""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "foo bar ban"

    # When
    parsing = parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="intent2", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])
def test_should_parse_slightly_ambiguous_utterances(self):
    """When two intents partially match, the better one wins with a
    normalized probability (here 2/3)."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent_1
utterances:
- call tomorrow
---
type: intent
name: intent_2
utterances:
- call [time:snips/datetime](today)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "call tomorrow"

    # When
    res = parser.parse(text)

    # Then
    expected_intent = intent_classification_result(
        intent_name="intent_1", probability=2. / 3.)
    expected_result = parsing_result(text, expected_intent, [])
    self.assertEqual(expected_result, res)
def test_should_parse_top_intents(self):
    """parse(text, top_n=...) must return a list of extraction results,
    here a single exact match with probability 1.0."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: intent1
utterances:
- hello world
---
type: intent
name: intent2
utterances:
- foo bar""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)
    text = "hello world"

    # When
    results = parser.parse(text, top_n=3)

    # Then
    expected_intent = intent_classification_result(
        intent_name="intent1", probability=1.0)
    expected_results = [extraction_result(expected_intent, [])]
    self.assertEqual(expected_results, results)
def test_should_be_serializable_into_bytearray(self):
    """A fitted parser round-tripped through a byte array (with shared
    resources) must still parse correctly."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: MakeTea
utterances:
- make me [number_of_cups:snips/number](one) cup of tea
- i want [number_of_cups] cups of tea please
- can you prepare [number_of_cups] cup of tea ?
---
type: intent
name: MakeCoffee
utterances:
- make me [number_of_cups:snips/number](two) cups of coffee
- brew [number_of_cups] cups of coffee
- can you prepare [number_of_cups] cup of coffee""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    shared = self.get_shared_data(dataset)
    intent_parser = DeterministicIntentParser(**shared).fit(dataset)

    # When
    intent_parser_bytes = intent_parser.to_byte_array()
    loaded_intent_parser = DeterministicIntentParser.from_byte_array(
        intent_parser_bytes, **shared)
    result = loaded_intent_parser.parse("make me two cups of coffee")

    # Then
    self.assertEqual("MakeCoffee", result[RES_INTENT][RES_INTENT_NAME])
def test_should_be_serializable(self, mocked_generate_regexes):
    """Persisting a fitted parser must write metadata.json and
    intent_parser.json with the expected content (regex generation is
    mocked to keep patterns deterministic)."""
    # Given

    # pylint: disable=unused-argument
    def mock_generate_patterns(utterances, joined_entity_utterances,
                               group_names_to_slot_names, language):
        patterns = ["mocked_regex_%s" % i for i in range(len(utterances))]
        group_to_slot = {"group_0": "dummy slot name"}
        return patterns, group_to_slot

    # pylint: enable=unused-argument

    mocked_generate_regexes.side_effect = mock_generate_patterns
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=100)
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 100
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group_0": "dummy slot name"
        },
        "patterns": {
            "dummy_intent_1": [
                "mocked_regex_0",
                "mocked_regex_1",
                "mocked_regex_2",
                "mocked_regex_3"
            ],
            "dummy_intent_2": [
                "mocked_regex_0"
            ]
        },
        "slot_names_to_entities": {
            "dummy_intent_1": {
                "dummy_slot_name": "dummy_entity_1",
                "dummy_slot_name3": "dummy_entity_2",
                "dummy_slot_name2": "dummy_entity_2"
            },
            "dummy_intent_2": {
                "dummy slot nàme": "dummy_entity_1"
            }
        }
    }
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_dict)
def test_should_fit_with_naughty_strings_no_tags(self):
    """Fitting on adversarial utterance text must not raise (pathlib
    variant).

    NOTE(review): a test with this exact name appears earlier in this
    file; within a single class this definition shadows the earlier one —
    confirm both are meant to exist.
    """
    # Given
    naughty_strings_path = TEST_PATH / "resources" / "naughty_strings.txt"
    with naughty_strings_path.open(encoding="utf8") as f:
        # Iterate the file object directly instead of f.readlines():
        # same lines, no intermediate list.
        naughty_strings = [line.strip("\n") for line in f]

    utterances = [{
        DATA: [{
            TEXT: naughty_string
        }]
    } for naughty_string in naughty_strings]

    # When
    naughty_dataset = {
        "intents": {
            "naughty_intent": {
                "utterances": utterances
            }
        },
        "entities": dict(),
        "language": "en",
    }

    # Then
    with self.fail_if_exception("Exception raised"):
        DeterministicIntentParser().fit(naughty_dataset)
def test_should_be_serializable_before_fitting(self):
    """Persisting an unfitted parser must write a dict whose state fields
    are all None while the config is preserved."""
    # Given
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=43,
                                             ignore_stop_words=True)
    parser = DeterministicIntentParser(config=config)

    # When
    parser.persist(self.tmp_file_path)

    # Then
    expected_dict = {
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_pattern_length": 43,
            "ignore_stop_words": True
        },
        "language_code": None,
        "group_names_to_slot_names": None,
        "patterns": None,
        "slot_names_to_entities": None,
        "stop_words_whitelist": None
    }
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.assertJsonContent(self.tmp_file_path / "metadata.json", metadata)
    self.assertJsonContent(self.tmp_file_path / "intent_parser.json",
                           expected_dict)
def test_should_limit_patterns_length(self):
    """Patterns longer than max_pattern_length must be dropped, limiting
    how many regexes each intent keeps.

    NOTE(review): a test with this exact name appears again later in this
    file — within one class the later definition shadows this one.
    """
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- how are you
- hello how are you?
- what's up
---
type: intent
name: my_second_intent
utterances:
- what is the weather today ?
- does it rain
- will it rain tomorrow""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = DeterministicIntentParserConfig(max_queries=1000,
                                             max_pattern_length=25,
                                             ignore_stop_words=False)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    self.assertEqual(2, len(parser.regexes_per_intent["my_first_intent"]))
    self.assertEqual(1, len(parser.regexes_per_intent["my_second_intent"]))
def test_should_limit_nb_queries(self):
    """max_queries must cap the number of regexes kept per intent."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: my_first_intent
utterances:
- this is [slot1:entity1](my first entity)
- this is [slot2:entity2](my second entity)
- this is [slot3:entity3](my third entity)
---
type: intent
name: my_second_intent
utterances:
- this is [slot4:entity4](my fourth entity)""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    config = DeterministicIntentParserConfig(max_queries=2,
                                             max_pattern_length=1000)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    self.assertEqual(len(parser.regexes_per_intent["my_first_intent"]), 2)
    self.assertEqual(len(parser.regexes_per_intent["my_second_intent"]), 1)
def test_should_be_deserializable_without_stop_words(self):
    """A fitted-parser dict lacking a stop_words_whitelist entry must
    deserialize with an empty whitelist."""
    # Given
    parser_dict = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        },
        "patterns": {
            "my_intent": ["(?P<hello_group>hello?)",
                          "(?P<world_group>world$)"]
        },
        "slot_names_to_entities": {
            "my_intent": {
                "hello_slot": "hello_entity",
                "world_slot": "world_entity"
            }
        }
    }
    self.tmp_file_path.mkdir()
    metadata = {"unit_name": "deterministic_intent_parser"}
    self.writeJsonContent(self.tmp_file_path / "intent_parser.json",
                          parser_dict)
    self.writeJsonContent(self.tmp_file_path / "metadata.json", metadata)

    # When
    parser = DeterministicIntentParser.from_path(self.tmp_file_path)

    # Then: build the expected parser field by field
    patterns = {
        "my_intent": ["(?P<hello_group>hello?)",
                      "(?P<world_group>world$)"]
    }
    group_names_to_slot_names = {
        "hello_group": "hello_slot",
        "world_group": "world_slot"
    }
    slot_names_to_entities = {
        "my_intent": {
            "hello_slot": "hello_entity",
            "world_slot": "world_entity"
        }
    }
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=43)
    expected_parser = DeterministicIntentParser(config=config)
    expected_parser.language = LANGUAGE_EN
    expected_parser.group_names_to_slot_names = group_names_to_slot_names
    expected_parser.slot_names_to_entities = slot_names_to_entities
    expected_parser.patterns = patterns
    # pylint:disable=protected-access
    expected_parser._stop_words_whitelist = dict()
    # pylint:enable=protected-access

    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_not_parse_when_not_fitted(self):
    """Calling parse() before fit() must raise NotTrained."""
    # Given
    parser = DeterministicIntentParser()

    # When / Then
    self.assertFalse(parser.fitted)
    with self.assertRaises(NotTrained):
        parser.parse("foobar")
def test_should_get_intents(self):
    """get_intents must rank all intents (plus the None intent) with
    probabilities normalized over 1 + 1/2 + 1/3."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: greeting1
utterances:
- Hello John
---
type: intent
name: greeting2
utterances:
- Hello [name](John)
---
type: intent
name: greeting3
utterances:
- "[greeting](Hello) [name](John)"
""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)

    # When
    top_intents = parser.get_intents("Hello John")

    # Then
    expected_intents = [
        {
            RES_INTENT_NAME: "greeting1",
            RES_PROBA: 1. / (1. + 1. / 2. + 1. / 3.)
        },
        {
            RES_INTENT_NAME: "greeting2",
            RES_PROBA: (1. / 2.) / (1. + 1. / 2. + 1. / 3.)
        },
        {
            RES_INTENT_NAME: "greeting3",
            RES_PROBA: (1. / 3.) / (1. + 1. / 2. + 1. / 3.)
        },
        {
            RES_INTENT_NAME: None,
            RES_PROBA: 0.0
        },
    ]

    def sorting_key(intent_res):
        # The None intent has no name; sort it under a stable sentinel.
        if intent_res[RES_INTENT_NAME] is None:
            return "null"
        return intent_res[RES_INTENT_NAME]

    sorted_expected_intents = sorted(expected_intents, key=sorting_key)
    sorted_intents = sorted(top_intents, key=sorting_key)
    # The best intent must come first; the full set must match regardless
    # of ordering beyond that.
    self.assertEqual(expected_intents[0], top_intents[0])
    self.assertListEqual(sorted_expected_intents, sorted_intents)
def test_should_limit_patterns_length(self):
    """max_pattern_length=300 on SAMPLE_DATASET must keep 4 regexes for
    dummy_intent_1 and 1 for dummy_intent_2.

    NOTE(review): a test with this exact name appears earlier in this
    file — within one class this definition shadows the earlier one.
    """
    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    config = DeterministicIntentParserConfig(max_queries=1000,
                                             max_pattern_length=300)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    self.assertEqual(4, len(parser.regexes_per_intent["dummy_intent_1"]))
    self.assertEqual(1, len(parser.regexes_per_intent["dummy_intent_2"]))
def test_should_be_serializable(self, mocked_generate_regexes):
    """to_dict() on a fitted parser must serialize compiled regexes back
    to their pattern strings (regex generation is mocked).

    NOTE(review): a test with this exact name appears earlier in this
    file — within one class this definition shadows the earlier one.
    """
    # Given

    # pylint: disable=unused-argument
    def mock_generate_regexes(utterances, joined_entity_utterances,
                              group_names_to_slot_names, language):
        regexes = [
            re.compile(r"mocked_regex_%s" % i)
            for i in range(len(utterances))
        ]
        group_to_slot = {"group_0": "dummy slot name"}
        return regexes, group_to_slot

    # pylint: enable=unused-argument

    mocked_generate_regexes.side_effect = mock_generate_regexes
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_entities=100)
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # When
    actual_dict = parser.to_dict()

    # Then
    expected_dict = {
        "unit_name": "deterministic_intent_parser",
        "config": {
            "unit_name": "deterministic_intent_parser",
            "max_queries": 42,
            "max_entities": 100
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "group_0": "dummy slot name"
        },
        "patterns": {
            "dummy_intent_1": [
                "mocked_regex_0",
                "mocked_regex_1",
                "mocked_regex_2",
                "mocked_regex_3"
            ],
            "dummy_intent_2": ["mocked_regex_0"]
        },
        "slot_names_to_entities": {
            "dummy_slot_name": "dummy_entity_1",
            "dummy slot nàme": "dummy_entity_1",
            "dummy_slot_name3": "dummy_entity_2",
            "dummy_slot_name2": "dummy_entity_2"
        }
    }
    self.assertDictEqual(actual_dict, expected_dict)
def test_should_be_serializable_into_bytearray(self):
    """A fitted parser round-tripped through a byte array must still
    classify correctly.

    NOTE(review): several tests in this file share this exact name —
    within one class only the last definition survives.
    """
    # Given
    dataset = BEVERAGE_DATASET
    intent_parser = DeterministicIntentParser().fit(dataset)

    # When
    intent_parser_bytes = intent_parser.to_byte_array()
    loaded_intent_parser = DeterministicIntentParser.from_byte_array(
        intent_parser_bytes)
    result = loaded_intent_parser.parse("make me two cups of coffee")

    # Then
    self.assertEqual("MakeCoffee", result[RES_INTENT][RES_INTENT_NAME])
def test_should_be_deserializable(self):
    """from_dict on a fitted-parser dict must restore language, patterns
    and slot mappings equivalently."""
    # Given
    parser_dict = {
        "config": {
            "max_queries": 42,
            "max_pattern_length": 43
        },
        "language_code": "en",
        "group_names_to_slot_names": {
            "hello_group": "hello_slot",
            "world_group": "world_slot"
        },
        "patterns": {
            "intent_name": [
                "(?P<hello_group>hello?)",
                "(?P<world_group>world$)"
            ]
        },
        "slot_names_to_entities": {
            "hello_slot": "hello_entity",
            "world_slot": "world_entity"
        }
    }

    # When
    parser = DeterministicIntentParser.from_dict(parser_dict)

    # Then
    patterns = {
        "intent_name": [
            "(?P<hello_group>hello?)",
            "(?P<world_group>world$)"
        ]
    }
    group_names_to_slot_names = {
        "hello_group": "hello_slot",
        "world_group": "world_slot"
    }
    slot_names_to_entities = {
        "hello_slot": "hello_entity",
        "world_slot": "world_entity"
    }
    config = DeterministicIntentParserConfig(max_queries=42,
                                             max_pattern_length=43)
    expected_parser = DeterministicIntentParser(config=config)
    expected_parser.language = LANGUAGE_EN
    expected_parser.group_names_to_slot_names = group_names_to_slot_names
    expected_parser.slot_names_to_entities = slot_names_to_entities
    expected_parser.patterns = patterns

    self.assertEqual(parser.to_dict(), expected_parser.to_dict())
def test_should_not_train_intents_too_big(self):
    """Intents exceeding max_queries must end up with no regexes while
    smaller intents are still trained."""
    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    config = DeterministicIntentParserConfig(max_queries=2,
                                             max_entities=200)

    # When
    parser = DeterministicIntentParser(config=config).fit(dataset)

    # Then
    not_fitted_intent = "dummy_intent_1"
    fitted_intent = "dummy_intent_2"
    self.assertGreater(len(parser.regexes_per_intent[fitted_intent]), 0)
    self.assertListEqual(parser.regexes_per_intent[not_fitted_intent], [])
def test_should_parse_naughty_strings(self):
    """Parsing adversarial input strings must never raise."""
    # Given
    dataset = validate_and_format_dataset(SAMPLE_DATASET)
    naughty_strings_path = TEST_PATH / "resources" / "naughty_strings.txt"
    with naughty_strings_path.open(encoding='utf8') as f:
        # Iterate the file object directly instead of f.readlines():
        # same lines, no intermediate list.
        naughty_strings = [line.strip("\n") for line in f]

    # When
    parser = DeterministicIntentParser().fit(dataset)

    # Then
    for s in naughty_strings:
        with self.fail_if_exception("Exception raised"):
            parser.parse(s)
def test_should_parse_stop_words_slots(self):
    """With ignore_stop_words=True, stop words that are also entity values
    must still be extracted as slots (whitelist behavior)."""
    # Given
    dataset_stream = io.StringIO("""
---
type: intent
name: search
utterances:
- search
- search [search_object](this)
- search [search_object](a cat)
---
type: entity
name: search_object
values:
- [this thing, that]
""")
    resources = self.get_resources("en")
    # Force "this"/"that"/"a" to be stop words for the test.
    resources[STOP_WORDS] = {"a", "this", "that"}
    dataset = Dataset.from_yaml_files("en", [dataset_stream]).json
    parser_config = DeterministicIntentParserConfig(ignore_stop_words=True)
    parser = DeterministicIntentParser(config=parser_config,
                                       resources=resources)
    parser.fit(dataset)

    # When
    res_1 = parser.parse("search this")
    res_2 = parser.parse("search that")

    # Then
    expected_intent = intent_classification_result(
        intent_name="search", probability=1.0)
    expected_slots_1 = [
        unresolved_slot(match_range=(7, 11), value="this",
                        entity="search_object",
                        slot_name="search_object")
    ]
    expected_slots_2 = [
        unresolved_slot(match_range=(7, 11), value="that",
                        entity="search_object",
                        slot_name="search_object")
    ]
    self.assertEqual(expected_intent, res_1[RES_INTENT])
    self.assertEqual(expected_intent, res_2[RES_INTENT])
    self.assertListEqual(expected_slots_1, res_1[RES_SLOTS])
    self.assertListEqual(expected_slots_2, res_2[RES_SLOTS])
def test_should_fit_and_parse_with_non_ascii_tags(self):
    """Entities/slots with non-ASCII names must fit and parse without
    errors, and slot extraction must preserve the non-ASCII names."""
    # Given
    inputs = ("string%s" % i for i in range(10))
    utterances = [{
        DATA: [{
            TEXT: string,
            ENTITY: "non_ascìi_entïty",
            SLOT_NAME: "non_ascìi_slöt"
        }]
    } for string in inputs]

    # When
    naughty_dataset = {
        "intents": {
            "naughty_intent": {
                "utterances": utterances
            }
        },
        "entities": {
            "non_ascìi_entïty": {
                "use_synonyms": False,
                "automatically_extensible": True,
                "matching_strictness": 1.0,
                "data": []
            }
        },
        "language": "en",
    }
    naughty_dataset = validate_and_format_dataset(naughty_dataset)

    # Then
    with self.fail_if_exception("Exception raised"):
        parser = DeterministicIntentParser()
        parser.fit(naughty_dataset)
        parsing = parser.parse("string0")

        expected_slot = {
            'entity': 'non_ascìi_entïty',
            'range': {
                "start": 0,
                "end": 7
            },
            'slotName': u'non_ascìi_slöt',
            'value': u'string0'
        }
        intent_name = parsing[RES_INTENT][RES_INTENT_NAME]
        self.assertEqual("naughty_intent", intent_name)
        self.assertListEqual([expected_slot], parsing[RES_SLOTS])
def test_should_get_no_slots_with_none_intent(self):
    """get_slots with intent=None must return an empty slot list."""
    # Given
    slots_dataset_stream = io.StringIO("""
---
type: intent
name: greeting
utterances:
- Hello [name](John)""")
    dataset = Dataset.from_yaml_files("en", [slots_dataset_stream]).json
    parser = DeterministicIntentParser().fit(dataset)

    # When
    slots = parser.get_slots("Hello John", None)

    # Then
    self.assertListEqual([], slots)
def test_should_be_serializable_into_bytearray(self):
    """Byte-array round trip with explicitly provided entity parsers must
    still classify correctly.

    NOTE(review): several tests in this file share this exact name —
    within one class only the last definition survives.
    """
    # Given
    dataset = BEVERAGE_DATASET
    intent_parser = DeterministicIntentParser().fit(dataset)
    custom_entity_parser = intent_parser.custom_entity_parser

    # When
    intent_parser_bytes = intent_parser.to_byte_array()
    loaded_intent_parser = DeterministicIntentParser.from_byte_array(
        intent_parser_bytes,
        builtin_entity_parser=BuiltinEntityParser.build(language="en"),
        custom_entity_parser=custom_entity_parser)
    result = loaded_intent_parser.parse("make me two cups of coffee")

    # Then
    self.assertEqual("MakeCoffee", result[RES_INTENT][RES_INTENT_NAME])
def test_should_get_intent(self):
    """A query matching dummy_intent_1 exactly must be classified with
    probability 1.0."""
    # Given
    dataset = validate_and_format_dataset(self.slots_dataset)
    parser = DeterministicIntentParser().fit(dataset)
    text = "this is a dummy_a query with another dummy_c at 10p.m. or " \
           "at 12p.m."

    # When
    parsing = parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="dummy_intent_1", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])
def test_should_get_intent_when_filter(self):
    """An intents filter (string or list) must restrict classification to
    the allowed intents even when utterances are duplicated."""
    # Given
    dataset = validate_and_format_dataset(
        self.duplicated_utterances_dataset)
    parser = DeterministicIntentParser().fit(dataset)
    text = "Hello world"
    intent_name_1 = "dummy_intent_1"
    intent_name_2 = "dummy_intent_2"

    # When: filter as a bare string, then as a list
    res_1 = parser.parse(text, intent_name_1)
    res_2 = parser.parse(text, [intent_name_2])

    # Then
    self.assertEqual(intent_name_1, res_1[RES_INTENT][RES_INTENT_NAME])
    self.assertEqual(intent_name_2, res_2[RES_INTENT][RES_INTENT_NAME])
def test_should_parse_intent_with_stop_words(self, mock_get_stop_words):
    """With ignore_stop_words=True, extra stop words in the query ("Hey",
    "a") must not prevent an exact intent match."""
    # Given
    mock_get_stop_words.return_value = {"a", "hey"}
    dataset = self.slots_dataset
    config = DeterministicIntentParserConfig(ignore_stop_words=True)
    parser = DeterministicIntentParser(config=config).fit(dataset)
    text = "Hey this is dummy_a query with another dummy_c at 10p.m. or " \
           "at 12p.m."

    # When
    parsing = parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="dummy_intent_1", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])
def test_should_parse_intent_after_deserialization(self):
    """A parser persisted to disk and reloaded (with shared resources)
    must classify exactly like the original."""
    # Given
    dataset = self.slots_dataset
    shared = self.get_shared_data(dataset)
    parser = DeterministicIntentParser(**shared).fit(dataset)
    parser.persist(self.tmp_file_path)
    deserialized_parser = DeterministicIntentParser.from_path(
        self.tmp_file_path, **shared)
    text = "this is a dummy_a query with another dummy_c at 10p.m. or " \
           "at 12p.m."

    # When
    parsing = deserialized_parser.parse(text)

    # Then
    probability = 1.0
    expected_intent = intent_classification_result(
        intent_name="dummy_intent_1", probability=probability)
    self.assertEqual(expected_intent, parsing[RES_INTENT])