def test_unknown_entity_should_raise_exception(self):
    """Validation must reject an utterance chunk tagged with an entity
    that is absent from the dataset's "entities" section."""
    # Given: a slot chunk referencing "unknown_entity", which is not
    # declared among the dataset entities (only "entity1" is)
    unknown_chunk = {
        "text": "unknown entity",
        "entity": "unknown_entity",
        "slot_name": "unknown_entity_slot"
    }
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [{"data": [unknown_chunk]}]
            }
        },
        "entities": {
            "entity1": {
                "data": [],
                "use_synonyms": True,
                "automatically_extensible": False
            }
        },
        "language": "en",
    }

    # When / Then
    with self.assertRaises(DatasetFormatError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(
        "Expected entities to have key: 'unknown_entity'",
        str(ctx.exception.args[0]))
def test_should_not_require_data_for_builtin_entities(self):
    """A builtin entity may be declared with an empty config dict: the
    "data" key must not be required for it."""
    # Given: the builtin datetime entity is used in an utterance but
    # declared with an empty dict (no "data" key)
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [
                    {
                        "data": [
                            {"text": "this is "},
                            {
                                "text": "10p.m",
                                "entity": SNIPS_DATETIME,
                                "slot_name": "startTime"
                            }
                        ]
                    }
                ]
            }
        },
        "entities": {SNIPS_DATETIME: {}},
        "language": "en",
    }

    # When / Then: validation must not raise
    with self.fail_if_exception("Could not validate dataset"):
        validate_and_format_dataset(dataset)
def test_validate_should_be_idempotent(self):
    """Validating an already-validated dataset must be a no-op.

    The second call to validate_and_format_dataset must return a dict
    equal to the first result, and the result must carry the VALIDATED
    flag so downstream code can detect it was processed.
    """
    # Given: a small YAML dataset with one intent and one custom entity
    dataset_stream = io.StringIO("""
# getWeather Intent
---
type: intent
name: getWeather
utterances:
  - what is the weather in [weatherLocation:location](Paris)?
  - is it raining in [weatherLocation] [weatherDate:snips/datetime]

# Location Entity
---
type: entity
name: location
automatically_extensible: true
values:
- [new york, big apple]
- london""")
    dataset = Dataset.from_yaml_files("en", [dataset_stream])
    validated_dataset = validate_and_format_dataset(dataset)

    # When: validating the already-validated dataset a second time
    validated_dataset_2 = validate_and_format_dataset(validated_dataset)

    # Then: the result is unchanged and flagged as validated
    self.assertDictEqual(validated_dataset, validated_dataset_2)
    self.assertTrue(validated_dataset.get(VALIDATED, False))
def test_missing_intent_key_should_raise_exception(self):
    """A slot chunk that carries an "entity" but no "slot_name" key
    must be rejected by validation."""
    # Given: an utterance chunk missing the mandatory "slot_name" key
    chunk_without_slot_name = {
        "text": "unknown entity",
        "entity": "unknown_entity"
    }
    dataset = {
        "intents": {
            "intent1": {
                "utterances": [{"data": [chunk_without_slot_name]}]
            }
        },
        "entities": {},
        "language": "en",
    }

    # When / Then
    with self.assertRaises(DatasetFormatError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(
        "Expected chunk to have key: 'slot_name'",
        str(ctx.exception.args[0]))
def test_invalid_language_should_raise_exception(self):
    """An unsupported language code must be rejected by validation."""
    # Given: "eng" is not among the supported language codes
    dataset = {
        "intents": {},
        "entities": {},
        "language": "eng",
    }

    # When / Then
    with self.assertRaises(DatasetFormatError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(
        "Unknown language: 'eng'",
        str(ctx.exception.args[0]))
def test_should_support_int_or_float_for_matching_strictness(self):
    """"matching_strictness" may be provided either as a float or as an
    int, and validation must keep the value as given."""

    # Local factory: a minimal custom entity with the given strictness
    def make_entity(strictness):
        return {
            "data": [],
            "automatically_extensible": False,
            "use_synonyms": True,
            "matching_strictness": strictness
        }

    # Given: one entity with a float strictness, one with an int
    dataset = {
        "intents": {},
        "entities": {
            "entity1": make_entity(0.5),
            "entity2": make_entity(1)
        },
        "language": "en",
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then: both values survive validation unchanged
    self.assertEqual(
        0.5, dataset["entities"]["entity1"].get("matching_strictness"))
    self.assertEqual(
        1, dataset["entities"]["entity2"].get("matching_strictness"))
def test_missing_entity_key_should_raise_exception(self):
    """A custom entity lacking the mandatory "use_synonyms" key must be
    rejected by validation."""
    # Given: entity1 omits "use_synonyms"
    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [],
                "automatically_extensible": False,
                "matching_strictness": 1.0
            }
        },
        "language": "en",
    }

    # When / Then
    with self.assertRaises(DatasetFormatError) as ctx:
        validate_and_format_dataset(dataset)
    self.assertEqual(
        "Expected custom entity to have key: 'use_synonyms'",
        str(ctx.exception.args[0]))
def test_should_keep_license_info(self):
    """Entity "license_info" must be carried over, untouched, into the
    validated dataset."""
    # Given: a custom entity carrying license metadata
    license_info = {
        "filename": "LICENSE",
        "content": "some license content here"
    }
    dataset = {
        "intents": {},
        "entities": {
            "my_entity": {
                "data": [{"value": "foo", "synonyms": []}],
                "use_synonyms": True,
                "automatically_extensible": True,
                "matching_strictness": 1.0,
                "license_info": license_info
            },
        },
        "language": "en"
    }

    # When
    validated_dataset = validate_and_format_dataset(dataset)

    # Then: the formatted entity still contains the license metadata
    # (expected uses a copy so input and expectation stay independent)
    expected_dataset = {
        "entities": {
            "my_entity": {
                "automatically_extensible": True,
                "capitalize": False,
                "matching_strictness": 1.0,
                "utterances": {
                    "Foo": "foo",
                    "foo": "foo"
                },
                "license_info": dict(license_info)
            }
        },
        "intents": {},
        "language": "en",
        "validated": True
    }
    self.assertDictEqual(expected_dataset, validated_dataset)
def test_missing_matching_strictness_should_be_handled(self):
    """An entity missing "matching_strictness" gets a default of 1.0.

    TODO: temporary test — remove it once backward compatibility with
    the previous dataset format (without "matching_strictness") gets
    deprecated.
    """
    # Given: entity1 omits "matching_strictness"
    dataset = {
        "intents": {},
        "entities": {
            "entity1": {
                "data": [],
                "automatically_extensible": False,
                "use_synonyms": True
            }
        },
        "language": "en",
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then: the default value 1.0 has been filled in
    self.assertEqual(
        1.0, dataset["entities"]["entity1"].get("matching_strictness"))
def test_should_not_avoid_synomyms_variations_collision(self):
    """String variations generated for synonyms of different entity
    values may collide; each variation must still resolve to exactly
    one value in the formatted utterances map."""
    # Given: "favorïte" (synonym of "a") and "favorite" (synonym of
    # "b") generate overlapping variations
    dataset = {
        "intents": {
            "dummy_but_tricky_intent": {
                "utterances": [
                    {
                        "data": [
                            {
                                "text": "dummy_value",
                                "entity": "dummy_but_tricky_entity",
                                "slot_name": "dummy_but_tricky_slot"
                            }
                        ]
                    }
                ]
            }
        },
        "entities": {
            "dummy_but_tricky_entity": {
                "data": [
                    {"value": "a", "synonyms": ["favorïte"]},
                    {"value": "b", "synonyms": ["favorite"]}
                ],
                "use_synonyms": True,
                "automatically_extensible": False,
                "matching_strictness": 1.0
            }
        },
        "language": "en",
    }

    # When
    dataset = validate_and_format_dataset(dataset)

    # Then: "favorite" maps to "b" while "favorïte"/"Favorïte" map to
    # "a" — the variations are kept distinct, not merged
    entity = dataset["entities"]["dummy_but_tricky_entity"]
    expected_utterances = {
        "A": "a",
        "B": "b",
        "DummyValue": "dummy_value",
        "Dummy_Value": "dummy_value",
        "Favorïte": "a",
        "a": "a",
        "b": "b",
        "dummy_value": "dummy_value",
        "dummyvalue": "dummy_value",
        "favorite": "b",
        "favorïte": "a"
    }
    self.assertDictEqual(expected_utterances, entity["utterances"])
def test_should_not_collapse_utterance_entity_variations(self):
    """Slot values that are normalization variants of each other ("9"
    vs "nine") must all be kept as distinct entity utterances."""

    # Local factory: an utterance "hello <value>" with the value tagged
    # as the "expected" slot/entity (fresh dicts on every call)
    def make_utterance(value):
        return {
            "data": [
                {"text": "hello "},
                {
                    "text": value,
                    "slot_name": "expected",
                    "entity": "expected"
                }
            ]
        }

    # Given: two utterances whose slot values differ ("9" and "nine")
    dataset = {
        "language": "en",
        "intents": {
            "verify_length": {
                "utterances": [make_utterance("9"), make_utterance("nine")]
            }
        },
        "entities": {
            "expected": {
                "automatically_extensible": True,
                "use_synonyms": True,
                "data": [],
                "matching_strictness": 1.0
            }
        }
    }

    # When
    validated_dataset = validate_and_format_dataset(dataset)

    # Then: the intent utterances are untouched and the entity keeps
    # "9" and "nine" (plus the case variation) as separate utterances
    expected_dataset = {
        "language": "en",
        "intents": {
            "verify_length": {
                "utterances": [make_utterance("9"), make_utterance("nine")]
            }
        },
        "entities": {
            "expected": {
                "automatically_extensible": True,
                "matching_strictness": 1.0,
                "capitalize": False,
                "utterances": {
                    "nine": "nine",
                    "Nine": "nine",
                    "9": "9"
                }
            }
        },
        "validated": True
    }
    self.assertDictEqual(expected_dataset, validated_dataset)
"Entity_1": "Entity_1", "entity_1": "Entity_1", "entity 2": "Entity_1", "Entity 2": "Entity_1", }, "automatically_extensible": False, "capitalize": False, "matching_strictness": 1.0 } }, "language": "en", "validated": True } # When dataset = validate_and_format_dataset(dataset) # Then self.assertDictEqual(expected_dataset, dataset) @mock.patch("snips_nlu.dataset.validation.get_string_variations") def test_should_format_dataset_by_adding_entity_values( self, mocked_get_string_variations): # Given # pylint: disable=unused-argument def mock_get_string_variations( string, language, builtin_entity_parser, numbers=True, case=True, and_=True, punctuation=True ): return {string, string.title()}