def validate_and_format_dataset(dataset):
    """Check that the dataset is valid and return a formatted copy of it.

    The input object is never modified: all validation and formatting is
    applied to a copy obtained through a JSON round-trip. A dict that is
    already flagged with the ``VALIDATED`` key is returned as is, which
    makes this function idempotent.

    Args:
        dataset: the dataset to validate. It must be JSON-serializable and,
            once round-tripped, be a dict holding the ``INTENTS``,
            ``ENTITIES`` and ``LANGUAGE`` keys.

    Returns:
        dict: the validated dataset, with every entity replaced by the
        result of its ``validate_and_format_*_entity`` helper and the
        ``VALIDATED`` key set to True. When the input was already flagged
        as validated, the input object itself is returned.

    Raises:
        ValueError: if the language is not one of ``get_all_languages()``.
        TypeError: if the dataset is not JSON-serializable (raised by
            ``json.dumps``).
        Any error raised by the ``validate_*`` helpers propagates unchanged.
    """
    # Make this function idempotent. The isinstance guard keeps a non-dict
    # input from failing here with a bare AttributeError on ``.get``, so that
    # it reaches the explicit type validation below instead.
    if isinstance(dataset, dict) and dataset.get(VALIDATED, False):
        return dataset

    # The JSON round-trip both normalizes the content to plain JSON types and
    # produces a fully independent copy, so the caller's object is left
    # untouched without needing a separate deep copy.
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)

    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)

    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)

    # Intents are checked first: the entity formatting below relies on the
    # entity values extracted from the intents' utterances.
    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])

    queries_entities_values = extract_queries_entities(dataset)

    # Only values of existing keys are reassigned, so the dict size does not
    # change while it is being iterated.
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, queries_entities)
        else:
            dataset[ENTITIES][entity_name] = validate_and_format_custom_entity(
                entity, queries_entities, language)

    dataset[VALIDATED] = True
    return dataset
def validate_and_format_intent(intent, entities):
    """Check the structure of an intent and return it.

    The intent must be a dict with an ``UTTERANCES`` list. Each utterance
    must be a dict with a ``DATA`` list, and each chunk of that list must be
    a dict with a ``TEXT`` key. A chunk carrying ``ENTITY`` or ``SLOT_NAME``
    must carry both, and its entity must either be a builtin entity or be a
    key of ``entities``.

    Args:
        intent: the intent to check.
        entities: the entities declared in the dataset, looked up by name.

    Returns:
        The ``intent`` object that was passed in.

    Raises:
        Any error raised by the ``validate_*`` helpers propagates unchanged.
    """
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    utterances = intent[UTTERANCES]
    validate_type(utterances, list)

    for utterance in utterances:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        chunks = utterance[DATA]
        validate_type(chunks, list)

        for chunk in chunks:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")

            # Plain text chunks carry no slot information: nothing more to
            # check for them.
            if ENTITY not in chunk and SLOT_NAME not in chunk:
                continue

            # A slot chunk needs both its entity and its slot name.
            validate_keys(chunk, [ENTITY, SLOT_NAME], object_label="chunk")

            # Builtin entities are not looked up in ``entities``; any other
            # entity must be one of its keys.
            entity_name = chunk[ENTITY]
            if not is_builtin_entity(entity_name):
                validate_key(entities, entity_name, object_label=ENTITIES)

    return intent