def validate_and_format_custom_entity(entity, queries_entities, language):
    """Validate a custom entity dict and build its formatted counterpart.

    The input ``entity`` is checked for the mandatory keys and types, its
    entries are stripped and filtered in place (empty values are dropped),
    and a new formatted dict is returned containing the extensibility flag,
    the capitalization flag and the normalized utterances mapping, merged
    with the entity values observed in the queries.

    NOTE(review): this function mutates ``entity`` (stripped values,
    filtered DATA list) as well as returning the formatted dict.
    """
    validate_type(entity, dict)
    validate_keys(entity, [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA],
                  object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)

    use_synonyms = entity[USE_SYNONYMS]
    formatted_entity = {
        AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE],
    }

    # Validate entry format and drop entries whose value strips to empty
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        entry[VALUE] = entry[VALUE].strip()
        if not entry[VALUE]:
            continue
        validate_type(entry[SYNONYMS], list)
        entry[SYNONYMS] = [syn.strip() for syn in entry[SYNONYMS]
                           if syn.strip()]
        kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization must be computed before normalization, since
    # normalization lowercases and would skew the result
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    # Normalize each value (and its synonyms when enabled) into the
    # utterances mapping
    utterances = dict()
    for entry in entity[DATA]:
        reference_value = entry[VALUE]
        utterances = add_variation_if_needed(
            utterances, reference_value, reference_value, language)
        if use_synonyms:
            for synonym in entry[SYNONYMS]:
                utterances = add_variation_if_needed(
                    utterances, synonym, reference_value, language)
    formatted_entity[UTTERANCES] = utterances

    # Finally merge in the entity values seen in the queries
    for query_value in queries_entities:
        formatted_entity = add_entity_value_if_missing(
            query_value, formatted_entity, language)
    return formatted_entity
def validate_and_format_custom_entity(entity, queries_entities, language):
    """Validate a custom entity dict and return its formatted version.

    Checks mandatory keys and value types, cleans the entity data in place
    (stripping values/synonyms and discarding empty values), then produces
    a formatted dict holding the extensibility flag, the capitalization
    flag, the normalized utterances mapping and any entity values found in
    the queries that were missing.

    NOTE(review): ``entity`` and its entries are mutated in place in
    addition to the returned dict.
    """
    validate_type(entity, dict)
    validate_keys(entity, [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA],
                  object_label="entity")
    # Type-check the three mandatory fields in the same order as the keys
    for key, expected_type in ((USE_SYNONYMS, bool),
                               (AUTOMATICALLY_EXTENSIBLE, bool),
                               (DATA, list)):
        validate_type(entity[key], expected_type)

    formatted = dict()
    formatted[AUTOMATICALLY_EXTENSIBLE] = entity[AUTOMATICALLY_EXTENSIBLE]
    synonyms_enabled = entity[USE_SYNONYMS]

    # Strip entries in place; keep only those with a non-empty value
    cleaned = []
    for item in entity[DATA]:
        validate_type(item, dict)
        validate_keys(item, [VALUE, SYNONYMS], object_label="entity entry")
        item[VALUE] = item[VALUE].strip()
        if item[VALUE]:
            validate_type(item[SYNONYMS], list)
            item[SYNONYMS] = [token.strip() for token in item[SYNONYMS]
                              if len(token.strip()) > 0]
            cleaned.append(item)
    entity[DATA] = cleaned

    # Capitalization is derived before normalization, which lowercases
    # and would otherwise corrupt the computation
    formatted[CAPITALIZE] = has_any_capitalization(queries_entities, language)

    # Normalize: the canonical value first, then its synonyms when enabled
    utterances = dict()
    for item in entity[DATA]:
        canonical = item[VALUE]
        candidates = [canonical]
        if synonyms_enabled:
            candidates.extend(item[SYNONYMS])
        for candidate in candidates:
            utterances = add_variation_if_needed(
                utterances, candidate, canonical, language)
    formatted[UTTERANCES] = utterances

    # Merge the values observed in the queries
    for seen_value in queries_entities:
        formatted = add_entity_value_if_missing(seen_value, formatted,
                                                language)
    return formatted
def validate_and_format_intent(intent, entities):
    """Validate the structure of an intent dict and return it unchanged.

    Every utterance must be a dict with a DATA list of chunks; every chunk
    must carry a TEXT key. A chunk that is tagged (has ENTITY or SLOT_NAME)
    must carry both keys, and any non-builtin entity it references must be
    declared in ``entities``.

    Raises through the ``validate_*`` helpers when a check fails.
    """
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list)
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list)
        for chunk in utterance[DATA]:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            # Untagged text chunks need no further checks
            if ENTITY not in chunk and SLOT_NAME not in chunk:
                continue
            validate_keys(chunk, [ENTITY, SLOT_NAME], object_label="chunk")
            # Builtin entities are implicitly known; custom ones must be
            # declared in the dataset's entities
            if not is_builtin_entity(chunk[ENTITY]):
                validate_key(entities, chunk[ENTITY], object_label=ENTITIES)
    return intent
def validate_and_format_custom_entity(entity, queries_entities, language,
                                      builtin_entity_parser):
    """Validate a custom entity dict and build its formatted counterpart.

    Performs type/key validation, cleans the entity data in place, then
    returns a formatted dict containing the extensibility flag, the
    matching strictness, the capitalization flag and an utterances mapping
    from every value, synonym and non-colliding string variation to its
    reference value, finally merged with the entity values seen in queries.

    NOTE(review): ``entity`` and its entries are mutated in place (strips,
    filtered DATA list, injected MATCHING_STRICTNESS).
    """
    validate_type(entity, dict)

    # TODO: this is here temporarily, only to allow backward compatibility
    # with the legacy "parser_threshold" field
    if MATCHING_STRICTNESS not in entity:
        entity[MATCHING_STRICTNESS] = entity.get("parser_threshold", 1.0)

    validate_keys(
        entity,
        [USE_SYNONYMS, AUTOMATICALLY_EXTENSIBLE, DATA, MATCHING_STRICTNESS],
        object_label="entity")
    validate_type(entity[USE_SYNONYMS], bool)
    validate_type(entity[AUTOMATICALLY_EXTENSIBLE], bool)
    validate_type(entity[DATA], list)
    validate_type(entity[MATCHING_STRICTNESS], float)

    use_synonyms = entity[USE_SYNONYMS]
    formatted_entity = {
        AUTOMATICALLY_EXTENSIBLE: entity[AUTOMATICALLY_EXTENSIBLE],
        MATCHING_STRICTNESS: entity[MATCHING_STRICTNESS],
    }

    # Validate entry format and drop entries whose value strips to empty
    kept_entries = []
    for entry in entity[DATA]:
        validate_type(entry, dict)
        validate_keys(entry, [VALUE, SYNONYMS], object_label="entity entry")
        entry[VALUE] = entry[VALUE].strip()
        if not entry[VALUE]:
            continue
        validate_type(entry[SYNONYMS], list)
        entry[SYNONYMS] = [syn.strip() for syn in entry[SYNONYMS]
                           if syn.strip()]
        kept_entries.append(entry)
    entity[DATA] = kept_entries

    # Capitalization must be computed before normalization, since
    # normalization lowercases and would skew the result
    formatted_entity[CAPITALIZE] = has_any_capitalization(
        queries_entities, language)

    # Map each raw value (and, when enabled, each synonym) onto its
    # reference value; first mapping wins
    validated_utterances = dict()
    for entry in entity[DATA]:
        reference = entry[VALUE]
        if not reference:
            continue
        validated_utterances[reference] = reference
        if not use_synonyms:
            continue
        for synonym in entry[SYNONYMS]:
            if synonym and synonym not in validated_utterances:
                validated_utterances[synonym] = reference

    # Generate string variations per value, then keep only those that
    # neither collide with an original value nor are generated by more
    # than one value
    all_original_values = _extract_entity_values(entity)
    variations = dict()
    for entry in entity[DATA]:
        reference = entry[VALUE]
        seeds = {reference}
        if use_synonyms:
            seeds.update(set(entry[SYNONYMS]))
        variations[reference] = {
            variation
            for seed in seeds
            for variation in get_string_variations(
                seed, language, builtin_entity_parser)
        }
    variation_counter = Counter(
        [v for var_set in itervalues(variations) for v in var_set])
    non_colliding_variations = {
        value: [v for v in var_set
                if v not in all_original_values
                and variation_counter[v] == 1]
        for value, var_set in iteritems(variations)
    }
    for entry in entity[DATA]:
        validated_utterances = add_entity_variations(
            validated_utterances, non_colliding_variations, entry[VALUE])

    # Merge the entity values seen in the queries, along with their
    # variations, without overriding existing mappings
    queries_entities_variations = {
        ent: get_string_variations(ent, language, builtin_entity_parser)
        for ent in queries_entities
    }
    for original_ent, ent_variations in iteritems(
            queries_entities_variations):
        if not original_ent or original_ent in validated_utterances:
            continue
        validated_utterances[original_ent] = original_ent
        for variation in ent_variations:
            if variation and variation not in validated_utterances:
                validated_utterances[variation] = original_ent

    formatted_entity[UTTERANCES] = validated_utterances
    return formatted_entity