def resolve_slots(input, slots, dataset_entities, language, scope):
    """Resolve raw slot matches into final slot values.

    Builtin slots are paired with the builtin entities detected in *input*;
    custom slots are resolved against the dataset entity utterances.

    Args:
        input (str): the full input text the slots were extracted from
        slots (list of dict): raw slots (entity, value, match range)
        dataset_entities (dict): entity data from the validated dataset
        language (str): language code used by the builtin entity parser
        scope (list): builtin entity kinds to look for in *input*

    Returns:
        list of dict: the resolved slots; custom slots belonging to a
        non-extensible entity with no matching utterance are dropped
    """
    builtin_entities = get_builtin_entities(input, language, scope)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            # First, try to pair the slot with a builtin entity found at the
            # exact same character range in the full input
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                # Fallback: re-run the builtin parser on the raw slot text
                # only, restricted to this entity kind
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name])
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            if raw_value in entity[UTTERANCES]:
                # Exact utterance match -> mapped reference value
                resolved_value = entity[UTTERANCES][raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                # Unknown value accepted as-is for extensible entities
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None
            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
def get_slots(self, text):
    """Extracts slots from the provided text

    Args:
        text (str): input sentence to tag

    Returns:
        list of dict: The list of extracted slots

    Raises:
        NotTrained: When the slot filler is not fitted
    """
    if not self.fitted:
        raise NotTrained("CRFSlotFiller must be fitted")
    tokens = tokenize(text, self.language)
    if not tokens:
        return []
    features = self.compute_features(tokens)
    # One tag per token, predicted by the CRF model then decoded back to
    # the tagging-scheme representation
    tags = [_decode_tag(tag) for tag in
            self.crf_model.predict_single(features)]
    slots = tags_to_slots(text, tokens, tags, self.config.tagging_scheme,
                          self.slot_name_mapping)
    builtin_slots_names = set(slot_name for (slot_name, entity)
                              in iteritems(self.slot_name_mapping)
                              if is_builtin_entity(entity))
    if not builtin_slots_names:
        return slots
    # Replace tags corresponding to builtin entities by outside tags
    tags = _replace_builtin_tags(tags, builtin_slots_names)
    # Builtin slots are re-extracted with the builtin entity parser
    return self._augment_slots(text, tokens, tags, builtin_slots_names)
def validate_and_format_dataset(dataset):
    """Checks that the dataset is valid and format it

    Args:
        dataset (dict): dataset with INTENTS, ENTITIES and LANGUAGE keys

    Returns:
        dict: a deep-copied, validated and formatted dataset, marked with
        the VALIDATED flag so repeated calls are no-ops

    Raises:
        ValueError: when the language is unknown
    """
    # Make this function idempotent
    if dataset.get(VALIDATED, False):
        return dataset
    # The JSON round-trip already produces a deep copy of the input (and
    # normalizes it to plain JSON types), so the previous extra deepcopy()
    # call was redundant work and has been removed
    dataset = json.loads(json.dumps(dataset))
    validate_type(dataset, dict)
    mandatory_keys = [INTENTS, ENTITIES, LANGUAGE]
    for key in mandatory_keys:
        validate_key(dataset, key, object_label="dataset")
    validate_type(dataset[ENTITIES], dict)
    validate_type(dataset[INTENTS], dict)
    language = dataset[LANGUAGE]
    validate_type(language, str)
    if language not in get_all_languages():
        raise ValueError("Unknown language: '%s'" % language)
    for intent in itervalues(dataset[INTENTS]):
        validate_and_format_intent(intent, dataset[ENTITIES])
    queries_entities_values = extract_queries_entities(dataset)
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        queries_entities = queries_entities_values[entity_name]
        if is_builtin_entity(entity_name):
            dataset[ENTITIES][entity_name] = \
                validate_and_format_builtin_entity(entity, queries_entities)
        else:
            dataset[ENTITIES][entity_name] = \
                validate_and_format_custom_entity(entity, queries_entities,
                                                  language)
    dataset[VALIDATED] = True
    return dataset
def json(self):
    """Return the dataset in JSON format (language, intents, entities).

    Entities coming from the intent datasets are merged into the declared
    entities; for custom entities only utterance values that are not
    already known are appended.
    """
    intent_datasets_json = {d.intent_name: d.json
                            for d in self.intent_datasets}
    intents = {
        intent_name: {"utterances": dataset_json["utterances"]}
        for intent_name, dataset_json in iteritems(intent_datasets_json)
    }
    ents = deepcopy(self.entities)
    ents_values = dict()
    for entity_name, entity in iteritems(self.entities):
        ents_values[entity_name] = set(a.value for a in entity.utterances)
        if entity.use_synonyms:
            ents_values[entity_name].update(
                set(t for s in entity.utterances for t in s.synonyms))
    for dataset in self.intent_datasets:
        for ent_name, ent in iteritems(dataset.entities):
            if ent_name not in ents:
                ents[ent_name] = ent
            elif not is_builtin_entity(ent_name):
                # Fix: the previous code tested `u.value not in ents_values`,
                # i.e. against the dict of entity *names*, which was almost
                # always true and duplicated utterances. We must test against
                # the set of known values for this particular entity, and
                # record newly added values to keep deduplicating.
                known_values = ents_values.setdefault(ent_name, set())
                for u in ent.utterances:
                    if u.value not in known_values:
                        ents[ent_name].utterances.append(u)
                        known_values.add(u.value)
    ents = {entity_name: entity.json
            for entity_name, entity in iteritems(ents)}
    return dict(language=self.language, intents=intents, entities=ents)
def json(self):
    """Return the dataset in JSON format (language, intents, entities).

    Entities coming from the intent datasets are merged into the declared
    entities; for custom entities only utterance values that are not
    already known are appended.
    """
    intent_datasets_json = {d.intent_name: d.json
                            for d in self.intent_datasets}
    intents = {
        intent_name: {"utterances": dataset_json["utterances"]}
        for intent_name, dataset_json in iteritems(intent_datasets_json)
    }
    ents = deepcopy(self.entities)
    ents_values = dict()
    for entity_name, entity in iteritems(self.entities):
        ents_values[entity_name] = set(a.value for a in entity.utterances)
        if entity.use_synonyms:
            ents_values[entity_name].update(
                set(t for s in entity.utterances for t in s.synonyms))
    for dataset in self.intent_datasets:
        for ent_name, ent in iteritems(dataset.entities):
            if ent_name not in ents:
                ents[ent_name] = ent
            elif not is_builtin_entity(ent_name):
                # Fix: the previous code tested `u.value not in ents_values`,
                # i.e. against the dict of entity *names*, which was almost
                # always true and duplicated utterances. We must test against
                # the set of known values for this particular entity, and
                # record newly added values to keep deduplicating.
                known_values = ents_values.setdefault(ent_name, set())
                for u in ent.utterances:
                    if u.value not in known_values:
                        ents[ent_name].utterances.append(u)
                        known_values.add(u.value)
    ents = {entity_name: entity.json
            for entity_name, entity in iteritems(ents)}
    return dict(language=self.language, intents=intents, entities=ents)
def add_unknown_word_to_utterances(augmented_utterances, replacement_string,
                                   unknown_word_prob, random_state):
    """Randomly substitute words of custom-entity chunks in place.

    Each word of a custom (non-builtin) entity chunk is replaced by
    *replacement_string* with probability *unknown_word_prob*, using
    *random_state* for reproducibility. The input list is mutated and
    returned.
    """
    for utterance in augmented_utterances:
        for chunk in utterance[DATA]:
            is_custom_chunk = (ENTITY in chunk
                               and not is_builtin_entity(chunk[ENTITY]))
            if is_custom_chunk and random_state.rand() < unknown_word_prob:
                chunk[TEXT] = WORD_REGEX.sub(replacement_string, chunk[TEXT])
    return augmented_utterances
def remove_builtin_slots(dataset):
    """Return a deep copy of *dataset* stripped of builtin-entity chunks.

    Utterance chunks tagged with a builtin entity are dropped from every
    intent; the original dataset is left untouched.
    """
    filtered_dataset = deepcopy(dataset)
    for intent_data in itervalues(filtered_dataset[INTENTS]):
        for utterance in intent_data[UTTERANCES]:
            kept_chunks = []
            for chunk in utterance[DATA]:
                if ENTITY in chunk and is_builtin_entity(chunk[ENTITY]):
                    continue
                kept_chunks.append(chunk)
            utterance[DATA] = kept_chunks
    return filtered_dataset
def entities(self):
    """Return all entities, in JSON format, built from the dataset slots.

    The first slot seen for an entity creates it; subsequent slots on the
    same custom entity contribute their text as extra utterances.
    """
    collected = dict()
    for slot in self.slots:
        if slot.entity not in collected:
            collected[slot.entity] = self.mk_entity(slot)
        elif not is_builtin_entity(slot.entity):
            collected[slot.entity].utterances.append(
                EntityUtterance(slot.text))
    return collected
def _get_utterances_to_features_names(dataset, language):
    """Map each custom-entity utterance to its set of feature names.

    Builtin entities are ignored; an utterance shared by several custom
    entities accumulates one feature name per entity.
    """
    mapping = defaultdict(set)
    for name, data in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(name):
            continue
        for utterance in data[UTTERANCES]:
            mapping[utterance].add(_entity_name_to_feature(name, language))
    return dict(mapping)
def extract_queries_entities(dataset):
    """Collect, per entity, the raw chunk texts used in intent utterances.

    Every entity of the dataset gets an entry (possibly empty); only
    custom (non-builtin) entity chunks contribute values.
    """
    entities_values = {name: [] for name in dataset[ENTITIES]}
    for intent in itervalues(dataset[INTENTS]):
        for query in intent[UTTERANCES]:
            for chunk in query[DATA]:
                if ENTITY not in chunk:
                    continue
                if is_builtin_entity(chunk[ENTITY]):
                    continue
                entities_values[chunk[ENTITY]].append(chunk[TEXT])
    return {name: list(values)
            for name, values in iteritems(entities_values)}
def create_entity(entity_name, utterances=None, automatically_extensible=True,
                  use_synonyms=True):
    """Factory returning a BuiltinEntity or a CustomEntity for *entity_name*.

    For custom entities, *utterances* defaults to an empty list (a fresh
    list per call — never a shared mutable default).
    """
    if is_builtin_entity(entity_name):
        return BuiltinEntity(entity_name)
    entity_utterances = [] if utterances is None else utterances
    return CustomEntity(entity_name, entity_utterances,
                        automatically_extensible, use_synonyms)
def _get_utterances_to_features_names(dataset, language):
    """Build a dict mapping custom-entity utterances to feature names."""
    utterances_to_features = defaultdict(set)
    # Only custom entities contribute features
    custom_items = ((name, data) for name, data
                    in iteritems(dataset[ENTITIES])
                    if not is_builtin_entity(name))
    for name, data in custom_items:
        for utterance in data[UTTERANCES]:
            utterances_to_features[utterance].add(
                _entity_name_to_feature(name, language))
    return dict(utterances_to_features)
def get_intent_custom_entities(dataset, intent):
    """Return the custom entities referenced by the utterances of *intent*.

    Builtin entities are filtered out; values come straight from the
    dataset's ENTITIES section.
    """
    referenced = set()
    for utterance in dataset[INTENTS][intent][UTTERANCES]:
        for chunk in utterance[DATA]:
            if ENTITY in chunk:
                referenced.add(chunk[ENTITY])
    return {name: dataset[ENTITIES][name]
            for name in referenced if not is_builtin_entity(name)}
def _get_joined_entity_utterances(dataset, language):
    """Build, per entity, a regex alternation of its escaped utterances.

    Builtin entities use their placeholder name; patterns are sorted
    longest-first so the regex engine prefers the longest match.
    """
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            utterances = [_get_builtin_entity_name(entity_name, language)]
        else:
            utterances = list(entity[UTTERANCES])
        escaped = [regex_escape(u) for u in utterances]
        non_empty = [p for p in escaped if p]
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(non_empty, key=len, reverse=True))
    return joined_entity_utterances
def _is_trainable(self, intent, dataset):
    """Tell whether *intent* is small enough to be trained on.

    Rejects intents with too many utterances, or whose custom entities
    carry too many values in total (per the config thresholds).
    """
    if len(intent[UTTERANCES]) >= self.config.max_queries:
        return False
    referenced_entities = set()
    for query in intent[UTTERANCES]:
        for chunk in query[DATA]:
            if ENTITY in chunk:
                referenced_entities.add(chunk[ENTITY])
    total_entities = 0
    for name in referenced_entities:
        if is_builtin_entity(name):
            continue
        total_entities += len(dataset[ENTITIES][name][UTTERANCES])
    return total_entities <= self.config.max_entities
def parse_entity(self, msg, intent, slot):
    """Resolve a slot value from *msg* for the given (intent, slot) pair.

    Falls back to returning *msg* unchanged when the slot is unknown, is
    not a builtin entity, or the builtin parser finds nothing.
    """
    entity_label = self._engine._dataset_metadata[
        'slot_name_mappings'].get(intent, {}).get(slot)
    # TODO try to find a way to retrieve multiple slot values, that's a hard one
    # May be we can try matching on _dataset_metadata['entities']
    # Fix: guard against a missing mapping — `entity_label` is None when the
    # (intent, slot) pair is unknown, and passing None to is_builtin_entity
    # would fail (the sibling implementation of this method has this guard)
    if entity_label and is_builtin_entity(entity_label):
        parsed = self._entity_parser.parse(msg)
        if parsed:
            return get_entity_value(parsed[0]['entity'], msg)
    return msg
def get_entities_iterators(intent_entities, language, random_state):
    """Build an endless iterator of values for each intent entity.

    Values are shuffled with *random_state*; for builtin entities the
    canonical examples are placed ahead of the shuffled values.
    """
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        shuffled_values = random_state.permutation(
            list(entity[UTTERANCES]))
        if is_builtin_entity(entity_name):
            examples = get_builtin_entity_examples(entity_name, language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            values = examples + list(shuffled_values)
        else:
            values = shuffled_values
        entities_its[entity_name] = cycle(values)
    return entities_its
def _get_dataset_metadata(dataset):
    """Extract the metadata persisted alongside a trained engine.

    Keeps the language code, the custom entities (without their CAPITALIZE
    flag) and the slot name mappings; builtin entities are omitted.
    """
    entities = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        if is_builtin_entity(entity_name):
            continue
        entity_copy = deepcopy(entity)
        entity_copy.pop(CAPITALIZE)
        entities[entity_name] = entity_copy
    return {
        "language_code": dataset[LANGUAGE],
        "entities": entities,
        "slot_name_mappings": get_slot_name_mappings(dataset),
    }
def get_entities_iterators(intent_entities, language,
                           add_builtin_entities_examples, random_state):
    """Build an endless, shuffled value iterator for each intent entity.

    When *add_builtin_entities_examples* is true, builtin entities get
    their canonical examples prepended to the shuffled values.
    """
    entities_its = dict()
    for entity_name, entity in iteritems(intent_entities):
        shuffled = random_state.permutation(list(entity[UTTERANCES]))
        if add_builtin_entities_examples and is_builtin_entity(entity_name):
            examples = get_builtin_entity_examples(entity_name, language)
            # Builtin entity examples must be kept first in the iterator to
            # ensure that they are used when augmenting data
            values = examples + list(shuffled)
        else:
            values = shuffled
        entities_its[entity_name] = cycle(values)
    return entities_its
def generate_utterance(contexts_iterator, entities_iterators):
    """Produce one synthetic utterance from a context template.

    Takes the next context and substitutes each custom-entity chunk's text
    with the next value from that entity's iterator; builtin-entity and
    plain-text chunks are kept as-is.
    """
    context = deepcopy(next(contexts_iterator))
    rebuilt_data = []
    for chunk in context[DATA]:
        if ENTITY in chunk and not is_builtin_entity(chunk[ENTITY]):
            new_chunk = dict(chunk)
            new_chunk[TEXT] = deepcopy(
                next(entities_iterators[new_chunk[ENTITY]]))
            rebuilt_data.append(new_chunk)
        else:
            rebuilt_data.append(chunk)
    context[DATA] = rebuilt_data
    return context
def parse_entity(self, msg, intent, slot):
    """Resolve a slot value from *msg*, delegating to the parent otherwise.

    Builtin-entity slots are parsed with the dedicated entity parser and
    returned as a single-element list; everything else falls through to
    the superclass implementation.
    """
    mappings = self._engine._dataset_metadata['slot_name_mappings']
    entity_label = mappings.get(intent, {}).get(slot)
    # TODO try to find a way to retrieve multiple slot values, that's a hard one
    # May be we can try matching on _dataset_metadata['entities']
    if entity_label and is_builtin_entity(entity_label):
        parsed = self._entity_parser.parse(msg)
        if parsed:
            return [parsed[0]['entity']]
    # TODO if slot is not an auto-extensible, use fuzzy matching to match with restricted values
    return super(SnipsInterpreter, self).parse_entity(msg, intent, slot)
def resolve_slots(input, slots, dataset_entities, language, scope):
    """Resolve raw slot matches into final slot values.

    Builtin slots are paired with freshly-parsed builtin entities; custom
    slots are resolved against the dataset entity utterances, with a
    normalized fallback lookup.

    Args:
        input (str): the full input text the slots were extracted from
        slots (list of dict): raw slots (entity, value, match range)
        dataset_entities (dict): entity data from the validated dataset
        language (str): language code used by the builtin entity parser
        scope (list): builtin entity kinds to look for in *input*

    Returns:
        list of dict: the resolved slots; custom slots belonging to a
        non-extensible entity with no matching utterance are dropped
    """
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input, language, scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            # First, look for a builtin entity of the same kind found at
            # the exact same character range in the full input
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                # Fallback: re-parse only the raw slot text, restricted to
                # this entity kind
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            # Exact lookup first, then normalized lookup, then the raw
            # value itself when the entity is automatically extensible
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None
            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
def parse(self, text, intents=None):
    """Performs intent parsing on the provided *text* by calling its intent
    parsers successively

    Args:
        text (str): Input
        intents (str or list of str): If provided, reduces the scope of
            intent parsing to the provided list of intents

    Returns:
        dict: The most likely intent along with the extracted slots. See
        :func:`.parsing_result` for the output format.

    Raises:
        NotTrained: When the nlu engine is not fitted
        TypeError: When input type is not unicode
    """
    logging.info("NLU engine parsing: '%s'...", text)
    if not isinstance(text, str):
        raise TypeError("Expected unicode but received: %s" % type(text))
    if not self.fitted:
        raise NotTrained("SnipsNLUEngine must be fitted")
    # Accept a single intent name as a convenience
    if isinstance(intents, str):
        intents = [intents]
    language = self._dataset_metadata["language_code"]
    entities = self._dataset_metadata["entities"]
    # Parsers are tried in order; the first non-empty result wins
    for parser in self.intent_parsers:
        res = parser.parse(text, intents)
        if is_empty(res):
            continue
        slots = res[RES_SLOTS]
        # Restrict builtin-entity resolution to the kinds actually matched
        scope = [
            s[RES_ENTITY] for s in slots
            if is_builtin_entity(s[RES_ENTITY])
        ]
        resolved_slots = resolve_slots(text, slots, entities, language,
                                       scope)
        return parsing_result(text, intent=res[RES_INTENT],
                              slots=resolved_slots)
    return empty_result(text)
def capitalize_utterances(utterances, entities, language, ratio,
                          random_state):
    """Lowercase utterance chunks, re-capitalizing some entity chunks.

    Every chunk is lowercased; chunks of a custom entity flagged with
    CAPITALIZE are then re-capitalized with probability *ratio*. Inputs
    are deep-copied, not mutated.
    """
    capitalized_utterances = []
    for utterance in utterances:
        new_utterance = deepcopy(utterance)
        for i, chunk in enumerate(new_utterance[DATA]):
            new_utterance[DATA][i][TEXT] = chunk[TEXT].lower()
            if ENTITY not in chunk:
                continue
            label = chunk[ENTITY]
            if is_builtin_entity(label):
                continue
            if not entities[label][CAPITALIZE]:
                continue
            if random_state.rand() > ratio:
                continue
            # chunk text was lowered above; capitalize() re-applies casing
            new_utterance[DATA][i][TEXT] = capitalize(chunk[TEXT], language)
        capitalized_utterances.append(new_utterance)
    return capitalized_utterances
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    """Widen builtin slot ranges to the enclosing builtin entity match.

    For each builtin slot, if a builtin entity of the same kind strictly
    covers and exceeds the slot's range, the slot's range and value are
    replaced by the entity's. Slots are mutated in place and returned.
    """
    for slot in slots:
        if not is_builtin_entity(slot[RES_ENTITY]):
            continue
        slot_start = slot[RES_MATCH_RANGE][START]
        slot_end = slot[RES_MATCH_RANGE][END]
        for candidate in builtin_entities:
            if candidate[ENTITY_KIND] != slot[RES_ENTITY]:
                continue
            cand_start = candidate[RES_MATCH_RANGE][START]
            cand_end = candidate[RES_MATCH_RANGE][END]
            covers_slot = cand_start <= slot_start and cand_end >= slot_end
            is_wider = (cand_end - cand_start) > (slot_end - slot_start)
            if covers_slot and is_wider:
                slot[RES_MATCH_RANGE] = {START: cand_start, END: cand_end}
                slot[RES_VALUE] = text[cand_start:cand_end]
                break
    return slots
def parse(self, text, intents=None):
    """Performs intent parsing on the provided *text* by calling its intent
    parsers successively

    Args:
        text (str): Input
        intents (str or list of str): If provided, reduces the scope of
            intent parsing to the provided list of intents

    Returns:
        dict: The most likely intent along with the extracted slots. See
        :func:`.parsing_result` for the output format.

    Raises:
        NotTrained: When the nlu engine is not fitted
        TypeError: When input type is not unicode
    """
    if not isinstance(text, str):
        raise TypeError("Expected unicode but received: %s" % type(text))
    if not self.fitted:
        raise NotTrained("SnipsNLUEngine must be fitted")
    # Accept a single intent name as a convenience
    if isinstance(intents, str):
        intents = [intents]
    language = self._dataset_metadata["language_code"]
    entities = self._dataset_metadata["entities"]
    # Parsers are tried in order; the first non-empty result wins
    for parser in self.intent_parsers:
        res = parser.parse(text, intents)
        if is_empty(res):
            continue
        slots = res[RES_SLOTS]
        # Restrict builtin-entity resolution to the kinds actually matched
        scope = [s[RES_ENTITY] for s in slots
                 if is_builtin_entity(s[RES_ENTITY])]
        resolved_slots = resolve_slots(text, slots, entities, language,
                                       scope)
        return parsing_result(text, intent=res[RES_INTENT],
                              slots=resolved_slots)
    return empty_result(text)
def validate_and_format_intent(intent, entities):
    """Validate the structure of an intent dict against the dataset entities.

    Checks that the intent has a list of utterances, that each utterance
    has a list of chunks with a text, and that every chunk tagged with an
    entity or slot name carries both keys and (for custom entities) refers
    to a declared entity.

    Returns:
        dict: the validated intent, unchanged
    """
    validate_type(intent, dict)
    validate_key(intent, UTTERANCES, object_label="intent dict")
    validate_type(intent[UTTERANCES], list)
    for utterance in intent[UTTERANCES]:
        validate_type(utterance, dict)
        validate_key(utterance, DATA, object_label="utterance")
        validate_type(utterance[DATA], list)
        for chunk in utterance[DATA]:
            validate_type(chunk, dict)
            validate_key(chunk, TEXT, object_label="chunk")
            if ENTITY in chunk or SLOT_NAME in chunk:
                mandatory_keys = [ENTITY, SLOT_NAME]
                validate_keys(chunk, mandatory_keys, object_label="chunk")
                # Only custom entities must be declared in the dataset
                # (previous `continue`/`else` structure was redundant)
                if not is_builtin_entity(chunk[ENTITY]):
                    validate_key(entities, chunk[ENTITY],
                                 object_label=ENTITIES)
    return intent
def _get_joined_entity_utterances(dataset, language):
    """Build, per entity, a regex alternation matching its utterances.

    Builtin entities get a single escaped placeholder pattern; custom
    entity utterances are lowercased, tokenized and joined with the
    whitespace pattern. Patterns are sorted longest-first.
    """
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        lowered_utterances = set(u.lower() for u in entity[UTTERANCES])
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            patterns = [regex_escape(placeholder)]
        else:
            patterns = []
            for utterance in lowered_utterances:
                tokens = tokenize_light(utterance, language)
                patterns.append(WHITESPACE_PATTERN.join(
                    regex_escape(t) for t in tokens))
        non_empty = [p for p in patterns if p]
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(non_empty, key=len, reverse=True))
    return joined_entity_utterances
def augment_utterances(dataset, intent_name, language, min_utterances,
                       capitalization_ratio, random_state):
    """Generate augmented training utterances for *intent_name*.

    Cycles through context templates and custom-entity values to produce
    enough utterances to reach *min_utterances*, then applies random
    capitalization with the given ratio.
    """
    contexts_it = get_contexts_iterator(dataset, intent_name, random_state)
    intent_entities = get_intent_entities(dataset, intent_name)
    # Only custom entity values are cycled; builtin entities are excluded
    intent_entities = {
        e: dataset[ENTITIES][e] for e in intent_entities
        if not is_builtin_entity(e)
    }
    # NOTE(review): get_entities_iterators is called here with 2 arguments,
    # while other variants of it visible in this file take (entities,
    # language, ...) — confirm this matches the signature in scope
    entities_its = get_entities_iterators(intent_entities, random_state)
    generated_utterances = []
    nb_to_generate = num_queries_to_generate(dataset, intent_name,
                                             min_utterances)
    while nb_to_generate > 0:
        generated_utterance = generate_utterance(contexts_it, entities_its)
        generated_utterances.append(generated_utterance)
        nb_to_generate -= 1
    generated_utterances = capitalize_utterances(
        generated_utterances, dataset[ENTITIES], language,
        ratio=capitalization_ratio, random_state=random_state)
    return generated_utterances
def _reconciliate_builtin_slots(text, slots, builtin_entities):
    """Expand builtin slots to the widest covering builtin entity match.

    A builtin slot whose range is strictly contained in a wider builtin
    entity match of the same kind adopts that entity's range and text.
    Slots are mutated in place and returned.
    """
    for slot in slots:
        slot_entity = slot[RES_ENTITY]
        if not is_builtin_entity(slot_entity):
            continue
        same_kind = (e for e in builtin_entities
                     if e[ENTITY_KIND] == slot_entity)
        for entity in same_kind:
            start = entity[RES_MATCH_RANGE][START]
            end = entity[RES_MATCH_RANGE][END]
            s_start = slot[RES_MATCH_RANGE][START]
            s_end = slot[RES_MATCH_RANGE][END]
            if (start <= s_start and end >= s_end
                    and (end - start) > (s_end - s_start)):
                slot[RES_MATCH_RANGE] = {START: start, END: end}
                slot[RES_VALUE] = text[start:end]
                break
    return slots
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    """Turn a labelled utterance into the whitespace-joined feature string
    used as classifier input.

    The string is made of the normalized/stemmed tokens of the utterance
    (excluding builtin-slot values), followed by sorted builtin-entity,
    dataset-entity and word-cluster feature names.
    """
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)
    builtin_entities = get_builtin_entities(utterance_text, language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]
    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language)
        for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]
    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    # Feature groups are sorted so the resulting string is deterministic
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))
    return features
def augment_utterances(dataset, intent_name, language, min_utterances,
                       capitalization_ratio, random_state):
    """Generate augmented training utterances for *intent_name*.

    Context templates and custom-entity value iterators are combined to
    produce enough utterances to reach *min_utterances*; random
    capitalization is then applied with the given ratio.
    """
    contexts_it = get_contexts_iterator(dataset, intent_name, random_state)
    intent_entities = get_intent_entities(dataset, intent_name)
    # Builtin entities are excluded from value cycling
    intent_entities = {name: dataset[ENTITIES][name]
                       for name in intent_entities
                       if not is_builtin_entity(name)}
    entities_its = get_entities_iterators(intent_entities, random_state)
    nb_to_generate = num_queries_to_generate(dataset, intent_name,
                                             min_utterances)
    generated_utterances = []
    while nb_to_generate > 0:
        generated_utterances.append(
            generate_utterance(contexts_it, entities_its))
        nb_to_generate -= 1
    return capitalize_utterances(
        generated_utterances, dataset[ENTITIES], language,
        ratio=capitalization_ratio, random_state=random_state)
def __init__(self, name):
    """Store *name* if it is a known builtin entity.

    Raises:
        LookupError: when *name* is not a builtin entity name
    """
    if is_builtin_entity(name):
        self.name = name
    else:
        raise LookupError("Invalid builtin entity {}".format(name))
def mk_entity(cls, slot, automatically_extensible=True, use_synonyms=True):
    """Build the entity object corresponding to *slot*.

    Returns a BuiltinEntity for builtin entity names; otherwise a
    CustomEntity seeded with the slot text as its single utterance.
    """
    if is_builtin_entity(slot.entity):
        return BuiltinEntity(slot.entity)
    seed_utterances = [EntityUtterance(slot.text)]
    return CustomEntity(seed_utterances, automatically_extensible,
                        use_synonyms)