def _get_joined_entity_utterances(dataset, language):
    """Build, for each entity of the dataset, a regex alternation of its
    utterances.

    Returns a dict mapping each entity name to a ``"|"``-joined pattern
    string, with longer alternatives first so the regex engine prefers
    maximal matches.
    """
    joined_entity_utterances = dict()
    for entity_name, entity in iteritems(dataset[ENTITIES]):
        # matches are performed in a case insensitive manner
        utterances = set(u.lower() for u in entity[UTTERANCES])
        if is_builtin_entity(entity_name):
            # We add a placeholder value for builtin entities
            placeholder = _get_entity_name_placeholder(entity_name, language)
            candidate_patterns = [regex_escape(placeholder)]
        else:
            # Escape token by token, then glue tokens with the whitespace
            # pattern so utterances match regardless of exact spacing
            candidate_patterns = [
                WHITESPACE_PATTERN.join(
                    regex_escape(token)
                    for token in tokenize_light(utterance, language))
                for utterance in utterances
            ]
        non_empty_patterns = (p for p in candidate_patterns if p)
        joined_entity_utterances[entity_name] = r"|".join(
            sorted(non_empty_patterns, key=len, reverse=True))
    return joined_entity_utterances
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    """Turn a dataset query into an anchored regex pattern.

    Slot chunks become named capturing groups over the entity's utterance
    alternation; plain text chunks are tokenized and escaped. Chunks are
    glued together with the language's ignored-characters pattern.

    Returns the ``(pattern, group_names_to_slot_names)`` pair, the mapping
    being mutated in place with any newly generated group names.
    """
    parts = []
    for chunk in query[DATA]:
        if SLOT_NAME not in chunk:
            # Plain text chunk: escape each token individually
            parts.extend(
                regex_escape(token)
                for token in tokenize_light(chunk[TEXT], language))
        else:
            # Slot chunk: capture it under a freshly generated group name
            group_name = _generate_new_index(group_names_to_slot_names)
            group_names_to_slot_names[group_name] = chunk[SLOT_NAME]
            entity_utterances = joined_entity_utterances[chunk[ENTITY]]
            parts.append(r"(?P<%s>%s)" % (group_name, entity_utterances))
    ignored = get_ignored_characters_pattern(language)
    pattern = r"^%s%s%s$" % (ignored, ignored.join(parts), ignored)
    return pattern, group_names_to_slot_names
def _query_to_pattern(query, joined_entity_utterances,
                      group_names_to_slot_names, language):
    """Turn a dataset query into an anchored regex pattern.

    NOTE(review): this redefines ``_query_to_pattern`` and shadows the
    earlier definition in this file; the only difference is that chunks
    are joined with ``WHITESPACE_PATTERN`` instead of the language's
    ignored-characters pattern — looks like two revisions were pasted
    together; confirm which one should be kept.

    Returns the ``(pattern, group_names_to_slot_names)`` pair, the mapping
    being mutated in place with any newly generated group names.
    """
    parts = []
    for chunk in query[DATA]:
        if SLOT_NAME in chunk:
            # Slot chunk: capture it under a freshly generated group name
            group_name = _generate_new_index(group_names_to_slot_names)
            group_names_to_slot_names[group_name] = chunk[SLOT_NAME]
            parts.append(
                r"(?P<%s>%s)"
                % (group_name, joined_entity_utterances[chunk[ENTITY]]))
        else:
            # Plain text chunk: escape each token individually
            for token in tokenize_light(chunk[TEXT], language):
                parts.append(regex_escape(token))
    joined = WHITESPACE_PATTERN.join(parts)
    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN, joined, WHITESPACE_PATTERN)
    return pattern, group_names_to_slot_names
from __future__ import unicode_literals import re import string from num2words import num2words from snips_nlu.utils import regex_escape SPACE = " " WHITE_SPACES = "%s\t\n\r\f\v" % SPACE # equivalent of r"\s" COMMONLY_IGNORED_CHARACTERS = "%s%s" % (WHITE_SPACES, string.punctuation) COMMONLY_IGNORED_CHARACTERS_PATTERN = r"[%s]*" % regex_escape( COMMONLY_IGNORED_CHARACTERS) _PUNCTUATION_REGEXES = dict() _NUM2WORDS_SUPPORT = dict() # pylint:disable=unused-argument def get_default_sep(language): return " " # pylint:enable=unused-argument # pylint:disable=unused-argument def get_punctuation(language): return string.punctuation