def _load_words_inflections(stemming_path):
    try:
        inflection_path = next(
            stemming_path.glob("top_*_words_inflected.txt"))
    except StopIteration:
        return None
    inflections = dict()
    with inflection_path.open(encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
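# A minimal, self-contained sketch of the inflection-file format consumed by
# the loaders above and below: each line is "left_form;right_form", and the
# loader builds a dict from the normalized left column to the normalized
# right column. The directory layout and file contents here are hypothetical.
from pathlib import Path
import tempfile

with tempfile.TemporaryDirectory() as tmp:
    stemming_path = Path(tmp)
    (stemming_path / "top_10000_words_inflected.txt").write_text(
        "apples;apple\nrunning;run\n", encoding="utf8")
    # With an identity `normalize`, the loader above would return:
    # {'apples': 'apple', 'running': 'run'}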
def _load_words_inflections(language):
    inflection_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_words_inflected.txt"))
    if not inflection_paths:
        return dict()
    inflections = dict()
    with io.open(inflection_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
def _word_inflections(language):
    inflection_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_words_inflected.txt"))
    if not inflection_paths:
        return dict()
    inflections = dict()
    with io.open(inflection_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
def _verbs_lexemes(language):
    stems_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_verbs_lexemes.txt"))
    if not stems_paths:
        return dict()
    verb_lexemes = dict()
    with io.open(stems_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb for lexeme in lexemes})
    return verb_lexemes
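# Self-contained sketch of the "top_*_verbs_lexemes.txt" parsing done by
# _verbs_lexemes above: each line is "verb;lexeme1,lexeme2,...", and the
# mapping is inverted so every lexeme points back to its verb. The no-op
# `normalize` stand-in and the sample lines are hypothetical.
def _demo_parse_lexemes(lines, normalize=lambda s: s):
    verb_lexemes = dict()
    for line in lines:
        verb, lexemes = line.strip().split(';')
        verb_lexemes.update(
            {normalize(lexeme): normalize(verb)
             for lexeme in lexemes.split(',')})
    return verb_lexemes

# _demo_parse_lexemes(["be;am,is,are", "go;went,gone"])
# -> {'am': 'be', 'is': 'be', 'are': 'be', 'went': 'go', 'gone': 'go'}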
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except UnknownResource:
        pass
    return normalized_stemmed
def _load_stop_words(stop_words_path):
    if not stop_words_path.exists():
        return None
    with stop_words_path.open(encoding='utf8') as f:
        lines = (normalize(l) for l in f)
        stop_words = set(l for l in lines if l)
    return stop_words
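# Sketch of the stop-words file expected by _load_stop_words above: one word
# per line; each line is normalized and empty results are dropped. The sample
# contents are hypothetical.
#
#   the
#   a
#   an
#
# -> {"the", "a", "an"}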
def stem_token(token, resources):
    if token.stemmed_value:
        return token.stemmed_value
    if not token.normalized_value:
        token.normalized_value = normalize(token.value)
    token.stemmed_value = _stem(token.normalized_value, resources)
    return token.stemmed_value
def _utterance_to_pattern(self, utterance, stop_words,
                          entity_placeholders):
    from snips_nlu_utils import normalize

    slot_names_count = defaultdict(int)
    pattern = []
    for chunk in utterance[DATA]:
        if SLOT_NAME in chunk:
            slot_name = chunk[SLOT_NAME]
            slot_names_count[slot_name] += 1
            group_name = self.slot_names_to_group_names[slot_name]
            count = slot_names_count[slot_name]
            if count > 1:
                group_name = "%s_%s" % (group_name, count)
            placeholder = entity_placeholders[chunk[ENTITY]]
            pattern.append(r"(?P<%s>%s)" % (group_name, placeholder))
        else:
            tokens = tokenize_light(chunk[TEXT], self.language)
            pattern += [
                regex_escape(t.lower()) for t in tokens
                if normalize(t) not in stop_words
            ]

    pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                             WHITESPACE_PATTERN.join(pattern),
                             WHITESPACE_PATTERN)
    return pattern
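# Illustration of the regex shape produced by _utterance_to_pattern above,
# with hypothetical group names and placeholders and with \s* standing in for
# WHITESPACE_PATTERN: stop words are dropped, plain text chunks are
# regex-escaped, and each slot chunk becomes a named group wrapping its
# entity placeholder, e.g. for "turn on the <room> light":
#
#   ^\s*turn\s*on\s*(?P<group_0>%ROOM%)\s*light\s*$
#
# A slot name that occurs more than once in the same utterance gets a numeric
# suffix on its group name ("group_0_2", ...) so group names stay unique.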
def stem(string, language, resources):
    from snips_nlu_utils import normalize

    normalized_string = normalize(string)
    tokens = tokenize_light(normalized_string, language)
    stemmed_tokens = [_stem(token, resources) for token in tokens]
    return " ".join(stemmed_tokens)
def normalize_token(token):
    from snips_nlu_utils import normalize

    if token.normalized_value:
        return token.normalized_value
    token.normalized_value = normalize(token.value)
    return token.normalized_value
def _load_stop_words(language):
    if STOP_WORDS in RESOURCE_INDEX[language]:
        stop_words_file_path = os.path.join(
            get_resources_path(language),
            RESOURCE_INDEX[language][STOP_WORDS])
        with io.open(stop_words_file_path, encoding='utf8') as f:
            lines = (normalize(l) for l in f)
            _RESOURCES[language][STOP_WORDS] = set(l for l in lines if l)
def _load_verbs_lexemes(stemming_path):
    try:
        lexems_path = next(stemming_path.glob("top_*_verbs_lexemes.txt"))
    except StopIteration:
        return None
    verb_lexemes = dict()
    with lexems_path.open(encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb for lexeme in lexemes})
    return verb_lexemes
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except MissingResource:
        pass
    return normalized_stemmed
def _load_verbs_lexemes(language):
    stems_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_verbs_lexemes.txt"))
    if not stems_paths:
        return dict()
    verb_lexemes = dict()
    with io.open(stems_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb for lexeme in lexemes})
    return verb_lexemes
def _preprocess_text(self, txt, intent):
    """Replaces stop words and characters that are tokenized out by
    whitespaces"""
    stop_words = self._get_intent_stop_words(intent)
    tokens = tokenize_light(txt, self.language)
    cleaned_string = " ".join(
        [tkn for tkn in tokens if normalize(tkn) not in stop_words])
    return cleaned_string.lower()
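# Example of the cleaning done by _preprocess_text above, assuming a
# hypothetical intent whose stop words include "the": tokens whose normalized
# form is a stop word are removed, the rest are re-joined with single spaces,
# and the result is lower-cased.
#
#   "Turn on the kitchen Light"  ->  "turn on kitchen light"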
def stem_token(token, resources):
    from snips_nlu_utils import normalize

    if token.stemmed_value:
        return token.stemmed_value
    if not token.normalized_value:
        token.normalized_value = normalize(token.value)
    token.stemmed_value = _stem(token.normalized_value, resources)
    return token.stemmed_value
def extract_entity_values(dataset, apply_normalization):
    entities_per_intent = {intent: set() for intent in dataset[INTENTS]}
    intent_entities = extract_intent_entities(dataset)
    for intent, entities in iteritems(intent_entities):
        for entity in entities:
            entity_values = set(dataset[ENTITIES][entity][UTTERANCES])
            if apply_normalization:
                entity_values = {normalize(v) for v in entity_values}
            entities_per_intent[intent].update(entity_values)
    return entities_per_intent
def _stem(t):
    t = normalize(t)
    if t == "beautiful":
        s = "beauty"
    elif t == "birdy":
        s = "bird"
    elif t == "entity":
        s = "ent"
    else:
        s = t
    return s
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for l in f:
                normalized = normalize(l.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(
                        language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    return gazetteers
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for l in f:
                normalized = normalize(l.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(
                        language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    _RESOURCES[language][GAZETTEERS] = gazetteers
def _load_gazetteers(gazetteers_path, language):
    if not gazetteers_path.is_dir():
        return dict()
    gazetteers = dict()
    for filepath in gazetteers_path.iterdir():
        gazetteer_name = filepath.stem
        with filepath.open(encoding="utf8") as f:
            gazetteers[gazetteer_name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[gazetteer_name].add(normalized)
    return gazetteers
def resolve_slots(input, slots, dataset_entities, language, scope):
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input, language, scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(
                        slot, builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None
            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
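# Summary of the resolution order implemented by resolve_slots above (the
# names are the dataset keys referenced in the code):
#
# Builtin slots: first matched against builtin entities extracted from the
# full input (same entity kind and same character range); if no such match is
# found, the builtin parser is re-run on the slot's raw value alone.
#
# Custom slots:
#   1. exact raw value found in entity[UTTERANCES]       -> mapped reference value
#   2. normalized raw value found in entity[UTTERANCES]  -> mapped reference value
#   3. entity[AUTOMATICALLY_EXTENSIBLE] is true          -> keep the raw value
#   4. otherwise                                         -> the slot is dropped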
def normalized_value(self):
    if self._normalized_value is not None:
        return self._normalized_value
    self._normalized_value = normalize(self.value)
    return self._normalized_value
def normalization_variations(string):
    return {normalize(string)}
def _normalize_stem(text, language, resources, use_stemming):
    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
def preprocess(string):
    normalized = normalize(string)
    if resource_exists(self.language, STEMS) and self.use_stemming:
        return stem(normalized, self.language)
    return normalized
def test_should_normalize(self):
    self.assertEqual("hello", normalize("Hëllo"))
def normalization_variations(string):
    from snips_nlu_utils import normalize
    return {normalize(string)}
def preprocess(string):
    normalized = normalize(string)
    return stem(normalized, self.language) if self.use_stemming \
        else normalized
def stem(string, language, resources):
    normalized_string = normalize(string)
    tokens = tokenize_light(normalized_string, language)
    stemmed_tokens = [_stem(token, resources) for token in tokens]
    return " ".join(stemmed_tokens)
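# The pipeline in stem() above is normalize -> tokenize_light -> per-token
# _stem -> join with spaces. A rough, self-contained equivalent with
# hypothetical stand-ins for the resource-backed pieces:
def _demo_stem(string, tokenize=str.split, stem_one=lambda t: t.rstrip("s")):
    # str.lower() stands in for normalize, str.split for tokenize_light, and
    # a naive plural-stripper for the resource-based _stem.
    tokens = tokenize(string.lower())
    return " ".join(stem_one(t) for t in tokens)

# _demo_stem("Three small cats")  ->  "three small cat"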
def _normalize_stem(text, language, resources, use_stemming):
    from snips_nlu_utils import normalize

    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
def preprocess(string):
    normalized = normalize(string)
    if self.use_stemming:
        return stem(normalized, self.language)
    return normalized
def normalize_token(token):
    if token.normalized_value:
        return token.normalized_value
    token.normalized_value = normalize(token.value)
    return token.normalized_value
def stem_token(token, language):
    if token.stemmed_value:
        return token.stemmed_value
    token.stemmed_value = stem(normalize(token.value), language)
    return token.stemmed_value
def _normalize_stem(text, language, use_stemming):
    if use_stemming:
        return stem(text, language)
    return normalize(text)