Example #1
def _load_words_inflections(stemming_path):
    try:
        inflection_path = next(stemming_path.glob("top_*_words_inflected.txt"))
    except StopIteration:
        return None

    inflections = dict()
    with inflection_path.open(encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
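The loader above expects one "word;inflection" pair per line. A minimal parsing sketch under that assumption; the sample lines, the mapping direction, and the str.lower() stand-in for normalize are illustrative only:
def normalize(text):  # stand-in for snips_nlu_utils.normalize
    return text.lower()

sample_lines = ["Children;Child", "Mice;Mouse"]  # hypothetical file content
inflections = {}
for line in sample_lines:
    elements = line.strip().split(';')
    inflections[normalize(elements[0])] = normalize(elements[1])
print(inflections)  # {'children': 'child', 'mice': 'mouse'}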
Example #2
def _load_words_inflections(language):
    inflection_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_words_inflected.txt"))
    if not inflection_paths:
        return dict()

    inflections = dict()
    with io.open(inflection_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
Example #3
def _word_inflections(language):
    inflection_paths = glob.glob(os.path.join(RESOURCES_PATH, language,
                                              "top_*_words_inflected.txt"))
    if not inflection_paths:
        return dict()

    inflections = dict()
    with io.open(inflection_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            inflections[normalize(elements[0])] = normalize(elements[1])
    return inflections
Example #4
def _verbs_lexemes(language):
    stems_paths = glob.glob(os.path.join(RESOURCES_PATH, language,
                                         "top_*_verbs_lexemes.txt"))
    if not stems_paths:
        return dict()

    verb_lexemes = dict()
    with io.open(stems_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb for lexeme in lexemes})
    return verb_lexemes
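This loader reads "verb;lexeme1,lexeme2,..." lines and inverts them into a lexeme-to-verb lookup. A minimal sketch with hypothetical sample lines and str.lower() standing in for normalize:
def normalize(text):  # stand-in for snips_nlu_utils.normalize
    return text.lower()

sample_lines = ["be;am,is,are,was,were", "go;goes,went,gone"]  # hypothetical
verb_lexemes = {}
for line in sample_lines:
    elements = line.strip().split(';')
    verb = normalize(elements[0])
    lexemes = elements[1].split(',')
    verb_lexemes.update({normalize(lexeme): verb for lexeme in lexemes})
print(verb_lexemes["went"])  # 'go'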
Example #5
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except UnknownResource:
        pass
    return normalized_stemmed
Example #6
def _load_stop_words(stop_words_path):
    if not stop_words_path.exists():
        return None
    with stop_words_path.open(encoding='utf8') as f:
        lines = (normalize(l) for l in f)
        stop_words = set(l for l in lines if l)
    return stop_words
Example #7
def stem_token(token, resources):
    if token.stemmed_value:
        return token.stemmed_value
    if not token.normalized_value:
        token.normalized_value = normalize(token.value)
    token.stemmed_value = _stem(token.normalized_value, resources)
    return token.stemmed_value
Example #8
    def _utterance_to_pattern(self, utterance, stop_words,
                              entity_placeholders):
        from snips_nlu_utils import normalize

        slot_names_count = defaultdict(int)
        pattern = []
        for chunk in utterance[DATA]:
            if SLOT_NAME in chunk:
                slot_name = chunk[SLOT_NAME]
                slot_names_count[slot_name] += 1
                group_name = self.slot_names_to_group_names[slot_name]
                count = slot_names_count[slot_name]
                if count > 1:
                    group_name = "%s_%s" % (group_name, count)
                placeholder = entity_placeholders[chunk[ENTITY]]
                pattern.append(r"(?P<%s>%s)" % (group_name, placeholder))
            else:
                tokens = tokenize_light(chunk[TEXT], self.language)
                pattern += [
                    regex_escape(t.lower()) for t in tokens
                    if normalize(t) not in stop_words
                ]

        pattern = r"^%s%s%s$" % (WHITESPACE_PATTERN,
                                 WHITESPACE_PATTERN.join(pattern),
                                 WHITESPACE_PATTERN)
        return pattern
Example #9
def stem(string, language, resources):
    from snips_nlu_utils import normalize

    normalized_string = normalize(string)
    tokens = tokenize_light(normalized_string, language)
    stemmed_tokens = [_stem(token, resources) for token in tokens]
    return " ".join(stemmed_tokens)
Example #10
def normalize_token(token):
    from snips_nlu_utils import normalize

    if token.normalized_value:
        return token.normalized_value
    token.normalized_value = normalize(token.value)
    return token.normalized_value
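Examples #7 and #10 cache the computed values on the token object so normalization and stemming run at most once per token. A minimal sketch of a token type supporting that pattern; the Token dataclass and the str.lower() stand-in for normalize are assumptions, not snips-nlu's actual classes:
from dataclasses import dataclass

@dataclass
class Token:  # hypothetical minimal token type
    value: str
    normalized_value: str = ""
    stemmed_value: str = ""

def normalize(text):  # stand-in for snips_nlu_utils.normalize
    return text.lower()

token = Token("Hello World")
# First access computes and caches; later accesses reuse the cached value.
if not token.normalized_value:
    token.normalized_value = normalize(token.value)
print(token.normalized_value)  # 'hello world'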
Example #11
def _load_stop_words(language):
    if STOP_WORDS in RESOURCE_INDEX[language]:
        stop_words_file_path = os.path.join(
            get_resources_path(language), RESOURCE_INDEX[language][STOP_WORDS])
        with io.open(stop_words_file_path, encoding='utf8') as f:
            lines = (normalize(l) for l in f)
            _RESOURCES[language][STOP_WORDS] = set(l for l in lines if l)
Example #12
def _load_verbs_lexemes(stemming_path):
    try:
        lexems_path = next(stemming_path.glob("top_*_verbs_lexemes.txt"))
    except StopIteration:
        return None

    verb_lexemes = dict()
    with lexems_path.open(encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb
                 for lexeme in lexemes})
    return verb_lexemes
Example #13
def _normalize_stem(text, language):
    normalized_stemmed = normalize(text)
    try:
        normalized_stemmed = stem(normalized_stemmed, language)
    except MissingResource:
        pass
    return normalized_stemmed
Example #14
def _load_verbs_lexemes(language):
    stems_paths = glob.glob(
        os.path.join(RESOURCES_PATH, language, "top_*_verbs_lexemes.txt"))
    if not stems_paths:
        return dict()

    verb_lexemes = dict()
    with io.open(stems_paths[0], encoding="utf8") as f:
        for line in f:
            elements = line.strip().split(';')
            verb = normalize(elements[0])
            lexemes = elements[1].split(',')
            verb_lexemes.update(
                {normalize(lexeme): verb
                 for lexeme in lexemes})
    return verb_lexemes
Example #15
 def _preprocess_text(self, txt, intent):
     """Replaces stop words and characters that are tokenized out by
         whitespaces"""
     stop_words = self._get_intent_stop_words(intent)
     tokens = tokenize_light(txt, self.language)
     cleaned_string = " ".join(
         [tkn for tkn in tokens if normalize(tkn) not in stop_words])
     return cleaned_string.lower()
Example #16
def _load_stop_words(language):
    if STOP_WORDS in RESOURCE_INDEX[language]:
        stop_words_file_path = os.path.join(
            get_resources_path(language),
            RESOURCE_INDEX[language][STOP_WORDS])
        with io.open(stop_words_file_path, encoding='utf8') as f:
            lines = (normalize(l) for l in f)
            _RESOURCES[language][STOP_WORDS] = set(l for l in lines if l)
Example #17
def stem_token(token, resources):
    from snips_nlu_utils import normalize

    if token.stemmed_value:
        return token.stemmed_value
    if not token.normalized_value:
        token.normalized_value = normalize(token.value)
    token.stemmed_value = _stem(token.normalized_value, resources)
    return token.stemmed_value
Example #18
def extract_entity_values(dataset, apply_normalization):
    entities_per_intent = {intent: set() for intent in dataset[INTENTS]}
    intent_entities = extract_intent_entities(dataset)
    for intent, entities in iteritems(intent_entities):
        for entity in entities:
            entity_values = set(dataset[ENTITIES][entity][UTTERANCES])
            if apply_normalization:
                entity_values = {normalize(v) for v in entity_values}
            entities_per_intent[intent].update(entity_values)
    return entities_per_intent
Example #19
 def _stem(t):
     t = normalize(t)
     if t == "beautiful":
         s = "beauty"
     elif t == "birdy":
         s = "bird"
     elif t == "entity":
         s = "ent"
     else:
         s = t
     return s
Example #20
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(get_resources_path(language),
                                                name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for l in f:
                normalized = normalize(l.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    return gazetteers
Example #21
def _load_gazetteers(language):
    gazetteers_paths = {
        os.path.splitext(name)[0]: os.path.join(
            get_resources_path(language), name)
        for name in RESOURCE_INDEX[language].get(GAZETTEERS, [])
    }
    gazetteers = dict()
    for name, path in iteritems(gazetteers_paths):
        with io.open(path, encoding="utf8") as f:
            gazetteers[name] = set()
            for l in f:
                normalized = normalize(l.strip())
                if normalized:
                    normalized = get_ignored_characters_pattern(language).join(
                        [t.value for t in tokenize(normalized, language)])
                    gazetteers[name].add(normalized)
    _RESOURCES[language][GAZETTEERS] = gazetteers
Example #22
def _load_gazetteers(gazetteers_path, language):
    if not gazetteers_path.is_dir():
        return dict()

    gazetteers = dict()
    for filepath in gazetteers_path.iterdir():
        gazetteer_name = filepath.stem
        with filepath.open(encoding="utf8") as f:
            gazetteers[gazetteer_name] = set()
            for line in f:
                normalized = normalize(line.strip())
                if normalized:
                    token_values = (t.value
                                    for t in tokenize(normalized, language))
                    normalized = get_default_sep(language).join(token_values)
                    gazetteers[gazetteer_name].add(normalized)
    return gazetteers
Example #23
def resolve_slots(input, slots, dataset_entities, language, scope):
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input,
                                            language,
                                            scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value,
                                                       language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None

            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
Example #24
 def normalized_value(self):
     if self._normalized_value is not None:
         return self._normalized_value
     self._normalized_value = normalize(self.value)
     return self._normalized_value
Example #25
def normalization_variations(string):
    return {normalize(string)}
Example #26
def _normalize_stem(text, language, resources, use_stemming):
    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
Example #27
 def preprocess(string):
     normalized = normalize(string)
     if resource_exists(self.language, STEMS) and self.use_stemming:
         return stem(normalized, self.language)
     return normalized
Example #28
 def test_should_normalize(self):
     self.assertEqual("hello", normalize("Hëllo"))
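The test above pins down the expected behaviour (lowercasing and diacritic stripping for this input). A minimal standalone usage sketch, assuming the snips_nlu_utils package is installed:
from snips_nlu_utils import normalize

assert normalize("Hëllo") == "hello"  # matches the assertion in the test above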
Example #29
def normalization_variations(string):
    from snips_nlu_utils import normalize

    return {normalize(string)}
Example #30
 def preprocess(string):
     normalized = normalize(string)
     return stem(normalized, self.language) if self.use_stemming \
         else normalized
Example #31
def stem(string, language, resources):
    normalized_string = normalize(string)
    tokens = tokenize_light(normalized_string, language)
    stemmed_tokens = [_stem(token, resources) for token in tokens]
    return " ".join(stemmed_tokens)
Example #32
def normalization_variations(string):
    return {normalize(string)}
Example #33
def _normalize_stem(text, language, resources, use_stemming):
    from snips_nlu_utils import normalize

    if use_stemming:
        return stem(text, language, resources)
    return normalize(text)
Example #34
 def preprocess(string):
     normalized = normalize(string)
     if self.use_stemming:
         return stem(normalized, self.language)
     return normalized
Example #35
 def normalized_value(self):
     if self._normalized_value is not None:
         return self._normalized_value
     self._normalized_value = normalize(self.value)
     return self._normalized_value
Example #36
def normalize_token(token):
    if token.normalized_value:
        return token.normalized_value
    token.normalized_value = normalize(token.value)
    return token.normalized_value
Example #37
def stem_token(token, language):
    if token.stemmed_value:
        return token.stemmed_value
    token.stemmed_value = stem(normalize(token.value), language)
    return token.stemmed_value
Example #38
def _normalize_stem(text, language, use_stemming):
    if use_stemming:
        return stem(text, language)
    return normalize(text)
Example #39
 def preprocess(string):
     normalized = normalize(string)
     if resource_exists(self.language, STEMS) and self.use_stemming:
         return stem(normalized, self.language)
     return normalized