Example #1
def resolve_slots(input, slots, dataset_entities, language, scope):
    builtin_entities = get_builtin_entities(input, language, scope)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value, language,
                                                       scope=[entity_name])
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None

            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
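
The custom-slot branch above reduces to a three-step lookup: exact synonym match, then the automatically-extensible fallback, then dropping the slot. A minimal self-contained sketch of that logic (the constants and data below are hypothetical stand-ins for the snips-nlu ones):

UTTERANCES = "utterances"
AUTOMATICALLY_EXTENSIBLE = "automatically_extensible"

entity = {
    UTTERANCES: {"ny": "New York", "new york": "New York"},
    AUTOMATICALLY_EXTENSIBLE: False,
}

def resolve_custom_value(raw_value, entity):
    # Exact synonym match wins; otherwise only extensible entities
    # accept unseen values, and non-extensible ones drop the slot
    if raw_value in entity[UTTERANCES]:
        return entity[UTTERANCES][raw_value]
    if entity[AUTOMATICALLY_EXTENSIBLE]:
        return raw_value
    return None

assert resolve_custom_value("ny", entity) == "New York"
assert resolve_custom_value("boston", entity) is None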
Example #2
    def test_get_builtin_entities_should_support_all_languages(self):
        # Given
        text = ""

        for language in get_all_languages():
            msg = "get_builtin_entities does not support %s." % language
            with self.fail_if_exception(msg):
                # When / Then
                get_builtin_entities(text, language)
Example #4
def numbers_variations(string, language):
    if not supports_num2words(language):
        return set()

    number_entities = get_builtin_entities(string,
                                           language,
                                           scope=[SNIPS_NUMBER])

    number_entities = sorted(number_entities,
                             key=lambda x: x[RES_MATCH_RANGE][START])
    if not number_entities:
        return set()

    digit_values = [digit_value(e) for e in number_entities]
    alpha_values = [alphabetic_value(e, language) for e in number_entities]

    values = [(n[RES_MATCH_RANGE], (d, a))
              for (n, d, a) in zip(number_entities, digit_values, alpha_values)
              if a is not None]

    n_values = len(values)
    # Each number yields 2 variants (digits or words), so the number of
    # combinations is 2 ** n_values; bail out before it explodes
    if 2**n_values > MAX_ENTITY_VARIATIONS:
        return set()

    combinations = itertools.product(range(2), repeat=n_values)
    variations = set()
    for c in combinations:
        ranges_and_utterances = [(values[i][0], values[i][1][ix])
                                 for i, ix in enumerate(c)]
        variations.add(build_variated_query(string, ranges_and_utterances))
    return variations
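
The heart of the function is the itertools.product over a binary choice, digit form or alphabetic form, for each number found in the query. A self-contained sketch of that expansion with hypothetical values, independent of the snips-nlu helpers:

import itertools

# Hypothetical (range, (digit, alphabetic)) pairs for two detected numbers
values = [((0, 1), ("1", "one")), ((11, 13), ("23", "twenty-three"))]

# One variant per binary choice vector: 2 ** len(values) in total
for choice in itertools.product(range(2), repeat=len(values)):
    variant = [pair[ix] for (_rng, pair), ix in zip(values, choice)]
    print(variant)
# ['1', '23'], ['1', 'twenty-three'], ['one', '23'], ['one', 'twenty-three']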
Example #5
def _replace_builtin_entities(text, language):
    builtin_entities = get_builtin_entities(text, language, use_cache=True)
    if not builtin_entities:
        return dict(), text

    range_mapping = dict()
    processed_text = ""
    offset = 0  # cumulative length change introduced by the placeholders
    current_ix = 0
    builtin_entities = sorted(builtin_entities,
                              key=lambda e: e[RES_MATCH_RANGE][START])
    for ent in builtin_entities:
        ent_start = ent[RES_MATCH_RANGE][START]
        ent_end = ent[RES_MATCH_RANGE][END]
        rng_start = ent_start + offset

        processed_text += text[current_ix:ent_start]

        entity_length = ent_end - ent_start
        entity_place_holder = _get_entity_name_placeholder(ent[ENTITY_KIND],
                                                           language)

        offset += len(entity_place_holder) - entity_length

        processed_text += entity_place_holder
        rng_end = ent_end + offset
        new_range = (rng_start, rng_end)
        # Map the placeholder's range in the processed text back to the
        # entity's range in the original text
        range_mapping[new_range] = ent[RES_MATCH_RANGE]
        current_ix = ent_end

    processed_text += text[current_ix:]
    return range_mapping, processed_text
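
The offset bookkeeping above can be exercised in isolation: each substitution shifts every later index by the difference between the placeholder length and the replaced span length, and the returned mapping lets the caller translate ranges in the processed text back to the original. A minimal sketch with a hypothetical placeholder:

def replace_spans(text, spans_and_placeholders):
    # spans_and_placeholders: ((start, end), placeholder) pairs sorted by start
    out, offset, cursor, mapping = "", 0, 0, {}
    for (start, end), placeholder in spans_and_placeholders:
        out += text[cursor:start] + placeholder
        new_start = start + offset
        offset += len(placeholder) - (end - start)
        mapping[(new_start, end + offset)] = (start, end)
        cursor = end
    return mapping, out + text[cursor:]

mapping, processed = replace_spans("meet me at 10pm",
                                   [((11, 15), "%SNIPSDATETIME%")])
assert processed == "meet me at %SNIPSDATETIME%"
assert mapping == {(11, 26): (11, 15)}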
Example #6
def numbers_variations(string, language):
    variations = set()
    if not supports_num2words(language):
        return variations

    number_entities = get_builtin_entities(
        string, language, scope=[SNIPS_NUMBER])

    number_entities = sorted(number_entities,
                             key=lambda x: x[RES_MATCH_RANGE][START])
    if not number_entities:
        return variations

    digit_values = [digit_value(e) for e in number_entities]
    alpha_values = [alphabetic_value(e, language) for e in number_entities]

    values = [(n[RES_MATCH_RANGE], (d, a)) for (n, d, a) in
              zip(number_entities, digit_values, alpha_values)
              if a is not None]

    combinations = itertools.product(range(2), repeat=len(values))
    for c in combinations:
        ranges_and_utterances = [(values[i][0], values[i][1][ix])
                                 for i, ix in enumerate(c)]
        variations.add(build_variated_query(string, ranges_and_utterances))
    return variations
Example #7
def _replace_builtin_entities(text, language):
    builtin_entities = get_builtin_entities(text, language)
    if not builtin_entities:
        return dict(), text

    range_mapping = dict()
    processed_text = ""
    offset = 0
    current_ix = 0
    builtin_entities = sorted(builtin_entities,
                              key=lambda e: e[RES_MATCH_RANGE][START])
    for ent in builtin_entities:
        ent_start = ent[RES_MATCH_RANGE][START]
        ent_end = ent[RES_MATCH_RANGE][END]
        rng_start = ent_start + offset

        processed_text += text[current_ix:ent_start]

        entity_length = ent_end - ent_start
        entity_place_holder = _get_builtin_entity_name(ent[ENTITY_KIND],
                                                       language)

        offset += len(entity_place_holder) - entity_length

        processed_text += entity_place_holder
        rng_end = ent_end + offset
        new_range = (rng_start, rng_end)
        range_mapping[new_range] = ent[RES_MATCH_RANGE]
        current_ix = ent_end

    processed_text += text[current_ix:]
    return range_mapping, processed_text
Example #8
def resolve_slots(input, slots, dataset_entities, language, scope):
    # Do not use cached entities here as datetimes must be computed using
    # current context
    builtin_entities = get_builtin_entities(input,
                                            language,
                                            scope,
                                            use_cache=False)
    resolved_slots = []
    for slot in slots:
        entity_name = slot[RES_ENTITY]
        raw_value = slot[RES_VALUE]
        if is_builtin_entity(entity_name):
            found = False
            for ent in builtin_entities:
                if ent[ENTITY_KIND] == entity_name and \
                        ent[RES_MATCH_RANGE] == slot[RES_MATCH_RANGE]:
                    resolved_slot = builtin_slot(slot, ent[ENTITY])
                    resolved_slots.append(resolved_slot)
                    found = True
                    break
            if not found:
                builtin_matches = get_builtin_entities(raw_value,
                                                       language,
                                                       scope=[entity_name],
                                                       use_cache=False)
                if builtin_matches:
                    resolved_slot = builtin_slot(slot,
                                                 builtin_matches[0][VALUE])
                    resolved_slots.append(resolved_slot)
        else:  # custom slot
            entity = dataset_entities[entity_name]
            normalized_raw_value = normalize(raw_value)
            if raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][raw_value]
            elif normalized_raw_value in entity[UTTERANCES]:
                resolved_value = entity[UTTERANCES][normalized_raw_value]
            elif entity[AUTOMATICALLY_EXTENSIBLE]:
                resolved_value = raw_value
            else:
                # entity is skipped
                resolved_value = None

            if resolved_value is not None:
                resolved_slots.append(custom_slot(slot, resolved_value))
    return resolved_slots
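
Compared with Example #1, this variant disables the entity cache (datetime values depend on the current time, so cached results would be stale) and tries a normalized form of the raw value before giving up. A sketch of that two-step lookup; the normalize below is a hypothetical stand-in for the snips-nlu helper:

def normalize(value):
    # Hypothetical normalization; lowercasing is enough to show the order
    return value.lower()

utterances = {"new york": "New York"}

def lookup(raw_value):
    # Raw value first, then its normalized form
    if raw_value in utterances:
        return utterances[raw_value]
    normalized = normalize(raw_value)
    if normalized in utterances:
        return utterances[normalized]
    return None

assert lookup("New York") == "New York"  # matched via normalization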
Example #9
    def test_get_builtin_entities_should_respect_scope(self):
        # Given
        text = "meet me at 10 p.m."

        # When
        scope = ["snips/number"]
        parse = get_builtin_entities(text, "en", scope=scope)

        # Then
        self.assertEqual(len(parse), 1)
        self.assertEqual(parse[0][ENTITY_KIND], "snips/number")
Example #11
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        scope = set(self.slot_name_mapping[slot]
                    for slot in builtin_slots_names)
        builtin_entities = [
            be for entity_kind in scope for be in get_builtin_entities(
                text, self.language, [entity_kind], use_cache=True)
        ]
        # We remove builtin entities which conflicts with custom slots
        # extracted by the CRF
        builtin_entities = _filter_overlapping_builtins(
            builtin_entities, tokens, tags, self.config.tagging_scheme)

        # We resolve conflicts between builtin entities by keeping the longest
        # matches. In case when two builtin entities span the same range, we
        # keep both.
        builtin_entities = _disambiguate_builtin_entities(builtin_entities)

        # We group builtin entities based on their position
        grouped_entities = (
            list(bes)
            for _, bes in groupby(builtin_entities,
                                  key=lambda s: s[RES_MATCH_RANGE][START]))
        grouped_entities = sorted(
            grouped_entities,
            key=lambda entities: entities[0][RES_MATCH_RANGE][START])

        features = self.compute_features(tokens)
        spans_ranges = [entities[0][RES_MATCH_RANGE]
                        for entities in grouped_entities]
        tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)

        # We loop on all possible slots permutations and use the CRF to find
        # the best one in terms of probability
        slots_permutations = _get_slots_permutations(
            grouped_entities, self.slot_name_mapping)
        best_updated_tags = tags
        best_permutation_score = -1
        for slots in slots_permutations:
            updated_tags = copy(tags)
            for slot_index, slot in enumerate(slots):
                indexes = tokens_indexes[slot_index]
                sub_tags_sequence = positive_tagging(
                    self.config.tagging_scheme, slot, len(indexes))
                updated_tags[indexes[0]:indexes[-1] + 1] = sub_tags_sequence
            score = self._get_sequence_probability(features, updated_tags)
            if score > best_permutation_score:
                best_updated_tags = updated_tags
                best_permutation_score = score
        slots = tags_to_slots(text, tokens, best_updated_tags,
                              self.config.tagging_scheme,
                              self.slot_name_mapping)

        return _reconciliate_builtin_slots(text, slots, builtin_entities)
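
The permutation search above is brute force: each assignment of slot names to entity groups is written into a copy of the tag sequence, rescored, and the best-scoring sequence wins. A self-contained sketch of that loop with a hypothetical scoring function standing in for _get_sequence_probability:

import itertools
from copy import copy

def best_tagging(tags, candidate_slots, indexes_per_group, score_fn):
    # Try every assignment of candidate slot names to entity groups,
    # keeping the tag sequence the scorer likes best
    best_tags, best_score = tags, float("-inf")
    for slots in itertools.product(candidate_slots,
                                   repeat=len(indexes_per_group)):
        updated = copy(tags)
        for slot, indexes in zip(slots, indexes_per_group):
            for i in indexes:
                updated[i] = slot  # stand-in for positive_tagging
        score = score_fn(updated)
        if score > best_score:
            best_tags, best_score = updated, score
    return best_tags

tags = ["O", "O", "O", "O"]
result = best_tagging(tags, ["B-time", "O"], [[3]],
                      lambda t: t.count("B-time"))
assert result == ["O", "O", "O", "B-time"]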
Example #12
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        scope = set(self.slot_name_mapping[slot]
                    for slot in builtin_slots_names)
        builtin_entities = [be for entity_kind in scope
                            for be in get_builtin_entities(text, self.language,
                                                           [entity_kind])]
        # We remove builtin entities which conflicts with custom slots
        # extracted by the CRF
        builtin_entities = _filter_overlapping_builtins(
            builtin_entities, tokens, tags, self.config.tagging_scheme)

        # We resolve conflicts between builtin entities by keeping the longest
        # matches. In case when two builtin entities span the same range, we
        # keep both.
        builtin_entities = _disambiguate_builtin_entities(builtin_entities)

        # We group builtin entities based on their position
        grouped_entities = (
            list(bes)
            for _, bes in groupby(builtin_entities,
                                  key=lambda s: s[RES_MATCH_RANGE][START]))
        grouped_entities = sorted(
            grouped_entities,
            key=lambda entities: entities[0][RES_MATCH_RANGE][START])

        features = self.compute_features(tokens)
        spans_ranges = [entities[0][RES_MATCH_RANGE]
                        for entities in grouped_entities]
        tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)

        # We loop on all possible slots permutations and use the CRF to find
        # the best one in terms of probability
        slots_permutations = _get_slots_permutations(
            grouped_entities, self.slot_name_mapping)
        best_updated_tags = tags
        best_permutation_score = -1
        for slots in slots_permutations:
            updated_tags = copy(tags)
            for slot_index, slot in enumerate(slots):
                indexes = tokens_indexes[slot_index]
                sub_tags_sequence = positive_tagging(
                    self.config.tagging_scheme, slot, len(indexes))
                updated_tags[indexes[0]:indexes[-1] + 1] = sub_tags_sequence
            score = self._get_sequence_probability(features, updated_tags)
            if score > best_permutation_score:
                best_updated_tags = updated_tags
                best_permutation_score = score
        slots = tags_to_slots(text, tokens, best_updated_tags,
                              self.config.tagging_scheme,
                              self.slot_name_mapping)

        return _reconciliate_builtin_slots(text, slots, builtin_entities)
Example #13
    def _augment_slots(self, text, tokens, tags, builtin_slots_names):
        augmented_tags = tags
        scope = [self.slot_name_mapping[slot] for slot in builtin_slots_names]
        builtin_entities = get_builtin_entities(text, self.language, scope)

        builtin_entities = _filter_overlapping_builtins(
            builtin_entities, tokens, tags, self.config.tagging_scheme)

        # NB: itertools.groupby only groups consecutive elements, so the
        # entities are assumed to be ordered by entity kind at this point
        grouped_entities = groupby(builtin_entities,
                                   key=lambda s: s[ENTITY_KIND])
        features = None
        for entity, matches in grouped_entities:
            spans_ranges = [match[RES_MATCH_RANGE] for match in matches]
            num_possible_builtins = len(spans_ranges)
            tokens_indexes = _spans_to_tokens_indexes(spans_ranges, tokens)
            related_slots = list(
                set(s for s in builtin_slots_names if
                    self.slot_name_mapping[s] == entity))
            best_updated_tags = augmented_tags
            best_permutation_score = -1

            for slots in _generate_slots_permutations(
                    num_possible_builtins, related_slots,
                    self.config.exhaustive_permutations_threshold):
                updated_tags = copy(augmented_tags)
                for slot_index, slot in enumerate(slots):
                    if slot_index >= len(tokens_indexes):
                        break
                    indexes = tokens_indexes[slot_index]
                    sub_tags_sequence = positive_tagging(
                        self.config.tagging_scheme, slot, len(indexes))
                    updated_tags[indexes[0]:indexes[-1] + 1] = \
                        sub_tags_sequence
                if features is None:
                    features = self.compute_features(tokens)
                score = self._get_sequence_probability(features, updated_tags)
                if score > best_permutation_score:
                    best_updated_tags = updated_tags
                    best_permutation_score = score
            augmented_tags = best_updated_tags
        slots = tags_to_slots(text, tokens, augmented_tags,
                              self.config.tagging_scheme,
                              self.slot_name_mapping)
        return _reconciliate_builtin_slots(text, slots, builtin_entities)
Example #14
        def builtin_entity_match(tokens, token_index):
            text = initial_string_from_tokens(tokens)
            start = tokens[token_index].start
            end = tokens[token_index].end

            builtin_entities = get_builtin_entities(
                text, self.language, scope=[builtin_entity])
            builtin_entities = [ent for ent in builtin_entities
                                if entity_filter(ent, start, end)]
            # NB: this returns on the first matching entity; any remaining
            # entities in the list are never inspected
            for ent in builtin_entities:
                entity_start = ent[RES_MATCH_RANGE][START]
                entity_end = ent[RES_MATCH_RANGE][END]
                indexes = []
                for index, token in enumerate(tokens):
                    if (entity_start <= token.start < entity_end) \
                            and (entity_start < token.end <= entity_end):
                        indexes.append(index)
                return get_scheme_prefix(token_index, indexes,
                                         self.tagging_scheme)
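
The containment test above keeps only the tokens that lie entirely inside the entity span. A quick standalone check with hypothetical token boundaries:

from collections import namedtuple

Token = namedtuple("Token", ["start", "end"])

def tokens_within(tokens, entity_start, entity_end):
    # A token belongs to the entity only if both of its boundaries fall
    # inside the entity span (the same test as in the feature above)
    return [i for i, t in enumerate(tokens)
            if entity_start <= t.start < entity_end
            and entity_start < t.end <= entity_end]

tokens = [Token(0, 4), Token(5, 7), Token(8, 12)]
assert tokens_within(tokens, 5, 12) == [1, 2]
assert tokens_within(tokens, 3, 7) == [1]  # Token(0, 4) straddles the start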
Example #15
    def test_alphabetic_value(self):
        # Given
        language = LANGUAGE_EN
        string = "1 time and 23 times and one thousand and sixty and 1.2"
        entities = get_builtin_entities(string, language)
        entities = sorted(entities, key=lambda x: x[RES_MATCH_RANGE][START])

        expected_values = [
            "one", "twenty-three", "one thousand and sixty", None
        ]

        self.assertEqual(len(entities), len(expected_values))

        for i, ent in enumerate(entities):
            # When
            value = alphabetic_value(ent, language)

            # Then
            self.assertEqual(value, expected_values[i])
Example #17
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_tokens = tokenize_light(utterance, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [_normalize_stem(t, language)
                                 for t in utterance_tokens]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance, language)
    entities_ranges = (
        e[RES_MATCH_RANGE] for e in
        sorted(builtin_entities, key=lambda e: e[RES_MATCH_RANGE][START])
    )
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove builtin entities from the utterance to avoid learning specific
    # examples such as '42'
    filtered_utterance = _remove_ranges(utterance, entities_ranges)
    filtered_utterance_tokens = tokenize_light(filtered_utterance, language)
    filtered_normalized_stemmed_tokens = [_normalize_stem(t, language)
                                          for t in filtered_utterance_tokens]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features
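
The returned feature string is simply the filtered, stemmed tokens joined with the language's separator, with each feature family appended in sorted order. A minimal sketch with hypothetical inputs:

def build_features(tokens, builtin_features, entity_features,
                   cluster_features, sep=" "):
    # Join the (already filtered and stemmed) tokens, then append each
    # non-empty feature family in sorted order, as the function above does
    features = sep.join(tokens)
    for family in (builtin_features, entity_features, cluster_features):
        if family:
            features += " " + " ".join(sorted(family))
    return features

print(build_features(["book", "a", "flight"],
                     ["builtinsnipsdatetime"], [], ["cluster_42"]))
# book a flight builtinsnipsdatetime cluster_42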
Example #18
def _preprocess_utterance(utterance, language,
                          entity_utterances_to_features_names,
                          word_clusters_name):
    utterance_text = get_text_from_chunks(utterance[DATA])
    utterance_tokens = tokenize_light(utterance_text, language)
    word_clusters_features = _get_word_cluster_features(
        utterance_tokens, word_clusters_name, language)
    normalized_stemmed_tokens = [
        _normalize_stem(t, language) for t in utterance_tokens
    ]
    entities_features = _get_dataset_entities_features(
        normalized_stemmed_tokens, entity_utterances_to_features_names)

    builtin_entities = get_builtin_entities(utterance_text,
                                            language,
                                            use_cache=True)
    builtin_entities_features = [
        _builtin_entity_to_feature(ent[ENTITY_KIND], language)
        for ent in builtin_entities
    ]

    # We remove values of builtin slots from the utterance to avoid learning
    # specific samples such as '42' or 'tomorrow'
    filtered_normalized_stemmed_tokens = [
        _normalize_stem(chunk[TEXT], language) for chunk in utterance[DATA]
        if ENTITY not in chunk or not is_builtin_entity(chunk[ENTITY])
    ]

    features = get_default_sep(language).join(
        filtered_normalized_stemmed_tokens)
    if builtin_entities_features:
        features += " " + " ".join(sorted(builtin_entities_features))
    if entities_features:
        features += " " + " ".join(sorted(entities_features))
    if word_clusters_features:
        features += " " + " ".join(sorted(word_clusters_features))

    return features