def _parse_top_intents(self, text, top_n, intents=None):
        """Collect the regex matches for ``text`` and keep the ``top_n`` best.

        For every intent in ``self.entity_scopes`` (restricted to ``intents``
        when given), the entities found by the builtin and custom entity
        parsers are swapped for placeholders, then the regexes of the intent
        are tried in order: first against the preprocessed placeholder text
        and, when that fails and the two texts differ, against the
        preprocessed raw text. The first regex that matches contributes one
        result for that intent.

        Args:
            text: input string to parse.
            top_n: maximum number of results to return, must be >= 1.
            intents: optional intent filter. A string or a list is converted
                to a set, any other non-None value is used as is for
                membership tests. ``None`` disables the filtering.

        Returns:
            A list of at most ``top_n`` results, sorted by decreasing
            ``res[RES_INTENT][RES_PROBA]``.

        Raises:
            ValueError: if ``top_n`` is lower than 1.
        """
        if isinstance(intents, (str, list)):
            # Normalise the filter into a set for the membership tests below
            intents = {intents} if isinstance(intents, str) else set(intents)

        if top_n < 1:
            message = (
                "top_n argument must be greater or equal to 1, but got: %s"
                % top_n)
            raise ValueError(message)

        def to_placeholder(entity_name):
            return _get_entity_name_placeholder(entity_name, self.language)

        def first_match(intent_name, scope):
            """Return the first regex match for ``intent_name``, or None."""
            found_builtin = self.builtin_entity_parser.parse(
                text, scope=scope["builtin"], use_cache=True)
            found_custom = self.custom_entity_parser.parse(
                text, scope=scope["custom"], use_cache=True)
            ranges_mapping, placeholder_text = \
                replace_entities_with_placeholders(
                    text, found_builtin + found_custom,
                    placeholder_fn=to_placeholder)
            raw_cleaned = self._preprocess_text(text, intent_name)
            placeholder_cleaned = self._preprocess_text(
                placeholder_text, intent_name)
            for pattern in self.regexes_per_intent[intent_name]:
                match = self._get_matching_result(
                    text, placeholder_cleaned, pattern, intent_name,
                    ranges_mapping)
                if match is None and raw_cleaned != placeholder_cleaned:
                    # Second chance on the text without placeholders
                    match = self._get_matching_result(
                        text, raw_cleaned, pattern, intent_name)
                if match is not None:
                    return match
            return None

        results = []
        for intent_name, scope in iteritems(self.entity_scopes):
            if intents is None or intent_name in intents:
                match = first_match(intent_name, scope)
                if match is not None:
                    results.append(match)

        # Several intents may match the same text. Each result is weighted by
        # 1 / (1 + number of slots), so results with fewer slots rank first,
        # and the weights are normalised to sum to 1.
        weights = [1.0 / (1.0 + len(match[RES_SLOTS])) for match in results]
        normalizer = sum(weights)
        for match, weight in zip(results, weights):
            match[RES_INTENT][RES_PROBA] = weight / normalizer

        # The sort is stable, so equally scored results keep their order
        ranked = sorted(
            results, key=lambda r: r[RES_INTENT][RES_PROBA], reverse=True)
        return ranked[:top_n]
示例#2
0
    def test_should_replace_entities(self):
        """Check the placeholder replacement on a text with three entities.

        One of the entities (the custom one, range 7-12) lies inside the
        range of the ordinal entity (3-12); the expected output contains
        placeholders for the ordinal and the datetime entities only.
        """

        def make_entity(kind, value, start, end):
            return {
                "entity_kind": kind,
                "value": value,
                "range": {
                    "start": start,
                    "end": end
                }
            }

        def placeholder_fn(x):
            tokens = tokenize_light(x, "en")
            return "%%%s%%" % "".join(tokens).upper()

        # Given
        text = "Be the first to be there at 9pm"
        entities = [
            make_entity("snips/ordinal", "the first", 3, 12),
            make_entity("my_custom_entity", "first", 7, 12),
            make_entity("snips/datetime", "at 9pm", 25, 31),
        ]

        # When
        range_mapping, processed_text = replace_entities_with_placeholders(
            text=text, entities=entities, placeholder_fn=placeholder_fn)

        # Then
        expected_processed_text = (
            "Be %SNIPSORDINAL% to be there %SNIPSDATETIME%")
        # Keys are ranges in the processed text, values are the matching
        # ranges in the original text
        expected_mapping = {
            (3, 17): {START: 3, END: 12},
            (30, 45): {START: 25, END: 31},
        }

        self.assertDictEqual(expected_mapping, range_mapping)
        self.assertEqual(expected_processed_text, processed_text)
示例#3
0
    def _parse_top_intents(self, text, top_n, intents=None):
        """Return up to ``top_n`` regex-matched intent results for ``text``.

        Each intent of ``self.entity_scopes`` (restricted to ``intents`` when
        given) is tested in turn: the entities found by the builtin and
        custom entity parsers are replaced with placeholders, then the
        regexes of the intent are tried in order against the preprocessed
        placeholder text and, when that fails and the two texts differ,
        against the preprocessed raw text. Only the first matching regex of
        an intent produces a result.

        Args:
            text: input string to parse.
            top_n: maximum number of results to return, must be >= 1.
            intents: optional intent filter. A string or a list is converted
                to a set, any other non-None value is used as is for
                membership tests. ``None`` disables the filtering.

        Returns:
            The first ``top_n`` results, in matching order. All of them
            carry the same ``res[RES_INTENT][RES_PROBA]``: 1 divided by the
            total number of matching intents, counted before truncation.

        Raises:
            ValueError: if ``top_n`` is lower than 1.
        """
        if isinstance(intents, (str, list)):
            # Normalise the filter into a set for the membership tests below
            intents = {intents} if isinstance(intents, str) else set(intents)

        if top_n < 1:
            message = (
                "top_n argument must be greater or equal to 1, but got: %s"
                % top_n)
            raise ValueError(message)

        def to_placeholder(entity_name):
            return _get_entity_name_placeholder(entity_name, self.language)

        def first_match(intent_name, scope):
            """Return the first regex match for ``intent_name``, or None."""
            found_builtin = self.builtin_entity_parser.parse(
                text, scope=scope["builtin"], use_cache=True)
            found_custom = self.custom_entity_parser.parse(
                text, scope=scope["custom"], use_cache=True)
            ranges_mapping, placeholder_text = \
                replace_entities_with_placeholders(
                    text, found_builtin + found_custom,
                    placeholder_fn=to_placeholder)
            raw_cleaned = self._preprocess_text(text, intent_name)
            placeholder_cleaned = self._preprocess_text(
                placeholder_text, intent_name)
            for pattern in self.regexes_per_intent[intent_name]:
                match = self._get_matching_result(
                    text, placeholder_cleaned, pattern, intent_name,
                    ranges_mapping)
                if match is None and raw_cleaned != placeholder_cleaned:
                    # Second chance on the text without placeholders
                    match = self._get_matching_result(
                        text, raw_cleaned, pattern, intent_name)
                if match is not None:
                    return match
            return None

        matches = []
        for intent_name, scope in iteritems(self.entity_scopes):
            if intents is None or intent_name in intents:
                match = first_match(intent_name, scope)
                if match is not None:
                    matches.append(match)

        # The score is shared evenly between all the matching intents, and it
        # is computed before dropping the results beyond top_n
        shared_score = 1. / float(len(matches)) if matches else 1.

        kept = matches[:top_n]
        for match in kept:
            match[RES_INTENT][RES_PROBA] = shared_score
        return kept
示例#4
0
 def _enrich_utterance(self, x, builtin_ents, custom_ents):
     """Turn an utterance into tokens with entities replaced by placeholders.

     Args:
         x: utterance whose ``DATA`` entry is handed to
             ``get_text_from_chunks`` to rebuild the text.
         builtin_ents: builtin entities found in the utterance.
         custom_ents: custom entities found in the utterance; concatenated
             after ``builtin_ents``.

     Returns:
         The tokens of the placeholder text, without the tokens equal to
         ``self.config.unknown_words_replacement_string`` when that setting
         is truthy.
     """
     raw_text = get_text_from_chunks(x[DATA])
     entities = builtin_ents + custom_ents

     # Only the second item of the returned pair (the processed text) is used
     replacement = replace_entities_with_placeholders(
         raw_text, entities, self._placeholder_fn)
     tokens = tokenize_light(replacement[1], self.language)

     unknown_word = self.config.unknown_words_replacement_string
     if not unknown_word:
         return tokens
     # Drop the unknown-word markers
     return [token for token in tokens if token != unknown_word]