Пример #1
0
 def _combine_with_existing_features(
     message: Message,
     additional_features: Any,
     feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[
         MESSAGE_TEXT_ATTRIBUTE],
 ) -> Any:
     """Append new features to the ones already stored on the message.

     Args:
         message: Message whose ``feature_name`` entry is read via ``get``.
         additional_features: Features to return, or to append to the
             existing ones.
         feature_name: Key under which existing features are looked up.

     Returns:
         ``additional_features`` unchanged when the message holds no value
         for ``feature_name``; otherwise the existing value concatenated
         with ``additional_features`` along the last axis.
     """
     existing = message.get(feature_name)
     if existing is None:
         # Nothing stored yet: the new features stand on their own.
         return additional_features
     return np.concatenate((existing, additional_features), axis=-1)
 def __additional_ner_features(message: Message) -> List[Any]:
     """Convert the message's custom NER features to crfsuite's format.

     Reads ``"ner_features"`` and ``"tokens"`` from the message (both
     default to an empty list) and requires one feature entry per token.

     Args:
         message: Message providing the ``"ner_features"`` and ``"tokens"``
             entries.

     Returns:
         One dict per feature entry, shaped as
         ``{"custom_ner_features": {"0": v0, "1": v1, ...}}`` where the keys
         are the stringified positions of the values inside that entry.

     Raises:
         Exception: If the number of feature entries differs from the
             number of tokens.
     """
     ner_features = message.get("ner_features", [])
     tokens = message.get("tokens", [])

     token_count = len(tokens)
     feature_count = len(ner_features)
     if token_count != feature_count:
         raise Exception(
             "Number of custom NER features ({}) does not match number of tokens ({})".format(
                 feature_count, token_count))

     # convert to python-crfsuite feature format
     return [
         {
             "custom_ner_features": {
                 str(position): value
                 for position, value in enumerate(per_token)
             }
         }
         for per_token in ner_features
     ]
    def process(self, message: Message, **kwargs: Any) -> None:
        """Extract entities from the message and append them to its entities.

        Args:
            message: Message to process; its ``"entities"`` entry is
                replaced by the previous entities (or an empty list)
                followed by the newly extracted ones, with
                ``add_to_output=True``.
            **kwargs: Accepted but not used here.
        """
        self._check_spacy_doc(message)

        new_entities = self.add_extractor_name(self.extract_entities(message))
        # Keep whatever entities are already on the message and append ours.
        merged_entities = message.get("entities", []) + new_entities
        message.set("entities", merged_entities, add_to_output=True)
    def _from_crf_to_json(self, message: Message,
                          entities: List[Any]) -> List[Dict[Text, Any]]:
        """Turn per-token crfsuite output into entity result dicts.

        Args:
            message: Message providing the tokens (the ``"spacy_doc"`` entry
                when ``self.pos_features`` is set, otherwise ``"tokens"``).
            entities: Per-token predictions; must have one item per token.

        Returns:
            The result of the BILOU or the simple tagging conversion,
            depending on ``self.component_config["BILOU_flag"]``.

        Raises:
            Exception: If the number of tokens and predictions differ.
        """
        token_key = "spacy_doc" if self.pos_features else "tokens"
        tokens = message.get(token_key)

        if len(tokens) != len(entities):
            raise Exception(
                "Inconsistency in amount of tokens between crfsuite and message"
            )

        if not self.component_config["BILOU_flag"]:
            # not using BILOU tagging scheme, multi-word entities are split.
            return self._convert_simple_tagging_to_entity_result(
                tokens, entities)

        return self._convert_bilou_tagging_to_entity_result(
            message, tokens, entities)
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                    Optional[Dict[Text, Any]], ]]:
        """Convert json examples to format of underlying crfsuite.

        Derives one tag per token from the character offsets of the
        annotated entities, warns about annotations that do not line up
        with the tokens, and hands the tags to ``_from_text_to_crf``.

        Args:
            message: Training example providing the ``"spacy_doc"`` entry
                (when ``self.pos_features`` is set) or the ``"tokens"``
                entry otherwise.
            entity_offsets: ``(start, end, entity)`` triples of the
                annotated entities.

        Returns:
            Whatever ``_from_text_to_crf`` returns for the message and the
            computed per-token tags.
        """

        if self.pos_features:
            from spacy.gold import GoldParse  # pytype: disable=import-error

            doc_or_tokens = message.get("spacy_doc")
            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
            # NOTE(review): index 5 is presumably the entity tag column of
            # spaCy's original annotation tuples - confirm against spaCy.
            ents = [annotation[5] for annotation in gold.orig_annot]
        else:
            doc_or_tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)

        def warn_misaligned(tokens: List[Any]) -> None:
            # Report one run of consecutive tokens tagged "-".
            collected_text = " ".join([t.text for t in tokens])
            logger.warning("Misaligned entity annotation for '{}' "
                           "in sentence '{}' with intent '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(collected_text, message.text,
                                     message.get("intent")))

        # collect badly annotated examples
        collected = []
        for t, e in zip(doc_or_tokens, ents):
            if e == "-":
                collected.append(t)
            elif collected:
                warn_misaligned(collected)
                collected = []
        if collected:
            # A misaligned run that reaches the end of the sentence has no
            # following token to trigger the warning above, so flush it here
            # instead of dropping it silently.
            warn_misaligned(collected)

        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
    def _from_text_to_crf(
        self,
        message: Message,
        entities: List[Text] = None
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                    Optional[Dict[Text, Any]], ]]:
        """Build the per-token crfsuite rows for a message.

        Args:
            message: Message providing the tokens (the ``"spacy_doc"`` entry
                when ``self.pos_features`` is set, otherwise ``"tokens"``).
            entities: Optional per-token entity labels; when missing or
                empty every token gets the label ``"N/A"``.

        Returns:
            One ``(text, tag, entity, pattern, custom_ner_features)`` tuple
            per token. ``tag`` is ``None`` unless ``self.pos_features`` is
            set and ``custom_ner_features`` is ``None`` unless
            ``self.use_ner_features`` is set.
        """

        token_key = "spacy_doc" if self.pos_features else "tokens"
        tokens = message.get(token_key)

        if self.use_ner_features:
            ner_features = self.__additional_ner_features(message)
        else:
            ner_features = None

        rows = []
        for idx, tok in enumerate(tokens):
            pattern = self.__pattern_of_token(message, idx)
            label = entities[idx] if entities else "N/A"
            pos_tag = self.__tag_of_token(tok) if self.pos_features else None
            extra = ner_features[idx] if self.use_ner_features else None
            rows.append((tok.text, pos_tag, label, pattern, extra))
        return rows
    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
        """Return the example's entities as ``(start, end, entity)`` tuples.

        Args:
            example: Object whose ``"entities"`` entry (default: empty list)
                holds mappings with ``"start"``, ``"end"`` and ``"entity"``
                keys.

        Returns:
            One tuple per entity, in the order the entities are stored.
        """
        return [
            (item["start"], item["end"], item["entity"])
            for item in example.get("entities", [])
        ]