Пример #1
0
    def parse(
        self,
        text: Text,
        time: Optional[datetime.datetime] = None,
        only_output_properties: bool = True,
    ) -> Dict[Text, Any]:
        """Run the text through every pipeline component and return the result.

        Args:
            text: The raw input to analyse.
            time: Optional timestamp handed to the message that is created
                for the input.
            only_output_properties: Forwarded to ``Message.as_dict`` when the
                processed message is turned into a dictionary.

        Returns:
            The default output attributes, updated with the dictionary form
            of the processed message. For empty input only the defaults plus
            ``"text": ""`` are returned.
        """

        if text:
            message = Message(
                text, self.default_output_attributes(), time=time)

            for step in self.pipeline:
                step.process(message, **self.context)

            # Start from a fresh set of defaults so attributes no component
            # has set are still present in the result.
            result = self.default_output_attributes()
            as_dict = message.as_dict(
                only_output_properties=only_output_properties)
            result.update(as_dict)
            return result

        # Not every component can deal with an empty string, so the pipeline
        # is skipped. The fallback does not carry all output attributes of
        # all components, but nobody should pass an empty string anyway.
        fallback = self.default_output_attributes()
        fallback["text"] = ""
        return fallback
    def process(self, message: Message, **kwargs: Any) -> None:
        """Extract entities from the message and append them to its entities.

        The entities returned by ``self.extract_entities`` are passed through
        ``self.add_extractor_name`` and then added after any entities the
        message already holds. The combined list is stored under
        ``"entities"`` with ``add_to_output=True``.

        Args:
            message: The message to analyse and update in place.
            **kwargs: Not used by this method.
        """

        self._check_spacy_doc(message)

        found = self.extract_entities(message)
        labelled = self.add_extractor_name(found)
        already_present = message.get("entities", [])
        message.set(
            "entities", already_present + labelled, add_to_output=True)
Пример #3
0
 def _parse_training_example(self, example):
     """Build a message from one annotated example.

     Entities are looked up in the raw example, every ``ent_regex`` match is
     replaced by its ``entity_text`` group to obtain the plain text, and the
     synonyms are registered for that plain text. The returned message
     carries ``self.current_title`` as intent and, when any were found, the
     entities.
     """
     found_entities = self._find_entities_in_training_example(example)

     def entity_text(match):
         # keep only the annotated text, dropping the markup around it
         return match.groupdict()['entity_text']

     stripped = re.sub(ent_regex, entity_text, example)
     self._add_synonyms(stripped, found_entities)

     result = Message(stripped, {'intent': self.current_title})
     if len(found_entities) > 0:
         result.set('entities', found_entities)
     return result
Пример #4
0
 def _combine_with_existing_features(
     message: Message,
     additional_features: Any,
     feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[
         MESSAGE_TEXT_ATTRIBUTE],
 ) -> Any:
     """Append new features to the ones already stored on the message.

     Args:
         message: The message that may already hold features.
         additional_features: The features to add.
         feature_name: Key under which existing features are looked up.

     Returns:
         ``additional_features`` unchanged when the message has nothing
         stored under ``feature_name``; otherwise the stored features
         concatenated with ``additional_features`` along the last axis.
     """
     existing = message.get(feature_name)
     if existing is None:
         return additional_features
     return np.concatenate((existing, additional_features), axis=-1)
Пример #5
0
    def filter_trainable_entities(
        self, entity_examples: List[Message]
    ) -> List[Message]:
        """Return copies of the examples without untrainable entities.

        An entity annotation is kept when its `extractor` value is missing
        or falsy, or when it equals `self.name`
        (e.g. 'CRFEntityExtractor'). All other annotations are dropped.
        The input messages are not modified; each one is rebuilt from a
        shallow copy of its data.
        """

        def is_trainable(entity):
            source = entity.get("extractor")
            return not source or source == self.name

        result = []
        for example in entity_examples:
            payload = example.data.copy()
            payload["entities"] = [
                entity
                for entity in example.get("entities", [])
                if is_trainable(entity)
            ]
            rebuilt = Message(
                text=example.text,
                data=payload,
                output_properties=example.output_properties,
                time=example.time,
            )
            result.append(rebuilt)

        return result
Пример #6
0
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format.

        Args:
            js: The parsed JSON content. It is validated with
                ``validate_rasa_nlu_data`` and must contain a
                ``rasa_nlu_data`` entry.
            **kwargs: Not used by this reader.

        Returns:
            A ``TrainingData`` object built from the examples of the
            ``common_examples``, ``intent_examples`` and ``entity_examples``
            sections (in that order), the transformed entity synonyms and
            the regex features.
        """
        validate_rasa_nlu_data(js)

        data = js['rasa_nlu_data']
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            # Logger.warn is a deprecated alias of Logger.warning; the
            # emitted record is the same.
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = [
            Message.build(ex['text'], ex.get("intent"), ex.get("entities"))
            for ex in all_examples
        ]

        return TrainingData(training_examples, entity_synonyms, regex_features)
Пример #7
0
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the WIT.ai data format.

        Every item of ``js["data"]`` that has an ``entities`` value becomes
        one message. The first entity named ``intent`` supplies the intent;
        the remaining entities are kept only when they carry both ``start``
        and ``end``. Surrounding double quotes are stripped from the values
        (the kept entity dictionaries are modified in place).
        """

        def is_span_entity(candidate):
            return ("start" in candidate and "end" in candidate
                    and candidate["entity"] != 'intent')

        messages = []

        for sample in js["data"]:
            raw_entities = sample.get("entities")
            if raw_entities is None:
                # samples without an entity list are skipped entirely
                continue

            text = sample.get("text")

            intent_values = [
                item["value"] for item in raw_entities
                if item["entity"] == 'intent'
            ]
            intent = None
            if intent_values:
                intent = intent_values[0].strip("\"")

            span_entities = list(filter(is_span_entity, raw_entities))
            for item in span_entities:
                # for some reason wit adds additional quotes around entity values
                item["value"] = item["value"].strip("\"")

            data = {"intent": intent} if intent else {}
            data["entities"] = span_entities
            messages.append(Message(text, data))

        return TrainingData(messages)
 def __additional_ner_features(message: Message) -> List[Any]:
     """Convert the message's custom NER features to crfsuite's format.

     Reads ``ner_features`` and ``tokens`` from the message (both default
     to an empty list). Each per-token feature sequence becomes
     ``{"custom_ner_features": {"0": ..., "1": ...}}`` where the keys are
     the stringified positions within that sequence.

     Raises:
         Exception: If the number of feature entries differs from the
             number of tokens.
     """
     per_token = message.get("ner_features", [])
     tokens = message.get("tokens", [])

     token_count, feature_count = len(tokens), len(per_token)
     if token_count != feature_count:
         raise Exception(
             "Number of custom NER features ({}) does not match number of tokens ({})"
             .format(feature_count, token_count))

     # convert to python-crfsuite feature format
     return [
         {
             "custom_ner_features": {
                 str(position): value
                 for position, value in enumerate(token_values)
             }
         }
         for token_values in per_token
     ]
Пример #9
0
    def _read_intent(self, intent_js, examples_js):
        """Build training data for one intent from its two JSON parts.

        The intent name is taken from ``intent_js["name"]``. For every
        example, the text and entities returned by
        ``self._join_text_chunks`` for its ``data`` entry are combined with
        that intent into a message.
        """
        intent_name = intent_js.get("name")

        # Lazily evaluated, so each example is joined right before its
        # message is built.
        joined = (self._join_text_chunks(example['data'])
                  for example in examples_js)
        messages = [
            Message.build(text, intent_name, entities)
            for text, entities in joined
        ]

        return TrainingData(messages)
    def _from_crf_to_json(self, message: Message,
                          entities: List[Any]) -> List[Dict[Text, Any]]:
        """Turn crfsuite tagging output into entity results for the message.

        The tokens are read from ``spacy_doc`` when ``self.pos_features`` is
        set and from ``tokens`` otherwise. Depending on the ``BILOU_flag``
        setting the BILOU or the simple converter produces the result.

        Raises:
            Exception: If the number of tokens differs from the number of
                entries in ``entities``.
        """

        token_key = "spacy_doc" if self.pos_features else "tokens"
        tokens = message.get(token_key)

        if len(tokens) != len(entities):
            raise Exception(
                "Inconsistency in amount of tokens between crfsuite and message"
            )

        if not self.component_config["BILOU_flag"]:
            # not using BILOU tagging scheme, multi-word entities are split.
            return self._convert_simple_tagging_to_entity_result(
                tokens, entities)

        return self._convert_bilou_tagging_to_entity_result(
            message, tokens, entities)
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                    Optional[Dict[Text, Any]], ]]:
        """Convert json examples to format of underlying crfsuite.

        One tag per token is derived from ``entity_offsets``: through
        spaCy's ``GoldParse`` when ``self.pos_features`` is set, otherwise
        through ``self._bilou_tags_from_offsets``. Every contiguous run of
        tokens tagged ``"-"`` is reported as a misaligned annotation,
        including a run that reaches the end of the sentence. When the
        ``BILOU_flag`` setting is off, the BILOU prefixes are removed from
        the tags.

        Args:
            message: The training example; its ``spacy_doc`` or ``tokens``
                entry supplies the tokens.
            entity_offsets: The entity annotations as 3-tuples, presumably
                ``(start, end, label)`` - confirm against the caller.

        Returns:
            The result of ``self._from_text_to_crf`` for the message and the
            computed tags.
        """

        if self.pos_features:
            from spacy.gold import GoldParse  # pytype: disable=import-error

            doc_or_tokens = message.get("spacy_doc")
            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
            ents = [annotation[5] for annotation in gold.orig_annot]
        else:
            doc_or_tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)

        def warn_misaligned(tokens):
            # Report one contiguous run of tokens without a usable tag.
            collected_text = " ".join([token.text for token in tokens])
            logger.warning("Misaligned entity annotation for '{}' "
                           "in sentence '{}' with intent '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(collected_text, message.text,
                                     message.get("intent")))

        # collect badly annotated examples
        collected = []
        for t, e in zip(doc_or_tokens, ents):
            if e == "-":
                collected.append(t)
            elif collected:
                warn_misaligned(collected)
                collected = []
        if collected:
            # A misaligned run at the very end of the sentence has no
            # following aligned token to trigger the warning inside the loop.
            warn_misaligned(collected)

        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
Пример #12
0
    def process(self, message: Message, **kwargs: Any) -> None:
        """Compute bag-of-words features for the message text and store them.

        When no vectorizers are available an error is logged and the message
        is left untouched. Otherwise the text is transformed by the text
        vectorizer, combined with any features already stored on the
        message, and written back under the text feature name.
        """

        if self.vectorizers is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
            return

        text = self._get_message_text_by_attribute(
            message, attribute=MESSAGE_TEXT_ATTRIBUTE)

        vectorizer = self.vectorizers[MESSAGE_TEXT_ATTRIBUTE]
        bag = vectorizer.transform([text]).toarray().squeeze()

        feature_name = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]
        combined = self._combine_with_existing_features(
            message, bag, feature_name=feature_name)
        message.set(feature_name, combined)
    def _from_text_to_crf(
        self,
        message: Message,
        entities: List[Text] = None
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any],
                    Optional[Dict[Text, Any]], ]]:
        """Build one crfsuite input row per token of the message.

        Each row is ``(token text, tag, entity label, pattern, custom NER
        features)``. The tag is only computed when ``self.pos_features`` is
        set, the custom features only when ``self.use_ner_features`` is set;
        otherwise those positions hold ``None``. Without ``entities`` every
        label is ``"N/A"``.
        """

        tokens = message.get("spacy_doc" if self.pos_features else "tokens")

        ner_features = None
        if self.use_ner_features:
            ner_features = self.__additional_ner_features(message)

        rows = []
        for index, token in enumerate(tokens):
            pattern = self.__pattern_of_token(message, index)
            label = entities[index] if entities else "N/A"

            pos_tag = None
            if self.pos_features:
                pos_tag = self.__tag_of_token(token)

            extra = None
            if self.use_ner_features:
                extra = ner_features[index]

            rows.append((token.text, pos_tag, label, pattern, extra))
        return rows
Пример #14
0
    def read_from_json(self, js, **kwargs):
        # type: (Text, Any) -> TrainingData
        """Loads training data stored in the LUIS.ai data format.

        Activated entries of ``regex_features`` are kept as name/pattern
        pairs. Every utterance becomes one message; the inclusive
        ``endPos`` of each entity is converted to an exclusive ``end`` and
        the entity value is cut out of the utterance text.

        Raises:
            Exception: If ``luis_schema_version`` does not start with "2".
        """

        # Simple check to ensure we support this luis data schema version
        schema_version = js["luis_schema_version"]
        if not schema_version.startswith("2"):
            raise Exception(
                "Invalid luis data schema version {}, should be 2.x.x. "
                "Make sure to use the latest luis version "
                "(e.g. by downloading your data again)."
                "".format(schema_version))

        regex_features = [
            {"name": feature.get("name"), "pattern": feature.get("pattern")}
            for feature in js.get("regex_features", [])
            if feature.get("activated", False)
        ]

        def to_entity(text, annotation):
            # endPos is inclusive, the stored end offset is exclusive
            start = annotation["startPos"]
            end = annotation["endPos"] + 1
            value = text[start:end]
            return {
                "entity": annotation["entity"],
                "value": value,
                "start": start,
                "end": end
            }

        training_examples = []
        for utterance in js["utterances"]:
            text = utterance.get("text")
            intent = utterance.get("intent")
            annotations = utterance.get("entities") or []

            data = {
                "entities": [to_entity(text, item) for item in annotations]
            }
            if intent:
                data["intent"] = intent
            training_examples.append(Message(text, data))

        return TrainingData(training_examples, regex_features=regex_features)
    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
        """Return the example's entities as ``(start, end, entity)`` tuples.

        An example without an ``entities`` entry yields an empty list.
        """
        return [
            (annotation["start"], annotation["end"], annotation["entity"])
            for annotation in example.get("entities", [])
        ]