def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
) -> List[Tuple[Text, Text, Text, Text]]:
    """Convert json examples to format of underlying crfsuite."""

    if self.pos_features:
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
    else:
        tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

    if '-' in ents:
        logger.warning("Misaligned entity annotation in sentence '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces or punctuation)."
                       "".format(message.text))

    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)

    return self._from_text_to_crf(message, ents)
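A minimal sketch of the BILOU tags this conversion aims to produce, assuming a whitespace tokenizer. The helper below is a hypothetical stand-in for `_bilou_tags_from_offsets`, and the `Token` tuple only mimics the `offset`/`end` attributes of the real token objects; misalignment handling ('-') is omitted.

from collections import namedtuple

Token = namedtuple("Token", ["text", "offset", "end"])

def bilou_tags_from_offsets_sketch(tokens, entity_offsets):
    # "O" for tokens outside any entity
    tags = ["O"] * len(tokens)
    for start, end, label in entity_offsets:
        # tokens fully covered by the annotated character span
        covered = [i for i, t in enumerate(tokens)
                   if t.offset >= start and t.end <= end]
        if not covered:
            continue
        if len(covered) == 1:
            tags[covered[0]] = "U-" + label          # single-token entity
        else:
            tags[covered[0]] = "B-" + label          # begin
            for i in covered[1:-1]:
                tags[i] = "I-" + label               # inside
            tags[covered[-1]] = "L-" + label         # last
    return tags

tokens = [Token("book", 0, 4), Token("a", 5, 6), Token("flight", 7, 13),
          Token("to", 14, 16), Token("Berlin", 17, 23)]
print(bilou_tags_from_offsets_sketch(tokens, [(17, 23, "city")]))
# -> ['O', 'O', 'O', 'O', 'U-city']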
def parse(self,
          text: Text,
          time: Optional[datetime.datetime] = None,
          only_output_properties: bool = True) -> Dict[Text, Any]:
    """Parse the input text, classify it and return pipeline result.

    The pipeline result usually contains intent and entities."""

    if not text:
        # Not all components are able to handle empty strings. So we need
        # to prevent that... This default return will not contain all
        # output attributes of all components, but in the end, no one
        # should pass an empty string in the first place.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)

    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(
        message.as_dict(only_output_properties=only_output_properties))
    return output
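For orientation, a hedged sketch of the structure `parse` typically returns once an intent classifier and an entity extractor have run. The exact keys come from `default_output_attributes()` and the components in the pipeline; every value below is invented for illustration.

expected_output = {
    "text": "book a flight to Berlin",
    "intent": {"name": "book_flight", "confidence": 0.92},
    "intent_ranking": [
        {"name": "book_flight", "confidence": 0.92},
        {"name": "greet", "confidence": 0.05},
    ],
    "entities": [
        {"start": 17, "end": 23, "value": "Berlin",
         "entity": "city", "extractor": "ner_crf"},
    ],
}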
def process(self, message: Message, **kwargs: Any) -> None:
    """Return the most likely intent and its probability for a message."""

    if not self.clf:
        # component is either not trained or didn't
        # receive enough training data
        intent = None
        intent_ranking = []
    else:
        X = message.get("text_features").reshape(1, -1)
        intent_ids, probabilities = self.predict(X)
        intents = self.transform_labels_num2str(np.ravel(intent_ids))
        # `predict` returns a matrix as it is supposed
        # to work for multiple examples as well, hence we need to flatten
        probabilities = probabilities.flatten()

        if intents.size > 0 and probabilities.size > 0:
            ranking = list(zip(list(intents),
                               list(probabilities)))[:INTENT_RANKING_LENGTH]

            intent = {"name": intents[0], "confidence": probabilities[0]}

            intent_ranking = [{"name": intent_name, "confidence": score}
                              for intent_name, score in ranking]
        else:
            intent = {"name": None, "confidence": 0.0}
            intent_ranking = []

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)
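A small standalone illustration of how the ranking above is assembled from the flattened predictions; `INTENT_RANKING_LENGTH` is assumed to be 10, and the labels and probabilities are made up.

INTENT_RANKING_LENGTH = 10  # assumed value
intents = ["greet", "goodbye", "book_flight"]
probabilities = [0.7, 0.2, 0.1]

ranking = list(zip(intents, probabilities))[:INTENT_RANKING_LENGTH]
intent_ranking = [{"name": name, "confidence": score}
                  for name, score in ranking]
print(intent_ranking)
# -> [{'name': 'greet', 'confidence': 0.7},
#     {'name': 'goodbye', 'confidence': 0.2},
#     {'name': 'book_flight', 'confidence': 0.1}]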
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
    features = self.features_for_tokens(message.get("tokens"),
                                        mitie_feature_extractor)
    message.set("text_features",
                self._combine_with_existing_text_features(message, features))
def process(self, message: Message, **kwargs: Any) -> None:
    self._check_spacy_doc(message)

    extracted = self.add_extractor_name(self.extract_entities(message))
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    # can't use the existing doc here (spacy_doc on the message)
    # because tokens are lower cased which is bad for NER
    spacy_nlp = kwargs.get("spacy_nlp", None)
    doc = spacy_nlp(message.text)
    extracted = self.add_extractor_name(self.extract_entities(doc))
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    if self.vect is None:
        logger.error("There is no trained CountVectorizer: "
                     "component is either not trained or "
                     "didn't receive enough training data")
    else:
        message_text = self._get_message_text(message)

        bag = self.vect.transform([message_text]).toarray().squeeze()
        message.set("text_features",
                    self._combine_with_existing_text_features(message, bag))
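For context, a self-contained sketch of the bag-of-words transform used above, calling scikit-learn's `CountVectorizer` directly on a toy corpus; the real vectorizer is fit during training, not inline like this.

from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(["hello there", "book a flight"])  # toy training corpus

bag = vect.transform(["book a flight to berlin"]).toarray().squeeze()
print(bag)  # one count per vocabulary token; out-of-vocabulary words are ignored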
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception("Failed to extract entities. "
                        "Missing a proper MITIE feature extractor.")

    ents = self.extract_entities(message.text, message.get("tokens"),
                                 mitie_feature_extractor)
    extracted = self.add_extractor_name(ents)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def _parse_training_example(self, example):
    """Extract entities and synonyms, and convert to plain text."""
    from spawn_ai.training_data import Message

    entities = self._find_entities_in_training_example(example)
    plain_text = re.sub(ent_regex,
                        lambda m: m.groupdict()['entity_text'],
                        example)
    self._add_synonyms(plain_text, entities)
    message = Message(plain_text, {'intent': self.current_title})
    if len(entities) > 0:
        message.set('entities', entities)
    return message
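To make the substitution above concrete, here is a hedged, standalone version of it. The regex is a simplified stand-in for the module-level `ent_regex` and assumes the markdown-style annotation `[entity text](entity_type)`; synonym annotations are not covered by this simplified pattern.

import re

ent_regex = re.compile(r"\[(?P<entity_text>[^\]]+)\]\((?P<entity>[^)]+)\)")

example = "book a flight to [Berlin](city)"
plain_text = re.sub(ent_regex,
                    lambda m: m.groupdict()['entity_text'],
                    example)
print(plain_text)  # -> "book a flight to Berlin"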
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
    """Loads training data stored in the WIT.ai data format."""
    from spawn_ai.training_data import Message, TrainingData

    training_examples = []
    for s in js["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e
                        and e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entities
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))
    return TrainingData(training_examples)
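A hedged sample of the WIT.ai export structure this reader expects; the field names mirror the code above, but the concrete values are invented. Note the extra quotes around values that the loop above strips off.

wit_sample = {
    "data": [
        {
            "text": "i want to fly to Berlin",
            "entities": [
                {"entity": "intent", "value": "\"book_flight\""},
                {"entity": "location", "value": "\"Berlin\"",
                 "start": 17, "end": 23},
            ],
        }
    ]
}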
def _from_text_to_crf(
        self, message: Message, entities: List[Text] = None
) -> List[Tuple[Text, Text, Text, Text]]:
    """Takes a sentence and switches it to crfsuite format."""

    crf_format = []
    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")
    for i, token in enumerate(tokens):
        pattern = self.__pattern_of_token(message, i)
        entity = entities[i] if entities else "N/A"
        tag = self.__tag_of_token(token) if self.pos_features else None
        crf_format.append((token.text, tag, entity, pattern))
    return crf_format
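For reference, the kind of (token text, POS tag, entity label, pattern) tuples this produces for a training example "flights to Berlin"; the values are invented, the tag slot is None because `pos_features` is assumed off, and the pattern slot is None because no regex featurizer is assumed to have run.

crf_format = [
    ("flights", None, "O", None),
    ("to", None, "O", None),
    ("Berlin", None, "U-city", None),
]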
def process(self, message: Message, **kwargs: Any) -> None:
    if self._url() is not None:
        reference_time = self._reference_time_from_message(message)
        matches = self._duckling_parse(message.text, reference_time)
        dimensions = self.component_config["dimensions"]
        relevant_matches = filter_irrelevant_matches(matches, dimensions)
        extracted = convert_duckling_format_to_rasa(relevant_matches)
    else:
        extracted = []
        logger.warning("Duckling HTTP component in pipeline, but no "
                       "`url` configuration in the config "
                       "file nor is `RASA_DUCKLING_HTTP_URL` "
                       "set as an environment variable.")

    extracted = self.add_extractor_name(extracted)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def _from_crf_to_json(self,
                      message: Message,
                      entities: List[Any]) -> List[Dict[Text, Any]]:
    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")

    if len(tokens) != len(entities):
        raise Exception('Inconsistency in the number of tokens '
                        'between crfsuite and message')

    if self.component_config["BILOU_flag"]:
        return self._convert_bilou_tagging_to_entity_result(
            tokens, entities)
    else:
        # not using BILOU tagging scheme, multi-word entities are split.
        return self._convert_simple_tagging_to_entity_result(
            tokens, entities)
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception("Failed to classify intent. "
                        "Missing a proper MITIE feature extractor.")

    if self.clf:
        token_strs = self._tokens_of_message(message)
        intent, confidence = self.clf(token_strs, mitie_feature_extractor)
    else:
        # either the model didn't get trained or it wasn't
        # provided with any data
        intent = None
        confidence = 0.0

    message.set("intent",
                {"name": intent, "confidence": confidence},
                add_to_output=True)
def _read_intent(self, intent_js, examples_js):
    """Reads the intent and examples from respective jsons."""
    from spawn_ai.training_data import Message, TrainingData

    intent = intent_js.get("name")

    training_examples = []
    for ex in examples_js:
        text, entities = self._join_text_chunks(ex['data'])
        training_examples.append(Message.build(text, intent, entities))

    return TrainingData(training_examples)
def read_from_json(self,
                   js: Dict[Text, Any],
                   **kwargs: Any) -> 'TrainingData':
    """Loads training data stored in the LUIS.ai data format."""
    from spawn_ai.training_data import Message, TrainingData

    training_examples = []
    regex_features = []

    # Simple check to ensure we support this luis data schema version
    if not js["luis_schema_version"].startswith("2"):
        raise Exception("Invalid luis data schema version {}, "
                        "should be 2.x.x. "
                        "Make sure to use the latest luis version "
                        "(e.g. by downloading your data again)."
                        "".format(js["luis_schema_version"]))

    for r in js.get("regex_features", []):
        if r.get("activated", False):
            regex_features.append({"name": r.get("name"),
                                   "pattern": r.get("pattern")})

    for s in js["utterances"]:
        text = s.get("text")
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            start, end = e["startPos"], e["endPos"] + 1
            val = text[start:end]
            entities.append({"entity": e["entity"],
                             "value": val,
                             "start": start,
                             "end": end})

        data = {"entities": entities}
        if intent:
            data["intent"] = intent
        training_examples.append(Message(text, data))
    return TrainingData(training_examples, regex_features=regex_features)
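A hedged sample utterance in the LUIS format, illustrating why the `+ 1` above is needed: LUIS `startPos`/`endPos` are inclusive character positions, while the extracted spans use an exclusive end.

luis_utterance = {
    "text": "book a flight to berlin",
    "intent": "book_flight",
    "entities": [{"entity": "location", "startPos": 17, "endPos": 22}],
}

start, end = 17, 22 + 1
print(luis_utterance["text"][start:end])  # -> "berlin"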
def filter_trainable_entities(
        self, entity_examples: List[Message]) -> List[Message]:
    """Filters out untrainable entity annotations.

    Creates a copy of entity_examples in which entities that have
    `extractor` set to something other than self.name (e.g. 'ner_crf')
    are removed."""

    filtered = []
    for message in entity_examples:
        entities = []
        for ent in message.get("entities", []):
            extractor = ent.get("extractor")
            if not extractor or extractor == self.name:
                entities.append(ent)
        data = message.data.copy()
        data['entities'] = entities
        filtered.append(
            Message(text=message.text,
                    data=data,
                    output_properties=message.output_properties,
                    time=message.time))

    return filtered
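A small standalone illustration of the filtering rule above: an entity tagged with another extractor is dropped, while entities with no extractor or with this component's own name are kept. The extractor names ('ner_crf', 'duckling_http') are invented for the example.

entities = [
    {"entity": "city", "value": "Berlin", "extractor": "ner_crf"},
    {"entity": "time", "value": "tomorrow", "extractor": "duckling_http"},
    {"entity": "cuisine", "value": "thai"},  # not yet attributed to any extractor
]
self_name = "ner_crf"  # assumed component name

kept = [e for e in entities
        if not e.get("extractor") or e.get("extractor") == self_name]
print([e["entity"] for e in kept])  # -> ['city', 'cuisine']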
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.get("spacy_doc")))
def process(self, message: Message, **kwargs: Any) -> None:
    updated_entities = message.get("entities", [])[:]
    self.replace_synonyms(updated_entities)
    message.set("entities", updated_entities, add_to_output=True)
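A hedged sketch of what `replace_synonyms` is expected to do with a toy mapping; the actual lookup details (e.g. case handling) live in the component and may differ.

synonyms = {"nyc": "New York City"}  # assumed synonym mapping
entities = [{"entity": "city", "value": "nyc", "start": 10, "end": 13}]

for ent in entities:
    ent["value"] = synonyms.get(str(ent["value"]), ent["value"])
print(entities[0]["value"])  # -> "New York City"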
def process(self, message: Message, **kwargs: Any) -> None:
    intent = {"name": self.parse(message.text), "confidence": 1.0}
    message.set("intent", intent, add_to_output=True)
def process(self, message: Message, **kwargs: Any):
    updated = self._text_features_with_ngrams(message,
                                              self.best_num_ngrams)
    message.set("text_features", updated)
def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
    def convert_entity(entity):
        return entity["start"], entity["end"], entity["entity"]

    return [convert_entity(ent) for ent in example.get("entities", [])]
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("spacy_doc", self.doc_for_text(message.text))
def process(self, message: Message, **kwargs: Any) -> None:
    updated = self._text_features_with_regex(message)
    message.set("text_features", updated)
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.text))