def parse(self,
          text: Text,
          time: Optional[datetime.datetime] = None,
          only_output_properties: bool = True) -> Dict[Text, Any]:
    """Parse the input text, classify it and return pipeline result.

    The pipeline result usually contains intent and entities."""

    if not text:
        # Not all components are able to handle empty strings. So we need
        # to prevent that... This default return will not contain all
        # output attributes of all components, but in the end, no one
        # should pass an empty string in the first place.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)

    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(
        message.as_dict(only_output_properties=only_output_properties))
    return output
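# Hypothetical usage sketch (names other than parse() are assumptions, not
# taken from the source): `interpreter` stands for a loaded instance of the
# interpreter class that owns parse() above, with a trained pipeline.
#
#     result = interpreter.parse("book a table for two tonight")
#     result["text"]                 # the original input text
#     result.get("intent")           # set by the intent classifier, if any
#     result.get("entities", [])     # set by the entity extractors, if any
#
# Note that parse("") short-circuits: it returns only the default output
# attributes with "text" set to "", so no pipeline component ever sees an
# empty string.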
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any) -> 'TrainingData':
    """Loads training data stored in the WIT.ai data format."""
    from spawn_ai.training_data import Message, TrainingData

    training_examples = []

    for s in js["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"]
                   for e in entities
                   if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entities
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))
    return TrainingData(training_examples)
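# Illustrative input for the WIT.ai reader above. The dict shape mirrors the
# keys the code actually reads; `reader` is a stand-in for an instance of
# whichever class defines read_from_json (the class name is an assumption).
sample_wit_js = {
    "data": [
        {
            "text": "show me flights to Berlin",
            "entities": [
                # wit stores the intent as just another entity, with the
                # value wrapped in extra quotes that the reader strips
                {"entity": "intent", "value": "\"flight_search\""},
                {"entity": "city", "value": "\"Berlin\"",
                 "start": 19, "end": 25},
            ],
        }
    ]
}
# training_data = reader.read_from_json(sample_wit_js)
# training_data.training_examples[0].get("intent")  # -> "flight_search"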
def _parse_training_example(self, example):
    """Extract entities and synonyms, and convert to plain text."""
    from spawn_ai.training_data import Message

    entities = self._find_entities_in_training_example(example)
    plain_text = re.sub(ent_regex,
                        lambda m: m.groupdict()['entity_text'],
                        example)
    self._add_synonyms(plain_text, entities)
    message = Message(plain_text, {'intent': self.current_title})
    if len(entities) > 0:
        message.set('entities', entities)
    return message
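# Rough sketch of what a single training example looks like to
# _parse_training_example above. The exact entity syntax is defined by the
# module-level `ent_regex` (not shown in this excerpt), so the bracket
# notation below is an assumption based on its `entity_text` capture group.
#
#     example = "I want to fly to [Berlin](city)"
#     message = self._parse_training_example(example)
#     message.text               # "I want to fly to Berlin"
#     message.get("intent")      # the section title currently being parsed
#     message.get("entities")    # e.g. [{"entity": "city", "value": "Berlin", ...}]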
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any) -> 'TrainingData':
    """Loads training data stored in the LUIS.ai data format."""
    from spawn_ai.training_data import Message, TrainingData

    training_examples = []
    regex_features = []

    # Simple check to ensure we support this luis data schema version
    if not js["luis_schema_version"].startswith("2"):
        raise Exception("Invalid luis data schema version {}, "
                        "should be 2.x.x. "
                        "Make sure to use the latest luis version "
                        "(e.g. by downloading your data again)."
                        "".format(js["luis_schema_version"]))

    for r in js.get("regex_features", []):
        if r.get("activated", False):
            regex_features.append({"name": r.get("name"),
                                   "pattern": r.get("pattern")})

    for s in js["utterances"]:
        text = s.get("text")
        intent = s.get("intent")
        entities = []
        for e in s.get("entities") or []:
            start, end = e["startPos"], e["endPos"] + 1
            val = text[start:end]
            entities.append({"entity": e["entity"],
                             "value": val,
                             "start": start,
                             "end": end})

        data = {"entities": entities}
        if intent:
            data["intent"] = intent
        training_examples.append(Message(text, data))
    return TrainingData(training_examples, regex_features=regex_features)
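# Illustrative LUIS.ai export matching the fields read above (schema version,
# optional regex_features, utterances with character offsets). `reader` is a
# placeholder for an instance of the class that defines read_from_json.
sample_luis_js = {
    "luis_schema_version": "2.2.0",
    "regex_features": [
        {"name": "zipcode", "pattern": "[0-9]{5}", "activated": True}
    ],
    "utterances": [
        {
            "text": "book a flight to Berlin",
            "intent": "flight_booking",
            # LUIS uses inclusive character positions; the reader converts
            # endPos to an exclusive end index by adding 1
            "entities": [{"entity": "city", "startPos": 17, "endPos": 22}],
        }
    ],
}
# training_data = reader.read_from_json(sample_luis_js)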
def filter_trainable_entities(
        self, entity_examples: List[Message]) -> List[Message]:
    """Filters out untrainable entity annotations.

    Creates a copy of entity_examples in which entities that have
    `extractor` set to something other than self.name (e.g. 'ner_crf')
    are removed."""

    filtered = []
    for message in entity_examples:
        entities = []
        for ent in message.get("entities", []):
            extractor = ent.get("extractor")
            if not extractor or extractor == self.name:
                entities.append(ent)
        data = message.data.copy()
        data['entities'] = entities
        filtered.append(
            Message(text=message.text,
                    data=data,
                    output_properties=message.output_properties,
                    time=message.time))

    return filtered
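# Behavioural sketch (comments only; `self.name` is the extractor's own name,
# e.g. 'ner_crf'): entities annotated by a *different* extractor are dropped
# from the copied messages, while unannotated entities and the extractor's
# own annotations are kept.
#
#     message.get("entities") == [
#         {"entity": "city", "value": "Berlin"},            # kept (no extractor)
#         {"entity": "time", "value": "today",
#          "extractor": "ner_duckling"},                    # dropped unless self.name == "ner_duckling"
#     ]
#
# The input messages are not mutated: each filtered Message is built from a
# copy of message.data with a freshly filtered entity list.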