def parse(self,
          text: Text,
          time: Optional[datetime.datetime] = None,
          only_output_properties: bool = True) -> Dict[Text, Any]:
    """Parse the input text, classify it and return pipeline result.

    The pipeline result usually contains intent and entities."""

    if not text:
        # Not all components are able to handle empty strings. So we need
        # to prevent that... This default return will not contain all
        # output attributes of all components, but in the end, no one
        # should pass an empty string in the first place.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)

    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(
        message.as_dict(only_output_properties=only_output_properties))
    return output

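# A minimal usage sketch for parse() above, assuming `interpreter` is a
# trained pipeline object exposing this method (names and values here are
# illustrative, not taken from the source):
result = interpreter.parse("I am looking for a Mexican restaurant")
# result is a plain dict, e.g.:
# {"text": "I am looking for a Mexican restaurant",
#  "intent": {"name": "restaurant_search", "confidence": 0.91},
#  "entities": [{"start": 19, "end": 26,
#                "value": "Mexican", "entity": "cuisine"}]}
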
def _parse_intent_example(self, example_in_md):
    entities = []
    utter = example_in_md
    match = re.search(ent_regex, utter)
    while match is not None:
        entity_synonym = match.groupdict()['synonym']
        entity_entity = match.groupdict()['entity']
        entity_value = match.groupdict()['value']
        if match.groupdict()['value'] is None:
            entity_value = entity_synonym

        start_index = match.start()
        end_index = start_index + len(entity_synonym)

        entities.append({
            'entity': entity_entity,
            'value': entity_value,
            'start': start_index,
            'end': end_index
        })

        # replace the markdown annotation with its surface text and
        # search again on the shortened utterance
        utter = utter[:match.start()] + entity_synonym + utter[match.end():]
        match = re.search(ent_regex, utter)

    message = Message(utter, {'intent': self.current_intent})
    if len(entities) > 0:
        message.set('entities', entities)
    return message

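# A minimal sketch of what `ent_regex` above is assumed to look like: it
# must expose the named groups 'synonym', 'entity' and 'value' consumed by
# _parse_intent_example. The exact pattern in the source may differ.
import re

ent_regex = re.compile(
    r'\[(?P<synonym>[^\]]+)\]'      # [text shown in the utterance]
    r'\((?P<entity>[^:)]+?)'        # (entity_name
    r'(?::(?P<value>[^)]+))?\)'     # optionally :value)
)

# "[Mexican](cuisine)" yields synonym='Mexican', entity='cuisine',
# value=None, so the entity value falls back to the synonym text.
match = ent_regex.search("show me [Mexican](cuisine) restaurants")
assert match.groupdict() == {
    'synonym': 'Mexican', 'entity': 'cuisine', 'value': None}
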
def process(self, message: Message, **kwargs: Any) -> None:
    self._check_spacy_doc(message)

    extracted = self.add_extractor_name(self.extract_entities(message))
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)

def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("all_components")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{
                "start": 4,
                "end": 11,
                "value": "Mexican",
                "entity": "cuisine"
            }]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{
                "start": 7,
                "end": 12,
                "value": "Mexican",
                "entity": "cuisine"
            }]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"

def test_crf_extractor(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [{"start": 0, "end": 7,
                          "value": "central", "entity": "location"}],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })]
    config = {"ner_crf": {"BILOU_flag": True, "features": ext.crf_features}}
    ext.train(TrainingData(training_examples=examples), config)

    sentence = 'anywhere in the west'
    crf_format = ext._from_text_to_crf(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    assert [word[0] for word in crf_format] == ['anywhere', 'in',
                                                'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert feats[1]['0:low'] == "in"
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

def _from_json_to_crf(
        self,
        message: Message,
        entity_offsets: List[Tuple[int, int, Text]]
) -> List[Tuple[Text, Text, Text, Text]]:
    """Convert json examples to format of underlying crfsuite."""

    if self.pos_features:
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [ann[5] for ann in gold.orig_annot]
    else:
        tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

    if '-' in ents:
        logger.warning("Misaligned entity annotation in sentence '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces or punctuation)."
                       "".format(message.text))

    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes the BILOU prefix from the label
                ents[i] = self._entity_from_label(label)

    return self._from_text_to_crf(message, ents)

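# A minimal sketch (not captured from the code above) of the BILOU tags
# _bilou_tags_from_offsets is expected to return. Tags mark Begin, Inside,
# Last, Outside and Unit-length entities:
tags_single = ['O', 'O', 'O', 'U-location']
# tokens:      anywhere  in   the  west   ("west" is a one-token entity)

tags_multi = ['O', 'O', 'O', 'B-what', 'L-what',
              'B-where', 'I-where', 'L-where']
# tokens: I need a home cleaning close - by
# (for "I need a home cleaning close-by", as in the CRF tests below)
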
def process(self, message: Message, **kwargs: Any) -> None: """Return the most likely intent and its probability for a message.""" if not self.clf: # component is either not trained or didn't # receive enough training data intent = None intent_ranking = [] else: X = message.get("text_features").reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed # to work for multiple examples as well, hence we need to flatten probabilities = probabilities.flatten() if intents.size > 0 and probabilities.size > 0: ranking = list( zip(list(intents), list(probabilities)))[:INTENT_RANKING_LENGTH] intent = {"name": intents[0], "confidence": probabilities[0]} intent_ranking = [{ "name": intent_name, "confidence": score } for intent_name, score in ranking] else: intent = {"name": None, "confidence": 0.0} intent_ranking = [] message.set("intent", intent, add_to_output=True) message.set("intent_ranking", intent_ranking, add_to_output=True)
def _parse_intent_example(self, example_in_md):
    entities = []
    utter = example_in_md
    for regex in [ent_regex, ent_regex_with_value]:
        utter = re.sub(regex, r"\1", utter)  # [text](entity) -> text
        ent_matches = re.finditer(regex, example_in_md)
        for match in ent_matches:
            # groupdict() always contains every named group of the
            # pattern, so test the value rather than key presence
            if match.groupdict().get('synonym') is not None:
                entity_value_in_utter = match.groupdict()['synonym']
            else:
                entity_value_in_utter = match.groupdict()['value']

            start_index = utter.index(entity_value_in_utter)
            end_index = start_index + len(entity_value_in_utter)

            entities.append({
                'entity': match.groupdict()['entity'],
                'value': match.groupdict()['value'],
                'start': start_index,
                'end': end_index
            })

    message = Message(utter, {'intent': self.current_intent})
    if len(entities) > 0:
        message.set('entities', entities)
    return message

def parse(self, text, time=None):
    # type: (Text, Optional[datetime.datetime]) -> Dict[Text, Any]
    """Parse the input text, classify it and return pipeline result.

    The pipeline result usually contains intent and entities."""

    if not text:
        # Not all components are able to handle empty strings. So we need
        # to prevent that... This default return will not contain all
        # output attributes of all components, but in the end, no one
        # should pass an empty string in the first place.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)
    print('[DEBUG] Message Output Attributes: %s'
          % self.default_output_attributes())
    print('[DEBUG] Message Text: %s' % message.text)
    print('[DEBUG] Message Data: %s' % message.data)

    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(message.as_dict(only_output_properties=True))
    return output

def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
    features = self.features_for_tokens(message.get("tokens"),
                                        mitie_feature_extractor)
    message.set("text_features",
                self._combine_with_existing_text_features(message, features))

def _parse_training_example(self, example):
    """Extract entities and synonyms, and convert to plain text."""
    entities = self._find_entities_in_training_example(example)
    plain_text = re.sub(ent_regex,
                        lambda m: m.groupdict()['entity_text'],
                        example)
    self._add_synonyms(plain_text, entities)
    message = Message(plain_text, {'intent': self.current_title})
    if len(entities) > 0:
        message.set('entities', entities)
    return message

def process(self, message: Message, **kwargs: Any) -> None: # can't use the existing doc here (spacy_doc on the message) # because tokens are lower cased which is bad for NER spacy_nlp = kwargs.get("spacy_nlp", None) doc = spacy_nlp(message.text) extracted = self.add_extractor_name(self.extract_entities(doc)) message.set("entities", message.get("entities", []) + extracted, add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    if self.vect is None:
        logger.error("There is no trained CountVectorizer: "
                     "component is either not trained or "
                     "didn't receive enough training data")
    else:
        message_text = self._get_message_text(message)

        bag = self.vect.transform([message_text]).toarray().squeeze()
        message.set("text_features",
                    self._combine_with_existing_text_features(message, bag))

def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception("Failed to extract entities. "
                        "Missing a proper MITIE feature extractor.")

    ents = self.extract_entities(message.text,
                                 message.get("tokens"),
                                 mitie_feature_extractor)
    extracted = self.add_extractor_name(ents)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)

def process(self, message: Message, **kwargs: Any) -> None: # can't use the existing doc here (spacy_doc on the message) # because tokens are lower cased which is bad for NER spacy_nlp = kwargs.get("spacy_nlp", None) doc = spacy_nlp(message.text) all_extracted = self.add_extractor_name(self.extract_entities(doc)) dimensions = self.component_config["dimensions"] extracted = SpacyEntityExtractor.filter_irrelevant_entities( all_extracted, dimensions) message.set("entities", message.get("entities", []) + extracted, add_to_output=True)
def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'ner_spacy',
        'end': 20,
        'value': 'West',
        'entity': 'LOC'}

def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)

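# A self-contained sketch of the bag-of-words step the featurizer performs,
# using scikit-learn directly; `token_pattern` matches the one passed to
# CountVectorsFeaturizer above:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(token_pattern=r'(?u)\b\w+\b')
vect.fit(["hello goodbye hello", "goodbye"])

bag = vect.transform(["hello hello goodbye"]).toarray().squeeze()
# the learned vocabulary is sorted alphabetically: ['goodbye', 'hello'],
# so the counts come out as [1, 2]
assert list(bag) == [1, 2]
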
def neg_featurize(self, neg_train_data):
    """Use the previously trained featurizers in the pipeline to
    featurize the negative training data (a set of sentences)."""
    X_neg = []
    for example in neg_train_data:
        m = Message(example)
        self.partially_process(m)
        X_neg.append(m.get("text_features"))
    return np.array(X_neg)

def process(self, message: Message, **kwargs: Any) -> None: """Return the most likely intent and its similarity to the input""" # Classifier needs this to be non empty, so we set to first label. message.data["intent"] = self.label_list[0] predict_examples = get_test_examples([message]) predict_features = convert_examples_to_features( predict_examples, self.label_list, self.max_seq_length, self.tokenizer ) # Get first index since we are only classifying text blob at a time. example = predict_features[0] result = self.predict_fn( { "input_ids": np.array(example.input_ids).reshape( -1, self.max_seq_length ), "input_mask": np.array(example.input_mask).reshape( -1, self.max_seq_length ), "label_ids": np.array(example.label_id).reshape(-1), "segment_ids": np.array(example.segment_ids).reshape( -1, self.max_seq_length ), } ) probabilities = list(np.exp(result["probabilities"])[0]) with self.session.as_default(): index = tf.argmax(probabilities, axis=0).eval(session=tf.Session()) label = self.label_list[index] score = float(probabilities[index]) intent = {"name": label, "confidence": score} intent_ranking = sorted( [ {"name": self.label_list[i], "confidence": float(score)} for i, score in enumerate(probabilities) ], key=lambda k: k["confidence"], reverse=True, ) message.set("intent", intent, add_to_output=True) message.set("intent_ranking", intent_ranking, add_to_output=True)
def load_rasa_data(filename): # type: (Text) -> TrainingData """Loads training data stored in the rasa NLU data format.""" data = _read_json_from_file(filename) validate_rasa_nlu_data(data) common = data['rasa_nlu_data'].get("common_examples", list()) intent = data['rasa_nlu_data'].get("intent_examples", list()) entity = data['rasa_nlu_data'].get("entity_examples", list()) regex_features = data['rasa_nlu_data'].get("regex_features", list()) synonyms = data['rasa_nlu_data'].get("entity_synonyms", list()) entity_synonyms = get_entity_synonyms_dict(synonyms) if intent or entity: logger.warn("DEPRECATION warning: Data file contains 'intent_examples' " "or 'entity_examples' which will be " "removed in the future. Consider putting all your examples " "into the 'common_examples' section.") all_examples = common + intent + entity training_examples = [] for e in all_examples: data = e.copy() if "text" in data: del data["text"] training_examples.append(Message(e["text"], data)) return TrainingData(training_examples, entity_synonyms, regex_features)
def read_from_json(self, js, **kwargs):
    """Loads training data stored in the rasa NLU data format."""
    validate_rasa_nlu_data(js)

    data = js['rasa_nlu_data']
    common_examples = data.get("common_examples", [])
    intent_examples = data.get("intent_examples", [])
    entity_examples = data.get("entity_examples", [])
    entity_synonyms = data.get("entity_synonyms", [])
    regex_features = data.get("regex_features", [])
    lookup_tables = data.get("lookup_tables", [])

    entity_synonyms = transform_entity_synonyms(entity_synonyms)

    if intent_examples or entity_examples:
        logger.warning("DEPRECATION warning: your rasa data "
                       "contains 'intent_examples' "
                       "or 'entity_examples' which will be "
                       "removed in the future. Consider "
                       "putting all your examples "
                       "into the 'common_examples' section.")

    all_examples = common_examples + intent_examples + entity_examples
    training_examples = []
    for ex in all_examples:
        msg = Message.build(ex['text'], ex.get("intent"), ex.get("entities"))
        training_examples.append(msg)

    return TrainingData(training_examples, entity_synonyms,
                        regex_features, lookup_tables)

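# A sketch of the JSON layout read_from_json() expects, reconstructed from
# the keys it accesses above (the example values are illustrative):
js = {
    "rasa_nlu_data": {
        "common_examples": [
            {"text": "I want Mexican food",
             "intent": "restaurant_search",
             "entities": [{"start": 7, "end": 14,
                           "value": "Mexican", "entity": "cuisine"}]}
        ],
        "intent_examples": [],   # deprecated, see the warning above
        "entity_examples": [],   # deprecated, see the warning above
        "entity_synonyms": [],
        "regex_features": [],
        "lookup_tables": []
    }
}
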
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""
    training_examples = []
    data = _read_json_from_file(filename)
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        # use a fresh name so the json `data` dict is not shadowed
        example_data = {}
        if intent:
            example_data["intent"] = intent
        if entities is not None:
            example_data["entities"] = entities
        training_examples.append(Message(text, example_data))
    return TrainingData(training_examples)

def load_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""
    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        # use a fresh name so the json `data` dict is not shadowed
        example_data = e.copy()
        example_data.pop("text", None)
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)

def filter_trainable_entities(
        self,
        entity_examples: List[Message]) -> List[Message]:
    """Filters out untrainable entity annotations.

    Creates a copy of entity_examples in which entities that have
    `extractor` set to something other than self.name
    (e.g. 'CRFEntityExtractor') are removed."""

    filtered = []
    for message in entity_examples:
        entities = []
        for ent in message.get("entities", []):
            extractor = ent.get("extractor")
            if not extractor or extractor == self.name:
                entities.append(ent)
        data = message.data.copy()
        data['entities'] = entities
        filtered.append(
            Message(text=message.text,
                    data=data,
                    output_properties=message.output_properties,
                    time=message.time))

    return filtered

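# Usage sketch for filter_trainable_entities() above; `extractor_component`
# is assumed to be an extractor instance whose self.name == 'ner_crf'
# (names and values are illustrative):
msg = Message("book a flight to Berlin tomorrow", {
    "entities": [
        {"value": "Berlin", "entity": "city", "extractor": "ner_crf"},
        {"value": "tomorrow", "entity": "time",
         "extractor": "ner_duckling"},
    ]})
filtered = extractor_component.filter_trainable_entities([msg])
# filtered[0].get("entities") now holds only the 'ner_crf' entity; the
# 'ner_duckling' annotation was dropped as untrainable for this component.
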
def load_train_data(data):
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning(
            "DEPRECATION warning: Data file contains 'intent_examples' "
            "or 'entity_examples' which will be removed in the future. "
            "Consider putting all your examples into the "
            "'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        # use a fresh name so the `data` argument is not shadowed
        example_data = {}
        if e.get("intent"):
            example_data["intent"] = e["intent"]
        if e.get("entities") is not None:
            example_data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], example_data))

    return TrainingData(training_examples, entity_synonyms, regex_features)

def load_rasa_data(filename): # type: (Text) -> TrainingData """Loads training data stored in the rasa NLU data format.""" with io.open(filename, encoding="utf-8-sig") as f: data = json.loads(f.read()) validate_rasa_nlu_data(data) common = data['rasa_nlu_data'].get("common_examples", list()) intent = data['rasa_nlu_data'].get("intent_examples", list()) entity = data['rasa_nlu_data'].get("entity_examples", list()) regex_features = data['rasa_nlu_data'].get("regex_features", list()) synonyms = data['rasa_nlu_data'].get("entity_synonyms", list()) entity_synonyms = get_entity_synonyms_dict(synonyms) if intent or entity: logger.warn( "DEPRECATION warning: Data file contains 'intent_examples' or 'entity_examples' which will be " + "removed in the future. Consider putting all your examples into the 'common_examples' section." ) all_examples = common + intent + entity training_examples = [] for e in all_examples: data = {} if e.get("intent"): data["intent"] = e["intent"] if e.get("entities") is not None: data["entities"] = e["entities"] training_examples.append(Message(e["text"], data)) return TrainingData(training_examples, entity_synonyms, regex_features)
def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
    """Loads training data stored in the WIT.ai data format."""
    from rasa_nlu.training_data import Message, TrainingData

    training_examples = []

    for s in js["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entities
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))

    return TrainingData(training_examples)

def test_crf_json_from_BILOU(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    ext.BILOU_flag = True
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(Message(sentence, doc),
                              [{'O': 1.0},
                               {'O': 1.0},
                               {'O': 1.0},
                               {'B-what': 1.0},
                               {'L-what': 1.0},
                               {'B-where': 1.0},
                               {'I-where': 1.0},
                               {'L-where': 1.0}])
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {'start': 9, 'end': 22,
                    'value': 'home cleaning', 'entity': 'what'}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {'start': 23, 'end': 31,
                    'value': 'close-by', 'entity': 'where'}

def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    ext.BILOU_flag = False
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(Message(sentence, doc),
                               [{'O': 1.0},
                                {'O': 1.0},
                                {'O': 1.0},
                                {'what': 1.0},
                                {'what': 1.0},
                                {'where': 1.0},
                                {'where': 1.0},
                                {'where': 1.0}])

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r['confidence']  # confidence should exist
        del r['confidence']

    assert rs[0] == {'start': 9, 'end': 13,
                     'value': 'home', 'entity': 'what'}
    assert rs[1] == {'start': 14, 'end': 22,
                     'value': 'cleaning', 'entity': 'what'}
    assert rs[2] == {'start': 23, 'end': 28,
                     'value': 'close', 'entity': 'where'}
    assert rs[3] == {'start': 28, 'end': 29,
                     'value': '-', 'entity': 'where'}
    assert rs[4] == {'start': 29, 'end': 31,
                     'value': 'by', 'entity': 'where'}