def process(self, message: Message, **kwargs: Any) -> None:
    """Extract entities from the message and append them to its entity list."""
    self._check_spacy_doc(message)
    new_entities = self.add_extractor_name(self.extract_entities(message))
    existing = message.get("entities", [])
    message.set("entities", existing + new_entities, add_to_output=True)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    known = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer(known_patterns=known)

    # tokenize the message so the featurizer can annotate per-token matches
    msg = Message(sentence)
    msg.set("spacy_doc", spacy_nlp(sentence))
    SpacyTokenizer().process(msg)

    features = featurizer.features_for_patterns(msg)
    assert np.allclose(features, expected, atol=1e-10)

    # tokenization must have produced at least one token
    tokens = msg.get("tokens", [])
    assert len(tokens) > 0
    for idx, tok in enumerate(tokens):
        if idx in labeled_tokens:
            assert tok.get("pattern") in [0, 1]
        else:
            # tokens outside any regex match carry no pattern annotation
            assert tok.get("pattern") is None
def process(self, message: Message, **kwargs: Any) -> None:
    """Return the most likely intent and its probability for a message."""
    if not self.clf:
        # Untrained, or trained on too little data: emit an empty result.
        intent = None
        intent_ranking = []
    else:
        feature_vector = message.get("text_features").reshape(1, -1)
        label_ids, probabilities = self.predict(feature_vector)
        labels = self.transform_labels_num2str(np.ravel(label_ids))
        # `predict` works on batches of examples, so the single-row
        # probability matrix needs to be flattened
        probabilities = probabilities.flatten()

        if labels.size > 0 and probabilities.size > 0:
            ranking = list(zip(list(labels),
                               list(probabilities)))[:INTENT_RANKING_LENGTH]
            intent = {"name": labels[0], "confidence": probabilities[0]}
            intent_ranking = [{"name": name, "confidence": score}
                              for name, score in ranking]
        else:
            intent = {"name": None, "confidence": 0.0}
            intent_ranking = []

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)
def _parse_intent_example(self, example_in_md):
    """Parse one markdown intent example into a Message.

    Strips the ``[text](entity)`` style markup from the example, records the
    entities it mentions with their character spans, and returns a Message
    carrying the plain utterance tagged with the current intent.
    """
    entities = []
    utter = example_in_md
    for regex in [ent_regex, ent_regex_with_value]:
        utter = re.sub(regex, r"\1", utter)  # [text](entity) -> text
        # iterate matches directly; the previous `enumerate` bound an unused
        # camelCase `matchNum`
        for match in re.finditer(regex, example_in_md):
            groups = match.groupdict()
            if 'synonym' in groups:
                entity_value_in_utter = groups['synonym']
            else:
                entity_value_in_utter = groups['value']
            # NOTE(review): `.index` finds only the FIRST occurrence, so a
            # repeated entity value maps every annotation to the same span —
            # confirm whether duplicated entity text can occur in examples.
            start_index = utter.index(entity_value_in_utter)
            end_index = start_index + len(entity_value_in_utter)
            entities.append({
                'entity': groups['entity'],
                'value': groups['value'],
                'start': start_index,
                'end': end_index
            })

    message = Message(utter, {'intent': self.current_intent})
    if len(entities) > 0:
        message.set('entities', entities)
    return message
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    known = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer(known_patterns=known)

    # tokenize the message so per-token pattern annotations can be added
    msg = Message(sentence)
    msg.set("spacy_doc", spacy_nlp(sentence))
    SpacyTokenizer().process(msg)

    features = featurizer.features_for_patterns(msg)
    assert np.allclose(features, expected, atol=1e-10)

    # tokenization must have produced at least one token
    tokens = msg.get("tokens", [])
    assert len(tokens) > 0

    # each token must carry as many pattern hits as the expectation lists
    for idx, tok in enumerate(tokens):
        hits = sum(tok.get("pattern").values())
        assert hits == labeled_tokens.count(idx)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    tables = [
        {"name": 'drinks',
         "elements": ["mojito", "lemonade", "sweet berry wine",
                      "tea", "club?mate"]},
        {"name": 'plates', "elements": "data/test/lookup_tables/plates.txt"},
    ]
    featurizer = RegexFeaturizer(lookup_tables=tables)

    # tokenize the message so per-token pattern annotations can be added
    msg = Message(sentence)
    msg.set("spacy_doc", spacy_nlp(sentence))
    SpacyTokenizer().process(msg)

    features = featurizer.features_for_patterns(msg)
    assert np.allclose(features, expected, atol=1e-10)

    # tokenization must have produced at least one token
    tokens = msg.get("tokens", [])
    assert len(tokens) > 0

    # each token must carry as many lookup hits as the expectation lists
    for idx, tok in enumerate(tokens):
        hits = sum(tok.get("pattern").values())
        assert hits == labeled_tokens.count(idx)
def _parse_training_example(self, example):
    """Extract entities and synonyms, and convert to plain text."""
    entities = self._find_entities_in_training_example(example)
    # replace each annotated span with just its surface text
    plain_text = re.sub(
        ent_regex, lambda m: m.groupdict()['entity_text'], example)
    self._add_synonyms(plain_text, entities)

    msg = Message(plain_text, {'intent': self.current_title})
    if entities:
        msg.set('entities', entities)
    return msg
def process(self, message: Message, **kwargs: Any) -> None:
    """Extract entities using the MITIE feature extractor supplied at runtime.

    Raises:
        Exception: if no MITIE feature extractor was passed via ``kwargs``.
    """
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        # The previous message blamed training of 'MitieFeaturizer', which is
        # misleading here: this is a processing failure of entity extraction.
        raise Exception("Failed to extract entities. "
                        "Missing a proper MITIE feature extractor.")

    ents = self.extract_entities(message.text,
                                 message.get("tokens"),
                                 mitie_feature_extractor)
    extracted = self.add_extractor_name(ents)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # a training example needs an intent label to be valid
    msg = Message(sentence)
    msg.set("intent", "bla")

    featurizer.train(TrainingData([msg]))
    featurizer.process(msg)

    assert np.all(msg.get("text_features")[0] == expected)
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # a training example needs an intent label to be valid
    train_msg = Message(sentence)
    train_msg.set("intent", "bla")
    featurizer.train(TrainingData([train_msg]))

    # featurize a fresh message that carries no training annotations
    test_msg = Message(sentence)
    featurizer.process(test_msg)

    assert np.all(test_msg.get("text_features") == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                         "OOV_token": '__oov__'})

    # a training example needs an intent label to be valid
    train_msg = Message(sentence)
    train_msg.set("intent", "bla")
    featurizer.train(TrainingData([train_msg]))

    # featurize a fresh message that carries no training annotations
    test_msg = Message(sentence)
    featurizer.process(test_msg)

    assert np.all(test_msg.get("text_features") == expected)
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"min_ngram": 1,
                                         "max_ngram": 2,
                                         "analyzer": 'char'})

    # a training example needs an intent label to be valid
    train_msg = Message(sentence)
    train_msg.set("intent", "bla")
    featurizer.train(TrainingData([train_msg]))

    # featurize a fresh message that carries no training annotations
    test_msg = Message(sentence)
    featurizer.process(test_msg)

    assert np.all(test_msg.get("text_features") == expected)
def process(self, message: Message, **kwargs: Any) -> None:
    """Classify the message's intent with the trained MITIE classifier.

    Raises:
        Exception: if no MITIE feature extractor was passed via ``kwargs``.
    """
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        # The previous message blamed training of 'MitieFeaturizer', which is
        # misleading here: this is a processing failure of intent classification.
        raise Exception("Failed to classify intent. "
                        "Missing a proper MITIE feature extractor.")

    if self.clf:
        token_strs = self._tokens_of_message(message)
        intent, confidence = self.clf(token_strs, mitie_feature_extractor)
    else:
        # either the model didn't get trained or it wasn't
        # provided with any data
        intent = None
        confidence = 0.0

    message.set("intent",
                {"name": intent, "confidence": confidence},
                add_to_output=True)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # using empty string instead of real text string to make sure
    # count vector only can come from `tokens` feature.
    # using `message.text` can not get correct result
    token_objs = [Token(text, 0) for text in tokens]

    train_msg = Message("")
    train_msg.set("tokens", token_objs)
    train_msg.set("intent", "bla")  # a training example needs an intent

    featurizer.train(TrainingData([train_msg]))

    test_msg = Message("")
    test_msg.set("tokens", token_objs)
    featurizer.process(test_msg)

    assert np.all(test_msg.get("text_features") == expected)
def process(self, message: Message, **kwargs: Any) -> None:
    """Tokenize the message from its previously attached spaCy doc."""
    doc = message.get("spacy_doc")
    message.set("tokens", self.tokenize(doc))
def process(self, message: Message, **kwargs: Any) -> None:
    """Map entity values to their canonical synonyms on the message."""
    # work on a shallow copy so the original entity list is not mutated
    entities = list(message.get("entities", []))
    self.replace_synonyms(entities)
    message.set("entities", entities, add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    """Store the message's text features augmented with regex matches."""
    features = self._text_features_with_regex(message)
    message.set("text_features", features)
def process(self, message: Message, **kwargs: Any) -> None:
    """Attach a spaCy doc built from the raw message text."""
    doc = self.doc_for_text(message.text)
    message.set("spacy_doc", doc)
def process(self, message: Message, **kwargs: Any) -> None:
    """Run entity extraction and append results to existing entities."""
    found = self.add_extractor_name(self.extract_entities(message))
    message.set("entities",
                message.get("entities", []) + found,
                add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    """Tokenize the raw message text and store the tokens."""
    tokens = self.tokenize(message.text)
    message.set("tokens", tokens)
def process(self, message: Message, **kwargs: Any) -> None:
    """Set the parsed intent on the message with full confidence."""
    matched = self.parse(message.text)
    message.set("intent",
                {"name": matched, "confidence": 1.0},
                add_to_output=True)
def process(self, message: Message, **kwargs: Any) -> None:
    """Lemmatize the message text and attach a spaCy doc built from it.

    NOTE: the message's ``text`` attribute is mutated in place before the
    doc is created.
    """
    spacy_nlp = kwargs.get("spacy_nlp", None)
    # Apply lemmatization once: the previous nested double call
    # (lemmatize(lemmatize(...))) was redundant — assumes lemmatize is
    # idempotent on already-lemmatized text; confirm with its implementation.
    message.text = lemmatize(message.text)
    message.set("spacy_doc", self.doc_for_text(message.text, spacy_nlp))
def process(self, message: Message, **kwargs: Any):
    """Store the message's text features augmented with ngram features."""
    features = self._text_features_with_ngrams(message, self.best_num_ngrams)
    message.set("text_features", features)