def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {"name": 'drinks',
         "elements": ["mojito", "lemonade", "sweet berry wine",
                      "tea", "club?mate"]},
        {"name": 'plates',
         "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    for i, token in enumerate(message.get("tokens")):
        if i in labeled_tokens:
            assert token.get("pattern") in [0, 1]
        else:
            # if the token is not part of a regex the pattern should not be set
            assert token.get("pattern") is None
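# The tests above take `sentence`, `expected` and `labeled_tokens` from
# pytest parametrization. A minimal sketch of such a harness is shown here;
# the sentences, feature vectors and token indices below are illustrative
# assumptions, not values from the real test suite.
import pytest


@pytest.mark.parametrize(
    "sentence, expected, labeled_tokens",
    [
        # hypothetical: "123" matches the `number` pattern on token index 2
        ("give me 123", [1.0, 0.0], [2]),
        # hypothetical: "hey" matches the `hello` pattern on token index 0
        ("hey there", [0.0, 1.0], [0]),
    ])
def test_regex_featurizer_parametrized(sentence, expected, labeled_tokens,
                                       spacy_nlp):
    ...  # same body as the tests above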
def _from_json_to_crf(self,
                      message: Message,
                      entity_offsets: List[Tuple[int, int, Text]]
                      ) -> List[Tuple[Text, Text, Text, Text]]:
    """Convert json examples to format of underlying crfsuite."""

    if self.pos_features:
        from spacy.gold import GoldParse

        doc = message.get("spacy_doc")
        gold = GoldParse(doc, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
    else:
        tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

    if '-' in ents:
        logger.warning("Misaligned entity annotation in sentence '{}'. "
                       "Make sure the start and end values of the "
                       "annotated training examples end at token "
                       "boundaries (e.g. don't include trailing "
                       "whitespaces or punctuation)."
                       "".format(message.text))

    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)

    return self._from_text_to_crf(message, ents)
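# In the non-spaCy branch above, `_bilou_tags_from_offsets` turns
# character-offset annotations into per-token BILOU tags (B-egin, I-nside,
# L-ast, U-nit, O-utside); tokens that only partially overlap an entity
# yield '-', which triggers the misalignment warning. A minimal standalone
# sketch of that mapping (not the actual implementation), assuming tokens
# expose `offset` and `end` attributes as rasa_nlu's Token does:

def bilou_tags_from_offsets(tokens, entity_offsets):
    tags = []
    for token in tokens:
        tag = "O"
        for start, end, label in entity_offsets:
            if token.offset == start and token.end == end:
                tag = "U-" + label  # single-token entity
            elif token.offset == start and token.end < end:
                tag = "B-" + label  # entity starts at this token
            elif token.offset > start and token.end == end:
                tag = "L-" + label  # entity ends at this token
            elif token.offset > start and token.end < end:
                tag = "I-" + label  # token strictly inside the entity
            elif not (token.end <= start or token.offset >= end):
                tag = "-"  # partial overlap: misaligned annotation
        tags.append(tag)
    return tags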
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception("Failed to train 'MitieFeaturizer'. "
                        "Missing a proper MITIE feature extractor.")

    ents = self.extract_entities(message.text, message.get("tokens"),
                                 mitie_feature_extractor)
    extracted = self.add_extractor_name(ents)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0),
                                             _config)
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'SpacyEntityExtractor',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None}

    # Test dimension filtering includes only specified dimensions
    example = Message("anywhere in the West with Sebastian Thrun", {
        "intent": "example_intent",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")})
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0),
                                             _config)

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None}
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def process(self, message: Message, **kwargs: Any) -> None:
    self._check_spacy_doc(message)

    extracted = self.add_extractor_name(self.extract_entities(message))
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)

    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

    # Test duckling with a defined date
    # 1381536182000 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182000")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
def process(self, message: Message, **kwargs: Any) -> None:
    """Return the most likely intent and its probability for a message."""

    if not self.clf:
        # component is either not trained or didn't
        # receive enough training data
        intent = None
        intent_ranking = []
    else:
        X = message.get("text_features").reshape(1, -1)
        intent_ids, probabilities = self.predict(X)
        intents = self.transform_labels_num2str(np.ravel(intent_ids))
        # `predict` returns a matrix as it is supposed
        # to work for multiple examples as well, hence we need to flatten
        probabilities = probabilities.flatten()

        if intents.size > 0 and probabilities.size > 0:
            ranking = list(zip(list(intents),
                               list(probabilities)))[:INTENT_RANKING_LENGTH]

            intent = {"name": intents[0], "confidence": probabilities[0]}

            intent_ranking = [{"name": intent_name, "confidence": score}
                              for intent_name, score in ranking]
        else:
            intent = {"name": None, "confidence": 0.0}
            intent_ranking = []

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)
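# A sketch of what the classifier leaves on the message after `process`
# (the intent names and scores below are made-up illustrations):
#
#   message.get("intent")
#   # {"name": "greet", "confidence": 0.87}
#
#   message.get("intent_ranking")
#   # [{"name": "greet", "confidence": 0.87},
#   #  {"name": "goodbye", "confidence": 0.09},
#   #  ...]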
def _from_text_to_crf(self,
                      message: Message,
                      entities: List[Text] = None
                      ) -> List[Tuple[Text, Text, Text, Text]]:
    """Takes a sentence and switches it to crfsuite format."""

    crf_format = []
    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")
    for i, token in enumerate(tokens):
        pattern = self.__pattern_of_token(message, i)
        entity = entities[i] if entities else "N/A"
        tag = self.__tag_of_token(token) if self.pos_features else None
        crf_format.append((token.text, tag, entity, pattern))
    return crf_format
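# For illustration, with pos_features off and BILOU entity tags supplied,
# the returned crfsuite format is a list of (text, tag, entity, pattern)
# tuples, e.g. (sentence and pattern dict are made up):
#
#   [("show", None, "O", None),
#    ("me", None, "O", None),
#    ("chinese", None, "U-cuisine", {"pattern_cuisine": 1.0}),
#    ("restaurants", None, "O", None)]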
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
    features = self.features_for_tokens(message.get("tokens"),
                                        mitie_feature_extractor)
    message.set("text_features",
                self._combine_with_existing_text_features(message, features))
def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'ner_spacy',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None}
def process(self, message: Message, **kwargs: Any) -> None:
    # can't use the existing doc here (spacy_doc on the message)
    # because tokens are lower cased which is bad for NER
    spacy_nlp = kwargs.get("spacy_nlp", None)
    doc = spacy_nlp(message.text)
    extracted = self.add_extractor_name(self.extract_entities(doc))
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def test_duckling_entity_extractor(component_builder):
    _config = RasaNLUModelConfig({"pipeline": [{"name": "ner_duckling"}]})
    _config.set_component_attr("ner_duckling", dimensions=["time"])
    duckling = component_builder.create_component("ner_duckling", _config)

    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

    # Test duckling with a defined date
    # 1381536182000 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182000")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
def _from_crf_to_json(self,
                      message: Message,
                      entities: List[Any]) -> List[Dict[Text, Any]]:

    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")

    if len(tokens) != len(entities):
        raise Exception('Inconsistency in amount of tokens '
                        'between crfsuite and message')

    if self.component_config["BILOU_flag"]:
        return self._convert_bilou_tagging_to_entity_result(
            tokens, entities)
    else:
        # not using BILOU tagging scheme, multi-word entities are split.
        return self._convert_simple_tagging_to_entity_result(
            tokens, entities)
def test_cabocha_comps(text, nlp, nlp_doc):
    ext = CabochaTokenizer()
    example = Message(text, {
        "intent": "wish",
        "entities": [],
        "cabocha_doc": nlp_doc})

    # tokenizer
    ext.process(example, cabocha=nlp)
    for token in example.get("tokens"):
        print(token.text, token.offset)

    # entity extractor
    ext = CabochaEntityExtractor()
    ext.process(example, cabocha=nlp)
    print("total entities", len(example.get("entities", [])))
    for ent in example.get("entities"):
        print(ent)
def process(self, message: Message, **kwargs: Any) -> None:
    # can't use the existing doc here (spacy_doc on the message)
    # because tokens are lower cased which is bad for NER
    spacy_nlp = kwargs.get("spacy_nlp", None)
    doc = spacy_nlp(message.text)
    all_extracted = self.add_extractor_name(self.extract_entities(doc))
    dimensions = self.component_config["dimensions"]
    extracted = SpacyEntityExtractor.filter_irrelevant_entities(
        all_extracted, dimensions)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
def neg_featurize(self, neg_train_data):
    """Use the previously trained featurizers in the pipeline to
    featurize the negative training data (a set of sentences)."""
    X_neg = []
    for example in neg_train_data:
        m = Message(example)
        self.partially_process(m)
        # print("message: {}; intent: {}".format(example, m.get("intent")))
        X_neg.append(m.get("text_features"))
    X_neg = np.array(X_neg)
    return X_neg
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def process(self, message: Message, **kwargs: Any) -> None:
    if self._url() is not None:
        reference_time = self._reference_time_from_message(message)
        matches = self._duckling_parse(message.text, reference_time)
        dimensions = self.component_config["dimensions"]
        relevant_matches = filter_irrelevant_matches(matches, dimensions)
        extracted = convert_duckling_format_to_rasa(relevant_matches)
    else:
        extracted = []
        logger.warning("Duckling HTTP component in pipeline, but no "
                       "`url` configuration in the config "
                       "file nor is `RASA_DUCKLING_HTTP_URL` "
                       "set as an environment variable.")

    extracted = self.add_extractor_name(extracted)
    message.set("entities",
                message.get("entities", []) + extracted,
                add_to_output=True)
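# The `url` checked above normally comes from the component configuration;
# a minimal pipeline sketch (YAML, values illustrative):
#
#   pipeline:
#   - name: "DucklingHTTPExtractor"
#     url: "http://localhost:8000"
#     dimensions: ["time", "number"]
#     timezone: "UTC"
#
# As the warning says, the RASA_DUCKLING_HTTP_URL environment variable can
# supply the URL instead.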
def process(self, message: Message, **kwargs: Any) -> None:
    updated_entities = message.get("entities", [])[:]
    self.replace_synonyms(updated_entities)
    message.set("entities", updated_entities, add_to_output=True)
def test_duckling_entity_extractor(component_builder):
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Today","start":0,"value":{"values":[{
             "value":"2018-11-13T00:00:00.000-08:00","grain":"day",
             "type":"value"}],"value":"2018-11-13T00:00:00.000-08:00",
             "grain":"day","type":"value"},"end":5,
             "dim":"time","latent":false},{"body":"the 5th","start":9,
             "value":{"values":[{
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-01-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-02-05T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":16,"dim":"time",
             "latent":false},{"body":"5th of May","start":13,"value":{
             "values":[{
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2020-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2021-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"}],
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},"end":23,"dim":"time",
             "latent":false},{"body":"tomorrow","start":37,"value":{
             "values":[{
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":45,"dim":"time",
             "latent":false}]""")
    httpretty.enable()

    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]})
    _config.set_component_attr(0, dimensions=["time"], timezone="UTC",
                               url="http://localhost:8000")
    duckling = component_builder.create_component(_config.for_component(0),
                                                  _config)

    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 4

    # Test duckling with a defined date
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"tomorrow","start":12,"value":{"values":[{
             "value":"2013-10-13T00:00:00.000Z","grain":"day",
             "type":"value"}],"value":"2013-10-13T00:00:00.000Z",
             "grain":"day","type":"value"},"end":20,
             "dim":"time","latent":false}]""")

    # 1381536182 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

    # Test dimension filtering includes only specified dimensions
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]})
    _config.set_component_attr(0, dimensions=["number"],
                               url="http://localhost:8000")
    duckling_number = component_builder.create_component(
        _config.for_component(0), _config)
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Yesterday","start":0,"value":{"values":[{
             "value":"2019-02-28T00:00:00.000+01:00","grain":"day",
             "type":"value"}],"value":"2019-02-28T00:00:00.000+01:00",
             "grain":"day","type":"value"},"end":9,"dim":"time"},
             {"body":"5","start":21,"value":{"value":5,"type":"value"},
             "end":22,"dim":"number"}]""")

    message = Message("Yesterday there were 5 people in a room")
    duckling_number.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "5"
    assert entities[0]["value"] == 5
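# The httpretty stubs above fake the Duckling HTTP API, so no server is
# needed for the test. For manual experiments, a local Duckling server can
# be started with the official Docker image (assuming Docker is installed):
#
#   docker run -p 8000:8000 rasa/duckling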
def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
    def convert_entity(entity):
        return entity["start"], entity["end"], entity["entity"]

    return [convert_entity(ent) for ent in example.get("entities", [])]
def test_duckling_entity_extractor(component_builder):
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Today","start":0,"value":{"values":[{
             "value":"2018-11-13T00:00:00.000-08:00","grain":"day",
             "type":"value"}],"value":"2018-11-13T00:00:00.000-08:00",
             "grain":"day","type":"value"},"end":5,
             "dim":"time","latent":false},{"body":"the 5th","start":9,
             "value":{"values":[{
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-01-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-02-05T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":16,"dim":"time",
             "latent":false},{"body":"5th of May","start":13,"value":{
             "values":[{
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2020-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2021-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"}],
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},"end":23,"dim":"time",
             "latent":false},{"body":"tomorrow","start":37,"value":{
             "values":[{
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":45,"dim":"time",
             "latent":false}]""")
    httpretty.enable()

    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "ner_duckling_http"}]})
    _config.set_component_attr("ner_duckling_http", dimensions=["time"],
                               timezone="UTC", url="http://localhost:8000")
    duckling = component_builder.create_component("ner_duckling_http",
                                                  _config)

    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 4

    # Test duckling with a defined date
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"tomorrow","start":12,"value":{"values":[{
             "value":"2013-10-13T00:00:00.000Z","grain":"day",
             "type":"value"}],"value":"2013-10-13T00:00:00.000Z",
             "grain":"day","type":"value"},"end":20,
             "dim":"time","latent":false}]""")

    # 1381536182 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
def process(self, message: Message, **kwargs: Any) -> None:
    message.set("tokens", self.tokenize(message.get("spacy_doc")))