def process(self, message: Message, **kwargs: Any) -> None:
    entities = self.extract_entities(message)
    entities = self.add_extractor_name(entities)
    message.set(
        ENTITIES, message.get(ENTITIES, []) + entities, add_to_output=True
    )

def _tokens_of_message(message: Message) -> List[Text]:
    return [token.text for token in message.get(TOKENS_NAMES[TEXT], [])]

def process(self, message: Message, **kwargs: Any) -> None:
    for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
        if message.get(attribute):
            message.set(
                SPACY_DOCS[attribute], self.doc_for_text(message.get(attribute))
            )

def process(self, message: Message, **kwargs: Any) -> None:
    updated_entities = message.get(ENTITIES, [])[:]
    self.replace_synonyms(updated_entities)
    message.set(ENTITIES, updated_entities, add_to_output=True)

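# The snippet above relies on `replace_synonyms`, which is not shown. A minimal,
# self-contained sketch of the technique (the helper signature and the mapping
# format are assumptions for illustration, not the library's actual implementation):
from typing import Any, Dict, List

def replace_synonyms_sketch(
    entities: List[Dict[str, Any]], synonyms: Dict[str, str]
) -> None:
    """Rewrite each entity's value in place if a synonym mapping exists."""
    for entity in entities:
        value = str(entity.get("value", ""))
        canonical = synonyms.get(value.lower())
        if canonical is not None:
            entity["value"] = canonical

# Example: "NYC" collapses to the canonical value "New York".
example_entities = [{"entity": "city", "value": "NYC", "start": 10, "end": 13}]
replace_synonyms_sketch(example_entities, {"nyc": "New York"})
assert example_entities[0]["value"] == "New York"
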
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)
    words = word_tokenize(text)
    # words = ViTokenizer.tokenize(text).split(' ')
    return self._convert_words_to_tokens(words, text)

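# `_convert_words_to_tokens` is not shown above. A minimal sketch (an assumption,
# not the component's actual code) of how word strings can be aligned back to
# character offsets in the original text, which is what span-based tokens need.
# Note that this naive lookup fails if the tokenizer rewrites characters
# (e.g. quote normalization).
from dataclasses import dataclass
from typing import List

@dataclass
class SimpleToken:
    text: str
    start: int
    end: int

def convert_words_to_tokens_sketch(words: List[str], text: str) -> List[SimpleToken]:
    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        running_offset = word_offset + len(word)
        tokens.append(SimpleToken(word, word_offset, running_offset))
    return tokens

# e.g. convert_words_to_tokens_sketch(["hello", "world"], "hello world")
# -> [SimpleToken("hello", 0, 5), SimpleToken("world", 6, 11)]
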
def get_doc(self, message: Message, attribute: Text) -> Any:
    return message.get(SPACY_DOCS[attribute])

def compute_similarity_for_pair(self, a: Message, b: Message):
    features_a = a.get("sentence_features").vector
    features_b = b.get("sentence_features").vector
    return self.compute_similarity_score(features_a, features_b)

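# `compute_similarity_score` is not shown above. A minimal sketch, assuming cosine
# similarity between the two sentence-level feature vectors (the actual metric used
# by the component is not confirmed by this listing):
import numpy as np

def compute_similarity_score_sketch(
    features_a: np.ndarray, features_b: np.ndarray
) -> float:
    a = features_a.ravel()
    b = features_b.ravel()
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)

# e.g. compute_similarity_score_sketch(np.array([1.0, 0.0]), np.array([1.0, 0.0])) == 1.0
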
def _get_doc(self, message: Message, attribute: Text) -> Optional["Doc"]:
    return message.get(SPACY_DOCS[attribute])

def _unpack(self, message: Message, domain: Domain) -> Message:
    """Unpacks the message if `TEXT` contains an encoding of attributes.

    Args:
        message: some message
        domain: the domain

    Returns:
        the given message if that message does not need to be unpacked, and a new
        message with the extracted attributes otherwise
    """
    user_text = message.get(TEXT).strip()

    # If the prefix doesn't match, we don't even need to try to match the pattern.
    if not user_text.startswith(self._prefix):
        return message

    # Try to match the pattern.
    match = self._pattern.match(user_text)

    # If it doesn't match, then (potentially) something went wrong, because the
    # message text did start with the special prefix -- however, a user might
    # just have decided to start their text this way.
    if not match:
        logger.warning(f"Failed to parse intent and entities from '{user_text}'.")
        return message

    # Extract attributes from the match - and validate them via the domain.
    intent_name = self._parse_intent_name(match, domain)
    confidence = self._parse_optional_confidences(match)
    entities = self._parse_optional_entities(match, domain)

    # The intent name is *not* optional, but during parsing we might find out
    # that the given intent is unknown (and warn). In this case, stop here.
    if intent_name is None:
        return message

    if match.group("rest"):
        rasa.shared.utils.io.raise_warning(
            f"Failed to parse arguments in line '{match.string}'. "
            f"Failed to interpret some parts. "
            f"Continuing without {match.group('rest')}. ",
            docs=DOCS_URL_STORIES,
        )

    # Add the results to the message.
    intent_data = {
        INTENT_NAME_KEY: intent_name,
        PREDICTED_CONFIDENCE_KEY: confidence,
    }
    intent_ranking = [
        {INTENT_NAME_KEY: intent_name, PREDICTED_CONFIDENCE_KEY: confidence}
    ]

    message_data = {}
    message_data[TEXT] = user_text
    message_data[INTENT] = intent_data
    message_data[INTENT_RANKING_KEY] = intent_ranking
    message_data[ENTITIES] = entities

    return Message(message_data, output_properties=set(message_data.keys()))

def process(self, message: Message, **kwargs: Any) -> None:
    extracted = self._match_entities(message)
    message.set(
        "entities", message.get("entities", []) + extracted, add_to_output=True
    )

def test_base_examples(example):
    message = Message({TEXT: example["text"]})
    tok = BlankSpacyTokenizer(component_config={"lang": example["lang"]})
    tok.process(message)
    tokens = message.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in tokens] == example["result"]

def test_classification(self, trained_classifier, message, intent):
    text = Message(data={TEXT: message})
    trained_classifier.process(text)
    assert text.get("intent").get("name", "NOT_CLASSIFIED") == intent

def process(self, message: Message, **kwargs: Any) -> None:
    if message.get(TEXT) is not None:
        message.set(TEXT, message.get(TEXT).title())

async def test_adjusting_layers_incremental_training(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    train_and_preprocess: Callable[..., Tuple[TrainingData, List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
    feature sizes during incremental training.

    Testing is done by checking the layer sizes. Checking if they were replaced
    correctly is also important and is done in `test_replace_dense_for_sparse_layers`
    in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {"component": WhitespaceTokenizer},
        {"component": LexicalSyntacticFeaturizer},
        {"component": RegexFeaturizer},
        {"component": CountVectorsFeaturizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(pipeline, iter1_data_path)

    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    old_data_signature = response_selector.model.data_signature
    old_predict_data_signature = response_selector.model.predict_data_signature

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)
    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]
    old_sparse_feature_sizes = classified_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    initial_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    loaded_selector = load_response_selector({EPOCHS: 1})
    classified_message2 = loaded_selector.process([message2])[0]
    assert classified_message2.fingerprint() == classified_message.fingerprint()

    training_data2, loaded_pipeline2 = train_and_preprocess(pipeline, iter2_data_path)
    response_selector.train(training_data=training_data2)

    new_message = Message.build(text="Rasa is great!")
    new_message = process_message(loaded_pipeline2, new_message)

    classified_new_message = response_selector.process([new_message])[0]
    new_sparse_feature_sizes = classified_new_message.get_sparse_feature_sizes(
        attribute=TEXT
    )

    final_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"
    ]._tf_layers["feature_combining"]
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"
    ]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"
    ]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE]
    )
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE]
    )

    # Check if the data signatures were correctly updated.
    new_data_signature = response_selector.model.data_signature
    new_predict_data_signature = response_selector.model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len(
        [
            message
            for message in iter2_data.training_examples
            if message.get(INTENT_RESPONSE_KEY)
        ]
    )

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever the attribute / feature_type signature is not expected to change,
        # directly compare it to the old data signature. Otherwise, compute its
        # expected signature and compare.
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():
            for feature_type, feature_signatures in signatures.items():
                if feature_type == "sequence_lengths":
                    assert feature_signatures[0].units == expected_sequence_lengths
                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(attribute).get(
                        feature_type
                    )
                else:
                    for index, feature_signature in enumerate(feature_signatures):
                        if (
                            feature_signature.is_sparse
                            and attribute in attributes_expected_to_change
                        ):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type)
                            )
                        else:
                            # Dense signatures, or attributes that are not expected
                            # to change, can be compared directly.
                            assert (
                                feature_signature.units
                                == old_signature.get(attribute)
                                .get(feature_type)[index]
                                .units
                            )

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature, old_predict_data_signature)

def compute_features(self, example: Message):
    features = self.model[example.get("text")]
    example.set("sentence_features", features)

def process(self, message: Message, **kwargs: Any) -> None:
    from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
    from tokenizer_tools.tagset.offset.sequence import Sequence

    decoder = BILUOSequenceEncoderDecoder()

    real_result_dir = os.path.join(self.model_dir, self.result_dir)
    print(real_result_dir)

    input_text = message.text
    input_feature = {
        'words': [[i for i in input_text]],
        'words_len': [len(input_text)],
    }
    print(input_feature)

    predictions = self.predict_fn(input_feature)
    tags = predictions['tags'][0]
    # print(predictions['tags'])

    # decode Unicode
    tags_seq = [i.decode() for i in tags]
    print(tags_seq)

    # BILUO to offset
    failed = False
    try:
        seq = decoder.to_offset(tags_seq, input_text)
    except Exception as e:
        # An invalid tag sequence raises an exception, so return an empty result.
        logger.error("Decode error: {}".format(e))
        seq = Sequence(input_text)
        failed = True
    # print(seq)
    print(seq, tags_seq, failed)

    entity_set = []
    seq.span_set.fill_text(input_text)
    for span in seq.span_set:
        ent = {
            "entity": span.entity,
            "value": span.value,
            "start": span.start,
            "confidence": None,
            "end": span.end,
        }
        entity_set.append(ent)

    extracted = self.add_extractor_name(entity_set)
    message.set(
        "entities", message.get("entities", []) + extracted, add_to_output=True
    )

def test_duckling_entity_extractor(component_builder):
    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[
                {
                    "body": "Today",
                    "start": 0,
                    "value": {
                        "values": [
                            {"value": "2018-11-13T00:00:00.000-08:00", "grain": "day", "type": "value"}
                        ],
                        "value": "2018-11-13T00:00:00.000-08:00",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 5,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "the 5th",
                    "start": 9,
                    "value": {
                        "values": [
                            {"value": "2018-12-05T00:00:00.000-08:00", "grain": "day", "type": "value"},
                            {"value": "2019-01-05T00:00:00.000-08:00", "grain": "day", "type": "value"},
                            {"value": "2019-02-05T00:00:00.000-08:00", "grain": "day", "type": "value"},
                        ],
                        "value": "2018-12-05T00:00:00.000-08:00",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 16,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "5th of May",
                    "start": 13,
                    "value": {
                        "values": [
                            {"value": "2019-05-05T00:00:00.000-07:00", "grain": "day", "type": "value"},
                            {"value": "2020-05-05T00:00:00.000-07:00", "grain": "day", "type": "value"},
                            {"value": "2021-05-05T00:00:00.000-07:00", "grain": "day", "type": "value"},
                        ],
                        "value": "2019-05-05T00:00:00.000-07:00",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 23,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "tomorrow",
                    "start": 37,
                    "value": {
                        "values": [
                            {"value": "2018-11-14T00:00:00.000-08:00", "grain": "day", "type": "value"}
                        ],
                        "value": "2018-11-14T00:00:00.000-08:00",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 45,
                    "dim": "time",
                    "latent": False,
                },
            ],
        )

        _config = RasaNLUModelConfig(
            {"pipeline": [{"name": "DucklingEntityExtractor"}]}
        )
        _config.set_component_attr(
            0, dimensions=["time"], timezone="UTC", url="http://localhost:8000"
        )
        duckling = component_builder.create_component(
            _config.for_component(0), _config
        )

        message = Message(
            data={TEXT: "Today is the 5th of May. Let us meet tomorrow."}
        )
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 4

    # Test duckling with a defined date
    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[
                {
                    "body": "tomorrow",
                    "start": 12,
                    "value": {
                        "values": [
                            {"value": "2013-10-13T00:00:00.000Z", "grain": "day", "type": "value"}
                        ],
                        "value": "2013-10-13T00:00:00.000Z",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 20,
                    "dim": "time",
                    "latent": False,
                }
            ],
        )

        # 1381536182 == 2013/10/12 02:03:02
        message = Message(data={TEXT: "Let us meet tomorrow."}, time="1381536182")
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 1
        assert entities[0]["text"] == "tomorrow"
        assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

    # Test dimension filtering includes only specified dimensions
    _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["number"], url="http://localhost:8000")
    duckling_number = component_builder.create_component(
        _config.for_component(0), _config
    )

    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[
                {
                    "body": "Yesterday",
                    "start": 0,
                    "value": {
                        "values": [
                            {"value": "2019-02-28T00:00:00.000+01:00", "grain": "day", "type": "value"}
                        ],
                        "value": "2019-02-28T00:00:00.000+01:00",
                        "grain": "day",
                        "type": "value",
                    },
                    "end": 9,
                    "dim": "time",
                },
                {
                    "body": "5",
                    "start": 21,
                    "value": {"value": 5, "type": "value"},
                    "end": 22,
                    "dim": "number",
                },
            ],
        )

        message = Message(data={TEXT: "Yesterday there were 5 people in a room"})
        duckling_number.process(message)
        entities = message.get("entities")

        assert len(entities) == 1
        assert entities[0]["text"] == "5"
        assert entities[0]["value"] == 5

def _features_for_patterns(
    self, message: Message, attribute: Text
) -> Tuple[Optional[scipy.sparse.coo_matrix], Optional[scipy.sparse.coo_matrix]]:
    """Checks which known patterns match the message.

    Given a sentence, returns a vector of {1,0} values indicating which
    regexes did match. Furthermore, if the message is tokenized, the function
    will mark all tokens with a dict relating the name of the regex to whether
    it was matched.

    Args:
        message: Message to be featurized.
        attribute: Attribute of message to be featurized.

    Returns:
        Token and sentence level features of message attribute.
    """
    # Attribute not set (e.g. response not present)
    if not message.get(attribute):
        return None, None

    tokens = message.get(TOKENS_NAMES[attribute], [])
    if not tokens:
        # nothing to featurize
        return None, None

    flags = 0  # default flag
    if not self.case_sensitive:
        flags = re.IGNORECASE

    sequence_length = len(tokens)
    num_patterns = len(self.known_patterns)

    sequence_features = np.zeros([sequence_length, num_patterns])
    sentence_features = np.zeros([1, num_patterns])

    for pattern_index, pattern in enumerate(self.known_patterns):
        matches = re.finditer(
            pattern["pattern"], message.get(attribute), flags=flags
        )
        matches = list(matches)

        for token_index, t in enumerate(tokens):
            patterns = t.get("pattern", default={})
            patterns[pattern["name"]] = False

            for match in matches:
                if t.start < match.end() and t.end > match.start():
                    patterns[pattern["name"]] = True
                    sequence_features[token_index][pattern_index] = 1.0
                    if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                        # sentence vector should contain all patterns
                        sentence_features[0][pattern_index] = 1.0

            t.set("pattern", patterns)

    return (
        scipy.sparse.coo_matrix(sequence_features),
        scipy.sparse.coo_matrix(sentence_features),
    )

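# A self-contained toy illustration (an assumed example, not part of the code above)
# of the {1,0} pattern-match features the method produces: one row per token, one
# column per pattern, converted to sparse COO matrices at the end.
import re
import numpy as np
import scipy.sparse

toy_text = "call me at 1234"
toy_token_spans = [(0, 4), (5, 7), (8, 10), (11, 15)]  # (start, end) per token
toy_patterns = [{"name": "number", "pattern": r"\d+"}]

toy_sequence_features = np.zeros([len(toy_token_spans), len(toy_patterns)])
toy_sentence_features = np.zeros([1, len(toy_patterns)])

for pattern_index, pattern in enumerate(toy_patterns):
    for match in re.finditer(pattern["pattern"], toy_text):
        for token_index, (start, end) in enumerate(toy_token_spans):
            # A token is flagged if its span overlaps the regex match.
            if start < match.end() and end > match.start():
                toy_sequence_features[token_index][pattern_index] = 1.0
                toy_sentence_features[0][pattern_index] = 1.0

# Only the last token ("1234") overlaps the \d+ match.
assert toy_sequence_features[:, 0].tolist() == [0.0, 0.0, 0.0, 1.0]
toy_sequence_sparse = scipy.sparse.coo_matrix(toy_sequence_features)
toy_sentence_sparse = scipy.sparse.coo_matrix(toy_sentence_features)
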
def unpack_regex_message(
    message: Message,
    domain: Optional[Domain] = None,
    entity_extractor_name: Optional[Text] = None,
) -> Message:
    """Unpacks the message if `TEXT` contains an encoding of attributes.

    Args:
        message: some message
        domain: the domain
        entity_extractor_name: An extractor name which should be added for the
            entities.

    Returns:
        the given message if that message does not need to be unpacked, and a new
        message with the extracted attributes otherwise
    """
    user_text = message.get(TEXT).strip()

    # If the prefix doesn't match, we don't even need to try to match the pattern.
    if not user_text.startswith(INTENT_MESSAGE_PREFIX):
        return message

    # Try to match the pattern.
    match = YAMLStoryReader._regex_message_pattern().match(user_text)

    # If it doesn't match, then (potentially) something went wrong, because the
    # message text did start with the special prefix -- however, a user might
    # just have decided to start their text this way.
    if not match:
        logger.warning(f"Failed to parse intent and entities from '{user_text}'.")
        return message

    # Extract attributes from the match - and validate them via the domain.
    intent_name = YAMLStoryReader._intent_name_from_regex_match(match, domain)
    confidence = YAMLStoryReader._confidences_from_regex_match(match)
    entities = YAMLStoryReader._entities_from_regex_match(
        match, domain, entity_extractor_name
    )

    # The intent name is *not* optional, but during parsing we might find out
    # that the given intent is unknown (and warn). In this case, stop here.
    if intent_name is None:
        return message

    if match.group("rest"):
        rasa.shared.utils.io.raise_warning(
            f"Failed to parse arguments in line '{match.string}'. "
            f"Failed to interpret some parts. "
            f"Make sure your regex string is in the following format:"
            f"{INTENT_MESSAGE_PREFIX}"
            f"<intent_name>@<confidence-value><dictionary of entities> "
            f"Continuing without {match.group('rest')}. "
        )

    # Add the results to the message.
    intent_data = {
        INTENT_NAME_KEY: intent_name,
        PREDICTED_CONFIDENCE_KEY: confidence,
    }
    intent_ranking = [
        {INTENT_NAME_KEY: intent_name, PREDICTED_CONFIDENCE_KEY: confidence}
    ]

    message_data = {}
    message_data[TEXT] = user_text
    message_data[INTENT] = intent_data
    message_data[INTENT_RANKING_KEY] = intent_ranking
    message_data[ENTITIES] = entities

    return Message(message_data, output_properties=set(message_data.keys()))

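# For context: the messages handled above look like "/<intent_name>@<confidence-value>
# <dictionary of entities>", as stated in the warning message. A simplified,
# self-contained stand-in for the matching step (Rasa's actual pattern lives in
# YAMLStoryReader._regex_message_pattern and includes more validation than this sketch):
import json
import re

SIMPLIFIED_PATTERN = re.compile(
    r"^/(?P<intent>[^{@\s]+)"       # intent name after the "/" prefix
    r"(@(?P<confidence>[0-9.]+))?"  # optional "@<confidence>"
    r"(?P<entities>\{.*\})?"        # optional JSON dict of entities
    r"(?P<rest>.*)$"                # anything left over is unparsable
)

example_match = SIMPLIFIED_PATTERN.match('/greet@0.9{"name": "sara"}')
assert example_match is not None
assert example_match.group("intent") == "greet"
assert float(example_match.group("confidence")) == 0.9
assert json.loads(example_match.group("entities")) == {"name": "sara"}
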