def test_encode_all_labels__encoded_all_action_names_and_texts():
    """Check that `encode_all_labels` encodes every action of the domain.

    "Labels" means actions here. Each encoded action must contain the
    `ACTION_NAME` key and must not contain the `ACTION_TEXT` key.
    """
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        responses={},
        forms={},
        action_names=["a", "b", "c", "d"],
        data={},
    )

    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain)

    # the lookup table is filled from the domain itself
    lookup_table = MessageContainerForCoreFeaturization()
    lookup_table.derive_messages_from_domain_and_add(domain)

    encoded_actions = featurizer.encode_all_labels(
        domain, precomputations=lookup_table
    )

    # one encoding per action known to the domain
    assert len(encoded_actions) == len(domain.action_names_or_texts)
    for encoded_action in encoded_actions:
        assert ACTION_NAME in encoded_action
        assert ACTION_TEXT not in encoded_action
def test_encode_entities__with_entity_roles_and_groups():
    """Check entity encoding for entity tags that carry a role/group suffix.

    A tokenized message with two extracted entities is registered in the
    precomputations container. The featurizer is prepared on a domain that
    declares exactly those two entity tags, and the resulting tag ids are
    compared against the expected per-token ids.
    """
    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        # Assert the mapping instead of overwriting it: the first tag must map
        # to 1 and the second to 2, matching the ids expected in the features
        # below.
        assert tags_to_ids[entity_tag] == idx + 1
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    # one id per token: only "London" (5th token) and "Paris" (7th) are tagged
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_persist_and_load_tracker_featurizer(tmp_path: Text, moodbot_domain: Domain):
    """Persist a tracker featurizer and check that it can be loaded again.

    The loaded featurizer and its state featurizer must both be present.
    """
    # NOTE(review): pytest's `tmp_path` fixture yields a `pathlib.Path`, so the
    # `Text` annotation looks inaccurate — confirm before changing it.
    single_state_featurizer = SingleStateFeaturizer()
    single_state_featurizer.prepare_for_training(moodbot_domain, RegexInterpreter())

    MaxHistoryTrackerFeaturizer(single_state_featurizer).persist(tmp_path)
    restored = TrackerFeaturizer.load(tmp_path)

    assert restored is not None
    assert restored.state_featurizer is not None
def test_single_state_featurizer_creates_encoded_all_actions():
    """Check that `encode_all_actions` encodes every action name of the domain.

    Each encoded action must contain the `ACTION_NAME` key and must not
    contain the `ACTION_TEXT` key.
    """
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        templates={},
        forms={},
        action_names=["a", "b", "c", "d"],
    )

    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, RegexInterpreter())
    encoded_actions = featurizer.encode_all_actions(domain, RegexInterpreter())

    # one encoding per action name known to the domain
    assert len(encoded_actions) == len(domain.action_names)
    for encoded_action in encoded_actions:
        assert ACTION_NAME in encoded_action
        assert ACTION_TEXT not in encoded_action
def test_single_state_featurizer_uses_regex_interpreter(
    unpacked_trained_moodbot_path: Text,
):
    """Check that no text features are extracted after regex-only training.

    The featurizer is prepared with a `RegexInterpreter`, but state features
    are later extracted with the interpreter of a loaded agent.
    """
    from rasa.core.agent import Agent

    empty_domain = Domain(
        intents=[], entities=[], slots=[], responses={}, forms=[], action_names=[],
    )
    featurizer = SingleStateFeaturizer()

    # simulate that core was trained separately by passing
    # RegexInterpreter to prepare_for_training
    featurizer.prepare_for_training(empty_domain, RegexInterpreter())

    # simulate that nlu and core models were manually combined for prediction
    # by passing the trained interpreter to _extract_state_features
    trained_interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter
    text_features = featurizer._extract_state_features(
        {TEXT: "some text"}, trained_interpreter
    )

    # RegexInterpreter cannot create features for text, therefore since featurizer
    # was trained without nlu, features for text should be empty
    assert not text_features
def test_single_state_featurizer_with_entity_roles_and_groups(
    unpacked_trained_moodbot_path: Text,
):
    """Check entity encoding for entity tags that carry a role/group suffix.

    The featurizer is prepared with a `RegexInterpreter`; the entities are
    encoded with the interpreter of a loaded agent.
    """
    from rasa.core.agent import Agent

    interpreter = Agent.load(unpacked_trained_moodbot_path).interpreter

    # TODO roles and groups are not supported in e2e yet
    domain = Domain(
        intents=[],
        entities=["city", f"city{ENTITY_LABEL_SEPARATOR}to"],
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, RegexInterpreter())

    london = {
        ENTITY_ATTRIBUTE_TYPE: "city",
        ENTITY_ATTRIBUTE_VALUE: "London",
        ENTITY_ATTRIBUTE_START: 17,
        ENTITY_ATTRIBUTE_END: 23,
    }
    paris = {
        ENTITY_ATTRIBUTE_TYPE: f"city{ENTITY_LABEL_SEPARATOR}to",
        ENTITY_ATTRIBUTE_VALUE: "Paris",
        ENTITY_ATTRIBUTE_START: 27,
        ENTITY_ATTRIBUTE_END: 32,
    }
    entity_data = {
        TEXT: "I am flying from London to Paris",
        ENTITIES: [london, paris],
    }

    encoded = featurizer.encode_entities(entity_data, interpreter=interpreter)

    # entity tags are the only thing that gets encoded
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    # expected ids: 1 for the 5th entry ("London"), 2 for the 7th ("Paris")
    expected_tag_ids = [[0], [0], [0], [0], [1], [0], [2]]
    assert np.all(encoded[ENTITY_TAGS][0].features == expected_tag_ids)
def test_single_state_featurizer_prepare_for_training():
    """Check the default feature states built by `prepare_for_training`.

    Intents, entities, slots and action names from the domain must show up
    in the featurizer's default feature states; active loops must be empty.
    """
    domain = Domain(
        intents=["greet"],
        entities=["name"],
        slots=[Slot("name")],
        templates={},
        forms=[],
        action_names=["utter_greet", "action_check_weather"],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, RegexInterpreter())

    feature_states = featurizer._default_feature_states

    # intents: more than just the single intent declared above
    assert len(feature_states[INTENT]) > 1
    assert "greet" in feature_states[INTENT]

    # entities: exactly the declared one, at index 0
    assert len(feature_states[ENTITIES]) == 1
    assert feature_states[ENTITIES]["name"] == 0

    # slots: a single feature state, keyed "name_0"
    assert len(feature_states[SLOTS]) == 1
    assert feature_states[SLOTS]["name_0"] == 0

    # actions: more than just the two declared above
    assert len(feature_states[ACTION_NAME]) > 2
    for action_name in ("utter_greet", "action_check_weather"):
        assert action_name in feature_states[ACTION_NAME]

    # no forms were declared, hence no active loops
    assert len(feature_states[ACTIVE_LOOP]) == 0
def test_prepare_for_training():
    """Check the default feature states built by `prepare_for_training`.

    Intents, entities, slots and action names from the domain must show up
    in the featurizer's default feature states; active loops must be empty.
    """
    domain = Domain(
        intents=["greet"],
        entities=["name"],
        slots=[TextSlot("name", mappings=[{}])],
        responses={},
        forms={},
        action_names=["utter_greet", "action_check_weather"],
        data={},
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain)

    feature_states = featurizer._default_feature_states

    # intents: more than just the single intent declared above
    assert len(feature_states[INTENT]) > 1
    assert "greet" in feature_states[INTENT]

    # entities: exactly the declared one, at index 0
    assert len(feature_states[ENTITIES]) == 1
    assert feature_states[ENTITIES]["name"] == 0

    # slots: a single feature state, keyed "name_0"
    assert len(feature_states[SLOTS]) == 1
    assert feature_states[SLOTS]["name_0"] == 0

    # actions: more than just the two declared above
    assert len(feature_states[ACTION_NAME]) > 2
    for action_name in ("utter_greet", "action_check_weather"):
        assert action_name in feature_states[ACTION_NAME]

    # no forms were declared, hence no active loops
    assert len(feature_states[ACTIVE_LOOP]) == 0
def test_encode_entities__with_bilou_entity_roles_and_groups():
    """Check BILOU entity encoding for tags that carry a role/group suffix.

    Two messages are encoded with the same featurizer: one containing both
    entity tags and one containing a single two-token "city" entity.
    """
    # Instantiate domain and configure the single state featurizer for this domain.
    # Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
        data={},
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, bilou_tagging=True)

    def _encode(text, entities):
        # create message that has been tokenized and where entities have been
        # extracted
        tokens = [
            Token(text=match.group(), start=match.start())
            for match in re.finditer(r"\S+", text)
        ]
        message = Message(
            {TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities}
        )

        # create a lookup table that has seen this message
        lookup_table = MessageContainerForCoreFeaturization()
        lookup_table.add(message)

        # encode!
        return featurizer.encode_entities(
            {TEXT: text, ENTITIES: entities},
            precomputations=lookup_table,
            bilou_tagging=True,
        )

    # (1) example with both entities
    encoded = _encode(
        "I am flying from London to Paris",
        [
            {
                ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
                ENTITY_ATTRIBUTE_VALUE: "London",
                ENTITY_ATTRIBUTE_START: 17,
                ENTITY_ATTRIBUTE_END: 23,
            },
            {
                ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
                ENTITY_ATTRIBUTE_VALUE: "Paris",
                ENTITY_ATTRIBUTE_START: 27,
                ENTITY_ATTRIBUTE_END: 32,
            },
        ],
    )
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity
    encoded = _encode(
        "I am flying to Saint Petersburg",
        [
            {
                ENTITY_ATTRIBUTE_TYPE: "city",
                ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
                ENTITY_ATTRIBUTE_START: 15,
                ENTITY_ATTRIBUTE_END: 31,
            }
        ],
    )
    assert sorted(encoded.keys()) == [ENTITY_TAGS]
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])