def test_encode_all_labels__encoded_all_action_names_and_texts():
    """Every domain action (label) must be encoded with its name, never its text."""
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        responses={},
        forms={},
        action_names=["a", "b", "c", "d"],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain)
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.derive_messages_from_domain_and_add(domain)

    encodings = featurizer.encode_all_labels(domain, precomputations=precomputations)

    # One encoding per action; each keyed by ACTION_NAME only.
    assert len(encodings) == len(domain.action_names_or_texts)
    for encoding in encodings:
        assert ACTION_NAME in encoding
        assert ACTION_TEXT not in encoding
def test_container_derive_messages_from_domain_and_add():
    """Deriving messages from a domain yields one entry per intent and action."""
    # Action texts, response keys, forms, and action names must be pairwise
    # unique or the domain will complain about it; response texts may overlap
    # with e.g. action texts, and intent names can be anything.
    domain = Domain(
        intents=["a", "b"],
        action_names=["a", "b"],
        action_texts=["a2", "b2"],
        responses={"a3": {TEXT: "a2"}, "b3": {TEXT: "b2"}},
        entities=["e_a", "e_b", "e_c"],
        slots=[TextSlot(name="s", mappings=[{}])],
        forms={"a4": "a4"},
        data={},
    )
    container = MessageContainerForCoreFeaturization()
    container.derive_messages_from_domain_and_add(domain)
    expected_size = len(domain.intent_properties) + len(domain.action_names_or_texts)
    assert len(container) == expected_size
def test_encode_entities__with_entity_roles_and_groups():
    """Entities with roles/groups are encoded to the expected per-token tag ids."""
    # Create a fake message that has been tokenized and from which entities
    # have been extracted.
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})
    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)
    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)
    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )
    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        # Bug fix: this line previously *assigned* `idx + 1` into the mapping,
        # which made the check vacuous and mutated the spec under test.
        # Assert the expected mapping instead: city -> 1, city#to -> 2.
        assert tags_to_ids[entity_tag] == idx + 1
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
    # Tag id per token: only "London" (1) and "Paris" (2) are tagged.
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
def test_container_add_fails_if_messages_are_different_but_have_same_key():
    """Adding an inconsistent message under an existing key must raise."""
    # A collection of substates, each with a distinct key attribute.
    dummy_value = "this-could-be-anything"
    unique_key_substates = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    constant_feature = _dummy_features(id=1, attribute="arbitrary")
    different_feature = _dummy_features(id=1, attribute="arbitrary")
    # Adding the unique messages works fine of course, ...
    container = MessageContainerForCoreFeaturization()
    for substate in unique_key_substates:
        container.add(Message(data=substate, features=[constant_feature]))
    # ... but adding any substate with the same key but different content fails.
    new_key = "some-new-key"
    expected_error_message = "Expected added message to be consistent"
    for substate in unique_key_substates:
        # (1) same key, but an extra attribute
        augmented_substate = substate.copy()
        augmented_substate[new_key] = "some-value-for-the-new-key"
        with pytest.raises(ValueError, match=expected_error_message):
            container.add(Message(data=augmented_substate))
        # (2) same key, but an additional feature
        with pytest.raises(ValueError, match=expected_error_message):
            container.add(
                Message(data=substate, features=[constant_feature, different_feature])
            )
        # (3) same key, but no features at all
        with pytest.raises(ValueError, match=expected_error_message):
            container.add(Message(data=substate))
def test_container_add_message_copies():
    """Duplicate messages are deduplicated and counted as ignored collisions."""
    # A set of unique substates / messages.
    dummy_value = "this-could-be-anything"
    unique_substates = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {TEXT: "other-text"},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    unique_messages = [Message(substate) for substate in unique_substates]
    # Repeat every message several times.
    num_copies = 3
    all_messages = unique_messages * (1 + num_copies)
    # Build the container from the repeated list.
    container = MessageContainerForCoreFeaturization()
    for message in all_messages:
        container.add(message)
    # Only one entry per unique key remains; every copy is an ignored collision.
    assert len(container) == len(unique_substates)
    assert set(container.all_messages()) == set(unique_messages)
    assert container.num_collisions_ignored == len(unique_substates) * num_copies
def test_container_feature_lookup_works_if_messages_are_broken_but_consistent():
    """Feature collection tolerates odd (but internally consistent) table content."""
    strange_container = MessageContainerForCoreFeaturization()
    # Inject table entries directly: an empty message under TEXT and a message
    # with a TEXT feature stored under INTENT.
    strange_container._table = {
        TEXT: {"A": Message(data=dict())},
        INTENT: {"B": Message(data=dict(), features=[_dummy_features(1, TEXT)])},
    }
    collected = strange_container.collect_features({TEXT: "A", INTENT: "B"})
    assert TEXT in collected
    assert len(collected[TEXT]) == 1
def test_container_fingerprint_differ_for_containers_with_different_insertion_order():
    """Insertion order must influence the fingerprint.

    The container is used for training data, and ordering might affect the
    training of e.g. featurizers — so differently-ordered containers must not
    fingerprint identically.
    """
    first = MessageContainerForCoreFeaturization()
    first.add(Message(data={INTENT: "1"}))
    first.add(Message(data={INTENT: "2"}))
    second = MessageContainerForCoreFeaturization()
    second.add(Message(data={INTENT: "2"}))
    second.add(Message(data={INTENT: "1"}))
    assert first.fingerprint() != second.fingerprint()
def test_container_feature_lookup_fails_if_different_features_for_same_attribute():
    """Collecting two different feature sets for one attribute must raise."""
    broken_container = MessageContainerForCoreFeaturization()
    # Inject two table entries that both carry (different) TEXT features.
    broken_container._table = {
        TEXT: {"A": Message(data={}, features=[_dummy_features(2, TEXT)])},
        INTENT: {"B": Message(data={}, features=[_dummy_features(1, TEXT)])},
    }
    with pytest.raises(
        RuntimeError, match=f"Feature for attribute {TEXT} has already been"
    ):
        broken_container.collect_features({TEXT: "A", INTENT: "B"})
def test_container_feature_lookup():
    """Feature collection respects the (optional) requested attribute list."""
    arbitrary_attribute = "other"
    table = MessageContainerForCoreFeaturization()
    table.add_all(
        [
            Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
            Message(
                data={INTENT: "B", arbitrary_attribute: "C"},
                features=[_dummy_features(2, arbitrary_attribute)],
            ),
            Message(data={TEXT: "A2"}, features=[_dummy_features(3, TEXT)]),
            Message(
                data={INTENT: "B2", arbitrary_attribute: "C2"},
                features=[_dummy_features(4, arbitrary_attribute)],
            ),
        ]
    )
    sub_state = {TEXT: "A", INTENT: "B", arbitrary_attribute: "C"}
    # Without an explicit attribute list, the result only contains attributes
    # for which features exist (so no INTENT key here).
    features = table.collect_features(sub_state=sub_state)
    assert INTENT not in features
    assert len(features[TEXT]) == 1
    assert features[TEXT][0].features[0] == 1
    assert len(features[arbitrary_attribute]) == 1
    assert features[arbitrary_attribute][0].features[0] == 2
    # If we explicitly query `INTENT`, its key appears even though there are
    # no features for it.
    features = table.collect_features(
        sub_state=sub_state, attributes=list(sub_state.keys())
    )
    assert INTENT in features
    assert len(features[INTENT]) == 0
    # Only the requested attributes are returned ...
    features = table.collect_features(sub_state, attributes=[arbitrary_attribute])
    assert TEXT not in features
    assert INTENT not in features
    assert len(features[arbitrary_attribute]) == 1
    # ... even when no features exist for them.
    YET_ANOTHER = "another"
    features = table.collect_features(sub_state, attributes=[YET_ANOTHER])
    assert len(features[YET_ANOTHER]) == 0
def test_container_add_fails_if_message_has_wrong_attributes(
    no_or_multiple_key_attributes: List[Text],
):
    """Messages with zero or several key attributes must be rejected."""
    sub_state = {attribute: "dummy" for attribute in no_or_multiple_key_attributes}
    with pytest.raises(ValueError, match="Expected exactly one attribute out of"):
        MessageContainerForCoreFeaturization().add(Message(sub_state))
def test_encode_state__with_lookup__creates_features_for_intent_and_action_name(
    with_action_listen: bool,
):
    """Tests that features for intent and action name are created if needed.

    Especially tests that this is the case even though no features are present
    in the given lookup table for this intent and action_name. However, if no
    `action_listen` is in the given sub-state, then the user sub-state should
    not be featurized (hence, no features for intent) should be created.
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"a": 0, "b": 1}
    featurizer._default_feature_states[ACTION_NAME] = {
        "c": 0,
        "d": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # Build the state to encode.
    action_name = ACTION_LISTEN_NAME if with_action_listen else "c"
    state = {USER: {INTENT: "e"}, PREVIOUS_ACTION: {ACTION_NAME: action_name}}
    # Build a lookup table with all relevant entries **but no Features**.
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(Message(data={INTENT: state[USER][INTENT]}))
    precomputations.add(
        Message(data={ACTION_NAME: state[PREVIOUS_ACTION][ACTION_NAME]})
    )
    # Encode!
    encoded = featurizer.encode_state(state, precomputations=precomputations)
    if with_action_listen:
        assert set(encoded.keys()) == {INTENT, ACTION_NAME}
        # Intent "e" is not among the known intents, so its (sparse) one-hot
        # encoding equals an all-zero matrix.
        assert (
            encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 0]])
        ).nnz == 0
    else:
        assert set(encoded.keys()) == {ACTION_NAME}
def test_container_add_does_not_fail_if_message_feature_content_differs():
    """Re-adding substates with a distinct feature object must be ignored silently."""
    # A collection of substates, each with a distinct key attribute.
    dummy_value = "this-could-be-anything"
    unique_key_substates = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    # NOTE(review): both features are built with id=1, so they are distinct
    # objects with equal content — confirm that is the intended scenario.
    constant_feature = _dummy_features(id=1, attribute="arbitrary")
    different_feature = _dummy_features(id=1, attribute="arbitrary")
    container = MessageContainerForCoreFeaturization()
    for substate in unique_key_substates:
        container.add(Message(data=substate, features=[constant_feature]))
    size_before = len(container)
    # Re-adding with the other feature object does not raise and does not grow
    # the container.
    for substate in unique_key_substates:
        container.add(Message(data=substate, features=[different_feature]))
    assert len(container) == size_before
def test_container_message_lookup():
    """Messages can be looked up again by their user text."""
    # Create some messages with unique key attributes.
    messages = [
        Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
        Message(data={TEXT: "B"}),
        Message(data={INTENT: "B"}),
        Message(data={ACTION_TEXT: "B"}),
        Message(data={ACTION_NAME: "B"}),
    ]
    container = MessageContainerForCoreFeaturization()
    container.add_all(messages)
    # Lookup of an existing text returns the stored message, features included.
    found = container.lookup_message(user_text="A")
    assert found
    assert len(found.data) == 1
    assert len(found.features) == 1
    found = container.lookup_message(user_text="B")
    assert found
    assert len(found.data) == 1
def test_container_message_lookup_fails_if_text_cannot_be_looked_up():
    """Looking up an unknown user text must raise a ValueError."""
    container = MessageContainerForCoreFeaturization()
    with pytest.raises(ValueError, match="Expected a message with key"):
        container.lookup_message(user_text="a text not included in the table")
def test_container_feature_lookup_fails_without_key_attribute():
    """Collecting features for an unknown key must raise a ValueError."""
    container = MessageContainerForCoreFeaturization()
    with pytest.raises(ValueError, match="Unknown key"):
        container.collect_features({TEXT: "A-unknown"})
def test_container_derive_messages_from_events_and_add(
    events: List[Event], expected_num_entries: int
):
    """Deriving messages from events yields the expected number of entries."""
    container = MessageContainerForCoreFeaturization()
    container.derive_messages_from_events_and_add(events)
    assert len(container) == expected_num_entries
def test_container_all_messages():
    """`all_messages` returns one message per added unique key."""
    data_list = [{INTENT: "1"}, {INTENT: "2", "other": 3}, {TEXT: "3"}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in data_list])
    assert len(container.all_messages()) == 3
def test_container_keys():
    """`keys` returns the stored values per key attribute."""
    data_list = [{INTENT: "1"}, {INTENT: "2"}, {TEXT: "3", "other": 3}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in data_list])
    assert set(container.keys(INTENT)) == {"1", "2"}
    assert set(container.keys(TEXT)) == {"3"}
def test_encode_state__with_lookup__looksup_or_creates_features(action_name: Text):
    """Tests that features from table are combined or created from scratch.

    If the given action name is ...
    - ACTION_LISTEN_NAME then the user substate and the action name are encoded
    - some "other" action, then the user-substate is not encoded but the action
      name is
    - set to "None", then we remove the action name from the user substate and
      as a result there should be no encoding for the action name and for the
      user substate
    """
    featurizer = SingleStateFeaturizer()
    featurizer._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
    featurizer._default_feature_states[ENTITIES] = {
        "city": 0,
        "name": 1,
        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
    }
    featurizer._default_feature_states[ACTION_NAME] = {
        "NOT_action_listen": 0,
        "utter_greet": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # `_0` in slots represents the feature dimension.
    featurizer._default_feature_states[SLOTS] = {
        "slot_1_0": 0,
        "slot_2_0": 1,
        "slot_3_0": 2,
    }
    featurizer._default_feature_states[ACTIVE_LOOP] = {
        "active_loop_1": 0,
        "active_loop_2": 1,
        "active_loop_3": 2,
        "active_loop_4": 3,
    }
    # Build the state to be encoded.
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_name_list = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    action_text = "throw a ball"
    intent = "inform"
    state = {
        USER: {TEXT: text, INTENT: intent, ENTITIES: entity_name_list},
        PREVIOUS_ACTION: {ACTION_NAME: action_name, ACTION_TEXT: action_text},
        ACTIVE_LOOP: {"name": "active_loop_4"},
        SLOTS: {"slot_1": (1.0,)},
    }
    if action_name is None:
        del state[PREVIOUS_ACTION][ACTION_NAME]
    # Build lookup table with all relevant information - and dummy features for
    # all dense featurizable attributes.
    # Note that we don't need to add the `ENTITIES` to the message including
    # `TEXT` here because `encode_state` won't featurize the entities using the
    # lookup table (only `encode_entities` does that).
    units = 300
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add_all(
        [
            Message(
                data={TEXT: text, TOKENS_NAMES[TEXT]: tokens},
                features=[
                    dummy_features(
                        fill_value=11,
                        units=units,
                        attribute=TEXT,
                        type=SENTENCE,
                        is_sparse=True,
                    ),
                    dummy_features(
                        fill_value=12,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=False,
                    ),
                    # Note: sparse sequence feature is last here
                    dummy_features(
                        fill_value=13,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    ),
                ],
            ),
            Message(data={INTENT: intent}),
            Message(
                data={ACTION_TEXT: action_text},
                features=[
                    dummy_features(
                        fill_value=1,
                        units=units,
                        attribute=ACTION_TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    )
                ],
            ),
        ]
    )
    if action_name is not None:
        precomputations.add(Message(data={ACTION_NAME: action_name}))
    # Encode the state.
    encoded = featurizer.encode_state(state, precomputations=precomputations)
    # Check all the features are encoded and *_text features are encoded by a
    # dense featurizer.
    expected_attributes = [SLOTS, ACTIVE_LOOP, ACTION_TEXT]
    if action_name is not None:  # i.e. we did not remove it from the state
        expected_attributes += [ACTION_NAME]
    if action_name == ACTION_LISTEN_NAME:
        expected_attributes += [TEXT, ENTITIES, INTENT]
    assert set(encoded.keys()) == set(expected_attributes)
    # Remember, sparse sequence features come first (and `.features` denotes
    # the matrix, not a `Features` object).
    if action_name == ACTION_LISTEN_NAME:
        assert encoded[TEXT][0].features.shape[-1] == units
        assert encoded[TEXT][0].is_sparse()
        assert encoded[ENTITIES][0].features.shape[-1] == 4
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[0, 1]]))
    assert encoded[ACTION_TEXT][0].features.shape[-1] == units
    assert encoded[ACTION_TEXT][0].is_sparse()
    if action_name is not None:
        if action_name == "NOT_action_listen":
            expected_one_hot = [1, 0, 0]
        else:  # action_listen
            expected_one_hot = [0, 0, 1]
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([expected_one_hot])
        )
    else:
        assert ACTION_NAME not in encoded
    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[1, 0, 0]]))
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 0, 0, 1]])
    )
def test_container_fingerprints_differ_for_different_containers():
    """Containers with different content must have different fingerprints."""
    first = MessageContainerForCoreFeaturization()
    first.add(Message(data={INTENT: "1"}))
    second = MessageContainerForCoreFeaturization()
    second.add(Message(data={INTENT: "2"}))
    assert first.fingerprint() != second.fingerprint()
def test_encode_entities__with_bilou_entity_roles_and_groups():
    """BILOU tagging assigns the expected per-token tag ids for roled entities."""
    # Instantiate domain and configure the single state featurizer for this
    # domain. Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    featurizer = SingleStateFeaturizer()
    featurizer.prepare_for_training(domain, bilou_tagging=True)

    # (1) example with both entities:
    # create message that has been tokenized and where entities have been
    # extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})
    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)
    # encode!
    encoded = featurizer.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity:
    # create message that has been tokenized and where entities have been
    # extracted
    text = "I am flying to Saint Petersburg"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})
    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)
    # encode!
    encoded = featurizer.encode_entities(
        {TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
        bilou_tagging=True,
    )
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])