def test_encode_all_labels__encoded_all_action_names_and_texts():
    # ... where "labels" means actions...
    domain = Domain(
        intents=[],
        entities=[],
        slots=[],
        responses={},
        forms={},
        action_names=["a", "b", "c", "d"],
    )

    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    precomputations = MessageContainerForCoreFeaturization()
    precomputations.derive_messages_from_domain_and_add(domain)

    encoded_actions = f.encode_all_labels(domain, precomputations=precomputations)

    assert len(encoded_actions) == len(domain.action_names_or_texts)
    assert all(
        [
            ACTION_NAME in encoded_action and ACTION_TEXT not in encoded_action
            for encoded_action in encoded_actions
        ]
    )
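
# The excerpts below omit the module-level imports of the original test modules.
# A sketch of what they would roughly look like (the exact module paths are
# assumptions based on a typical Rasa 3.x layout and may differ between versions):
import re
from typing import List, Text

import numpy as np
import pytest
import scipy.sparse

from rasa.core.featurizers.precomputation import MessageContainerForCoreFeaturization
from rasa.core.featurizers.single_state_featurizer import SingleStateFeaturizer
from rasa.nlu.tokenizers.tokenizer import Token
from rasa.shared.core.constants import (
    ACTION_LISTEN_NAME,
    ACTIVE_LOOP,
    PREVIOUS_ACTION,
    SLOTS,
    USER,
)
from rasa.shared.core.domain import Domain
from rasa.shared.core.events import ActionExecuted, Event
from rasa.shared.core.slots import TextSlot
from rasa.shared.nlu.constants import (
    ACTION_NAME,
    ACTION_TEXT,
    ENTITIES,
    ENTITY_ATTRIBUTE_END,
    ENTITY_ATTRIBUTE_START,
    ENTITY_ATTRIBUTE_TYPE,
    ENTITY_ATTRIBUTE_VALUE,
    FEATURE_TYPE_SENTENCE,
    FEATURE_TYPE_SEQUENCE,
    INTENT,
    TEXT,
)
from rasa.shared.nlu.training_data.features import Features
from rasa.shared.nlu.training_data.message import Message
# A few more constants are used below (TOKENS_NAMES, ENTITY_TAGS,
# ENTITY_LABEL_SEPARATOR); their exact module locations differ across Rasa
# versions, so they are not spelled out here.

# The feature-type constants are referred to by short names further down
# (an assumption about how the original modules aliased them):
SENTENCE = FEATURE_TYPE_SENTENCE
SEQUENCE = FEATURE_TYPE_SEQUENCE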
Example #2
def test_container_derive_messages_from_domain_and_add():
    action_names = ["a", "b"]
    # action texts, response keys, forms, and action_names must be unique or the
    # domain will complain about it ...
    action_texts = ["a2", "b2"]
    # ... but the response texts may overlap with e.g. action texts
    responses = {"a3": {TEXT: "a2"}, "b3": {TEXT: "b2"}}
    forms = {"a4": "a4"}
    # however, intent names can be anything
    intents = ["a", "b"]
    domain = Domain(
        intents=intents,
        action_names=action_names,
        action_texts=action_texts,
        responses=responses,
        entities=["e_a", "e_b", "e_c"],
        slots=[TextSlot(name="s", mappings=[{}])],
        forms=forms,
        data={},
    )
    lookup_table = MessageContainerForCoreFeaturization()
    lookup_table.derive_messages_from_domain_and_add(domain)
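    # Note: `domain.intent_properties` and `domain.action_names_or_texts` also include
    # Rasa's default intents and default actions, so the table is larger than just the
    # handful of names defined above.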
    assert len(lookup_table) == (
        len(domain.intent_properties) + len(domain.action_names_or_texts)
    )
Example #3
def test_encode_entities__with_entity_roles_and_groups():

    # create a fake message that has been tokenized and where entities were extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        assert tags_to_ids[entity_tag] == idx + 1  # hence, city -> 1, city#to -> 2
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
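    # The encoding holds one tag id per token: 0 means "no entity", and the non-zero
    # ids below mark "London" (5th token) as `city` and "Paris" (7th token) as
    # `city#to`.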
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1],
                                                       [0], [2]])
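
# The `_dummy_features` helper used by the container tests below is defined elsewhere
# in the original test module. A minimal sketch of a plausible implementation (an
# assumption, not the original code): a sentence-level `Features` object whose content
# is fully determined by `id`, so two calls with the same `id` yield equal features.
def _dummy_features(id: int, attribute: Text) -> Features:
    return Features(
        np.full(shape=(1, 1), fill_value=id),
        feature_type=FEATURE_TYPE_SENTENCE,
        attribute=attribute,
        origin=f"dummy-featurizer-{id}",
    )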
Example #4
def test_container_add_fails_if_messages_are_different_but_have_same_key():
    # construct a set of unique substates
    dummy_value = "this-could-be-anything"
    substates_with_unique_key_attribute = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    constant_feature = _dummy_features(id=1, attribute="arbitrary")
    different_feature = _dummy_features(id=1, attribute="arbitrary")
    # adding the unique messages works fine of course,...
    lookup_table = MessageContainerForCoreFeaturization()
    for sub_state in substates_with_unique_key_attribute:
        lookup_table.add(Message(data=sub_state, features=[constant_feature]))
    # ... but adding any substate with same key but different content doesn't
    new_key = "some-new-key"
    expected_error_message = "Expected added message to be consistent"
    for sub_state in substates_with_unique_key_attribute:
        # with extra attribute
        sub_state_with_extra_attribute = sub_state.copy()
        sub_state_with_extra_attribute[new_key] = "some-value-for-the-new-key"
        with pytest.raises(ValueError, match=expected_error_message):
            lookup_table.add(Message(data=sub_state_with_extra_attribute))
        # with new feature
        with pytest.raises(ValueError, match=expected_error_message):
            lookup_table.add(
                Message(data=sub_state, features=[constant_feature, different_feature])
            )
        # without features
        with pytest.raises(ValueError, match=expected_error_message):
            lookup_table.add(Message(data=sub_state))
Example #5
def test_container_add_message_copies():
    # construct a set of unique substates and messages
    dummy_value = "this-could-be-anything"
    substates_with_unique_key_attribute = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {TEXT: "other-text"},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    unique_messages = [
        Message(sub_state) for sub_state in substates_with_unique_key_attribute
    ]
    # make some copies
    num_copies = 3
    messages = unique_messages * (1 + num_copies)
    # build table
    lookup_table = MessageContainerForCoreFeaturization()
    for message in messages:
        lookup_table.add(message)
    # assert that we have as many entries as unique keys
    assert len(lookup_table) == len(substates_with_unique_key_attribute)
    assert set(lookup_table.all_messages()) == set(unique_messages)
    assert (
        lookup_table.num_collisions_ignored
        == len(substates_with_unique_key_attribute) * num_copies
    )
Example #6
def test_container_feature_lookup_works_if_messages_are_broken_but_consistent():
    not_broken_but_strange_table = MessageContainerForCoreFeaturization()
    not_broken_but_strange_table._table = {
        TEXT: {"A": Message(data=dict())},
        INTENT: {"B": Message(data=dict(), features=[_dummy_features(1, TEXT)])},
    }
    features = not_broken_but_strange_table.collect_features({TEXT: "A", INTENT: "B"})
    assert TEXT in features and len(features[TEXT]) == 1
Example #7
def test_container_fingerprint_differ_for_containers_with_different_insertion_order():
    # Because the container is used to create training data and insertion order might
    # affect the training of e.g. featurizers, we want the fingerprints to differ.
    container1 = MessageContainerForCoreFeaturization()
    container1.add(Message(data={INTENT: "1"}))
    container1.add(Message(data={INTENT: "2"}))
    container2 = MessageContainerForCoreFeaturization()
    container2.add(Message(data={INTENT: "2"}))
    container2.add(Message(data={INTENT: "1"}))
    assert container2.fingerprint() != container1.fingerprint()
Example #8
def test_container_feature_lookup_fails_if_different_features_for_same_attribute():
    broken_table = MessageContainerForCoreFeaturization()
    broken_table._table = {
        TEXT: {"A": Message(data={}, features=[_dummy_features(2, TEXT)])},
        INTENT: {"B": Message(data={}, features=[_dummy_features(1, TEXT)])},
    }
    with pytest.raises(
        RuntimeError, match=f"Feature for attribute {TEXT} has already been"
    ):
        broken_table.collect_features({TEXT: "A", INTENT: "B"})
Example #9
def test_container_feature_lookup():
    arbitrary_attribute = "other"
    messages = [
        Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
        Message(
            data={
                INTENT: "B",
                arbitrary_attribute: "C"
            },
            features=[_dummy_features(2, arbitrary_attribute)],
        ),
        Message(data={TEXT: "A2"}, features=[_dummy_features(3, TEXT)]),
        Message(
            data={
                INTENT: "B2",
                arbitrary_attribute: "C2"
            },
            features=[_dummy_features(4, arbitrary_attribute)],
        ),
    ]

    table = MessageContainerForCoreFeaturization()
    table.add_all(messages)

    # If we don't specify a list of attributes, the resulting features dictionary will
    # only contain those attributes for which there are features.
    sub_state = {TEXT: "A", INTENT: "B", arbitrary_attribute: "C"}
    features = table.collect_features(sub_state=sub_state)
    for attribute, feature_value in [
        (TEXT, 1),
        (INTENT, None),
        (arbitrary_attribute, 2),
    ]:
        if feature_value is not None:
            assert attribute in features
            assert len(features[attribute]) == 1
            assert feature_value == features[attribute][0].features[0]
        else:
            assert attribute not in features

    # If we query features for `INTENT`, then a key will be there, even if there are
    # no features
    features = table.collect_features(sub_state=sub_state,
                                      attributes=list(sub_state.keys()))
    assert INTENT in features
    assert len(features[INTENT]) == 0

    # We only get the list of features we want...
    features = table.collect_features(sub_state,
                                      attributes=[arbitrary_attribute])
    assert TEXT not in features
    assert INTENT not in features
    assert len(features[arbitrary_attribute]) == 1

    # ... even if there are no features:
    YET_ANOTHER = "another"
    features = table.collect_features(sub_state, attributes=[YET_ANOTHER])
    assert len(features[YET_ANOTHER]) == 0
Example #10
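
# The next test is presumably driven by a parametrization over invalid attribute
# combinations that is not part of this excerpt; a minimal assumed example
# (not the original parametrization):
@pytest.mark.parametrize("no_or_multiple_key_attributes", [[], [INTENT, ACTION_NAME]])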
def test_container_add_fails_if_message_has_wrong_attributes(
    no_or_multiple_key_attributes: List[Text],
):
    sub_state = {attribute: "dummy" for attribute in no_or_multiple_key_attributes}
    with pytest.raises(ValueError, match="Expected exactly one attribute out of"):
        MessageContainerForCoreFeaturization().add(Message(sub_state))
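
# The next test is presumably parametrized as follows (an assumption; the decorator
# is not part of this excerpt):
@pytest.mark.parametrize("with_action_listen", [True, False])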
def test_encode_state__with_lookup__creates_features_for_intent_and_action_name(
    with_action_listen: bool,
):
    """Tests that features for intent and action name are created if needed.
    Especially tests that this is the case even though no features are present in the
    given lookup table for this intent and action_name.
    However, if no `action_listen` is in the given sub-state, then the user sub-state
    should not be featurized (hence, no features for intent) should be created.
    """

    f = SingleStateFeaturizer()
    f._default_feature_states[INTENT] = {"a": 0, "b": 1}
    f._default_feature_states[ACTION_NAME] = {"c": 0, "d": 1, ACTION_LISTEN_NAME: 2}

    # create state
    action_name = ACTION_LISTEN_NAME if with_action_listen else "c"
    state = {USER: {INTENT: "e"}, PREVIOUS_ACTION: {ACTION_NAME: action_name}}

    # create a lookup table with all relevant entries **but no Features**
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(Message(data={INTENT: state[USER][INTENT]}))
    precomputations.add(
        Message(data={ACTION_NAME: state[PREVIOUS_ACTION][ACTION_NAME]})
    )

    # encode!
    encoded = f.encode_state(state, precomputations=precomputations)

    if with_action_listen:
        assert set(encoded.keys()) == set([INTENT, ACTION_NAME])
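        # Comparing two sparse matrices with `!=` yields a sparse boolean matrix;
        # `nnz == 0` means there is no position at which they differ.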
        assert (
            encoded[INTENT][0].features != scipy.sparse.coo_matrix([[0, 0]])
        ).nnz == 0
    else:
        assert set(encoded.keys()) == set([ACTION_NAME])
Example #12
def test_container_add_does_not_fail_if_message_feature_content_differs():
    # construct a set of unique substates
    dummy_value = "this-could-be-anything"
    substates_with_unique_key_attribute = [
        {INTENT: "greet"},
        {TEXT: "text", ENTITIES: dummy_value},
        {ACTION_TEXT: "action_text"},
        {ACTION_NAME: "action_name"},
    ]
    constant_feature = _dummy_features(id=1, attribute="arbitrary")
    different_feature = _dummy_features(id=1, attribute="arbitrary")
    lookup_table = MessageContainerForCoreFeaturization()
    for sub_state in substates_with_unique_key_attribute:
        lookup_table.add(Message(data=sub_state, features=[constant_feature]))
    length = len(lookup_table)
    # with different feature
    for sub_state in substates_with_unique_key_attribute:
        lookup_table.add(Message(data=sub_state, features=[different_feature]))
        assert len(lookup_table) == length
Example #13
def test_container_message_lookup():
    # create some messages with unique key attributes
    messages = [
        Message(data={TEXT: "A"}, features=[_dummy_features(1, TEXT)]),
        Message(data={TEXT: "B"}),
        Message(data={INTENT: "B"}),
        Message(data={ACTION_TEXT: "B"}),
        Message(data={ACTION_NAME: "B"}),
    ]
    # add messages to container
    table = MessageContainerForCoreFeaturization()
    table.add_all(messages)
    # lookup messages using existing texts
    message = table.lookup_message(user_text="A")
    assert message
    assert len(message.data) == 1
    assert len(message.features) == 1
    message = table.lookup_message(user_text="B")
    assert message
    assert len(message.data) == 1
Example #14
def test_container_message_lookup_fails_if_text_cannot_be_looked_up():
    table = MessageContainerForCoreFeaturization()
    with pytest.raises(ValueError, match="Expected a message with key"):
        table.lookup_message(user_text="a text not included in the table")
Example #15
def test_container_feature_lookup_fails_without_key_attribute():
    table = MessageContainerForCoreFeaturization()
    with pytest.raises(ValueError, match="Unknown key"):
        table.collect_features({TEXT: "A-unknown"})
Example #16
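
# The next test is presumably parametrized with event sequences and the number of
# distinct container entries they should produce; a minimal assumed example
# (not the original parametrization):
@pytest.mark.parametrize(
    "events, expected_num_entries",
    [
        ([ActionExecuted(action_name="a")], 1),
        ([ActionExecuted(action_name="a"), ActionExecuted(action_name="a")], 1),
    ],
)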
def test_container_derive_messages_from_events_and_add(
    events: List[Event], expected_num_entries: int
):
    lookup_table = MessageContainerForCoreFeaturization()
    lookup_table.derive_messages_from_events_and_add(events)
    assert len(lookup_table) == expected_num_entries
Example #17
def test_container_all_messages():
    message_data_list = [{INTENT: "1"}, {INTENT: "2", "other": 3}, {TEXT: "3"}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert len(container.all_messages()) == 3
Example #18
def test_container_keys():
    message_data_list = [{INTENT: "1"}, {INTENT: "2"}, {TEXT: "3", "other": 3}]
    container = MessageContainerForCoreFeaturization()
    container.add_all([Message(data=data) for data in message_data_list])
    assert set(container.keys(INTENT)) == {"1", "2"}
    assert set(container.keys(TEXT)) == {"3"}
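
# Two more helpers are referenced by the next test but defined elsewhere in the
# original module; both are sketched here under the assumption of the obvious
# implementation (not the original code).
def dummy_features(
    fill_value: int, units: int, attribute: Text, type: Text, is_sparse: bool
) -> Features:
    # A sentence or sequence feature with `units` columns, either sparse or dense.
    matrix = np.full((1, units), fill_value=fill_value, dtype=np.float32)
    if is_sparse:
        matrix = scipy.sparse.coo_matrix(matrix)
    return Features(matrix, feature_type=type, attribute=attribute, origin="dummy")


def sparse_equals_dense(
    sparse_matrix: scipy.sparse.spmatrix, dense_matrix: np.ndarray
) -> bool:
    # True iff the sparse matrix has exactly the same content as the dense reference.
    return bool(np.all(sparse_matrix.todense() == dense_matrix))


# The next test is presumably parametrized over the action name; judging from its
# branches, roughly like this (an assumption, not the original decorator):
@pytest.mark.parametrize(
    "action_name", [ACTION_LISTEN_NAME, "NOT_action_listen", None]
)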
def test_encode_state__with_lookup__looksup_or_creates_features(action_name: Text):
    """Tests that features from table are combined or created from scratch.
    If the given action name is ...
    - ACTION_LISTEN_NAME then the user substate and the action name are encoded
    - some "other" action, then the user-substate is not encoed but the action name is
    - set to "None", then we remove the action name from the user substate and as a
      result there should be no encoding for the action name and for the user substate
    """
    f = SingleStateFeaturizer()
    f._default_feature_states[INTENT] = {"greet": 0, "inform": 1}
    f._default_feature_states[ENTITIES] = {
        "city": 0,
        "name": 1,
        f"city{ENTITY_LABEL_SEPARATOR}to": 2,
        f"city{ENTITY_LABEL_SEPARATOR}from": 3,
    }
    f._default_feature_states[ACTION_NAME] = {
        "NOT_action_listen": 0,
        "utter_greet": 1,
        ACTION_LISTEN_NAME: 2,
    }
    # the `_0` suffix in the slot names represents the feature dimension
    f._default_feature_states[SLOTS] = {"slot_1_0": 0, "slot_2_0": 1, "slot_3_0": 2}
    f._default_feature_states[ACTIVE_LOOP] = {
        "active_loop_1": 0,
        "active_loop_2": 1,
        "active_loop_3": 2,
        "active_loop_4": 3,
    }

    # create state
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_name_list = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    action_text = "throw a ball"
    intent = "inform"
    state = {
        USER: {TEXT: text, INTENT: intent, ENTITIES: entity_name_list},
        PREVIOUS_ACTION: {ACTION_NAME: action_name, ACTION_TEXT: action_text},
        ACTIVE_LOOP: {"name": "active_loop_4"},
        SLOTS: {"slot_1": (1.0,)},
    }
    if action_name is None:
        del state[PREVIOUS_ACTION][ACTION_NAME]

    # Build lookup table with all relevant information - and dummy features for all
    # dense featurizable attributes.
    # Note that we don't need to add the `ENTITIES` to the message including `TEXT`
    # here because `encode_state` won't featurize the entities using the lookup table
    # (only `encode_entities` does that).
    units = 300
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add_all(
        [
            Message(
                data={TEXT: text, TOKENS_NAMES[TEXT]: tokens},
                features=[
                    dummy_features(
                        fill_value=11,
                        units=units,
                        attribute=TEXT,
                        type=SENTENCE,
                        is_sparse=True,
                    ),
                    dummy_features(
                        fill_value=12,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=False,
                    ),
                    # Note: sparse sequence feature is last here
                    dummy_features(
                        fill_value=13,
                        units=units,
                        attribute=TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    ),
                ],
            ),
            Message(data={INTENT: intent}),
            Message(
                data={ACTION_TEXT: action_text},
                features=[
                    dummy_features(
                        fill_value=1,
                        units=units,
                        attribute=ACTION_TEXT,
                        type=SEQUENCE,
                        is_sparse=True,
                    )
                ],
            ),
        ]
    )
    if action_name is not None:
        precomputations.add(Message(data={ACTION_NAME: action_name}))

    # encode the state
    encoded = f.encode_state(state, precomputations=precomputations)

    # Check that all expected attributes are encoded and that the TEXT and ACTION_TEXT
    # features are taken from the precomputations (i.e. the dummy features above).
    expected_attributes = [SLOTS, ACTIVE_LOOP, ACTION_TEXT]
    if action_name is not None:  # i.e. we did not remove it from the state
        expected_attributes += [ACTION_NAME]
    if action_name == ACTION_LISTEN_NAME:
        expected_attributes += [TEXT, ENTITIES, INTENT]
    assert set(encoded.keys()) == set(expected_attributes)

    # Remember: the sparse sequence features come first here (and `.features` refers
    # to the matrix, not a `Features` object).
    if action_name == ACTION_LISTEN_NAME:
        assert encoded[TEXT][0].features.shape[-1] == units
        assert encoded[TEXT][0].is_sparse()
        assert encoded[ENTITIES][0].features.shape[-1] == 4
        assert sparse_equals_dense(encoded[INTENT][0].features, np.array([[0, 1]]))
    assert encoded[ACTION_TEXT][0].features.shape[-1] == units
    assert encoded[ACTION_TEXT][0].is_sparse()
    if action_name is not None:
        if action_name == "NOT_action_listen":
            action_name_encoding = [1, 0, 0]
        else:  # action_listen
            action_name_encoding = [0, 0, 1]
        assert sparse_equals_dense(
            encoded[ACTION_NAME][0].features, np.array([action_name_encoding])
        )
    else:
        assert ACTION_NAME not in encoded
    assert sparse_equals_dense(encoded[SLOTS][0].features, np.array([[1, 0, 0]]))
    assert sparse_equals_dense(
        encoded[ACTIVE_LOOP][0].features, np.array([[0, 0, 0, 1]])
    )
Example #20
def test_container_fingerprints_differ_for_different_containers():
    container1 = MessageContainerForCoreFeaturization()
    container1.add(Message(data={INTENT: "1"}))
    container2 = MessageContainerForCoreFeaturization()
    container2.add(Message(data={INTENT: "2"}))
    assert container2.fingerprint() != container1.fingerprint()
def test_encode_entities__with_bilou_entity_roles_and_groups():

    # Instantiate domain and configure the single state featurizer for this domain.
    # Note that there are 2 entity tags here.
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain, bilou_tagging=True)

    # (1) example with both entities

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities,},
        precomputations=precomputations,
        bilou_tagging=True,
    )
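    # With bilou_tagging, tag ids appear to be assigned per entity tag: 0 means "no
    # entity", the first tag uses B=1, I=2, L=3, U=4, and the second tag continues
    # with 5..8. Both "London" and "Paris" are single-token entities, hence the
    # expected ids U-city -> 4 and U-city#to -> 8.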
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [4], [0], [8]]
    )

    # (2) example with only the "city" entity

    # create message that has been tokenized and where entities have been extracted
    text = "I am flying to Saint Petersburg"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Saint Petersburg",
            ENTITY_ATTRIBUTE_START: 15,
            ENTITY_ATTRIBUTE_END: 31,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # encode!
    encoded = f.encode_entities(
        {TEXT: text, ENTITIES: entities,},
        precomputations=precomputations,
        bilou_tagging=True,
    )
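    # "Saint Petersburg" spans two tokens, so they are tagged B-city (id 1) and
    # L-city (id 3), respectively.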
    assert sorted(list(encoded.keys())) == sorted([ENTITY_TAGS])
    assert np.all(encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [3]])