def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    spacy_featurizer = SpacyFeaturizer()
    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))

    white_space_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
示例#2
0
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {
                0: "O",
                1: "name",
                2: "location"
            },
            {
                "O": 0,
                "name": 1,
                "location": 2
            },
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have space sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have space sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
示例#3
0
def create_spacy_featurizer(config: Dict[Text, Any]) -> SpacyFeaturizer:
    return SpacyFeaturizer(
        {
            **SpacyFeaturizer.get_default_config(),
            **config
        },
        "spacy_featurizer",
    )
示例#4
0
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):

    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model)
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        {
            **CRFEntityExtractor.get_default_config(),
            **config_params
        },
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
示例#5
0
def test_spacy_intent_featurizer(spacy_nlp_component):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.train(td, config=None)
    spacy_featurizer = SpacyFeaturizer()
    spacy_featurizer.train(td, config=None)

    intent_features_exist = np.array([
        True if example.get("intent_features") is not None else False
        for example in td.intent_examples
    ])

    # no intent features should have been set
    assert not any(intent_features_exist)
示例#6
0
def test_spacy_featurizer_train(spacy_nlp):

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
示例#7
0
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {TEXT: sentence, "intent": "greet", "text_features": [0.5]}

    message = Message(data=greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
def test_spacy_featurizer_train(spacy_nlp):

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(RESPONSE, [])

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sen_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sen_vecs is None
def test_crf_use_dense_features(
    crf_entity_extractor: Callable[[Dict[Text, Any]],
                                   CRFEntityExtractorGraphComponent],
    spacy_nlp: Any,
):
    component_config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    crf_extractor = crf_entity_extractor(component_config)

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (features[0]["0:text_dense_features"]["text_dense_features"][
            str(i)] == dense_features[0][i])
示例#10
0
def test_spacy_featurizer(sentence, spacy_nlp):

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    doc = spacy_nlp(sentence)
    vecs = ftr._features_for_doc(doc)
    expected = [t.vector for t in doc]

    assert np.allclose(vecs, expected, atol=1e-5)
示例#11
0
def test_spacy_featurizer(sentence, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ftr = SpacyFeaturizer.create({"return_sequence": True},
                                 RasaNLUModelConfig())

    doc = spacy_nlp(sentence)
    vecs = ftr._features_for_doc(doc)
    expected = [t.vector for t in doc]
    assert np.allclose(vecs, expected, atol=1e-5)
def test_crf_use_dense_features(spacy_nlp: Any):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features = message.get_dense_features(TEXT, [])
    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
示例#13
0
def test_spacy_featurizer_cls_vector(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))

    featurizer._set_spacy_features(message)

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
示例#14
0
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}

    message = Message(sentence, greet)
    message.set("spacy_doc", doc)

    ftr._set_spacy_features(message)

    vecs = message.get("text_dense_features")[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
示例#15
0
def test_spacy_featurizer_using_empty_model():
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    sentence = "This test is using an empty spaCy model"

    model = spacy.blank("en")
    doc = model(sentence)

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert vecs is None
示例#16
0
def test_spacy_featurizer_no_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)

    spacy_config = {"return_sequence": False}
    ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}

    message = Message(sentence, greet)
    message.set("spacy_doc", doc)

    ftr._set_spacy_features(message)

    vecs = message.get("text_dense_features")[0]

    assert np.allclose(doc.vector, vecs, atol=1e-4)
    assert np.allclose(expected, vecs[:5], atol=1e-4)
示例#17
0
def test_spacy_featurizer_casing(spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    # if this starts failing for the default model, we should think about
    # removing the lower casing the spacy nlp component does when it
    # retrieves vectors. For compressed spacy models (e.g. models
    # ending in _sm) this test will most likely fail.

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    for e in td.intent_examples:
        doc = spacy_nlp(e.text)
        doc_capitalized = spacy_nlp(e.text.capitalize())

        vecs = ftr._features_for_doc(doc)
        vecs_capitalized = ftr._features_for_doc(doc_capitalized)

        assert np.allclose(
            vecs, vecs_capitalized,
            atol=1e-5), "Vectors are unequal for texts '{}' and '{}'".format(
                e.text, e.text.capitalize())
示例#18
0
def test_spacy_featurizer_cls_vector(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    featurizer._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    assert 5 == len(seq_vecs)
    assert 1 == len(sen_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_cls, atol=1e-5)
示例#19
0
def test_spacy_featurizer_using_empty_model():
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    sentence = "This test is using an empty spaCy model"

    model = spacy.blank("en")
    doc = model(sentence)

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert seq_vecs is None
    assert sen_vecs is None
示例#20
0
from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer
from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer

from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.nlu.tokenizers.convert_tokenizer import ConveRTTokenizer
from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer

from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import CountVectorsFeaturizer

from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, SPARSE_FEATURE_NAMES

logger = logging_setup()

featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

test_input = "Okay, pick up this yellow banana for me."
message = Message(test_input)

message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
featurizer._set_spacy_features(message)
vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("SpaCy: {}".format(vecs.shape))

message = Message(test_input)
featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())
MitieTokenizer().process(message)
tokens = message.get(TOKENS_NAMES[TEXT])
vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
logger.info("Mitie: {}".format(vecs.shape))
示例#21
0
def spacy_featurizer() -> SpacyFeaturizer:
    return SpacyFeaturizer(SpacyFeaturizer.get_default_config(),
                           name="SpacyFeaturizer")