Example #1
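This test exercises model_data_utils.featurize_training_examples: it tokenizes and featurizes a single Message with a SpacyTokenizer, a CountVectorsFeaturizer, and a SpacyFeaturizer, then asserts that the output contains features only for the requested attributes and that the reported sparse feature sizes match the expected ones. The snippet omits its imports; a minimal sketch, assuming a Rasa 2.x-era module layout (the paths below are an assumption, not verified against a specific release):

# Assumed imports for this snippet (Rasa 2.x-era paths; treat as a sketch)
from typing import Any, Dict, List, Optional, Text

from rasa.shared.nlu.constants import TEXT, INTENT, ENTITIES
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData
from rasa.nlu.constants import SPACY_DOCS
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)
from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
from rasa.nlu.extractors.extractor import EntityTagSpec
from rasa.utils.tensorflow import model_data_utils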
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # for entities we get one feature array per entity tag spec
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
Example #2
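This test checks intent tokenization with a custom split symbol: with intent_tokenization_flag enabled and "+" as the split symbol, the tokenizer splits the intent label into the expected tokens. text, expected_tokens, and spacy_nlp are pytest parameters and fixtures supplied elsewhere; a hypothetical parametrization (values invented for illustration, not taken from the Rasa test suite) could look like:

import pytest

@pytest.mark.parametrize(
    "text, expected_tokens",
    [
        ("play+music", ["play", "music"]),  # hypothetical: "+" splits the intent
        ("greet", ["greet"]),  # hypothetical: no split symbol present
    ],
)
def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
    ...  # body as in the example below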
def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = SpacyTokenizer(component_config)

    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
Example #3
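This test covers the default behaviour: training the spaCy NLP component and the SpacyTokenizer on the demo training data without enabling intent tokenization should leave the intent_tokens attribute unset on every intent example, which is what the final assertion checks.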
def test_spacy_intent_tokenizer(spacy_nlp_component):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.train(td, config=None)
    spacy_tokenizer = SpacyTokenizer()
    spacy_tokenizer.train(td, config=None)

    intent_tokens_exist = [
        example.get("intent_tokens") is not None for example in td.intent_examples
    ]

    # no intent tokens should have been set
    assert not any(intent_tokens_exist)
Example #4
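This test trains the tokenizer on a message carrying both a TEXT and a RESPONSE attribute and asserts that the tokens and their start and end character offsets match the expected values for both attributes. The older constant names (TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE) indicate a Rasa 1.x-era snippet. A hypothetical parametrization (values invented for illustration) could look like:

import pytest

@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [
        # hypothetical values; each index pair is (start, end) into the text
        ("hello world", ["hello", "world"], [(0, 5), (6, 11)]),
    ],
)
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    ...  # body as in the example below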
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #5
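This test trains a CRFEntityExtractor on two annotated messages. The annotations are chosen deliberately: the "west" location carries no extractor attribution and the "indian" cuisine is attributed to CRFEntityExtractor, so both are trainable, while the "central" location is attributed to a different extractor ("random_extractor") and is ignored during training. The three feature lists in the component config describe the CRF features for the previous, the current, and the next token. After training, processing "italian restaurant" is expected to yield exactly one entity, with "italian" labelled as cuisine.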
def test_crf_extractor(spacy_nlp):
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=examples)
    tokenizer.train(training_data)
    extractor.train(training_data)

    sentence = "italian restaurant"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer.process(message)
    extractor.process(message)

    detected_entities = message.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"