Example #1
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tk = WhitespaceTokenizer()

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
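Note: the `supervised_embeddings_config` fixture is not part of this listing. A minimal sketch of what it plausibly looks like (the template name and language are assumptions):

import pytest

from rasa.nlu.config import RasaNLUModelConfig


@pytest.fixture
def supervised_embeddings_config():
    # Assumption: the fixture simply wraps the registered
    # "supervised_embeddings" pipeline template, which is all the
    # tokenizer test needs.
    return RasaNLUModelConfig({"language": "en", "pipeline": "supervised_embeddings"})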
Example #2
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
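Note: the `tokens` and `expected` arguments come from a `pytest.mark.parametrize` decorator that this listing omits. A hypothetical sketch (the values are illustrative assumptions, not the originals):

import pytest


@pytest.mark.parametrize(
    "tokens, expected",
    [(["hello", "how", "are", "you"], [1, 1, 1, 1])],  # illustrative values
)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ...  # body as in the example above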
Example #3
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(text)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            "Hi Max!",
            data={"entities": [{"entity": "person", "value": "Max"}]},
        ),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
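Note: here too the `text`, `lookup`, and `expected_entities` parametrize values are omitted. A hypothetical row, assuming the extractor tags lookup-table matches with its own name (values illustrative):

import pytest


@pytest.mark.parametrize(
    "text, lookup, expected_entities",
    [
        (
            "Berlin and London are cities.",
            [{"name": "city", "elements": ["Berlin", "London", "Amsterdam"]}],
            [
                {"entity": "city", "value": "Berlin", "start": 0, "end": 6,
                 "extractor": "RegexEntityExtractor"},
                {"entity": "city", "value": "London", "start": 11, "end": 17,
                 "extractor": "RegexEntityExtractor"},
            ],
        )
    ],
)
def test_process(text, lookup, expected_entities):
    ...  # body as in the example above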
Example #4
def test_count_vector_featurizer_response_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[INTENT]).toarray()[0] == intent_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (train_message.get(
            SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0] == response_features)
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
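Note: a hypothetical parametrize row for the omitted decorator; a `None` response should leave the response features unset (values are illustrative assumptions):

import pytest


@pytest.mark.parametrize(
    "sentence, intent, response, intent_features, response_features",
    [("Hello!", "greet", None, [[1]], None)],  # illustrative values
)
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ...  # body as in the example above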
Example #5
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5),
                                                 _config)
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 4,
                    "end": 11,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent":
                "restaurant_search",
                "entities": [{
                    "start": 7,
                    "end": 12,
                    "value": "Mexican",
                    "entity": "cuisine"
                }],
            },
        ),
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
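    # "Tacos" (characters 7-12) is deliberately annotated with the value
    # "Mexican", so the mapper learns the synonym tacos -> Mexican, while
    # "Mexican" maps to itself and gets no entry.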
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #6
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0]
        == expected
    )
Example #7
def test_count_vector_featurizer_using_tokens(tokens, expected):

    ftr = CountVectorsFeaturizer()

    # use an empty string instead of a real text string to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not give the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #8
def test_text_featurizer(sentence, expected_features):
    featurizer = LexicalSyntacticFeaturizer({
        "features": [
            ["BOS", "upper"],
            ["BOS", "EOS", "prefix2", "digit"],
            ["EOS", "low"],
        ]
    })

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None

    assert np.all(seq_vec.toarray() == expected_features[:-1])
Example #9
def test_text_featurizer(sentence, expected_features):
    featurizer = LexicalSyntacticFeaturizer({
        "features": [
            ["BOS", "upper"],
            ["BOS", "EOS", "prefix2", "digit"],
            ["EOS", "low"],
        ]
    })

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    assert isinstance(test_message.get(SPARSE_FEATURE_NAMES[TEXT]),
                      scipy.sparse.coo_matrix)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual == expected_features)
Example #10
def test_build_tag_id_dict():
    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    training_data = TrainingData([message_1, message_2])

    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)

    assert tag_id_dict == {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "U-location": 3,
        "L-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "U-organisation": 7,
        "L-organisation": 8,
    }
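Note: a small illustration of the BILOU scheme behind these tag ids, using message_1 from the test above:

# BILOU tags for "Germany is part of the European Union":
#   Germany        -> U-location      (Unit: single-token entity)
#   is part of the -> O               (Outside any entity)
#   European       -> B-organisation  (Beginning of a multi-token entity)
#   Union          -> L-organisation  (Last token of that entity)
tags = ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"]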
Example #11
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]}
    )
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the U.K.",
        {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west"),
        },
    )

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 16,
        "extractor": "SpacyEntityExtractor",
        "end": 20,
        "value": "U.K.",
        "entity": "GPE",
        "confidence": None,
    }

    # Test dimension filtering includes only specified dimensions

    example = Message(
        "anywhere in the West with Sebastian Thrun",
        {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc":
            spacy_nlp("anywhere in the West with Sebastian Thrun"),
        },
    )
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]}
    )

    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 26,
        "extractor": "SpacyEntityExtractor",
        "end": 41,
        "value": "Sebastian Thrun",
        "entity": "PERSON",
        "confidence": None,
    }
Example #12
def test_count_vector_featurizer_persist_load(tmpdir):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # set non-default values in the config
    config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(sentence1)
    train_message2 = Message(sentence2)

    # this is needed for a valid training example
    train_message1.set("intent", "bla")
    train_message2.set("intent", "bla")
    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)
    # persist featurizer
    file_dict = train_ftr.persist("ftr", tmpdir.strpath)
    train_vect_params = train_ftr.vectorizer.get_params()
    # add trained vocabulary to vectorizer params
    train_vect_params.update({"vocabulary": train_ftr.vectorizer.vocabulary_})

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, tmpdir.strpath)
    test_vect_params = test_ftr.vectorizer.get_params()

    assert train_vect_params == test_vect_params

    test_message1 = Message(sentence1)
    test_ftr.process(test_message1)
    test_message2 = Message(sentence2)
    test_ftr.process(test_message2)

    # check that train features and test features after loading are the same
    assert np.all([
        train_message1.get("text_features") == test_message1.get(
            "text_features"),
        train_message2.get("text_features") == test_message2.get(
            "text_features"),
    ])
Example #13
def test_apply_bilou_schema():
    tokenizer = WhitespaceTokenizer()

    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    tokenizer.train(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
        "O",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
        "O",
    ]
Example #14
def test_count_vector_featurizer_char(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #15
def test_do_not_overwrite_any_entities():
    message = Message("Max lives in Berlin.")
    message.set(
        ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}]
    )

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            "Hi Max!",
            data={"entities": [{"entity": "person", "value": "Max"}]},
        ),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
Example #16
def test_count_vector_featurizer_oov_token(sentence, expected):
    ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
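Note: for `OOV_token` to have an effect, the token has to occur in the training text itself. A hypothetical parametrize row (values are illustrative assumptions):

import pytest


@pytest.mark.parametrize(
    "sentence, expected",
    [("hello __oov__", [0, 1])],  # illustrative: "__oov__" sorts before "hello"
)
def test_count_vector_featurizer_oov_token(sentence, expected):
    ...  # body as in the example above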
Example #17
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #18
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]}
    )
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the West", {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west")
        })
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'SpacyEntityExtractor',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None
    }

    # Test dimension filtering includes only specified dimensions

    example = Message(
        "anywhere in the West with Sebastian Thrun", {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")
        })
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "SpacyEntityExtractor"}]}
    )
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None
    }
Example #19
    def parse(
        self,
        text: Text,
        time: Optional[datetime.datetime] = None,
        only_output_properties: bool = True,
    ) -> Dict[Text, Any]:
        """Parse the input text, classify it and return pipeline result.

        The pipeline result usually contains intent and entities."""

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one
            # should pass an empty string in the first place.
            output = self.default_output_attributes()
            output["text"] = ""
            return output

        message = Message(text, self.default_output_attributes(), time=time)

        for component in self.pipeline:
            component.process(message, **self.context)

        output = self.default_output_attributes()
        output.update(message.as_dict(only_output_properties=only_output_properties))
        return output
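Note: a minimal usage sketch for `parse` (assumes a model trained and persisted elsewhere; the model path is an assumption):

from rasa.nlu.model import Interpreter

# Hypothetical model directory; any trained NLU model works here.
interpreter = Interpreter.load("models/nlu")
result = interpreter.parse("I want Mexican food")
print(result["intent"], result["entities"])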
Example #20
def test_count_vectors_featurizer_train():

    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Example #21
def test_count_vector_featurizer_attribute_featurization(
        sentence, intent, response, intent_features, response_features):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(
        INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, [])
    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #22
    def filter_trainable_entities(
            self, entity_examples: List[Message]) -> List[Message]:
        """Filters out untrainable entity annotations.

        Creates a copy of entity_examples in which entities that have
        `extractor` set to something other than
        self.name (e.g. 'CRFEntityExtractor') are removed.
        """

        filtered = []
        for message in entity_examples:
            entities = []
            for ent in message.get(ENTITIES, []):
                extractor = ent.get(EXTRACTOR)
                if not extractor or extractor == self.name:
                    entities.append(ent)
            data = message.data.copy()
            data[ENTITIES] = entities
            filtered.append(
                Message(
                    text=message.text,
                    data=data,
                    output_properties=message.output_properties,
                    time=message.time,
                ))

        return filtered
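Note: a usage sketch for `filter_trainable_entities`; the concrete extractor and entity values are assumptions for illustration, since the method lives on the extractor base class:

from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

message = Message(
    "I live in Berlin",
    data={
        "entities": [
            # tagged by a different extractor -> removed from the copy
            {"entity": "city", "value": "Berlin", "extractor": "RegexEntityExtractor"},
            # no extractor set -> kept as trainable
            {"entity": "city", "value": "Berlin"},
        ]
    },
)

extractor = CRFEntityExtractor()
assert len(extractor.filter_trainable_entities([message])[0].get(ENTITIES)) == 1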
Example #23
def test_convert_featurizer_train():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
Example #24
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
    sentence = "Hey how are you today ?"
    message = Message(sentence)
    show_message(message)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    assert show_message(message, False) == {
        "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
        "text": "Hey how are you today ?"
    }
    featurizer.process(message)
    show_message(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    assert len(tokens) == len(vecs)
    assert len(vecs) == 6
    assert len(tokens) == 6
    assert len(vecs[0]) == 1024
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example #25
def test_regex_featurizer_no_sequence(sentence, expected, expected_cls,
                                      spacy_nlp):

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray()[0], expected, atol=1e-10)
    assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)
Example #26
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #27
def test_compute_default_label_features():
    label_features = [
        Message("test a"),
        Message("test b"),
        Message("test c"),
        Message("test d"),
    ]

    output = DIETClassifier._compute_default_label_features(label_features)

    output = output[0]

    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))
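Note: per the assertions, `_compute_default_label_features` produces a one-hot vector per label; for the four labels above they are the rows of an identity matrix:

import numpy as np

# Row i is the default (1, num_labels) feature vector for label i.
default_features = np.eye(4)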
Example #28
def test_spacy_featurizer_train(spacy_nlp):

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
Example #29
def test_train_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(INTENT_ATTRIBUTE, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(
        TOKENS_NAMES[INTENT_ATTRIBUTE])

    assert [t.text for t in tokens] == [text]
Example #30
def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    tokens = tokenizer.tokenize(Message(sentence), attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
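Note: the parametrize pairs for this test are omitted as well; `_tokens_to_text` should reassemble ConveRT subword tokens into the input text. A hypothetical pair (illustrative values):

import pytest


@pytest.mark.parametrize(
    "sentence, expected_text",
    [("forecast for lunch", "forecast for lunch")],  # illustrative values
)
def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text):
    ...  # body as in the example above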