Example #1
async def test_train_persist_load_with_composite_entities(
        component_builder: ComponentBuilder, tmp_path: Path):
    pipeline = pipeline_from_components("WhitespaceTokenizer",
                                        "CRFEntityExtractor")

    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    (trainer, trained, persisted_path) = await rasa.nlu.train.train(
        _config,
        path=str(tmp_path),
        data="data/test/demo-rasa-composite-entities.yml",
        component_builder=component_builder,
    )

    assert trainer.pipeline
    assert trained.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)

    assert loaded.pipeline
    text = "I am looking for an italian restaurant"
    assert loaded.parse(text) == trained.parse(text)
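
The `pipeline_from_components` helper comes from the surrounding test suite and is not shown on this page. A plausible minimal equivalent, assuming it only wraps component names into pipeline entries (this definition is a guess, not the original), would be:

from typing import Any, Dict, List, Text


def pipeline_from_components(*components: Text) -> List[Dict[Text, Any]]:
    # Hypothetical stand-in: turn each component name into a {"name": ...} entry,
    # matching the pipeline format RasaNLUModelConfig expects.
    return [{"name": component} for component in components]
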
Example #2
def test_spacy_featurizer_casing(spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    # if this starts failing for the default model, we should think about
    # removing the lowercasing that the spaCy NLP component applies when it
    # retrieves word vectors. For small spaCy models (e.g. models ending in
    # _sm, which ship no real word vectors) this test will most likely fail.

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    for e in td.intent_examples:
        doc = spacy_nlp(e.text)
        doc_capitalized = spacy_nlp(e.text.capitalize())

        vecs = ftr._features_for_doc(doc)
        vecs_capitalized = ftr._features_for_doc(doc_capitalized)

        assert np.allclose(
            vecs, vecs_capitalized,
            atol=1e-5), "Vectors are unequal for texts '{}' and '{}'".format(
                e.text, e.text.capitalize())
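
The comment in Example #2 warns that small spaCy models (the `_sm` variants) ship no real word vectors, so the casing comparison can fail for them. A standalone way to check which kind of model you have (a sketch, assuming spaCy and a model such as `en_core_web_md` are installed):

import spacy

nlp = spacy.load("en_core_web_md")   # swap in the model your pipeline uses
doc = nlp("italian restaurant")

print(doc.vector_norm > 0)           # True only if real word vectors are present
print(nlp.vocab.vectors.shape)       # (rows, dims); empty for vector-less models
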
Example #3
async def test_train_tensorboard_logging(component_builder, tmpdir):
    from pathlib import Path

    tensorboard_log_dir = Path(tmpdir.strpath) / "tensorboard"

    assert not tensorboard_log_dir.exists()

    _config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {
                    "name": "DIETClassifier",
                    EPOCHS: 3,
                    TENSORBOARD_LOG_LEVEL: "epoch",
                    TENSORBOARD_LOG_DIR: str(tensorboard_log_dir),
                    EVAL_NUM_EXAMPLES: 15,
                    EVAL_NUM_EPOCHS: 1,
                },
            ],
            "language": "en",
        }
    )

    await train(
        _config,
        path=tmpdir.strpath,
        data="data/examples/rasa/demo-rasa-multi-intent.md",
        component_builder=component_builder,
    )

    assert tensorboard_log_dir.exists()

    all_files = list(tensorboard_log_dir.rglob("*.*"))
    assert len(all_files) == 3
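
Several examples on this page use pytest's legacy `tmpdir` fixture (a `py.path.local` object, hence `.strpath`), while others use the newer pathlib-based `tmp_path`; Example #3 converts between the two with `Path(tmpdir.strpath)`. A minimal sketch of that relationship:

from pathlib import Path


def test_tmp_fixtures(tmpdir, tmp_path):
    # tmp_path is a pathlib.Path; tmpdir is the older py.path.local wrapper.
    assert isinstance(tmp_path, Path)
    assert Path(tmpdir.strpath).is_dir()
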
Example #4
async def test_inner_linear_normalization(
    component_builder: ComponentBuilder,
    tmp_path: Path,
    classifier_params: Dict[Text, Any],
    data_path: Text,
    monkeypatch: MonkeyPatch,
):
    pipeline = as_pipeline("WhitespaceTokenizer", "CountVectorsFeaturizer",
                           "DIETClassifier")
    assert pipeline[2]["name"] == "DIETClassifier"
    pipeline[2].update(classifier_params)

    _config = RasaNLUModelConfig({"pipeline": pipeline})
    (trained_model, _, persisted_path) = await rasa.nlu.train.train(
        _config,
        path=str(tmp_path),
        data=data_path,
        component_builder=component_builder,
    )
    loaded = Interpreter.load(persisted_path, component_builder)

    mock = Mock()
    monkeypatch.setattr(train_utils, "normalize", mock.normalize)

    parse_data = loaded.parse("hello")
    intent_ranking = parse_data.get("intent_ranking")

    # check whether normalization had the expected effect
    output_sums_to_1 = sum([
        intent.get("confidence") for intent in intent_ranking
    ]) == pytest.approx(1)
    assert output_sums_to_1

    # check whether the normalization of rankings is reflected in intent prediction
    assert parse_data.get("intent") == intent_ranking[0]

    # normalize shouldn't have been called
    mock.normalize.assert_not_called()
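
Example #4 replaces `train_utils.normalize` with a `Mock` so it can assert that the helper was never called during inference. The same monkeypatch-plus-Mock pattern in isolation (the patched target here, `math.sqrt`, is chosen purely for illustration):

import math
from unittest.mock import Mock


def test_helper_is_not_called(monkeypatch):
    mock = Mock()
    monkeypatch.setattr(math, "sqrt", mock.sqrt)  # patch lasts only for this test

    # ... exercise code that is expected to avoid math.sqrt ...

    mock.sqrt.assert_not_called()
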
Example #5
def test_train_selector(pipeline, component_builder, tmpdir):
    # use data that includes some responses
    training_data = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.md")
    training_data_responses = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa-responses.md")
    training_data = training_data.merge(training_data_responses)

    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})

    trainer = Trainer(nlu_config)
    trainer.train(training_data)

    persisted_path = trainer.persist(tmpdir)

    assert trainer.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)
    parsed = loaded.parse("hello")

    assert loaded.pipeline
    assert parsed is not None
    assert (parsed.get("response_selector").get("all_retrieval_intents")) == [
        "chitchat"
    ]
    assert (parsed.get("response_selector").get("default").get("response").get(
        "intent_response_key")) is not None
    assert (parsed.get("response_selector").get("default").get("response").get(
        "template_name")) is not None
    assert (parsed.get("response_selector").get("default").get("response").get(
        "response_templates")) is not None

    ranking = parsed.get("response_selector").get("default").get("ranking")
    assert ranking is not None

    for rank in ranking:
        assert rank.get("confidence") is not None
        assert rank.get("intent_response_key") is not None
Example #6
async def test_raise_error_on_incorrect_pipeline(component_builder, tmp_path: Path):
    _config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "DIETClassifier", EPOCHS: 1},
            ],
            "language": "en",
        }
    )

    with pytest.raises(Exception) as e:
        await train(
            _config,
            path=str(tmp_path),
            data=DEFAULT_DATA_PATH,
            component_builder=component_builder,
        )

    assert (
        "'DIETClassifier' requires ['Featurizer']. "
        "Add required components to the pipeline." in str(e.value)
    )
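
Example #6 inspects `e.value` after the `with pytest.raises(...)` block. An equivalent option is the `match` argument, which runs a regular expression search against the exception message; the sketch below uses `re.escape` because the expected text contains regex metacharacters:

import re

import pytest


def test_error_message_is_checked():
    expected = "'DIETClassifier' requires ['Featurizer']."
    with pytest.raises(Exception, match=re.escape(expected)):
        raise Exception(expected + " Add required components to the pipeline.")
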
Example #7
def test_train_model_without_data():
    td = load_data(DEFAULT_DATA_PATH)
    # language, pipeline = pipelines_for_tests()[1]
    # show_dict(pipeline)
    # exit()
    language = "en"
    pipeline = load_json(
        "{}/test_case/test_pipelines/config_pipeline.json".format(prj_dir))
    # exit()
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": language})

    trainer = Trainer(_config)
    trainer.train(td)
    persisted_path = trainer.persist(model_dir)
    loaded = Interpreter.load(persisted_path)
    assert loaded.pipeline

    # Inference
    # result = loaded.parse("i'm looking for a place in the north of town")
    result = loaded.parse("show me chinese restaurants")
    result = dict(
        filter(lambda item: item[0] not in ["intent_ranking"], result.items()))
    show_dict(result)
Example #8
async def test_train_model_training_data_persisted(component_builder, tmpdir):
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "KeywordIntentClassifier"}], "language": "en"}
    )

    (trained, _, persisted_path) = await train(
        _config,
        path=tmpdir.strpath,
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
        persist_nlu_training_data=True,
    )

    assert trained.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)

    assert loaded.pipeline
    assert loaded.model_metadata.get("training_data") is not None
Example #9
async def test_elmo_train(component_builder, tmpdir):
    pipeline = [
        {"name": "WhitespaceTokenizer"},
        {"name": "ElmoFeaturizer"},
        {"name": "CountVectorsFeaturizer"},
        {"name": "EmbeddingIntentClassifier"},
    ]

    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    (trained, _, persisted_path) = await train(
        _config,
        path=tmpdir.strpath,
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
    )

    assert trained.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)
    assert loaded.pipeline
    assert loaded.parse("hello") is not None
    assert loaded.parse("Hello today is Monday, again!") is not None
Example #10
def test_train_selector(pipeline, component_builder, tmpdir):
    # use data that includes some responses
    td = load_data("data/examples/rasa/demo-rasa.md")
    td_responses = load_data("data/examples/rasa/demo-rasa-responses.md")
    td = td.merge(td_responses)
    td.fill_response_phrases()

    nlu_config = RasaNLUModelConfig({"language": "en", "pipeline": pipeline})

    trainer = Trainer(nlu_config)
    trainer.train(td)

    persisted_path = trainer.persist(tmpdir)

    assert trainer.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)
    parsed = loaded.parse("hello")

    assert loaded.pipeline
    assert parsed is not None
    assert (parsed.get(RESPONSE_SELECTOR_PROPERTY_NAME).get("default").get(
        "full_retrieval_intent")) is not None
Example #11
def test_convert_featurizer_train(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(
        ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]),
                     RasaNLUModelConfig(),
                     tf_hub_module=tokenizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get_dense_features(TEXT, [])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get_dense_features(RESPONSE, [])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get_dense_features(INTENT, [])

    assert vecs is None
Example #12
    def train(self, request):
        # if not set, the default project name (e.g. "default") is used
        project = parameter_or_default(request, "project", default=None)
        # if set, no model name is generated; the passed one is used instead
        model_name = parameter_or_default(request, "model", default=None)

        try:
            model_config, data = self.extract_data_and_config(request)

        except Exception as e:
            request.setResponseCode(400)
            returnValue(json_to_string({"error": "{}".format(e)}))

        data_file = dump_to_data_file(data)

        request.setHeader('Content-Type', 'application/zip')

        try:
            request.setResponseCode(200)
            request.setHeader("Content-Disposition", "attachment")
            path_to_model = yield self.data_router.start_train_process(
                data_file, project, RasaNLUModelConfig(model_config),
                model_name)
            zipped_path = utils.zip_folder(path_to_model)

            zip_content = io.open(zipped_path, 'r+b').read()
            return returnValue(zip_content)

        except MaxTrainingError as e:
            request.setResponseCode(403)
            returnValue(json_to_string({"error": "{}".format(e)}))
        except InvalidProjectError as e:
            request.setResponseCode(404)
            returnValue(json_to_string({"error": "{}".format(e)}))
        except TrainingException as e:
            request.setResponseCode(500)
            returnValue(json_to_string({"error": "{}".format(e)}))
Example #13
async def test_softmax_normalization(
    component_builder,
    tmp_path,
    classifier_params,
    data_path,
    output_length,
    output_should_sum_to_1,
):
    pipeline = as_pipeline(
        "WhitespaceTokenizer", "CountVectorsFeaturizer", "DIETClassifier"
    )
    assert pipeline[2]["name"] == "DIETClassifier"
    pipeline[2].update(classifier_params)

    _config = RasaNLUModelConfig({"pipeline": pipeline})
    (trained_model, _, persisted_path) = await train(
        _config,
        path=str(tmp_path),
        data=data_path,
        component_builder=component_builder,
    )
    loaded = Interpreter.load(persisted_path, component_builder)

    parse_data = loaded.parse("hello")
    intent_ranking = parse_data.get("intent_ranking")
    # check that the output was correctly truncated after normalization
    assert len(intent_ranking) == output_length

    # check whether normalization had the expected effect
    output_sums_to_1 = sum(
        [intent.get("confidence") for intent in intent_ranking]
    ) == pytest.approx(1)
    assert output_sums_to_1 == output_should_sum_to_1

    # check whether the normalization of rankings is reflected in intent prediction
    assert parse_data.get("intent") == intent_ranking[0]
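
The `pytest.approx(1)` comparison above is what makes the confidence-sum check robust to floating-point rounding; a plain `== 1` could fail even for a correctly normalized ranking. A two-line illustration:

import pytest

assert 0.1 + 0.2 != 0.3                  # exact float comparison fails
assert 0.1 + 0.2 == pytest.approx(0.3)   # approx absorbs the rounding error
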
Example #14
async def test_set_random_seed(component_builder, tmpdir):
    """test if train result is the same for two runs of tf embedding"""

    # set fixed random seed
    _config = RasaNLUModelConfig(
        {
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", RANDOM_SEED: 1, EPOCHS: 1},
            ],
            "language": "en",
        }
    )

    # first run
    (trained_a, _, persisted_path_a) = await train(
        _config,
        path=tmpdir.strpath + "_a",
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
    )
    # second run
    (trained_b, _, persisted_path_b) = await train(
        _config,
        path=tmpdir.strpath + "_b",
        data=DEFAULT_DATA_PATH,
        component_builder=component_builder,
    )

    loaded_a = Interpreter.load(persisted_path_a, component_builder)
    loaded_b = Interpreter.load(persisted_path_b, component_builder)
    result_a = loaded_a.parse("hello")["intent"]["confidence"]
    result_b = loaded_b.parse("hello")["intent"]["confidence"]

    assert result_a == result_b
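
Fixing `RANDOM_SEED` pins weight initialization and data shuffling, which is why the two runs above end up with identical confidences. The same idea illustrated with NumPy alone (unrelated to Rasa's internals):

import numpy as np

first = np.random.default_rng(seed=1).normal(size=3)
second = np.random.default_rng(seed=1).normal(size=3)
assert np.array_equal(first, second)  # identical seed, identical draws
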
Example #15
def test_spacy_featurizer_using_empty_model():
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    sentence = "This test is using an empty spaCy model"

    model = spacy.blank("en")
    doc = model(sentence)

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert seq_vecs is None
    assert sen_vecs is None
Example #16
async def test_train_persist_load_with_composite_entities(
        classifier_params, component_builder, tmpdir):
    pipeline = as_pipeline("WhitespaceTokenizer", "CountVectorsFeaturizer",
                           "DIETClassifier")
    assert pipeline[2]["name"] == "DIETClassifier"
    pipeline[2].update(classifier_params)

    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    (trainer, trained, persisted_path) = await rasa.nlu.train.train(
        _config,
        path=tmpdir.strpath,
        data="data/test/demo-rasa-composite-entities.yml",
        component_builder=component_builder,
    )

    assert trainer.pipeline
    assert trained.pipeline

    loaded = Interpreter.load(persisted_path, component_builder)

    assert loaded.pipeline
    text = "I am looking for an italian restaurant"
    assert loaded.parse(text) == trained.parse(text)
Example #17
def test_duckling_entity_extractor_and_synonyms(component_builder):
    _config = RasaNLUModelConfig({
        "pipeline": [{
            "name": "DucklingHTTPExtractor"
        }, {
            "name": "EntitySynonymMapper"
        }]
    })
    _config.set_component_attr(0, dimensions=["number"])
    duckling = component_builder.create_component(_config.for_component(0),
                                                  _config)
    synonyms = component_builder.create_component(_config.for_component(1),
                                                  _config)
    message = Message("He was 6 feet away")
    duckling.process(message)
    # checks that the synonym processor
    # can handle entities that have int values
    synonyms.process(message)
    assert message is not None
Example #18
def test_set_attr_on_component():
    _config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "SpacyNLP"},
                {"name": "SpacyTokenizer"},
                {"name": "SpacyFeaturizer"},
                {"name": "DIETClassifier"},
            ],
        }
    )
    idx_classifier = _config.component_names.index("DIETClassifier")
    idx_tokenizer = _config.component_names.index("SpacyTokenizer")

    _config.set_component_attr(idx_classifier, epochs=10)

    assert _config.for_component(idx_tokenizer) == {"name": "SpacyTokenizer"}
    assert _config.for_component(idx_classifier) == {
        "name": "DIETClassifier",
        "epochs": 10,
    }
Example #19
def blank_config() -> RasaNLUModelConfig:
    return RasaNLUModelConfig({"language": "en", "pipeline": []})
Example #20
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                "spacy_doc": spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                "spacy_doc": spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    # uses BILOU and the default features
    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())
    sentence = "anywhere in the west"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = ext._from_text_to_crf(Message(sentence, doc))
    assert [word[0]
            for word in crf_format] == ["anywhere", "in", "the", "west"]
    feats = ext._sentence_to_features(crf_format)
    assert "BOS" in feats[0]
    assert "EOS" in feats[-1]
    assert feats[1]["0:low"] == "in"
    sentence = "anywhere in the west"
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    filtered = ext.filter_trainable_entities(examples)
    assert filtered[0].get("entities") == [{
        "start": 16,
        "end": 20,
        "value": "west",
        "entity": "location"
    }], "Entity without extractor remains"
    assert filtered[1].get("entities") == [{
        "start": 8,
        "end": 14,
        "value": "indian",
        "entity": "cuisine",
        "extractor": "CRFEntityExtractor",
    }], "Only CRFEntityExtractor entity annotation remains"
    assert examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
Example #21
def test_duckling_entity_extractor(component_builder):
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Today","start":0,"value":{"values":[{
             "value":"2018-11-13T00:00:00.000-08:00","grain":"day",
             "type":"value"}],"value":"2018-11-13T00:00:00.000-08:00",
             "grain":"day","type":"value"},"end":5,
             "dim":"time","latent":false},{"body":"the 5th","start":9,
             "value":{"values":[{
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-01-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-02-05T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":16,"dim":"time",
             "latent":false},{"body":"5th of May","start":13,"value":{
             "values":[{
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2020-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2021-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"}],
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},"end":23,"dim":"time",
             "latent":false},{"body":"tomorrow","start":37,"value":{
             "values":[{
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":45,"dim":"time",
             "latent":false}]""",
    )
    httpretty.enable()

    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "DucklingHTTPExtractor"
        }]})
    _config.set_component_attr(0,
                               dimensions=["time"],
                               timezone="UTC",
                               url="http://localhost:8000")
    duckling = component_builder.create_component(_config.for_component(0),
                                                  _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 4

    # Test duckling with a defined date

    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"tomorrow","start":12,"value":{"values":[{
             "value":"2013-10-13T00:00:00.000Z","grain":"day",
             "type":"value"}],"value":"2013-10-13T00:00:00.000Z",
             "grain":"day","type":"value"},"end":20,
             "dim":"time","latent":false}]""",
    )

    # 1381536182 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

    # Test dimension filtering includes only specified dimensions
    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "DucklingHTTPExtractor"
        }]})
    _config.set_component_attr(0,
                               dimensions=["number"],
                               url="http://localhost:8000")
    ducklingNumber = component_builder.create_component(
        _config.for_component(0), _config)
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Yesterday","start":0,"value":{"values":[{
            "value":"2019-02-28T00:00:00.000+01:00","grain":"day",
            "type":"value"}],"value":"2019-02-28T00:00:00.000+01:00",
            "grain":"day","type":"value"},"end":9,"dim":"time"},
            {"body":"5","start":21,"value":{"value":5,"type":"value"},
            "end":22,"dim":"number"}]""",
    )

    message = Message("Yesterday there were 5 people in a room")
    ducklingNumber.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "5"
    assert entities[0]["value"] == 5
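
Example #21 stubs out the Duckling HTTP server with `httpretty`, so no real service needs to listen on port 8000. The core pattern in isolation (a sketch; the test above leaves `disable()`/`reset()` to surrounding fixtures or decorators):

import httpretty
import requests

httpretty.enable()
httpretty.register_uri(
    httpretty.POST, "http://localhost:8000/parse", body='[{"dim": "time"}]'
)

response = requests.post("http://localhost:8000/parse", data={"text": "Today"})
assert response.json()[0]["dim"] == "time"

httpretty.disable()
httpretty.reset()
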
Example #22
def test_run_cv_evaluation_with_response_selector(monkeypatch: MonkeyPatch):
    training_data_obj = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.yml"
    )
    training_data_responses_obj = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa-responses.yml"
    )
    training_data_obj = training_data_obj.merge(training_data_responses_obj)

    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
                {"name": "ResponseSelector", EPOCHS: 2},
            ],
        }
    )

    # mock training
    trainer = Trainer(nlu_config)
    trainer.pipeline = remove_pretrained_extractors(trainer.pipeline)
    mock = Mock(return_value=Interpreter(trainer.pipeline, None))
    monkeypatch.setattr(Trainer, "train", mock)

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        training_data_obj,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
        report_as_dict=True,
    )

    assert len(intent_results.train["Accuracy"]) == n_folds
    assert len(intent_results.train["Precision"]) == n_folds
    assert len(intent_results.train["F1-score"]) == n_folds
    assert len(intent_results.test["Accuracy"]) == n_folds
    assert len(intent_results.test["Precision"]) == n_folds
    assert len(intent_results.test["F1-score"]) == n_folds
    assert all(key in intent_results.evaluation for key in ["errors", "report"])
    assert any(
        isinstance(intent_report, dict)
        and intent_report.get("confused_with") is not None
        for intent_report in intent_results.evaluation["report"].values()
    )

    assert len(response_selection_results.train["Accuracy"]) == n_folds
    assert len(response_selection_results.train["Precision"]) == n_folds
    assert len(response_selection_results.train["F1-score"]) == n_folds
    assert len(response_selection_results.test["Accuracy"]) == n_folds
    assert len(response_selection_results.test["Precision"]) == n_folds
    assert len(response_selection_results.test["F1-score"]) == n_folds
    assert all(
        key in response_selection_results.evaluation for key in ["errors", "report"]
    )
    assert any(
        isinstance(intent_report, dict)
        and intent_report.get("confused_with") is not None
        for intent_report in response_selection_results.evaluation["report"].values()
    )

    assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds
    assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds
    assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds
    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
Example #23
    def get_nlu_model(self) -> RasaNLUModelConfig:
        return RasaNLUModelConfig(
            {"language": self.language, "pipeline": self.pipeline}
        )
Example #24
def test_run_cv_evaluation_with_response_selector():
    training_data_obj = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.md")
    training_data_responses_obj = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa-responses.md")
    training_data_obj = training_data_obj.merge(training_data_responses_obj)

    nlu_config = RasaNLUModelConfig(
        {
            "language": "en",
            "pipeline": [
                {"name": "WhitespaceTokenizer"},
                {"name": "CountVectorsFeaturizer"},
                {"name": "DIETClassifier", EPOCHS: 2},
                {"name": "ResponseSelector", EPOCHS: 2},
            ],
        }
    )

    n_folds = 2
    intent_results, entity_results, response_selection_results = cross_validate(
        training_data_obj,
        n_folds,
        nlu_config,
        successes=False,
        errors=False,
        disable_plotting=True,
    )

    assert len(intent_results.train["Accuracy"]) == n_folds
    assert len(intent_results.train["Precision"]) == n_folds
    assert len(intent_results.train["F1-score"]) == n_folds
    assert len(intent_results.test["Accuracy"]) == n_folds
    assert len(intent_results.test["Precision"]) == n_folds
    assert len(intent_results.test["F1-score"]) == n_folds
    assert all(key in intent_results.evaluation
               for key in ["errors", "report"])

    assert len(response_selection_results.train["Accuracy"]) == n_folds
    assert len(response_selection_results.train["Precision"]) == n_folds
    assert len(response_selection_results.train["F1-score"]) == n_folds
    assert len(response_selection_results.test["Accuracy"]) == n_folds
    assert len(response_selection_results.test["Precision"]) == n_folds
    assert len(response_selection_results.test["F1-score"]) == n_folds
    assert all(key in response_selection_results.evaluation
               for key in ["errors", "report"])

    assert len(entity_results.train["DIETClassifier"]["Accuracy"]) == n_folds
    assert len(entity_results.train["DIETClassifier"]["Precision"]) == n_folds
    assert len(entity_results.train["DIETClassifier"]["F1-score"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["Accuracy"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["Precision"]) == n_folds
    assert len(entity_results.test["DIETClassifier"]["F1-score"]) == n_folds
    for extractor_evaluation in entity_results.evaluation.values():
        assert all(key in extractor_evaluation for key in ["errors", "report"])
Example #25
def test_incremental_train_featurization(tmp_path: Path):
    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 5},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    # Test featurization of message
    expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    persist_value = featurizer.persist("ftr", str(tmp_path))
    loaded_featurizer = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": 5,
            "file": persist_value["file"],
        },
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    new_patterns = [
        {
            "pattern": "\\btoday*",
            "name": "day",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey+",
            "name": "hello",
            "usage": "intent"
        },
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    loaded_featurizer.train(
        TrainingData([message], regex_features=patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[-2] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
Example #26
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 5},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": 5,
            "file": persist_value["file"],
        },
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test that the component was loaded in finetune mode and with the
    # same patterns and vocabulary statistics as before
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{
        "name": "plates",
        "elements": "data/test/lookup_tables/plates.txt"
    }]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
Example #27
def test_regex_featurizer_train():

    patterns = [
        {
            "pattern": "[0-9]+",
            "name": "number",
            "usage": "intent"
        },
        {
            "pattern": "\\bhey*",
            "name": "hello",
            "usage": "intent"
        },
        {
            "pattern": "[0-1]+",
            "name": "binary",
            "usage": "intent"
        },
    ]

    featurizer = RegexFeaturizer.create({"number_additional_patterns": 0},
                                        RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message], regex_features=patterns),
                     RasaNLUModelConfig())

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
Example #28
async def test_adjusting_layers_incremental_training(
        component_builder: ComponentBuilder, tmpdir: Path):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
       feature sizes during incremental training.

       Testing is done by checking the layer sizes.
       Checking if they were replaced correctly is also important
       and is done in `test_replace_dense_for_sparse_layers`
       in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {
            "name": "WhitespaceTokenizer"
        },
        {
            "name": "LexicalSyntacticFeaturizer"
        },
        {
            "name": "RegexFeaturizer"
        },
        {
            "name": "CountVectorsFeaturizer"
        },
        {
            "name": "CountVectorsFeaturizer",
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
        {
            "name": "ResponseSelector",
            EPOCHS: 1
        },
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    (_, trained, persisted_path) = await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data=iter1_data_path,
        component_builder=component_builder,
    )
    assert trained.pipeline
    old_data_signature = trained.pipeline[-1].model.data_signature
    old_predict_data_signature = trained.pipeline[-1].model.predict_data_signature
    message = Message.build(text="Rasa is great!")
    trained.featurize_message(message)
    old_sparse_feature_sizes = message.get_sparse_feature_sizes(attribute=TEXT)
    initial_rs_layers = (
        trained.pipeline[-1].model._tf_layers["sequence_layer.text"].
        _tf_layers["feature_combining"])
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])

    loaded = Interpreter.load(
        persisted_path,
        component_builder,
        new_config=_config,
    )
    assert loaded.pipeline
    assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!")
    (_, trained, _) = await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data=iter2_data_path,
        component_builder=component_builder,
        model_to_finetune=loaded,
    )
    assert trained.pipeline
    message = Message.build(text="Rasa is great!")
    trained.featurize_message(message)
    new_sparse_feature_sizes = message.get_sparse_feature_sizes(attribute=TEXT)

    final_rs_layers = (
        trained.pipeline[-1].model._tf_layers["sequence_layer.text"].
        _tf_layers["feature_combining"])
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])
    # check if the data signatures were correctly updated
    new_data_signature = trained.pipeline[-1].model.data_signature
    new_predict_data_signature = trained.pipeline[-1].model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len([
        message for message in iter2_data.training_examples
        if message.get(INTENT_RESPONSE_KEY)
    ])

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever attribute / feature_type signature is not
        # expected to change, directly compare it to old data signature.
        # Else compute its expected signature and compare
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():

            for feature_type, feature_signatures in signatures.items():

                if feature_type == "sequence_lengths":
                    assert feature_signatures[0].units == expected_sequence_lengths

                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(
                        attribute).get(feature_type)
                else:
                    for index, feature_signature in enumerate(
                            feature_signatures):
                        if (feature_signature.is_sparse and attribute
                                in attributes_expected_to_change):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type))
                        else:
                            # dense signature or attributes that are not
                            # expected to change can be compared directly
                            assert (
                                feature_signature.units == old_signature.get(
                                    attribute).get(feature_type)[index].units)

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature,
                         old_predict_data_signature)
Example #29
    def train(self, training_data: TrainingData, config: RasaNLUModelConfig,
              **kwargs: Any) -> None:

        from seq2label.input import build_input_func
        from seq2label.model import Model

        raw_config = self.component_config

        print(raw_config)

        if 'result_dir' not in raw_config:
            raw_config['result_dir'] = tempfile.mkdtemp()

        model = Model(raw_config)

        config = model.get_default_config()
        config.update(raw_config)

        # task_status = TaskStatus(config)

        # read data according to the config
        train_data_generator_func = kwargs.get('addons_tf_input_fn')
        corpus_meta_data = kwargs.get('addons_tf_input_meta')

        config['tags_data'] = corpus_meta_data['label']
        config['num_classes'] = len(config['tags_data'])

        print('')

        # build the model according to the config

        # send START status to monitor system
        # task_status.send_status(task_status.START)

        # train and evaluate model
        train_input_func = build_input_func(train_data_generator_func, config)

        # train_iterator = train_input_func()
        # import tensorflow as tf
        # import sys
        #
        # with tf.Session() as sess:
        #     sess.run(tf.tables_initializer())
        #
        #     counter = 0
        #     while True:
        #         try:
        #             value = sess.run(train_iterator[0]['words'])
        #             counter += 1
        #             print(value)
        #             break
        #         except tf.errors.OutOfRangeError:
        #             break
        #
        # print(counter)
        # #
        # sys.exit(0)

        evaluate_result, export_results, final_saved_model = model.train_and_eval_then_save(
            train_input_func, None, config)

        # task_status.send_status(task_status.DONE)

        self.result_dir = final_saved_model
Example #30
async def test_sparse_feature_sizes_decreased_incremental_training(
    iter1_path: Text,
    iter2_path: Text,
    should_raise_exception: bool,
    component_builder: ComponentBuilder,
    tmpdir: Path,
):
    pipeline = [
        {
            "name": "WhitespaceTokenizer"
        },
        {
            "name": "LexicalSyntacticFeaturizer"
        },
        {
            "name": "RegexFeaturizer"
        },
        {
            "name": "CountVectorsFeaturizer"
        },
        {
            "name": "CountVectorsFeaturizer",
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
        {
            "name": "ResponseSelector",
            EPOCHS: 1
        },
    ]
    _config = RasaNLUModelConfig({"pipeline": pipeline, "language": "en"})

    (_, trained, persisted_path) = await rasa.nlu.train.train(
        _config,
        path=str(tmpdir),
        data=iter1_path,
        component_builder=component_builder,
    )
    assert trained.pipeline

    loaded = Interpreter.load(
        persisted_path,
        component_builder,
        new_config=_config,
    )
    assert loaded.pipeline
    assert loaded.parse("Rasa is great!") == trained.parse("Rasa is great!")
    if should_raise_exception:
        with pytest.raises(Exception) as exec_info:
            (_, trained, _) = await rasa.nlu.train.train(
                _config,
                path=str(tmpdir),
                data=iter2_path,
                component_builder=component_builder,
                model_to_finetune=loaded,
            )
        assert "Sparse feature sizes have decreased" in str(exec_info.value)
    else:
        (_, trained, _) = await rasa.nlu.train.train(
            _config,
            path=str(tmpdir),
            data=iter2_path,
            component_builder=component_builder,
            model_to_finetune=loaded,
        )
        assert trained.pipeline