示例#1
0
def test_flexible_nlu_pipeline():
    """Check that DIET only consumes features from the configured featurizers.

    A tokenizer, a word-level and a char-level ``CountVectorsFeaturizer`` and
    a ``LexicalSyntacticFeaturizer`` are trained on the same message. The test
    first verifies the origin and type of every feature attached to the
    message, then verifies the shapes produced by
    ``DIETClassifier.preprocess_train_data`` when the classifier is restricted
    to the ``"cvf_word"`` and ``"LexicalSyntacticFeaturizer"`` featurizers.
    """
    message = Message("This is a test message.", data={"intent": "test"})
    # The very same message object is passed five times.
    training_data = TrainingData([message] * 5)

    WhitespaceTokenizer().train(training_data)

    word_cvf = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    word_cvf.train(training_data)

    char_cvf_config = {
        FEATURIZER_CLASS_ALIAS: "cvf_char",
        "min_ngram": 1,
        "max_ngram": 3,
        "analyzer": "char_wb",
    }
    CountVectorsFeaturizer(component_config=char_cvf_config).train(training_data)

    LexicalSyntacticFeaturizer({}).train(training_data)

    # (origin, type) of each feature, in the order they sit on the message.
    expected_features = [
        ("cvf_word", FEATURE_TYPE_SEQUENCE),
        ("cvf_word", FEATURE_TYPE_SENTENCE),
        # cvf word is also extracted for the intent
        ("cvf_word", FEATURE_TYPE_SEQUENCE),
        ("cvf_char", FEATURE_TYPE_SEQUENCE),
        ("cvf_char", FEATURE_TYPE_SENTENCE),
        ("LexicalSyntacticFeaturizer", FEATURE_TYPE_SEQUENCE),
    ]
    assert len(message.features) == 6
    for feature, (origin, feature_type) in zip(message.features, expected_features):
        assert feature.origin == origin
        assert feature.type == feature_type

    word_sequence = message.features[0]
    lexical_sequence = message.features[5]
    # Sequence features of the two selected featurizers are expected side by
    # side, hence the summed width.
    sequence_feature_dim = (
        word_sequence.features.shape[1] + lexical_sequence.features.shape[1]
    )
    # NOTE(review): taken from the sequence-level feature at index 0;
    # presumably it has the same width as the sentence-level feature at
    # index 1 -- confirm.
    sentence_feature_dim = word_sequence.features.shape[1]

    diet = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = diet.preprocess_train_data(training_data)

    text_sentence = model_data.get(TEXT_SENTENCE_FEATURES)
    text_sequence = model_data.get(TEXT_SEQUENCE_FEATURES)
    label_sequence = model_data.get(LABEL_SEQUENCE_FEATURES)
    label_sentence = model_data.get(LABEL_SENTENCE_FEATURES)

    assert len(text_sentence) == 1
    assert len(text_sequence) == 1
    assert len(label_sequence) == 1
    assert len(label_sentence) == 0

    # presumably one row per token of the message -- confirm
    assert text_sequence[0][0].shape == (5, sequence_feature_dim)
    assert text_sentence[0][0].shape == (1, sentence_feature_dim)
    assert label_sequence[0][0].shape == (1, 1)
示例#2
0
def test_model_data_signature_with_entities(
    messages: List[Message], entity_expected: bool
):
    """Check whether the model data signature contains an ``"entities"`` key.

    Args:
        messages: Training examples wrapped into a ``TrainingData`` object.
        entity_expected: Whether ``"entities"`` should appear among the keys
            of the signature of the preprocessed model data.
    """
    diet = DIETClassifier({"BILOU_flag": False})
    data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    WhitespaceTokenizer().train(data)

    signature = diet.preprocess_train_data(data).get_signature()
    has_entities = "entities" in signature.keys()
    assert has_entities == entity_expected
示例#3
0
def test_flexible_nlu_pipeline():
    """Check that DIET only consumes features from the configured featurizers.

    A tokenizer, a word-level and a char-level ``CountVectorsFeaturizer`` and
    a ``LexicalSyntacticFeaturizer`` are trained on the same message. The test
    verifies the origin of every feature attached to the message, then the
    shapes of the ``"text_features"`` and ``"label_features"`` entries that
    ``DIETClassifier.preprocess_train_data`` produces when the classifier is
    restricted to ``"cvf_word"`` and ``"LexicalSyntacticFeaturizer"``.
    """
    message = Message("This is a test message.", data={"intent": "test"})
    # The very same message object is passed five times.
    training_data = TrainingData([message] * 5)

    WhitespaceTokenizer().train(training_data)

    word_cvf = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    word_cvf.train(training_data)

    char_cvf_config = {
        FEATURIZER_CLASS_ALIAS: "cvf_char",
        "min_ngram": 1,
        "max_ngram": 3,
        "analyzer": "char_wb",
    }
    CountVectorsFeaturizer(component_config=char_cvf_config).train(training_data)

    LexicalSyntacticFeaturizer({}).train(training_data)

    expected_origins = [
        "cvf_word",
        # cvf word is also extracted for the intent
        "cvf_word",
        "cvf_char",
        "LexicalSyntacticFeaturizer",
    ]
    assert len(message.features) == 4
    assert [feature.origin for feature in message.features] == expected_origins

    # Combined width of the features coming from the two selected featurizers
    # (index 0: "cvf_word", index 3: "LexicalSyntacticFeaturizer").
    feature_dim = sum(message.features[idx].features.shape[1] for idx in (0, 3))

    diet = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = diet.preprocess_train_data(training_data)

    text_features = model_data.get("text_features")
    label_features = model_data.get("label_features")

    assert len(text_features) == 1
    assert len(label_features) == 1
    # NOTE(review): 6 rows -- presumably the 5 whitespace tokens plus one
    # extra position; confirm against the tokenizer in use.
    assert text_features[0][0].shape == (6, feature_dim)
    assert label_features[0][0].shape == (1, 1)