def test_flexible_nlu_pipeline():
    """DIET should only pick up features from featurizers named in FEATURIZERS.

    Runs three featurizers over the same training data and checks that the
    classifier's preprocessed model data contains only the cvf_word and
    LexicalSyntacticFeaturizer outputs, with the expected shapes.
    """
    msg = Message("This is a test message.", data={"intent": "test"})
    # Same Message object repeated; featurizers attach their features to it.
    training_data = TrainingData([msg] * 5)

    WhitespaceTokenizer().train(training_data)

    CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    ).train(training_data)

    CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    ).train(training_data)

    LexicalSyntacticFeaturizer({}).train(training_data)

    # Expected (origin, type) pairs, in creation order. cvf_word produces an
    # extra sequence feature because it also featurizes the intent.
    expected_features = [
        ("cvf_word", FEATURE_TYPE_SEQUENCE),
        ("cvf_word", FEATURE_TYPE_SENTENCE),
        ("cvf_word", FEATURE_TYPE_SEQUENCE),  # extracted for the intent
        ("cvf_char", FEATURE_TYPE_SEQUENCE),
        ("cvf_char", FEATURE_TYPE_SENTENCE),
        ("LexicalSyntacticFeaturizer", FEATURE_TYPE_SEQUENCE),
    ]
    assert len(msg.features) == len(expected_features)
    for feature, (origin, feature_type) in zip(msg.features, expected_features):
        assert feature.origin == origin
        assert feature.type == feature_type

    seq_dim = msg.features[0].features.shape[1] + msg.features[5].features.shape[1]
    # cvf_word's sentence features have the same width as its sequence features.
    sent_dim = msg.features[0].features.shape[1]

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get(TEXT_SENTENCE_FEATURES)) == 1
    assert len(model_data.get(TEXT_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SEQUENCE_FEATURES)) == 1
    assert len(model_data.get(LABEL_SENTENCE_FEATURES)) == 0
    assert model_data.get(TEXT_SEQUENCE_FEATURES)[0][0].shape == (5, seq_dim)
    assert model_data.get(TEXT_SENTENCE_FEATURES)[0][0].shape == (1, sent_dim)
    assert model_data.get(LABEL_SEQUENCE_FEATURES)[0][0].shape == (1, 1)
def test_model_data_signature_with_entities(
    messages: List[Message], entity_expected: bool
):
    """Model-data signature should expose "entities" iff entity data is present.

    NOTE(review): parameters suggest a @pytest.mark.parametrize decorator
    outside this view supplies `messages` / `entity_expected`.
    """
    classifier = DIETClassifier({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # DIET's entity parsing needs tokens, so tokenize the data first.
    WhitespaceTokenizer().train(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    signature_keys = model_data.get_signature().keys()
    assert ("entities" in signature_keys) == entity_expected
def test_flexible_nlu_pipeline():
    """DIET should only use features from featurizers listed in FEATURIZERS.

    Older variant: features are not split into sequence/sentence types here;
    a single combined "text_features" / "label_features" pair is asserted.

    NOTE(review): this redefines test_flexible_nlu_pipeline from earlier in
    the file — pytest will only collect this later definition. Rename one of
    the two if both are meant to run.
    """
    msg = Message("This is a test message.", data={"intent": "test"})
    # Same Message object repeated; featurizers attach their features to it.
    training_data = TrainingData([msg] * 5)

    WhitespaceTokenizer().train(training_data)

    CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    ).train(training_data)

    CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    ).train(training_data)

    LexicalSyntacticFeaturizer({}).train(training_data)

    # Expected origins in creation order; cvf_word appears twice because it
    # is also extracted for the intent.
    expected_origins = [
        "cvf_word",
        "cvf_word",  # extracted for the intent
        "cvf_char",
        "LexicalSyntacticFeaturizer",
    ]
    assert len(msg.features) == len(expected_origins)
    for feature, origin in zip(msg.features, expected_origins):
        assert feature.origin == origin

    feature_dim = (
        msg.features[0].features.shape[1] + msg.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)