def test_count_vectors_featurizer_train():
    """Train the featurizer on one message and verify the sparse features
    attached to TEXT, RESPONSE and INTENT via ``get_sparse_features``."""
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today ?"
    msg = Message(text)
    msg.set(RESPONSE, text)
    msg.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    # one-hot row expected for the first token, all-ones row for the sentence
    first_token_row = np.array([0, 1, 0, 0, 0])
    sentence_row = np.array([1, 1, 1, 1, 1])

    # TEXT and RESPONSE carry the same sentence, so identical features
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = msg.get_sparse_features(attribute, [])
        assert seq_vec.shape == (5, 5)
        assert sen_vec.shape == (1, 5)
        assert np.all(seq_vec.toarray()[0] == first_token_row)
        assert np.all(sen_vec.toarray()[-1] == sentence_row)

    # intent gets only a single sequence feature, no sentence feature
    seq_vec, sen_vec = msg.get_sparse_features(INTENT, [])
    assert sen_vec is None
    assert seq_vec.shape == (1, 1)
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_additional_vocab_size_deprecation():
    """Passing ``additional_vocabulary_size`` must emit a FutureWarning."""
    deprecated_config = {"additional_vocabulary_size": {TEXT: 5, RESPONSE: 10}}
    with pytest.warns(FutureWarning) as recorded:
        _ = CountVectorsFeaturizer.create(deprecated_config, RasaNLUModelConfig())
    assert "The parameter has been deprecated" in recorded[0].message.args[0]
def test_count_vectors_featurizer_train():
    """Train the featurizer on one message and verify the sparse feature
    matrices stored under ``SPARSE_FEATURE_NAMES`` for TEXT, RESPONSE, INTENT."""
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today ?"
    msg = Message(text)
    msg.set(RESPONSE, text)
    msg.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([msg]))
    featurizer.train(TrainingData([msg]), RasaNLUModelConfig())

    # one-hot row expected for the first token, all-ones row for the CLS token
    first_token_row = np.array([0, 1, 0, 0, 0])
    cls_row = np.array([1, 1, 1, 1, 1])

    # TEXT and RESPONSE carry the same sentence, so identical feature matrices
    for attribute in (TEXT, RESPONSE):
        vecs = msg.get(SPARSE_FEATURE_NAMES[attribute])
        assert vecs.shape == (6, 5)
        assert np.all(vecs.toarray()[0] == first_token_row)
        assert np.all(vecs.toarray()[-1] == cls_row)

    # intent is a single token, so a single 1x1 feature
    intent_vecs = msg.get(SPARSE_FEATURE_NAMES[INTENT])
    assert intent_vecs.shape == (1, 1)
    assert np.all(intent_vecs.toarray()[0] == np.array([1]))
def inner(config: Optional[Dict[Text, Any]] = None) -> CountVectorsFeaturizer:
    """Create a ``CountVectorsFeaturizer`` whose default config is overlaid
    with the given overrides (closes over the fixture-provided storage and
    execution context)."""
    overrides = config or {}
    merged_config = {**CountVectorsFeaturizer.get_default_config(), **overrides}
    return CountVectorsFeaturizer.create(
        merged_config,
        default_model_storage,
        Resource("count_vectors_featurizer"),
        default_execution_context,
    )
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    """Featurize a single message end-to-end and check that
    ``featurize_training_examples`` keeps exactly the requested attributes,
    produces the expected feature counts, and reports the right sparse
    feature sizes."""
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer.create(
        SpacyTokenizer.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizer.create(
        CountVectorsFeaturizer.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizer.create(
        SpacyFeaturizer.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    training_data = TrainingData([message])

    # tokenize, then attach sparse (count-vectors) and dense (spaCy) features
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    tag_specs = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]

    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=tag_specs
    )

    assert len(output) == 1
    requested = set(attributes)
    for attribute in requested:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - requested:
        assert attribute not in output[0]

    # sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features are all present for the text
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # one feature array per entity tag spec
        assert len(output[0][ENTITIES]) == len(tag_specs)

    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes