def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that count vectors are derived from the ``tokens`` attribute.

    The training and the test message are both created with an empty text
    so that the resulting ``text_features`` cannot come from the message
    text and have to be computed from the pre-set tokens.

    :param tokens: iterable of token strings, each wrapped in a ``Token``
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # The pattern matches runs of one or more word characters.
    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # One shared token list is attached to both messages below.
    token_objects = [Token(token_text, 0) for token_text in tokens]

    training_message = Message("")
    training_message.set("tokens", token_objects)
    # Without an intent the message is not a valid training example.
    training_message.set("intent", "bla")

    featurizer.train(TrainingData([training_message]))

    processed_message = Message("")
    processed_message.set("tokens", token_objects)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)
def test_count_vector_featurizer(sentence, expected):
    """Train on a single message and featurize that same message.

    The message built from ``sentence`` is used both as the only training
    example and as the input to ``process``; the first entry of its
    ``text_features`` is then compared against ``expected``.

    :param sentence: text of the message
    :param expected: value the first ``text_features`` entry must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # The pattern matches runs of one or more word characters.
    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    example = Message(sentence)
    example.set("intent", "bla")

    featurizer.train(TrainingData([example]))
    featurizer.process(example)

    first_feature_row = example.get("text_features")[0]
    assert np.all(first_feature_row == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Check the featurizer output when an ``OOV_token`` is configured.

    The featurizer is trained on one message built from ``sentence`` and
    then processes a second, fresh message with the same text.

    :param sentence: text used for both the training and the test message
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    config = {"token_pattern": r'(?u)\b\w+\b', "OOV_token": '__oov__'}
    featurizer = CountVectorsFeaturizer(config)

    training_message = Message(sentence)
    training_message.set("intent", "bla")
    featurizer.train(TrainingData([training_message]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)
def test_count_vector_featurizer(sentence, expected):
    """Train on one message and featurize a fresh message with equal text.

    :param sentence: text used for both the training and the test message
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # The pattern matches runs of one or more word characters.
    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    training_message = Message(sentence)
    # Without an intent the message is not a valid training example.
    training_message.set("intent", "bla")
    featurizer.train(TrainingData([training_message]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Check that count vectors are derived from the ``tokens`` attribute.

    Both messages are created with an empty text on purpose: featurizing
    the message text would not give the expected result, so a passing
    assertion means the features were computed from the pre-set tokens.

    :param tokens: iterable of token strings, each wrapped in a ``Token``
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # The pattern matches runs of one or more word characters.
    featurizer = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # One shared token list is attached to both messages below.
    token_objects = [Token(token_text, 0) for token_text in tokens]

    training_message = Message("")
    training_message.set("tokens", token_objects)
    # Without an intent the message is not a valid training example.
    training_message.set("intent", "bla")

    training_data = TrainingData([training_message])
    featurizer.train(training_data)

    processed_message = Message("")
    processed_message.set("tokens", token_objects)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Check the featurizer output when an ``OOV_token`` is configured.

    A message built from ``sentence`` is the sole training example; a
    second message with the same text is then processed and its
    ``text_features`` compared against ``expected``.

    :param sentence: text used for both the training and the test message
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer_config = {
        "token_pattern": r'(?u)\b\w+\b',
        "OOV_token": '__oov__',
    }
    featurizer = CountVectorsFeaturizer(featurizer_config)

    training_message = Message(sentence)
    training_message.set("intent", "bla")
    training_data = TrainingData([training_message])
    featurizer.train(training_data)

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)
def test_count_vector_featurizer(sentence, expected):
    """Check the featurizer with the ``'char'`` analyzer and n-grams 1 to 2.

    The featurizer is trained on one message built from ``sentence`` and
    then processes a second, fresh message with the same text.

    :param sentence: text used for both the training and the test message
    :param expected: value the processed ``text_features`` must equal
    """
    from rasa_nlu.featurizers.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer_config = {"min_ngram": 1, "max_ngram": 2, "analyzer": 'char'}
    featurizer = CountVectorsFeaturizer(featurizer_config)

    training_message = Message(sentence)
    # Without an intent the message is not a valid training example.
    training_message.set("intent", "bla")
    featurizer.train(TrainingData([training_message]))

    processed_message = Message(sentence)
    featurizer.process(processed_message)

    actual = processed_message.get("text_features")
    assert np.all(actual == expected)