def test_count_vector_featurizer_oov_words(sentence, expected):
    """Words listed in `OOV_words` should be counted under the `OOV_token`.

    Trains on `sentence` with a fixed vocabulary (no additional slots) and
    checks the training message's sequence features against `expected`.
    """
    ftr = CountVectorsFeaturizer(
        {
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
            "additional_vocabulary_size": {"text": 0},
        }
    )
    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    # Fix: tokenize the test message before processing — without tokens the
    # featurizer has nothing to featurize (sibling tests, e.g. the char-ngram
    # one, tokenize the test message as well).
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Feature counts must be derived from the `tokens` attribute alone."""
    featurizer = CountVectorsFeaturizer()

    # An empty message text guarantees that any counts can only originate
    # from the token objects attached below, never from `message.text`.
    token_objects = [Token(t, 0) for t in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], token_objects)
    featurizer.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], token_objects)
    featurizer.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    """Check the sparse features produced with `use_lemma` on or off."""
    featurizer = CountVectorsFeaturizer({"use_lemma": use_lemma})
    tokenizer = SpacyTokenizer()

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(train_message)

    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    assert np.all(seq_vecs.features.toarray()[0] == sequence_features)
    assert np.all(sen_vecs.features.toarray()[-1] == sentence_features)
def test_count_vector_featurizer_char(sentence, expected):
    """Character 1-2 grams should produce the expected sequence counts."""
    featurizer = CountVectorsFeaturizer(
        {
            "min_ngram": 1,
            "max_ngram": 2,
            "analyzer": "char",
            "additional_vocabulary_size": {"text": 0},
        }
    )
    tokenizer = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    tokenizer.process(train_message)
    featurizer.train(TrainingData([train_message]))

    test_message = Message(data={TEXT: sentence})
    tokenizer.process(test_message)
    featurizer.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    seq_vec = seq_vec.features if seq_vec else seq_vec
    sen_vec = sen_vec.features if sen_vec else sen_vec

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Per-token counts and the sentence-level row must match expectations."""
    featurizer = CountVectorsFeaturizer()
    tokenizer = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})
    tokenizer.process(train_message)
    tokenizer.process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vecs.toarray()[-1] == expected_cls)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """Counts must come from the `tokens` attribute, not the message text."""
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # Empty text ensures the featurizer can only see the attached tokens.
    token_objects = [Token(t, 0) for t in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], token_objects)
    featurizer.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], token_objects)
    featurizer.process(test_message)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()
    assert np.all(actual[0] == expected)
def test_count_vector_featurizer_using_tokens(tokens, expected):
    """With `return_sequence`, counts must be derived from the `tokens` feature."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True}
    )

    # Empty text ensures counts can only come from the attached tokens,
    # never from `message.text`.
    token_objects = [Token(t, 0) for t in tokens]

    train_message = Message("")
    train_message.set("tokens", token_objects)
    # a training example needs an intent to be valid
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message("")
    test_message.set("tokens", token_objects)
    featurizer.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Tokens unseen at training time should be counted under `OOV_token`.

    Trains on `sentence` and checks the training message's sequence features
    against `expected`; a sentence-level feature must also be present.
    """
    ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    # Fix: tokenize the test message before processing — without tokens the
    # `ftr.process` call has nothing to featurize (the char-ngram sibling
    # test tokenizes its test message for the same reason).
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_oov_token(sentence, expected):
    """Tokens unseen at training time should map onto the configured OOV token."""
    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"}
    )

    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
    assert np.all(actual[0] == expected)
def test_count_vector_featurizer_oov_words(sentence, expected):
    """Words listed in `OOV_words` should be counted under the `OOV_token`."""
    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
        }
    )
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    # Fix: `get_sparse_features(TEXT, [])` returns a (sequence, sentence)
    # pair — the original called `toarray()` on the tuple itself, which can
    # never succeed. Unpack first, as the sibling tests in this file do.
    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_char(sentence, expected):
    """Character 1-2 grams should yield the expected counts."""
    featurizer = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    )
    tokenizer = WhitespaceTokenizer()

    train_message = Message(sentence)
    tokenizer.process(train_message)
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    tokenizer.process(test_message)
    featurizer.process(test_message)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()
    assert np.all(actual[0] == expected)
def test_count_vector_featurizer_char(sentence, expected):
    """Character 1-2 grams should yield the expected counts on the train message."""
    ftr = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}
    )
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    WhitespaceTokenizer().process(test_message)
    ftr.process(test_message)

    # Fix: `get_sparse_features(TEXT, [])` returns a (sequence, sentence)
    # pair — calling `toarray()` on the tuple itself always fails. Unpack
    # first, as the sibling tests in this file do.
    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Per-token counts and the final row must match the expectations."""
    featurizer = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tokenizer = WhitespaceTokenizer()

    train_message = Message(sentence)
    test_message = Message(sentence)
    tokenizer.process(train_message)
    tokenizer.process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    features = test_message.get(SPARSE_FEATURE_NAMES[TEXT])
    assert isinstance(features, scipy.sparse.coo_matrix)

    dense = features.toarray()
    assert np.all(dense[0] == expected)
    assert np.all(dense[-1] == expected_cls)
def test_count_vector_featurizer_oov_token(sentence, expected):
    """With `return_sequence`, unknown tokens should map to the OOV token."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "OOV_token": "__oov__",
            "return_sequence": True,
        }
    )

    train_message = Message(sentence)
    # a training example needs an intent to be valid
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)
def test_count_vector_featurizer(sentence, expected, expected_cls):
    """Per-token counts and the final row must match the expectations."""
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(sentence)
    test_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))
    ftr.process(test_message)

    # Fix: `get_sparse_features(TEXT, [])` returns a (sequence, sentence)
    # pair — the original ran `isinstance` and `toarray()` on the tuple
    # itself, which can never pass. Unpack and check each part.
    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vecs.toarray()[-1] == expected_cls)
def test_count_vector_featurizer_char(sentence, expected):
    """Character 1-2 gram counts with `return_sequence` enabled."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {"min_ngram": 1, "max_ngram": 2, "analyzer": "char", "return_sequence": True}
    )

    train_message = Message(sentence)
    # a training example needs an intent to be valid
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)
def test_count_vector_featurizer_no_sequence(sentence, expected):
    """With `return_sequence` off, the sparse feature matrix must match `expected`."""
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    featurizer = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": False}
    )

    train_message = Message(sentence)
    # a training example needs an intent to be valid
    train_message.set("intent", "bla")
    featurizer.train(TrainingData([train_message]))

    test_message = Message(sentence)
    featurizer.process(test_message)

    features = test_message.get("text_sparse_features")
    assert isinstance(features, scipy.sparse.coo_matrix)
    assert np.all(features.toarray() == expected)