def test_vocabulary_overflow_log():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer(
        {"number_additional_patterns": 1},
        known_patterns=patterns,
        finetune_mode=True,
        pattern_vocabulary_stats={"max_number_patterns": 4, "pattern_slots_filled": 3},
    )
    additional_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhello+", "name": "greet", "usage": "intent"},
    ]
    with pytest.warns(UserWarning) as warning:
        featurizer.train(TrainingData([], regex_features=additional_patterns))
    assert (
        "The originally trained model was configured to handle "
        "a maximum number of 4 patterns" in warning[0].message.args[0]
    )

def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"case_sensitive": case_sensitive, "number_additional_patterns": 0},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )

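# The test above expects parametrized arguments; its decorator is not present
# in this section. A plausible parametrization with illustrative rows (assumed
# values, not necessarily the project's originals): with case_sensitive=True
# the pattern "\bhey*" must not match the capitalized "Hey", while with
# case_sensitive=False it should.
#
# @pytest.mark.parametrize(
#     "sentence, expected_sequence_features, expected_sentence_features, "
#     "case_sensitive",
#     [
#         ("Hey How are you today", [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], True),
#         ("Hey How are you today", [0.0, 1.0, 0.0], [0.0, 1.0, 0.0], False),
#     ],
# )
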
def test_lookup_tables_without_use_word_boundaries(
    sentence, tokens, expected, labeled_tokens
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cities", "elements": ["北京", "上海", "广州", "深圳", "杭州"]},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"]},
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nlp):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray()[0], expected, atol=1e-10)
    assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)

def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {"use_word_boundaries": use_word_boundaries, "number_additional_patterns": 0}
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum(len(lookup["elements"]) for lookup in lookups)
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if the sentence
    # contains a match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)

def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }

def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_regex_featurizer(spacy_nlp):
    """The last feature row (the __CLS__ token) is the union of the token rows above it."""
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    sentence, expected, labeled_tokens = (
        "hey how are you today",
        [
            [0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ],
        [0],
    )
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    assert show_message(message, False) == {
        "response": "hey how are you today",
        "text": "hey how are you today",
    }
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)
    # After tokenization the message should look roughly like:
    # {'response': 'hey how are you today',
    #  'text_spacy_doc': spacy_nlp("hey how are you today"),
    #  'tokens': ['hey', 'how', 'are', 'you', 'today', '__CLS__'],
    #  'text': 'hey how are you today'}

    # featurizes both the TEXT and RESPONSE attributes
    ftr.process(message)
    show_message(message)
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

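# `show_message` is called by the test above but not defined (or imported) in
# this section. A minimal sketch under the assumption that it returns a
# plain-dict view of the message, with Token objects flattened to their texts,
# and optionally prints it:
def show_message(message, verbose=True):
    # start from the message's data dict and add the raw text attribute
    data = dict(message.data)
    data["text"] = message.text
    if data.get("tokens") is not None:
        # flatten Token objects to their texts for readable comparisons
        data["tokens"] = [token.text for token in data["tokens"]]
    if verbose:
        print(data)
    return data
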
def inner(
    config: Optional[Dict[Text, Any]] = None,
    known_patterns: Optional[List[Dict[Text, Any]]] = None,
) -> RegexFeaturizer:
    config = config or {}
    return RegexFeaturizer(
        {**RegexFeaturizer.get_default_config(), **config},
        default_model_storage,
        resource,
        default_execution_context,
        known_patterns,
    )

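# `inner` is presumably the body of a factory fixture: the finetuning tests
# below request it as `create_featurizer`. A sketch of the assumed enclosing
# fixture (the name and its dependencies are inferred from the test
# signatures, not confirmed by this section):
#
# @pytest.fixture
# def create_featurizer(
#     default_model_storage: ModelStorage,
#     resource: Resource,
#     default_execution_context: ExecutionContext,
# ) -> Callable[..., RegexFeaturizer]:
#     def inner(...):  # as defined above
#         ...
#     return inner
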
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test that the component is loaded in finetune mode
    # with the same patterns as before.
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4

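# The `resource` and `whitespace_tokenizer` fixtures requested above are not
# defined in this section. Minimal sketches under assumed conventions (the
# resource name is illustrative):
#
# @pytest.fixture
# def resource() -> Resource:
#     return Resource("regex_featurizer")
#
# @pytest.fixture
# def whitespace_tokenizer() -> WhitespaceTokenizer:
#     return WhitespaceTokenizer(WhitespaceTokenizer.get_default_config())
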
def test_regex_featurizer_train():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None

def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_regex_featurizer_train():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert (7, 3) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert (7, 3) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None

def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # Test featurization of message
    expected = np.array([1, 0])
    expected_cls = np.array([1, 1])
    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]
    new_sentence = "hey today"
    message = Message(data={TEXT: new_sentence})
    message.set(RESPONSE, new_sentence)
    message.set(INTENT, "intent")
    new_training_data = TrainingData([message], regex_features=patterns + new_patterns)
    whitespace_tokenizer.process_training_data(new_training_data)
    loaded_featurizer.train(new_training_data)
    loaded_featurizer.process_training_data(new_training_data)

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([1, 0, 0])
    expected_token_2 = np.array([0, 0, 1])
    expected_cls = np.array([1, 0, 1])
    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[1] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # let's check if the order of patterns is preserved
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == loaded_featurizer.known_patterns[old_index]["name"]

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]

def test_incremental_train_featurization(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    # Test featurization of message
    expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    persist_value = featurizer.persist("ftr", str(tmp_path))
    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]

    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    WhitespaceTokenizer().train(TrainingData([message]))

    loaded_featurizer.train(
        TrainingData([message], regex_features=patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    expected_token_2 = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 8) == seq_vecs.shape
    assert (1, 8) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[-2] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]