def test_persist_load_for_finetuning(tmp_path: Path):
    """Check persisting a trained featurizer and reloading it for fine-tuning.

    The test trains a ``RegexFeaturizer`` on three regex patterns, persists it
    under ``tmp_path``, reloads it with ``should_finetune=True`` and finally
    trains the reloaded component on data that only carries a lookup table.

    Args:
        tmp_path: Temporary directory (a ``Path``) used as the model directory.
    """
    extra_pattern_slots = 5
    regex_definitions = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    original = RegexFeaturizer.create(
        {"number_additional_patterns": extra_pattern_slots}, RasaNLUModelConfig()
    )

    text = "hey how are you today 19.12.2019 ?"
    example = Message(data={TEXT: text})
    example.set(RESPONSE, text)
    example.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([example]))

    original.train(
        TrainingData([example], regex_features=regex_definitions),
        RasaNLUModelConfig(),
    )

    model_dir = str(tmp_path)
    persisted_meta = original.persist("ftr", model_dir)

    # Test all artifacts stored as part of persist
    assert persisted_meta["file"] == "ftr"
    for artifact_name in ("ftr.patterns.pkl", "ftr.vocabulary_stats.pkl"):
        assert (tmp_path / artifact_name).exists()
    # 3 trained patterns + 5 additional slots -> 8 slots, 3 of them filled.
    assert original.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    reloaded = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": extra_pattern_slots,
            "file": persisted_meta["file"],
        },
        should_finetune=True,
        model_dir=model_dir,
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert reloaded.known_patterns == original.known_patterns
    assert reloaded.finetune_mode
    assert reloaded.pattern_vocabulary_stats == original.vocabulary_stats

    lookup_only_data = TrainingData()
    lookup_only_data.lookup_tables = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]
    reloaded.train(lookup_only_data)

    # Test merging of a new pattern to an already trained component.
    assert len(reloaded.known_patterns) == 4
    assert reloaded.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
def test_regex_featurizer_train():
    """Check the sparse features a ``RegexFeaturizer`` sets during training.

    A single message is trained with three regex patterns and no additional
    pattern slots. For both ``TEXT`` and ``RESPONSE`` the sequence features
    must have shape ``(6, 3)`` and the sentence features shape ``(1, 3)``;
    for ``INTENT`` no sparse features are expected at all.
    """
    # NOTE(review): this file contains a second function named
    # `test_regex_featurizer_train` right after this one; a later definition
    # rebinds the name at module level, so confirm which variant should run.

    def _unwrap(feature):
        # Same as `if feature: feature = feature.features`: falsy values
        # (e.g. None) are passed through unchanged.
        return feature.features if feature else feature

    regex_definitions = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    component = RegexFeaturizer.create(
        {"number_additional_patterns": 0}, RasaNLUModelConfig()
    )

    text = "hey how are you today 19.12.2019 ?"
    example = Message(data={TEXT: text})
    example.set(RESPONSE, text)
    example.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([example]))

    component.train(
        TrainingData([example], regex_features=regex_definitions),
        RasaNLUModelConfig(),
    )

    first_row_expected = np.array([0, 1, 0])
    sentence_row_expected = np.array([1, 1, 1])

    # TEXT and RESPONSE hold the same sentence, so both get identical checks.
    for attribute in (TEXT, RESPONSE):
        sequence, sentence = example.get_sparse_features(attribute, [])
        sequence = _unwrap(sequence)
        sentence = _unwrap(sentence)

        assert sequence.shape == (6, 3)
        assert sentence.shape == (1, 3)
        assert np.all(sequence.toarray()[0] == first_row_expected)
        assert np.all(sentence.toarray()[-1] == sentence_row_expected)

    sequence, sentence = example.get_sparse_features(INTENT, [])
    sequence = _unwrap(sequence)
    sentence = _unwrap(sentence)

    assert sequence is None
    assert sentence is None
def test_regex_featurizer_train():
    """Check the sparse feature matrices a ``RegexFeaturizer`` sets in training.

    A single message is trained with three regex patterns and an empty
    component config. For both the text and the response attribute the stored
    sparse matrix must have shape ``(7, 3)``, its first row must equal
    ``[0, 1, 0]`` and its last row ``[1, 1, 1]``. Nothing is expected to be
    stored for the intent attribute.
    """
    # NOTE(review): an earlier function in this file has the same name,
    # `test_regex_featurizer_train`; this definition rebinds that name at
    # module level, so confirm which variant is meant to be kept.
    regex_definitions = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    component = RegexFeaturizer.create({}, RasaNLUModelConfig())

    text = "hey how are you today 19.12.2019 ?"
    example = Message(text)
    example.set(RESPONSE_ATTRIBUTE, text)
    example.set(INTENT_ATTRIBUTE, "intent")
    WhitespaceTokenizer().train(TrainingData([example]))

    component.train(
        TrainingData([example], regex_features=regex_definitions),
        RasaNLUModelConfig(),
    )

    first_row_expected = np.array([0, 1, 0])
    last_row_expected = np.array([1, 1, 1])

    # Text and response hold the same sentence, so both get identical checks.
    for attribute in (TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE):
        matrix = example.get(SPARSE_FEATURE_NAMES[attribute])
        assert matrix.shape == (7, 3)

        dense_rows = matrix.toarray()
        assert np.all(dense_rows[0] == first_row_expected)
        assert np.all(dense_rows[-1] == last_row_expected)

    intent_matrix = example.get(SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE])
    assert intent_matrix is None
def test_incremental_train_featurization(tmp_path: Path):
    """Check featurization before and after fine-tuning with extra patterns.

    A ``RegexFeaturizer`` with five additional pattern slots is trained on
    three patterns, persisted under ``tmp_path`` and reloaded with
    ``should_finetune=True``. The reloaded component is then trained on the
    original patterns plus one new pattern ("day") and one redefinition of an
    existing pattern name ("hello"). Feature vectors are 8 columns wide in
    both rounds.

    Args:
        tmp_path: Temporary directory (a ``Path``) used as the model directory.
    """

    def _unwrap(feature):
        # Same as `if feature: feature = feature.features`: falsy values
        # are passed through unchanged.
        return feature.features if feature else feature

    text = "hey how are you today 19.12.2019 ?"

    def _tokenized_message():
        # Build a fresh message for `text` and run the whitespace tokenizer
        # over it, exactly as required before each training round.
        fresh = Message(data={TEXT: text})
        fresh.set(RESPONSE, text)
        fresh.set(INTENT, "intent")
        WhitespaceTokenizer().train(TrainingData([fresh]))
        return fresh

    extra_pattern_slots = 5
    base_patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    component = RegexFeaturizer.create(
        {"number_additional_patterns": extra_pattern_slots}, RasaNLUModelConfig()
    )

    example = _tokenized_message()
    component.train(
        TrainingData([example], regex_features=base_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message
    first_row_expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    sentence_row_expected = np.array([1, 1, 1, 0, 0, 0, 0, 0])

    sequence, sentence = example.get_sparse_features(TEXT, [])
    sequence = _unwrap(sequence)
    sentence = _unwrap(sentence)

    assert sequence.shape == (6, 8)
    assert sentence.shape == (1, 8)
    assert np.all(sequence.toarray()[0] == first_row_expected)
    assert np.all(sentence.toarray()[-1] == sentence_row_expected)

    model_dir = str(tmp_path)
    persisted_meta = component.persist("ftr", model_dir)
    reloaded = RegexFeaturizer.load(
        meta={
            "number_additional_patterns": extra_pattern_slots,
            "file": persisted_meta["file"],
        },
        should_finetune=True,
        model_dir=model_dir,
    )

    day_pattern = {"pattern": "\\btoday*", "name": "day", "usage": "intent"}
    updated_hello = {"pattern": "\\bhey+", "name": "hello", "usage": "intent"}
    new_patterns = [day_pattern, updated_hello]

    example = _tokenized_message()
    reloaded.train(
        TrainingData([example], regex_features=base_patterns + new_patterns),
        RasaNLUModelConfig(),
    )

    # Test featurization of message, this time for the extra pattern as well.
    first_row_expected = np.array([0, 1, 0, 0, 0, 0, 0, 0])
    second_to_last_row_expected = np.array([0, 0, 0, 1, 0, 0, 0, 0])
    sentence_row_expected = np.array([1, 1, 1, 1, 0, 0, 0, 0])

    sequence, sentence = example.get_sparse_features(TEXT, [])
    sequence = _unwrap(sequence)
    sentence = _unwrap(sentence)

    assert sequence.shape == (6, 8)
    assert sentence.shape == (1, 8)
    sequence_rows = sequence.toarray()
    assert np.all(sequence_rows[0] == first_row_expected)
    assert np.all(sequence_rows[-2] == second_to_last_row_expected)
    assert np.all(sentence.toarray()[-1] == sentence_row_expected)

    # we also modified a pattern, check if that is correctly modified
    hello_patterns = [
        known for known in reloaded.known_patterns if known["name"] == "hello"
    ]
    assert hello_patterns == [updated_hello]