def test_vocabulary_overflow_log():
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer(
        {"number_additional_patterns": 1},
        known_patterns=patterns,
        finetune_mode=True,
        pattern_vocabulary_stats={"max_number_patterns": 4, "pattern_slots_filled": 3},
    )
    additional_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhello+", "name": "greet", "usage": "intent"},
    ]
    with pytest.warns(UserWarning) as warning:
        featurizer.train(TrainingData([], regex_features=additional_patterns))
    assert (
        "The originally trained model was configured to handle "
        "a maximum number of 4 patterns" in warning[0].message.args[0]
    )
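
# Note on the slot arithmetic exercised above (assumed semantics of the
# "pattern_vocabulary_stats" argument, derived from the test data itself):
#     max_number_patterns = pattern_slots_filled + number_additional_patterns
#                         = 3 + 1 = 4
# Finetuning adds 2 new regex patterns on top of the 3 known ones, which would
# require 5 slots, so the featurizer is expected to raise the UserWarning that
# the assertion checks for.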
def test_lookup_tables_without_use_word_boundaries(
    sentence, tokens, expected, labeled_tokens
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cites", "elements": ["北京", "上海", "广州", "深圳", "杭州"]},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"]},
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
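
# In the full test module this test is driven by ``@pytest.mark.parametrize``;
# the decorator is not part of this excerpt. A hypothetical parametrization
# (the concrete values are illustrative only, assuming one feature column per
# lookup table in the order [cites, dates]) could look like:
#
# @pytest.mark.parametrize(
#     "sentence, tokens, expected, labeled_tokens",
#     [
#         (
#             "上海",
#             [("上海", 0)],
#             [[1.0, 0.0], [1.0, 0.0]],  # one row per token, last row is the sentence row
#             [0],
#         )
#     ],
# )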
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {
            "use_word_boundaries": use_word_boundaries,
            "number_additional_patterns": 0,
        }
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)
    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
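
# This test is likewise parametrized in the full module; the decorator is not
# shown in this excerpt. A hypothetical parametrization with illustrative
# values only: the single "how" lookup yields one pattern column, and for a
# sentence that starts with a standalone "how" both word-boundary settings
# should mark only the first token.
#
# @pytest.mark.parametrize(
#     "sentence, expected_sequence_features, expected_sentence_features, "
#     "labeled_tokens, use_word_boundaries",
#     [
#         ("how are you", [[1.0], [0.0], [0.0]], [1.0], [0.0], True),
#         ("how are you", [[1.0], [0.0], [0.0]], [1.0], [0.0], False),
#     ],
# )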
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
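
# A hypothetical parametrization for the test above (not the original fixture
# data; it assumes one feature column per lookup table, ordered
# [drinks, plates], and that "mojito" does not appear in plates.txt):
#
# @pytest.mark.parametrize(
#     "sentence, expected_sequence_features, expected_sentence_features, labeled_tokens",
#     [
#         (
#             "i want a mojito",
#             [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0]],
#             [1.0, 0.0],
#             [3.0],
#         )
#     ],
# )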