# module-level imports the tests below rely on
import numpy as np

from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.training_data import Message


def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
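# Illustrative sketch only, not the real RegexFeaturizer: the assertions above
# rely on a featurizer that emits one feature per known pattern (1.0 when the
# pattern occurs anywhere in the sentence) and annotates every token with a
# {pattern_name: matched} dict. The helper below is a hypothetical, standalone
# version of that idea; its name, signature and token-span format are
# assumptions made for this sketch.
import re


def sketch_features_for_patterns(text, token_spans, patterns):
    """Return per-pattern sentence features and per-token match dicts."""
    features = np.zeros(len(patterns))
    token_flags = [{} for _ in token_spans]
    for k, p in enumerate(patterns):
        matches = list(re.finditer(p["pattern"], text))
        if matches:
            features[k] = 1.0
        for i, (start, end) in enumerate(token_spans):
            # a token counts as matched if any regex match overlaps its span
            token_flags[i][p["name"]] = any(
                m.start() < end and m.end() > start for m in matches)
    return features, token_flags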
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    lookups = [
        {"name": 'drinks',
         "elements": ["mojito", "lemonade", "sweet berry wine",
                      "tea", "club?mate"]},
        {"name": 'plates',
         "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
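# Illustrative sketch only: the 'plates' lookup table above points at a plain
# text file, assumed to contain one element per line (analogous to the inline
# 'drinks' list). One common way to featurize such a table is to escape each
# element and join them into a single alternation regex; the helper below is a
# hypothetical sketch of that approach, not the library's implementation.
def sketch_lookup_regex(elements):
    """Combine lookup elements into one case-insensitive alternation regex."""
    # escaping keeps literal characters such as the '?' in "club?mate" from
    # being treated as regex syntax
    return "(?i)(" + "|".join(re.escape(e.strip()) for e in elements) + ")"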
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    for i, token in enumerate(message.get("tokens")):
        if i in labeled_tokens:
            assert token.get("pattern") in [0, 1]
        else:
            # if the token is not part of a regex the pattern should not be set
            assert token.get("pattern") is None
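# The tests above expect `sentence`, `expected` and `labeled_tokens` to be
# supplied via pytest parametrization, and `spacy_nlp` to be a fixture
# (typically defined in conftest.py). Below is a hypothetical sketch of such a
# fixture; the model name "en" is an assumption, not taken from this repo.
import pytest
import spacy


@pytest.fixture(scope="session")
def spacy_nlp():
    # load the spaCy model once per test session and reuse it across tests
    return spacy.load("en")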