def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    patterns = [{
        "pattern": '[0-9]+',
        "name": "number",
        "usage": "intent"
    }, {
        "pattern": '\\bhey*',
        "name": "hello",
        "usage": "intent"
    }, {
        "pattern": '[0-1]+',
        "name": "binary",
        "usage": "intent"
    }]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    for i, token in enumerate(message.get("tokens")):
        if i in labeled_tokens:
            assert token.get("pattern") in [0, 1]
        else:
            # if the token is not part of a regex the pattern should not be set
            assert token.get("pattern") is None
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    lookups = [{
        "name": 'drinks',
        "elements": ["mojito", "lemonade", "sweet berry wine",
                     "tea", "club?mate"]
    }, {
        "name": 'plates',
        "elements": "data/test/lookup_tables/plates.txt"
    }]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    lookups = [
        {"name": 'drinks',
         "elements": ["mojito", "lemonade", "sweet berry wine",
                      "tea", "club?mate"]},
        {"name": 'plates',
         "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_spacy(spacy_nlp):
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer()

    assert [t.text for t in tk.tokenize(spacy_nlp("Forecast for lunch"))] == \
        ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize(spacy_nlp("Forecast for lunch"))] == \
        [0, 9, 13]

    assert [t.text for t in tk.tokenize(spacy_nlp("hey ńöñàśçií how're you?"))] == \
        ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?']
    assert [t.offset for t in tk.tokenize(spacy_nlp("hey ńöñàśçií how're you?"))] == \
        [0, 4, 13, 16, 20, 23]
def __init__(self, resource_name, backend, language_name):
    self.intent_examples = []
    self.entity_examples = []
    self.resource_name = resource_name
    self.files = util.recursively_find_files(resource_name)
    self.fformat = self.guess_format(self.files)
    self.tokenizer = None
    self.language_name = language_name

    if backend in ['mitie', 'mitie_sklearn']:
        from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
        self.tokenizer = MITIETokenizer()
    elif backend in ['spacy_sklearn']:
        from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
        self.tokenizer = SpacyTokenizer(language_name)
    else:
        from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
        self.tokenizer = WhitespaceTokenizer()
        warnings.warn(
            "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")

    if self.fformat == 'luis':
        self.load_luis_data(self.files[0])
    elif self.fformat == 'wit':
        self.load_wit_data(self.files[0])
    elif self.fformat == 'api':
        self.load_api_data(self.files)
    elif self.fformat == 'rasa_nlu':
        self.load_data(self.files[0])
    else:
        raise ValueError("unknown training file format : {0}".format(self.fformat))
def init_tokenizer(self, backend, nlp):
    if backend in [mitie.MITIE_BACKEND_NAME, mitie.MITIE_SKLEARN_BACKEND_NAME]:
        from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
        self.tokenizer = MITIETokenizer()
    elif backend in [spacy.SPACY_BACKEND_NAME]:
        from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
        self.tokenizer = SpacyTokenizer(nlp)
    else:
        from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
        self.tokenizer = WhitespaceTokenizer()
        warnings.warn(
            "backend not recognised by TrainingData : defaulting to tokenizing by splitting on whitespace")
def tokenizer_from_name(name, language):
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    if name == MitieTokenizer.name:
        return MitieTokenizer()
    elif name == SpacyTokenizer.name:
        import spacy
        nlp = spacy.load(language, parser=False, entity=False)
        return SpacyTokenizer(nlp)
    elif name == WhitespaceTokenizer.name:
        return WhitespaceTokenizer()
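# Minimal usage sketch for tokenizer_from_name (not part of the original source):
# it exercises the whitespace branch, which needs no spaCy model, and reads the
# component's registered `name` from the class instead of hard-coding a string.
# Token attributes (.text, .offset) follow the rasa_nlu Token class used in the
# tests above.
from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

tokenizer = tokenizer_from_name(WhitespaceTokenizer.name, language="en")
tokens = tokenizer.tokenize("Forecast for lunch")
print([t.text for t in tokens])     # ['Forecast', 'for', 'lunch']
print([t.offset for t in tokens])   # [0, 9, 13]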
def test_spacy(spacy_nlp):
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer()

    assert [t.text for t in tk.tokenize(spacy_nlp("Forecast for lunch"))] == \
        ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize(spacy_nlp("Forecast for lunch"))] == \
        [0, 9, 13]

    assert [t.text for t in tk.tokenize(spacy_nlp("hey ńöñàśçií how're you?"))] == \
        ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?']
    assert [t.offset for t in tk.tokenize(spacy_nlp("hey ńöñàśçií how're you?"))] == \
        [0, 4, 13, 16, 20, 23]
def tokenize_sentence(sentence, expected_result, language):
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer(language)
    assert tk.tokenize(sentence) == expected_result
def tokenize_sentence(sentence, expected_result):
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer(spacy_nlp_en)
    assert tk.tokenize(sentence) == expected_result
from rasa_nlu.train import load_data
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.utils.spacy_utils import SpacyNLP
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
import spacy

config = RasaNLUModelConfig()
training_data = load_data("data/examples/rasa/demo-rasa.json")

SpacyNLP(nlp=spacy.load("en")).train(training_data, config)
SpacyTokenizer().train(training_data, config)

print(training_data.training_examples[25].as_dict())

crf = CRFEntityExtractor()
filtered_data = crf.filter_trainable_entities(training_data.training_examples)

# Create Dataset
# dataset = crf._create_dataset(filtered_data)

## Convert Examples
dataset = []

## Convert JSON TO CRF
for training_example in filtered_data:
    entity_offsets = crf._convert_example(training_example)
    print("Entity Offset", entity_offsets)
    # b = crf._from_json_to_crf(training_example, entity_offsets)
    # print("JSON to CRF", b)
def tokenize_sentence(sentence, expected_result):
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer()
    assert tk.tokenize(sentence, spacy_nlp) == expected_result