def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"case_sensitive": case_sensitive, "number_additional_patterns": 0},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )


def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_regex_featurizer_no_sequence(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )


def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nlp):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray()[0], expected, atol=1e-10)
    assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)


def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {
            "use_word_boundaries": use_word_boundaries,
            "number_additional_patterns": 0,
        }
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)


def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_regex_featurizer(spacy_nlp):
    """The last feature row belongs to the __CLS__ token and is the union of the rows above it."""
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    sentence, expected, labeled_tokens = (
        "hey how are you today",
        [
            [0.0, 1.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 0.0, 0.0],
            [0.0, 1.0, 0.0],
        ],
        [0],
    )
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    assert show_message(message, False) == {
        "response": "hey how are you today",
        "text": "hey how are you today",
    }
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)
    # After tokenization the message additionally holds the spacy doc and the tokens
    # ['hey', 'how', 'are', 'you', 'today', '__CLS__']:
    # assert show_message(message) == {
    #     "response": "hey how are you today",
    #     "text_spacy_doc": spacy_nlp("hey how are you today"),
    #     "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
    #     "text": "hey how are you today",
    # }

    # result = ftr._features_for_patterns(message, TEXT)
    ftr.process(message)  # featurizes both TEXT and RESPONSE
    show_message(message)
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0


def test_crf_json_from_BILOU(spacy_nlp):
    ext = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "bias",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    sentence = "I need a home cleaning close-by"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    r = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}


def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(
        component_config={
            "BILOU_flag": False,
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ],
        }
    )
    sentence = "I need a home cleaning close-by"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    rs = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}


def test_crf_use_dense_features(
    crf_entity_extractor: Callable[
        [Dict[Text, Any]], CRFEntityExtractorGraphComponent
    ],
    spacy_nlp: Any,
):
    component_config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    crf_extractor = crf_entity_extractor(component_config)

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )


def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    config = {"use_lemma": use_lemma, "OOV_words": ["drinks"], "OOV_token": "OOV"}
    ftr = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    spacy_tokenizer.process([train_message])
    spacy_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]), model=SpacyModel(spacy_nlp, "en"))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)

    loaded = load_featurizer(config)
    assert loaded.OOV_words == ftr.OOV_words


def test_crf_use_dense_features(spacy_nlp: Any):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features = message.get_dense_features(TEXT, [])

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )


def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):
    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model
    )
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)

    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]

    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        {**CRFEntityExtractor.get_default_config(), **config_params},
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"


def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)


def test_crf_extractor(spacy_nlp):
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=examples)
    tokenizer.train(training_data)
    extractor.train(training_data)

    sentence = "italian restaurant"
    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer.process(message)
    extractor.process(message)

    detected_entities = message.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"