def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"case_sensitive": case_sensitive, "number_additional_patterns": 0},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
):
    ftr = CountVectorsFeaturizer({"use_lemma": use_lemma})

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    SpacyTokenizer().process(train_message)
    SpacyTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))
    ftr.process(test_message)

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)
def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nlp):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(result.toarray()[0], expected, atol=1e-10)
    assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]}) train_message = Message(data={TEXT: sentence}) test_message = Message(data={TEXT: sentence}) train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) SpacyTokenizer().process(train_message) SpacyTokenizer().process(test_message) featurizer.train(TrainingData([train_message])) featurizer.process(test_message) seq_vec, sen_vec = test_message.get_sparse_features(TEXT, []) if seq_vec: seq_vec = seq_vec.features if sen_vec: sen_vec = sen_vec.features assert isinstance(seq_vec, scipy.sparse.coo_matrix) assert sen_vec is None assert np.all(seq_vec.toarray() == expected)
def test_regex_featurizer_no_sequence(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizer()
    count_vectors_featurizer = CountVectorsFeaturizer()
    spacy_featurizer = SpacyFeaturizer()

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.train(training_data)
    count_vectors_featurizer.train(training_data)
    spacy_featurizer.train(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have one feature set per entity tag spec
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {"use_word_boundaries": use_word_boundaries, "number_additional_patterns": 0}
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
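# Illustrative sketch (not part of the original module): the "use_word_boundaries"
# option exercised above corresponds to wrapping a lookup element in \b...\b when the
# lookup table is turned into a regex, so a bare substring such as "how" stops matching
# inside longer words like "howdy". Plain `re` is used here just to show the effect.
import re

with_boundaries = re.compile(r"\bhow\b", re.IGNORECASE)
without_boundaries = re.compile(r"how", re.IGNORECASE)

assert with_boundaries.search("how are you") is not None
assert with_boundaries.search("howdy partner") is None  # boundary blocks the match
assert without_boundaries.search("howdy partner") is not None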
def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.data.get("pos") for t in tokens] == expected_pos_tags
def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    additional_vocabulary_size: int,
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(
        {"number_additional_patterns": additional_vocabulary_size},
        known_patterns=patterns,
    )

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def test_regex_featurizer(): """ Last one is union of value above :return: """ from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer sentence, expected, labeled_tokens = ( "hey how are you today", [ [0.0, 1.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 1.0, 0.0], ], [0], ) patterns = [ { "pattern": "[0-9]+", "name": "number", "usage": "intent" }, { "pattern": "\\bhey*", "name": "hello", "usage": "intent" }, { "pattern": "[0-1]+", "name": "binary", "usage": "intent" }, ] ftr = RegexFeaturizer({}, known_patterns=patterns) # adds tokens to the message tokenizer = SpacyTokenizer({}) message = Message(sentence, data={RESPONSE: sentence}) assert show_message(message, False) == { "response": "hey how are you today", "text": "hey how are you today" } message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) tokenizer.process(message) # assert show_message(message) == {'response': 'hey how are you today', 'text_spacy_doc': spacy_nlp("hey how are you today"), # 'tokens': ['hey', 'how', 'are', 'you', 'today', '__CLS__'], # 'text': 'hey how are you today'} # result = ftr._features_for_patterns(message, TEXT) ftr.process(message) # [TEXT, RESPONSE] show_message(message) assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = SpacyTokenizer(component_config)

    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
def test_crf_json_from_BILOU(spacy_nlp):
    ext = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "bias",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    r = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"B-what": 1.0},
            {"L-what": 1.0},
            {"B-where": 1.0},
            {"I-where": 1.0},
            {"L-where": 1.0},
        ],
    )
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"}
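# For reference (assumed token alignment, not from the original module): the BILOU
# label sequence fed to _from_crf_to_json above lines up with the spaCy tokens of the
# test sentence roughly like this, which is why exactly two entities come back:
#
#   I    need   a    home    cleaning  close    -        by
#   O    O      O    B-what  L-what    B-where  I-where  L-where
#
# B-/I-/L- mark the begin, inside, and last token of a multi-token entity; O is outside.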
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):
    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model
    )
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]

    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        {**CRFEntityExtractor.get_default_config(), **config_params},
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_spacy_intent_tokenizer(spacy_nlp_component):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.train(td, config=None)
    spacy_tokenizer = SpacyTokenizer()
    spacy_tokenizer.train(td, config=None)

    intent_tokens_exist = [
        example.get("intent_tokens") is not None for example in td.intent_examples
    ]

    # no intent tokens should have been set
    assert not any(intent_tokens_exist)
def test_spacy_add_cls_token(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    component_config = {"use_cls_token": True}

    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19]
def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(
        component_config={
            "BILOU_flag": False,
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ],
        }
    )

    sentence = "I need a home cleaning close-by"

    message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})

    tokenizer = SpacyTokenizer()
    tokenizer.process(message)

    rs = ext._from_crf_to_json(
        message,
        [
            {"O": 1.0},
            {"O": 1.0},
            {"O": 1.0},
            {"what": 1.0},
            {"what": 1.0},
            {"where": 1.0},
            {"where": 1.0},
            {"where": 1.0},
        ],
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r["confidence"]  # confidence should exist
        del r["confidence"]

    assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"}
    assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"}
    assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"}
    assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"}
    assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"}
def test_crf_use_dense_features(
    crf_entity_extractor: Callable[
        [Dict[Text, Any]], CRFEntityExtractorGraphComponent
    ],
    spacy_nlp: Any,
):
    component_config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    crf_extractor = crf_entity_extractor(component_config)

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    component_config = {"use_cls_token": False}

    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.lemma for t in tk.tokenize(spacy_nlp(text))] == [
        "forecast",
        "for",
        "lunch",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23]
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    spacy_tokenizer: SpacyTokenizer,
):
    config = {"use_lemma": use_lemma, "OOV_words": ["drinks"], "OOV_token": "OOV"}
    ftr = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    spacy_tokenizer.process([train_message])
    spacy_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]), model=SpacyModel(spacy_nlp, "en"))
    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)

    loaded = load_featurizer(config)
    assert loaded.OOV_words == ftr.OOV_words
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(RESPONSE, text)
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.process_training_data(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def test_crf_use_dense_features(spacy_nlp: Any):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features = message.get_dense_features(TEXT, [])
    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
def test_text_featurizer_using_pos(sentence, expected, spacy_nlp): featurizer = LexicalSyntacticFeaturizer({"features": [["pos", "pos2"]]}) train_message = Message(sentence) test_message = Message(sentence) train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence)) SpacyTokenizer().process(train_message) SpacyTokenizer().process(test_message) featurizer.train(TrainingData([train_message])) featurizer.process(test_message) assert isinstance(test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix) actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray() assert np.all(actual == expected)
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer({})
    message = Message(sentence, data={RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)