def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(
        {"case_sensitive": case_sensitive}, known_patterns=patterns
    )

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )

def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    ftr = create_featurizer({"use_word_boundaries": use_word_boundaries})
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)
    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()

    num_of_patterns = sum(len(lookup["elements"]) for lookup in lookups)
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )

    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)

    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)

async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[
        [Dict[Text, Any]], CRFEntityExtractorGraphComponent
    ],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
    spacy_nlp: Language,
):
    crf_extractor = crf_entity_extractor(config_params)

    # prepare training data: spaCy docs and tokens are required by the extractor
    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    spacy_model = SpacyModel(model=spacy_nlp, model_name="en_core_web_md")
    training_data = SpacyPreprocessor({}).process_training_data(
        training_data, spacy_model
    )
    training_data = spacy_tokenizer.process_training_data(training_data)

    crf_extractor.train(training_data)

    # process the same message once with the freshly trained extractor and
    # once with an extractor loaded from model storage
    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    messages = SpacyPreprocessor({}).process([message], spacy_model)
    message = spacy_tokenizer.process(messages)[0]
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        {**CRFEntityExtractorGraphComponent.get_default_config(), **config_params},
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    # loading from storage should reproduce the exact same predictions
    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"

def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)

def test_crf_use_dense_features(
    crf_entity_extractor: Callable[
        [Dict[Text, Any]], CRFEntityExtractorGraphComponent
    ],
    spacy_nlp: Any,
    spacy_featurizer: SpacyFeaturizerGraphComponent,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    component_config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    crf_extractor = crf_entity_extractor(component_config)

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process([message])
    spacy_featurizer.process([message])

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(0, len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )

def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    config = {"use_lemma": use_lemma, "OOV_words": ["drinks"], "OOV_token": "OOV"}
    ftr = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    spacy_tokenizer.process([train_message])
    spacy_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]), spacy_nlp=SpacyModel(spacy_nlp, "en"))
    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)

    loaded = load_featurizer(config)
    assert loaded.OOV_words == ftr.OOV_words

def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizerGraphComponent.create(
        SpacyTokenizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizerGraphComponent.create(
        CountVectorsFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizerGraphComponent.create(
        SpacyFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have sparse sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes

def spacy_tokenizer() -> SpacyTokenizerGraphComponent:
    return SpacyTokenizerGraphComponent(
        SpacyTokenizerGraphComponent.get_default_config()
    )