def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
):
    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
                ],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
        },
    ]
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                "entities": [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                "entities": [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
def test_lookup_tables_without_use_word_boundaries(
    sentence, tokens, expected, labeled_tokens
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cities", "elements": ["北京", "上海", "广州", "深圳", "杭州"]},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"]},
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
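# Note (explanatory, not original code): the test above disables
# "use_word_boundaries" because Python's \b only fires on transitions between
# \w and non-\w characters, and CJK characters count as \w. In running Chinese
# text there is no such transition around a lookup element like "北京", so a
# boundary-anchored pattern would never match.
def _cjk_boundary_example() -> None:
    import re

    assert re.search(r"\b北京\b", "我在北京工作") is None  # boundary anchors never match
    assert re.search(r"北京", "我在北京工作") is not None  # plain pattern matches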
def test_lowercase(
    text: Text,
    case_sensitive: bool,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})
    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = RegexEntityExtractor({"case_sensitive": case_sensitive})
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
):
    ftr = RegexFeaturizer(
        {"use_word_boundaries": use_word_boundaries, "number_additional_patterns": 0}
    )
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    (sequence_features, sentence_features) = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features, expected_sequence_features, atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features, expected_sentence_features, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
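# Illustrative sketch (not part of the original test module), assuming the
# featurizer compiles each lookup table into an alternation regex: with
# "use_word_boundaries" enabled the element "how" is matched roughly like
# r"\bhow\b", without it like r"how", which is what the expected features
# above encode for a sentence such as "show me".
def _word_boundary_example() -> None:
    import re

    assert re.search(r"\bhow\b", "show me") is None  # "how" inside "show" is ignored
    assert re.search(r"how", "show me") is not None  # matched without boundaries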
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }
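# Note on the numbers asserted above: the featurizer is created with 5
# additional pattern slots on top of the 3 regexes declared in `patterns`,
# so max_number_patterns == 3 + 5 == 8. Initially 3 slots are filled; the
# fine-tuning run adds one pattern from the "plates" lookup table, growing
# pattern_slots_filled from 3 to 4.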
def test_non_word_boundaries(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    non_word_boundary: List[Text],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})
    training_data = TrainingData()
    training_data.lookup_tables = [lookup]
    training_data.training_examples = [
        Message(
            data={
                TEXT: "I love New York",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "New York"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
        Message(
            data={
                TEXT: "I like apples",
                INTENT: "inform",
                ENTITIES: [{"entity": "fruit", "value": "apples"}],
            }
        ),
        Message(
            data={
                TEXT: "oranges are my fave",
                INTENT: "inform",
                ENTITIES: [{"entity": "fruit", "value": "oranges"}],
            }
        ),
    ]

    entity_extractor = FlashTextEntityExtractor(
        {"non_word_boundaries": non_word_boundary}
    )
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = [e["value"] for e in message.get(ENTITIES)]
    assert entities == expected_entities
def test_extract_patterns(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(training_data)

    assert actual_patterns == expected_patterns
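# Hypothetical parametrization example (assumed, not taken from the original
# test file) showing the input shapes `test_extract_patterns` works with: a
# lookup table dict and a regex feature dict, matching the structures used by
# the other tests in this module.
EXAMPLE_LOOKUP_TABLE = {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
EXAMPLE_REGEX_FEATURE = {"pattern": "[0-9]+", "name": "number", "usage": "intent"}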
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer({"number_additional_patterns": 0})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(
        sequence_features.toarray(), expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray(), expected_sentence_features, atol=1e-10
    )

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_train_and_process(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
    config: Dict[Text, Any],
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
    test_loading: bool,
):
    message = Message(data={TEXT: text})
    if test_loading:
        message_copy = copy.deepcopy(message)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = create_or_load_extractor(config)
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == expected_entities

    if test_loading:
        loaded_entity_extractor = create_or_load_extractor(config, load=True)
        loaded_entity_extractor.process([message_copy])
        assert loaded_entity_extractor.patterns == entity_extractor.patterns
def test_extract_patterns_use_only_lookup_tables_or_regex_features(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    use_lookup_tables: bool,
    use_regex_features: bool,
    expected_patterns: Dict[Text, Text],
):
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(
        training_data,
        use_lookup_tables=use_lookup_tables,
        use_regexes=use_regex_features,
    )

    assert actual_patterns == expected_patterns
def test_extract_patterns_use_only_entities_lookup_tables(
    entity: Text, lookup_tables: Dict[Text, Text], expected_patterns: Dict[Text, Text]
):
    training_data = TrainingData()
    if entity:
        training_data.training_examples = [
            Message(
                data={
                    "text": "text",
                    "entities": [{"entity": entity, "value": "text"}],
                }
            )
        ]
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]

    actual_patterns = pattern_utils.extract_patterns(
        training_data, use_only_entities=True
    )

    assert actual_patterns == expected_patterns
def test_regex_validation(
    lookup_tables: Dict[Text, List[Text]],
    regex_features: Dict[Text, Text],
    use_lookup_tables: bool,
    use_regex_features: bool,
):
    """Tests if exception is raised when regex patterns are invalid."""
    training_data = TrainingData()
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]
    if regex_features:
        training_data.regex_features = [regex_features]

    with pytest.raises(Exception) as e:
        pattern_utils.extract_patterns(
            training_data,
            use_lookup_tables=use_lookup_tables,
            use_regexes=use_regex_features,
        )

    assert "Model training failed." in str(e.value)
    assert "not a valid regex." in str(e.value)
    assert "Please update your nlu training data configuration" in str(e.value)
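# Hypothetical example of the kind of malformed input this validation test is
# parametrized with (assumed, not from the original file): a regex feature
# whose pattern cannot be compiled, e.g. an unterminated character class.
def _invalid_regex_example() -> None:
    import re

    broken_regex_feature = {"name": "broken", "pattern": "[0-9"}
    try:
        re.compile(broken_regex_feature["pattern"])
    except re.error:
        # this compile failure is what extract_patterns is expected to surface
        # as "... is not a valid regex." during training
        pass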