def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifier],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    classifier = create_diet({"BILOU_flag": False})

    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    whitespace_tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
def test_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = WhitespaceTokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT, text)

    tk.train(TrainingData([message]))

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens
def test_split_intent_response_key(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = WhitespaceTokenizer(component_config)

    message = Message(text)
    message.set(INTENT_RESPONSE_KEY, text)

    assert [
        t.text for t in tk._split_intent(message, attribute=INTENT_RESPONSE_KEY)
    ] == expected_tokens
def test_whitespace_cls_token():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"use_cls_token": True}

    tk = WhitespaceTokenizer(component_config)

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]

    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19]
def test_check_correct_entity_annotations(text: Text, warnings: int):
    reader = MarkdownReader()
    tokenizer = WhitespaceTokenizer()

    training_data = reader.reads(text)
    tokenizer.train(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractor.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(text)
    tokens = tokenizer.tokenize(message, TEXT)

    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, confidences
    )
    assert actual_entities == expected_entities
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_check_correct_entity_annotations(
    text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizer
):
    reader = RasaYAMLReader()

    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
def test_elmo_featurizer_tokens_to_text(sentence, expected_text):
    tokens = WhitespaceTokenizer().tokenize(
        Message(sentence), attribute=TEXT_ATTRIBUTE
    )
    actual_text = ElmoFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
def test_elmo_featurizer_train():
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)
    message.set(TOKENS_NAMES[RESPONSE_ATTRIBUTE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
def test_count_vectors_featurizer_train():
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
):
    extractor = EntityExtractor()
    tokenizer = WhitespaceTokenizer()

    message = Message(data={TEXT: text})
    tokens = tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences
    )
    assert actual_entities == expected_entities
def test_multiword_entities():
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)

        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")

        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(
            entities[0], example.text, tokens
        )
        assert start == 4
        assert end == 7
def test_count_vectors_featurizer_train():
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    vecs = message.get(SPARSE_FEATURE_NAMES[TEXT])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[RESPONSE])

    assert (6, 5) == vecs.shape
    assert np.all(vecs.toarray()[0] == expected)
    assert np.all(vecs.toarray()[-1] == expected_cls)

    vecs = message.get(SPARSE_FEATURE_NAMES[INTENT])

    assert (1, 1) == vecs.shape
    assert np.all(vecs.toarray()[0] == np.array([1]))
def test_repeated_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))

    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")

    assert len(entities) == 1
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(
        entities[0], example.get(TEXT), tokens
    )
    assert start == 9
    assert end == 10
def test_train_tokenizer_action_name(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    # check action_name attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == [text]
def test_count_vector_featurizer_oov_words(sentence, expected):
    ftr = CountVectorsFeaturizer(
        {
            "OOV_token": "__oov__",
            "OOV_words": ["oov_word0", "OOV_word1"],
            "additional_vocabulary_size": {"text": 0},
        }
    )
    train_message = Message(data={TEXT: sentence})
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_multiword_entities(tmp_path):
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "show me flights to New York City",
        "intent": "unk",
        "entities": [
          {
            "entity": "destination",
            "start": 19,
            "end": 32,
            "value": "New York City"
          }
        ]
      }
    ]
  }
}"""
    f = tmp_path / "tmp_training_data.json"
    f.write_text(data, rasa.shared.utils.io.DEFAULT_ENCODING)

    td = load_data(str(f))

    assert len(td.entity_examples) == 1
    example = td.entity_examples[0]
    entities = example.get("entities")

    assert len(entities) == 1
    tokens = WhitespaceTokenizer().tokenize(example, attribute=TEXT)
    start, end = MitieEntityExtractor.find_entity(
        entities[0], example.get(TEXT), tokens
    )
    assert start == 4
    assert end == 7
def test_repeated_entities():
    data = """
{
  "rasa_nlu_data": {
    "common_examples" : [
      {
        "text": "book a table today from 3 to 6 for 3 people",
        "intent": "unk",
        "entities": [
          {
            "entity": "description",
            "start": 35,
            "end": 36,
            "value": "3"
          }
        ]
      }
    ]
  }
}"""
    with tempfile.NamedTemporaryFile(suffix="_tmp_training_data.json") as f:
        f.write(data.encode("utf-8"))
        f.flush()
        td = training_data.load_data(f.name)

        assert len(td.entity_examples) == 1
        example = td.entity_examples[0]
        entities = example.get("entities")

        assert len(entities) == 1
        tokens = WhitespaceTokenizer().tokenize(example.text)
        start, end = MitieEntityExtractor.find_entity(
            entities[0], example.text, tokens
        )
        assert start == 9
        assert end == 10
def test_count_vector_featurizer_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_whitespace():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    tk = WhitespaceTokenizer()

    assert [t.text for t in tk.tokenize("Forecast for lunch")] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13]

    # we ignore .,!?
    assert [t.text for t in tk.tokenize("hey ńöñàśçií how're you?")] == [
        "hey",
        "ńöñàśçií",
        "how",
        "re",
        "you",
    ]
    assert [t.offset for t in tk.tokenize("hey ńöñàśçií how're you?")] == [
        0,
        4,
        13,
        17,
        20,
    ]

    assert [
        t.text for t in tk.tokenize("привет! 10.000, ńöñàśçií. (how're you?)")
    ] == ["привет", "10.000", "ńöñàśçií", "how", "re", "you"]
    assert [
        t.offset for t in tk.tokenize("привет! 10.000, ńöñàśçií. (how're you?)")
    ] == [0, 8, 16, 27, 31, 34]

    # urls are single token
    assert [
        t.text
        for t in tk.tokenize(
            "https://www.google.com/search?client=safari&rls=en&q="
            "i+like+rasa&ie=UTF-8&oe=UTF-8 "
            "https://rasa.com/docs/nlu/components/#tokenizer-whitespace"
        )
    ] == [
        "https://www.google.com/search?"
        "client=safari&rls=en&q=i+like+rasa&ie=UTF-8&oe=UTF-8",
        "https://rasa.com/docs/nlu/components/#tokenizer-whitespace",
    ]
    assert [
        t.offset
        for t in tk.tokenize(
            "https://www.google.com/search?client=safari&rls=en&q="
            "i+like+rasa&ie=UTF-8&oe=UTF-8 "
            "https://rasa.com/docs/nlu/components/#tokenizer-whitespace"
        )
    ] == [0, 83]
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
):
    ftr = CountVectorsFeaturizer(
        {
            "token_pattern": r"(?u)\b\w+\b",
            "additional_vocabulary_size": {
                "text": 0,
                "response": 0,
                "action_text": 0,
            },
        }
    )
    tk = WhitespaceTokenizer()

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    tk.train(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    for module in [tk, ftr]:
        module.process(test_message)

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"use_shared_vocab": True})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
def test_flexible_nlu_pipeline():
    message = Message("This is a test message.", data={"intent": "test"})
    training_data = TrainingData([message, message, message, message, message])

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={FEATURIZER_CLASS_ALIAS: "cvf_word"}
    )
    featurizer.train(training_data)

    featurizer = CountVectorsFeaturizer(
        component_config={
            FEATURIZER_CLASS_ALIAS: "cvf_char",
            "min_ngram": 1,
            "max_ngram": 3,
            "analyzer": "char_wb",
        }
    )
    featurizer.train(training_data)

    featurizer = LexicalSyntacticFeaturizer({})
    featurizer.train(training_data)

    assert len(message.features) == 4
    assert message.features[0].origin == "cvf_word"
    # cvf word is also extracted for the intent
    assert message.features[1].origin == "cvf_word"
    assert message.features[2].origin == "cvf_char"
    assert message.features[3].origin == "LexicalSyntacticFeaturizer"

    feature_dim = (
        message.features[0].features.shape[1] + message.features[3].features.shape[1]
    )

    classifier = DIETClassifier(
        component_config={FEATURIZERS: ["cvf_word", "LexicalSyntacticFeaturizer"]}
    )
    model_data = classifier.preprocess_train_data(training_data)

    assert len(model_data.get("text_features")) == 1
    assert len(model_data.get("label_features")) == 1
    assert model_data.get("text_features")[0][0].shape == (6, feature_dim)
    assert model_data.get("label_features")[0][0].shape == (1, 1)
def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizer
    ],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """Creates a featurizer and processes the training data."""
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.process_training_data(td)
    lm_featurizer.process_training_data(td)
    return messages
def __init__(
    self,
    component_config: Optional[Dict[Text, Any]] = None,
    skip_model_load: bool = False,
) -> None:
    """Initializes HFTransformersNLP with the models specified."""
    super(HFTransformersNLP, self).__init__(component_config)

    self._load_model_metadata()
    self._load_model_instance(skip_model_load)
    self.whitespace_tokenizer = WhitespaceTokenizer()
    rasa.shared.utils.io.raise_warning(
        f"'{self.__class__.__name__}' is deprecated and "
        f"will be removed in the future. "
        f"It is recommended to use the '{LanguageModelFeaturizer.__name__}' "
        f"instead.",
        category=DeprecationWarning,
    )
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = create_featurizer()
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer(
        {"additional_vocabulary_size": {"text": 0, "response": 0}}
    )
    tk = WhitespaceTokenizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"
def test_persist_load_for_finetuning(tmp_path: Path):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    featurizer = RegexFeaturizer.create(
        {"number_additional_patterns": 5}, RasaNLUModelConfig()
    )
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message], regex_features=patterns), RasaNLUModelConfig()
    )

    persist_value = featurizer.persist("ftr", str(tmp_path))

    # Test all artifacts stored as part of persist
    assert persist_value["file"] == "ftr"
    assert (tmp_path / "ftr.patterns.pkl").exists()
    assert (tmp_path / "ftr.vocabulary_stats.pkl").exists()
    assert featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 3,
    }

    loaded_featurizer = RegexFeaturizer.load(
        meta={"number_additional_patterns": 5, "file": persist_value["file"]},
        should_finetune=True,
        model_dir=str(tmp_path),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode
    assert loaded_featurizer.pattern_vocabulary_stats == featurizer.vocabulary_stats

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
    assert loaded_featurizer.vocabulary_stats == {
        "max_number_patterns": 8,
        "pattern_slots_filled": 4,
    }