def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in tokens
    ] == expected_number_of_sub_tokens

def create_whitespace_tokenizer(
    config: Optional[Dict[Text, Any]] = None
) -> WhitespaceTokenizerGraphComponent:
    return WhitespaceTokenizerGraphComponent(
        {
            **WhitespaceTokenizerGraphComponent.get_default_config(),
            **(config if config else {}),
        }
    )

async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractorGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"]
    )
    training_data = importer.get_nlu_data()
    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    whitespace_tokenizer.process([message])

    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    # Load a fresh extractor from the persisted resource and make sure it
    # produces the same result as the extractor that was just trained.
    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        CRFEntityExtractorGraphComponent.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    whitespace_tokenizer.process([train_message])
    whitespace_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)

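# For context: tests like the one above are driven by `pytest.mark.parametrize`
# decorators that are not shown in this excerpt. A hedged, illustrative
# parametrization (example values only, not the repository's actual cases)
# could look like the following, where each token of the repeated word counts
# once in the sequence features and the sentence feature sums the counts:
#
# @pytest.mark.parametrize(
#     "sentence, expected, expected_cls",
#     [("hello hello hello", [[1]], [[3]])],
# )
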
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    config = {"use_shared_vocab": use_shared_vocab}
    initial_cvf = create_featurizer(config)
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    initial_cvf.train(data)

    new_cvf = load_featurizer(config, is_finetuning=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)

    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)

def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizerGraphComponent._tokens_to_text([tokens])[0]

    assert expected_text == actual_text

def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]

def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    # pass the parametrized `load` flag through so both the freshly created and
    # the loaded featurizer are exercised
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None

def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Check that the component is loaded in finetune mode and retains the same
    # patterns and vocabulary statistics as the trained component.
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern into an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4

def create_whitespace_tokenizer(
    config: Optional[Dict] = None,
) -> WhitespaceTokenizerGraphComponent:
    config = config if config else {}
    return WhitespaceTokenizerGraphComponent(
        {**WhitespaceTokenizerGraphComponent.get_default_config(), **config}
    )

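# Illustrative usage of the factory above (a hedged sketch; the config key
# shown is an assumption for illustration, not taken from this test file).
# Any overrides passed in are merged on top of the component's default config,
# so only the keys to change need to be supplied:
#
# tokenizer = create_whitespace_tokenizer({"intent_tokenization_flag": True})
# tokenizer.process([Message(data={TEXT: "hello there"})])
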
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    whitespace_tokenizer.process_training_data(TrainingData([message]))

    training_data = TrainingData([message], regex_features=patterns)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None

def test_count_vector_featurizer_response_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features

    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None

def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizerGraphComponent):
    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])
    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]

def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    whitespace_tokenizer.process_training_data(TrainingData([message]))

    data = TrainingData([message])
    featurizer.train(data)
    featurizer.process_training_data(data)

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))

def test_check_correct_entity_annotations(
    text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizerGraphComponent
):
    reader = RasaYAMLReader()
    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )

def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifierGraphComponent],
):
    classifier = create_diet({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizerGraphComponent(
        WhitespaceTokenizerGraphComponent.get_default_config()
    )
    tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()

    assert entity_exists == entity_expected

def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"use_shared_vocab": True})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None

    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None

    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None

def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
) -> List[Message]:
    """Creates a featurizer and processes training data."""
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.process_training_data(td)
    lm_featurizer.process_training_data(td)
    return messages

def process_messages(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
) -> List[Message]:
    """Creates a featurizer and processes messages."""
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process([message])
        messages.append(message)
    lm_featurizer.process(messages)
    return messages

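# A minimal sketch of how the two helpers above might be used together in a
# test (the model name/weights and the feature comparison are illustrative
# assumptions, not taken from this file): features computed over training data
# should match features computed when the same texts arrive as live messages.
#
# def _example_compare_features(create_language_model_featurizer, whitespace_tokenizer):
#     texts = ["hello", "good morning"]
#     trained = process_training_text(
#         texts, "bert", "bert-base-uncased",
#         create_language_model_featurizer, whitespace_tokenizer,
#     )
#     processed = process_messages(
#         texts, "bert", "bert-base-uncased",
#         create_language_model_featurizer, whitespace_tokenizer,
#     )
#     for trained_msg, processed_msg in zip(trained, processed):
#         trained_seq, _ = trained_msg.get_dense_features(TEXT, [])
#         processed_seq, _ = processed_msg.get_dense_features(TEXT, [])
#         assert np.allclose(trained_seq.features, processed_seq.features)
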
def test_count_vector_featurizer_oov_token(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"OOV_token": "__oov__"})

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None

def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    initial_cvf = create_featurizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)

    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    new_cvf = load_featurizer(is_finetuning=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index

def check_subtokens(
    texts: List[Text],
    messages: List[Message],
    expected_number_of_sub_tokens: List[List[float]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Checks that we get the correct number of sub tokens."""
    for index, message in enumerate(messages):
        assert [
            t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
        ] == expected_number_of_sub_tokens[index]
        assert len(message.get(TOKENS_NAMES[TEXT])) == len(
            whitespace_tokenizer.tokenize(Message.build(text=texts[index]), TEXT)
        )

def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    config = {"model_name": model_name, "model_weights": model_weights}

    featurizer = create_language_model_featurizer(config)

    text = " ".join(["hi"] * sequence_length)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    caplog.set_level(logging.DEBUG)
    featurizer.process([message])

    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2

async def test_interpreter_parses_text_tokens(
    response_selector_interpreter: Interpreter,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    text = "Hello there"
    tokens = whitespace_tokenizer.tokenize(Message(data={"text": text}), "text")
    indices = [(t.start, t.end) for t in tokens]

    parsed_data = response_selector_interpreter.parse(text)
    assert "text_tokens" in parsed_data.keys()

    parsed_tokens = parsed_data.get("text_tokens")
    assert parsed_tokens == indices

def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    extractor = EntityExtractorMixin()

    message = Message(data={TEXT: text})
    tokens = whitespace_tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences
    )
    assert actual_entities == expected_entities

def test_count_vector_featurizer_persist_load(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    # set non-default values in the config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = create_featurizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([train_message1])
    whitespace_tokenizer.process([train_message2])
    data = TrainingData([train_message1, train_message2])

    train_ftr.train(data)
    train_ftr.process_training_data(data)

    # persist featurizer
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    test_ftr = load_featurizer(config)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check if the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    whitespace_tokenizer.process([test_message1])
    test_ftr.process([test_message1])
    test_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([test_message2])
    test_ftr.process([test_message2])

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features

    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features

    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features

    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())

def test_whitespace_language_support(language, is_not_supported):
    assert (
        language in WhitespaceTokenizerGraphComponent.not_supported_languages()
    ) == is_not_supported

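# The test above is parametrized elsewhere; an illustrative (assumed, not
# verbatim) set of cases would pair a language code with whether the
# WhitespaceTokenizer reports it as unsupported, for example:
#
# @pytest.mark.parametrize(
#     "language, is_not_supported",
#     [("en", False), ("zh", True)],
# )
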
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]
    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)
    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # Test featurization of message
    expected = np.array([1, 0])
    expected_cls = np.array([1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]
    new_sentence = "hey today"
    message = Message(data={TEXT: new_sentence})
    message.set(RESPONSE, new_sentence)
    message.set(INTENT, "intent")

    new_training_data = TrainingData([message], regex_features=patterns + new_patterns)
    whitespace_tokenizer.process_training_data(new_training_data)
    loaded_featurizer.train(new_training_data)
    loaded_featurizer.process_training_data(new_training_data)

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([1, 0, 0])
    expected_token_2 = np.array([0, 0, 1])
    expected_cls = np.array([1, 0, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[1] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # Check that the order of patterns is preserved after merging.
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == loaded_featurizer.known_patterns[old_index]["name"]

    # The "hello" pattern was also modified; check that the new definition
    # replaced the old one.
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]

def whitespace_tokenizer() -> WhitespaceTokenizerGraphComponent:
    return WhitespaceTokenizerGraphComponent(
        WhitespaceTokenizerGraphComponent.get_default_config()
    )