def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    """Verify the sub-token counts recorded by LanguageModelTokenizer.

    Runs HFTransformersNLP followed by LanguageModelTokenizer over a single
    message and compares each token's NUMBER_OF_SUB_TOKENS value against the
    expected list.
    """
    # A single model ("bert") is representative enough for this check.
    nlp = HFTransformersNLP({"model_name": "bert"})
    tokenizer = LanguageModelTokenizer()

    msg = Message(text)
    training_data = TrainingData([msg])
    nlp.train(training_data)
    tokenizer.train(training_data)

    # The final token is excluded from the comparison — presumably a special
    # trailing marker added by the pipeline (TODO confirm against tokenizer docs).
    observed = [
        token.get(NUMBER_OF_SUB_TOKENS)
        for token in msg.get(TOKENS_NAMES[TEXT])[:-1]
    ]
    assert observed == expected_number_of_sub_tokens
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    """Verify intent tokenization when a custom split symbol is configured.

    Enables intent tokenization with "+" as the split symbol, trains the
    HFTransformersNLP + LanguageModelTokenizer pipeline on a single message
    whose intent equals its text, and compares the resulting intent token
    texts against the expected list.
    """
    config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    # A single model ("bert") is representative enough for this check.
    nlp = HFTransformersNLP({"model_name": "bert"})
    tokenizer = LanguageModelTokenizer(config)

    msg = Message(text)
    msg.set(INTENT, text)
    training_data = TrainingData([msg])
    nlp.train(training_data)
    tokenizer.train(training_data)

    observed = [token.text for token in msg.get(TOKENS_NAMES[INTENT])]
    assert observed == expected_tokens