from typing import Dict, List, Text

import numpy as np

from rasa.nlu.constants import (
    DENSE_FEATURE_NAMES,
    INTENT,
    NUMBER_OF_SUB_TOKENS,
    TEXT,
    TOKENS_NAMES,
)
from rasa.nlu.featurizers.dense_featurizer.lm_featurizer import LanguageModelFeaturizer
from rasa.nlu.tokenizers.lm_tokenizer import LanguageModelTokenizer
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.training_data import Message, TrainingData
from rasa.nlu.utils.hugging_face.hf_transformers import HFTransformersNLP


def train_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    """Run the whitespace tokenizer and the transformer component over the texts."""
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    transformer.train(td)
    return messages

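# train_texts calls create_pretrained_transformers_config, which is not defined
# in this snippet. A minimal sketch of what it plausibly returns, assuming
# HFTransformersNLP's standard "model_name"/"model_weights" config keys:
def create_pretrained_transformers_config(
    model_name: Text, model_weights: Text
) -> Dict[Text, Text]:
    config = {"model_name": model_name}
    if model_weights:
        # Only override the default weights when an explicit checkpoint is given.
        config["model_weights"] = model_weights
    return config


# Usage sketch ("bert" / "bert-base-uncased" are illustrative choices, not
# pinned by anything in this module):
#
#     messages = train_texts(
#         ["good morning", "see you later"], "bert", "bert-base-uncased"
#     )
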
def test_lm_featurizer_shape_values(
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec
):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index in range(len(texts)):
        computed_sequence_vec, computed_sentence_vec = messages[
            index
        ].get_dense_features(TEXT, [])
        if computed_sequence_vec:
            computed_sequence_vec = computed_sequence_vec.features
        if computed_sentence_vec:
            computed_sentence_vec = computed_sentence_vec.features

        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
        assert computed_sentence_vec.shape[0] == 1
        assert computed_sentence_vec.shape[1] == expected_shape[index][1]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-5
        )

        intent_sequence_vec, intent_sentence_vec = messages[
            index
        ].get_dense_features(INTENT, [])
        if intent_sequence_vec:
            intent_sequence_vec = intent_sequence_vec.features
        if intent_sentence_vec:
            intent_sentence_vec = intent_sentence_vec.features

        assert intent_sequence_vec is None
        assert intent_sentence_vec is None

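# The test above (and the two variants at the end of this module) is driven by
# a parametrized sample table. A minimal sketch of its layout follows: entries
# are (model_name, texts, expected_shape, expected_sequence_vec,
# expected_cls_vec). Every number below is a PLACEHOLDER, not a real model
# activation, and would need to be filled in from actual featurizer runs.
samples = [
    ("bert", ["Good evening."], [(4, 768)], [np.zeros(3)], [np.zeros(5)]),
    ("gpt", ["Good evening."], [(4, 768)], [np.zeros(3)], [np.zeros(5)]),
    ("gpt2", ["Good evening."], [(4, 768)], [np.zeros(3)], [np.zeros(5)]),
    ("xlnet", ["Good evening."], [(4, 768)], [np.zeros(3)], [np.zeros(5)]),
]
# In the real suite, @pytest.mark.parametrize unpacks such a table into the
# test's arguments, e.g.
#
# @pytest.mark.parametrize(
#     "model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec",
#     samples,
# )
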
def test_lm_tokenizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    message = Message(text)
    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])[:-1]
    ] == expected_number_of_sub_tokens

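# A sketch of the parametrization that would drive the sub-token test above.
# The counts assume bert-base-uncased WordPiece splitting ("embeddings" ->
# "em", "##bed", "##ding", "##s") and are an assumption, not a pinned fixture:
#
# @pytest.mark.parametrize(
#     "text, expected_number_of_sub_tokens",
#     [("single", [1]), ("sentence embeddings", [1, 4])],
# )
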
def test_lm_tokenizer_custom_intent_symbol(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    transformers_config = {"model_name": "bert"}  # Test for one should be enough

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer(component_config)

    message = Message(text)
    message.set(INTENT, text)

    td = TrainingData([message])

    transformers_nlp.train(td)
    lm_tokenizer.train(td)

    assert [t.text for t in message.get(TOKENS_NAMES[INTENT])] == expected_tokens

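# The two variants below call show_training_data, which is not defined in this
# snippet. A plausible debugging sketch (an assumption, not the original
# helper):
def show_training_data(td: TrainingData) -> None:
    for message in td.training_examples:
        # Print the raw text plus the attribute keys each pipeline component
        # has attached so far (tokens, dense features, ...).
        print(message.text, sorted(message.data.keys()))
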
def test_lm_featurizer_shape_values_two_transformers():
    # Debug variant of the shape/value test: run two different transformer
    # components ("bert", then "gpt") over the same training data, dumping the
    # data after every step. Only the texts from the sample entry are needed.
    _, texts, _, _, _ = samples[0]

    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)

    transformers_nlp_bert.train(td)
    show_training_data(td)

    transformers_nlp_gpt.train(td)
    show_training_data(td)

    lm_featurizer.train(td)
    show_training_data(td)

def test_lm_featurizer_shape_values_legacy_features():
    # Same shape/value checks as the parametrized test above, but written
    # against the older API in which dense features live on the message under
    # DENSE_FEATURE_NAMES, with the sentence (CLS) vector as the last row.
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[
        3
    ]

    transformers_config = {"model_name": model_name}
    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)

    transformers_nlp.train(td)
    show_training_data(td)

    lm_featurizer.train(td)
    show_training_data(td)

    for index in range(len(texts)):
        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
        computed_sequence_vec, computed_sentence_vec = (
            computed_feature_vec[:-1],
            computed_feature_vec[-1],
        )

        assert computed_feature_vec.shape == expected_shape[index]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])
        assert intent_vec is None

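# To exercise these tests with pytest (the file path is illustrative):
#
#     pytest tests/nlu/featurizers/test_lm_featurizer.py -k lm_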