def test_preserve_sentence_and_sequence_features_old_config():
    attribute = "text"
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
    lm_featurizer.process(message)

    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
    lm_docs = lm_featurizer._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]
    hf_docs = transformers_nlp._get_docs_for_batch(
        [message], attribute=attribute, inference_mode=True
    )[0]

    # The features computed by the deprecated HFTransformersNLP component
    # (bert) must be preserved; the new featurizer (gpt2) must not overwrite
    # them.
    assert not (message.features[0].features == lm_docs[SEQUENCE_FEATURES]).any()
    assert not (message.features[1].features == lm_docs[SENTENCE_FEATURES]).any()
    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()
def test_lm_featurizer_edge_cases(
    model_name, model_weights, texts, expected_tokens, expected_indices
):
    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    lm_featurizer = LanguageModelFeaturizer(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens, expected_indices):
        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        lm_featurizer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
def inner(config: Dict[Text, Any]) -> LanguageModelFeaturizer:
    return LanguageModelFeaturizer.create(
        config={**LanguageModelFeaturizer.get_default_config(), **config},
        model_storage=default_model_storage,
        resource=resource_language_model_featurizer,
        execution_context=default_execution_context,
    )
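# Illustrative only: `inner` is the kind of factory a pytest fixture would
# return, so a test can pass a partial config and let the factory merge in the
# component defaults, e.g.:
#
#     featurizer = inner({"model_name": "bert", "model_weights": "bert-base-uncased"})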
def test_attention_mask(
    actual_sequence_length: int, max_input_sequence_length: int, zero_start_index: int
):
    component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True)

    attention_mask = component._compute_attention_mask(
        [actual_sequence_length], max_input_sequence_length
    )
    mask_ones = attention_mask[0][:zero_start_index]
    mask_zeros = attention_mask[0][zero_start_index:]

    assert np.all(mask_ones == 1)
    assert np.all(mask_zeros == 0)
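# For reference, the mask checked above is the standard transformer padding
# mask: ones for real tokens, zeros for padding positions. A minimal standalone
# sketch in plain numpy, independent of the featurizer internals:
def example_attention_mask(actual_length: int, padded_length: int) -> np.ndarray:
    mask = np.zeros(padded_length, dtype=np.int32)
    mask[:actual_length] = 1  # e.g. (3, 5) -> [1, 1, 1, 0, 0]
    return mask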
def train_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    lm_featurizer = LanguageModelFeaturizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)
    return messages
class LaBSEScorer:
    def __init__(self):
        self.featurizer = LanguageModelFeaturizer(
            {"model_name": "bert", "model_weights": "rasa/LaBSE"}
        )
        self.tokenizer = WhitespaceTokenizer()

    @staticmethod
    def compute_similarity_score(
        feature_vec_a: np.ndarray, feature_vec_b: np.ndarray
    ) -> float:
        # Cosine *similarity*: scipy's `cosine` returns the distance.
        return 1 - cosine(feature_vec_a, feature_vec_b)

    def compute_similarity_for_pair(self, a: Message, b: Message) -> float:
        # Assumes the first Features entry on each message holds the vector
        # to compare.
        features_a = a.features[0].features
        features_b = b.features[0].features
        return self.compute_similarity_score(features_a, features_b)

    def compute_features(self, example: Message) -> None:
        self.tokenizer.process(example)
        self.featurizer.process(example)

    def compute_similarity_with_paraphrases(self, example: Message) -> List[float]:
        # Set features for the text of the example itself first. The example
        # has to be tokenized before featurization, just like the paraphrases.
        self.compute_features(example)

        paraphrases = example.get("metadata").get("paraphrases")
        similarity_scores = []

        # Construct an individual message for each paraphrase.
        for paraphrase in paraphrases:
            message = Message.build(text=paraphrase)
            self.compute_features(message)
            similarity = self.compute_similarity_for_pair(example, message)
            similarity_scores.append(similarity)
        return similarity_scores

    def compute_similarities(self, examples: List[Message]) -> List[List[float]]:
        scores_for_collection = []
        for example in examples:
            similarity_scores = self.compute_similarity_with_paraphrases(example)
            scores_for_collection.append(similarity_scores)
        return scores_for_collection
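# A minimal usage sketch for the scorer above. The full flow (a Message with a
# "paraphrases" metadata key fed to compute_similarity_with_paraphrases)
# requires downloading the LaBSE weights, so this only exercises the pure
# scoring helper with made-up vectors:
def demo_similarity_score() -> None:
    vec_a = np.array([1.0, 0.0, 1.0])
    vec_b = np.array([1.0, 1.0, 1.0])
    # 1 - cosine distance: identical vectors score 1.0; these score ~0.816.
    print(LaBSEScorer.compute_similarity_score(vec_a, vec_b))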
def test_input_padding(
    token_ids: List[List[int]],
    max_sequence_length_model: int,
    resulting_length: int,
    padding_added: bool,
):
    component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True)
    component.pad_token_id = 0

    padded_input = component._add_padding_to_batch(
        token_ids, max_sequence_length_model
    )

    assert len(padded_input[0]) == resulting_length
    if padding_added:
        original_length = len(token_ids[0])
        assert np.all(np.array(padded_input[0][original_length:]) == 0)
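# For reference: with pad_token_id = 0, padding a batch like [[1, 2, 3]] to a
# model maximum of 5 is expected to yield [[1, 2, 3, 0, 0]] -- the real ids
# first, zeros appended. A standalone sketch of that behaviour, independent of
# the component:
def example_padding(
    token_ids: List[List[int]], max_len: int, pad_id: int = 0
) -> List[List[int]]:
    return [ids + [pad_id] * (max_len - len(ids)) for ids in token_ids]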
def process_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    lm_featurizer = LanguageModelFeaturizer(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process(message)
        lm_featurizer.process(message)
        messages.append(message)
    return messages
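# Illustrative only: how the two helpers above might be driven. "rasa/LaBSE" is
# one publicly available set of bert weights, used here purely as an example.
def demo_helpers() -> None:
    texts = ["hello world", "how are you"]
    trained_messages = train_texts(texts, "bert", "rasa/LaBSE")
    processed_messages = process_texts(texts, "bert", "rasa/LaBSE")
    # Both code paths should return one featurized message per input text.
    assert len(trained_messages) == len(processed_messages) == len(texts)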
def test_lm_featurizer_shape_values(
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec
):
    transformers_config = {"model_name": model_name}
    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)
    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index in range(len(texts)):
        computed_sequence_vec, computed_sentence_vec = messages[
            index
        ].get_dense_features(TEXT, [])
        if computed_sequence_vec:
            computed_sequence_vec = computed_sequence_vec.features
        if computed_sentence_vec:
            computed_sentence_vec = computed_sentence_vec.features

        # Sequence features exclude the sentence (CLS) vector, hence `- 1`.
        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
        assert computed_sentence_vec.shape[0] == 1
        assert computed_sentence_vec.shape[1] == expected_shape[index][1]

        # Look at the value of the first dimension for a few starting timesteps.
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of the first five dimensions.
        assert np.allclose(
            computed_sentence_vec[0][:5], expected_cls_vec[index], atol=1e-5
        )

        intent_sequence_vec, intent_sentence_vec = messages[
            index
        ].get_dense_features(INTENT, [])
        if intent_sequence_vec:
            intent_sequence_vec = intent_sequence_vec.features
        if intent_sentence_vec:
            intent_sentence_vec = intent_sentence_vec.features

        # No dense features should be set for the intent attribute.
        assert intent_sequence_vec is None
        assert intent_sentence_vec is None
def test_log_deprecation_warning_with_old_config(caplog: LogCaptureFixture):
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)

    caplog.set_level(logging.DEBUG)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)
    lm_featurizer = LanguageModelFeaturizer(skip_model_load=True)
    caplog.clear()
    with caplog.at_level(logging.DEBUG):
        lm_featurizer.process(message)

    assert "deprecated component HFTransformersNLP" in caplog.text
def test_lm_featurizer_number_of_sub_tokens(text, expected_number_of_sub_tokens):
    # Testing a single model should be enough here.
    config = {
        "model_name": "bert",
        "model_weights": "bert-base-uncased",
    }

    lm_featurizer = LanguageModelFeaturizer(config)
    whitespace_tokenizer = WhitespaceTokenizer()

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
    ] == expected_number_of_sub_tokens
def test_lm_featurizer_shape_values_two_hf_nlp_components():
    # Variant that chains two (deprecated) HFTransformersNLP components;
    # renamed so it does not shadow test_lm_featurizer_shape_values above.
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[0]

    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)
    show_training_data(td)

    transformers_nlp_bert.train(td)
    show_training_data(td)
    transformers_nlp_gpt.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)
def test_lm_featurizer_correctly_handle_whitespace_token(
    text, tokens, expected_feature_tokens
):
    from rasa.nlu.tokenizers.tokenizer import Token

    config = {
        "model_name": "bert",
        "model_weights": "bert-base-chinese",
    }

    lm_featurizer = LanguageModelFeaturizer(config)

    message = Message.build(text=text)
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    result, _ = lm_featurizer._tokenize_example(message, TEXT)

    assert [(token.text, token.start) for token in result] == expected_feature_tokens
def test_lm_featurizer_shape_values_old_dense_features():
    # Same shape checks against the old DENSE_FEATURE_NAMES API, where the
    # sentence vector is stored as the last row of one dense matrix; renamed
    # so it does not shadow test_lm_featurizer_shape_values above.
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[3]

    transformers_config = {"model_name": model_name}
    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)
    show_training_data(td)

    transformers_nlp.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)

    for index in range(len(texts)):
        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
        computed_sequence_vec, computed_sentence_vec = (
            computed_feature_vec[:-1],
            computed_feature_vec[-1],
        )

        assert computed_feature_vec.shape == expected_shape[index]

        # Look at the value of the first dimension for a few starting timesteps.
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of the first five dimensions.
        assert np.allclose(
            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])
        assert intent_vec is None
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
):
    config = {"model_name": model_name, "model_weights": model_weights}
    featurizer = LanguageModelFeaturizer(config)

    text = " ".join(["hi"] * sequence_length)
    tokenizer = WhitespaceTokenizer()
    message = Message.build(text=text)
    td = TrainingData([message])
    tokenizer.train(td)

    caplog.set_level(logging.DEBUG)
    featurizer.process(message)
    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
def test_long_sequences_extra_padding(
    sequence_embeddings: np.ndarray,
    actual_sequence_lengths: List[int],
    model_name: Text,
    padding_needed: bool,
):
    component = LanguageModelFeaturizer(
        {"model_name": model_name}, skip_model_load=True
    )
    modified_sequence_embeddings = component._add_extra_padding(
        sequence_embeddings, actual_sequence_lengths
    )
    if not padding_needed:
        # Without padding, the embeddings must come back unchanged
        # (element-wise comparison, not a comparison of np.all() results).
        assert np.all(modified_sequence_embeddings == sequence_embeddings)
    else:
        assert modified_sequence_embeddings.shape[1] == actual_sequence_lengths[0]
        assert (
            modified_sequence_embeddings[0].shape[-1]
            == sequence_embeddings[0].shape[-1]
        )
        # Everything appended beyond the original sequence must be zeros.
        zero_embeddings = modified_sequence_embeddings[0][
            sequence_embeddings.shape[1]:
        ]
        assert np.all(zero_embeddings == 0)
def test_sequence_length_overflow_train(
    input_sequence_length: int, model_name: Text, should_overflow: bool
):
    component = LanguageModelFeaturizer(
        {"model_name": model_name}, skip_model_load=True
    )
    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
    if should_overflow:
        with pytest.raises(RuntimeError):
            component._validate_sequence_lengths(
                [input_sequence_length], [message], "text", inference_mode=False
            )
    else:
        component._validate_sequence_lengths(
            [input_sequence_length], [message], "text", inference_mode=False
        )