Example #1
def test_preserve_sentence_and_sequence_features_old_config():
    attribute = "text"
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP({
        "model_name": "bert",
        "model_weights": "bert-base-uncased"
    })
    transformers_nlp.process(message)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
    lm_featurizer.process(message)

    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
    lm_docs = lm_featurizer._get_docs_for_batch([message],
                                                attribute=attribute,
                                                inference_mode=True)[0]
    hf_docs = transformers_nlp._get_docs_for_batch([message],
                                                   attribute=attribute,
                                                   inference_mode=True)[0]
    assert not (message.features[0].features
                == lm_docs[SEQUENCE_FEATURES]).any()
    assert not (message.features[1].features
                == lm_docs[SENTENCE_FEATURES]).any()
    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()
Example #2
def test_lm_featurizer_edge_cases(model_name, model_weights, texts,
                                  expected_tokens, expected_indices):

    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    lm_featurizer = LanguageModelFeaturizer(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens,
                                           expected_indices):

        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        lm_featurizer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
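The parametrization for this test is omitted above. A minimal sketch of what one case could look like, assuming whitespace-level tokens with character offsets (the model and values below are illustrative, not taken from the Rasa test suite):

@pytest.mark.parametrize(
    "model_name, model_weights, texts, expected_tokens, expected_indices",
    [
        # "hi there" yields the whitespace tokens "hi" (0, 2) and "there" (3, 8).
        ("bert", None, ["hi there"], [["hi", "there"]], [[(0, 2), (3, 8)]]),
    ],
)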
Example #3
def inner(config: Dict[Text, Any]) -> LanguageModelFeaturizer:
    return LanguageModelFeaturizer.create(
        config={**LanguageModelFeaturizer.get_default_config(), **config},
        model_storage=default_model_storage,
        resource=resource_language_model_featurizer,
        execution_context=default_execution_context,
    )
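A hypothetical call to this factory, assuming the enclosing fixture exposes it under a name like create_language_model_featurizer (the fixture name and config values are assumptions, not from the source):

# Hypothetical fixture name; the config keys mirror the other examples.
featurizer = create_language_model_featurizer(
    {"model_name": "bert", "model_weights": "bert-base-uncased"}
)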
Example #4
def test_attention_mask(actual_sequence_length: int,
                        max_input_sequence_length: int, zero_start_index: int):
    component = LanguageModelFeaturizer({"model_name": "bert"},
                                        skip_model_load=True)

    attention_mask = component._compute_attention_mask(
        [actual_sequence_length], max_input_sequence_length)
    mask_ones = attention_mask[0][:zero_start_index]
    mask_zeros = attention_mask[0][zero_start_index:]

    assert np.all(mask_ones == 1)
    assert np.all(mask_zeros == 0)
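A concrete sketch of what the helper computes, with illustrative values (not the test's actual parametrization): a sequence of actual length 3 padded to a batch length of 5 should yield three ones followed by two zeros.

# Illustrative values: actual length 3, padded length 5.
component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True)
mask = component._compute_attention_mask([3], 5)
assert np.all(mask[0] == np.array([1, 1, 1, 0, 0]))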
Example #5
def train_texts(texts: List[Text], model_name: Text,
                model_weights: Text) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    lm_featurizer = LanguageModelFeaturizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)
    return messages
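A hypothetical call to train_texts (the texts and checkpoint are illustrative); after training, each message carries dense sequence and sentence features that can be read back as in Example #9:

# Illustrative texts and weights.
messages = train_texts(["hi there", "good morning"], "bert", "bert-base-uncased")
sequence_features, sentence_features = messages[0].get_dense_features(TEXT, [])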
Example #6
class LaBSEScorer:
    def __init__(self):
        self.featurizer = LanguageModelFeaturizer({
            "model_name": "bert",
            "model_weights": "rasa/LaBSE"
        })
        self.tokenizer = WhitespaceTokenizer()

    @staticmethod
    def compute_similarity_score(feature_vec_a: np.ndarray,
                                 feature_vec_b: np.ndarray):
        return 1 - cosine(feature_vec_a, feature_vec_b)

    def compute_similarity_for_pair(self, a: Message, b: Message):
        features_a = a.features[0].features
        features_b = b.features[0].features

        return self.compute_similarity_score(features_a, features_b)

    def compute_features(self, example: Message):
        self.tokenizer.process(example)
        self.featurizer.process(example)

    def compute_similarity_with_paraphrases(self, example: Message):

        # Set features for text of example itself first.
        self.featurizer.process(example)

        paraphrases = example.get("metadata").get("paraphrases")

        similarity_scores = []

        # construct individual message for each paraphrase
        for paraphrase in paraphrases:
            message = Message.build(text=paraphrase)
            self.compute_features(message)
            similarity = self.compute_similarity_for_pair(example, message)
            similarity_scores.append(similarity)

        return similarity_scores

    def compute_similarities(self,
                             examples: List[Message]) -> List[List[float]]:

        scores_for_collection = []

        for example in examples:

            similarity_scores = self.compute_similarity_with_paraphrases(
                example)
            scores_for_collection.append(similarity_scores)

        return scores_for_collection
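A minimal usage sketch for LaBSEScorer, assuming paraphrases are supplied under a "paraphrases" key in the message metadata (the texts are illustrative):

scorer = LaBSEScorer()
example = Message.build(text="book me a flight")
# compute_similarity_with_paraphrases reads paraphrases from the metadata.
example.set("metadata", {"paraphrases": ["reserve a flight for me", "order a pizza"]})
# The example itself must be tokenized before it can be featurized.
scorer.tokenizer.process(example)
scores = scorer.compute_similarity_with_paraphrases(example)  # one score per paraphrase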
Example #7
def test_input_padding(
    token_ids: List[List[int]],
    max_sequence_length_model: int,
    resulting_length: int,
    padding_added: bool,
):
    component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True)
    component.pad_token_id = 0
    padded_input = component._add_padding_to_batch(token_ids, max_sequence_length_model)
    assert len(padded_input[0]) == resulting_length
    if padding_added:
        original_length = len(token_ids[0])
        assert np.all(np.array(padded_input[0][original_length:]) == 0)
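An illustrative call (the token ids are assumed, not from the test's parametrization): padding a ragged batch should leave every row at the same length, with pad_token_id (0) filling the positions past the original tokens.

component = LanguageModelFeaturizer({"model_name": "bert"}, skip_model_load=True)
component.pad_token_id = 0
padded = component._add_padding_to_batch([[1, 2], [3, 4, 5]], 5)
assert len(padded[0]) == len(padded[1])
assert padded[0][2:] == [0] * (len(padded[0]) - 2)  # zeros after the real ids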
Example #8
def process_texts(texts: List[Text], model_name: Text,
                  model_weights: Text) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    lm_featurizer = LanguageModelFeaturizer(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process(message)
        lm_featurizer.process(message)
        messages.append(message)
    return messages
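process_texts mirrors train_texts from Example #5 but runs each message through process, i.e. the inference path rather than the training path. A hypothetical call (texts and checkpoint are illustrative):

messages = process_texts(["hi there"], "bert", "bert-base-uncased")
sequence_features, sentence_features = messages[0].get_dense_features(TEXT, [])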
Example #9
def test_lm_featurizer_shape_values(model_name, texts, expected_shape,
                                    expected_sequence_vec, expected_cls_vec):
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)

    transformers_nlp.train(td)
    lm_featurizer.train(td)

    for index in range(len(texts)):

        dense_features = messages[index].get_dense_features(TEXT, [])
        computed_sequence_vec, computed_sentence_vec = dense_features
        if computed_sequence_vec:
            computed_sequence_vec = computed_sequence_vec.features
        if computed_sentence_vec:
            computed_sentence_vec = computed_sentence_vec.features

        assert computed_sequence_vec.shape[0] == expected_shape[index][0] - 1
        assert computed_sequence_vec.shape[1] == expected_shape[index][1]
        assert computed_sentence_vec.shape[0] == 1
        assert computed_sentence_vec.shape[1] == expected_shape[index][1]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[:len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(computed_sentence_vec[0][:5],
                           expected_cls_vec[index],
                           atol=1e-5)

        intent_dense_features = messages[index].get_dense_features(INTENT, [])
        intent_sequence_vec, intent_sentence_vec = intent_dense_features
        if intent_sequence_vec:
            intent_sequence_vec = intent_sequence_vec.features
        if intent_sentence_vec:
            intent_sentence_vec = intent_sentence_vec.features

        assert intent_sequence_vec is None
        assert intent_sentence_vec is None
Example #10
def test_log_deprecation_warning_with_old_config(caplog: LogCaptureFixture):
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP(
        {"model_name": "bert", "model_weights": "bert-base-uncased"}
    )
    transformers_nlp.process(message)

    caplog.set_level(logging.DEBUG)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)
    lm_featurizer = LanguageModelFeaturizer(skip_model_load=True)
    caplog.clear()
    with caplog.at_level(logging.DEBUG):
        lm_featurizer.process(message)

    assert "deprecated component HFTransformersNLP" in caplog.text
Example #11
def test_lm_featurizer_number_of_sub_tokens(text,
                                            expected_number_of_sub_tokens):
    config = {
        "model_name": "bert",
        "model_weights": "bert-base-uncased",
    }  # Test for one should be enough

    lm_featurizer = LanguageModelFeaturizer(config)
    whitespace_tokenizer = WhitespaceTokenizer()

    message = Message.build(text=text)

    td = TrainingData([message])
    whitespace_tokenizer.train(td)
    lm_featurizer.train(td)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
    ] == expected_number_of_sub_tokens
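A plausible parametrization for this test, assuming both words of "hi there" are single sub-tokens in the bert-base-uncased vocabulary (the values are an assumption, not from the Rasa test suite):

@pytest.mark.parametrize(
    "text, expected_number_of_sub_tokens",
    [("hi there", [1, 1])],  # assumed: each word is one wordpiece
)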
Example #12
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[0]
    transformers_config = {"model_name": model_name}

    transformers_nlp_bert = HFTransformersNLP({"model_name": "bert"})
    transformers_nlp_gpt = HFTransformersNLP({"model_name": "gpt"})
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)
    transformers_nlp_bert.train(td)
    show_training_data(td)
    transformers_nlp_gpt.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)
Example #13
def test_lm_featurizer_correctly_handle_whitespace_token(
        text, tokens, expected_feature_tokens):
    from rasa.nlu.tokenizers.tokenizer import Token

    config = {
        "model_name": "bert",
        "model_weights": "bert-base-chinese",
    }

    lm_featurizer = LanguageModelFeaturizer(config)

    message = Message.build(text=text)
    message.set(TOKENS_NAMES[TEXT],
                [Token(word, start) for (word, start) in tokens])

    result, _ = lm_featurizer._tokenize_example(message, TEXT)

    assert [(token.text, token.start)
            for token in result] == expected_feature_tokens
Example #14
def test_lm_featurizer_shape_values():
    model_name, texts, expected_shape, expected_sequence_vec, expected_cls_vec = samples[3]
    transformers_config = {"model_name": model_name}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_featurizer = LanguageModelFeaturizer()

    messages = []
    for text in texts:
        messages.append(Message.build(text=text))
    td = TrainingData(messages)
    show_training_data(td)
    transformers_nlp.train(td)
    show_training_data(td)
    lm_featurizer.train(td)
    show_training_data(td)


    for index in range(len(texts)):
        computed_feature_vec = messages[index].get(DENSE_FEATURE_NAMES[TEXT])
        computed_sequence_vec, computed_sentence_vec = (
            computed_feature_vec[:-1],
            computed_feature_vec[-1],
        )

        assert computed_feature_vec.shape == expected_shape[index]

        # Look at the value of first dimension for a few starting timesteps
        assert np.allclose(
            computed_sequence_vec[: len(expected_sequence_vec[index]), 0],
            expected_sequence_vec[index],
            atol=1e-5,
        )

        # Look at the first value of first five dimensions
        assert np.allclose(
            computed_sentence_vec[:5], expected_cls_vec[index], atol=1e-5
        )

        intent_vec = messages[index].get(DENSE_FEATURE_NAMES[INTENT])

        assert intent_vec is None
Example #15
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
):
    config = {"model_name": model_name, "model_weights": model_weights}

    featurizer = LanguageModelFeaturizer(config)

    text = " ".join(["hi"] * sequence_length)
    tokenizer = WhitespaceTokenizer()
    message = Message.build(text=text)
    td = TrainingData([message])
    tokenizer.train(td)
    caplog.set_level(logging.DEBUG)
    featurizer.process(message)
    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
Example #16
def test_long_sequences_extra_padding(
    sequence_embeddings: np.ndarray,
    actual_sequence_lengths: List[int],
    model_name: Text,
    padding_needed: bool,
):
    component = LanguageModelFeaturizer({"model_name": model_name},
                                        skip_model_load=True)
    modified_sequence_embeddings = component._add_extra_padding(
        sequence_embeddings, actual_sequence_lengths)
    if not padding_needed:
        assert np.all(modified_sequence_embeddings == sequence_embeddings)
    else:
        assert (modified_sequence_embeddings.shape[1]
                == actual_sequence_lengths[0])
        assert (modified_sequence_embeddings[0].shape[-1]
                == sequence_embeddings[0].shape[-1])
        zero_embeddings = modified_sequence_embeddings[0][
            sequence_embeddings.shape[1]:]
        assert np.all(zero_embeddings == 0)
Example #17
def test_sequence_length_overflow_train(
    input_sequence_length: int, model_name: Text, should_overflow: bool
):
    component = LanguageModelFeaturizer(
        {"model_name": model_name}, skip_model_load=True
    )
    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
    if should_overflow:
        with pytest.raises(RuntimeError):
            component._validate_sequence_lengths(
                [input_sequence_length], [message], "text", inference_mode=False
            )
    else:
        component._validate_sequence_lengths(
            [input_sequence_length], [message], "text", inference_mode=False
        )
Example #18
def __init__(self):
    self.featurizer = LanguageModelFeaturizer({
        "model_name": "bert",
        "model_weights": "rasa/LaBSE"
    })
    self.tokenizer = WhitespaceTokenizer()