Example #1
def test_count_vector_featurizer_char(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
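    # character-level 1- and 2-grams instead of word-level token counts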
    ftr = create_featurizer({
        "min_ngram": 1,
        "max_ngram": 2,
        "analyzer": "char",
    })

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
Example #2
def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
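    # train on one message, then featurize an identical test message and compare the counts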
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    whitespace_tokenizer.process([train_message])
    whitespace_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)
Example #3
def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
):
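    # the fingerprint should change whenever tokens or features are added to the message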
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()
    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()

    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )

    fp3 = message.fingerprint()
    assert fp2 != fp3

    message.add_features(
        Features(np.array([1, 2, 2]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )

    fp4 = message.fingerprint()

    assert fp3 != fp4

    assert len({fp1, fp2, fp3, fp4}) == 4
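Example #4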
def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
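    # check that dense text features produced by SpacyFeaturizer end up in the CRF feature dicts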
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    spacy_featurizer = SpacyFeaturizer()
    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))

    white_space_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example #5
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
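    # train an extractor, then load the persisted model and check both produce identical results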
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"])
    training_data = importer.get_nlu_data()

    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    whitespace_tokenizer.process([message])
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
    assert list(loaded_extractor.entity_taggers.keys()) == list(
        crf_extractor.entity_taggers.keys())
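Example #6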
def test_features_are_sparse(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
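    # every feature added by the semantic map featurizer should be a scipy sparse matrix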
    message = Message.build("word1 word3")

    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert scipy.sparse.issparse(feature.features)
Example #7
def test_process_tokenizer_action_name(text: Text,
                                       expected_tokens: List[Text]):
    tk = WhitespaceTokenizer({"intent_tokenization_flag": True})
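    # intent_tokenization_flag also controls how intent-like attributes such as ACTION_NAME are split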

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    tk.process(message)

    tokens = message.get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == expected_tokens
Example #8
def test_process_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)

    tk.process(message)

    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])
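    # expected_indices holds the (start, end) character offsets of each token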

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #9
class LaBSEScorer:
    def __init__(self):
        self.featurizer = LanguageModelFeaturizer({
            "model_name": "bert",
            "model_weights": "rasa/LaBSE"
        })
        self.tokenizer = WhitespaceTokenizer()

    @staticmethod
    def compute_similarity_score(feature_vec_a: np.ndarray,
                                 feature_vec_b: np.ndarray):
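        # cosine() returns a distance, so 1 - distance is the similarity score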
        return 1 - cosine(feature_vec_a, feature_vec_b)

    def compute_similarity_for_pair(self, a: Message, b: Message):
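        # assumes the first Features object on each message holds the vector to compare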
        features_a = a.features[0].features
        features_b = b.features[0].features

        return self.compute_similarity_score(features_a, features_b)

    def compute_features(self, example: Message):
        self.tokenizer.process(example)
        self.featurizer.process(example)

    def compute_similarity_with_paraphrases(self, example: Message):

        # Set features for text of example itself first.
        self.featurizer.process(example)

        paraphrases = example.get("metadata").get("paraphrases")

        similarity_scores = []

        # construct individual message for each paraphrase
        for paraphrase in paraphrases:
            message = Message.build(text=paraphrase)
            self.compute_features(message)
            similarity = self.compute_similarity_for_pair(example, message)
            similarity_scores.append(similarity)

        return similarity_scores

    def compute_similarities(self,
                             examples: List[Message]) -> List[List[float]]:

        scores_for_collection = []

        for example in examples:

            similarity_scores = self.compute_similarity_with_paraphrases(
                example)
            scores_for_collection.append(similarity_scores)

        return scores_for_collection
Example #10
def test_process_tokenizer(text: Text, expected_tokens: List[Text],
                           expected_indices: List[Tuple[int]]):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)

    tk.process(message)

    tokens = message.get(TOKENS_NAMES[TEXT])

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #11
def process_texts(texts: List[Text], model_name: Text,
                  model_weights: Text) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)
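    # tokenize each text and run it through the HFTransformersNLP component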

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process(message)
        transformer.process(message)
        messages.append(message)
    return messages
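Example #12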
def test_feature_shapes(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
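    # two tokens should give a (2, 37) sequence feature matrix and a (1, 37) sentence vector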
    message = Message.build("word1 word3")

    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert (
            feature.type == FEATURE_TYPE_SEQUENCE
            and feature.features.shape == (2, 37)
        ) or (
            feature.type == FEATURE_TYPE_SENTENCE
            and feature.features.shape == (1, 37)
        )
Example #13
def process_messages(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[[Dict[Text, Any]],
                                               LanguageModelFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """ Creates a featurizer and processes messages """
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)
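    # tokenize each message individually, then featurize them all in one batch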

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process([message])
        messages.append(message)
    lm_featurizer.process(messages)
    return messages
Example #14
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({
        "token_pattern": r"(?u)\b\w+\b",
    })

    # add a second example that has some response, so that the vocabulary for
    # response exists
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    whitespace_tokenizer.process([test_message])
    ftr.process([test_message])
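    # only sequence-level features are expected for ACTION_NAME, no sentence-level ones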

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, [])
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
Example #15
def test_count_vector_featurizer_oov_words(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer(
        {"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]}
    )
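    # tokens listed in OOV_words are mapped to the OOV_token during featurization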
    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
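Example #16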
def test_whitespace_processing_with_attribute(
    supervised_embeddings_config: RasaNLUModelConfig,
):
    message = Message(
        data={
            TEXT: "Any Mexican restaurant will do",
            "intent": "restaurant_search",
            "entities": [
                {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
            ],
        }
    )
    expected_tokens_intent = ["restaurant_search"]
    expected_tokens_text = ["Any", "Mexican", "restaurant", "will", "do"]
    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    tk.process(message)
    tokens_intent = message.get(TOKENS_NAMES[INTENT])
    tk.process(message)
    tokens_text = message.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in tokens_intent] == expected_tokens_intent
    assert [t.text for t in tokens_text] == expected_tokens_text

    message = Message(
        data={
            TEXT: "Where are you going?",
            ACTION_NAME: "Where are you going?",
            ACTION_TEXT: "Where are you going?",
        })
    expected_action_tokens_text = ["Where", "are", "you", "going"]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    tk.process(message)
    tokens_action_text = message.get(TOKENS_NAMES[ACTION_TEXT])
    tk.process(message)
    tokens_text = message.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in tokens_action_text] == expected_action_tokens_text
    assert [t.text for t in tokens_text] == expected_action_tokens_text
Example #17
def test_count_vector_featurizer_persist_load(
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    # set non default values to config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = create_featurizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([train_message1])
    whitespace_tokenizer.process([train_message2])

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)
    train_ftr.process_training_data(data)

    # persist featurizer
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )
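    # load a second featurizer from the persisted model and compare vectorizer parameters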

    test_ftr = load_featurizer(config)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check that the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    whitespace_tokenizer.process([test_message1])
    test_ftr.process([test_message1])
    test_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([test_message2])
    test_ftr.process([test_message2])

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
Example #18
def test_whitespace_with_case():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "forecast",
        "for",
        "lunch",
    ]

    component_config = {"case_sensitive": True}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    message = Message("Forecast for LUNCH")
    tk.process(message)
    assert message.data.get("tokens")[0].text == "forecast"
    assert message.data.get("tokens")[1].text == "for"
    assert message.data.get("tokens")[2].text == "lunch"
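    # training should also lowercase the tokens when case_sensitive is False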

    _config = utilities.base_test_conf("supervised_embeddings")
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    tk.train(TrainingData(training_examples=examples), _config)
    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"