Example #1
def test_spacy_training_sample_alignment(spacy_nlp_component: SpacyNLP,
                                         spacy_model: SpacyModel):
    from spacy.tokens import Doc

    m1 = Message.build(text="I have a feeling", intent="feeling")
    m2 = Message.build(text="", intent="feeling")
    m3 = Message.build(text="I am the last message", intent="feeling")
    td = TrainingData(training_examples=[m1, m2, m3])

    attribute_docs = spacy_nlp_component._docs_for_training_data(
        spacy_model.model, td)

    assert isinstance(attribute_docs["text"][0], Doc)
    assert isinstance(attribute_docs["text"][1], Doc)
    assert isinstance(attribute_docs["text"][2], Doc)

    assert [t.text for t in attribute_docs["text"][0]
            ] == ["i", "have", "a", "feeling"]
    assert [t.text for t in attribute_docs["text"][1]] == []
    assert [t.text for t in attribute_docs["text"][2]] == [
        "i",
        "am",
        "the",
        "last",
        "message",
    ]
Example #2
def test_replacing_fallback_intent():
    expected_intent = "greet"
    expected_confidence = 0.345
    fallback_prediction = {
        INTENT: {
            INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
            PREDICTED_CONFIDENCE_KEY: 1,
        },
        INTENT_RANKING_KEY: [
            {
                INTENT_NAME_KEY: DEFAULT_NLU_FALLBACK_INTENT_NAME,
                PREDICTED_CONFIDENCE_KEY: 1,
            },
            {
                INTENT_NAME_KEY: expected_intent,
                PREDICTED_CONFIDENCE_KEY: expected_confidence,
            },
            {INTENT_NAME_KEY: "some", PREDICTED_CONFIDENCE_KEY: 0.1},
        ],
    }

    interpreter = ConstantInterpreter(fallback_prediction)
    training_data = TrainingData(
        [Message.build("hi", "greet"), Message.build("bye", "bye")]
    )

    intent_evaluations, _, _ = get_eval_data(interpreter, training_data)

    assert all(
        prediction.intent_prediction == expected_intent
        and prediction.confidence == expected_confidence
        for prediction in intent_evaluations
    )
Example #3
def test_build_tag_id_dict():
    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    training_data = TrainingData([message_1, message_2])

    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)

    assert tag_id_dict == {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "L-location": 3,
        "U-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "L-organisation": 7,
        "U-organisation": 8,
    }
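The expected mapping above contains every BILOU prefix (B-, I-, L-, U-) for each entity type seen in the data, even for prefixes that never occur in the tag sequences. A minimal, implementation-independent sketch of that expansion (not Rasa's actual bilou_utils code; the function name is invented):

from typing import Dict, List

BILOU_PREFIXES = ["B-", "I-", "L-", "U-"]

def expand_bilou_tag_ids(tag_sequences: List[List[str]]) -> Dict[str, int]:
    # Collect the entity types behind the observed tags, then emit all four
    # BILOU variants per type, keeping "O" fixed at index 0.
    entity_types = sorted(
        {tag[2:] for tags in tag_sequences for tag in tags if tag != "O"}
    )
    tag_id_dict = {"O": 0}
    index = 1
    for entity_type in entity_types:
        for prefix in BILOU_PREFIXES:
            tag_id_dict[f"{prefix}{entity_type}"] = index
            index += 1
    return tag_id_dict

tags = [
    ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    ["U-location", "O", "O", "O", "O", "U-location"],
]
assert expand_bilou_tag_ids(tags)["U-organisation"] == 8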
Example #4
def test_preprocess_selector_multiple_retrieval_intents():

    # use some available data
    training_data = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa.yml"
    )
    training_data_responses = rasa.shared.nlu.training_data.loading.load_data(
        "data/examples/rasa/demo-rasa-responses.yml"
    )
    training_data_extra_intent = TrainingData(
        [
            Message.build(
                text="Is it possible to detect the version?", intent="faq/q1"
            ),
            Message.build(text="How can I get a new virtual env", intent="faq/q2"),
        ]
    )
    training_data = training_data.merge(training_data_responses).merge(
        training_data_extra_intent
    )

    response_selector = ResponseSelector()

    response_selector.preprocess_train_data(training_data)

    assert sorted(response_selector.all_retrieval_intents) == ["chitchat", "faq"]
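The retrieval intent is the part of the full intent name before the "/" separator, so "faq/q1" and "faq/q2" both contribute the single retrieval intent "faq", while "chitchat" comes from the demo responses data. A minimal illustration (the "chitchat/ask_name" sub-key is invented):

full_intents = ["faq/q1", "faq/q2", "chitchat/ask_name"]
retrieval_intents = sorted({name.split("/")[0] for name in full_intents})
assert retrieval_intents == ["chitchat", "faq"]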
Example #5
def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizerGraphComponent):

    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]
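The expected tag sequences follow the BILOU scheme: Beginning, Inside, Last, Unit, and Outside. As a quick, Rasa-independent illustration of how one entity span maps onto token character offsets (a sketch, not the apply_bilou_schema implementation):

from typing import List, Tuple

def bilou_tags_for_span(
    token_offsets: List[Tuple[int, int]], start: int, end: int, label: str
) -> List[str]:
    # Tag every token that falls inside the start..end character span.
    inside = [i for i, (s, e) in enumerate(token_offsets) if s >= start and e <= end]
    tags = ["O"] * len(token_offsets)
    if len(inside) == 1:
        tags[inside[0]] = f"U-{label}"
    elif inside:
        tags[inside[0]] = f"B-{label}"
        tags[inside[-1]] = f"L-{label}"
        for i in inside[1:-1]:
            tags[i] = f"I-{label}"
    return tags

# "Germany is part of the European Union": "European Union" spans characters 23-37.
offsets = [(0, 7), (8, 10), (11, 15), (16, 18), (19, 22), (23, 31), (32, 37)]
assert bilou_tags_for_span(offsets, 23, 37, "organisation") == [
    "O", "O", "O", "O", "O", "B-organisation", "L-organisation",
]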
Example #6
def read_collection_from_csv(file_path: Text) -> List[Message]:

    collection = read_from_csv(file_path)
    all_sentences = []
    for line in collection:
        if len(line) == 2:
            sentence, label = line[0], line[1]
            all_sentences.append(Message.build(text=sentence, intent=label))
        elif len(line) == 1:
            sentence = line[0]
            all_sentences.append(Message.build(text=sentence))
        else:
            raise RuntimeError(
                "Input CSV file does not adhere to the correct format")
    return all_sentences
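A hedged sketch of the CSV layout this helper expects: one row per example, either "sentence,label" or a single "sentence" column (the file name and contents below are invented):

import csv

# Hypothetical input later consumed by read_collection_from_csv("nlu_collection.csv").
rows = [
    ["book a flight to Berlin", "book_flight"],  # two columns: text + intent label
    ["hello there"],                             # one column: text only, no intent
]
with open("nlu_collection.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)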
Example #7
def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.get(NUMBER_OF_SUB_TOKENS)
            for t in tokens] == expected_number_of_sub_tokens
Example #8
def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #9
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]],
                                                ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):

    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
Example #10
def test_generate_message_raises_on_overlapping_but_not_identical_spans(
    message_text: Text,
    entities: List[Dict[Text, Any]],
):
    message = Message.build(message_text, "dummy_intent", entities=entities)
    with pytest.raises(ValueError):
        TrainingDataWriter.generate_message(message)
Example #11
    def read_from_json(self, js: Dict[Text, Any], **_: Any) -> "TrainingData":
        """Loads training data stored in the rasa NLU data format."""
        import rasa.shared.nlu.training_data.schemas.data_schema as schema
        import rasa.shared.utils.validation as validation_utils

        validation_utils.validate_training_data(js,
                                                schema.rasa_nlu_data_schema())

        data = js["rasa_nlu_data"]
        common_examples = data.get("common_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        lookup_tables = data.get("lookup_tables", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        training_examples = []
        for ex in common_examples:
            # taking care of custom entries
            msg = Message.build(
                text=ex.pop(TEXT, ""),
                intent=ex.pop(INTENT, None),
                entities=ex.pop(ENTITIES, None),
                **ex,
            )
            training_examples.append(msg)

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables)
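For reference, a minimal sketch of the legacy rasa_nlu_data JSON layout this method consumes; the values are invented and the RasaReader class name in the comment is an assumption, since the excerpt shows only the method:

sample_js = {
    "rasa_nlu_data": {
        "common_examples": [
            {
                "text": "show me flights to Berlin",
                "intent": "book_flight",
                "entities": [
                    {"start": 19, "end": 25, "value": "Berlin", "entity": "city"}
                ],
            }
        ],
        "entity_synonyms": [],
        "regex_features": [],
        "lookup_tables": [],
    }
}
# training_data = RasaReader().read_from_json(sample_js)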
Example #12
def test_preserve_sentence_and_sequence_features_old_config():
    attribute = "text"
    message = Message.build("hi there")

    transformers_nlp = HFTransformersNLP({
        "model_name": "bert",
        "model_weights": "bert-base-uncased"
    })
    transformers_nlp.process(message)
    lm_tokenizer = LanguageModelTokenizer()
    lm_tokenizer.process(message)

    lm_featurizer = LanguageModelFeaturizer({"model_name": "gpt2"})
    lm_featurizer.process(message)

    message.set(LANGUAGE_MODEL_DOCS[attribute], None)
    lm_docs = lm_featurizer._get_docs_for_batch([message],
                                                attribute=attribute,
                                                inference_mode=True)[0]
    hf_docs = transformers_nlp._get_docs_for_batch([message],
                                                   attribute=attribute,
                                                   inference_mode=True)[0]
    assert not (message.features[0].features
                == lm_docs[SEQUENCE_FEATURES]).any()
    assert not (message.features[1].features
                == lm_docs[SENTENCE_FEATURES]).any()
    assert (message.features[0].features == hf_docs[SEQUENCE_FEATURES]).all()
    assert (message.features[1].features == hf_docs[SENTENCE_FEATURES]).all()
Example #13
def test_lm_featurizer_edge_cases(model_name, model_weights, texts,
                                  expected_tokens, expected_indices):

    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    lm_featurizer = LanguageModelFeaturizer(transformers_config)
    whitespace_tokenizer = WhitespaceTokenizer()

    for text, gt_tokens, gt_indices in zip(texts, expected_tokens,
                                           expected_indices):

        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        message.set(TOKENS_NAMES[TEXT], tokens)
        lm_featurizer.process(message)

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
Example #14
def test_lm_tokenizer_edge_cases(
    model_name,
    model_weights,
    texts,
    expected_tokens,
    expected_indices,
    expected_num_token_ids,
):

    if model_weights is None:
        model_weights_config = {}
    else:
        model_weights_config = {"model_weights": model_weights}
    transformers_config = {"model_name": model_name, **model_weights_config}

    transformers_nlp = HFTransformersNLP(transformers_config)
    lm_tokenizer = LanguageModelTokenizer()

    for text, gt_tokens, gt_indices, gt_num_indices in zip(
            texts, expected_tokens, expected_indices, expected_num_token_ids):

        message = Message.build(text=text)
        transformers_nlp.process(message)
        tokens = lm_tokenizer.tokenize(message, TEXT)
        token_ids = message.get(LANGUAGE_MODEL_DOCS[TEXT])[TOKEN_IDS]

        assert [t.text for t in tokens] == gt_tokens
        assert [t.start for t in tokens] == [i[0] for i in gt_indices]
        assert [t.end for t in tokens] == [i[1] for i in gt_indices]
        assert len(token_ids) == gt_num_indices
Example #15
def test_train_tokenizer(text: Text, expected_tokens: List[Text],
                         expected_indices: List[Tuple[int]]):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]
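The parameters for this test are not included in the excerpt; an illustrative (text, expected_tokens, expected_indices) triple consistent with whitespace tokenization would be:

text = "Forecast for lunch"
expected_tokens = ["Forecast", "for", "lunch"]
expected_indices = [(0, 8), (9, 12), (13, 18)]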
Example #16
def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
    tokenizer = WhitespaceTokenizer()

    monkeypatch.setattr(ConveRTFeaturizer, "_get_validated_model_url",
                        lambda x: RESTRICTED_ACCESS_URL)
    component_config = {
        "name": "ConveRTFeaturizer",
        "model_url": RESTRICTED_ACCESS_URL
    }
    featurizer = ConveRTFeaturizer(component_config)
    sentence = "Hey how are you today ?"
    message = Message.build(text=sentence)

    td = TrainingData([message])
    tokenizer.train(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    featurizer.process(message, tf_hub_module=featurizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)
Example #17
def test_sequence_length_overflow_train(
    input_sequence_length: int,
    model_name: Text,
    should_overflow: bool,
    create_language_model_featurizer: Callable[[Dict[Text, Any]],
                                               LanguageModelFeaturizer],
    monkeypatch: MonkeyPatch,
):
    monkeypatch.setattr(
        LanguageModelFeaturizer,
        "_load_model_instance",
        lambda _: None,
    )
    component = create_language_model_featurizer({"model_name": model_name})
    message = Message.build(text=" ".join(["hi"] * input_sequence_length))
    if should_overflow:
        with pytest.raises(RuntimeError):
            component._validate_sequence_lengths([input_sequence_length],
                                                 [message],
                                                 "text",
                                                 inference_mode=False)
    else:
        component._validate_sequence_lengths([input_sequence_length],
                                             [message],
                                             "text",
                                             inference_mode=False)
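The parametrize values for this test are not shown in the excerpt. An illustrative set of cases, assuming BERT-style models with the usual 512-token limit (the exact values used by the real test are unknown here):

# (input_sequence_length, model_name, should_overflow) -- illustrative values only
overflow_cases = [
    (20, "bert", False),   # comfortably below a typical 512-token maximum
    (1000, "bert", True),  # above the maximum, so a RuntimeError is expected
]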
Example #18
    def _parse_intent(self, intent_data: Dict[Text, Any]) -> None:
        import rasa.shared.nlu.training_data.entities_parser as entities_parser
        import rasa.shared.nlu.training_data.synonyms_parser as synonyms_parser

        intent = intent_data.get(KEY_INTENT, "")
        if not intent:
            rasa.shared.utils.io.raise_warning(
                f"Issue found while processing '{self.filename}': "
                f"The intent has an empty name. "
                f"Intents should have a name defined under the {KEY_INTENT} key. "
                f"It will be skipped.",
                docs=DOCS_URL_TRAINING_DATA,
            )
            return

        examples = intent_data.get(KEY_INTENT_EXAMPLES, "")
        intent_metadata = intent_data.get(KEY_METADATA)
        for example, entities, metadata in self._parse_training_examples(
                examples, intent):

            plain_text = entities_parser.replace_entities(example)

            synonyms_parser.add_synonyms_from_entities(plain_text, entities,
                                                       self.entity_synonyms)

            self.training_examples.append(
                Message.build(plain_text, intent, entities, intent_metadata,
                              metadata))
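A hedged sketch of the per-intent dictionary this parser receives once the YAML file is loaded; the literal keys "intent" and "examples" are assumptions for the KEY_INTENT and KEY_INTENT_EXAMPLES constants used above:

intent_data = {
    "intent": "book_flight",
    "examples": "- I want to fly to [Berlin](city)\n- book me a flight\n",
}
# reader._parse_intent(intent_data) would append the parsed Messages
# to reader.training_examples.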
Example #19
def parse_training_example(example: Text,
                           intent: Optional[Text] = None) -> "Message":
    """Extract entities and synonyms, and convert to plain text."""

    entities = find_entities_in_training_example(example)
    plain_text = replace_entities(example)

    return Message.build(plain_text, intent, entities)
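An illustrative call, assuming the inline [value](entity) annotation syntax that find_entities_in_training_example handles; the entity dictionary in the comments is an approximation of the expected result:

msg = parse_training_example("I want to fly to [Berlin](city)", intent="book_flight")
# msg.get("text")   -> "I want to fly to Berlin"
# msg.get("intent") -> "book_flight"
# msg.get("entities") -> roughly:
#   [{"start": 17, "end": 23, "value": "Berlin", "entity": "city"}]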
Example #20
def test_whitespace_does_not_throw_error():
    texts = rasa.shared.utils.io.read_json_file(
        "data/test_tokenizers/naughty_strings.json")

    tk = WhitespaceTokenizer()

    for text in texts:
        tk.tokenize(Message.build(text=text), attribute=TEXT)
Example #21
def test_mitie(text, expected_tokens, expected_indices):
    tk = MitieTokenizer()

    tokens = tk.tokenize(Message.build(text=text), attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #22
def test_split_action_name(text: Text, expected_tokens: List[Text]):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = create_whitespace_tokenizer(component_config)

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    assert [t.text for t in tk._split_name(message, ACTION_NAME)] == expected_tokens
Example #23
def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.data.get("pos") for t in tokens] == expected_pos_tags
Example #24
def test_generate_message(
    message_text: Text,
    expected_text: Text,
    entities: List[Dict[Text, Any]],
):
    message = Message.build(message_text, "dummy_intent", entities=entities)
    message_text = TrainingDataWriter.generate_message(message)

    assert message_text == expected_text
Example #25
    async def replace_placeholders(
        self, example: Message, faker_: Faker, matches: List[Tuple[Any, ...]], count: int
    ) -> AsyncIterator[Message]:
        original_text = await self.rebuild_original_text(example)
        for _ in range(count):
            text = await self.replace_placeholders_in_text(
                example.data.get("text"), faker_, matches
            )
            original_text = await self.replace_placeholders_in_text(
                original_text, faker_, matches
            )
            entities = find_entities_in_training_example(original_text)
            new_message = Message.build(text, example.get("intent"), entities)
            yield new_message
Example #26
def test_preprocess_selector_multiple_retrieval_intents(
    response_selector_training_data: TrainingData,
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
):

    training_data_extra_intent = TrainingData([
        Message.build(text="Is it possible to detect the version?",
                      intent="faq/q1"),
        Message.build(text="How can I get a new virtual env", intent="faq/q2"),
    ])
    training_data = response_selector_training_data.merge(
        training_data_extra_intent)

    response_selector = create_response_selector({})

    response_selector.preprocess_train_data(training_data)

    assert sorted(
        response_selector.all_retrieval_intents) == ["chitchat", "faq"]
Example #27
    def _read_intent(self, intent: Dict[Text, Any],
                     examples: List[Dict[Text, Any]]) -> "TrainingData":
        """Reads the intent and examples from respective jsons."""
        intent = intent.get("name")

        training_examples = []
        for ex in examples:
            text, entities = self._join_text_chunks(ex["data"])
            training_examples.append(Message.build(text, intent, entities))

        return TrainingData(training_examples)
Example #28
def test_split_intent_response_key(text, expected_tokens):
    component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"}

    tk = create_whitespace_tokenizer(component_config)

    message = Message.build(text=text)
    message.set(INTENT_RESPONSE_KEY, text)

    assert [
        t.text for t in tk._split_name(message, attribute=INTENT_RESPONSE_KEY)
    ] == expected_tokens
Example #29
def test_features_are_sparse(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
    message = Message.build("word1 word3")

    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert scipy.sparse.issparse(feature.features)
Example #30
def test_no_features_on_no_tokens(
        semantic_map_featurizer: SemanticMapFeaturizer):
    """The component does not set any sparse features if tokens are not available."""
    message = Message.build("word1 word3")

    # We skip: whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    seq_vecs, sen_vecs = message.get_sparse_features(TEXT, [])
    assert not seq_vecs
    assert not sen_vecs