Example #1
def test_convert_featurizer_number_of_sub_tokens(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    text: Text,
    expected_number_of_sub_tokens: List[int],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):

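    # skip model URL validation so the restricted-access test URL can be used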
    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)

    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [
        t.get(NUMBER_OF_SUB_TOKENS) for t in tokens
    ] == expected_number_of_sub_tokens
Example #2
def create_whitespace_tokenizer(
    config: Optional[Dict[Text, Any]] = None,
) -> WhitespaceTokenizerGraphComponent:
    return WhitespaceTokenizerGraphComponent(
        {
            **WhitespaceTokenizerGraphComponent.get_default_config(),
            **(config if config else {}),
        }
    )
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]],
                                   CRFEntityExtractorGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"])
    training_data = importer.get_nlu_data()

    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})

    whitespace_tokenizer.process([message])
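    # keep an identical copy of the message for the loaded extractor below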
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        CRFEntityExtractorGraphComponent.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    whitespace_tokenizer.process([train_message])
    whitespace_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)
def test_use_shared_vocab_exception(
    initial_train_text: Text,
    additional_train_text: Text,
    use_shared_vocab: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Tests if an exception is raised when `use_shared_vocab` is set to True
    during incremental training."""
    config = {"use_shared_vocab": use_shared_vocab}
    initial_cvf = create_featurizer(config)
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    initial_cvf.train(data)

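    # load the persisted featurizer in finetuning mode to continue training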
    new_cvf = load_featurizer(config, is_finetuning=True)

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)
    if use_shared_vocab:
        with pytest.raises(Exception) as exec_info:
            new_cvf.train(data)
        assert (
            "Using a shared vocabulary in `CountVectorsFeaturizer` is not supported"
            in str(exec_info.value)
        )
    else:
        new_cvf.train(data)
Example #6
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):

    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    actual_text = ConveRTFeaturizerGraphComponent._tokens_to_text([tokens])[0]

    assert expected_text == actual_text
Example #7
def test_convert_featurizer_token_edge_cases(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int, int]],
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):

    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    tokens = featurizer.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #8
def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[
        [Dict[Text, Any]], ConveRTFeaturizerGraphComponent
    ],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):

    monkeypatch.setattr(
        ConveRTFeaturizerGraphComponent, "_validate_model_url", lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)

    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)

    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

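    # reference values for the first five dimensions of the sequence and sentence embeddings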
    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(RESPONSE, [])

    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(tokens) == len(seq_vecs)
    assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])

    assert seq_vecs is None
    assert sent_vecs is None
Example #9
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Check that the component is loaded in finetune mode and with the
    # same patterns as before
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}
    ]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern into an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
def create_whitespace_tokenizer(
    config: Optional[Dict] = None,
) -> WhitespaceTokenizerGraphComponent:
    config = config if config else {}
    return WhitespaceTokenizerGraphComponent(
        {**WhitespaceTokenizerGraphComponent.get_default_config(), **config}
    )
def test_count_vector_featurizer_action_attribute_featurization(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b",})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(ACTION_NAME, action_name)
    train_message.set(ACTION_TEXT, action_text)

    # add a second example that has an action text, so that the vocabulary for
    # ACTION_TEXT exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(ACTION_TEXT, "hi")
    second_message.set(ACTION_NAME, "greet")

    data = TrainingData([train_message, second_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    action_name_seq_vecs, action_name_sen_vecs = train_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        ACTION_TEXT, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if action_name_features:
        assert action_name_seq_vecs.toarray()[0] == action_name_features
        assert action_name_sen_vecs is None
    else:
        assert action_name_seq_vecs is None
        assert action_name_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #12
def test_regex_featurizer_train(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()
    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")

    whitespace_tokenizer.process_training_data(TrainingData([message]))
    training_data = TrainingData([message], regex_features=patterns)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    expected = np.array([0, 1, 0])
    expected_cls = np.array([1, 1, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (6, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vecs, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert seq_vecs is None
    assert sen_vec is None
def test_count_vector_featurizer_response_attribute_featurization(
    sentence: Text,
    intent: Text,
    response: Optional[Text],
    intent_features: List[List[int]],
    response_features: Optional[List[List[int]]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message(data={TEXT: "hello"})
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    if intent_seq_vecs:
        intent_seq_vecs = intent_seq_vecs.features
    if intent_sen_vecs:
        intent_sen_vecs = intent_sen_vecs.features
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )
    if response_seq_vecs:
        response_seq_vecs = response_seq_vecs.features
    if response_sen_vecs:
        response_sen_vecs = response_sen_vecs.features

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
Example #14
def test_apply_bilou_schema(whitespace_tokenizer: WhitespaceTokenizerGraphComponent):

    message_1 = Message.build(
        text="Germany is part of the European Union", intent="inform"
    )
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )

    message_2 = Message.build(text="Berlin is the capital of Germany", intent="inform")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    whitespace_tokenizer.process_training_data(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
    ]
def test_count_vectors_featurizer_train(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    featurizer = create_featurizer()

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    data = TrainingData([message])
    featurizer.train(data)
    featurizer.process_training_data(data)

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
Example #16
def test_check_correct_entity_annotations(
    text: Text, warnings: int, whitespace_tokenizer: WhitespaceTokenizerGraphComponent
):
    reader = RasaYAMLReader()

    training_data = reader.reads(text)
    whitespace_tokenizer.process_training_data(training_data)

    with pytest.warns(UserWarning) as record:
        EntityExtractorMixin.check_correct_entity_annotations(training_data)

    assert len(record) == warnings
    assert all(
        excerpt in record[0].message.args[0]
        for excerpt in ["Misaligned entity annotation in sentence"]
    )
Example #17
def test_model_data_signature_with_entities(
    messages: List[Message],
    entity_expected: bool,
    create_diet: Callable[..., DIETClassifierGraphComponent],
):
    classifier = create_diet({"BILOU_flag": False})
    training_data = TrainingData(messages)

    # create tokens for entity parsing inside DIET
    tokenizer = WhitespaceTokenizerGraphComponent(
        WhitespaceTokenizerGraphComponent.get_default_config()
    )
    tokenizer.process_training_data(training_data)

    model_data = classifier.preprocess_train_data(training_data)
    entity_exists = "entities" in model_data.get_signature().keys()
    assert entity_exists == entity_expected
def test_count_vector_featurizer_shared_vocab(
    sentence: Text,
    intent: Text,
    response: Text,
    text_features: List[List[int]],
    intent_features: List[List[int]],
    response_features: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"use_shared_vocab": True,})

    train_message = Message(data={TEXT: sentence})
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == text_features)
    assert sen_vec is not None
    seq_vec, sen_vec = train_message.get_sparse_features(INTENT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == intent_features)
    assert sen_vec is None
    seq_vec, sen_vec = train_message.get_sparse_features(RESPONSE, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == response_features)
    assert sen_vec is not None
Example #19
def process_training_text(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
) -> List[Message]:
    """ Creates a featurizer and process training data """
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = [Message.build(text=text) for text in texts]
    td = TrainingData(messages)

    whitespace_tokenizer.process_training_data(td)
    lm_featurizer.process_training_data(td)
    return messages
Example #20
def process_messages(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
) -> List[Message]:
    """ Creates a featurizer and processes messages """
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process([message])
        messages.append(message)
    lm_featurizer.process(messages)
    return messages
def test_count_vector_featurizer_oov_token(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    ftr = create_featurizer({"OOV_token": "__oov__"})
    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features
    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_cvf_incremental_training(
    initial_train_text: Text,
    additional_train_text: Text,
    initial_vocabulary_size: int,
    final_vocabulary_size: int,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    initial_cvf = create_featurizer()
    train_message = Message(data={"text": initial_train_text})
    data = TrainingData([train_message])

    whitespace_tokenizer.process_training_data(data)
    initial_cvf.train(data)

    # Check initial vocabulary size
    initial_vocab = initial_cvf.vectorizers["text"].vocabulary_
    assert len(initial_vocab) == initial_vocabulary_size

    # persist and load initial cvf
    new_cvf = load_featurizer(is_finetuning=True)

    # Check vocabulary size again
    assert len(new_cvf.vectorizers["text"].vocabulary_) == initial_vocabulary_size

    additional_train_message = Message(data={"text": additional_train_text})
    data = TrainingData([train_message, additional_train_message])
    whitespace_tokenizer.process_training_data(data)
    new_cvf.train(data)

    new_vocab = new_cvf.vectorizers["text"].vocabulary_

    # Check vocabulary size after finetuning
    assert len(new_vocab) == final_vocabulary_size

    # Check indices of initial vocabulary haven't changed in the new vocabulary
    for vocab_token, vocab_index in initial_vocab.items():
        assert vocab_token in new_vocab
        assert new_vocab.get(vocab_token) == vocab_index
Example #23
def check_subtokens(
    texts: List[Text],
    messages: List[Message],
    expected_number_of_sub_tokens: List[List[float]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Checks that we get the correct number of sub-tokens."""
    for index, message in enumerate(messages):
        assert [
            t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
        ] == expected_number_of_sub_tokens[index]
        assert len(message.get(TOKENS_NAMES[TEXT])) == len(
            whitespace_tokenizer.tokenize(Message.build(text=texts[index]), TEXT)
        )
Example #24
def test_log_longer_sequence(
    sequence_length: int,
    model_name: Text,
    model_weights: Text,
    should_overflow: bool,
    caplog: LogCaptureFixture,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizerGraphComponent
    ],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    config = {"model_name": model_name, "model_weights": model_weights}

    featurizer = create_language_model_featurizer(config)

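    # build an input that exceeds the model's maximum sequence length when should_overflow is True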
    text = " ".join(["hi"] * sequence_length)
    message = Message.build(text=text)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)
    caplog.set_level(logging.DEBUG)
    featurizer.process([message])
    if should_overflow:
        assert "hi hi hi" in caplog.text
    assert len(message.features) >= 2
Example #25
async def test_interpreter_parses_text_tokens(
    response_selector_interpreter: Interpreter,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    text = "Hello there"
    tokens = whitespace_tokenizer.tokenize(Message(data={"text": text}),
                                           "text")
    indices = [(t.start, t.end) for t in tokens]

    parsed_data = response_selector_interpreter.parse(text)
    assert "text_tokens" in parsed_data.keys()

    parsed_tokens = parsed_data.get("text_tokens")

    assert parsed_tokens == indices
Example #26
def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    extractor = EntityExtractorMixin()

    message = Message(data={TEXT: text})
    tokens = whitespace_tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences
    )
    assert actual_entities == expected_entities
def test_count_vector_featurizer_persist_load(
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    # set non-default values in the config
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = create_featurizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"

    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([train_message1])
    whitespace_tokenizer.process([train_message2])

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)
    train_ftr.process_training_data(data)

    # persist featurizer
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # add trained vocabulary to vectorizer params
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    test_ftr = load_featurizer(config)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # check that the vocabulary was loaded correctly
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    whitespace_tokenizer.process([test_message1])
    test_ftr.process([test_message1])
    test_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([test_message2])
    test_ftr.process([test_message2])

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features
    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features
    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features
    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # check that train features and test features after loading are the same
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
def test_whitespace_language_support(language, is_not_supported):
    assert (
        language in WhitespaceTokenizerGraphComponent.not_supported_languages()
    ) == is_not_supported
Example #29
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)

    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # Test featurization of message
    expected = np.array([1, 0])
    expected_cls = np.array([1, 1])
    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    loaded_featurizer = RegexFeaturizerGraphComponent.load(
        RegexFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]
    new_sentence = "hey today"
    message = Message(data={TEXT: new_sentence})
    message.set(RESPONSE, new_sentence)
    message.set(INTENT, "intent")
    new_training_data = TrainingData([message], regex_features=patterns + new_patterns)

    whitespace_tokenizer.process_training_data(new_training_data)

    loaded_featurizer.train(new_training_data)
    loaded_featurizer.process_training_data(new_training_data)

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([1, 0, 0])
    expected_token_2 = np.array([0, 0, 1])
    expected_cls = np.array([1, 0, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[1] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # let's check if the order of patterns is preserved
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == loaded_featurizer.known_patterns[old_index]["name"]

    # we also modified a pattern; check that it was updated correctly
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]
Example #30
def whitespace_tokenizer() -> WhitespaceTokenizerGraphComponent:
    return WhitespaceTokenizerGraphComponent(
        WhitespaceTokenizerGraphComponent.get_default_config()
    )