Пример #1
0
 def inner(
     config: Dict[Text, Any] = None,
     known_patterns: Optional[List[Dict[Text, Any]]] = None,
 ) -> RegexFeaturizer:
     config = config or {}
     return RegexFeaturizer(
         {**RegexFeaturizer.get_default_config(), **config},
         default_model_storage,
         resource,
         default_execution_context,
         known_patterns,
     )
Пример #2
0
def test_persist_load_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource: Resource,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey how are you today 19.12.2019 ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)
    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    # Test component loaded in finetune mode and also with
    # same patterns as before and vocabulary statistics
    assert loaded_featurizer.known_patterns == featurizer.known_patterns
    assert loaded_featurizer.finetune_mode

    new_lookups = [{"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}]

    training_data = TrainingData()
    training_data.lookup_tables = new_lookups
    loaded_featurizer.train(training_data)

    # Test merging of a new pattern to an already trained component.
    assert len(loaded_featurizer.known_patterns) == 4
Пример #3
0
def test_vocabulary_expand_for_finetuning(
    create_featurizer: Callable[..., RegexFeaturizer],
    default_model_storage: ModelStorage,
    resource: Resource,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
    ]

    featurizer = create_featurizer()

    sentence = "hey hey 2020"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    training_data = TrainingData([message], regex_features=patterns)

    whitespace_tokenizer.process_training_data(training_data)

    featurizer.train(training_data)
    featurizer.process_training_data(training_data)

    # Test featurization of message
    expected = np.array([1, 0])
    expected_cls = np.array([1, 1])
    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (3, 2) == seq_vecs.shape
    assert (1, 2) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    loaded_featurizer = RegexFeaturizer.load(
        RegexFeaturizer.get_default_config(),
        default_model_storage,
        resource,
        dataclasses.replace(default_execution_context, is_finetuning=True),
    )

    new_patterns = [
        {"pattern": "\\btoday*", "name": "day", "usage": "intent"},
        {"pattern": "\\bhey+", "name": "hello", "usage": "intent"},
    ]
    new_sentence = "hey today"
    message = Message(data={TEXT: new_sentence})
    message.set(RESPONSE, new_sentence)
    message.set(INTENT, "intent")
    new_training_data = TrainingData([message], regex_features=patterns + new_patterns)

    whitespace_tokenizer.process_training_data(new_training_data)

    loaded_featurizer.train(new_training_data)
    loaded_featurizer.process_training_data(new_training_data)

    # Test featurization of message, this time for the extra pattern as well.
    expected_token_1 = np.array([1, 0, 0])
    expected_token_2 = np.array([0, 0, 1])
    expected_cls = np.array([1, 0, 1])

    seq_vecs, sen_vec = message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert (2, 3) == seq_vecs.shape
    assert (1, 3) == sen_vec.shape
    assert np.all(seq_vecs.toarray()[0] == expected_token_1)
    assert np.all(seq_vecs.toarray()[1] == expected_token_2)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    # let's check if the order of patterns is preserved
    for old_index, pattern in enumerate(featurizer.known_patterns):
        assert pattern["name"] == loaded_featurizer.known_patterns[old_index]["name"]

    # we also modified a pattern, check if that is correctly modified
    pattern_to_check = [
        pattern
        for pattern in loaded_featurizer.known_patterns
        if pattern["name"] == "hello"
    ]
    assert pattern_to_check == [new_patterns[1]]