Example #1
def test_regex_featurizer_case_sensitive(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    case_sensitive: bool,
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(
        {"case_sensitive": case_sensitive}, known_patterns=patterns,
    )

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(
        sequence_features.toarray()[0], expected_sequence_features, atol=1e-10
    )
    assert np.allclose(
        sentence_features.toarray()[-1], expected_sentence_features, atol=1e-10
    )
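In the Rasa test suite the sentence, the expected feature vectors, and the case_sensitive flag are supplied by a @pytest.mark.parametrize decorator that this snippet omits. As a hedged, standalone sketch of the behaviour being tested (plain re rather than the featurizer's internals, with made-up parametrize rows):

import re

import pytest


# Illustrative only: reproduces the effect of the case_sensitive flag for the
# "\bhey*" pattern used above.  The rows are assumptions, not the values from
# the original test module.
@pytest.mark.parametrize(
    "case_sensitive, expected_match", [(True, False), (False, True)]
)
def test_hey_pattern_case_sensitivity(case_sensitive: bool, expected_match: bool):
    # With case_sensitive=False the pattern is applied case-insensitively,
    # so "\bhey*" also matches the capitalised "Hey".
    flags = 0 if case_sensitive else re.IGNORECASE
    match = re.search(r"\bhey*", "Hey how are you today", flags=flags)
    assert (match is not None) == expected_match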
Example #2
def test_lookup_with_and_without_boundaries(
    sentence: Text,
    expected_sequence_features: List[List[float]],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    use_word_boundaries: bool,
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    ftr = create_featurizer({"use_word_boundaries": use_word_boundaries})
    training_data = TrainingData()

    # we use lookups because the "use_word_boundaries" flag is only used when
    # producing patterns from lookup tables
    lookups = [{"name": "how", "elements": ["how"]}]
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)

    sequence_features = sequence_features.toarray()
    sentence_features = sentence_features.toarray()
    num_of_patterns = sum([len(lookup["elements"]) for lookup in lookups])
    assert sequence_features.shape == (
        len(message.get(TOKENS_NAMES[TEXT])),
        num_of_patterns,
    )
    num_of_lookup_tables = len(lookups)
    assert sentence_features.shape == (num_of_lookup_tables, num_of_patterns)

    # sequence_features should be {0,1} for each token: 1 if match, 0 if not
    assert np.allclose(sequence_features,
                       expected_sequence_features,
                       atol=1e-10)
    # sentence_features should be {0,1} for each lookup table: 1 if sentence
    # contains match from that table, 0 if not
    assert np.allclose(sentence_features,
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        # labeled_tokens should list the token(s) which match a pattern
        assert num_matches == labeled_tokens.count(i)
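The use_word_boundaries flag exercised above maps onto ordinary regex word boundaries: with the flag enabled, the lookup element "how" is wrapped in \b markers and no longer matches inside a longer word such as "however". A minimal standalone illustration using plain re (not the featurizer's own code):

import re

# With boundaries, "how" only matches as a whole word.
with_boundaries = re.compile(r"\bhow\b")
without_boundaries = re.compile(r"how")

assert with_boundaries.search("however you like") is None
assert without_boundaries.search("however you like") is not None
assert with_boundaries.search("how are you") is not None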
Example #3
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]],
                                   CRFEntityExtractorGraphComponent],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
    spacy_nlp: Language,
):

    crf_extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()

    spacy_model = SpacyModel(model=spacy_nlp, model_name="en_core_web_md")
    training_data = SpacyPreprocessor({}).process_training_data(
        training_data, spacy_model)
    training_data = spacy_tokenizer.process_training_data(training_data)

    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    messages = SpacyPreprocessor({}).process([message], spacy_model)
    message = spacy_tokenizer.process(messages)[0]

    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractorGraphComponent.load(
        {
            **CRFEntityExtractorGraphComponent.get_default_config(),
            **config_params
        },
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()

    detected_entities = processed_message2.get(ENTITIES)

    assert len(detected_entities) == 1
    assert detected_entities[0]["entity"] == "cuisine"
    assert detected_entities[0]["value"] == "italian"
Example #4
def test_regex_featurizer(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[int],
    spacy_nlp: Any,
    create_featurizer: Callable[..., RegexFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = create_featurizer(known_patterns=patterns)

    # adds tokens to the message
    message = Message(data={TEXT: sentence, RESPONSE: sentence})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(),
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray(),
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #5
def test_lookup_tables(
    sentence: Text,
    expected_sequence_features: List[float],
    expected_sentence_features: List[float],
    labeled_tokens: List[float],
    spacy_nlp: Any,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": [
                "mojito", "lemonade", "sweet berry wine", "tea", "club?mate"
            ],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # adds tokens to the message
    message = Message(data={TEXT: sentence})
    message.set("text_spacy_doc", spacy_nlp(sentence))
    spacy_tokenizer.process([message])

    sequence_features, sentence_features = ftr._features_for_patterns(
        message, TEXT)
    assert np.allclose(sequence_features.toarray(),
                       expected_sequence_features,
                       atol=1e-10)
    assert np.allclose(sentence_features.toarray(),
                       expected_sentence_features,
                       atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #6
def test_crf_use_dense_features(
    crf_entity_extractor: Callable[[Dict[Text, Any]],
                                   CRFEntityExtractorGraphComponent],
    spacy_nlp: Any,
    spacy_featurizer: SpacyFeaturizerGraphComponent,
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    component_config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    crf_extractor = crf_entity_extractor(component_config)

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process([message])
    spacy_featurizer.process([message])

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    for i in range(len(dense_features[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_features[0][i]
        )
Example #7
def test_count_vector_featurizer_use_lemma(
    spacy_nlp: Any,
    sentence: Text,
    sequence_features: List[List[int]],
    sentence_features: List[List[int]],
    use_lemma: bool,
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    load_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
    spacy_tokenizer: SpacyTokenizerGraphComponent,
):
    config = {"use_lemma": use_lemma, "OOV_words": ["drinks"], "OOV_token": "OOV"}
    ftr = create_featurizer(config)

    train_message = Message(data={TEXT: sentence})
    train_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    test_message = Message(data={TEXT: sentence})
    test_message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))

    spacy_tokenizer.process([train_message])
    spacy_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]), spacy_nlp=SpacyModel(spacy_nlp, "en"))

    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vecs.features, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs.features, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.features.toarray()
    actual_sen_vecs = sen_vecs.features.toarray()

    assert np.all(actual_seq_vecs[0] == sequence_features)
    assert np.all(actual_sen_vecs[-1] == sentence_features)

    loaded = load_featurizer(config)
    assert loaded.OOV_words == ftr.OOV_words
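The use_lemma flag decides whether the count vectorizer counts spaCy lemmas or the raw token text. The difference can be seen directly with spaCy; a sketch assuming an English pipeline such as en_core_web_md is installed (the model name is an assumption):

import spacy

# Assumes an English spaCy pipeline is available locally.
nlp = spacy.load("en_core_web_md")
doc = nlp("I am drinking tea")

print([token.text for token in doc])    # surface forms: ['I', 'am', 'drinking', 'tea']
print([token.lemma_ for token in doc])  # lemmas, e.g. 'am' -> 'be', 'drinking' -> 'drink'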
Example #8
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    tokenizer = SpacyTokenizerGraphComponent.create(
        SpacyTokenizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizerGraphComponent.create(
        CountVectorsFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizerGraphComponent.create(
        SpacyFeaturizerGraphComponent.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]
    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message],
        attributes=attributes,
        entity_tag_specs=entity_tag_spec,
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]
    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have sparse sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # for entities we get one feature array per entity tag spec
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)
    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
Example #9
def spacy_tokenizer() -> SpacyTokenizerGraphComponent:
    return SpacyTokenizerGraphComponent(
        SpacyTokenizerGraphComponent.get_default_config()
    )
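The spacy_nlp fixture referenced throughout these examples is not shown on this page. A minimal sketch of how it might be provided; the session scope and model choice are assumptions:

import pytest
import spacy
from spacy.language import Language


@pytest.fixture(scope="session")
def spacy_nlp() -> Language:
    # Load one English pipeline per test session; use whichever model the
    # test environment actually provides.
    return spacy.load("en_core_web_md")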