def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    """Train a ``RegexEntityExtractor`` and check what it extracts from ``text``.

    The extractor is trained on two fixed annotated examples plus the
    parametrized ``lookup`` tables, then run on a message built from ``text``.
    The entities stored on that message must equal ``expected_entities``.
    """
    person_example = Message(
        data={
            TEXT: "Hi Max!",
            "entities": [{"entity": "person", "value": "Max"}],
        }
    )
    city_example = Message(
        data={
            TEXT: "I live in Berlin",
            "entities": [{"entity": "city", "value": "Berlin"}],
        }
    )

    corpus = TrainingData()
    corpus.lookup_tables = lookup
    corpus.training_examples = [person_example, city_example]

    extractor = RegexEntityExtractor()
    extractor.train(corpus)

    parsed = Message(data={TEXT: text})
    extractor.process(parsed)

    assert parsed.get(ENTITIES) == expected_entities
def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[...,
                                       RegexEntityExtractorGraphComponent], ):
    """Check that processing keeps entities already present on a message.

    A message carrying an entity attributed to "other extractor" is processed
    by a freshly trained extractor. Afterwards the message must hold the
    pre-existing entity unchanged, followed by the "Berlin" city entity
    attributed to ``RegexEntityExtractorGraphComponent``.
    """
    existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    extracted_city = {
        ENTITY_ATTRIBUTE_TYPE: "city",
        ENTITY_ATTRIBUTE_VALUE: "Berlin",
        ENTITY_ATTRIBUTE_START: 13,
        ENTITY_ATTRIBUTE_END: 19,
        EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
    }

    person_example = Message(
        data={
            TEXT: "Hi Max!",
            INTENT: "greet",
            ENTITIES: [
                {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
            ],
        }
    )
    city_example = Message(
        data={
            TEXT: "I live in Berlin",
            INTENT: "inform",
            ENTITIES: [
                {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
            ],
        }
    )

    corpus = TrainingData()
    corpus.training_examples = [person_example, city_example]
    corpus.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    # NOTE(review): "infrom" looks like a typo for "inform"; the assertion
    # below only inspects entities, so the label is left as it was.
    parsed = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    # Store a deep copy so the expected dict cannot be mutated via the message.
    parsed.set(ENTITIES, [copy.deepcopy(existing_entity)])

    extractor = create_or_load_extractor(config={})
    extractor.train(corpus)
    extractor.process([parsed])

    assert parsed.get(ENTITIES) == [existing_entity, extracted_city]
示例#3
0
def test_train_tokenizer(text: Text, expected_tokens: List[Text],
                         expected_indices: List[Tuple[int, int]]):
    """Check the tokens ``WhitespaceTokenizer.train`` attaches to an example.

    ``text`` is used as the message text, response and intent. After training,
    the response and text tokens must match ``expected_tokens`` and the
    ``(start, end)`` pairs in ``expected_indices``; the intent must be kept as
    a single token equal to ``text``.

    Args:
        text: Input used for the text, response and intent attributes.
        expected_tokens: Expected token texts for the text/response attributes.
        expected_indices: Expected ``(start, end)`` offsets, one pair per token.
    """
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    trained_example = training_data.training_examples[0]
    # Each index entry is read at positions 0 (start) and 1 (end).
    expected_starts = [span[0] for span in expected_indices]
    expected_ends = [span[1] for span in expected_indices]

    for attribute in [RESPONSE, TEXT]:
        tokens = trained_example.get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == expected_starts
        assert [t.end for t in tokens] == expected_ends

    # check intent attribute
    tokens = trained_example.get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]
示例#4
0
def test_lowercase(
    text: Text,
    case_sensitive: bool,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    """Check extraction with the ``case_sensitive`` option set as parametrized.

    A ``RegexEntityExtractor`` configured with ``{"case_sensitive": ...}`` is
    trained on two fixed examples plus ``lookup`` and run on ``text``; the
    resulting entities must equal ``expected_entities``.
    """
    person_example = Message(
        data={
            TEXT: "Hi Max!",
            INTENT: "greet",
            ENTITIES: [{"entity": "person", "value": "Max"}],
        }
    )
    city_example = Message(
        data={
            TEXT: "I live in Berlin",
            INTENT: "inform",
            ENTITIES: [{"entity": "city", "value": "Berlin"}],
        }
    )

    corpus = TrainingData()
    corpus.lookup_tables = lookup
    corpus.training_examples = [person_example, city_example]

    extractor_config = {"case_sensitive": case_sensitive}
    extractor = RegexEntityExtractor(extractor_config)
    extractor.train(corpus)

    parsed = Message(data={TEXT: text})
    extractor.process(parsed)

    assert parsed.get(ENTITIES) == expected_entities
def test_do_not_overwrite_any_entities():
    """Check that ``RegexEntityExtractor`` keeps entities already on a message.

    The processed message starts with a "person" entity. After processing it
    must still contain that entity, followed by the "Berlin" city entity
    tagged with the ``"RegexEntityExtractor"`` extractor name.
    """
    existing_entity = {"entity": "person", "value": "Max", "start": 0, "end": 3}
    extracted_city = {
        "entity": "city",
        "value": "Berlin",
        "start": 13,
        "end": 19,
        "extractor": "RegexEntityExtractor",
    }

    # NOTE(review): "infrom" looks like a typo for "inform"; the assertion
    # below only inspects entities, so the label is left as it was.
    parsed = Message(data={TEXT: "Max lives in Berlin.", INTENT: "infrom"})
    # Hand over a separate dict so the expected value stays independent of
    # whatever happens to the object stored on the message.
    parsed.set(ENTITIES, [dict(existing_entity)])

    person_example = Message(
        data={
            TEXT: "Hi Max!",
            INTENT: "greet",
            ENTITIES: [{"entity": "person", "value": "Max"}],
        }
    )
    city_example = Message(
        data={
            TEXT: "I live in Berlin",
            INTENT: "inform",
            ENTITIES: [{"entity": "city", "value": "Berlin"}],
        }
    )

    corpus = TrainingData()
    corpus.training_examples = [person_example, city_example]
    corpus.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    extractor = RegexEntityExtractor()
    extractor.train(corpus)
    extractor.process(parsed)

    assert parsed.get(ENTITIES) == [existing_entity, extracted_city]
示例#6
0
def test_non_word_boundaries(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    non_word_boundary: List[Text],
    expected_entities: List[Dict[Text, Any]],
):
    """Check ``FlashTextEntityExtractor`` with custom ``non_word_boundaries``.

    The extractor is trained on four fixed "inform" examples plus ``lookup``,
    run on ``text``, and the *values* of the extracted entities are compared
    with ``expected_entities``.

    NOTE(review): ``lookup`` is wrapped in a list before being assigned, and
    the comparison is made against entity values only, so the ``lookup`` and
    ``expected_entities`` annotations may not match the parametrized data --
    confirm against the parametrization.
    """
    # (sentence, entity type, entity value) for each training example.
    annotated_sentences = [
        ("I love New York", "city", "New York"),
        ("I live in Berlin", "city", "Berlin"),
        ("I like apples", "fruit", "apples"),
        ("oranges are my fave", "fruit", "oranges"),
    ]

    corpus = TrainingData()
    corpus.lookup_tables = [lookup]
    corpus.training_examples = [
        Message(
            data={
                TEXT: sentence,
                INTENT: "inform",
                ENTITIES: [{"entity": entity_type, "value": entity_value}],
            }
        )
        for sentence, entity_type, entity_value in annotated_sentences
    ]

    extractor_config = {"non_word_boundaries": non_word_boundary}
    extractor = FlashTextEntityExtractor(extractor_config)
    extractor.train(corpus)

    parsed = Message(data={TEXT: text})
    extractor.process(parsed)

    extracted_values = [found["value"] for found in parsed.get(ENTITIES)]
    assert extracted_values == expected_entities
示例#7
0
def test_train_tokenizer_action_name(text: Text, expected_tokens: List[Text],
                                     expected_indices: List[Tuple[int]]):
    """Check that training leaves the action name as one single token.

    ``expected_tokens`` and ``expected_indices`` are accepted but not used by
    this test; only the action-name tokens are inspected.
    """
    example = Message.build(text=text)
    example.set(ACTION_NAME, text)

    corpus = TrainingData()
    corpus.training_examples = [example]

    tokenizer = WhitespaceTokenizer()
    tokenizer.train(corpus)

    # The action name must come back as exactly one token equal to the input.
    trained_example = corpus.training_examples[0]
    action_name_tokens = trained_example.get(TOKENS_NAMES[ACTION_NAME])
    token_texts = [token.text for token in action_name_tokens]

    assert token_texts == [text]
def test_train_and_process(
    create_or_load_extractor: Callable[...,
                                       RegexEntityExtractorGraphComponent],
    config: Dict[Text, Any],
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
    test_loading: bool,
):
    """Train an extractor, process ``text`` and optionally verify reloading.

    Args:
        create_or_load_extractor: Factory that builds the extractor from a
            config; called with ``load=True`` to obtain a reloaded instance.
        config: Extractor configuration passed to the factory.
        text: Text of the message to process.
        lookup: Lookup tables assigned to the training data.
        expected_entities: Entities expected on the processed message.
        test_loading: When true, additionally process an untouched copy of
            the message with a reloaded extractor and compare the results.
    """
    message = Message(data={TEXT: text})
    if test_loading:
        # Copy before processing so the reloaded extractor sees a clean input.
        message_copy = copy.deepcopy(message)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{
                    "entity": "person",
                    "value": "Max"
                }],
            }),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{
                    "entity": "city",
                    "value": "Berlin"
                }],
            }),
    ]

    entity_extractor = create_or_load_extractor(config)
    entity_extractor.train(training_data)
    entity_extractor.process([message])
    entities = message.get(ENTITIES)
    assert entities == expected_entities

    if test_loading:
        loaded_entity_extractor = create_or_load_extractor(config, load=True)
        loaded_entity_extractor.process([message_copy])
        # The reloaded extractor must reproduce the trained one's output ...
        assert message_copy.get(ENTITIES) == expected_entities
        # ... and carry the same patterns. This comparison used to be a bare
        # expression whose result was discarded, so it never checked anything.
        assert loaded_entity_extractor.patterns == entity_extractor.patterns
示例#9
0
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    """Check the tokens ``SpacyTokenizer.train`` attaches to an example.

    The message gets ``text`` as both text and response, each with a spaCy doc
    produced by ``spacy_nlp``. After training, the tokens of both attributes
    must match ``expected_tokens`` and the start/end offsets read from
    positions 0 and 1 of each entry in ``expected_indices``.
    """
    example = Message.build(text=text)
    example.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    example.set(RESPONSE, text)
    example.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    corpus = TrainingData()
    corpus.training_examples = [example]

    tokenizer = SpacyTokenizer()
    tokenizer.train(corpus)

    for attribute in (RESPONSE, TEXT):
        found = corpus.training_examples[0].get(TOKENS_NAMES[attribute])

        texts = [token.text for token in found]
        starts = [token.start for token in found]
        ends = [token.end for token in found]

        assert texts == expected_tokens
        assert starts == [span[0] for span in expected_indices]
        assert ends == [span[1] for span in expected_indices]
示例#10
0
def test_train_tokenizer_e2e_actions(text: Text, expected_tokens: List[Text],
                                     expected_indices: List[Tuple[int, int]]):
    """Check tokens ``WhitespaceTokenizer.train`` produces for action text.

    ``text`` is used as the message text, action text and action name. After
    training, the action-text and text tokens must match ``expected_tokens``
    and the ``(start, end)`` pairs in ``expected_indices``.

    Args:
        text: Input used for the text, action text and action name attributes.
        expected_tokens: Expected token texts.
        expected_indices: Expected ``(start, end)`` offsets, one pair per token.
    """
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_TEXT, text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    trained_example = training_data.training_examples[0]
    # Each index entry is read at positions 0 (start) and 1 (end).
    expected_starts = [span[0] for span in expected_indices]
    expected_ends = [span[1] for span in expected_indices]

    for attribute in [ACTION_TEXT, TEXT]:
        tokens = trained_example.get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == expected_starts
        assert [t.end for t in tokens] == expected_ends
示例#11
0
def test_extract_patterns_use_only_entities_lookup_tables(
        entity: Text, lookup_tables: Dict[Text, Text],
        expected_patterns: Dict[Text, Text]):
    """Check ``extract_patterns(..., use_only_entities=True)`` with lookup tables.

    When ``entity`` is truthy, one training example annotated with that entity
    is added; when ``lookup_tables`` is truthy, it is registered as the only
    lookup table. The extracted patterns must equal ``expected_patterns``.
    """
    corpus = TrainingData()

    if entity:
        annotation = {"entity": entity, "value": "text"}
        annotated_example = Message(
            data={"text": "text", "entities": [annotation]}
        )
        corpus.training_examples = [annotated_example]

    if lookup_tables:
        corpus.lookup_tables = [lookup_tables]

    extracted = pattern_utils.extract_patterns(corpus, use_only_entities=True)

    assert extracted == expected_patterns
示例#12
0
def test_extract_patterns_use_only_entities_regexes(
        entity: Text, regex_features: Dict[Text, Text],
        expected_patterns: Dict[Text, Text]):
    """Check ``extract_patterns(..., use_only_entities=True)`` with regex features.

    When ``entity`` is truthy, one "greet" training example annotated with that
    entity is added; when ``regex_features`` is truthy, it is registered as the
    only regex feature. The extracted patterns must equal ``expected_patterns``.
    """
    corpus = TrainingData()

    if entity:
        annotation = {"entity": entity, "value": "text"}
        annotated_example = Message(
            data={"text": "text", "intent": "greet", "entities": [annotation]}
        )
        corpus.training_examples = [annotated_example]

    if regex_features:
        corpus.regex_features = [regex_features]

    extracted = pattern_utils.extract_patterns(corpus, use_only_entities=True)

    assert extracted == expected_patterns