def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                "entities": [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                "entities": [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities

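# Illustrative cases for test_process above; the values are assumptions, not
# taken from the original suite. They could be supplied via
# @pytest.mark.parametrize("text, lookup, expected_entities", PROCESS_CASES).
# The "city" lookup entry should be tagged because "city" appears as an entity
# in the training examples; offsets cover "Berlin" in the message text.
PROCESS_CASES = [
    (
        "I live in Berlin",
        [{"name": "city", "elements": ["London", "Berlin"]}],
        [
            {
                "entity": "city",
                "value": "Berlin",
                "start": 10,
                "end": 16,
                "extractor": "RegexEntityExtractor",
            }
        ],
    ),
    # No lookup element occurs in the text, so no entities are extracted.
    ("Hello there", [{"name": "city", "elements": ["London", "Berlin"]}], []),
]
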
def test_process_does_not_overwrite_any_entities(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
):
    pre_existing_entity = {
        ENTITY_ATTRIBUTE_TYPE: "person",
        ENTITY_ATTRIBUTE_VALUE: "Max",
        ENTITY_ATTRIBUTE_START: 0,
        ENTITY_ATTRIBUTE_END: 3,
        EXTRACTOR: "other extractor",
    }
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(ENTITIES, [copy.deepcopy(pre_existing_entity)])

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "person", ENTITY_ATTRIBUTE_VALUE: "Max"}
                ],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [
                    {ENTITY_ATTRIBUTE_TYPE: "city", ENTITY_ATTRIBUTE_VALUE: "Berlin"}
                ],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = create_or_load_extractor(config={})
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == [
        pre_existing_entity,
        {
            ENTITY_ATTRIBUTE_TYPE: "city",
            ENTITY_ATTRIBUTE_VALUE: "Berlin",
            ENTITY_ATTRIBUTE_START: 13,
            ENTITY_ATTRIBUTE_END: 19,
            EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
        },
    ]

def test_train_tokenizer(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int, int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(RESPONSE, text)
    message.set(INTENT, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute: the intent is not split, so it stays one token
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT])

    assert [t.text for t in tokens] == [text]

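# Illustrative cases for test_train_tokenizer above; the values are assumptions
# chosen so that a pure whitespace split reproduces them. They could be supplied
# via @pytest.mark.parametrize("text, expected_tokens, expected_indices", ...).
WHITESPACE_TOKENIZER_CASES = [
    (
        "Forecast for lunch",
        ["Forecast", "for", "lunch"],
        [(0, 8), (9, 12), (13, 18)],
    ),
    ("hello", ["hello"], [(0, 5)]),
]
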
def test_lowercase(
    text: Text,
    case_sensitive: bool,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = RegexEntityExtractor({"case_sensitive": case_sensitive})
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities

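# Illustrative cases for test_lowercase above; the values are assumptions, not
# data from the original suite. With case_sensitive=False the lowercased message
# still matches the capitalized lookup entry (the extracted value is the matched
# span of the message); with case_sensitive=True it does not.
LOWERCASE_CASES = [
    (
        "i live in berlin",
        False,
        [{"name": "city", "elements": ["Berlin"]}],
        [
            {
                "entity": "city",
                "value": "berlin",
                "start": 10,
                "end": 16,
                "extractor": "RegexEntityExtractor",
            }
        ],
    ),
    ("i live in berlin", True, [{"name": "city", "elements": ["Berlin"]}], []),
]
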
def test_do_not_overwrite_any_entities():
    message = Message(data={TEXT: "Max lives in Berlin.", INTENT: "inform"})
    message.set(
        ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}]
    )

    training_data = TrainingData()
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]

def test_non_word_boundaries(
    text: Text,
    lookup: Dict[Text, List[Text]],
    non_word_boundary: List[Text],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(data={TEXT: text})

    training_data = TrainingData()
    training_data.lookup_tables = [lookup]
    training_data.training_examples = [
        Message(
            data={
                TEXT: "I love New York",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "New York"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
        Message(
            data={
                TEXT: "I like apples",
                INTENT: "inform",
                ENTITIES: [{"entity": "fruit", "value": "apples"}],
            }
        ),
        Message(
            data={
                TEXT: "oranges are my fave",
                INTENT: "inform",
                ENTITIES: [{"entity": "fruit", "value": "oranges"}],
            }
        ),
    ]

    entity_extractor = FlashTextEntityExtractor(
        {"non_word_boundaries": non_word_boundary}
    )
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = [e["value"] for e in message.get(ENTITIES)]
    assert entities == expected_entities

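# Illustrative cases for test_non_word_boundaries above; the values are
# assumptions about FlashText-style boundary handling, not data from the
# original suite. By default "-" is a word boundary, so "Berlin" inside
# "Berlin-Mitte" matches; once "-" is declared a non-word boundary,
# "Berlin-Mitte" is treated as a single word and nothing matches.
NON_WORD_BOUNDARY_CASES = [
    (
        "I live in Berlin-Mitte",
        {"name": "city", "elements": ["Berlin", "New York"]},
        [],  # no extra non-word-boundary characters
        ["Berlin"],
    ),
    (
        "I live in Berlin-Mitte",
        {"name": "city", "elements": ["Berlin", "New York"]},
        ["-"],  # treat "-" as part of a word
        [],
    ),
]
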
def test_train_tokenizer_action_name(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int, int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    # check action_name attribute: the action name is kept as a single token
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[ACTION_NAME])

    assert [t.text for t in tokens] == [text]

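# Illustrative cases for test_train_tokenizer_action_name above (assumptions):
# the test asserts that the action name stays a single token, so the expected
# tokens and indices simply describe the whole string.
ACTION_NAME_CASES = [
    ("action_listen", ["action_listen"], [(0, 13)]),
    ("utter_greet", ["utter_greet"], [(0, 11)]),
]
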
def test_train_and_process(
    create_or_load_extractor: Callable[..., RegexEntityExtractorGraphComponent],
    config: Dict[Text, Any],
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
    test_loading: bool,
):
    message = Message(data={TEXT: text})
    if test_loading:
        message_copy = copy.deepcopy(message)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message(
            data={
                TEXT: "Hi Max!",
                INTENT: "greet",
                ENTITIES: [{"entity": "person", "value": "Max"}],
            }
        ),
        Message(
            data={
                TEXT: "I live in Berlin",
                INTENT: "inform",
                ENTITIES: [{"entity": "city", "value": "Berlin"}],
            }
        ),
    ]

    entity_extractor = create_or_load_extractor(config)
    entity_extractor.train(training_data)
    entity_extractor.process([message])

    entities = message.get(ENTITIES)
    assert entities == expected_entities

    if test_loading:
        loaded_entity_extractor = create_or_load_extractor(config, load=True)
        loaded_entity_extractor.process([message_copy])
        assert message_copy.get(ENTITIES) == expected_entities
        assert loaded_entity_extractor.patterns == entity_extractor.patterns

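# Illustrative cases for test_train_and_process above; the values are
# assumptions, not from the original suite. The expected entity mirrors the
# ENTITY_ATTRIBUTE_* keys asserted in test_process_does_not_overwrite_any_entities,
# and the final flag also exercises the persist-and-load branch.
TRAIN_AND_PROCESS_CASES = [
    (
        {},  # config: component defaults
        "I live in Berlin",
        [{"name": "city", "elements": ["London", "Berlin"]}],
        [
            {
                ENTITY_ATTRIBUTE_TYPE: "city",
                ENTITY_ATTRIBUTE_VALUE: "Berlin",
                ENTITY_ATTRIBUTE_START: 10,
                ENTITY_ATTRIBUTE_END: 16,
                EXTRACTOR: RegexEntityExtractorGraphComponent.__name__,
            }
        ],
        True,  # test_loading
    ),
]
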
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer()

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(RESPONSE, text)
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

def test_train_tokenizer_e2e_actions(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int, int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    message.set(ACTION_TEXT, text)
    message.set(ACTION_NAME, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [ACTION_TEXT, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

def test_extract_patterns_use_only_entities_lookup_tables(
    entity: Text,
    lookup_tables: Dict[Text, Text],
    expected_patterns: List[Dict[Text, Text]],
):
    training_data = TrainingData()
    if entity:
        training_data.training_examples = [
            Message(
                data={
                    "text": "text",
                    "entities": [{"entity": entity, "value": "text"}],
                }
            )
        ]
    if lookup_tables:
        training_data.lookup_tables = [lookup_tables]

    actual_patterns = pattern_utils.extract_patterns(
        training_data, use_only_entities=True
    )

    assert actual_patterns == expected_patterns

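# Illustrative cases for the lookup-table variant above (assumptions): with
# use_only_entities=True, a lookup table only yields a pattern when its name
# matches an entity seen in the training examples. The regex generated for a
# matching table is implementation-defined, so only empty outcomes are shown.
LOOKUP_ONLY_ENTITIES_CASES = [
    (None, None, []),  # no entities, no lookup tables
    ("person", {"name": "city", "elements": ["Berlin"]}, []),  # name mismatch
]
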
def test_extract_patterns_use_only_entities_regexes(
    entity: Text,
    regex_features: Dict[Text, Text],
    expected_patterns: List[Dict[Text, Text]],
):
    training_data = TrainingData()
    if entity:
        training_data.training_examples = [
            Message(
                data={
                    "text": "text",
                    "intent": "greet",
                    "entities": [{"entity": entity, "value": "text"}],
                }
            )
        ]
    if regex_features:
        training_data.regex_features = [regex_features]

    actual_patterns = pattern_utils.extract_patterns(
        training_data, use_only_entities=True
    )

    assert actual_patterns == expected_patterns

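# Illustrative cases for the regex-feature variant above (assumptions): a regex
# feature passes through unchanged when its name matches an entity used in the
# training examples and is dropped otherwise.
REGEX_ONLY_ENTITIES_CASES = [
    (
        "person",
        {"name": "person", "pattern": "[A-Z][a-z]+"},
        [{"name": "person", "pattern": "[A-Z][a-z]+"}],
    ),
    ("person", {"name": "zipcode", "pattern": "[0-9]{5}"}, []),  # name mismatch
    (None, {"name": "person", "pattern": "[A-Z][a-z]+"}, []),  # no entities
]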