def test_convert_training_examples(
    spacy_nlp: Any,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
):
    """Featurizing one message should yield the expected per-attribute feature layout."""
    msg = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})
    msg.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    data = TrainingData([msg])

    # Run the full pipeline in order so both sparse (count vectors)
    # and dense (spaCy) features are attached to the message.
    for component in (SpacyTokenizer(), CountVectorsFeaturizer(), SpacyFeaturizer()):
        component.train(data)

    tag_specs = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]

    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [msg], attributes=attributes, entity_tag_specs=tag_specs,
    )

    assert len(output) == 1
    featurized = output[0]

    # Only the requested attributes may appear in the featurized example.
    for attribute in attributes:
        assert attribute in featurized
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in featurized

    # TEXT carries sparse sentence, sparse sequence, dense sentence,
    # and dense sequence features.
    assert len(featurized[TEXT]) == 4
    if INTENT in attributes:
        # intents only get sparse sentence features
        assert len(featurized[INTENT]) == 1
    if ENTITIES in attributes:
        # one feature array per entity tag spec
        assert len(featurized[ENTITIES]) == len(tag_specs)

    # sparse feature sizes must match what the featurizers actually produced
    assert sparse_feature_sizes == real_sparse_feature_sizes
def test_custom_intent_symbol(text, expected_tokens, spacy_nlp):
    """Intent tokenization should split intents on the configured custom symbol."""
    tokenizer = SpacyTokenizer(
        {"intent_tokenization_flag": True, "intent_split_symbol": "+"}
    )

    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    # use the text itself as the intent so the split symbol is exercised
    message.set(INTENT, text)

    tokenizer.train(TrainingData([message]))

    intent_tokens = message.get(TOKENS_NAMES[INTENT])
    assert [token.text for token in intent_tokens] == expected_tokens
def test_spacy_intent_tokenizer(spacy_nlp_component):
    """Training the SpacyTokenizer must not set intent tokens on any example."""
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    td = training_data.load_data("data/examples/rasa/demo-rasa.json")
    spacy_nlp_component.train(td, config=None)
    spacy_tokenizer = SpacyTokenizer()
    spacy_tokenizer.train(td, config=None)

    # `True if x is not None else False` was a redundant boolean and the list
    # was materialized only to feed any() — a generator of direct boolean
    # tests expresses the same check idiomatically.
    assert not any(
        example.get("intent_tokens") is not None for example in td.intent_examples
    )
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    """Training should tokenize both the TEXT and the RESPONSE attribute."""
    tokenizer = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]
    tokenizer.train(training_data)

    # both attributes carry the same text, so tokens and offsets must match
    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])
        assert [token.text for token in tokens] == expected_tokens
        assert [token.start for token in tokens] == [start for start, _ in expected_indices]
        assert [token.end for token in tokens] == [end for _, end in expected_indices]
def test_crf_extractor(spacy_nlp):
    """Train a CRFEntityExtractor and check it detects the cuisine entity.

    The second training example carries two pre-annotated entities with
    different `extractor` tags; the assertions below imply only the
    CRF-extracted "cuisine" annotation contributes to the learned model.
    """
    first_example = Message(
        "anywhere in the west",
        {
            "intent": "restaurant_search",
            "entities": [
                {"start": 16, "end": 20, "value": "west", "entity": "location"}
            ],
            SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
        },
    )
    second_example = Message(
        "central indian restaurant",
        {
            "intent": "restaurant_search",
            "entities": [
                {
                    "start": 0,
                    "end": 7,
                    "value": "central",
                    "entity": "location",
                    "extractor": "random_extractor",
                },
                {
                    "start": 8,
                    "end": 14,
                    "value": "indian",
                    "entity": "cuisine",
                    "extractor": "CRFEntityExtractor",
                },
            ],
            SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
        },
    )

    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    spacy_tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=[first_example, second_example])
    spacy_tokenizer.train(training_data)
    crf_extractor.train(training_data)

    # Run the trained pipeline on an unseen sentence.
    sentence = "italian restaurant"
    prediction_message = Message(sentence, {SPACY_DOCS[TEXT]: spacy_nlp(sentence)})
    spacy_tokenizer.process(prediction_message)
    crf_extractor.process(prediction_message)

    detected = prediction_message.get(ENTITIES)
    assert len(detected) == 1
    assert detected[0]["entity"] == "cuisine"
    assert detected[0]["value"] == "italian"