def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train a CRFEntityExtractor on two annotated messages and verify
    tokenisation, feature extraction and trainable-entity filtering."""
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config=ner_crf_pos_feature_config)

    first_text = "anywhere in the west"
    second_text = "central indian restaurant"
    training_messages = [
        Message(first_text, {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp(first_text),
        }),
        Message(second_text, {
            "intent": "restaurant_search",
            "entities": [
                {"start": 0, "end": 7, "value": "central",
                 "entity": "location", "extractor": "random_extractor"},
                {"start": 8, "end": 14, "value": "indian",
                 "entity": "cuisine", "extractor": "CRFEntityExtractor"},
            ],
            "spacy_doc": spacy_nlp(second_text),
        }),
    ]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=training_messages),
                    RasaNLUModelConfig())

    sentence = 'anywhere in the west'
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = extractor._from_text_to_crf(Message(sentence, doc))
    # each CRF token tuple carries the surface form at position 0
    assert [token[0] for token in crf_format] == [
        'anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_format)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    sentence = 'anywhere in the west'
    extractor.extract_entities(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = extractor.filter_trainable_entities(training_messages)
    assert filtered[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert filtered[1].get('entities') == [
        {"start": 8, "end": 14, "value": "indian",
         "entity": "cuisine", "extractor": "CRFEntityExtractor"}
    ], 'Only CRFEntityExtractor entity annotation remains'
    assert training_messages[1].get('entities')[0] == {
        "start": 0, "end": 7, "value": "central",
        "entity": "location", "extractor": "random_extractor"
    }, 'Original examples are not mutated'
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train a CRFEntityExtractor (registered as ``ner_crf``) on two
    annotated messages and verify tokenisation, feature extraction and
    trainable-entity filtering."""
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config=ner_crf_pos_feature_config)

    first_text = "anywhere in the west"
    second_text = "central indian restaurant"
    training_messages = [
        Message(first_text, {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20,
                          "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp(first_text),
        }),
        Message(second_text, {
            "intent": "restaurant_search",
            "entities": [
                {"start": 0, "end": 7, "value": "central",
                 "entity": "location", "extractor": "random_extractor"},
                {"start": 8, "end": 14, "value": "indian",
                 "entity": "cuisine", "extractor": "ner_crf"},
            ],
            "spacy_doc": spacy_nlp(second_text),
        }),
    ]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=training_messages),
                    RasaNLUModelConfig())

    sentence = 'anywhere in the west'
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = extractor._from_text_to_crf(Message(sentence, doc))
    # each CRF token tuple carries the surface form at position 0
    assert [token[0] for token in crf_format] == [
        'anywhere', 'in', 'the', 'west']

    features = extractor._sentence_to_features(crf_format)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    sentence = 'anywhere in the west'
    extractor.extract_entities(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = extractor.filter_trainable_entities(training_messages)
    assert filtered[0].get('entities') == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], 'Entity without extractor remains'
    assert filtered[1].get('entities') == [
        {"start": 8, "end": 14, "value": "indian",
         "entity": "cuisine", "extractor": "ner_crf"}
    ], 'Only ner_crf entity annotation remains'
    assert training_messages[1].get('entities')[0] == {
        "start": 0, "end": 7, "value": "central",
        "entity": "location", "extractor": "random_extractor"
    }, 'Original examples are not mutated'
from rasa_nlu.train import load_data
from rasa_nlu.config import RasaNLUModelConfig
from rasa_nlu.utils.spacy_utils import SpacyNLP
from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
import spacy

# Exploratory script: run the spaCy pipeline components over the demo
# training data and inspect how examples are converted for CRF training.
model_config = RasaNLUModelConfig()
demo_data = load_data("data/examples/rasa/demo-rasa.json")

# The SpacyNLP / SpacyTokenizer train() calls attach spacy docs and
# tokens to each training example in place.
SpacyNLP(nlp=spacy.load("en")).train(demo_data, model_config)
SpacyTokenizer().train(demo_data, model_config)

print(demo_data.training_examples[25].as_dict())

crf_extractor = CRFEntityExtractor()
trainable_examples = crf_extractor.filter_trainable_entities(
    demo_data.training_examples)

# Create Dataset
# dataset = crf_extractor._create_dataset(trainable_examples)

## Convert Examples
dataset = []

## Convert JSON TO CRF
for example in trainable_examples:
    entity_offsets = crf_extractor._convert_example(example)
    print("Entity Offset", entity_offsets)
    # b = crf_extractor._from_json_to_crf(example, entity_offsets)
    # print("JSON to CRF", b)