def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on two messages and inspect its features.

    The extractor is trained with an explicit ``ner_crf`` configuration
    (BILOU tagging enabled, features taken from the extractor itself).
    Afterwards the test checks the token texts produced by the
    text-to-CRF conversion, the ``BOS``/``EOS`` markers and the
    lower-cased form of the second token in the generated features, and
    finally runs entity extraction once without asserting on its result.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()

    west_message = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_message = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location"},
        ],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    training_messages = [west_message, central_message]

    crf_settings = {"BILOU_flag": True, "features": extractor.crf_features}
    model_config = {"ner_crf": crf_settings}
    extractor.train(TrainingData(training_examples=training_messages),
                    model_config)

    # Convert a fresh message and make sure tokenisation is as expected.
    text = 'anywhere in the west'
    probe = Message(text, {"spacy_doc": spacy_nlp(text)})
    crf_tokens = extractor._from_text_to_crf(probe)
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    # Sentence boundaries and the lower-cased token must be in the features.
    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Smoke test only: extraction has to run, its output is not checked.
    text = 'anywhere in the west'
    extractor.extract_entities(Message(text, {"spacy_doc": spacy_nlp(text)}))
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train the CRF extractor and check features and entity filtering.

    The extractor is built from the ``ner_crf_pos_feature_config`` fixture
    and trained on two messages. The test then checks the token texts of
    the text-to-CRF conversion, the ``BOS``/``EOS`` markers and lower-cased
    token in the generated features, runs extraction once as a smoke test,
    and finally verifies ``filter_trainable_entities``: entities without an
    ``extractor`` key and entities labelled ``CRFEntityExtractor`` are
    kept, entities of other extractors are dropped, and the input examples
    are left untouched.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config=ner_crf_pos_feature_config)

    west_message = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_message = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central",
             "entity": "location", "extractor": "random_extractor"},
            {"start": 8, "end": 14, "value": "indian",
             "entity": "cuisine", "extractor": "CRFEntityExtractor"},
        ],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_message, central_message]

    # uses BILOU and the default features
    training_data = TrainingData(training_examples=examples)
    extractor.train(training_data, RasaNLUModelConfig())

    # Convert a fresh message and make sure tokenisation is as expected.
    text = 'anywhere in the west'
    message_data = {"spacy_doc": spacy_nlp(text)}
    crf_tokens = extractor._from_text_to_crf(Message(text, message_data))
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    # Sentence boundaries and the lower-cased token must be in the features.
    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Smoke test only: extraction has to run, its output is not checked.
    text = 'anywhere in the west'
    extractor.extract_entities(Message(text, {"spacy_doc": spacy_nlp(text)}))

    trainable = extractor.filter_trainable_entities(examples)

    expected_first = [
        {"start": 16, "end": 20, "value": "west", "entity": "location"},
    ]
    assert trainable[0].get('entities') == expected_first, \
        'Entity without extractor remains'

    expected_second = [
        {"start": 8, "end": 14, "value": "indian",
         "entity": "cuisine", "extractor": "CRFEntityExtractor"},
    ]
    assert trainable[1].get('entities') == expected_second, \
        'Only CRFEntityExtractor entity annotation remains'

    untouched_entity = {
        "start": 0, "end": 7, "value": "central",
        "entity": "location", "extractor": "random_extractor",
    }
    assert examples[1].get('entities')[0] == untouched_entity, \
        'Original examples are not mutated'
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Train the CRF extractor and check features and entity filtering.

    The extractor is built from the ``ner_crf_pos_feature_config`` fixture
    and trained on two messages. The test then checks the token texts of
    the text-to-CRF conversion, the ``BOS``/``EOS`` markers and lower-cased
    token in the generated features, runs extraction once as a smoke test,
    and finally verifies ``filter_trainable_entities``: entities without an
    ``extractor`` key and entities labelled ``ner_crf`` are kept, entities
    of other extractors are dropped, and the input examples are left
    untouched.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config=ner_crf_pos_feature_config)

    west_message = Message("anywhere in the west", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
        "spacy_doc": spacy_nlp("anywhere in the west"),
    })
    central_message = Message("central indian restaurant", {
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central",
             "entity": "location", "extractor": "random_extractor"},
            {"start": 8, "end": 14, "value": "indian",
             "entity": "cuisine", "extractor": "ner_crf"},
        ],
        "spacy_doc": spacy_nlp("central indian restaurant"),
    })
    examples = [west_message, central_message]

    # uses BILOU and the default features
    training_data = TrainingData(training_examples=examples)
    extractor.train(training_data, RasaNLUModelConfig())

    # Convert a fresh message and make sure tokenisation is as expected.
    text = 'anywhere in the west'
    message_data = {"spacy_doc": spacy_nlp(text)}
    crf_tokens = extractor._from_text_to_crf(Message(text, message_data))
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    # Sentence boundaries and the lower-cased token must be in the features.
    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert features[1]['0:low'] == "in"

    # Smoke test only: extraction has to run, its output is not checked.
    text = 'anywhere in the west'
    extractor.extract_entities(Message(text, {"spacy_doc": spacy_nlp(text)}))

    trainable = extractor.filter_trainable_entities(examples)

    expected_first = [
        {"start": 16, "end": 20, "value": "west", "entity": "location"},
    ]
    assert trainable[0].get('entities') == expected_first, \
        'Entity without extractor remains'

    expected_second = [
        {"start": 8, "end": 14, "value": "indian",
         "entity": "cuisine", "extractor": "ner_crf"},
    ]
    assert trainable[1].get('entities') == expected_second, \
        'Only ner_crf entity annotation remains'

    untouched_entity = {
        "start": 0, "end": 7, "value": "central",
        "entity": "location", "extractor": "random_extractor",
    }
    assert examples[1].get('entities')[0] == untouched_entity, \
        'Original examples are not mutated'
def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on two dict examples and inspect features.

    Training receives the examples via ``training_examples``, the spaCy
    pipeline, ``True`` as the third positional argument and the extractor's
    own ``crf_features``. The test then checks the token texts of the
    text-to-CRF conversion and the ``BOS``, ``EOS`` and ``0:low:in``
    entries of the generated features, and finally runs entity extraction
    once without asserting on its result.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()

    west_example = {
        "text": "anywhere in the west",
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
    }
    central_example = {
        "text": "central indian restaurant",
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location"},
        ],
    }
    training_data = TrainingData(
        training_examples=[west_example, central_example])
    extractor.train(training_data, spacy_nlp, True, extractor.crf_features)

    # Tokenisation of a raw sentence must yield the four expected tokens.
    crf_tokens = extractor._from_text_to_crf('anywhere in the west',
                                             spacy_nlp)
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    # Boundary markers and the lower-cased second token must be present.
    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert '0:low:in' in features[1]

    # Smoke test only: extraction has to run, its output is not checked.
    extractor.extract_entities('anywhere in the west', spacy_nlp)
def test_crf_extractor(spacy_nlp):
    """Train the CRF extractor on two dict examples and inspect features.

    Training receives the examples via ``entity_examples_only``, the spaCy
    pipeline, ``True`` as the third positional argument and the extractor's
    own ``crf_features``. The test then checks the token texts of the
    text-to-CRF conversion and the ``BOS``, ``EOS`` and ``0:low:in``
    entries of the generated features, and finally runs entity extraction
    once without asserting on its result.
    """
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor()

    west_example = {
        "text": "anywhere in the west",
        "intent": "restaurant_search",
        "entities": [
            {"start": 16, "end": 20, "value": "west", "entity": "location"},
        ],
    }
    central_example = {
        "text": "central indian restaurant",
        "intent": "restaurant_search",
        "entities": [
            {"start": 0, "end": 7, "value": "central", "entity": "location"},
        ],
    }
    training_data = TrainingData(
        entity_examples_only=[west_example, central_example])
    extractor.train(training_data, spacy_nlp, True, extractor.crf_features)

    # Tokenisation of a raw sentence must yield the four expected tokens.
    crf_tokens = extractor._from_text_to_crf('anywhere in the west',
                                             spacy_nlp)
    token_texts = [token[0] for token in crf_tokens]
    assert token_texts == ['anywhere', 'in', 'the', 'west']

    # Boundary markers and the lower-cased second token must be present.
    features = extractor._sentence_to_features(crf_tokens)
    assert 'BOS' in features[0]
    assert 'EOS' in features[-1]
    assert '0:low:in' in features[1]

    # Smoke test only: extraction has to run, its output is not checked.
    extractor.extract_entities('anywhere in the west', spacy_nlp)
# print("JSON to CRF", b) ### _bilou tags from offset ents = crf._bilou_tags_from_offsets(training_example.get("tokens"), entity_offsets) print("BILOU tags....", ents) text = crf._from_text_to_crf(training_example, ents) print("TEXT to CRF", text) dataset.append(text) #print (dataset) # Token, POSTag, Entity, pattern(In case of regex features) # Train Model import sklearn_crfsuite X_train = [crf._sentence_to_features(sent) for sent in dataset] print("X_Train...", X_train[-1]) y_train = [crf._sentence_to_labels(sent) for sent in dataset] print("Y_Train.......", y_train[-1]) crf.ent_tagger = sklearn_crfsuite.CRF( algorithm='lbfgs', # coefficient for L1 penalty c1=crf.component_config["L1_c"], # coefficient for L2 penalty c2=crf.component_config["L2_c"], # stop earlier max_iterations=crf.component_config["max_iterations"], # include transitions that are possible, but not observed all_possible_transitions=True) crf.ent_tagger.fit(X_train, y_train)