def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """End-to-end check of CRFEntityExtractor: training, CRF feature
    conversion, entity extraction, and trainable-entity filtering."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    west_text = "anywhere in the west"
    central_text = "central indian restaurant"

    west_message = Message(
        west_text,
        {
            "intent": "restaurant_search",
            "entities": [
                {"start": 16, "end": 20, "value": "west", "entity": "location"}
            ],
            "spacy_doc": spacy_nlp(west_text),
        },
    )
    central_message = Message(
        central_text,
        {
            "intent": "restaurant_search",
            "entities": [
                {
                    "start": 0,
                    "end": 7,
                    "value": "central",
                    "entity": "location",
                    "extractor": "random_extractor",
                },
                {
                    "start": 8,
                    "end": 14,
                    "value": "indian",
                    "entity": "cuisine",
                    "extractor": "CRFEntityExtractor",
                },
            ],
            "spacy_doc": spacy_nlp(central_text),
        },
    )
    examples = [west_message, central_message]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=examples), RasaNLUModelConfig())

    sentence = "anywhere in the west"
    crf_format = extractor._from_text_to_crf(
        Message(sentence, {"spacy_doc": spacy_nlp(sentence)})
    )
    assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"]

    feats = extractor._sentence_to_features(crf_format)
    assert "BOS" in feats[0]
    assert "EOS" in feats[-1]
    assert feats[1]["0:low"] == "in"

    extractor.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = extractor.filter_trainable_entities(examples)
    assert filtered[0].get("entities") == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], "Entity without extractor remains"
    assert filtered[1].get("entities") == [
        {
            "start": 8,
            "end": 14,
            "value": "indian",
            "entity": "cuisine",
            "extractor": "CRFEntityExtractor",
        }
    ], "Only CRFEntityExtractor entity annotation remains"
    assert examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
class IncrementalCRFEntityExtractor(EntityExtractor, IncrementalComponent):
    """Incremental wrapper around CRFEntityExtractor.

    Processes an utterance one incremental unit (IU) at a time: on an
    "add" IU it re-runs entity extraction on the message, and on a
    "revoke" IU it restores the entity state from before the revoked add.
    """

    provides = ["entities"]

    requires = ["tokens"]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None,
                 ent_tagger: Optional[Dict[Text, Any]] = None) -> None:
        super(IncrementalCRFEntityExtractor, self).__init__(component_config)
        # Delegate all actual CRF training/extraction to the
        # non-incremental extractor.
        self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
        # Undo stack: entity states captured *before* each "add" IU,
        # so a "revoke" can restore the prior state.
        self.prev_ents = []

    def new_utterance(self):
        """Reset incremental state at the start of a new utterance."""
        self.prev_ents = []

    @classmethod
    def required_packages(cls):
        return ["sklearn_crfsuite", "sklearn"]

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train the wrapped CRF extractor."""
        self.CRFEE.train(training_data, config, **kwargs)

    def process(self, message: Message, **kwargs: Any) -> None:
        """Apply the latest IU: extract entities on "add", undo on "revoke".

        NOTE(review): this will not work with multiple extractors, and we
        snapshot state on every add even when no new entity appeared
        (inefficient, as the original TODO noted).
        """
        iu_list = message.get('iu_list')
        last_iu = iu_list[-1]
        iu_word, iu_type = last_iu
        if iu_type == "add":
            # BUGFIX: snapshot the entity state *before* applying this
            # add. The previous code appended message.get("entities")
            # after message.set(...), so a revoke restored the post-add
            # state (i.e. the state being revoked) instead of the prior one.
            self.prev_ents.append(message.get("entities"))
            extracted = self.add_extractor_name(
                self.CRFEE.extract_entities(message))
            message.set("entities", extracted, add_to_output=True)
        elif iu_type == "revoke":
            if self.prev_ents:
                # Restore the entity state from before the revoked add.
                prev_ent = self.prev_ents.pop()
                message.set("entities", prev_ent, add_to_output=True)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCRFEntityExtractor'] = None,
             **kwargs: Any
             ) -> 'IncrementalCRFEntityExtractor':
        """Load a persisted tagger from model_dir, or return an untrained
        instance if no model file exists."""
        # NOTE(review): sklearn.externals.joblib is deprecated and removed
        # in scikit-learn >= 0.23; migrate to `import joblib` when the
        # project's sklearn pin allows it.
        from sklearn.externals import joblib

        file_name = meta.get("file")
        model_file = os.path.join(model_dir, file_name)

        if os.path.exists(model_file):
            ent_tagger = joblib.load(model_file)
            return cls(meta, ent_tagger)
        else:
            return cls(meta)

    def persist(self,
                file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this model into the passed directory.

        Returns the metadata necessary to load the model again."""
        # Suffix the file name so the incremental wrapper's model does not
        # collide with a plain CRFEntityExtractor's persisted model.
        return self.CRFEE.persist(file_name + "_incr", model_dir)