Exemplo n.º 1
0
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Exercise CRFEntityExtractor training, featurisation and filtering.

    Trains the extractor on two annotated messages, checks the token format
    and the features produced for a sentence, runs entity extraction once,
    and finally verifies that ``filter_trainable_entities`` keeps only the
    expected annotations while leaving the input examples untouched.
    """
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    # First example: a single entity with no "extractor" key.
    west_example = Message(
        "anywhere in the west",
        {
            "intent": "restaurant_search",
            "entities": [
                {
                    "start": 16,
                    "end": 20,
                    "value": "west",
                    "entity": "location",
                },
            ],
            "spacy_doc": spacy_nlp("anywhere in the west"),
        },
    )

    # Second example: one entity attributed to another extractor and one
    # attributed to CRFEntityExtractor.
    central_example = Message(
        "central indian restaurant",
        {
            "intent": "restaurant_search",
            "entities": [
                {
                    "start": 0,
                    "end": 7,
                    "value": "central",
                    "entity": "location",
                    "extractor": "random_extractor",
                },
                {
                    "start": 8,
                    "end": 14,
                    "value": "indian",
                    "entity": "cuisine",
                    "extractor": "CRFEntityExtractor",
                },
            ],
            "spacy_doc": spacy_nlp("central indian restaurant"),
        },
    )

    examples = [west_example, central_example]

    # uses BILOU and the default features
    ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig())

    sentence = "anywhere in the west"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = ext._from_text_to_crf(Message(sentence, doc))
    tokens = [word[0] for word in crf_format]
    assert tokens == ["anywhere", "in", "the", "west"]

    feats = ext._sentence_to_features(crf_format)
    first_token_feats = feats[0]
    last_token_feats = feats[-1]
    assert "BOS" in first_token_feats
    assert "EOS" in last_token_feats
    assert feats[1]["0:low"] == "in"

    # Extraction is only required to run; its result is not inspected here.
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = ext.filter_trainable_entities(examples)

    # Expected values are fresh literals (not shared with ``examples``) so
    # the mutation check below stays meaningful.
    kept_in_first = filtered[0].get("entities")
    assert kept_in_first == [{
        "start": 16,
        "end": 20,
        "value": "west",
        "entity": "location"
    }], "Entity without extractor remains"

    kept_in_second = filtered[1].get("entities")
    assert kept_in_second == [{
        "start": 8,
        "end": 14,
        "value": "indian",
        "entity": "cuisine",
        "extractor": "CRFEntityExtractor",
    }], "Only CRFEntityExtractor entity annotation remains"

    original_first_entity = examples[1].get("entities")[0]
    assert original_first_entity == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
class IncrementalCRFEntityExtractor(EntityExtractor, IncrementalComponent):
    """Incremental front-end for ``CRFEntityExtractor``.

    Training, extraction and persistence are delegated to a wrapped
    ``CRFEntityExtractor`` (``self.CRFEE``). On top of that, this component
    keeps a stack of the entity lists it has produced (``self.prev_ents``)
    so that a "revoke" unit can put a stored list back on the message.
    """

    provides = ["entities"]

    requires = ["tokens"]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]] = None,
                 ent_tagger: Optional[Dict[Text, Any]] = None) -> None:
        """Create the component and the wrapped CRF extractor.

        Args:
            component_config: Configuration forwarded both to the base class
                and to the wrapped ``CRFEntityExtractor``.
            ent_tagger: Optional already-loaded tagger forwarded to the
                wrapped ``CRFEntityExtractor``.
        """
        super().__init__(component_config)

        self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
        # One entry per processed "add" unit; popped again on "revoke".
        self.prev_ents = []

    def new_utterance(self) -> None:
        """Forget all stored entity snapshots."""
        self.prev_ents = []

    @classmethod
    def required_packages(cls):
        """Return the names of the packages this component needs."""
        return ["sklearn_crfsuite", "sklearn"]

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train the wrapped ``CRFEntityExtractor`` on ``training_data``."""
        self.CRFEE.train(training_data, config, **kwargs)

    def process(self, message: Message, **kwargs: Any) -> None:
        """Update the message's entities according to the latest unit.

        The last item of ``message.get('iu_list')`` is unpacked as a
        ``(word, type)`` pair. For type ``"add"`` the wrapped extractor is
        run, the result is set as the message's ``"entities"`` and a
        snapshot is pushed onto ``self.prev_ents``. For type ``"revoke"``
        the most recent snapshot (if any) is popped and set as the
        message's ``"entities"``. Any other type is ignored.

        A message with a missing or empty ``iu_list`` is left untouched.
        """
        iu_list = message.get('iu_list')
        if not iu_list:
            # Nothing has been added or revoked yet, so there is no unit to
            # act on (previously this raised TypeError / IndexError).
            return

        _, iu_type = iu_list[-1]
        # TODO: inefficient right now, we are always storing
        # previous state, even if a new entity hasn't been
        # added

        # This will not work with multiple extractors
        if iu_type == "add":
            extracted = self.add_extractor_name(
                self.CRFEE.extract_entities(message))
            message.set("entities", extracted, add_to_output=True)
            self.prev_ents.append(message.get("entities"))
        elif iu_type == "revoke":
            # NOTE(review): the popped snapshot is the one stored by the
            # most recent "add" - confirm this is the state that should be
            # restored rather than the one before it.
            if self.prev_ents:
                prev_ent = self.prev_ents.pop()
                message.set("entities", prev_ent,
                            add_to_output=True)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCRFEntityExtractor'] = None,
             **kwargs: Any
             ) -> 'IncrementalCRFEntityExtractor':
        """Build the component, loading a persisted tagger when one exists.

        Args:
            meta: Component metadata; its ``"file"`` entry names the
                persisted tagger inside ``model_dir``.
            model_dir: Directory containing the persisted model.
            model_metadata: Unused here.
            cached_component: Unused here.

        Returns:
            An instance wrapping the loaded tagger, or an instance without a
            tagger when ``model_dir`` / the file name is missing or the file
            does not exist.
        """
        file_name = meta.get("file")
        if model_dir is None or not file_name:
            # Without both parts there is no path to look at; behave like
            # the "file does not exist" case instead of failing in join().
            return cls(meta)

        model_file = os.path.join(model_dir, file_name)
        if not os.path.exists(model_file):
            return cls(meta)

        # ``sklearn.externals.joblib`` was removed from newer scikit-learn
        # releases; keep using it where it exists and fall back to the
        # standalone ``joblib`` package otherwise.
        try:
            from sklearn.externals import joblib
        except ImportError:
            import joblib

        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code - only load models from trusted directories.
        ent_tagger = joblib.load(model_file)
        return cls(meta, ent_tagger)

    def persist(self,
                file_name: Text,
                model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this model into the passed directory.

        The wrapped extractor is persisted under ``file_name`` with an
        ``"_incr"`` suffix appended.

        Returns the metadata necessary to load the model again."""

        return self.CRFEE.persist(file_name + "_incr", model_dir)