def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    """Without BILOU tagging every predicted token becomes its own entity."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    text = "I need a home cleaning close-by"
    spacy_attrs = {"spacy_doc": spacy_nlp(text)}
    predictions = [
        {"O": 1.0},
        {"O": 1.0},
        {"O": 1.0},
        {"what": 1.0},
        {"what": 1.0},
        {"where": 1.0},
        {"where": 1.0},
        {"where": 1.0},
    ]
    results = extractor._from_crf_to_json(Message(text, spacy_attrs), predictions)

    # non BILOU will split multi-word entities - hence 5
    assert len(results) == 5, "There should be five entities"

    expected = [
        {"start": 9, "end": 13, "value": "home", "entity": "what"},
        {"start": 14, "end": 22, "value": "cleaning", "entity": "what"},
        {"start": 23, "end": 28, "value": "close", "entity": "where"},
        {"start": 28, "end": 29, "value": "-", "entity": "where"},
        {"start": 29, "end": 31, "value": "by", "entity": "where"},
    ]
    for entity, wanted in zip(results, expected):
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]
        assert entity == wanted
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    """BILOU tags let the extractor merge consecutive tokens into one entity."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    text = "I need a home cleaning close-by"
    message = Message(text, {"spacy_doc": spacy_nlp(text)})
    predictions = [
        {"O": 1.0},
        {"O": 1.0},
        {"O": 1.0},
        {"B-what": 1.0},
        {"L-what": 1.0},
        {"B-where": 1.0},
        {"I-where": 1.0},
        {"L-where": 1.0},
    ]
    entities = extractor._from_crf_to_json(message, predictions)

    assert len(entities) == 2, "There should be two entities"

    expected = [
        {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"},
        {"start": 23, "end": 31, "value": "close-by", "entity": "where"},
    ]
    for entity, wanted in zip(entities, expected):
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]
        assert entity == wanted
def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
    """Enabling "text_dense_features" copies dense feature values into CRF features."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    featurizer = SpacyFeaturizer()
    tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))
    tokenizer.process(message)
    featurizer.process(message)

    crf_tokens = extractor._from_text_to_crf(message)
    crf_features = extractor._sentence_to_features(crf_tokens)

    assert "0:text_dense_features" in crf_features[0]

    # Each dense feature value must appear verbatim under its string index.
    dense = message.data.get("text_dense_features")
    mapped = crf_features[0]["0:text_dense_features"]["text_dense_features"]
    for idx in range(len(dense[0])):
        assert mapped[str(idx)] == dense[0][idx]
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """A trained extractor and its re-loaded copy must process identically."""
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"]
    )
    training_data = importer.get_nlu_data()
    whitespace_tokenizer.process_training_data(training_data)

    trained = crf_entity_extractor({})
    trained.train(training_data)

    original = Message(data={TEXT: "I am looking for an italian restaurant"})
    whitespace_tokenizer.process([original])
    clone = copy.deepcopy(original)

    processed = trained.process([original])[0]

    reloaded = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )
    processed_by_reloaded = reloaded.process([clone])[0]

    # Identical fingerprints mean identical processing results.
    assert processed_by_reloaded.fingerprint() == processed.fingerprint()
    # The reloaded model must expose exactly the same entity taggers.
    assert list(reloaded.entity_taggers.keys()) == list(trained.entity_taggers.keys())
def __init__(self,
             component_config: Optional[Dict[Text, Any]]=None,
             ent_tagger: Optional[Dict[Text, Any]]=None) -> None:
    """Create the incremental wrapper.

    ``ent_tagger`` is forwarded to the wrapped ``CRFEntityExtractor``;
    presumably a pre-trained tagger — confirm against CRFEntityExtractor.
    """
    super(IncrementalCRFEntityExtractor, self).__init__(component_config)
    # All actual CRF work is delegated to this wrapped extractor.
    self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
    # Stack of entity states from earlier increments, used for rollback.
    self.prev_ents = []
def load_entity_extractor(data_file, config_file):
    """Train a CRF entity extractor and persist it to disk.

    Args:
        data_file: Path to the NLU training data file.
        config_file: Path to the model configuration file.

    Returns:
        The directory the trained model was persisted to.
    """
    training_data = load_data(data_file)
    configuration = config.load(config_file)

    # The previous version created an unused ComponentBuilder and carried
    # commented-out experiments; both removed as dead code.
    crf = CRFEntityExtractor()
    crf.train(training_data, configuration)
    return crf.persist('./models/default/')
def inner(config: Dict[Text, Any]) -> CRFEntityExtractor:
    """Build a CRFEntityExtractor whose config overrides the defaults."""
    merged_config = {**CRFEntityExtractor.get_default_config(), **config}
    return CRFEntityExtractor.create(
        merged_config,
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )
def test_most_likely_entity(
    entity_predictions: List[Dict[Text, float]],
    expected_label: Text,
    expected_confidence: float,
):
    """_most_likely_tag should pick the highest-probability tag and its score."""
    extractor = CRFEntityExtractor({"BILOU_flag": True})

    label, confidence = extractor._most_likely_tag(entity_predictions)

    assert label == expected_label
    assert confidence == expected_confidence
def test_crf_json_from_BILOU(spacy_nlp):
    """BILOU predictions are merged into two multi-token entities."""
    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "bias",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    text = "I need a home cleaning close-by"
    message = Message(text, {SPACY_DOCS[TEXT]: spacy_nlp(text)})
    SpacyTokenizer().process(message)

    tags = ["O", "O", "O", "B-what", "L-what", "B-where", "I-where", "L-where"]
    entities = extractor._from_crf_to_json(message, [{tag: 1.0} for tag in tags])

    assert len(entities) == 2, "There should be two entities"

    expected = [
        {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"},
        {"start": 23, "end": 31, "value": "close-by", "entity": "where"},
    ]
    for entity, wanted in zip(entities, expected):
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]
        assert entity == wanted
async def test_train_persist_with_different_configurations(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    config_params: Dict[Text, Any],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    spacy_tokenizer: SpacyTokenizer,
    spacy_featurizer: SpacyFeaturizer,
    spacy_nlp_component: SpacyNLP,
    spacy_model: SpacyModel,
):
    """Training, persisting and re-loading must give identical predictions."""
    extractor = crf_entity_extractor(config_params)

    importer = RasaFileImporter(training_data_paths=["data/examples/rasa"])
    training_data = importer.get_nlu_data()
    training_data = spacy_nlp_component.process_training_data(
        training_data, spacy_model
    )
    training_data = spacy_tokenizer.process_training_data(training_data)
    training_data = spacy_featurizer.process_training_data(training_data)
    extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    messages = spacy_nlp_component.process([message], spacy_model)
    messages = spacy_tokenizer.process(messages)
    message = spacy_featurizer.process(messages)[0]
    duplicate = copy.deepcopy(message)

    processed = extractor.process([message])[0]

    reloaded = CRFEntityExtractor.load(
        {**CRFEntityExtractor.get_default_config(), **config_params},
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )
    processed_by_reloaded = reloaded.process([duplicate])[0]

    # Same fingerprint => identical processing before and after reload.
    assert processed_by_reloaded.fingerprint() == processed.fingerprint()

    entities = processed_by_reloaded.get(ENTITIES)
    assert len(entities) == 1
    assert entities[0]["entity"] == "cuisine"
    assert entities[0]["value"] == "italian"
def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    """Without BILOU tags, multi-word entities come back one token at a time."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    ner_crf_pos_feature_config.update({"BILOU_flag": False})
    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)

    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}

    tags = ["O", "O", "O", "what", "what", "where", "where", "where"]
    found = extractor._from_crf_to_json(
        Message(sentence, doc), [{tag: 1.0} for tag in tags]
    )

    # non BILOU will split multi-word entities - hence 5
    assert len(found) == 5, "There should be five entities"

    for entity in found:
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]

    assert found == [
        {"start": 9, "end": 13, "value": "home", "entity": "what"},
        {"start": 14, "end": 22, "value": "cleaning", "entity": "what"},
        {"start": 23, "end": 28, "value": "close", "entity": "where"},
        {"start": 28, "end": 29, "value": "-", "entity": "where"},
        {"start": 29, "end": 31, "value": "by", "entity": "where"},
    ]
def test_crf_json_from_non_BILOU(spacy_nlp):
    """Without the BILOU flag each predicted token yields a separate entity."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(
        component_config={
            "BILOU_flag": False,
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ],
        }
    )
    text = "I need a home cleaning close-by"
    message = Message(text, {SPACY_DOCS[TEXT]: spacy_nlp(text)})
    SpacyTokenizer().process(message)

    labels = ["O", "O", "O", "what", "what", "where", "where", "where"]
    found = extractor._from_crf_to_json(message, [{label: 1.0} for label in labels])

    # non BILOU will split multi-word entities - hence 5
    assert len(found) == 5, "There should be five entities"

    for entity in found:
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]

    assert found == [
        {"start": 9, "end": 13, "value": "home", "entity": "what"},
        {"start": 14, "end": 22, "value": "cleaning", "entity": "what"},
        {"start": 23, "end": 28, "value": "close", "entity": "where"},
        {"start": 28, "end": 29, "value": "-", "entity": "where"},
        {"start": 29, "end": 31, "value": "by", "entity": "where"},
    ]
def test_crf_use_dense_features(spacy_nlp: Any):
    """Dense message features should be copied verbatim into the CRF features."""
    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    SpacyTokenizer().process(message)
    SpacyFeaturizer().process(message)

    crf_tokens = extractor._convert_to_crf_tokens(message)
    crf_features = extractor._crf_tokens_to_features(crf_tokens)

    assert "0:text_dense_features" in crf_features[0]

    dense, _ = message.get_dense_features(TEXT, [])
    if dense:
        dense = dense.features
    mapped = crf_features[0]["0:text_dense_features"]["text_dense_features"]
    for idx in range(len(dense[0])):
        assert mapped[str(idx)] == dense[0][idx]
def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config):
    """BILOU-tagged predictions are merged into whole multi-token entities."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}

    tags = ["O", "O", "O", "B-what", "L-what", "B-where", "I-where", "L-where"]
    found = extractor._from_crf_to_json(
        Message(sentence, doc), [{tag: 1.0} for tag in tags]
    )

    assert len(found) == 2, "There should be two entities"

    for entity in found:
        assert entity["confidence"]  # confidence should exist
        del entity["confidence"]

    assert found[0] == {
        "start": 9,
        "end": 22,
        "value": "home cleaning",
        "entity": "what",
    }
    assert found[1] == {
        "start": 23,
        "end": 31,
        "value": "close-by",
        "entity": "where",
    }
def test_crf_use_dense_features(spacy_nlp):
    """Dense sentence features must be mirrored into the CRF token features."""
    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    SpacyTokenizer().process(message)
    SpacyFeaturizer().process(message)

    crf_tokens = extractor._from_text_to_crf(message)
    crf_features = extractor._sentence_to_features(crf_tokens)

    assert "0:text_dense_features" in crf_features[0]

    dense = message.data.get("text_dense_features")
    mapped = crf_features[0]["0:text_dense_features"]["text_dense_features"]
    for idx in range(len(dense[0])):
        assert mapped[str(idx)] == dense[0][idx]
# NOTE(review): the leading assertions below belong to a test function whose
# definition starts before this chunk; they check which response-selection
# report files were (and were not) written to ``report_folder``.
assert os.path.exists(
    os.path.join(report_folder, "response_selection_confusion_matrix.png"))
assert os.path.exists(
    os.path.join(report_folder, "response_selection_histogram.png"))
assert not os.path.exists(
    os.path.join(report_folder, "response_selection_errors.json"))
assert os.path.exists(
    os.path.join(report_folder, "response_selection_successes.json"))


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        # DIET only counts as an entity extractor when entity recognition is on.
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        # Response selectors never extract entities.
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(components, expected_extractors):
    # Build an interpreter from the given pipeline and check that exactly
    # the entity-extractor component names are reported.
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)
    assert extractors == expected_extractors
    # NOTE(review): this chunk starts inside an ``assert os.path.exists(...)``
    # call whose opening line is outside this view; the assertions check which
    # response-selection report files were (and were not) written.
    os.path.join(report_folder, "response_selection_histogram.png")
)
assert not os.path.exists(
    os.path.join(report_folder, "response_selection_errors.json")
)
assert os.path.exists(
    os.path.join(report_folder, "response_selection_successes.json")
)


@pytest.mark.parametrize(
    "components, expected_extractors",
    [
        # DIET only counts as an entity extractor when entity recognition is on.
        ([DIETClassifier({ENTITY_RECOGNITION: False})], set()),
        ([DIETClassifier({ENTITY_RECOGNITION: True})], {"DIETClassifier"}),
        ([CRFEntityExtractor()], {"CRFEntityExtractor"}),
        (
            [SpacyEntityExtractor(), CRFEntityExtractor()],
            {"SpacyEntityExtractor", "CRFEntityExtractor"},
        ),
        # Response selectors never extract entities.
        ([ResponseSelector()], set()),
    ],
)
def test_get_entity_extractors(
    components: List[Component], expected_extractors: Set[Text]
):
    # Assemble a pipeline and verify the reported entity-extractor names.
    mock_interpreter = Interpreter(components, None)
    extractors = get_entity_extractors(mock_interpreter)
    assert extractors == expected_extractors
def test_crf_extractor(spacy_nlp):
    """End-to-end: train on two annotated examples, then extract from new text."""
    training_examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                ["low", "suffix3", "suffix2", "upper", "title", "digit", "pos", "pos2"],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    tokenizer = SpacyTokenizer()

    training_data = TrainingData(training_examples=training_examples)
    tokenizer.train(training_data)
    extractor.train(training_data)

    query = "italian restaurant"
    message = Message(query, {SPACY_DOCS[TEXT]: spacy_nlp(query)})
    tokenizer.process(message)
    extractor.process(message)

    found = message.get(ENTITIES)
    assert len(found) == 1
    assert found[0]["entity"] == "cuisine"
    assert found[0]["value"] == "italian"
def test_crf_create_entity_dict(spacy_nlp):
    """_create_entity_dict maps token index ranges back to character offsets."""
    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    whitespace_tokenizer = WhitespaceTokenizer()

    messages = [
        Message(
            "where is St. Michael's Hospital?",
            {
                "intent": "search_location",
                "entities": [
                    {
                        "start": 9,
                        "end": 31,
                        "value": "St. Michael's Hospital",
                        "entity": "hospital",
                        "SpacyTokenizer": {
                            "entity_start_token_idx": 2,
                            "entity_end_token_idx": 5,
                        },
                        "WhitespaceTokenizer": {
                            "entity_start_token_idx": 2,
                            "entity_end_token_idx": 5,
                        },
                    }
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
            },
        ),
        Message(
            "where is Children's Hospital?",
            {
                "intent": "search_location",
                "entities": [
                    {
                        "start": 9,
                        "end": 28,
                        "value": "Children's Hospital",
                        "entity": "hospital",
                        "SpacyTokenizer": {
                            "entity_start_token_idx": 2,
                            "entity_end_token_idx": 4,
                        },
                        "WhitespaceTokenizer": {
                            "entity_start_token_idx": 2,
                            "entity_end_token_idx": 4,
                        },
                    }
                ],
                SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
            },
        ),
    ]

    for message in messages:
        # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text
        tokenizations = [
            ("SpacyTokenizer", spacy_tokenizer.tokenize(message, TEXT)),
            ("WhitespaceTokenizer", whitespace_tokenizer.tokenize(message, TEXT)),
        ]
        for tokenizer_name, tokens in tokenizations:
            for entity in message.get("entities"):
                token_span = entity[tokenizer_name]
                parsed = crf_extractor._create_entity_dict(
                    message,
                    tokens,
                    token_span["entity_start_token_idx"],
                    token_span["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }
def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config):
    """Covers training, featurization, extraction and entity filtering."""
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    examples = [
        Message(
            "anywhere in the west",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 16, "end": 20, "value": "west", "entity": "location"}
                ],
                "spacy_doc": spacy_nlp("anywhere in the west"),
            },
        ),
        Message(
            "central indian restaurant",
            {
                "intent": "restaurant_search",
                "entities": [
                    {
                        "start": 0,
                        "end": 7,
                        "value": "central",
                        "entity": "location",
                        "extractor": "random_extractor",
                    },
                    {
                        "start": 8,
                        "end": 14,
                        "value": "indian",
                        "entity": "cuisine",
                        "extractor": "CRFEntityExtractor",
                    },
                ],
                "spacy_doc": spacy_nlp("central indian restaurant"),
            },
        ),
    ]

    # uses BILOU and the default features
    extractor.train(TrainingData(training_examples=examples), RasaNLUModelConfig())

    sentence = "anywhere in the west"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    crf_format = extractor._from_text_to_crf(Message(sentence, doc))
    assert [token[0] for token in crf_format] == ["anywhere", "in", "the", "west"]

    features = extractor._sentence_to_features(crf_format)
    assert "BOS" in features[0]
    assert "EOS" in features[-1]
    assert features[1]["0:low"] == "in"

    extractor.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))

    filtered = extractor.filter_trainable_entities(examples)
    assert filtered[0].get("entities") == [
        {"start": 16, "end": 20, "value": "west", "entity": "location"}
    ], "Entity without extractor remains"
    assert filtered[1].get("entities") == [
        {
            "start": 8,
            "end": 14,
            "value": "indian",
            "entity": "cuisine",
            "extractor": "CRFEntityExtractor",
        }
    ], "Only CRFEntityExtractor entity annotation remains"
    assert examples[1].get("entities")[0] == {
        "start": 0,
        "end": 7,
        "value": "central",
        "entity": "location",
        "extractor": "random_extractor",
    }, "Original examples are not mutated"
class IncrementalCRFEntityExtractor(EntityExtractor, IncrementalComponent):
    """Incremental wrapper around ``CRFEntityExtractor``.

    Re-runs CRF entity extraction whenever a new incremental unit (IU) is
    added to the message and restores a previous entity state when an IU
    is revoked.
    """

    # Pipeline metadata: this component produces "entities" and needs "tokens".
    provides = ["entities"]

    requires = ["tokens"]

    def __init__(self,
                 component_config: Optional[Dict[Text, Any]]=None,
                 ent_tagger: Optional[Dict[Text, Any]]=None) -> None:
        """Create the incremental wrapper; ``ent_tagger`` is forwarded to the CRF."""
        super(IncrementalCRFEntityExtractor, self).__init__(component_config)
        # All actual CRF work is delegated to this wrapped extractor.
        self.CRFEE = CRFEntityExtractor(component_config, ent_tagger)
        # Stack of entity states from earlier IUs, used for rollback on "revoke".
        self.prev_ents = []

    def new_utterance(self):
        # Reset the rollback stack at the start of each new utterance.
        self.prev_ents = []

    @classmethod
    def required_packages(cls):
        return ["sklearn_crfsuite", "sklearn"]

    def train(self,
              training_data: TrainingData,
              config: RasaNLUModelConfig,
              **kwargs: Any) -> None:
        """Train by delegating to the wrapped ``CRFEntityExtractor``."""
        self.CRFEE.train(training_data, config, **kwargs)

    def process(self, message: Message, **kwargs: Any) -> None:
        """Update the message's entities for the latest incremental unit."""
        # An IU is a (word, type) pair; only the most recent one is inspected.
        iu_list = message.get('iu_list')
        last_iu = iu_list[-1]
        iu_word, iu_type = last_iu
        # TODO: inefficient right now, we are always storing
        # previous state, even if a new entity hasn't been
        # added
        # This will not work with multiple extractors
        if iu_type == "add":
            extracted = self.add_extractor_name(
                self.CRFEE.extract_entities(message))
            message.set("entities", extracted, add_to_output=True)
            # NOTE(review): this pushes the *new* entity state (set just above),
            # so a later "revoke" restores the post-add state rather than the
            # pre-add state - confirm this is intended.
            self.prev_ents.append(message.get("entities"))
        elif iu_type == "revoke":
            if len(self.prev_ents) > 0:
                prev_ent = self.prev_ents.pop()
                message.set("entities", prev_ent, add_to_output=True)

    @classmethod
    def load(cls,
             meta: Dict[Text, Any],
             model_dir: Text = None,
             model_metadata: Metadata = None,
             cached_component: Optional['IncrementalCRFEntityExtractor'] = None,
             **kwargs: Any
             ) -> 'IncrementalCRFEntityExtractor':
        """Load a persisted tagger from ``model_dir``; untrained instance if absent."""
        # NOTE(review): ``sklearn.externals.joblib`` was removed in
        # scikit-learn 0.23 - consider importing ``joblib`` directly.
        from sklearn.externals import joblib

        file_name = meta.get("file")
        model_file = os.path.join(model_dir, file_name)

        if os.path.exists(model_file):
            ent_tagger = joblib.load(model_file)
            return cls(meta, ent_tagger)
        else:
            return cls(meta)

    def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]:
        """Persist this model into the passed directory. 

        Returns the metadata necessary to load the model again."""
        # Delegates to the wrapped extractor under a distinct "_incr" file name.
        return self.CRFEE.persist((file_name) + "_incr", model_dir)