def test_build_tag_id_dict():
    """Check the tag-id dictionary built from BILOU-tagged training messages.

    Two messages are tagged with ``location`` and ``organisation`` entities.
    The resulting dictionary is expected to map ``"O"`` to 0 and to contain
    the B-/I-/U-/L- variants of both entity names, including variants that
    never occur in the tagged messages themselves.
    """
    union_tags = [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
    ]
    capital_tags = ["U-location", "O", "O", "O", "O", "U-location"]

    union_message = Message("Germany is part of the European Union")
    union_message.set(BILOU_ENTITIES, union_tags)

    capital_message = Message("Berlin is the capital of Germany")
    capital_message.set(BILOU_ENTITIES, capital_tags)

    examples = TrainingData([union_message, capital_message])
    tag_id_dict = bilou_utils.build_tag_id_dict(examples)

    expected_tag_ids = {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "U-location": 3,
        "L-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "U-organisation": 7,
        "L-organisation": 8,
    }
    assert tag_id_dict == expected_tag_ids
def _tag_id_index_mapping(self, training_data: TrainingData) -> Dict[Text, int]:
    """Build the mapping from entity tag to integer index.

    When the BILOU flag is set in the component config, the mapping is
    delegated to ``bilou_utils.build_tag_id_dict``. Otherwise every distinct
    entity name found in the entity examples is numbered from 1 in sorted
    order, and ``NO_ENTITY_TAG`` is assigned index 0.

    Args:
        training_data: Training data whose ``entity_examples`` are scanned.

    Returns:
        Dictionary mapping each tag to its index.
    """
    if self.component_config[BILOU_FLAG]:
        return bilou_utils.build_tag_id_dict(training_data)

    # Collect the distinct entity names across all entity examples.
    entity_names = set()
    for example in training_data.entity_examples:
        for entity in example.get(ENTITIES):
            entity_names.add(entity["entity"])
    # A missing entity name (None) is not a tag.
    entity_names.discard(None)

    # Real entity tags start at 1, in sorted order.
    tag_id_dict = {}
    for index, entity_name in enumerate(sorted(entity_names), start=1):
        tag_id_dict[entity_name] = index

    # Index 0 is reserved for the non-entity tag; this is needed for
    # correct prediction for padding.
    tag_id_dict[NO_ENTITY_TAG] = 0
    return tag_id_dict