def __call__(self, doc):
    """
    Overlay gold entity annotations onto the tokens of a Doc object.

    Builds an Annotations object from the value of the custom 'gold_annotation_file' extension and, for
    every entity annotation it yields, asks ``self.find_span`` for the tokens covering the entity's
    character offsets. Each token of the resulting span gets its 'gold_label' extension set to the
    entity label, provided the label is accepted by ``self.labels`` (an empty ``self.labels`` accepts
    every label).

    The counters ``self.failed_overlay_count`` and ``self.failed_identifying_span_count`` are updated as
    a side effect whenever ``doc.char_span`` cannot produce a span for an annotation.

    :param doc: a spaCy Doc object. It should carry the custom 'gold_annotation_file' extension; the
        'file_name' extension is only used to prefix log messages and may be absent.
    :return: the same Doc object, but it now has 'gold_label' annotations. If the
        'gold_annotation_file' extension is missing the Doc is returned untouched.
    """
    # 'file_name' is only needed for log output. Resolve it once and fall back to a placeholder so
    # that none of the log calls below can fail on a missing extension.
    if hasattr(doc._, 'file_name'):
        file_name = doc._.file_name
        logging.debug("%s: Called GoldAnnotator Component", file_name)
    else:
        file_name = "<unknown>"

    # Dump the tokenization only when debug records would actually be emitted; isEnabledFor also
    # covers effective levels below DEBUG, which an equality test against logging.DEBUG misses.
    if logging.getLogger().isEnabledFor(logging.DEBUG):
        for token in doc:
            logging.debug(str(token))

    # Without a gold annotation file there is nothing to overlay.
    if not hasattr(doc._, 'gold_annotation_file'):
        logging.warning("No extension doc._.gold_annotation_file is present; it will not be possible to fit a model with this Doc")
        return doc

    gold_annotation_file = doc._.gold_annotation_file
    gold_annotations = Annotations(gold_annotation_file)

    for e_label, e_start, e_end, _ in gold_annotations.get_entity_annotations():
        if e_start > e_end:
            logging.critical("%s: Broken annotation - start is greater than end: (%i,%i,%s)",
                             file_name, e_start, e_end, e_label)
            continue

        # A None result means the offsets could not be turned into a span directly
        # (presumably because they do not fall on token boundaries - confirm against spaCy docs).
        span = doc.char_span(e_start, e_end)
        if span is None:
            self.failed_overlay_count += 1
            self.failed_identifying_span_count += 1
            logging.warning("%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)",
                            file_name, self.failed_overlay_count, e_start, e_end, e_label)

        # find_span is consulted for every annotation, not only for the failed ones.
        fixed_span = self.find_span(e_start, e_end, doc)
        if fixed_span is None:
            # annotation was not able to be fixed, it will be ignored - this is bad in evaluation.
            logging.warning("%s: Could not fix annotation: (%i,%i,%s)", file_name, e_start, e_end, e_label)
            logging.warning("%s: Total Failed Annotations: %i", file_name, self.failed_identifying_span_count)
            continue

        if span is None:
            # The direct overlay failed but find_span recovered a span, so undo the failure count.
            logging.warning("%s: Fixed span (%i,%i,%s) into: %s",
                            file_name, e_start, e_end, e_label, fixed_span.text)
            self.failed_identifying_span_count -= 1

        # The label filter does not depend on the token, so evaluate it once per annotation.
        # Testing for emptiness first also keeps a None self.labels from raising on the 'in' check.
        if not self.labels or e_label in self.labels:
            for token in fixed_span:
                token._.set('gold_label', e_label)

    # NOTE(review): failed_overlay_count is an instance attribute that this method only ever
    # increments, so the comparison below is against the accumulated total rather than this
    # document's failures alone - confirm that this is intended.
    if self.failed_overlay_count > .3 * len(gold_annotations):
        logging.warning("%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s",
                        file_name, gold_annotation_file)

    return doc
def test_confusion_matrix(self):
    """
    Check the dimensions of the confusion matrix computed between two Annotations objects.

    The matrix is expected to have len(self.entities) rows, and its first row is expected to have
    len(self.entities) entries.
    """
    reference = Annotations(self.ann_path_1)
    other = Annotations(self.ann_path_2)

    # Add the first entity annotation of the second set to the first set before comparing.
    first_entity = other.get_entity_annotations()[0]
    reference.add_entity(*first_entity)

    # Width of the first row.
    first_row = reference.compute_confusion_matrix(other, self.entities)[0]
    self.assertEqual(len(first_row), len(self.entities))

    # Number of rows.
    matrix = reference.compute_confusion_matrix(other, self.entities)
    self.assertEqual(len(matrix), len(self.entities))