def compute_ambiguity(self, dataset):
    """
    Finds occurrences of spans from 'dataset' that intersect with a span from this dataset's
    annotations but do not have that span's label. If 'dataset' comprises a model's predictions,
    this method provides a strong indicator of the model's inability to disambiguate between
    entities. For a full analysis, compute a confusion matrix.

    :param dataset: a Dataset object containing a predicted version of this dataset.
    :return: a dictionary keyed by ``str(gold_file)`` whose values are the results of
        ``Annotations.compute_ambiguity`` for each gold, predicted file pair
    :raises ValueError: if 'dataset' is not a Dataset, if it lacks an annotation file that is
        present in this dataset, or if no prediction file pairs with a gold file.
    """
    if not isinstance(dataset, Dataset):
        raise ValueError("dataset must be instance of Dataset")

    # verify files are consistent: every gold annotation file name must also exist in 'dataset'
    gold_names = {os.path.basename(file.ann_path) for file in self}
    predicted_names = {os.path.basename(file.ann_path) for file in dataset}
    diff = gold_names - predicted_names
    if diff:
        raise ValueError("Dataset of predictions is missing the files: " + str(list(diff)))

    # Index the prediction files once instead of re-scanning 'dataset' for every gold file.
    # setdefault keeps the first file seen for a given name, matching a front-to-back scan.
    predictions_by_name = {}
    for prediction_data_file in dataset:
        predictions_by_name.setdefault(str(prediction_data_file), prediction_data_file)

    # Dictionary storing ambiguity over dataset
    ambiguity_dict = {}

    for gold_data_file in self:
        gold_name = str(gold_data_file)
        if gold_name not in predictions_by_name:
            # Pairing is done on str(file) while the check above uses ann_path names, so a
            # mismatch is still possible here; report it instead of leaking StopIteration.
            raise ValueError("Dataset of predictions has no file matching: " + gold_name)
        prediction_data_file = predictions_by_name[gold_name]

        gold_annotation = Annotations(gold_data_file.ann_path)
        pred_annotation = Annotations(prediction_data_file.ann_path)

        # compute ambiguity on the Annotation file level
        ambiguity_dict[gold_name] = gold_annotation.compute_ambiguity(pred_annotation)

    return ambiguity_dict
def test_compute_ambiguity(self):
    """
    Loads the same annotation file twice, re-adds the first entity of the second copy under
    the label 'incorrect_label', and checks that comparing the untouched copy against the
    modified one yields an ambiguity result of length 1.
    """

    def _load_first_ann_file():
        # Build a fresh Annotations object from the first annotation file of the test dataset
        ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
        return Annotations(ann_path, annotation_type='ann')

    reference = _load_first_ann_file()
    modified = _load_first_ann_file()

    # Reuse the span and text of the first entity, but attach a conflicting label to it
    first_entity = modified.get_entity_annotations()[0]
    _label, span_start, span_end, span_text = first_entity
    modified.add_entity('incorrect_label', span_start, span_end, span_text)

    ambiguity = reference.compute_ambiguity(modified)
    self.assertEqual(len(ambiguity), 1)