def test_ann_conversions(self):
    """Round-trips an Annotations object through an ANN file and compares the entity dictionaries."""
    original = Annotations(self.ann_file_path_one, annotation_type='ann')

    # Write the annotations out, then load the freshly written file back in.
    intermediary_path = join(self.test_dir, "intermediary.ann")
    original.to_ann(write_location=intermediary_path)
    reloaded = Annotations(join(self.test_dir, "intermediary.ann"), annotation_type='ann')

    entities_before = original.get_entity_annotations(return_dictionary=True)
    entities_after = reloaded.get_entity_annotations(return_dictionary=True)
    self.assertEqual(entities_before, entities_after)
def test_intersection(self):
    """Copies two entities from the second file into the first and expects intersection() to return exactly them."""

    def load(index):
        # Build an Annotations object for the index-th ann file of the dataset.
        ann_path = join(self.dataset.get_data_directory(), self.ann_files[index])
        return Annotations(ann_path, annotation_type='ann')

    first = load(0)
    second = load(1)

    # Make sure both objects share the first two entities of the second file.
    first.add_entity(*second.get_entity_annotations()[0])
    first.add_entity(*second.get_entity_annotations()[1])

    common = first.intersection(second)
    expected = {
        second.get_entity_annotations()[0],
        second.get_entity_annotations()[1],
    }
    self.assertEqual(common, expected)
def test_get_entity_annotations_dict(self):
    """
    Tests that entity annotations requested with return_dictionary=True come back as a dict
    """
    ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
    loaded = Annotations(ann_path, annotation_type='ann')
    entity_dict = loaded.get_entity_annotations(return_dictionary=True)
    self.assertIsInstance(entity_dict, dict)
def test_init_from_ann_file(self):
    """
    Tests that an Annotations object built from an ann file exposes non-None entity annotations
    """
    ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
    loaded = Annotations(ann_path, annotation_type='ann')
    entities = loaded.get_entity_annotations()
    self.assertIsNotNone(entities)
def test_confusion_matrix(self):
    """Checks that the confusion matrix has len(self.entities) rows and that its first row has as many columns."""

    def load(index):
        # Build an Annotations object for the index-th ann file of the dataset.
        ann_path = join(self.dataset.get_data_directory(), self.ann_files[index])
        return Annotations(ann_path, annotation_type='ann')

    first = load(0)
    second = load(1)
    first.add_entity(*second.get_entity_annotations()[0])

    # Width of the first row.
    first_row = first.compute_confusion_matrix(second, self.entities)[0]
    self.assertEqual(len(first_row), len(self.entities))

    # Number of rows.
    matrix = first.compute_confusion_matrix(second, self.entities)
    self.assertEqual(len(matrix), len(self.entities))
def test_get_entity_annotations_list(self):
    """
    Tests that entity annotations are returned as a list by default
    """
    ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
    loaded = Annotations(ann_path, annotation_type='ann')
    entities = loaded.get_entity_annotations()
    self.assertIsInstance(entities, list)
def test_good_con_data(self):
    """Tests to see if valid con data can be used to instantiate an Annotations object."""
    con_path = join(self.test_dir, "test_con.con")
    text_path = join(self.test_dir, "test_con_text.txt")

    # Close both files before handing their paths to Annotations: small writes stay in the
    # file object's buffer until it is flushed or closed, so reading the path while the
    # writer is still open could see an empty file.
    with open(con_path, 'w+') as con_file:
        con_file.write(con_text)
    with open(text_path, 'w+') as text_file:
        text_file.write(con_source_text)

    annotations = Annotations(con_path, annotation_type='con', source_text_path=text_path)
    self.assertIsInstance(annotations.get_entity_annotations(), list)
def test_compute_ambiguity(self):
    """Adds a differently-labelled entity on an existing span and expects compute_ambiguity() to report one entry."""

    def load_first_file():
        # Both objects are loaded from the same (first) ann file of the dataset.
        ann_path = join(self.dataset.get_data_directory(), self.ann_files[0])
        return Annotations(ann_path, annotation_type='ann')

    reference = load_first_file()
    modified = load_first_file()

    # Reuse the span and text of the first entity, but give it a different label.
    _label, start, end, text = modified.get_entity_annotations()[0]
    modified.add_entity('incorrect_label', start, end, text)

    ambiguity = reference.compute_ambiguity(modified)
    self.assertEqual(len(ambiguity), 1)
def get_training_data(self, data_format='spacy'):
    """
    Get training data in a specified format.

    :param data_format: The specified format as a string. Only 'spacy' is currently supported.
    :return: A list with one entry per data file in self.all_data_files, each entry being the
        result of Annotations.get_entity_annotations(format='spacy') for that file.
    :raises TypeError: If data_format is anything other than 'spacy'.
    """
    # Only spaCy format is currently supported.
    if data_format != 'spacy':
        # Report the value the caller passed in (not the ``format`` builtin).
        raise TypeError("Format %s not supported" % data_format)

    training_data = []

    # Add each entry in dataset with annotation to train_data
    for data_file in self.all_data_files:
        txt_path = data_file.get_text_path()
        ann_path = data_file.get_annotation_path()
        annotations = Annotations(ann_path, source_text_path=txt_path)
        training_data.append(annotations.get_entity_annotations(format='spacy'))

    return training_data
def __call__(self, doc):
    """
    Overlay the gold annotations of a document onto its tokens.

    Entities are read from the ANN file at ``doc._.gold_annotation_file``. For each entity the
    character offsets are resolved to a span via ``doc.char_span`` and ``self.find_span``; every
    token of a resolved span gets its ``gold_label`` extension set to the entity label, provided
    the label is in ``self.labels`` or ``self.labels`` is empty.

    :param doc: the document to annotate; must expose the ``gold_annotation_file`` extension
    :return: the same ``doc`` object
    :raises ValueError: if ``doc._.gold_annotation_file`` is not present
    """
    if hasattr(doc._, 'file_name'):
        logging.debug("%s: Called GoldAnnotator Component", doc._.file_name)

    # Dump the document tokenization only when the root logger's effective level is DEBUG.
    if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
        for token in doc:
            logging.debug(str(token))

    # Check that the gold annotation file path has been set.
    if not hasattr(doc._, 'gold_annotation_file'):
        raise ValueError("No extension doc._.gold_annotation_file is present.")

    gold_annotations = Annotations(doc._.gold_annotation_file, annotation_type='ann')
    # Fetched once: the list is needed both for the overlay loop and the final ratio check.
    entities = gold_annotations.get_entity_annotations()

    for e_label, e_start, e_end, _ in entities:
        if e_start > e_end:
            logging.critical("%s: Broken annotation - start is greater than end: (%i,%i,%s)",
                             doc._.file_name, e_start, e_end, e_label)
            continue

        span = doc.char_span(e_start, e_end)

        if span is None:
            # The offsets did not yield a span directly; count it and let find_span try.
            self.failed_overlay_count += 1
            self.failed_identifying_span_count += 1
            logging.warning("%s: Number of failed annotation overlays with current tokenizer: %i (%i,%i,%s)",
                            doc._.file_name, self.failed_overlay_count, e_start, e_end, e_label)

        fixed_span = self.find_span(e_start, e_end, e_label, span, doc)

        if fixed_span is not None:
            if span is None:
                logging.warning("%s: Fixed span (%i,%i,%s) into: %s",
                                doc._.file_name, e_start, e_end, e_label, fixed_span.text)
                # The span was recovered after all, so undo the earlier increment.
                self.failed_identifying_span_count -= 1
            for token in fixed_span:
                if e_label in self.labels or not self.labels:
                    token._.set('gold_label', e_label)
        else:
            # Annotation was not able to be fixed, it will be ignored - this is bad in evaluation.
            logging.warning("%s: Could not fix annotation: (%i,%i,%s)",
                            doc._.file_name, e_start, e_end, e_label)
            logging.warning("%s: Total Failed Annotations: %i",
                            doc._.file_name, self.failed_identifying_span_count)

    # NOTE(review): failed_overlay_count is not reset in this method, so a running total is
    # compared against this document's entity count - confirm that is intended.
    if self.failed_overlay_count > .3 * len(entities):
        logging.warning("%s: Annotations may mis-aligned as more than 30 percent failed to overlay: %s",
                        doc._.file_name, doc._.gold_annotation_file)

    return doc
def test_init_from_ann_file(self):
    """Tests that an Annotations object built from a valid ann file exposes non-None entity annotations."""
    loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
    entities = loaded.get_entity_annotations()
    self.assertIsNotNone(entities)
def test_get_entity_annotations_list(self):
    """Tests that entity annotations are returned as a list by default."""
    loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
    entities = loaded.get_entity_annotations()
    self.assertIsInstance(entities, list)
def test_get_entity_annotations_dict(self):
    """Tests that entity annotations requested with return_dictionary=True come back as a dict."""
    loaded = Annotations(self.ann_file_path_one, annotation_type='ann')
    entity_dict = loaded.get_entity_annotations(return_dictionary=True)
    self.assertIsInstance(entity_dict, dict)