def main(args):
    """Print a TSV of CuiEntity negation labels, preferring gold annotations.

    args[0]: root of unlabeled ("inprogress") Anafora XML directories.
    args[1]: root of labeled ("completed") Anafora XML directories.

    For each document, emits one line per CuiEntity:
        <text_name>\t<span start>\t<span end>\t<negated or "Unlabeled">
    """
    if len(args) < 2:
        sys.stderr.write('Required argument(s): <unlabeled anafora directory> <labeled anafora directory>\n')
        sys.exit(-1)
    for sub_dir, text_name, xml_names in walk(args[0], xml_name_regex='[.]dave\.inprogress\.xml$'):
        # print("text_name = %s" % text_name)
        # Each document directory is expected to hold exactly one in-progress XML.
        assert len(xml_names) == 1
        xml_file = join(args[0], sub_dir, xml_names[0])
        # The corresponding gold file has the same name with "completed" status.
        gold_file = join(args[1], sub_dir, xml_names[0].replace('inprogress', 'completed'))
        use_gold = False
        # print("xml file = %s" % (xml_file,))
        if exists(gold_file):
            # Prefer the completed (gold) annotations when available.
            anafora_data = AnaforaData.from_file(gold_file)
            use_gold = True
        else:
            anafora_data = AnaforaData.from_file(xml_file)
        for event in anafora_data.annotations.select_type('CuiEntity'):
            if use_gold:
                negated = event.properties['negated']
            else:
                # No gold file: emit a placeholder label.
                negated = "Unlabeled"
            span = event.spans
            # Each CuiEntity is expected to have exactly one (start, end) span.
            assert len(span) == 1
            span = span[0]
            assert len(span) == 2
            # print("Row id %s event with span (%d, %d) is negated=%s" % (text_name, span[0], span[1], str(negated)))
            print('%s\t%d\t%d\t%s' % (text_name, span[0], span[1], str(negated)))
def from_texts(cls, text_dir, nlp, tokenizer):
    """Build a dataset instance from raw text files (no annotations).

    Walks text_dir (regex selects extensionless files or *.txt), sentence-splits
    each document with `nlp` (a spaCy-like pipeline — each sentence exposes
    text_with_ws), tokenizes the sentences, and collects per-sentence features.

    :param cls: the dataset class; called as cls(doc_indices, features).
    :param text_dir: directory tree of raw text documents.
    :param nlp: callable text -> doc with a `sents` iterator.
    :param tokenizer: HuggingFace-style tokenizer (batched call).
    :return: cls instance over (doc_indices, features), where each doc_indices
        entry is (subdir, first feature index, one-past-last feature index).
    """
    if not os.path.exists(text_dir):
        raise Exception("The %s directory does not exist." % text_dir)
    # Matches files with no 3-char extension, or files ending in .txt.
    text_directory_files = anafora.walk(
        text_dir, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
    features = []
    doc_indices = []
    for text_files in text_directory_files:
        # Index of this document's first feature in the flat features list.
        doc_index = len(features)
        text_subdir_path, text_doc_name, text_file_names = text_files
        if len(text_file_names) != 1:
            raise Exception("Wrong number of text files in %s" % text_subdir_path)
        text_file_path = os.path.join(text_dir, text_subdir_path, text_file_names[0])
        with open(text_file_path) as txt_file:
            text = txt_file.read()
        doc = nlp(text)
        # Keep trailing whitespace so sentence offsets add up to document offsets.
        input_raw = [sent.text_with_ws for sent in doc.sents]
        input_data = tokenizer(input_raw,
                               return_tensors="pt",
                               padding="max_length",
                               truncation="longest_first",
                               return_offsets_mapping=True)
        sent_offset = 0
        for sent_idx, _ in enumerate(input_data["input_ids"]):
            features.append(
                TimexInputFeatures.from_sentence(input_data, sent_idx, sent_offset))
            # Advance document-level character offset by the sentence length.
            sent_offset += len(input_raw[sent_idx])
        doc_indices.append((text_subdir_path, doc_index, len(features)))
    return cls(doc_indices, features)
def write(self, predictions):
    """Write predictions in anafora XML format.

    Walks the reference XML directories and, for each EVENT, emits a new
    entity whose DocTimeRel property is the next prediction (predictions are
    consumed in the same order the events are visited across all documents).

    :param predictions: sequence of integer class indices, one per EVENT,
        mapped to labels via the module-level int2label table.
    """
    index = 0
    # Start from a clean output directory.
    if os.path.isdir(self.out_dir):
        shutil.rmtree(self.out_dir)
    os.mkdir(self.out_dir)
    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):
        xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)
        data = anafora.AnaforaData()
        for event in ref_data.annotations.select_type('EVENT'):
            # Copy identifying info from the reference event.
            entity = anafora.AnaforaEntity()
            entity.id = event.id
            entity.spans = event.spans
            entity.type = event.type
            entity.properties['DocTimeRel'] = int2label[predictions[index]]
            data.annotations.append(entity)
            index += 1
        os.mkdir(os.path.join(self.out_dir, sub_dir))
        out_path = os.path.join(self.out_dir, sub_dir, file_names[0])
        data.indent()
        data.to_file(out_path)
def main(xml_dir, text_dir, xml_regex, context_size):
    """Main Driver.

    For each EVENT annotation in the Anafora XML under xml_dir, prints the
    DocTimeRel label, the event text, its surrounding context (newlines
    stripped), and a tagged version with [ES]/[EE] around the event span.

    :param xml_dir: root of Anafora XML directories.
    :param text_dir: directory containing the raw text files.
    :param xml_regex: regex selecting the XML files to walk.
    :param context_size: number of characters of context on each side.
    """
    for sub_dir, text_name, file_names in anafora.walk(xml_dir, xml_regex):
        xml_path = os.path.join(xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)
        text_path = os.path.join(text_dir, text_name)
        # Fix: close the file handle (the original leaked it via open().read()).
        with open(text_path) as text_file:
            text = text_file.read()
        for data in ref_data.annotations.select_type('EVENT'):
            start, end = data.spans[0]
            context = text[start - context_size:end + context_size].replace(
                '\n', '')
            event = text[start:end]
            dtr = data.properties['DocTimeRel']
            left = text[start - context_size:start]
            right = text[end:end + context_size]
            tagged = left + ' [ES] ' + event + ' [EE] ' + right
            print('{}|{}|{}'.format(dtr, event, context))
            print(tagged)
            print()
def notes_to_annotations(self):
    """Map note paths to relation, time, and event offsets.

    Populates three instance dicts keyed by note path:
      - self.note2times: time-like annotation offsets (TIMEX3, SECTIONTIME, DOCTIME)
      - self.note2events: EVENT annotation offsets
      - self.note2rels: CONTAINS TLINK argument offsets and ids
    """
    for sub_dir, text_name, file_names in anafora.walk(
            self.xml_dir, self.xml_regex):
        note_path = os.path.join(self.text_dir, text_name)
        xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)
        # collect (annot_start, annot_end, annot_id) tuples
        add_annotations(self.note2times[note_path], ref_data, 'TIMEX3')
        add_annotations(self.note2times[note_path], ref_data, 'SECTIONTIME')
        add_annotations(self.note2times[note_path], ref_data, 'DOCTIME')
        add_annotations(self.note2events[note_path], ref_data, 'EVENT')
        # collect (src spans, targ spans, src id, targ id) tuples
        for rel in ref_data.annotations.select_type('TLINK'):
            src = rel.properties['Source']
            targ = rel.properties['Target']
            label = rel.properties['Type']
            # Only CONTAINS relations are kept; other TLINK types are skipped.
            if label == 'CONTAINS':
                src_start, src_end = src.spans[0]
                targ_start, targ_end = targ.spans[0]
                self.note2rels[note_path].append(
                    (src_start, src_end, targ_start, targ_end, src.id, targ.id))
def create_datasets(model, nlp, dataset_path, train=False, valid=False):
    """Build a TimeDataset from a directory of text (and optionally Anafora XML).

    When train or valid is set, each document directory must contain exactly
    one text file and exactly one Anafora XML file; features are built with
    annotations. Otherwise only the text files are used (inference mode).

    :param model: passed through to from_doc_to_features.
    :param nlp: sentence-splitting pipeline, passed through.
    :param dataset_path: root directory of the dataset.
    :param train: build features with annotations (labels kept in features).
    :param valid: like train, but also retain per-document annotations for scoring.
    :return: TimeDataset(doc_indices, features, annotations).
    """
    # Matches files with no 3-char extension, or files ending in .txt.
    text_directory_files = anafora.walk(
        dataset_path, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
    features = []
    annotations = {}
    doc_indices = []
    if train or valid:
        for text_files in text_directory_files:
            # Index of this document's first feature in the flat list.
            doc_index = len(features)
            text_subdir_path, text_doc_name, text_file_names = text_files
            if len(text_file_names) != 1:
                raise Exception("Wrong number of text files in %s" % text_subdir_path)
            anafora_path = os.path.join(dataset_path, text_subdir_path)
            anafora_directory_files = anafora.walk(anafora_path, xml_name_regex="[.]xml$")
            anafora_directory_files = list(anafora_directory_files)
            # Exactly one (subdir, doc, files) entry expected per document.
            if len(anafora_directory_files) != 1:
                raise Exception("Wrong structure in %s" % anafora_path)
            anafora_subdir_path, anafora_doc_name, anafora_file_names = anafora_directory_files[
                0]
            if len(anafora_file_names) != 1:
                raise Exception("Wrong number of anafora files in %s" % anafora_subdir_path)
            text_file_path = os.path.join(dataset_path, text_subdir_path, text_file_names[0])
            anafora_file_path = os.path.join(anafora_path, anafora_subdir_path, anafora_file_names[0])
            doc_features, doc_annotations = from_doc_to_features(
                model, nlp, text_file_path, anafora_file_path, True)
            features.extend(doc_features)
            if valid:
                # Keep gold annotations keyed by document name for evaluation.
                annotations[text_doc_name] = doc_annotations
            doc_indices.append((text_subdir_path, doc_index, len(features)))
    else:
        # Inference mode: no Anafora XML is required or read.
        for text_files in text_directory_files:
            doc_index = len(features)
            text_subdir_path, text_doc_name, text_file_names = text_files
            if len(text_file_names) != 1:
                raise Exception("Wrong number of text files in %s" % text_subdir_path)
            text_file_path = os.path.join(dataset_path, text_subdir_path, text_file_names[0])
            doc_features, _ = from_doc_to_features(model, nlp, text_file_path)
            features.extend(doc_features)
            doc_indices.append((text_subdir_path, doc_index, len(features)))
    return TimeDataset(doc_indices, features, annotations)
def main(args):
    """Export Problem and Question Type annotations from an Anafora dir to CSV.

    args[0]: input Anafora directory (subdir name doubles as the row id).
    args[1]: output CSV path.

    Each row records the annotation type, row id, file status/annotator
    (parsed from the XML file name), the label, the span text, and the full
    document text with the span wrapped in <problem>/<type> tags.
    """
    # CSV column names.
    TYPE = 'Annotation type'
    ROWID = 'Row Id'
    STATUS = 'Status'
    ANNOTATOR = 'Annotator'
    LABEL = 'Label'
    SPAN_TEXT = 'Span text'
    ANNOT_TEXT = 'Annotated text'
    if len(args) < 2:
        sys.stderr.write('2 required arguments: <input anafora dir> <output csv file>\n')
        sys.exit(-1)
    with open(args[1], 'w', newline='') as csvfile:
        # fieldnames=['Annotation type', 'row_id', 'Status', 'Annotator', 'Label', 'Span text', 'Annotated text']
        fieldnames=[TYPE, ROWID, STATUS, ANNOTATOR, LABEL, SPAN_TEXT, ANNOT_TEXT]
        csvout = csv.DictWriter(csvfile, fieldnames, delimiter=',', quotechar='"')
        csvout.writeheader()
        for sub_dir, text_name, xml_names in anafora.walk(args[0]):
            if len(xml_names) == 0:
                continue
            # The raw document text lives next to the XML files.
            with open( join( join(args[0], sub_dir), text_name), 'r') as tf:
                text = tf.read()
            for xml_name in xml_names:
                xml_path = os.path.join(args[0], sub_dir, xml_name)
                # Anafora file names look like <doc>.<schema>.<annotator>.<status>.xml
                xml_parts = xml_name.split('.')
                annotator = xml_parts[2]
                status = xml_parts[3]
                data = anafora.AnaforaData.from_file(xml_path)
                for annot in data.annotations.select_type('Problem'):
                    span = annot.spans[0]
                    span_text = text[span[0]:span[1]]
                    cat = annot.properties['Content']
                    # Full document with the span highlighted inline.
                    annotated_text = text[:span[0]] + '<problem> ' + span_text + ' </problem>' + text[span[1]:]
                    csvout.writerow({TYPE:'Problem', ROWID:sub_dir, STATUS:status, ANNOTATOR:annotator, LABEL:cat, SPAN_TEXT:span_text, ANNOT_TEXT: annotated_text})
                for annot in data.annotations.select_type('Question Type'):
                    span = annot.spans[0]
                    span_text = text[span[0]:span[1]]
                    cat = annot.properties['Type']
                    annotated_text = text[:span[0]] + '<type> ' + span_text + ' </type>' + text[span[1]:]
                    csvout.writerow({TYPE:'Type', ROWID:sub_dir, STATUS:status, ANNOTATOR:annotator, LABEL:cat, SPAN_TEXT:span_text, ANNOT_TEXT: annotated_text})
def fix_thyme_errors(schema, input_dir, output_dir, xml_name_regex="[.]xml$"):
    """
    Clean common THYME annotation errors and write fixed files to output_dir.

    Two kinds of fixes are applied:
      1. TLINK/ALINK annotations that fail schema validation are removed.
      2. When a TIMEX3 shares its exact span with a SECTIONTIME or DOCTIME,
         the TIMEX3 is removed and references to it are redirected.
    Only files that actually changed are written out.

    :param schema anafora.validate.Schema: the THYME schema
    :param input_dir str: the root of a set of THYME Anafora XML directories
    :param output_dir str: the directory where the cleaned versions of the
        THYME Anafora XML files should be written. The directory structure
        will mirror the input directory structure.
    """
    for sub_dir, text_name, xml_names in anafora.walk(input_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(input_dir, sub_dir, xml_name)
            # load the data from the Anafora XML
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError as e:
                logging.warning("SKIPPING invalid XML: %s: %s", e, xml_path)
                continue
            # remove invalid TLINKs and ALINKs
            changed = False
            to_remove = []
            for annotation in data.annotations:
                try:
                    schema.validate(annotation)
                except anafora.validate.SchemaValidationError as e:
                    # Only relation annotations are removed; other schema
                    # violations are left in place.
                    if annotation.type in {"TLINK", "ALINK"}:
                        logging.warning("REMOVING %s: %s", e, annotation)
                        to_remove.append(annotation)
            for annotation in to_remove:
                data.annotations.remove(annotation)
                changed = True
            # remove TIMEX3s that are directly on top of SECTIONTIMEs and DOCTIMEs
            for span, annotations in anafora.validate.find_entities_with_identical_spans(data):
                try:
                    # sorts SECTIONTIME and DOCTIME before TIMEX3
                    special_time, timex = sorted(annotations, key=lambda a: a.type)
                except ValueError:
                    # More (or fewer) than two entities on the span: leave as-is.
                    pass
                else:
                    if special_time.type in {"SECTIONTIME", "DOCTIME"} and timex.type == "TIMEX3":
                        msg = "REPLACING multiple entities for span %s: %s WITH %s"
                        logging.warning(msg, span, timex, special_time)
                        # Redirect any property that points at the TIMEX3.
                        for annotation in data.annotations:
                            for name, value in annotation.properties.items():
                                if value is timex:
                                    annotation.properties[name] = special_time
                        data.annotations.remove(timex)
                        changed = True
            # if we found and fixed any errors, write out the new XML file
            if changed:
                output_sub_dir = os.path.join(output_dir, sub_dir)
                if not os.path.exists(output_sub_dir):
                    os.makedirs(output_sub_dir)
                output_path = os.path.join(output_sub_dir, xml_name)
                data.to_file(output_path)
def xml_json_xml(input_dir, output_dir, set_to_super_interval):
    """Round-trip TimeNorm gold XML through the JSON representation.

    Each matching file is parsed, converted element->JSON->AnaforaData, and
    written under output_dir with the same subdirectory layout.

    :param input_dir: root of TimeNorm gold Anafora XML directories.
    :param output_dir: mirror directory for the converted XML files.
    :param set_to_super_interval: flag forwarded to parse_element.
    """
    paths = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)
            data = parse_json(parse_element(data, set_to_super_interval))
            output_parent = os.path.join(output_dir, sub_dir)
            # exist_ok avoids the exists()/makedirs() check-then-act race.
            os.makedirs(output_parent, exist_ok=True)
            data.to_file(os.path.join(output_parent, xml_file_name))
def text_data_pairs():
    """Yield (document text, AnaforaData) pairs for training.

    Closure over enclosing-scope names: train_dir, xml_name_regex, text_dir,
    text_encoding. When text_dir is None, the raw text is expected to sit
    inside the training subdirectory itself. Documents with no text file are
    skipped with a warning.
    """
    for sub_dir, text_name, xml_names in anafora.walk(train_dir, xml_name_regex):
        if text_dir is not None:
            text_path = os.path.join(text_dir, text_name)
        else:
            text_path = os.path.join(train_dir, sub_dir, text_name)
        if not os.path.exists(text_path):
            logging.warning("no text found at %s", text_path)
            continue
        with codecs.open(text_path, 'r', text_encoding) as text_file:
            text = text_file.read()
        # The same document text may be yielded once per XML file.
        for xml_name in xml_names:
            data = anafora.AnaforaData.from_file(os.path.join(train_dir, sub_dir, xml_name))
            yield text, data
def write_xml(self, predicted_relations): """Write predictions in anafora XML format""" # make a directory to write anafora xml if os.path.isdir(self.out_dir): shutil.rmtree(self.out_dir) os.mkdir(self.out_dir) # key: note, value: list of rel arg tuples note2rels = defaultdict(list) # map notes to relations in these notes # for container_id, contained_id in predicted_relations: for contained_id, container_id in predicted_relations: note_name = container_id.split('@')[2] note2rels[note_name].append((container_id, contained_id)) # iterate over reference anafora xml files for sub_dir, text_name, file_names in anafora.walk( self.xml_dir, self.xml_regex): path = os.path.join(self.xml_dir, sub_dir, file_names[0]) ref_data = anafora.AnaforaData.from_file(path) # make a new XML file generated_data = anafora.AnaforaData() # copy gold events and times copy_annotations(ref_data, generated_data, 'EVENT') copy_annotations(ref_data, generated_data, 'TIMEX3') copy_annotations(ref_data, generated_data, 'SECTIONTIME') copy_annotations(ref_data, generated_data, 'DOCTIME') # add generated relations note_name = file_names[0].split('.')[0] for container_id, contained_id in note2rels[note_name]: relation = anafora.AnaforaRelation() relation.id = str(random.random())[2:] relation.type = 'TLINK' relation.parents_type = 'TemporalRelations' relation.properties['Source'] = container_id relation.properties['Type'] = 'CONTAINS' relation.properties['Target'] = contained_id generated_data.annotations.append(relation) # write xml to file generated_data.indent() os.mkdir(os.path.join(self.out_dir, sub_dir)) out_path = os.path.join(self.out_dir, sub_dir, file_names[0]) generated_data.to_file(out_path)
def log_entities_with_identical_spans(anafora_dir, xml_name_regex):
    """Log every group of entities that share an identical span.

    Walks the Anafora XML under anafora_dir; files that fail to parse are
    silently skipped.

    :param anafora_dir: root of Anafora XML directories to search.
    :param xml_name_regex: regex selecting the XML files to check.
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # Unparseable XML is skipped without logging here.
                pass
            else:
                for span, annotations in find_entities_with_identical_spans(data):
                    # Fix: logging.warn is deprecated; use logging.warning.
                    logging.warning("%s: multiple entities for span %s:\n%s",
                                    xml_path, span,
                                    "\n".join(str(ann).rstrip() for ann in annotations))
def compare_through_golden_files(input_dir):
    """Compare each TimeNorm gold file against its round-tripped version.

    For every matching XML file: convert it through parse_element/parse_json,
    dump the result to a scratch file, produce a stripped-down copy of the
    original, and diff the two scratch files.
    """
    walk_results = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for directory, _doc_name, file_names in walk_results:
        for name in file_names:
            source_path = os.path.join(input_dir, directory, name)
            converted = anafora.AnaforaData.from_file(source_path)
            converted = parse_json(parse_element(converted))
            # Scratch files are overwritten on every iteration.
            converted.to_file("output_from_labelstud.xml")
            remove_some_elements(source_path, "output_from_golden.xml")
            compare_two_files("output_from_labelstud.xml",
                              "output_from_golden.xml",
                              name)
    return
def log_schema_errors(schema, anafora_dir, xml_name_regex):
    """
    Log every schema validation error found under an Anafora directory.

    :param Schema schema: the schema to validate against
    :param string anafora_dir: the Anafora directory containing directories
        to validate
    :param xml_name_regex: regex selecting the XML files to validate
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # Fix: logging.warn is deprecated; use logging.warning.
                logging.warning("%s: invalid XML", xml_path)
            else:
                for annotation, error in schema.errors(data):
                    logging.warning("%s: %s", xml_path, error)
def text_data_pairs():
    """Yield (document text, AnaforaData) pairs for training.

    Closure over enclosing-scope names: train_dir, xml_name_regex, text_dir,
    text_encoding. When text_dir is None, the raw text is expected to sit
    inside the training subdirectory itself. Documents with no text file are
    skipped with a warning.
    """
    for sub_dir, text_name, xml_names in anafora.walk(
            train_dir, xml_name_regex):
        if text_dir is not None:
            text_path = os.path.join(text_dir, text_name)
        else:
            text_path = os.path.join(train_dir, sub_dir, text_name)
        if not os.path.exists(text_path):
            logging.warning("no text found at %s", text_path)
            continue
        with codecs.open(text_path, 'r', text_encoding) as text_file:
            text = text_file.read()
        # The same document text may be yielded once per XML file.
        for xml_name in xml_names:
            data = anafora.AnaforaData.from_file(
                os.path.join(train_dir, sub_dir, xml_name))
            yield text, data
def _main(input_dir, output_dir, xml_name_regex="[.]xml$", include=None, exclude=None):
    """Filter Anafora XML, removing non-selected annotations and properties.

    :param input_dir: root of Anafora XML directories to filter.
    :param output_dir: where filtered files are written; when falsy the
        input files are overwritten (a .bak backup is saved first).
    :param xml_name_regex: regex selecting the XML files to process.
    :param include: selection whitelist, forwarded to Select.
    :param exclude: selection blacklist, forwarded to Select.
    """
    select = Select(include, exclude)
    for sub_dir, text_name, xml_names in anafora.walk(input_dir, xml_name_regex):
        for xml_name in xml_names:
            # reads in the data from the input file
            xml_path = os.path.join(input_dir, sub_dir, xml_name)
            data = anafora.AnaforaData.from_file(xml_path)
            # find annotations and properties to remove
            annotations_to_remove = []
            annotation_properties_to_remove = []
            for annotation in data.annotations:
                # remove the annotation if its type has not been selected
                if not select(annotation.type):
                    annotations_to_remove.append(annotation)
                else:
                    for name, value in annotation.properties.items():
                        # remove the property if its name or value has not been selected
                        if not select(annotation.type, name, value):
                            annotation_properties_to_remove.append(
                                (annotation, name))
            # if we're overwriting, save a backup of the original
            if annotations_to_remove or annotation_properties_to_remove:
                data.to_file(xml_path + ".bak")
            # do the actual removal of annotations here
            # (deferred so we never mutate data.annotations while iterating it)
            for annotation in annotations_to_remove:
                data.annotations.remove(annotation)
            for annotation, name in annotation_properties_to_remove:
                del annotation.properties[name]
            # writes out the modified data to the output file
            output_sub_dir = os.path.join(output_dir or input_dir, sub_dir)
            if not os.path.exists(output_sub_dir):
                os.makedirs(output_sub_dir)
            output_path = os.path.join(output_sub_dir, xml_name)
            data.to_file(output_path)
def write_xml(self, prediction_lookup): """Write predictions in anafora XML format""" # make a directory to write anafora xml if os.path.isdir(self.xml_out_dir): shutil.rmtree(self.xml_out_dir) os.mkdir(self.xml_out_dir) # t5 occasionally fails to predict missing_predictions = [] # iterate over reference xml files # look up the DTR prediction for each event # and write it in anafora format to specificed dir for sub_dir, text_name, file_names in \ anafora.walk(self.xml_ref_dir, xml_regex): path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0]) ref_data = anafora.AnaforaData.from_file(path) data = anafora.AnaforaData() for event in ref_data.annotations.select_type('EVENT'): # make a new entity and copy some ref info entity = anafora.AnaforaEntity() entity.id = event.id start, end = event.spans[0] entity.spans = event.spans entity.type = event.type # lookup the prediction key = '|'.join((sub_dir, str(start), str(end))) if key not in prediction_lookup: # use majority class for now entity.properties['DocTimeRel'] = 'OVERLAP' missing_predictions.append(key) else: entity.properties['DocTimeRel'] = prediction_lookup[key] data.annotations.append(entity) data.indent() os.mkdir(os.path.join(self.xml_out_dir, sub_dir)) out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0]) data.to_file(out_path) print('number of missing predictions:', len(missing_predictions))
def convert_dir(input_dir, output_dir, raw_dir=None):
    """Convert every TimeNorm gold XML file under input_dir via convert_xml.

    :param input_dir: root of Anafora XML directories.
    :param output_dir: where converted files are written
        (<output_dir>/<doc name>/<xml name>); created if missing.
    :param raw_dir: optional root of raw text files; when given, the raw
        document path is passed to convert_xml as well.
    """
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # Unpack the walk triple directly instead of indexing document[0..2].
    for document_dir, document_name, xml_names in anafora.walk(input_dir):
        for xml_name in xml_names:
            if xml_name.endswith(".TimeNorm.gold.completed.xml"):
                xml_path = os.path.join(input_dir, document_dir, xml_name)
                output_path = os.path.join(output_dir, document_name, xml_name)
                raw_path = None
                if raw_dir is not None:
                    raw_path = os.path.join(raw_dir, document_dir, document_name)
                convert_xml(xml_path, output_path, raw_path)
def sub_to_super(input_dir, output_dir):
    """Invert Sub-Interval links into Super-Interval links in TimeNorm XML.

    For every entity with a non-empty 'Sub-Interval' property, sets the
    child's 'Super-Interval' to the parent's id and deletes the parent's
    'Sub-Interval'. Results are written under output_dir, mirroring the
    input layout.
    """
    paths = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)
            data = check_year_of_sub_interval(data)
            for entity in data.annotations:
                if 'Sub-Interval' in entity.properties:
                    sub_entity = entity.properties['Sub-Interval']
                    # Only rewrite when the property actually points at an entity.
                    if sub_entity:
                        sub_entity.properties['Super-Interval'] = entity.id
                    del entity.properties['Sub-Interval']
            output_parent = os.path.join(output_dir, sub_dir)
            if not os.path.exists(output_parent):
                os.makedirs(output_parent)
            data.to_file(os.path.join(output_parent, xml_file_name))
def read(self):
    """Make x, y etc.

    Reads EVENT annotations from the reference Anafora XML, builds BERT
    token-id inputs from the event plus context_chars of context on each
    side (event wrapped in ' es '/' ee ' markers), pads/truncates them to
    self.max_length, and derives attention masks.

    :return: (inputs, labels, masks) — padded token-id sequences, integer
        DocTimeRel labels (via label2int), and 0/1 attention masks.
    """
    inputs = []
    labels = []
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-uncased', do_lower_case=True)
    for sub_dir, text_name, file_names in \
            anafora.walk(self.xml_dir, self.xml_regex):
        xml_path = os.path.join(self.xml_dir, sub_dir, file_names[0])
        ref_data = anafora.AnaforaData.from_file(xml_path)
        text_path = os.path.join(self.text_dir, text_name)
        # Fix: close the file handle (the original leaked it via open().read()).
        with open(text_path) as text_file:
            text = text_file.read()
        for event in ref_data.annotations.select_type('EVENT'):
            label = event.properties['DocTimeRel']
            labels.append(label2int[label])
            start, end = event.spans[0]
            event = text[start:end]  # should be end+1?
            left = text[start - self.context_chars : start]
            right = text[end : end + self.context_chars]
            context = left + ' es ' + event + ' ee ' + right
            inputs.append(tokenizer.encode(context.replace('\n', '')))
    inputs = pad_sequences(
        inputs, maxlen=self.max_length, dtype='long',
        truncating='post', padding='post')
    masks = []  # attention masks
    for sequence in inputs:
        # 1.0 for real tokens (id > 0), 0.0 for padding.
        mask = [float(value > 0) for value in sequence]
        masks.append(mask)
    return inputs, labels, masks
def write(self, predictions): """Write predictions in anafora XML format""" # predictions are in the same order in which they were read prediction_lookup = dict(zip(self.offsets, predictions)) # make a directory to write anafora xml if os.path.isdir(self.xml_out_dir): shutil.rmtree(self.xml_out_dir) os.mkdir(self.xml_out_dir) # iterate over reference xml files # look up the DTR prediction for each event # and write it in anafora format to specificed dir for sub_dir, text_name, file_names in \ anafora.walk(self.xml_ref_dir, xml_regex): path = os.path.join(self.xml_ref_dir, sub_dir, file_names[0]) ref_data = anafora.AnaforaData.from_file(path) data = anafora.AnaforaData() for event in ref_data.annotations.select_type('EVENT'): # make a new entity and copy some ref info entity = anafora.AnaforaEntity() entity.id = event.id start, end = event.spans[0] entity.spans = event.spans entity.type = event.type # lookup the prediction if (sub_dir, start, end) not in prediction_lookup: print('missing key:', (sub_dir, start, end)) continue label = prediction_lookup[(sub_dir, start, end)] entity.properties['DocTimeRel'] = int2label[label] data.annotations.append(entity) data.indent() os.mkdir(os.path.join(self.xml_out_dir, sub_dir)) out_path = os.path.join(self.xml_out_dir, sub_dir, file_names[0]) data.to_file(out_path)
def main(input_dir, exclude, verbose):
    """Report entity duplication statistics for TimeNorm gold XML.

    Counts (a) entity types that appear more than once with an identical
    type+span and (b) the types of annotations whose properties reference a
    duplicated entity. Prints both Counters at the end.

    :param input_dir: root of TimeNorm gold Anafora XML directories.
    :param exclude: currently unused (the type filter below is commented out).
    :param verbose: when True, print each duplicate and parent as found.
    """
    duplicate_types = Counter()
    parent_types = Counter()
    paths = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)
            # find entities that share the same type and span
            counts = Counter()
            for entity in data.annotations:
                #if entity.type not in exclude:
                counts[entity] += 1
            duplicates = {key for key, count in counts.items() if count > 1}
            # which types are most often duplicated
            if duplicates:
                if verbose:
                    print(f"{xml_file_name}")
                for entity in sorted(duplicates):
                    if verbose:
                        print(f"  {entity.spans} {entity.type}")
                    duplicate_types[entity.type] += 1
            # which types most often have duplicated entities as arguments
            for entity in data.annotations:
                for _, value in entity.properties.items():
                    if isinstance(
                            value, anafora.AnaforaAnnotation) and value in duplicates:
                        if verbose:
                            print(
                                f"  parent: {entity.id} {entity.spans} ----> "
                                f"{value.id} {value.spans}")
                        parent_types[entity.type] += 1
                        # Count each parent entity at most once.
                        break
    print(f'duplicate types: {duplicate_types}')
    print(f'parent types: {parent_types}')
def log_entities_with_identical_spans(anafora_dir, xml_name_regex):
    """Log every group of entities that share an identical span.

    Walks the Anafora XML under anafora_dir; files that fail to parse are
    silently skipped.

    :param anafora_dir: root of Anafora XML directories to search.
    :param xml_name_regex: regex selecting the XML files to check.
    """
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir, xml_name_regex):
        for xml_name in xml_names:
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            try:
                data = anafora.AnaforaData.from_file(xml_path)
            except anafora.ElementTree.ParseError:
                # Unparseable XML is skipped without logging here.
                pass
            else:
                for span, annotations in find_entities_with_identical_spans(
                        data):
                    # Fix: logging.warn is deprecated; use logging.warning.
                    logging.warning(
                        "%s: multiple entities for span %s:\n%s",
                        xml_path, span,
                        "\n".join(str(ann).rstrip() for ann in annotations))
def _main(input_dir, output_dir, xml_name_regex="[.]xml$", include=None, exclude=None):
    """Filter Anafora XML, removing non-selected annotations and properties.

    :param input_dir: root of Anafora XML directories to filter.
    :param output_dir: where filtered files are written; when falsy the
        input files are overwritten (a .bak backup is saved first).
    :param xml_name_regex: regex selecting the XML files to process.
    :param include: selection whitelist, forwarded to Select.
    :param exclude: selection blacklist, forwarded to Select.
    """
    select = Select(include, exclude)
    for sub_dir, text_name, xml_names in anafora.walk(input_dir, xml_name_regex):
        for xml_name in xml_names:
            # reads in the data from the input file
            xml_path = os.path.join(input_dir, sub_dir, xml_name)
            data = anafora.AnaforaData.from_file(xml_path)
            # find annotations and properties to remove
            annotations_to_remove = []
            annotation_properties_to_remove = []
            for annotation in data.annotations:
                # remove the annotation if its type has not been selected
                if not select(annotation.type):
                    annotations_to_remove.append(annotation)
                else:
                    for name, value in annotation.properties.items():
                        # remove the property if its name or value has not been selected
                        if not select(annotation.type, name, value):
                            annotation_properties_to_remove.append((annotation, name))
            # if we're overwriting, save a backup of the original
            if annotations_to_remove or annotation_properties_to_remove:
                data.to_file(xml_path + ".bak")
            # do the actual removal of annotations here
            # (deferred so we never mutate data.annotations while iterating it)
            for annotation in annotations_to_remove:
                data.annotations.remove(annotation)
            for annotation, name in annotation_properties_to_remove:
                del annotation.properties[name]
            # writes out the modified data to the output file
            output_sub_dir = os.path.join(output_dir or input_dir, sub_dir)
            if not os.path.exists(output_sub_dir):
                os.makedirs(output_sub_dir)
            output_path = os.path.join(output_sub_dir, xml_name)
            data.to_file(output_path)
def _copy_text(text_name_to_path, get_text, get_dct, anafora_dir, xml_name_regex):
    """Copy raw document text (and optionally DCT files) into an Anafora tree.

    For each document directory found under anafora_dir, writes the document
    text next to its annotations, and a ".dct" file when get_dct is given.
    Exits the process on a missing source or an already-existing target.
    """
    for subdir, doc_name, _ in anafora.walk(
            anafora_dir, xml_name_regex=xml_name_regex):
        if doc_name not in text_name_to_path:
            sys.exit("No text file found for " + doc_name)
        destination = os.path.join(anafora_dir, subdir, doc_name)
        if os.path.exists(destination):
            sys.exit("Text file already exists: " + destination)
        source = text_name_to_path[doc_name]
        with open(destination, 'w') as out_file:
            out_file.write(get_text(source))
        if get_dct is None:
            continue
        dct_destination = destination + ".dct"
        if os.path.exists(dct_destination):
            sys.exit("DCT file already exists: " + dct_destination)
        with open(dct_destination, 'w') as dct_out:
            dct_out.write(get_dct(source))
            dct_out.write("\n")
def main(args):
    """Print a TSV of CuiEntity negation labels from completed Anafora XML.

    args[0]: root of "completed" Anafora XML directories.

    For each document, emits one line per CuiEntity:
        <text_name>\t<span start>\t<span end>\t<negated>
    """
    if len(args) < 1:
        sys.stderr.write('Required argument(s): <anafora directory>\n')
        sys.exit(-1)
    for sub_dir, text_name, xml_names in walk(
            args[0], xml_name_regex='[.]dave\.completed\.xml$'):
        # print("text_name = %s" % text_name)
        # Each document directory is expected to hold exactly one completed XML.
        assert len(xml_names) == 1
        xml_file = join(args[0], sub_dir, xml_names[0])
        # print("xml file = %s" % (xml_file,))
        anafora_data = AnaforaData.from_file(xml_file)
        for event in anafora_data.annotations.select_type('CuiEntity'):
            negated = event.properties['negated']
            span = event.spans
            # Each CuiEntity is expected to have exactly one (start, end) span.
            assert len(span) == 1
            span = span[0]
            assert len(span) == 2
            # print("Row id %s event with span (%d, %d) is negated=%s" % (text_name, span[0], span[1], str(negated)))
            print('%s\t%d\t%d\t%s' % (text_name, span[0], span[1], str(negated)))
def main(input_dir, output_dir):
    """Deduplicate referenced entities in TimeNorm gold XML.

    When several annotations reference duplicated entities (same type+span,
    different ids), keeps the first such entity per span, rewrites the other
    parents' 'Super-Interval' to point at the kept id, and removes the
    redundant entities. Writes results under output_dir, mirroring layout.
    """
    paths = anafora.walk(input_dir, xml_name_regex=r'TimeNorm\.gold\.completed')
    for sub_dir, text_file_name, xml_file_names in paths:
        for xml_file_name in xml_file_names:
            input_path = os.path.join(input_dir, sub_dir, xml_file_name)
            data = anafora.AnaforaData.from_file(input_path)
            # find entities that share the same type and span
            counts = Counter()
            for entity in data.annotations:
                counts[entity] += 1
            duplicates = {key for key, count in counts.items() if count > 1}
            identical_span_to_parent_entity = {}  # same spans with different id
            remove_id = []
            # which types most often have duplicated entities as arguments
            for entity in data.annotations:
                for _, value in entity.properties.items():
                    if isinstance(value, anafora.AnaforaAnnotation) and value in duplicates:
                        if value.spans not in identical_span_to_parent_entity:
                            # First reference wins: remember its id for this span.
                            identical_span_to_parent_entity[value.spans] = value.id
                        else:
                            # Later duplicates are scheduled for removal and the
                            # parent is repointed at the kept entity's id.
                            remove_id.append(value.id)
                            entity.properties['Super-Interval'] = identical_span_to_parent_entity[value.spans]
                        break
            # Iterate a copy since we mutate data.annotations while removing.
            for entity in list(data.annotations):
                if entity.id in remove_id:
                    data.annotations.remove(entity)
            output_parent = os.path.join(output_dir, sub_dir)
            if not os.path.exists(output_parent):
                os.makedirs(output_parent)
            data.to_file(os.path.join(output_parent, xml_file_name))
help="The <type> of the target annotations.") relations_to_closest_parser.add_argument("-r", "--relation", metavar="TYPE", dest="relation_type", required=True, help="The <type> of relation annotation to be created.") relations_to_closest_parser.add_argument("-rs", "--relation-source", metavar="NAME", required=True, dest="relation_source_property_name", help="The name of the property on the relation annotation that should " + "point to the source annotation.") relations_to_closest_parser.add_argument("-rt", "--relation-target", metavar="NAME", required=True, dest="relation_target_property_name", help="The name of the property on the relation annotation that should " + "point to the target annotation.") relations_to_closest_parser.add_argument("-ro", "--relation-other", metavar="NAME=VALUE", nargs='+', type=_pair, dest="relation_other_properties", help="Other properties that should be added to the relation annotation.") args = parser.parse_args() kwargs = vars(args) func = kwargs.pop("func") input_dir = kwargs.pop('input_dir') xml_name_regex = kwargs.pop('xml_name_regex') output_dir = kwargs.pop('output_dir') for sub_dir, _, xml_file_names in anafora.walk(input_dir, xml_name_regex): for xml_file_name in xml_file_names: input_data = anafora.AnaforaData.from_file(os.path.join(input_dir, sub_dir, xml_file_name)) func(input_data, **kwargs) output_sub_dir = os.path.join(output_dir, sub_dir) if not os.path.exists(output_sub_dir): os.makedirs(output_sub_dir) input_data.to_file(os.path.join(output_dir, sub_dir, xml_file_name))
def from_texts(cls, data_dir, nlp, tokenizer, config):
    """Build a labeled dataset instance from paired text and Anafora XML.

    Each document directory under data_dir must contain exactly one raw text
    file and exactly one Anafora XML file. Annotation spans are turned into
    a start-offset -> (end-offset, label) map, the text is sentence-split
    and tokenized, and token-level label sequences are initialized (padding,
    <s> and </s> positions set to config.label_pad_id).

    :param cls: the dataset class; called as cls(doc_indices, features).
    :param data_dir: root directory with text + XML per document.
    :param nlp: callable text -> doc with a `sents` iterator.
    :param tokenizer: HuggingFace-style tokenizer (batched call).
    :param config: provides label_pad_id and is forwarded to feature building.
    :return: cls instance over (doc_indices, features).
    """
    if not os.path.exists(data_dir):
        raise Exception("The %s directory does not exist." % data_dir)
    # Matches files with no 3-char extension, or files ending in .txt.
    text_directory_files = anafora.walk(
        data_dir, xml_name_regex=".*((?<![.].{3})|[.]txt)$")
    features = []
    doc_indices = []
    for text_files in text_directory_files:
        # Index of this document's first feature in the flat list.
        doc_index = len(features)
        text_subdir_path, text_doc_name, text_file_names = text_files
        if len(text_file_names) != 1:
            raise Exception("Wrong number of text files in %s" % text_subdir_path)
        anafora_path = os.path.join(data_dir, text_subdir_path)
        anafora_directory_files = anafora.walk(anafora_path, xml_name_regex="[.]xml$")
        anafora_directory_files = list(anafora_directory_files)
        if len(anafora_directory_files) != 1:
            raise Exception("Wrong structure in %s" % anafora_path)
        anafora_subdir_path, anafora_doc_name, anafora_file_names = anafora_directory_files[
            0]
        if len(anafora_file_names) != 1:
            raise Exception("Wrong number of anafora files in %s" % anafora_subdir_path)
        text_file_path = os.path.join(data_dir, text_subdir_path, text_file_names[0])
        # Load the annotations
        anafora_file_path = os.path.join(anafora_path, anafora_subdir_path, anafora_file_names[0])
        data = anafora.AnaforaData.from_file(anafora_file_path)
        annotations = dict()
        for annotation in data.annotations:
            label = annotation.type
            for span in annotation.spans:
                start, end = span
                # Keyed by start offset; a later span with the same start
                # overwrites an earlier one.
                annotations[start] = (end, label)
        # Read, segment and tokenize the raw text.
        with open(text_file_path) as txt_file:
            text = txt_file.read()
        doc = nlp(text)
        # Keep trailing whitespace so sentence offsets add up to document offsets.
        input_raw = [sent.text_with_ws for sent in doc.sents]
        input_data = tokenizer(input_raw,
                               return_tensors="pt",
                               padding="max_length",
                               truncation="longest_first",
                               return_offsets_mapping=True)
        # Initialize label sequence with 0. Use ignore index for padding tokens
        negative_attention_mask = (
            ~input_data["attention_mask"].byte()).true_divide(255).long()
        input_data["labels"] = negative_attention_mask.mul(
            config.label_pad_id)
        # Assign label_pad to </s> token
        sent_indices = torch.arange(input_data["labels"].shape[0])
        last_non_padded = [
            sent_indices, input_data["labels"].argmax(dim=1)
        ]
        input_data["labels"][last_non_padded] = config.label_pad_id
        # Assign label_pad to <s> token
        input_data["labels"][:, 0] = config.label_pad_id
        sent_offset = 0
        for sent_idx, _ in enumerate(input_data["input_ids"]):
            features.append(
                TimexInputFeatures.from_sentence(input_data, sent_idx,
                                                 sent_offset, annotations, config))
            # Advance document-level character offset by the sentence length.
            sent_offset += len(input_raw[sent_idx])
        doc_indices.append((text_subdir_path, doc_index, len(features)))
    return cls(doc_indices, features)
from shutil import copyfile
import anafora
import os

# Split an Anafora corpus into train and test portions: every fifth
# document goes to the test set, with its XML annotations and raw text
# routed to separate label/input directories.
split_files_path = 'path/to/train-all-data'
split_train_path = 'path/to/train-new-data'
split_test_input_path = 'path/to/test-input'
split_test_label_path = 'path/to/test-label'

walked = anafora.walk(split_files_path,
                      xml_name_regex=".*((?<![.].{3})|[.]xml)$")
for doc_idx, (sub_dir, doc_name, file_names) in enumerate(walked):
    # Documents 4, 9, 14, ... form the 20% test split.
    is_test_doc = doc_idx % 5 == 4
    for file_name in file_names:
        src_path = os.path.join(split_files_path, sub_dir, file_name)
        if is_test_doc:
            # XML annotations are the test labels; everything else is input.
            if file_name.endswith("xml"):
                dest_root = split_test_label_path
            else:
                dest_root = split_test_input_path
        else:
            dest_root = split_train_path
        dest_dir = os.path.join(dest_root, sub_dir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir, 0o0755)
        copyfile(src_path, os.path.join(dest_dir, file_name))
# NOTE(review): fragment -- this chunk begins mid-function (`f`,
# `predict_modality`, `input_text_dir`, `ann_dir` are bound before this
# excerpt) and the EVENT-processing loop is truncated after `startoffset`.
# Reads one predicted modality label (an int) per line.
for l in f:
    predict_modality.append(int(l.strip()))
labelidx = 0
for dir_path, dir_names, file_names in os.walk(input_text_dir):
    pbar = ProgressBar(maxval=len(file_names)).start()
    for i, fn in enumerate(sorted(file_names)):
        time.sleep(0.01)
        pbar.update(i + 1)
        # this for to make consistence
        for sub_dir, text_name, xml_names in anafora.walk(
                os.path.join(ann_dir, fn)):
            for xml_name in xml_names:
                # Only the Temporal annotation files are of interest here.
                if "Temporal" not in xml_name:
                    continue
                xml_path = os.path.join(ann_dir, text_name, xml_name)
                data = anafora.AnaforaData.from_file(xml_path)
                positive_span_label_map = {}
                for annotation in data.annotations:
                    if annotation.type == 'EVENT':
                        startoffset = annotation.spans[0][0]
def score_dirs(reference_dir,
               predicted_dir,
               xml_name_regex="[.]xml$",
               text_dir=None,
               include=None,
               exclude=None,
               scores_type=Scores,
               annotation_wrapper=None):
    """
    :param string reference_dir: directory containing reference ("gold standard") Anafora XML directories
    :param string predicted_dir: directory containing predicted (system-generated) Anafora XML directories
    :param xml_name_regex: regular expression matching the files to be compared
    :param string text_dir: directory containing the raw texts corresponding to the Anafora XML
        (if None, texts are assumed to be in the reference dir)
    :param set include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type annotation_wrapper: wrapper object to apply to AnaforaAnnotations
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """
    # walks through the reference Anafora XML directories, scoring each and
    # adding those to the overall scores
    for sub_dir, text_name, reference_xml_names in anafora.walk(
            reference_dir, xml_name_regex):
        # load the reference data from its Anafora XML
        try:
            [reference_xml_name] = reference_xml_names
        except ValueError:
            # logging.warn() is deprecated; warning() is the supported name
            logging.warning("expected one reference file for %s, found %s",
                            text_name, reference_xml_names)
            if not reference_xml_names:
                continue
            reference_xml_name = reference_xml_names[0]
        reference_xml_path = os.path.join(reference_dir, sub_dir,
                                          reference_xml_name)
        reference_data = _load(reference_xml_path)

        # check for self-references in the annotations, which cause equality
        # and hashing to fail
        self_reference = reference_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping reference file %s with self-referential annotation %s"
            logging.warning(msg, reference_xml_path, self_reference.id)
            continue

        # find and load the corresponding predicted data from its Anafora XML
        predicted_xml_glob = os.path.join(predicted_dir, sub_dir,
                                          text_name + "*.xml")
        predicted_xml_paths = [
            f for f in glob.glob(predicted_xml_glob)
            if re.search(xml_name_regex, f) is not None
        ]
        try:
            [predicted_xml_path] = predicted_xml_paths
            predicted_data = _load(predicted_xml_path)
        except ValueError:
            logging.warning("expected one predicted file at %s, found %s",
                            predicted_xml_glob, predicted_xml_paths)
            if not predicted_xml_paths:
                # score against an empty prediction set
                predicted_xml_path = None
                predicted_data = anafora.AnaforaData()
            else:
                predicted_xml_path = predicted_xml_paths[0]
                predicted_data = _load(predicted_xml_path)

        # check for self-references in the annotations, which cause equality
        # and hashing to fail
        self_reference = predicted_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping predicted file %s with self-referential annotation %s"
            logging.warning(msg, predicted_xml_path, self_reference.id)
            predicted_data = anafora.AnaforaData()

        # determine the path for the raw text source file
        if text_dir is None:
            text_path = os.path.join(reference_dir, sub_dir, text_name)
        else:
            text_path = os.path.join(text_dir, text_name)

        # if no raw text was found, then asking for the text of an annotation
        # is an error
        if not os.path.exists(text_path) or not os.path.isfile(text_path):

            def _span_text(_):
                raise RuntimeError(
                    "no text file found at {0}".format(text_path))

        # otherwise, the text of an annotation can be extracted based on its
        # spans
        else:
            with open(text_path) as text_file:
                text = text_file.read()

            def _flatten(items):
                # yield (start, end) pairs from arbitrarily nested span tuples
                if isinstance(items, tuple) and isinstance(items[0], int):
                    yield items
                else:
                    for item in items:
                        for flattened_items in _flatten(item):
                            yield flattened_items

            def _span_text(spans):
                return "...".join(text[start:end]
                                  for start, end in _flatten(spans))

        # score this data and update the overall scores
        named_scores = score_data(reference_data,
                                  predicted_data,
                                  include,
                                  exclude,
                                  scores_type=scores_type,
                                  annotation_wrapper=annotation_wrapper)
        for name, scores in named_scores.items():
            # if there were some predictions, and if we're using scores that
            # keep track of errors, log the errors
            if predicted_xml_paths:
                for annotation, message in getattr(scores, "errors", []):
                    # fixed format string: dropped the stray trailing quote
                    logging.debug('%s: %s: "%s" %s', text_name, message,
                                  _span_text(annotation.spans), annotation)

        # generate the file name and the resulting scores
        yield text_name, named_scores
def score_annotators(anafora_dir,
                     xml_name_regex,
                     include=None,
                     exclude=None,
                     scores_type=Scores,
                     annotation_wrapper=None):
    """
    :param anafora_dir: directory containing Anafora XML directories
    :param xml_name_regex: regular expression matching the annotator files to be compared
    :param include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type annotation_wrapper: wrapper object to apply to AnaforaAnnotations
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """
    # pattern for extracting the annotator name from the Anafora XML file name
    annotator_name_regex = "([^.]*)[.][^.]*[.]xml$"

    # function for getting a canonical prefix corresponding to a pair of
    # annotators
    def make_prefix(annotators):
        return "{0}-vs-{1}".format(*sorted(annotators))

    # walks through the Anafora XML directories, scoring each and adding those
    # to the overall scores
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir,
                                                      xml_name_regex):
        # load the data from each Anafora XML file
        annotator_data = []
        for xml_name in xml_names:
            # ignore in-progress annotations and automatic pre-annotations
            if '.inprogress.' in xml_name or '.preannotation.' in xml_name:
                continue
            # ignore empty files
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            if os.stat(xml_path).st_size == 0:
                continue
            # load the data and add it to the list
            data = _load(xml_path)
            annotator_name = re.search(annotator_name_regex,
                                       xml_name).group(1)
            annotator_data.append((annotator_name, data))

        # at least 2 annotators are needed for annotator agreement
        if len(annotator_data) < 2:
            # logging.warn() is deprecated; warning() is the supported name
            logging.warning("%s: found fewer than 2 annotators: %s",
                            text_name, xml_names)
            continue

        # pair each annotator with each other annotator
        annotator_named_scores = collections.defaultdict(
            lambda: scores_type())
        for i in range(len(annotator_data)):
            annotator1, data1 = annotator_data[i]
            for j in range(i + 1, len(annotator_data)):
                annotator2, data2 = annotator_data[j]

                # make a prefix for this specific pair of annotators
                prefix = make_prefix([annotator1, annotator2])

                # make a prefix where non-gold annotators are just called
                # "annotator"
                general_prefix = make_prefix(
                    a if a == "gold" else "annotator"
                    for a in [annotator1, annotator2])

                # perform the comparison of the two annotation sets and update
                # the overall scores
                named_scores = score_data(
                    data1,
                    data2,
                    include,
                    exclude,
                    scores_type=scores_type,
                    annotation_wrapper=annotation_wrapper)

                # add annotators as prefixes
                for name, scores in named_scores.items():
                    if not isinstance(name, tuple):
                        name = name,
                    annotator_named_scores[(prefix, ) + name].update(scores)
                    annotator_named_scores[(general_prefix, ) +
                                           name].update(scores)

        # generate the filename and the resulting scores
        yield text_name, annotator_named_scores
# NOTE(review): fragment -- `plain_dir`, `base_dir`, `ProgressBar`, `time`
# and `anafora` come from earlier in the file, and the EVENT-processing
# loop is truncated after `startoffset`.
input_text_dir = os.path.join(plain_dir, "test")
ann_dir = os.path.join(base_dir, 'annotation/coloncancer/Test')
for dir_path, dir_names, file_names in os.walk(input_text_dir):
    pbar = ProgressBar(maxval=len(file_names)).start()
    for i, fn in enumerate(sorted(file_names)):
        time.sleep(0.01)
        pbar.update(i + 1)
        # this for to make consistence
        for sub_dir, text_name, xml_names in anafora.walk(
                os.path.join(ann_dir, fn)):
            for xml_name in xml_names:
                # Only the Temporal annotation files are of interest here.
                if "Temporal" not in xml_name:
                    continue
                xml_path = os.path.join(ann_dir, text_name, xml_name)
                data = anafora.AnaforaData.from_file(xml_path)
                positive_span_label_map = {}
                for annotation in data.annotations:
                    if annotation.type == 'EVENT':
                        startoffset = annotation.spans[0][0]
def preprocess_data_torch(input_text_dir, input_ann_dir, outDir, window_size, input_name, input_type, Shuffle):
    # Python 2 code (print statements; dict.keys() returns concatenable
    # lists). Writes one CSV of quoted "label","features" rows per span for
    # downstream torch training.
    #
    # input_name selects which annotation property becomes the label
    # ("type", "polarity", "degree" or "modality"); spans with no EVENT
    # annotation get that attribute's fixed negative-class label.
    # Type/Polarity/Degree/ContextualModality and content2span /
    # feature_generation_1 are module-level names defined elsewhere.
    maxchar = 0
    num_doc = 0
    with open(os.path.join(outDir, input_name+"_"+input_type+".csv"), 'w') as csvf:
        for dir_path, dir_names, file_names in os.walk(input_text_dir):
            pbar = ProgressBar(maxval=len(file_names)).start()
            for i, fn in enumerate(sorted(file_names)):
                time.sleep(0.01)
                pbar.update(i + 1)
                for sub_dir, text_name, xml_names in anafora.walk(os.path.join(input_ann_dir, fn)):
                    for xml_name in xml_names:
                        # only the Temporal annotation files are processed
                        if "Temporal" not in xml_name:
                            continue
                        num_doc += 1
                        #print fn
                        xml_path = os.path.join(input_ann_dir, text_name, xml_name)
                        data = anafora.AnaforaData.from_file(xml_path)
                        with open(os.path.join(input_text_dir, fn), 'r') as f:
                            content = f.read()
                        # map each gold EVENT span to its attribute label
                        positive_span_label_map = {}
                        for annotation in data.annotations:
                            if annotation.type == 'EVENT':
                                startoffset = annotation.spans[0][0]
                                endoffset = annotation.spans[0][1]
                                properties = annotation.properties
                                pros = {}
                                for pro_name in properties:
                                    pro_val = properties.__getitem__(pro_name)
                                    pros[pro_name] = pro_val
                                if input_name == "type":
                                    label = Type[pros["Type"]]
                                elif input_name == "polarity":
                                    label = Polarity[pros["Polarity"]]
                                elif input_name == "degree":
                                    label = Degree[pros["Degree"]]
                                elif input_name == "modality":
                                    label = ContextualModality[pros["ContextualModality"]]
                                positive_span_label_map[(startoffset,endoffset)] = label
                        # candidate spans not annotated as EVENT become
                        # negatives with a fixed per-attribute class id
                        all_spans = content2span(content)
                        negative_span_label_map = {}
                        for span in all_spans:
                            if span not in positive_span_label_map:
                                if input_name == "type":
                                    negative_span_label_map[span] = "4"
                                elif input_name == "polarity":
                                    negative_span_label_map[span] = "3"
                                elif input_name == "degree":
                                    negative_span_label_map[span] = "4"
                                elif input_name == "modality":
                                    negative_span_label_map[span] = "5"
                        # Python 2: .keys() are lists, so + concatenates
                        merged_spans = positive_span_label_map.keys() + negative_span_label_map.keys()
                        if Shuffle:
                            shuffle(merged_spans)
                        for span in merged_spans:
                            feats = feature_generation_1(content, span[0], span[1], window_size)
                            # track the longest feature string seen
                            if maxchar < len(feats):
                                maxchar = len(feats)
                            if span in positive_span_label_map:
                                label = positive_span_label_map[span]
                            elif span in negative_span_label_map:
                                label = negative_span_label_map[span]
                            # quote both CSV fields
                            label = "\"" +label+"\""
                            feats = "\"" +feats+"\""
                            csvf.write(label+","+feats+"\n")
            pbar.finish()
    print "max char is: " + str(maxchar)
    print "num_doc is: " +str(num_doc)
# NOTE(review): fragment -- opens inside a parser.add_argument(...) call
# whose beginning (and the parser/subparser setup) is outside this excerpt.
    dest="relation_target_property_name",
    help="The name of the property on the relation annotation that should " +
    "point to the target annotation.")
relations_to_closest_parser.add_argument(
    "-ro",
    "--relation-other",
    metavar="NAME=VALUE",
    nargs='+',
    type=_pair,
    dest="relation_other_properties",
    help="Other properties that should be added to the relation annotation."
)
args = parser.parse_args()
# Pop the routing arguments; whatever remains in kwargs is forwarded to the
# selected transformation function.
kwargs = vars(args)
func = kwargs.pop("func")
input_dir = kwargs.pop('input_dir')
xml_name_regex = kwargs.pop('xml_name_regex')
output_dir = kwargs.pop('output_dir')
# Apply the transformation to every matching XML file, mirroring the
# input directory structure under output_dir.
for sub_dir, _, xml_file_names in anafora.walk(input_dir, xml_name_regex):
    for xml_file_name in xml_file_names:
        input_data = anafora.AnaforaData.from_file(
            os.path.join(input_dir, sub_dir, xml_file_name))
        # func mutates input_data in place, presumably -- the modified data
        # is what gets written out below
        func(input_data, **kwargs)
        output_sub_dir = os.path.join(output_dir, sub_dir)
        if not os.path.exists(output_sub_dir):
            os.makedirs(output_sub_dir)
        input_data.to_file(os.path.join(output_dir, sub_dir, xml_file_name))
def main(args):
    """Segment and tokenize each document, send it to a REST temporal
    annotator, and write the predicted EVENT/TIMEX3 annotations as Anafora
    XML.

    args: [<input directory>, <rest host>, <output directory>]
    """
    if len(args) < 3:
        sys.stderr.write(
            "Required arguments: <input directory> <rest host> <output directory>\n"
        )
        sys.exit(-1)
    hostname = args[1]
    # initialize rest server
    init_url = 'http://%s:8000/temporal/initialize' % hostname
    process_url = 'http://%s:8000/temporal/process' % hostname
    # sentence segmenter
    rush = RuSH('conf/rush_rules.tsv')
    # tokenizer
    # tokenizer = TreebankWordTokenizer()
    r = requests.post(init_url)
    if r.status_code != 200:
        sys.stderr.write('Error: rest init call was not successful\n')
        sys.exit(-1)
    # NOTE(review): xml_name_regex is not defined in this function --
    # presumably a module-level constant; confirm.
    for sub_dir, text_name, xml_names in anafora.walk(args[0],
                                                      xml_name_regex):
        print("Processing filename: %s" % (text_name))
        # exactly one xml file per document is expected; zero files would
        # raise IndexError below
        if len(xml_names) > 1:
            sys.stderr.write(
                'There were multiple valid xml files for file %s' %
                (text_name))
            sys.exit(-1)
        xml_name = xml_names[0]
        with open(os.path.join(args[0], sub_dir, text_name)) as f:
            text = f.read()
        # segment into sentences, then tokenize each sentence
        sentences = rush.segToSentenceSpans(text)
        sent_tokens = []
        for sentence in sentences:
            sent_txt = text[sentence.begin:sentence.end]
            sent_tokens.append(tokenize(sent_txt))
        r = requests.post(process_url, json={'sent_tokens': sent_tokens})
        if r.status_code != 200:
            sys.stderr.write('Error: rest call was not successful\n')
            sys.exit(-1)
        # NOTE(review): this local shadows any imported `json` module for
        # the rest of the loop body
        json = r.json()
        anafora_data = AnaforaData()
        cur_id = 0
        for sent_ind, sentence in enumerate(sentences):
            sent_txt = text[sentence.begin:sentence.end]
            # per-sentence predictions; token indices are sentence-relative
            sent_events = json['events'][sent_ind]
            sent_timexes = json['timexes'][sent_ind]
            try:
                token_spans = align_tokens(sent_tokens[sent_ind], sent_txt)
            except Exception as e:
                sys.stderr.write(
                    'In document %s, error \n%s\n processing sentence:\n*****\n%s\n******\n'
                    % (text_name, str(e), sent_txt))
                sys.exit(-1)
            for event in sent_events:
                begin_token_ind = event['begin']
                end_token_ind = event['end']
                dtr = event['dtr']
                # convert sentence-relative token spans to document offsets
                event_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                event_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                event_text = text[event_start_offset:event_end_offset]
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((event_start_offset, event_end_offset), )
                annot.type = "EVENT"
                annot.properties['DocTimeRel'] = dtr
                anafora_data.annotations.append(annot)
                #print("Found event %s" % (event_text))
            for timex in sent_timexes:
                begin_token_ind = timex['begin']
                end_token_ind = timex['end']
                time_class = timex['timeClass']
                timex_start_offset = token_spans[begin_token_ind][
                    0] + sentence.begin
                timex_end_offset = token_spans[end_token_ind][
                    1] + sentence.begin
                timex_text = text[timex_start_offset:timex_end_offset]
                # create anafora entry
                annot = AnaforaEntity()
                annot.id = str(cur_id) + "@e@" + text_name
                cur_id += 1
                annot.spans = ((timex_start_offset, timex_end_offset), )
                annot.type = "TIMEX3"
                annot.properties['Class'] = time_class
                anafora_data.annotations.append(annot)
                #print("Found timex %s" % (timex_text))
        #break
        anafora_data.indent()
        os.makedirs(os.path.join(args[2], sub_dir), exist_ok=True)
        anafora_data.to_file(os.path.join(args[2], sub_dir, xml_name))
def score_dirs(reference_dir, predicted_dir, xml_name_regex="[.]xml$",
               text_dir=None, include=None, exclude=None,
               scores_type=Scores, spans_type=None):
    """
    :param string reference_dir: directory containing reference ("gold standard") Anafora XML directories
    :param string predicted_dir: directory containing predicted (system-generated) Anafora XML directories
    :param xml_name_regex: regular expression matching the files to be compared
    :param string text_dir: directory containing the raw texts corresponding to the Anafora XML
        (if None, texts are assumed to be in the reference dir)
    :param set include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type spans_type: wrapper object to apply to annotation spans
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """
    # walks through the reference Anafora XML directories, scoring each and
    # adding those to the overall scores
    for sub_dir, text_name, reference_xml_names in anafora.walk(
            reference_dir, xml_name_regex):
        # load the reference data from its Anafora XML
        try:
            [reference_xml_name] = reference_xml_names
        except ValueError:
            # logging.warn() is deprecated; warning() is the supported name
            logging.warning("expected one reference file for %s, found %s",
                            text_name, reference_xml_names)
            if not reference_xml_names:
                continue
            reference_xml_name = reference_xml_names[0]
        reference_xml_path = os.path.join(reference_dir, sub_dir,
                                          reference_xml_name)
        reference_data = _load(reference_xml_path)

        # check for self-references in the annotations, which cause equality
        # and hashing to fail
        self_reference = reference_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping reference file %s with self-referential annotation %s"
            logging.warning(msg, reference_xml_path, self_reference.id)
            continue

        # find and load the corresponding predicted data from its Anafora XML
        predicted_xml_glob = os.path.join(predicted_dir, sub_dir,
                                          text_name + "*.xml")
        predicted_xml_paths = [f for f in glob.glob(predicted_xml_glob)
                               if re.search(xml_name_regex, f) is not None]
        try:
            [predicted_xml_path] = predicted_xml_paths
            predicted_data = _load(predicted_xml_path)
        except ValueError:
            logging.warning("expected one predicted file at %s, found %s",
                            predicted_xml_glob, predicted_xml_paths)
            if not predicted_xml_paths:
                # score against an empty prediction set
                predicted_xml_path = None
                predicted_data = anafora.AnaforaData()
            else:
                predicted_xml_path = predicted_xml_paths[0]
                predicted_data = _load(predicted_xml_path)

        # check for self-references in the annotations, which cause equality
        # and hashing to fail
        self_reference = predicted_data.annotations.find_self_referential()
        if self_reference is not None:
            msg = "skipping predicted file %s with self-referential annotation %s"
            logging.warning(msg, predicted_xml_path, self_reference.id)
            predicted_data = anafora.AnaforaData()

        # determine the path for the raw text source file
        if text_dir is None:
            text_path = os.path.join(reference_dir, sub_dir, text_name)
        else:
            text_path = os.path.join(text_dir, text_name)

        # if no raw text was found, then asking for the text of an annotation
        # is an error
        if not os.path.exists(text_path) or not os.path.isfile(text_path):

            def _span_text(_):
                raise RuntimeError(
                    "no text file found at {0}".format(text_path))

        # otherwise, the text of an annotation can be extracted based on its
        # spans
        else:
            with open(text_path) as text_file:
                text = text_file.read()

            def _flatten(items):
                # yield (start, end) pairs from arbitrarily nested span tuples
                if isinstance(items, tuple) and isinstance(items[0], int):
                    yield items
                else:
                    for item in items:
                        for flattened_items in _flatten(item):
                            yield flattened_items

            def _span_text(spans):
                return "...".join(text[start:end]
                                  for start, end in _flatten(spans))

        # score this data and update the overall scores
        named_scores = score_data(reference_data, predicted_data,
                                  include, exclude,
                                  scores_type=scores_type,
                                  spans_type=spans_type)
        for name, scores in named_scores.items():
            # if there were some predictions, and if we're using scores that
            # keep track of errors, log the errors
            if predicted_xml_paths:
                for annotation, message in getattr(scores, "errors", []):
                    spans, _, _ = annotation
                    # fixed format string: dropped the stray trailing quote
                    logging.debug('%s: %s: "%s" %s', text_name, message,
                                  _span_text(spans), annotation)

        # generate the file name and the resulting scores
        yield text_name, named_scores
def score_annotators(anafora_dir, xml_name_regex, include=None, exclude=None,
                     scores_type=Scores, spans_type=None):
    """
    :param anafora_dir: directory containing Anafora XML directories
    :param xml_name_regex: regular expression matching the annotator files to be compared
    :param include: types of annotations to include (others will be excluded); may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param set exclude: types of annotations to exclude; may be type names,
        (type-name, property-name) tuples, (type-name, property-name, property-value) tuples
    :param type scores_type: type for calculating matches between predictions and reference
    :param type spans_type: wrapper object to apply to annotation spans
    :return iter: an iterator of (file-name, name-to-scores) where name-to-scores is a mapping from
        (annotation type[, property name[, property value]]) to a Scores object
    """
    # pattern for extracting the annotator name from the Anafora XML file name
    annotator_name_regex = "([^.]*)[.][^.]*[.]xml$"

    # function for getting a canonical prefix corresponding to a pair of
    # annotators
    def make_prefix(annotators):
        return "{0}-vs-{1}".format(*sorted(annotators))

    # walks through the Anafora XML directories, scoring each and adding those
    # to the overall scores
    for sub_dir, text_name, xml_names in anafora.walk(anafora_dir,
                                                      xml_name_regex):
        # load the data from each Anafora XML file
        annotator_data = []
        for xml_name in xml_names:
            # ignore in-progress annotations and automatic pre-annotations
            if '.inprogress.' in xml_name or '.preannotation.' in xml_name:
                continue
            # ignore empty files
            xml_path = os.path.join(anafora_dir, sub_dir, xml_name)
            if os.stat(xml_path).st_size == 0:
                continue
            # load the data and add it to the list
            data = _load(xml_path)
            annotator_name = re.search(annotator_name_regex,
                                       xml_name).group(1)
            annotator_data.append((annotator_name, data))

        # at least 2 annotators are needed for annotator agreement
        if len(annotator_data) < 2:
            # logging.warn() is deprecated; warning() is the supported name
            logging.warning("%s: found fewer than 2 annotators: %s",
                            text_name, xml_names)
            continue

        # pair each annotator with each other annotator
        annotator_named_scores = collections.defaultdict(
            lambda: scores_type())
        for i in range(len(annotator_data)):
            annotator1, data1 = annotator_data[i]
            for j in range(i + 1, len(annotator_data)):
                annotator2, data2 = annotator_data[j]

                # make a prefix for this specific pair of annotators
                prefix = make_prefix([annotator1, annotator2])

                # make a prefix where non-gold annotators are just called
                # "annotator"
                general_prefix = make_prefix(
                    a if a == "gold" else "annotator"
                    for a in [annotator1, annotator2])

                # perform the comparison of the two annotation sets and update
                # the overall scores
                named_scores = score_data(data1, data2, include, exclude,
                                          scores_type=scores_type,
                                          spans_type=spans_type)

                # add annotators as prefixes
                for name, scores in named_scores.items():
                    if not isinstance(name, tuple):
                        name = name,
                    annotator_named_scores[(prefix,) + name].update(scores)
                    annotator_named_scores[(general_prefix,) +
                                           name].update(scores)

        # generate the filename and the resulting scores
        yield text_name, annotator_named_scores
def preprocess_data_lasagne(input_ann_dir, input_text_dir, outDir, window_size=3, num_feats=2, Shuffle = False): ext_positive = 0 ext_negative=0 with open(os.path.join(outDir, "feature.toks"), 'w') as g_feature,\ open(os.path.join(outDir, "label.txt"), 'w') as g_label: g_feature.write(str(num_feats)+"\t"+str(window_size)+"\n") for dir_path, dir_names, file_names in os.walk(input_text_dir): pbar = ProgressBar(maxval=len(file_names)).start() for i, fn in enumerate(sorted(file_names)): time.sleep(0.01) pbar.update(i + 1) for sub_dir, text_name, xml_names in anafora.walk(os.path.join(input_ann_dir, fn)): for xml_name in xml_names: if "Temporal" not in xml_name: continue #print fn xml_path = os.path.join(input_ann_dir, text_name, xml_name) data = anafora.AnaforaData.from_file(xml_path) positive_span_label_map={} for annotation in data.annotations: if annotation.type == 'EVENT': startoffset = annotation.spans[0][0] endoffset = annotation.spans[0][1] properties = annotation.properties pros = {} for pro_name in properties: pro_val = properties.__getitem__(pro_name) pros[pro_name] = pro_val positive_span_label_map[(startoffset,endoffset)] = "1" with open(os.path.join(input_text_dir, fn), 'r') as f: content = f.read() all_spans = content2span(content) negative_span_label_map={} for span in all_spans: if span not in positive_span_label_map: negative_span_label_map[span] = "0" merged_spans = positive_span_label_map.keys() + negative_span_label_map.keys() if Shuffle: shuffle(merged_spans) for span in merged_spans: if span not in positive_span_label_map: ext_negative += 1 label = negative_span_label_map[span] else: ext_positive += 1 label = positive_span_label_map[span] if num_feats == 2: feat = feature_generation_2(content, span[0], span[1], window_size) elif num_feats == 3: feat = feature_generation_3(content, span[0], span[1], window_size) seqlen = 2*window_size+1 toks_a = feat.rstrip('\n').split() assert len(toks_a) == seqlen*num_feats, "wrong :"+a g_feature.write(feat+"\n") 
g_label.write(label+"\n") pbar.finish() print "Extract positive events is %d"%ext_positive print "Extract negative events is %d"%ext_negative