def brat_file(filename, encoding="utf-8"):
    """Load a brat document from its ``.txt`` / ``.ann`` file pair.

    The extension of *filename* is discarded; the text is read from
    ``<stem>.txt`` and the annotations from ``<stem>.ann``.

    Only annotation lines starting with ``T`` are used. Such a line is split
    on tabs; its second field is ``"<value> <bounds>"`` where ``<bounds>`` is
    one or more ``"<lb> <ub>"`` integer pairs separated by ``;``. Every pair
    yields its own ``Tag`` carrying the same value.

    Args:
        filename: path to either file of the pair (or the bare stem).
        encoding: encoding used to decode both files.

    Returns:
        A ``Document`` named after the ``.txt`` file, whose content is the
        text with every ``"\\r"`` removed, and holding one sorted ``"NER"``
        annotation.

    Raises:
        ValueError: if the ``.txt`` or the ``.ann`` file does not exist.
    """
    no_ext, _ = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file), encoding=encoding, mime_type="text/plain")

    # Plain "r" instead of "rU": the "U" flag was removed from open() in
    # Python 3.11, and codecs.open() reads in binary mode anyway, so it never
    # performed newline translation here. The context managers make sure the
    # handles are closed even if decoding or parsing fails.
    with codecs.open(txt_file, "r", encoding) as txt_stream:
        document.content = txt_stream.read().replace(u"\r", u"")

    annotations = Annotation("NER")
    with codecs.open(ann_file, "r", encoding) as ann_stream:
        for line in ann_stream:
            line = line.strip()
            # An empty line cannot start with "T", so this single check also
            # skips blank lines.
            if not line.startswith(u"T"):
                continue
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))

    annotations.sort()
    document.add_annotation(annotations)
    return document
def gate_data(data, name=None):
    """Build a ``Document`` from a parsed GATE XML element tree.

    The text is rebuilt from the first ``TextWithNodes`` child of *data*: its
    leading text followed by the tail text of each of its child elements.
    Each child's integer ``id`` attribute is mapped to the character offset
    at which that child occurs in the rebuilt text. Every ``AnnotationSet``
    child of *data* becomes one ``Annotation`` (named after its ``Name``
    attribute) containing one ``Tag`` per child element, with bounds resolved
    from the ``StartNode`` / ``EndNode`` ids and value taken from ``Type``.

    Args:
        data: element exposing ``findall`` whose children include
            ``TextWithNodes`` and ``AnnotationSet`` elements.
        name: document name; ``"__DOCUMENT__"`` is used when falsy.

    Returns:
        The populated ``Document``.

    Raises:
        IndexError: if *data* has no ``TextWithNodes`` child.
        KeyError: if a required attribute (``id``, ``Name``, ``StartNode``,
            ``EndNode``, ``Type``) is missing, or a referenced node id is
            not present in ``TextWithNodes``.
    """
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")
    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    leading_text = textwithnodes.text or u""
    text_parts = [leading_text]
    nodes = {}
    # Keep a running character offset instead of re-summing the lengths of
    # all previous parts for every node (which was quadratic).
    offset = len(leading_text)
    for node in textwithnodes:
        nodes[int(node.attrib["id"])] = offset
        tail = node.tail or u""
        text_parts.append(tail)
        offset += len(tail)
    document.content = u"".join(text_parts)

    for annotation_set in annotation_sets:
        sem_annotation = Annotation(annotation_set.attrib["Name"])
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document