def json_data(data):
    """Import a document from a JSON-like dict and return it.

    Expected keys (all optional): ``name``, ``content``, ``metadatas``,
    ``segmentations`` (each with ``spans`` and optional ``reference``) and
    ``annotations`` (each with ``annotations`` and ``reference``).

    Parameters
    ----------
    data : dict
        the deserialized JSON data describing the document.

    Returns
    -------
    Document
        the reconstructed document.
    """
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        # spans are serialized as start ("s") + length ("l"); ub is derived
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    # resolve segmentation references (stored by name) to actual objects
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        annotations = [
            Tag(lb=annotation[u"s"],
                ub=0,
                length=annotation[u"l"],
                value=annotation[u"v"])
            for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)
    # FIX: the document was built and then discarded; importers must return it.
    return document
def main(indirnames, outfilename, default_shift=0, top_level=False):
    """Merge BRAT data from the subdirectories of several input directories.

    Each immediate subdirectory (sorted by name) of every directory in
    *indirnames* is converted via ``make_data`` with a running character
    shift, then the concatenated result is written as a .ann/.txt pair.

    Parameters
    ----------
    indirnames : list of str
        directories whose immediate subdirectories hold the data to merge.
    outfilename : str
        output path prefix; ".ann" and ".txt" are appended.
    default_shift : int
        initial character shift for the first subdirectory.
    top_level : bool
        forwarded to ``make_data``.
    """
    dirs = []
    for indirname in indirnames:
        dirs.extend([
            os.path.join(indirname, name)
            for name in sorted(os.listdir(indirname))
            if os.path.isdir(os.path.join(indirname, name))
        ])

    contents = []
    annotations = []
    # FIX: seed the running shift with default_shift instead of a hard-coded 0,
    # so the parameter actually has an effect (it was accepted but ignored).
    shift = default_shift
    for dirname in dirs:
        cur_contents, cur_annotations, cur_shift = make_data(
            dirname, default_shift=shift, top_level=top_level)
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)
        shift = cur_shift

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
def main(indirname, outfilename, default_shift=0, top_level=False):
    """Convert a single directory of data into a BRAT .ann/.txt file pair.

    Parameters
    ----------
    indirname : str
        directory holding the input data, handed to ``make_data``.
    outfilename : str
        output path prefix; ".ann" and ".txt" are appended.
    default_shift : int
        initial character shift forwarded to ``make_data``.
    top_level : bool
        forwarded to ``make_data``.
    """
    contents, annotations, _ = make_data(indirname,
                                         default_shift=default_shift,
                                         top_level=top_level)
    merged_content = (u"\n" * NUM_NEWLINES).join(contents)
    document = Document("_doc_", content=merged_content)
    document.add_annotation(Annotation("NER", annotations=annotations))

    exporter = BratExporter()
    ann_path = outfilename + ".ann"
    txt_path = outfilename + ".txt"
    with codecs.open(ann_path, "w", "utf-8") as out:
        out.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(txt_path, "w", "utf-8") as out:
        out.write(document.content)
def brat_file(filename, encoding="utf-8"):
    """Import a BRAT .txt/.ann file pair as a Document with a "NER" annotation.

    Parameters
    ----------
    filename : str
        path to either file of the pair; its extension is replaced to find
        the companion .txt and .ann files.
    encoding : str
        encoding of both files.

    Returns
    -------
    Document
        the document with its content and character-level "NER" annotation.

    Raises
    ------
    ValueError
        if the .txt or .ann companion file is missing.
    """
    no_ext = os.path.splitext(filename)[0]
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file),
                        encoding=encoding,
                        mime_type="text/plain")
    # FIX: close the file handles (they were previously leaked). Mode "r"
    # replaces "rU", removed in Python 3.11; CR characters are stripped
    # explicitly so CRLF input behaves as before.
    with codecs.open(txt_file, "r", encoding) as input_file:
        document.content = input_file.read().replace(u"\r", u"")

    annotations = Annotation("NER")
    with codecs.open(ann_file, "r", encoding) as input_file:
        for line in input_file:
            line = line.strip()
            # only text-bound annotations ("T" lines) are imported
            if line != u"" and line.startswith(u'T'):
                parts = line.split(u"\t")
                value, bounds = parts[1].split(" ", 1)
                # discontinuous annotations use ";"-separated fragments:
                # one Tag is created per fragment, all with the same value
                for bound in bounds.split(";"):
                    lb, ub = bound.split()
                    annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)
    return document
def gate_data(data, name=None):
    """Import a GATE XML tree as a Document (content plus annotation sets).

    Parameters
    ----------
    data : xml element tree root
        the parsed GATE document; must contain a ``TextWithNodes`` element
        and zero or more ``AnnotationSet`` elements.
    name : str or None
        document name; defaults to "__DOCUMENT__" when None or empty.

    Returns
    -------
    Document
        the document with one Annotation per GATE annotation set.
    """
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    text_with_nodes = data.findall("TextWithNodes")[0]
    # Each <Node id="..."/> marks a character offset: the id maps to the
    # number of characters accumulated so far; text comes from .text and
    # every node's .tail.
    parts = [text_with_nodes.text or u""]
    offset_of = {}
    for node in list(text_with_nodes):
        offset_of[int(node.attrib["id"])] = sum(len(part) for part in parts)
        parts.append(node.tail or u"")
    document.content = u"".join(parts)

    for annotation_set in data.findall("AnnotationSet"):
        set_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(set_name)
        for annotation in annotation_set:
            start = offset_of[int(annotation.attrib["StartNode"])]
            end = offset_of[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(start, end, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)
    return document
def process_document(self, document, **kwargs):
    """
    Updates a document by remapping the values of one of its annotation
    sets through ``self._mapping`` and adding the result back under the
    same annotation name.

    Parameters
    ----------
    document : sem.storage.Document
        the input data. It is a document with only a content
    log_level : str or int
        the logging level
    log_file : str
        if not None, the file to log to (does not remove command-line
        logging).
    """
    start = time.time()

    if self._log_file is not None:
        map_annotations_logger.addHandler(file_handler(self._log_file))
    map_annotations_logger.setLevel(self._log_level)

    ref_annotation = document.annotation(self._annotation_name)
    ref_annotations = ref_annotation.annotations
    # Values mapped to u"" are dropped; unmapped values pass through as-is.
    new_annotations = [
        Tag(annotation.lb,
            annotation.ub,
            self._mapping.get(annotation.value, annotation.value))
        for annotation in ref_annotations
        if self._mapping.get(annotation.value, None) != u""
    ]
    # (removed an unused local that collected the set of reference values)
    document.add_annotation(
        Annotation(self._annotation_name,
                   reference=ref_annotation.reference,
                   annotations=new_annotations))

    laps = time.time() - start
    # lazy %-style arguments instead of eager string interpolation
    map_annotations_logger.info('in %s', timedelta(seconds=laps))
def main(args):
    """Evaluate a system tagging against a reference annotation.

    Loads reference (L) and system (R) annotations in CoNLL, BRAT or SEM
    format, aligns them in four passes (correct, type error, boundary error,
    type+boundary error; leftovers are silence/noise), optionally dumps each
    error with context, then prints per-entity and global counts, P/R/F and
    under/over-generation and substitution rates.

    Parameters
    ----------
    args : argparse.Namespace
        parsed command-line arguments (infile, input_format, reference_file,
        reference_column, tagging_column, annotation_name, dump,
        context_size, encodings).
    """
    infile = args.infile
    reference_column = args.reference_column
    tagging_column = args.tagging_column
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc  # NOTE(review): currently unused here
    verbose = args.verbose  # NOTE(review): currently unused here
    input_format = args.input_format
    reference_file = args.reference_file
    annotation_name = args.annotation_name
    dump = args.dump
    context_size = args.context_size

    # --- load reference (L) and system (R) annotations -------------------
    if input_format == "conll":
        if reference_file:
            print(u"reference_file not handled for CoNLL files")
        L = []
        R = []
        keys = None
        nth = -1
        for n_line, p in Reader(infile, ienc).line_iter():
            nth += 1
            keys = keys or range(len(p[0]))
            # n_line - nth converts sentence-local token indices to
            # document-level token indices
            L.extend(annotation_from_sentence(p,
                                              column=reference_column,
                                              shift=n_line - nth))
            R.extend(annotation_from_sentence(p,
                                              column=tagging_column,
                                              shift=n_line - nth))
        document = sem.importers.conll_file(infile, keys, keys[0], encoding=ienc)
        L = Annotation("", annotations=L,
                       reference=document.segmentation("tokens")
                       ).get_reference_annotations()
        R = Annotation("", annotations=R,
                       reference=document.segmentation("tokens")
                       ).get_reference_annotations()
    elif input_format == "brat":
        document = sem.importers.brat_file(reference_file)
        L = document.annotation("NER").get_reference_annotations()
        R = sem.importers.brat_file(infile).annotation(
            "NER").get_reference_annotations()
    elif input_format in ("sem", "SEM"):
        document = Document.from_xml(reference_file)
        system = Document.from_xml(infile)
        common_annotations = set(document.annotations.keys()) & set(
            system.annotations.keys())
        if len(common_annotations) == 1 and annotation_name is None:
            annotation_name = list(common_annotations)[0]
        if annotation_name is None:
            raise RuntimeError(
                "Could not find an annotation set to evaluate: please provide one"
            )
        L = document.annotation(annotation_name).get_reference_annotations()
        R = system.annotation(annotation_name).get_reference_annotations()
    else:
        raise RuntimeError("format not handled: {0}".format(input_format))

    len_ref = len(L)
    len_tag = len(R)
    d = {
        CORRECT: [],
        TYPE_ERROR: [],
        BOUNDARY_ERROR: [],
        TYPE_AND_BOUNDARY_ERROR: [],
        SILENCE_ERROR: [],
        NOISE_ERROR: [],
    }

    # first pass, removing correct
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR == RR:
                del L[i]
                del R[j]
                i -= 1
                d[CORRECT].append([LR, RR])
                break
            j += 1
        i += 1

    # second pass, typing errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and LR.lb == RR.lb and LR.ub == RR.ub:
                del L[i]
                del R[j]
                # FIX: compensate for the deletion, as every other pass does;
                # without it the element following each match was skipped.
                i -= 1
                d[TYPE_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # third pass, boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value == RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # fourth pass, both type and boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            # FIX: the original condition lacked the outer parentheses, so
            # "and" bound tighter than "or" and entries with matching bounds
            # on the left side could be misclassified.
            if LR.value != RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_AND_BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # whatever remains is silence (missed reference) or noise (spurious output)
    d[SILENCE_ERROR] = L[:]
    d[NOISE_ERROR] = R[:]

    # collect the set of entity types seen in any matched pair or leftover tag
    entities = set()
    for error_list in d.values():
        for entry in error_list:
            try:
                gold, guess = entry
                entities.add(gold.value)
                entities.add(guess.value)
            except (TypeError, ValueError, AttributeError):
                # silence/noise entries are single tags, not [gold, guess] pairs
                entities.add(entry.value)

    # optional per-error dump with surrounding context
    # NOTE(review): guarded so a missing --dump no longer crashes on open(None)
    if dump:
        with codecs.open(dump, "w", "utf-8") as O:
            O.write(u"error kind\treference entity\toutput entity\tdiff\n")
            for error_kind in (TYPE_ERROR, BOUNDARY_ERROR,
                               TYPE_AND_BOUNDARY_ERROR, NOISE_ERROR,
                               SILENCE_ERROR):
                for ex in d[error_kind]:
                    if error_kind == NOISE_ERROR:
                        gold = None
                        guess = ex
                    elif error_kind == SILENCE_ERROR:
                        gold = ex
                        guess = None
                    else:
                        gold = ex[0]
                        guess = ex[1]
                    gold_str = (u"{0}:{1}".format(
                        gold.value, document.content[gold.lb:gold.ub])
                        if gold else "").replace("\r", "").replace("\n", " ")
                    guess_str = (u"{0}:{1}".format(
                        guess.value, document.content[guess.lb:guess.ub])
                        if guess else "").replace("\r", "").replace("\n", " ")
                    diff = get_diff(document.content,
                                    gold,
                                    guess,
                                    error_kind,
                                    context_size=context_size)
                    O.write(u"{0}\t{1}\t{2}\t{3}\n".format(
                        error_kind, gold_str, guess_str, diff))

    # per-entity breakdown of every error category
    counts = {}
    for entity in entities:
        sub_d = {}
        sub_d[CORRECT] = [m for m in d[CORRECT] if m[0].value == entity]
        sub_d[TYPE_ERROR] = [
            m for m in d[TYPE_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[BOUNDARY_ERROR] = [
            m for m in d[BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[TYPE_AND_BOUNDARY_ERROR] = [
            m for m in d[TYPE_AND_BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[NOISE_ERROR] = [m for m in d[NOISE_ERROR] if m.value == entity]
        sub_d[SILENCE_ERROR] = [
            m for m in d[SILENCE_ERROR] if m.value == entity
        ]
        counts[entity] = sub_d

    # basic counts
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        for kind in OUTPUT_KINDS:
            print(u"{0}\t{1}\t{2}".format(entity, kind,
                                          len(counts[entity][kind])))
    print(u"global\treference\t{0}".format(len_ref))
    print(u"global\ttagging\t{0}".format(len_tag))
    for kind in OUTPUT_KINDS:
        print(u"global\t{0}\t{1}".format(kind, len(d[kind])))

    # P R F
    precisions = []
    recalls = []
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        precisions.append(precision(counts[entity]))
        recalls.append(recall(counts[entity]))
        print(u"{0}\tprecision\t{1:.4f}".format(entity, precisions[-1]))
        print(u"{0}\trecall\t{1:.4f}".format(entity, recalls[-1]))
        print(u"{0}\tfscore\t{1:.4f}".format(
            entity, fscore(precision(counts[entity]),
                           recall(counts[entity]))))
    print(u"global\tprecision\t{0:.4f}".format(precision(d)))
    print(u"global\trecall\t{0:.4f}".format(recall(d)))
    print(u"global\tfscore\t{0:.4f}".format(fscore(precision(d), recall(d))))
    print(u"global\tmacro-precision\t{0:.4f}".format(mean(precisions)))
    print(u"global\tmacro-recall\t{0:.4f}".format(mean(recalls)))
    print(u"global\tmacro-fscore\t{0:.4f}".format(
        fscore(mean(precisions), mean(recalls))))

    # over/under generation, substitution
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        print(u"{0}\tundergeneration\t{1:.4f}".format(
            entity, undergeneration(counts[entity])))
        print(u"{0}\tovergeneration\t{1:.4f}".format(
            entity, overgeneration(counts[entity])))
        print(u"{0}\tsubstitution\t{1:.4f}".format(
            entity, substitution(counts[entity])))
    print(u"global\tundergeneration\t{0:.4f}".format(undergeneration(d)))
    print(u"global\tovergeneration\t{0:.4f}".format(overgeneration(d)))
    print(u"global\tsubstitution\t{0:.4f}".format(substitution(d)))