Example #1
def json_data(data):
    # Build the document from the top-level fields, then attach metadata.
    document = Document(data.get(u"name", u"_DOCUMENT_"),
                        content=data.get(u"content", u""))
    for key, value in data.get(u"metadatas", {}).items():
        document.add_metadata(key, value)

    for segmentation_name in data.get(u"segmentations", {}):
        d = data[u"segmentations"][segmentation_name]
        # Spans are serialised as {"s": start, "l": length}; the upper bound
        # is recomputed from the length.
        spans = [
            Span(lb=span[u"s"], ub=0, length=span[u"l"])
            for span in d[u"spans"]
        ]
        segmentation = Segmentation(segmentation_name,
                                    spans=spans,
                                    reference=d.get(u"reference", None))
        document.add_segmentation(segmentation)
    # Resolve segmentation references from names to segmentation objects.
    for segmentation in document.segmentations:
        if segmentation.reference is not None:
            segmentation.reference = document.segmentation(
                segmentation.reference)

    for annotation_name in data.get(u"annotations", {}):
        d = data[u"annotations"][annotation_name]
        # Tags are serialised as {"s": start, "l": length, "v": value}.
        annotations = [
            Tag(lb=annotation[u"s"],
                ub=0,
                length=annotation[u"l"],
                value=annotation[u"v"]) for annotation in d[u"annotations"]
        ]
        annotation = Annotation(annotation_name,
                                reference=document.segmentation(
                                    d[u"reference"]),
                                annotations=annotations)
        document.add_annotation(annotation)

    return document
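The JSON layout the reader expects can be inferred from the key accesses above. A minimal sketch of a compatible payload (the field names come from the code, the values are illustrative):

payload = {
    u"name": u"sample",
    u"content": u"John lives in Paris.",
    u"metadatas": {u"lang": u"en"},
    u"segmentations": {
        u"tokens": {u"spans": [{u"s": 0, u"l": 4}, {u"s": 5, u"l": 5}]}
    },
    u"annotations": {
        u"NER": {
            u"reference": u"tokens",
            u"annotations": [{u"s": 0, u"l": 1, u"v": u"Person"}]
        }
    }
}
document = json_data(payload)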
Example #2
def main(indirnames, outfilename, default_shift=0, top_level=False):
    dirs = []
    for indirname in indirnames:
        dirs.extend([
            os.path.join(indirname, name)
            for name in sorted(os.listdir(indirname))
            if os.path.isdir(os.path.join(indirname, name))
        ])

    contents = []
    annotations = []
    shift = default_shift  # start from the caller-supplied offset
    for dirname in dirs:
        # Thread the running character offset through make_data so that
        # annotation bounds stay aligned once the contents are concatenated.
        cur_contents, cur_annotations, cur_shift = make_data(
            dirname, default_shift=shift, top_level=top_level)
        contents.extend(cur_contents)
        annotations.extend(cur_annotations)
        shift = cur_shift

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
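A hypothetical invocation (paths are illustrative): main(["corpus_a", "corpus_b"], "merged") walks the immediate subdirectories of both input directories and writes merged.ann and merged.txt, the standoff/text file pair that brat expects.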
Example #3
def main(indirname, outfilename, default_shift=0, top_level=False):
    contents, annotations, shift = make_data(indirname,
                                             default_shift=default_shift,
                                             top_level=top_level)

    document = Document("_doc_", content=(u"\n" * NUM_NEWLINES).join(contents))
    document.add_annotation(Annotation("NER", annotations=annotations))
    exporter = BratExporter()
    with codecs.open(outfilename + ".ann", "w", "utf-8") as O:
        O.write(exporter.document_to_unicode(document, {"ner": "NER"}))
    with codecs.open(outfilename + ".txt", "w", "utf-8") as O:
        O.write(document.content)
Example #4
def brat_file(filename, encoding="utf-8"):
    no_ext, ext = os.path.splitext(filename)
    txt_file = no_ext + ".txt"
    ann_file = no_ext + ".ann"
    if not (os.path.exists(txt_file) and os.path.exists(ann_file)):
        raise ValueError("missing either .ann or .txt file")

    document = Document(os.path.basename(txt_file),
                        encoding=encoding,
                        mime_type="text/plain")
    # mode "rU" was removed in Python 3.11; carriage returns are stripped
    # manually instead.
    document.content = codecs.open(txt_file, "r",
                                   encoding).read().replace(u"\r", u"")
    annotations = Annotation("NER")
    # Text-bound annotations in a brat .ann file look like
    # "T1<TAB>Type start end[;start end]<TAB>surface text"; discontinuous
    # spans (fragments separated by ";") yield one Tag per fragment.
    for line in codecs.open(ann_file, "r", encoding):
        line = line.strip()
        if line != u"" and line.startswith(u"T"):
            parts = line.split(u"\t")
            value, bounds = parts[1].split(" ", 1)
            for bound in bounds.split(";"):
                lb, ub = bound.split()
                annotations.append(Tag(lb=int(lb), ub=int(ub), value=value))
    annotations.sort()
    document.add_annotation(annotations)

    return document
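A minimal usage sketch, assuming a doc.txt/doc.ann pair sitting side by side (the file names are illustrative):

document = brat_file("doc.ann")  # the .txt path works just as well
for tag in document.annotation("NER").annotations:
    print(tag.value, document.content[tag.lb:tag.ub])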
Example #5
def gate_data(data, name=None):
    document = Document(name or "__DOCUMENT__", mime_type="text/plain")

    textwithnodes = data.findall("TextWithNodes")[0]
    annotation_sets = data.findall("AnnotationSet")

    # <Node id="..."/> elements are offset anchors: map each node id to its
    # character position in the reconstructed text.
    text_parts = [textwithnodes.text or u""]
    nodes = {}
    for node in list(textwithnodes):
        nodes[int(node.attrib["id"])] = sum(len(part) for part in text_parts)
        text_parts.append(node.tail or u"")
    document.content = u"".join(text_parts)

    annotations = []
    for annotation_set in annotation_sets:
        annotation_name = annotation_set.attrib["Name"]
        sem_annotation = Annotation(annotation_name)
        for annotation in annotation_set:
            lb = nodes[int(annotation.attrib["StartNode"])]
            ub = nodes[int(annotation.attrib["EndNode"])]
            sem_annotation.append(Tag(lb, ub, annotation.attrib["Type"]))
        document.add_annotation(sem_annotation)

    return document
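A usage sketch with xml.etree.ElementTree; the XML below is a minimal stand-in for a real GATE document, containing only the elements and attributes the function actually reads:

import xml.etree.ElementTree as ET

GATE_XML = u"""<GateDocument>
<TextWithNodes><Node id="0"/>John<Node id="4"/> lives here.</TextWithNodes>
<AnnotationSet Name="NER">
<Annotation Type="Person" StartNode="0" EndNode="4"/>
</AnnotationSet>
</GateDocument>"""

document = gate_data(ET.fromstring(GATE_XML), name="sample")
# document.content == u"John lives here."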
Example #6
    def process_document(self, document, **kwargs):
        """
        Updates a document with various segmentations and creates
        an sem.corpus (CoNLL-formatted data) using field argument as index.
        
        Parameters
        ----------
        document : sem.storage.Document
            the input data. It is a document with only a content
        log_level : str or int
            the logging level
        log_file : str
            if not None, the file to log to (does not remove command-line
            logging).
        """

        start = time.time()

        if self._log_file is not None:
            map_annotations_logger.addHandler(file_handler(self._log_file))
        map_annotations_logger.setLevel(self._log_level)

        ref_annotation = document.annotation(self._annotation_name)
        ref_annotations = ref_annotation.annotations
        new_annotations = [
            Tag(annotation.lb, annotation.ub,
                self._mapping.get(annotation.value, annotation.value))
            for annotation in ref_annotations
            if self._mapping.get(annotation.value, None) != u""
        ]

        document.add_annotation(
            Annotation(self._annotation_name,
                       reference=ref_annotation.reference,
                       annotations=new_annotations))

        laps = time.time() - start
        map_annotations_logger.info("in %s", timedelta(seconds=laps))
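A note on the mapping semantics above: tag values found in self._mapping are rewritten to their mapped value, values absent from the mapping pass through unchanged, and a value explicitly mapped to the empty string drops the tag. The remapped annotation set is then registered under the same name as the original.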
Example #7
def main(args):
    infile = args.infile
    reference_column = args.reference_column
    tagging_column = args.tagging_column
    ienc = args.ienc or args.enc
    oenc = args.oenc or args.enc
    verbose = args.verbose
    input_format = args.input_format
    reference_file = args.reference_file
    annotation_name = args.annotation_name
    dump = args.dump
    context_size = args.context_size

    if input_format == "conll":
        if reference_file:
            print(u"reference_file not handled for CoNLL files")
        L = []
        R = []
        keys = None
        nth = -1
        for n_line, p in Reader(infile, ienc).line_iter():
            nth += 1
            keys = keys or range(len(p[0]))
            L.extend(
                annotation_from_sentence(p,
                                         column=reference_column,
                                         shift=n_line - nth))
            R.extend(
                annotation_from_sentence(p,
                                         column=tagging_column,
                                         shift=n_line - nth))
        document = sem.importers.conll_file(infile,
                                            keys,
                                            keys[0],
                                            encoding=ienc)
        L = Annotation("",
                       annotations=L,
                       reference=document.segmentation(
                           "tokens")).get_reference_annotations()
        R = Annotation("",
                       annotations=R,
                       reference=document.segmentation(
                           "tokens")).get_reference_annotations()
    elif input_format == "brat":
        document = sem.importers.brat_file(reference_file)
        L = document.annotation("NER").get_reference_annotations()
        R = sem.importers.brat_file(infile).annotation(
            "NER").get_reference_annotations()
    elif input_format in ("sem", "SEM"):
        document = Document.from_xml(reference_file)
        system = Document.from_xml(infile)
        common_annotations = set(document.annotations.keys()) & set(
            system.annotations.keys())
        if len(common_annotations) == 1 and annotation_name is None:
            annotation_name = list(common_annotations)[0]
        if annotation_name is None:
            raise RuntimeError(
                "Could not find an annotation set to evaluate: please provide one"
            )
        L = document.annotation(annotation_name).get_reference_annotations()
        R = system.annotation(annotation_name).get_reference_annotations()
    else:
        raise RuntimeError("format not handled: {0}".format(input_format))

    len_ref = len(L)
    len_tag = len(R)
    d = {
        CORRECT: [],
        TYPE_ERROR: [],
        BOUNDARY_ERROR: [],
        TYPE_AND_BOUNDARY_ERROR: [],
        SILENCE_ERROR: [],
        NOISE_ERROR: []
    }
    # first pass, removing correct
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR == RR:
                del L[i]
                del R[j]
                i -= 1
                d[CORRECT].append([LR, RR])
                break
            j += 1
        i += 1

    # second pass, typing errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and LR.lb == RR.lb and LR.ub == RR.ub:
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # third pass, boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value == RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    # fourth pass, both type and boundary errors
    i = 0
    while i < len(L):
        LR = L[i]
        j = 0
        while j < len(R):
            RR = R[j]
            if LR.value != RR.value and ((LR.lb != RR.lb and LR.ub == RR.ub) or
                                         (LR.lb == RR.lb and LR.ub != RR.ub)):
                del L[i]
                del R[j]
                i -= 1
                d[TYPE_AND_BOUNDARY_ERROR].append([LR, RR])
                break
            j += 1
        i += 1

    d[SILENCE_ERROR] = L[:]
    d[NOISE_ERROR] = R[:]

    entities = set()
    for error_list in d.values():
        for e in error_list:
            try:
                # correct/type/boundary entries are [gold, guess] pairs
                gold, guess = e
                entities.add(gold.value)
                entities.add(guess.value)
            except (TypeError, ValueError):
                # noise and silence entries are single tags
                entities.add(e.value)

    with codecs.open(dump, "w", "utf-8") as O:
        O.write(u"error kind\treference entity\toutput entity\tdiff\n")
        for error_kind in (TYPE_ERROR, BOUNDARY_ERROR, TYPE_AND_BOUNDARY_ERROR,
                           NOISE_ERROR, SILENCE_ERROR):
            for ex in d[error_kind]:
                if error_kind == NOISE_ERROR:
                    gold = None
                    guess = ex
                elif error_kind == SILENCE_ERROR:
                    gold = ex
                    guess = None
                else:
                    gold = ex[0]
                    guess = ex[1]
                gold_str = (u"{0}:{1}".format(
                    gold.value, document.content[gold.lb:gold.ub]) if gold else
                            "").replace("\r", "").replace("\n", " ")
                guess_str = (u"{0}:{1}".format(
                    guess.value, document.content[guess.lb:guess.ub]) if guess
                             else "").replace("\r", "").replace("\n", " ")
                diff = get_diff(document.content,
                                gold,
                                guess,
                                error_kind,
                                context_size=context_size)
                O.write(u"{0}\t{1}\t{2}\t{3}\n".format(error_kind, gold_str,
                                                       guess_str, diff))

    counts = {}
    for entity in entities:
        sub_d = {}
        sub_d[CORRECT] = [m for m in d[CORRECT] if m[0].value == entity]
        sub_d[TYPE_ERROR] = [
            m for m in d[TYPE_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[BOUNDARY_ERROR] = [
            m for m in d[BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[TYPE_AND_BOUNDARY_ERROR] = [
            m for m in d[TYPE_AND_BOUNDARY_ERROR]
            if m[0].value == entity or m[1].value == entity
        ]
        sub_d[NOISE_ERROR] = [m for m in d[NOISE_ERROR] if m.value == entity]
        sub_d[SILENCE_ERROR] = [
            m for m in d[SILENCE_ERROR] if m.value == entity
        ]
        counts[entity] = sub_d

    # basic counts
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        for kind in OUTPUT_KINDS:
            print(u"{0}\t{1}\t{2}".format(entity, kind,
                                          len(counts[entity][kind])))
    print(u"global\treference\t{0}".format(len_ref))
    print(u"global\ttagging\t{0}".format(len_tag))
    for kind in OUTPUT_KINDS:
        print(u"global\t{0}\t{1}".format(kind, len(d[kind])))

    # P R F
    precisions = []
    recalls = []
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        precisions.append(precision(counts[entity]))
        recalls.append(recall(counts[entity]))
        print(u"{0}\tprecision\t{1:.4f}".format(entity, precisions[-1]))
        print(u"{0}\trecall\t{1:.4f}".format(entity, recalls[-1]))
        print(u"{0}\tfscore\t{1:.4f}".format(
            entity, fscore(precision(counts[entity]), recall(counts[entity]))))
    print(u"global\tprecision\t{0:.4f}".format(precision(d)))
    print(u"global\trecall\t{0:.4f}".format(recall(d)))
    print(u"global\tfscore\t{0:.4f}".format(fscore(precision(d), recall(d))))
    print(u"global\tmacro-precision\t{0:.4f}".format(mean(precisions)))
    print(u"global\tmacro-recall\t{0:.4f}".format(mean(recalls)))
    print(u"global\tmacro-fscore\t{0:.4f}".format(
        fscore(mean(precisions), mean(recalls))))

    # over/under generation, substitution
    print()
    print(u"entity\tmeasure\tvalue")
    for entity in sorted(entities):
        print(u"{0}\tundergeneration\t{1:.4f}".format(
            entity, undergeneration(counts[entity])))
        print(u"{0}\tovergeneration\t{1:.4f}".format(
            entity, overgeneration(counts[entity])))
        print(u"{0}\tsubstitution\t{1:.4f}".format(
            entity, substitution(counts[entity])))
    print(u"global\tundergeneration\t{0:.4f}".format(undergeneration(d)))
    print(u"global\tovergeneration\t{0:.4f}".format(overgeneration(d)))
    print(u"global\tsubstitution\t{0:.4f}".format(substitution(d)))