Example #1
def confusion(true_brat_dir, pred_brat_dir):
    # cm[pred_label][true_label]: number of predicted spans whose character
    # offsets match a gold span with true_label ('None' = no matching gold span)
    cm = {}

    for pred_label in ENTITIES:
        cm[pred_label] = {}
        for true_label in ENTITIES + ('None', ):
            cm[pred_label][true_label] = 0

    for true_fname, pred_fname in zip(sorted_glob(true_brat_dir + '/*.ann'),
                                      sorted_glob(pred_brat_dir + '/*.ann')):
        true_annots = read_true_brat_annots(true_fname)

        for line in open(pred_fname):
            if line.startswith('T'):
                part = line.split('\t')[1]
                pred_label, start, end = part.split()
                true_label = true_annots.get((start, end), 'None')
                cm[pred_label][true_label] += 1

    l = 16 * ' '
    for true_label in ENTITIES + ('None', ):
        l += '{:>16}'.format(true_label)
    print(l)

    for pred_label in ENTITIES:
        l = '{:16}'.format(pred_label)

        for true_label in ENTITIES + ('None', ):
            l += '{:16}'.format(cm[pred_label][true_label])
        print(l)
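A minimal usage sketch, assuming confusion() and its helpers (ENTITIES, sorted_glob, read_true_brat_annots) are importable from the project; the directory paths below are placeholders:

# placeholder paths to directories of gold and predicted .ann files
confusion('_train/brat_true', '_train/brat_pred')
# prints a matrix with predicted labels as rows and true labels
# (plus 'None' for predictions without a matching gold span) as columns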
Example #2
def convert(crfplus_dirs, true_iob_dir, pred_iob_dir):
    makedirs(pred_iob_dir, exist_ok=True)

    for iob_fname in sorted_glob(join(true_iob_dir, '*.json')):
        try:
            doc_iob = json.load(open(iob_fname))
            base_name = basename(iob_fname)

            for label in ENTITIES:
                crfplus_fname = join(
                    crfplus_dirs[label],
                    base_name.replace('.json', '.txt'))
                f = open(crfplus_fname)

                for sent_iob in doc_iob:
                    for tok_iob in sent_iob:
                        line = next(f)
                        pred_tag = line.split('\t')[2].strip()
                        tok_iob[label] = pred_tag
                    next(f)

            pred_iob_fname = join(pred_iob_dir, base_name)
            json.dump(doc_iob, open(pred_iob_fname, 'w'),
                      indent=4, sort_keys=True, ensure_ascii=False)
        except Exception as err:
            print('*** ERROR ***', err)
            print(crfplus_fname)
            print(line)
            print()
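For reference, the parsing loop above only assumes that each CRF++ output line carries the predicted tag in its third tab-separated column and that sentences are separated by a blank line. A minimal sketch of that assumption (the sample line is made up):

sample_line = 'materials\tB\tI\n'               # e.g. token, gold tag, predicted tag
pred_tag = sample_line.split('\t')[2].strip()
print(pred_tag)                                 # -> 'I'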
Example #3
File: crf.py Project: phychaos/scienceie17
def collect_crf_data(iob_dir, *feat_dirs):
    # *** DEPRECATED *** Do not use in new experiments!
    """
    Collect the data to train/eval the CRF classifier.
    Labels for entities are derived from the IOB tags in the files in iob_dir.
    Features are collected from the JSON files in one or more feat_dirs.
    Filenames are the basenames of the IOB files.
    """
    data = dict((label, list()) for label in ENTITIES)
    data['feats'] = []
    data['filenames'] = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        text_iob = json.load(open(iob_fname))

        filename = basename(iob_fname)
        feat_filenames = [join(dir, filename) for dir in feat_dirs]
        text_feat = Features.from_file(*feat_filenames)
        assert len(text_iob) == len(text_feat)
        data['feats'] += text_feat

        for label in ENTITIES:
            data[label] += _text_iob_tags(text_iob, label)

        data['filenames'] += len(text_iob) * [filename]

    return data
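A sketch of how the returned dictionary might be inspected; the directory names are placeholders and the function is assumed to be importable from the project's crf.py:

data = collect_crf_data('_train/iob', '_train/feats')
print(sorted(data.keys()))                           # entity labels plus 'feats' and 'filenames'
print(len(data['feats']), len(data['filenames']))    # parallel lists, one entry per sentence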
Example #4
def iob_to_brat(iob_dir, txt_dir, brat_dir):
    # create the output directory once, not on every iteration
    makedirs(brat_dir, exist_ok=True)

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        spans = get_text_spans(iob_fname)
        # need text file for correct whitespace
        txt_fname = join(txt_dir, splitext(basename(iob_fname))[0] + '.txt')
        text = open(txt_fname).read()
        brat_fname = join(brat_dir, splitext(basename(iob_fname))[0] + '.ann')
        write_brat_file(brat_fname, spans, text)
Example #5
def postproc_brat(in_brat_dir, txt_dir, out_brat_dir):
    makedirs(out_brat_dir, exist_ok=True)

    for in_brat_fname in sorted_glob(join(in_brat_dir, '*.ann')):
        spans = read_brat_file(in_brat_fname)
        txt_fname = join(txt_dir,
                         basename(in_brat_fname).replace('.ann', '.txt'))
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        phrase2annots = get_phrase_annots(spans, text)

        for phrase, annots in phrase2annots.items():
            counts = Counter(span.label for span in annots)
            most_common = counts.most_common()
            if len(most_common) > 1:
                if most_common[0][1] > most_common[1][1]:
                    print('--> found majority label for phrase', repr(phrase),
                          ':', counts)
                    majority_label = most_common[0][0]
                    # iterate over a copy: removing items from annots while
                    # iterating over it would skip elements
                    for span in list(annots):
                        if span.label != majority_label:
                            print('==> removing', span)
                            annots.remove(span)
                            spans.remove(span)
                else:
                    best_label = 'Material' if 'Material' in counts else 'Process'
                    print('--> found best label for phrase', repr(phrase), ':',
                          counts)
                    # again iterate over a copy to avoid skipping elements
                    for span in list(annots):
                        if span.label != best_label:
                            print('==> removing', span)
                            annots.remove(span)
                            spans.remove(span)

            # now all labels in annots are the same
            unique_label = annots[0].label
            for m in re.finditer(re.escape(phrase), text):
                # skip matches preceded by a letter (part of a longer word);
                # a negative index here would silently wrap to the end of text
                if m.start() > 0 and text[m.start() - 1].isalpha():
                    continue

                try:
                    if text[m.end()].isalpha():
                        continue
                except IndexError:
                    pass

                span = Span(unique_label, m.start(), m.end())
                if span not in annots:
                    print(annots)
                    print('==> adding span', span, 'for phrase', repr(phrase))
                    spans.append(span)

        out_brat_fname = join(out_brat_dir, basename(in_brat_fname))
        write_brat_file(out_brat_fname, spans, text)
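A worked micro-example of the majority/tie logic above, using only collections.Counter:

from collections import Counter

counts = Counter(['Material', 'Material', 'Process'])
print(counts.most_common())   # [('Material', 2), ('Process', 1)] -> clear majority

counts = Counter(['Material', 'Process'])
print(counts.most_common())   # [('Material', 1), ('Process', 1)] -> tie; the code
                              # above then prefers 'Material' over 'Process'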
Example #6
def get_entity_lempos_counts(iob_dir, spacy_dir, nlp=None, use_pickle=True):
    pickle_fname = '_counts.pkl'

    if use_pickle:
        try:
            return pickle.load(open(pickle_fname, 'rb'))
        except IOError:
            pass

    iob_fnames = sorted_glob(join(iob_dir, '*'))
    spacy_fnames = sorted_glob(join(spacy_dir, '*'))
    counts = {}

    if not nlp:
        nlp = spacy.load('en')

    for iob_fname, spacy_fname in zip(iob_fnames, spacy_fnames):
        iob_doc = json.load(open(iob_fname, encoding='utf8'))
        spacy_doc = read_doc(spacy_fname, nlp)
        count = dict((e, Counter()) for e in LABELS)

        for spacy_sent, iob_sent in zip(spacy_doc.sents, iob_doc):
            for spacy_tok, iob_tok in zip(spacy_sent, iob_sent):
                if spacy_tok.pos_ in CONTENT_POS and not spacy_tok.is_stop:
                    lempos = spacy_tok.lemma_ + '#' + spacy_tok.pos_
                    other = True

                    for label in ENTITIES:
                        if iob_tok[label] in 'BI':
                            count[label][lempos] += 1
                            other = False

                    if other:
                        count['Other'][lempos] += 1

        counts[basename(iob_fname)] = count

    pickle.dump(counts, open(pickle_fname, 'wb'))

    return counts
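The result is a nested mapping: for each IOB file, one Counter of 'lemma#POS' keys per label. A hedged inspection sketch, assuming the labels include 'Material' and using placeholder directory paths:

counts = get_entity_lempos_counts('_train/iob', '_train/spacy')
some_file = next(iter(counts))
print(counts[some_file]['Material'].most_common(5))   # top lemma#POS pairs for Material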
Example #7
File: crf.py Project: phychaos/scienceie17
def collect_features(iob_dir, *feat_dirs):
    """
    Collect the features to train/eval the CRF classifier from the JSON files in one or more feat_dirs.
    """
    feats = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        filename = basename(iob_fname)
        feat_filenames = [join(dir, filename) for dir in feat_dirs]
        text_feat = Features.from_file(*feat_filenames)
        feats += text_feat

    return feats
Example #8
def postproc_labels(in_iob_dir, out_iob_dir):
    makedirs(out_iob_dir, exist_ok=True)

    for in_iob_fname in sorted_glob(join(in_iob_dir, '*.json')):
        print('reading ' + in_iob_fname)
        text_iob = json.load(open(in_iob_fname))
        tokens2labels = get_token_labels(text_iob)
        resolve_labels(tokens2labels)
        relabel(text_iob, tokens2labels)

        out_iob_fname = join(out_iob_dir, basename(in_iob_fname))

        with open(out_iob_fname, 'w') as outf:
            print('writing ' + out_iob_fname)
            json.dump(text_iob, outf, indent=4, sort_keys=True, ensure_ascii=False)
Example #9
def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with the Spacy NLP pipeline and
    serialize the analyses to directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)
Example #10
File: crf.py Project: phychaos/scienceie17
def generate_labels(iob_dir, labels_fname):
    """
    Generate labels to train/eval CRF classifier.
    Labels for entities are derived from IOB tags in the files in the iob_dir.
    Filenames are the basenames of the IOB files (used for creating folds and
    converting CRF predictions back to IOB files).
    Saved as a pickled dict with keys for all entity labels plus the special key __filenames__.
    """
    labels = dict((label, list()) for label in ENTITIES)
    labels['__filenames__'] = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        text_iob = json.load(open(iob_fname))
        filename = basename(iob_fname)

        for label in ENTITIES:
            labels[label] += _text_iob_tags(text_iob, label)

        labels['__filenames__'] += len(text_iob) * [filename]

    print('writing labels to file ' + labels_fname)
    pickle.dump(labels, open(labels_fname, 'wb'))
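A sketch of reading the pickled labels back; the filename is a placeholder that should match whatever was passed as labels_fname:

import pickle

labels = pickle.load(open('_train/labels.pkl', 'rb'))
print(sorted(labels.keys()))          # entity labels plus '__filenames__'
print(labels['__filenames__'][:3])    # one basename per sentence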
Example #11
def generate_feats(spacy_dir, feat_dir, feat_func, nlp=None):
    """
    Generate features and save to file

    :param spacy_dir: dir with serialized Spacy analyses
    :param feat_dir: output dir for generated feature files in json format
    :param feat_func: function that generates the features for a single Spacy sentence
    :return:
    """
    if not nlp:
        nlp = spacy.load('en')
    makedirs(feat_dir, exist_ok=True)

    for spacy_fname in sorted_glob(join(spacy_dir, '*.spacy')):
        doc = read_doc(spacy_fname, nlp)

        feat_fname = join(feat_dir,
                          splitext(basename(spacy_fname))[0] + '.json')

        text_feats = Features([feat_func(sent) for sent in doc.sents])

        text_feats.to_file(feat_fname)
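A minimal sketch of a feat_func, assuming (as in the calling code) that it receives one Spacy sentence and returns one feature dict per token; the feature keys here are illustrative, not the project's actual schema:

def simple_feat_func(sent):
    # one dict of features per token in the sentence
    return [{'word': tok.text.lower(), 'pos': tok.pos_, 'is_title': tok.is_title}
            for tok in sent]

# hypothetical call with placeholder directories:
# generate_feats('_train/spacy', '_train/feats_simple', simple_feat_func)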
Example #12
from os.path import join, basename
from glob import glob
import json

from sie import LOCAL_DIR
from sie.utils import sorted_glob

true_iob_dir = join(LOCAL_DIR, 'train', 'iob')
synvec_feats_dir = join('_train', 'synvec_feats')

for iob_fname in sorted_glob(join(true_iob_dir, '*'))[10:12]:
    doc_iob = json.load(open(iob_fname))
    synvec_fname = join(synvec_feats_dir, basename(iob_fname))
    doc_feats = json.load(open(synvec_fname))

    for sent_iob, sent_feats in zip(doc_iob, doc_feats):
        for token_iob, token_feats in zip(sent_iob, sent_feats):
            if 'Pred' in token_feats['synvec']:
                print('{:20} {:10} {:10} {:10} {:10}'.format(
                    token_iob['token'],
                    'Material' if token_iob['Material'] != 'O' else '-',
                    'Process' if token_iob['Process'] != 'O' else '-',
                    'Task' if token_iob['Task'] != 'O' else '-',
                    token_feats['synvec']['Pred']))
            else:
                print(
                    '{:20} {:10} {:10} {:10} Material={:.2f}        Process={:.2f}        Task={:.2f}        Other={:.2f}'
                    .format(
                        token_iob['token'],
                        'Material' if token_iob['Material'] != 'O' else '-',
                        'Process' if token_iob['Process'] != 'O' else '-',
                        'Task' if token_iob['Task'] != 'O' else '-',
                        # remaining arguments assumed from the format string above
                        token_feats['synvec']['Material'],
                        token_feats['synvec']['Process'],
                        token_feats['synvec']['Task'],
                        token_feats['synvec']['Other']))
Example #13
def generate_iob_tags(ann_dir, spacy_dir, iob_dir, nlp=None):
    """
    Generate files with IOB tags from Brat .ann files in ann_dir,
    Spacy serialized analyses in spacy_dir, writing the output files to iob_dir
    """
    #TODO This does not correctly handle embedded entities of the same type!
    # E.g. Material inside another Material

    if not nlp:
        nlp = spacy.load('en')

    makedirs(iob_dir, exist_ok=True)
    correct = incorrect = 0
    txt_count = ann_count = iob_count = 0

    for txt_fname in sorted_glob(join(ann_dir, '*.txt')):
        txt_count += 1
        spacy_fname = join(spacy_dir, splitext(basename(txt_fname))[0] + '.spacy')
        doc = read_doc(spacy_fname, nlp)
        char2token = map_chars_to_tokens(doc)

        iob_tags = {}
        for label in ENTITIES:
            iob_tags[label] = len(doc) * ['O']

        ann_fname = txt_fname.replace('.txt', '.ann')

        if exists(ann_fname):
            print('reading ' + ann_fname)
            ann_count += 1

            for line in open(ann_fname):
                if line.startswith('T'):
                    try:
                        label, begin_char, end_char = line.split('\t')[1].split()
                    except ValueError:
                        print('Oh no! Malformed annotation:\n' + line)
                        continue

                    begin_char, end_char = int(begin_char), int(end_char)
                    start_token = char2token[begin_char]
                    end_token = char2token[end_char]

                    # the label value is irrelevant here; the Span is only used
                    # to compare its character offsets against the .ann offsets
                    span = Span(doc, start_token, end_token, label=14)

                    if span.start_char != begin_char or span.end_char != end_char:
                        print('BRAT SPAN:   ', doc.text[begin_char:end_char])
                        print('SPACY SPAN:  ', span)
                        toks = [t.text
                                for t in doc[max(0, start_token - 3):end_token + 3]]
                        print('SPACY TOKENS:', toks)
                        print()
                        incorrect += 1
                    else:
                        iob_tags[label][start_token] = 'B'
                        for i in range(start_token + 1, end_token):
                            iob_tags[label][i] = 'I'
                        correct += 1
        else:
            # test data has no annotation
            print('WARNING: no annotation file ' + ann_fname)

        iob_fname = join(iob_dir, splitext(basename(ann_fname))[0] + '.json')
        iob_count += 1
        write_iob_file(iob_fname, doc, iob_tags)

    print('\n#successful spans: {}\n#failed spans: {}'.format(correct, incorrect))
    print('\n#txt files: {}\n#ann files: {}\n#iob files: {}'.format(txt_count, ann_count, iob_count))
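As a concrete illustration of the tagging step above: a gold span that maps to the half-open token range start_token..end_token gets a 'B' on its first token and 'I' on the rest, leaving all other tokens 'O'. A tiny self-contained sketch:

n_tokens = 6
tags = n_tokens * ['O']
start_token, end_token = 2, 5          # half-open range covering three tokens
tags[start_token] = 'B'
for i in range(start_token + 1, end_token):
    tags[i] = 'I'
print(tags)                            # ['O', 'O', 'B', 'I', 'I', 'O']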
Example #14
def add_entities(ann_dir, spacy_dir, ents_dir=None, nlp=None):
    """
    Add Material, Process and Task entities from .ann files as entity spans
    to the corresponding serialized Spacy analyses in directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    if ents_dir:
        makedirs(ents_dir, exist_ok=True)

    register_entities(nlp)
    correct = incorrect = 0

    for ann_fname in sorted_glob(join(ann_dir, '*.ann')):
        print('reading ' + ann_fname)
        spacy_fname = join(spacy_dir,
                           splitext(basename(ann_fname))[0] + '.spacy')
        doc = read_doc(spacy_fname, nlp)

        # see https://github.com/spacy-io/spaCy/issues/461
        entities = [(e.label, e.start, e.end) for e in doc.ents]

        char2token = map_chars_to_tokens(doc)

        for line in open(ann_fname):
            if line.startswith('T'):
                try:
                    label, begin, end = line.split('\t')[1].split()
                except ValueError:
                    print('Oh no! Malformed annotation:\n' + line)
                    continue

                start_token = char2token[int(begin)]
                end_token = char2token[int(end)]

                # as above, the label value on this Span is irrelevant; it is
                # only used to compare character offsets
                span = Span(doc, start_token, end_token, label=14)

                if span.start_char != int(begin) or span.end_char != int(end):
                    print('BRAT SPAN:   ', doc.text[int(begin):int(end)])
                    print('SPACY SPAN:  ', span)
                    toks = [t.text
                            for t in doc[max(0, start_token - 3):end_token + 3]]
                    print('SPACY TOKENS:', toks)
                    print()
                    incorrect += 1
                else:
                    label_id = nlp.vocab.strings[label]
                    entities.append((label_id, start_token, end_token))
                    correct += 1

        if ents_dir:
            ents_fname = join(ents_dir,
                              splitext(basename(ann_fname))[0] + '_ents.pkl')
            print('writing ' + ents_fname)
            pickle.dump(entities, open(ents_fname, 'wb'))
        else:
            # Save ents in doc
            # FIXME: this currently fails with KeyError!
            # See https://github.com/explosion/spaCy/issues/514
            # doc.ents behaves like a set, so adding duplicates is harmless
            doc.ents = entities
            write_doc(spacy_fname, doc)

    print('\n#successful spans: {}\n#failed spans: {}'.format(
        correct, incorrect))