def confusion(true_brat_dir, pred_brat_dir):
    """
    Print a confusion matrix of predicted (rows) vs. true (columns) entity
    labels over paired .ann files in the true and predicted Brat directories.
    """
    cm = {}

    for pred_label in ENTITIES:
        cm[pred_label] = {}
        for true_label in ENTITIES + ('None',):
            cm[pred_label][true_label] = 0

    for true_fname, pred_fname in zip(sorted_glob(true_brat_dir + '/*.ann'),
                                      sorted_glob(pred_brat_dir + '/*.ann')):
        true_annots = read_true_brat_annots(true_fname)

        for line in open(pred_fname):
            if line.startswith('T'):
                part = line.split('\t')[1]
                pred_label, start, end = part.split()
                # predicted spans missing from the true annotations count as 'None'
                true_label = true_annots.get((start, end), 'None')
                cm[pred_label][true_label] += 1

    # header row with the true labels
    l = 16 * ' '
    for true_label in ENTITIES + ('None',):
        l += '{:>16}'.format(true_label)
    print(l)

    # one row per predicted label
    for pred_label in ENTITIES:
        l = '{:16}'.format(pred_label)
        for true_label in ENTITIES + ('None',):
            l += '{:16}'.format(cm[pred_label][true_label])
        print(l)

def convert(crfplus_dirs, true_iob_dir, pred_iob_dir):
    """
    Convert per-entity CRF++ output files (one directory per entity label in
    crfplus_dirs) back to IOB json files in pred_iob_dir, using the true IOB
    files for the token and sentence structure.
    """
    makedirs(pred_iob_dir, exist_ok=True)

    for iob_fname in sorted_glob(join(true_iob_dir, '*.json')):
        try:
            doc_iob = json.load(open(iob_fname))
            base_name = basename(iob_fname)

            for label in ENTITIES:
                crfplus_fname = join(crfplus_dirs[label],
                                     base_name.replace('.json', '.txt'))
                f = open(crfplus_fname)

                for sent_iob in doc_iob:
                    for tok_iob in sent_iob:
                        line = next(f)
                        # predicted tag is in the third tab-separated column
                        pred_tag = line.split('\t')[2].strip()
                        tok_iob[label] = pred_tag
                    # skip the empty line separating sentences in CRF++ output
                    next(f)

            pred_iob_fname = join(pred_iob_dir, base_name)
            json.dump(doc_iob, open(pred_iob_fname, 'w'), indent=4,
                      sort_keys=True, ensure_ascii=False)
        except Exception as err:
            print('*** ERROR ***', err)
            print(crfplus_fname)
            print(line)
            print()

def collect_crf_data(iob_dir, *feat_dirs):
    # *** DEPRECATED *** Do not use in new experiments!
    """
    Collect the data to train/eval CRF classifier.
    Labels for entities are derived from IOB tags in the files in the iob_dir.
    Features are collected from the json files in one or more feat_dirs.
    Filenames are the basenames of the iob files.
    """
    data = dict((label, list()) for label in ENTITIES)
    data['feats'] = []
    data['filenames'] = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        text_iob = json.load(open(iob_fname))
        filename = basename(iob_fname)
        feat_filenames = [join(dir, filename) for dir in feat_dirs]
        text_feat = Features.from_file(*feat_filenames)
        assert len(text_iob) == len(text_feat)
        data['feats'] += text_feat

        for label in ENTITIES:
            data[label] += _text_iob_tags(text_iob, label)

        data['filenames'] += len(text_iob) * [filename]

    return data

def iob_to_brat(iob_dir, txt_dir, brat_dir):
    """
    Convert IOB json files in iob_dir to Brat .ann files in brat_dir,
    using the corresponding text files in txt_dir.
    """
    makedirs(brat_dir, exist_ok=True)

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        spans = get_text_spans(iob_fname)

        # need text file for correct whitespace
        txt_fname = join(txt_dir, splitext(basename(iob_fname))[0] + '.txt')
        text = open(txt_fname).read()

        brat_fname = join(brat_dir, splitext(basename(iob_fname))[0] + '.ann')
        write_brat_file(brat_fname, spans, text)

def postproc_brat(in_brat_dir, txt_dir, out_brat_dir):
    """
    Post-process Brat annotations: for each annotated phrase, enforce a single
    label (the majority label, with Material preferred over Process on ties)
    and add spans for unannotated whole-word occurrences of the same phrase.
    """
    makedirs(out_brat_dir, exist_ok=True)

    for in_brat_fname in sorted_glob(join(in_brat_dir, '*.ann')):
        spans = read_brat_file(in_brat_fname)

        txt_fname = join(txt_dir,
                         basename(in_brat_fname).replace('.ann', '.txt'))
        print('reading ' + txt_fname)
        text = open(txt_fname).read()

        phrase2annots = get_phrase_annots(spans, text)

        for phrase, annots in phrase2annots.items():
            counts = Counter(span.label for span in annots)
            most_common = counts.most_common()

            if len(most_common) > 1:
                if most_common[0][1] > most_common[1][1]:
                    print('--> found majority label for phrase',
                          repr(phrase), ':', counts)
                    majority_label = most_common[0][0]
                    # iterate over a copy so removal is safe
                    for span in list(annots):
                        if span.label != majority_label:
                            print('==> removing', span)
                            annots.remove(span)
                            spans.remove(span)
                else:
                    # tie: prefer Material, otherwise Process
                    best_label = 'Material' if 'Material' in counts else 'Process'
                    print('--> found best label for phrase',
                          repr(phrase), ':', counts)
                    for span in list(annots):
                        if span.label != best_label:
                            print('==> removing', span)
                            annots.remove(span)
                            spans.remove(span)

            # now all labels in annots are the same
            unique_label = annots[0].label

            for m in re.finditer(re.escape(phrase), text):
                # skip matches that are part of a larger word
                if m.start() > 0 and text[m.start() - 1].isalpha():
                    continue
                if m.end() < len(text) and text[m.end()].isalpha():
                    continue

                span = Span(unique_label, m.start(), m.end())
                if span not in annots:
                    print(annots)
                    print('==> adding span', span, 'for phrase', repr(phrase))
                    spans.append(span)

        out_brat_fname = join(out_brat_dir, basename(in_brat_fname))
        write_brat_file(out_brat_fname, spans, text)

def get_entity_lempos_counts(iob_dir, spacy_dir, nlp=None, use_pickle=True):
    """
    For each document, count lemma#POS combinations of content words per
    entity label (plus 'Other' for words outside any entity), optionally
    caching the result in a pickle file.
    """
    pickle_fname = '_counts.pkl'

    if use_pickle:
        try:
            return pickle.load(open(pickle_fname, 'rb'))
        except IOError:
            pass

    iob_fnames = sorted_glob(join(iob_dir, '*'))
    spacy_fnames = sorted_glob(join(spacy_dir, '*'))
    counts = {}

    if not nlp:
        nlp = spacy.load('en')

    for iob_fname, spacy_fname in zip(iob_fnames, spacy_fnames):
        iob_doc = json.load(open(iob_fname, encoding='utf8'))
        spacy_doc = read_doc(spacy_fname, nlp)
        count = dict((e, Counter()) for e in LABELS)

        for spacy_sent, iob_sent in zip(spacy_doc.sents, iob_doc):
            for spacy_tok, iob_tok in zip(spacy_sent, iob_sent):
                if spacy_tok.pos_ in CONTENT_POS and not spacy_tok.is_stop:
                    lempos = spacy_tok.lemma_ + '#' + spacy_tok.pos_
                    other = True

                    for label in ENTITIES:
                        if iob_tok[label] in 'BI':
                            count[label][lempos] += 1
                            other = False

                    if other:
                        count['Other'][lempos] += 1

        counts[basename(iob_fname)] = count

    pickle.dump(counts, open(pickle_fname, 'wb'))
    return counts

def collect_features(iob_dir, *feat_dirs):
    """
    Collect the features to train/eval CRF classifier
    from the json files in one or more feat_dirs.
    """
    feats = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        filename = basename(iob_fname)
        feat_filenames = [join(dir, filename) for dir in feat_dirs]
        text_feat = Features.from_file(*feat_filenames)
        feats += text_feat

    return feats

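# A minimal usage sketch for collect_features; the directory names below are
# hypothetical, not part of this module. Each feature dir must contain json
# files with the same basenames as the IOB files.
#
#     feats = collect_features('_train/iob', '_train/word_feats', '_train/synvec_feats')
#     print(len(feats))
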
def postproc_labels(in_iob_dir, out_iob_dir):
    """
    Post-process predicted IOB labels and write the relabelled
    IOB files to out_iob_dir.
    """
    makedirs(out_iob_dir, exist_ok=True)

    for in_iob_fname in sorted_glob(join(in_iob_dir, '*.json')):
        print('reading ' + in_iob_fname)
        text_iob = json.load(open(in_iob_fname))

        tokens2labels = get_token_labels(text_iob)
        resolve_labels(tokens2labels)
        relabel(text_iob, tokens2labels)

        out_iob_fname = join(out_iob_dir, basename(in_iob_fname))
        with open(out_iob_fname, 'w') as outf:
            print('writing ' + out_iob_fname)
            json.dump(text_iob, outf, indent=4, sort_keys=True,
                      ensure_ascii=False)

def run_nlp(txt_dir, spacy_dir, nlp=None):
    """
    Process text files in directory txt_dir with Spacy NLP pipeline and
    serialize analyses to directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(spacy_dir, exist_ok=True)

    for txt_fname in sorted_glob(join(txt_dir, '*.txt')):
        print('reading ' + txt_fname)
        text = open(txt_fname).read()
        # Spacy considers '\n' as a separate token.
        # That causes problems when writing tokens in column format,
        # so we strip the final '\n'.
        doc = nlp(text.rstrip('\n'))
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        write_doc(spacy_fname, doc)

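# A minimal usage sketch for run_nlp; the paths are hypothetical. Passing a
# preloaded pipeline avoids reloading the model when several of the functions
# in this project are called in sequence.
#
#     nlp = spacy.load('en')
#     run_nlp('_train/txt', '_train/spacy', nlp=nlp)
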
def generate_labels(iob_dir, labels_fname):
    """
    Generate labels to train/eval CRF classifier.

    Labels for entities are derived from IOB tags in the files in the iob_dir.
    Filenames are the basenames of the iob files (used for creating folds and
    converting back CRF predictions to IOB files).

    Saved as a pickled dict with keys for all entity labels plus the special
    key __filenames__.
    """
    labels = dict((label, list()) for label in ENTITIES)
    labels['__filenames__'] = []

    for iob_fname in sorted_glob(join(iob_dir, '*.json')):
        text_iob = json.load(open(iob_fname))
        filename = basename(iob_fname)

        for label in ENTITIES:
            labels[label] += _text_iob_tags(text_iob, label)

        labels['__filenames__'] += len(text_iob) * [filename]

    print('writing labels to file ' + labels_fname)
    pickle.dump(labels, open(labels_fname, 'wb'))

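# A minimal usage sketch for generate_labels; the paths are hypothetical. The
# pickled dict holds one parallel list per entity label plus '__filenames__',
# so the result can be inspected like this:
#
#     generate_labels('_train/iob', '_train/labels.pkl')
#     labels = pickle.load(open('_train/labels.pkl', 'rb'))
#     print(sorted(labels.keys()), len(labels['__filenames__']))
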
def generate_feats(spacy_dir, feat_dir, feat_func, nlp=None):
    """
    Generate features and save to file

    :param spacy_dir: dir with serialized Spacy analyses
    :param feat_dir: output dir for generated feature files in json format
    :param feat_func: function for generating features from a sentence
    :return:
    """
    if not nlp:
        nlp = spacy.load('en')

    makedirs(feat_dir, exist_ok=True)

    for spacy_fname in sorted_glob(join(spacy_dir, '*.spacy')):
        doc = read_doc(spacy_fname, nlp)
        feat_fname = join(feat_dir,
                          splitext(basename(spacy_fname))[0] + '.json')
        text_feats = Features([feat_func(sent) for sent in doc.sents])
        text_feats.to_file(feat_fname)

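# A minimal sketch of the kind of feature function expected by generate_feats:
# it receives one Spacy sentence (a Span) at a time and returns the per-token
# features for that sentence. The feature names used here ('word', 'lemma',
# 'pos') are illustrative assumptions, not a fixed interface of this module.
def example_feat_func(sent):
    return [{'word': tok.text.lower(),
             'lemma': tok.lemma_,
             'pos': tok.pos_}
            for tok in sent]

# Hypothetical usage:
#
#     generate_feats('_train/spacy', '_train/example_feats', example_feat_func)
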
from os.path import join, basename
from glob import glob
import json

from sie import LOCAL_DIR
from sie.utils import sorted_glob

true_iob_dir = join(LOCAL_DIR, 'train', 'iob')
synvec_feats_dir = join('_train', 'synvec_feats')

for iob_fname in sorted_glob(join(true_iob_dir, '*'))[10:12]:
    doc_iob = json.load(open(iob_fname))
    synvec_fname = join(synvec_feats_dir, basename(iob_fname))
    doc_feats = json.load(open(synvec_fname))

    for sent_iob, sent_feats in zip(doc_iob, doc_feats):
        for token_iob, token_feats in zip(sent_iob, sent_feats):
            if 'Pred' in token_feats['synvec']:
                print('{:20} {:10} {:10} {:10} {:10}'.format(
                    token_iob['token'],
                    'Material' if token_iob['Material'] != 'O' else '-',
                    'Process' if token_iob['Process'] != 'O' else '-',
                    'Task' if token_iob['Task'] != 'O' else '-',
                    token_feats['synvec']['Pred']))
            else:
                print('{:20} {:10} {:10} {:10} '
                      'Material={:.2f} Process={:.2f} Task={:.2f} Other={:.2f}'.format(
                          token_iob['token'],
                          'Material' if token_iob['Material'] != 'O' else '-',
                          'Process' if token_iob['Process'] != 'O' else '-',
                          'Task' if token_iob['Task'] != 'O' else '-',
                          # remaining per-label synvec scores; the key names are
                          # assumed to match the labels in the format string
                          token_feats['synvec']['Material'],
                          token_feats['synvec']['Process'],
                          token_feats['synvec']['Task'],
                          token_feats['synvec']['Other']))

def generate_iob_tags(ann_dir, spacy_dir, iob_dir, nlp=None):
    """
    Generate files with IOB tags from Brat .ann files in ann_dir and Spacy
    serialized analyses in spacy_dir, writing output files to iob_dir
    """
    # TODO This does not correctly handle embedded entities of the same type!
    # E.g. Material inside another Material
    if not nlp:
        nlp = spacy.load('en')

    makedirs(iob_dir, exist_ok=True)
    correct = incorrect = 0
    txt_count = ann_count = iob_count = 0

    for txt_fname in sorted_glob(join(ann_dir, '*.txt')):
        txt_count += 1
        spacy_fname = join(spacy_dir,
                           splitext(basename(txt_fname))[0] + '.spacy')
        doc = read_doc(spacy_fname, nlp)
        char2token = map_chars_to_tokens(doc)

        iob_tags = {}
        for label in ENTITIES:
            iob_tags[label] = len(doc) * ['O']

        ann_fname = txt_fname.replace('.txt', '.ann')

        if exists(ann_fname):
            print('reading ' + ann_fname)
            ann_count += 1

            for line in open(ann_fname):
                if line.startswith('T'):
                    try:
                        label, begin_char, end_char = line.split('\t')[1].split()
                    except ValueError:
                        print('Oh no! Malformed annotation:\n' + line)
                        continue

                    begin_char, end_char = int(begin_char), int(end_char)
                    start_token = char2token[begin_char]
                    end_token = char2token[end_char]
                    # the label id is irrelevant here; the span is only used
                    # to check that token and character boundaries line up
                    span = Span(doc, start_token, end_token, label=14)

                    if span.start_char != begin_char or span.end_char != end_char:
                        print('BRAT SPAN: ', doc.text[begin_char:end_char])
                        print('SPACY SPAN: ', span)
                        toks = [t.text for t in
                                doc[max(0, start_token - 3):end_token + 3]]
                        print('SPACY TOKENS:', toks)
                        print()
                        incorrect += 1
                    else:
                        iob_tags[label][start_token] = 'B'
                        for i in range(start_token + 1, end_token):
                            iob_tags[label][i] = 'I'
                        correct += 1
        else:
            # test data has no annotation
            print('WARNING: no annotation file ' + ann_fname)

        iob_fname = join(iob_dir, splitext(basename(ann_fname))[0] + '.json')
        iob_count += 1
        write_iob_file(iob_fname, doc, iob_tags)

    print('\n#successful spans: {}\n#failed spans: {}'.format(
        correct, incorrect))
    print('\n#txt files: {}\n#ann files: {}\n#iob files: {}'.format(
        txt_count, ann_count, iob_count))

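# For reference, the IOB json files written by write_iob_file and read by the
# inspection script above contain one list per sentence, with one dict per
# token holding the token string and an IOB tag ('B', 'I' or 'O') per entity
# type. Illustrative (hypothetical) excerpt:
#
#     [[{"token": "corpus", "Material": "B", "Process": "O", "Task": "O"}, ...], ...]
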
def add_entities(ann_dir, spacy_dir, ents_dir=None, nlp=None):
    """
    Add Material, Process and Task entities from .ann files as entity spans
    to the corresponding serialized Spacy analyses in directory spacy_dir
    """
    if not nlp:
        nlp = spacy.load('en')

    if ents_dir:
        makedirs(ents_dir, exist_ok=True)

    register_entities(nlp)
    correct = incorrect = 0

    for ann_fname in sorted_glob(join(ann_dir, '*.ann')):
        print('reading ' + ann_fname)
        spacy_fname = join(spacy_dir,
                           splitext(basename(ann_fname))[0] + '.spacy')
        doc = read_doc(spacy_fname, nlp)
        # see https://github.com/spacy-io/spaCy/issues/461
        entities = [(e.label, e.start, e.end) for e in doc.ents]
        char2token = map_chars_to_tokens(doc)

        for line in open(ann_fname):
            if line.startswith('T'):
                try:
                    label, begin, end = line.split('\t')[1].split()
                except ValueError:
                    print('Oh no! Malformed annotation:\n' + line)
                    continue

                start_token = char2token[int(begin)]
                end_token = char2token[int(end)]
                # the label id is irrelevant here; the span is only used
                # to check that token and character boundaries line up
                span = Span(doc, start_token, end_token, label=14)

                if span.start_char != int(begin) or span.end_char != int(end):
                    print('BRAT SPAN: ', doc.text[int(begin):int(end)])
                    print('SPACY SPAN: ', span)
                    toks = [t.text for t in
                            doc[max(0, start_token - 3):end_token + 3]]
                    print('SPACY TOKENS:', toks)
                    print()
                    incorrect += 1
                else:
                    label_id = nlp.vocab.strings[label]
                    entities.append((label_id, start_token, end_token))
                    correct += 1

        if ents_dir:
            ents_fname = join(ents_dir,
                              splitext(basename(ann_fname))[0] + '_ents.pkl')
            print('writing ' + ents_fname)
            pickle.dump(entities, open(ents_fname, 'wb'))
        else:
            # Save ents in doc
            # FIXME: this currently fails with KeyError!
            # See https://github.com/explosion/spaCy/issues/514
            # doc.ents behaves like a set, so adding duplicates is harmless
            doc.ents = entities
            write_doc(spacy_fname, doc)

    print('\n#successful spans: {}\n#failed spans: {}'.format(
        correct, incorrect))