def postprocess_brat_conllu(corpus, save_to=None):
    """Move embedded brat tags from marker tokens to the tokens they cover.

    A marker is a token whose FORM is ``None``. A marker that has
    ``TAGS_BRAT[0]`` in its MISC field opens a tag; a marker that has
    ``TAGS_BRAT[1]`` closes it. Both kinds of markers are removed from the
    sentence. Every token with a non-empty FORM met while a tag is open
    receives ``MISC[TAG_BRAT + tag] = 'Yes'``. The ``text`` meta field is
    dropped from each sentence.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format
    :param save_to: a path where the result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data
    :return: ``None`` if *save_to* is given; the result of
        ``Conllu.fix()`` over the processed sentences otherwise
    """
    # NOTE(review): this file contains a later definition with the same
    # name; at import time that one rebinds the name and this version
    # becomes unreachable. Confirm whether this variant is still needed.

    def process():
        # The source is resolved lazily, on the first iteration.
        sentences = Conllu.load(corpus) if isinstance(corpus, str) \
                                        else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            sent_ = []
            tags = []  # tags opened and not yet closed in this sentence
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is not None:
                    # A regular token: mark it with every open tag.
                    for tag in tags:
                        misc[TAG_BRAT + tag] = 'Yes'
                    sent_.append(token)
                elif TAGS_BRAT[0] in misc:
                    # Opening marker. Compare the tag *value* with the
                    # open tags; the former check compared the MISC key
                    # name, so it never prevented duplicates.
                    tag = misc[TAGS_BRAT[0]]
                    if tag not in tags:
                        tags.append(tag)
                elif TAGS_BRAT[1] in misc:
                    # Closing marker. An end without a matching start is
                    # tolerated on purpose.
                    try:
                        tags.remove(misc[TAGS_BRAT[1]])
                    except ValueError:
                        pass
                    # The marker is dropped, so its spacing info goes to
                    # the token that now precedes the following text.
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = misc['SpaceAfter']
                else:
                    # A token without FORM that is not a brat marker is
                    # kept untouched.
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)
def postprocess_brat_conllu(corpus, save_to=None):
    """Does postprocessing for the *corpus* with embedded brat annotations
    which already was preliminarily prepared by Toxine's TextPreprocessor.

    A marker is a token whose FORM is ``None``. A marker that has
    ``BRAT_START_TAG`` in its MISC field opens a text-bound annotation
    (its value must start with ``'T'``); a marker that has
    ``BRAT_END_TAG`` closes every open annotation whose value starts with
    the end tag value followed by ``SEP2``. Both kinds of markers are
    removed from the sentence. Every token with a non-empty FORM met
    while annotations are open receives one MISC entry per annotation
    item, keyed by ``BRAT_TAG`` + the item id.

    The ``text`` meta field is dropped, and everything matching
    ``RE_BRAT`` is removed from the ``par_text`` meta field if present.

    :param corpus: corpus in Parsed CoNLL-U format or a path to the
        previously saved corpus in CoNLL-U format.
    :param save_to: a path where result will be stored. If ``None``
        (default), the function returns the result as a generator of
        Parsed CoNLL-U data.
    :return: ``None`` if *save_to* is given; the result of
        ``Conllu.fix()`` over the processed sentences otherwise.
    :raises AssertionError: if a start marker or an entity id does not
        start with ``'T'``.
    :raises ValueError: if an attached annotation item has an unknown
        type prefix, or an item has an unexpected number of fields.
    """
    def process():

        def unmask(text):
            # Undo the escaping of the service characters and of spaces
            # and underscores in free-text fields (titles, notes).
            mark = BRAT_TEXT_BOUND_START_MARK[-1]
            return text.replace(r'\{}'.format(mark), mark) \
                       .replace(r'\{}'.format(SEP1), SEP1) \
                       .replace('__', ' ').replace(r'\_', '_')

        def annotate(misc, ann):
            # Write one open annotation (an entity followed by the items
            # attached to it) to the MISC field of a token.
            parts = ann.split(SEP1 + SEP1)
            entity, items = parts[0], parts[1:]
            tid, name = entity.split(SEP2)
            assert tid.startswith('T'), \
                'ERROR: Unrecognized annotation {}'.format(parts)
            misc[BRAT_TAG + tid] = name
            for item in items:
                if item.startswith('R'):
                    item_id, name, role = item.split(SEP2)
                    val = tid + SEP3 + name + SEP3 + role
                elif item.startswith('*'):
                    item_id, name = item.split(SEP2)
                    val = tid + SEP3 + name
                elif item.startswith(('E', 'A')):
                    # The third field (a role for 'E', a value for 'A')
                    # may be empty; then it is omitted.
                    item_id, name, extra = item.split(SEP2)
                    val = tid + SEP3 + name
                    if extra:
                        val += SEP3 + extra
                elif item.startswith('N'):
                    # maxsplit keeps SEP2 occurrences inside the title.
                    item_id, service_name, service_id, title = \
                        item.split(SEP2, maxsplit=3)
                    val = tid + SEP3 + service_name \
                        + SEP3 + service_id + SEP3 + unmask(title)
                elif item.startswith('#'):
                    # maxsplit keeps SEP2 occurrences inside the note.
                    item_id, note = item.split(SEP2, maxsplit=1)
                    val = tid + SEP3 + unmask(note)
                else:
                    raise ValueError('ERROR: Unknown annotation '
                                     'type')
                misc[BRAT_TAG + item_id] = val

        # The source is resolved lazily, on the first iteration.
        sentences = Conllu.load(corpus) if isinstance(corpus, str) \
                                        else corpus
        for sent, meta in sentences:
            meta.pop('text', None)
            if 'par_text' in meta:
                meta['par_text'] = RE_BRAT.sub('', meta['par_text'])
            sent_ = []
            anns = []  # annotations opened and not yet closed
            for token in sent:
                misc = token['MISC']
                if token['FORM'] is not None:
                    # A regular token: attach every open annotation.
                    for ann in anns:
                        annotate(misc, ann)
                    sent_.append(token)
                elif BRAT_START_TAG in misc:
                    # NOTE(review): this tests the MISC key name against
                    # the list of annotation values, so it hardly ever
                    # fires; presumably `misc[BRAT_START_TAG] not in
                    # anns` was meant - confirm before changing.
                    assert BRAT_START_TAG not in anns
                    assert misc[BRAT_START_TAG][0] == 'T', \
                        'ERROR: Invalid annotation type "{}"' \
                            .format(misc[BRAT_START_TAG])
                    anns.append(misc[BRAT_START_TAG])
                elif BRAT_END_TAG in misc:
                    # Close every open annotation of the ended entity.
                    prefix = misc[BRAT_END_TAG] + SEP2
                    anns = [x for x in anns if not x.startswith(prefix)]
                    # The marker is dropped, so its spacing info goes to
                    # the token that now precedes the following text.
                    if sent_ and 'SpaceAfter' in misc:
                        sent_[-1]['MISC']['SpaceAfter'] = \
                            misc['SpaceAfter']
                else:
                    # A token without FORM that is not a brat marker is
                    # kept untouched.
                    sent_.append(token)
            yield sent_, meta

    res = process()
    if save_to:
        Conllu.save(res, save_to, fix=True)
    else:
        return Conllu.fix(res)