def make_homogeneous_sentence_annotations(sentence_annotations):
    """Give every annotator an edit list for every sentence.

    Args:
        sentence_annotations: Iterable of ``(sentence, annotators_edits)``
            pairs, where ``annotators_edits`` maps an annotator id to an
            iterable of that annotator's edits for the sentence. One-shot
            iterators are accepted: the input is materialized once up front.

    Returns:
        A ``(sentences, annotations)`` tuple. ``sentences`` lists the
        sentences in input order. ``annotations`` maps every annotator id
        seen anywhere in the input to a list holding one edit list per
        sentence, aligned with ``sentences``:

        * an annotator with no entry for a sentence gets a single edit over
          ``Region(0, len(sentence))`` with no tokens and ``type=None``;
        * otherwise the annotator's edits are copied and followed by one
          edit of type ``m2format.IDENTITY_CORRECTION_TYPE`` (no tokens)
          for each region yielded by
          ``m2format.generate_uncovered_regions``.

        The edit lists passed in by the caller are never modified.
    """
    # Two passes are needed (collect ids, then build the result), so a
    # generator argument must be materialized before the first pass.
    sentence_annotations = list(sentence_annotations)

    annotators_ids = {
        annotator_id
        for __, annotators_sentence_edits in sentence_annotations
        for annotator_id in annotators_sentence_edits
    }

    sentences = []
    annotations = {annotator_id: [] for annotator_id in annotators_ids}
    for sentence, annotators_sentence_edits in sentence_annotations:
        sentences.append(sentence)
        n = len(sentence)
        for annotator_id, edits in annotations.items():
            if annotator_id in annotators_sentence_edits:
                # Copy so that appending the identity edits below does not
                # leak into the caller's data.
                annotator_edits = list(
                    annotators_sentence_edits[annotator_id])
                coverage = m2format.make_coverage(
                    n, (e.region for e in annotator_edits))
                annotator_edits.extend(
                    m2format.Edit(
                        r,
                        tokens=(),
                        type=m2format.IDENTITY_CORRECTION_TYPE)
                    for r in m2format.generate_uncovered_regions(coverage))
            else:
                # The placeholder spans Region(0, n), i.e. the whole
                # sentence, so no identity padding is added for it.
                annotator_edits = [
                    m2format.Edit(
                        m2format.Region(0, n), tokens=(), type=None)
                ]
            edits.append(annotator_edits)

    # all() instead of comparing a set of lengths to {len(sentences)}: the
    # set form is empty, and therefore unequal, when there are no annotators.
    assert all(
        len(edits) == len(sentences) for edits in annotations.values())
    return sentences, annotations
def erase_all_edit_types(edits_a, edits_b):
    """Return copies of both edit collections with every type replaced.

    Each edit is rebuilt as ``m2format.Edit(region, tokens, placeholder)``,
    where ``placeholder`` is one fresh ``object()`` created per call and
    shared by all edits in both returned lists.

    Args:
        edits_a: Iterable of edits exposing ``region`` and ``tokens``.
        edits_b: Iterable of edits exposing ``region`` and ``tokens``.

    Returns:
        A ``(list, list)`` tuple of the rebuilt edits, in input order.
    """
    # A brand-new object() is identical only to itself, so it cannot
    # coincide with any type value the incoming edits carried.
    placeholder_type = object()

    def retyped(edits):
        return [
            m2format.Edit(item.region, item.tokens, placeholder_type)
            for item in edits
        ]

    return retyped(edits_a), retyped(edits_b)
def normalize_to_single_token_edits(edit):
    """Yield ``edit`` broken into pieces of at most one position/token each.

    An edit whose region is zero-width (``beg == end``) is yielded
    unchanged, whatever its tokens are.

    Otherwise ``max(region width, len(tokens))`` pieces are produced by
    pairing, in order:

    * one-position regions ``Region(pos, pos + 1)`` across the original
      region, followed by zero-width ``Region(end, end)`` fillers, with
    * one-element token tuples, followed by empty-tuple fillers.

    Every piece keeps the original edit's ``type``.
    """
    span = edit.region
    replacement = edit.tokens

    if span.beg == span.end:
        # Zero-width region: nothing to split on, pass the edit through.
        yield edit
        return

    count = max(span.end - span.beg, len(replacement))

    # Both streams end in an endless filler so the shorter side never runs
    # out; the number of pairs is bounded by utils.take below (presumably
    # the first `count` items -- confirm against utils).
    unit_regions = itertools.chain(
        (m2format.Region(i, i + 1) for i in range(span.beg, span.end)),
        itertools.repeat(m2format.Region(span.end, span.end)),
    )
    unit_tokens = itertools.chain(
        ((tok, ) for tok in replacement),
        itertools.repeat(()),
    )

    for piece_region, piece_tokens in utils.take(
            count, zip(unit_regions, unit_tokens)):
        # Each piece must stay inside the region it was cut from.
        assert span.beg <= piece_region.beg and piece_region.end <= span.end
        yield m2format.Edit(piece_region, piece_tokens, edit.type)
def concatenate_edits(edits_iterable):
    """Yield all edits re-based as if their samples were laid end to end.

    Precondition: every group of edits fully covers its sample, so the
    sample's length equals ``max(edit.region.end)`` over the group. That
    maximum is how far the following groups are shifted.

    Args:
        edits_iterable: Iterable of non-empty, re-iterable edit groups
            (each group is scanned twice: once for its length, once to
            emit the shifted edits).

    Yields:
        ``m2format.Edit`` objects with both region bounds moved right by
        the accumulated length of the preceding groups; tokens and type
        are carried over as-is.
    """
    shift = 0
    for group in edits_iterable:
        assert group
        group_length = max(item.region.end for item in group)
        for item in group:
            shifted = m2format.Region(
                item.region.beg + shift, item.region.end + shift)
            yield m2format.Edit(shifted, item.tokens, item.type)
        # Advance only once the whole group is out, so every edit in a
        # group is moved by the same amount.
        shift += group_length
def erase_all_tokens(edits_a, edits_b):
    """Return copies of both edit collections with their tokens emptied.

    Each edit is rebuilt as ``m2format.Edit(region, (), type)``: region and
    type are kept, the tokens become an empty tuple.

    Args:
        edits_a: Iterable of edits exposing ``region`` and ``type``.
        edits_b: Iterable of edits exposing ``region`` and ``type``.

    Returns:
        A ``(list, list)`` tuple of the rebuilt edits, in input order.
    """

    def without_tokens(edits):
        return [m2format.Edit(item.region, (), item.type) for item in edits]

    return without_tokens(edits_a), without_tokens(edits_b)