예제 #1
0
def make_homogeneous_sentence_annotations(sentence_annotations):
    """Give every annotator an explicit edit list for every sentence.

    Args:
        sentence_annotations: iterable of ``(sentence, edits_by_annotator)``
            pairs, where ``edits_by_annotator`` maps an annotator id to the
            list of edits that annotator made to ``sentence``. One-shot
            iterables are accepted. ``len(sentence)`` is used as the
            sentence length.

    Returns:
        A ``(sentences, annotations)`` tuple. ``sentences`` lists the
        sentences in input order. ``annotations`` maps every annotator id
        seen anywhere in the input to a list holding one edit list per
        sentence, in the same order as ``sentences``.

        * If the annotator has no entry for a sentence, the edit list is a
          single ``m2format.Edit`` over ``Region(0, len(sentence))`` with no
          tokens and ``type=None``.
        * Otherwise it is the annotator's own edits followed by one
          ``IDENTITY_CORRECTION_TYPE`` edit for every region reported by
          ``m2format.generate_uncovered_regions``.

    The caller's edit lists are left untouched.
    """
    # The input is traversed twice (once to collect annotator ids, once to
    # build the result), so a one-shot iterator must be materialized first.
    sentence_annotations = list(sentence_annotations)

    annotators_ids = {
        annotator_id
        for __, annotators_sentence_edits in sentence_annotations
        for annotator_id in annotators_sentence_edits
    }

    sentences = []
    annotations = {annotator_id: [] for annotator_id in annotators_ids}

    for sentence, annotators_sentence_edits in sentence_annotations:
        sentences.append(sentence)
        n = len(sentence)
        for annotator_id, edits in annotations.items():
            if annotator_id not in annotators_sentence_edits:
                annotator_edits = [
                    m2format.Edit(m2format.Region(0, n),
                                  tokens=tuple(),
                                  type=None)
                ]
            else:
                # Work on a copy so that appending the identity edits does
                # not modify the list owned by the caller.
                annotator_edits = list(annotators_sentence_edits[annotator_id])
                coverage = m2format.make_coverage(n,
                                                  (e.region
                                                   for e in annotator_edits))
                explicit_identity_edits = (
                    m2format.Edit(r,
                                  tokens=tuple(),
                                  type=m2format.IDENTITY_CORRECTION_TYPE)
                    for r in m2format.generate_uncovered_regions(coverage))
                annotator_edits.extend(explicit_identity_edits)
            edits.append(annotator_edits)

    # Internal invariant: one edit list per sentence for each annotator.
    # Written with all() so that it also holds when there are no annotators.
    assert all(
        len(edits) == len(sentences) for edits in annotations.values())
    return sentences, annotations
예제 #2
0
 def erase_all_edit_types(edits_a, edits_b):
     """Return copies of both edit collections with their types blanked out.

     Every edit is rebuilt with its original region and tokens, but with its
     type replaced by one placeholder object created for this call. The same
     placeholder is used for both collections.

     Returns:
         A ``(edits_a, edits_b)`` tuple of new lists.
     """
     placeholder_type = object()

     def retyped(edits):
         # Rebuild each edit, keeping region and tokens and swapping the type.
         return [
             m2format.Edit(edit.region, edit.tokens, placeholder_type)
             for edit in edits
         ]

     return retyped(edits_a), retyped(edits_b)
예제 #3
0
def normalize_to_single_token_edits(edit):
    """Split an edit into edits that each touch at most one source position.

    An edit whose region is empty (``region.beg == region.end``) is yielded
    unchanged. Otherwise ``max(region length, len(tokens))`` edits are
    produced by pairing positions with tokens in order:

    * the i-th edit covers ``Region(beg + i, beg + i + 1)`` and carries the
      i-th token as a one-element tuple;
    * once the tokens run out, remaining positions get an empty token tuple;
    * once the positions run out, remaining tokens are attached to the
      empty region ``Region(region.end, region.end)``.

    Every produced edit keeps the type of the original edit.
    """
    span = edit.region
    replacement = edit.tokens

    if span.beg == span.end:
        yield edit
        return

    count = max(span.end - span.beg, len(replacement))

    # One-position regions, padded forever with the empty region at the end.
    regions = itertools.chain(
        (m2format.Region(i, i + 1) for i in range(span.beg, span.end)),
        itertools.repeat(m2format.Region(span.end, span.end)),
    )
    # One-token tuples, padded forever with the empty tuple.
    token_groups = itertools.chain(
        ((token, ) for token in replacement),
        itertools.repeat(tuple()),
    )

    # Both streams are infinite; utils.take is relied on to stop after
    # `count` pairs (presumably it yields the first `count` items).
    for piece_region, piece_tokens in utils.take(count,
                                                 zip(regions, token_groups)):
        assert span.beg <= piece_region.beg and piece_region.end <= span.end
        yield m2format.Edit(piece_region, piece_tokens, edit.type)
예제 #4
0
def concatenate_edits(edits_iterable):
    """Yield the edits of several samples as if the samples were joined.

    Each element of ``edits_iterable`` is the non-empty collection of edits
    of one sample. The edits are re-emitted with their regions shifted by
    the summed lengths of all preceding samples, where a sample's length is
    taken to be the largest ``region.end`` among its edits.

    Precondition: the edits fully cover their sample, so that the largest
    ``region.end`` really equals the sample length.
    """
    shift = 0
    for sample_edits in edits_iterable:
        assert sample_edits
        sample_length = max(edit.region.end for edit in sample_edits)
        for edit in sample_edits:
            shifted_beg = edit.region.beg + shift
            shifted_end = edit.region.end + shift
            yield m2format.Edit(m2format.Region(shifted_beg, shifted_end),
                                edit.tokens, edit.type)
        # The next sample starts right after this one.
        shift += sample_length
예제 #5
0
 def erase_all_tokens(edits_a, edits_b):
     """Return copies of both edit collections with their tokens removed.

     Every edit is rebuilt with its original region and type, but with an
     empty tuple in place of its tokens.

     Returns:
         A ``(edits_a, edits_b)`` tuple of new lists.
     """

     def without_tokens(edits):
         # Rebuild each edit, keeping region and type and dropping tokens.
         return [m2format.Edit(edit.region, (), edit.type) for edit in edits]

     return without_tokens(edits_a), without_tokens(edits_b)