Пример #1
0
def _tweak_presplit(tcache, doc, spans):
    """
    What to do in case the split was already done manually
    (in the discourse section)

    For each span in `spans` (visited in sorted order), the first EDU in
    `doc.units` whose text span equals that span is handed to
    `set_anno_date` (with the stamp from `tcache`) and `set_anno_author`
    (with `_AUTHOR`). Relations and schemas in `doc` that mention the
    EDU's previous local id are then rewritten to use the id built from
    `(_AUTHOR, stamp)`.

    :param tcache: timestamp cache; `tcache.get(span)` supplies the stamp
    :param doc: document whose units, relations and schemas are updated
                in place
    :param spans: text spans of the manually split EDUs
    :raises Exception: if no EDU in the document covers one of the spans
    """
    renames = {}
    for span in sorted(spans):
        # only the first matching EDU is used, so stop at the first hit
        edu = next((x for x in doc.units
                    if x.text_span() == span and educe.stac.is_edu(x)),
                   None)
        if edu is None:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        old_id = edu.local_id()
        # fetch the stamp once so that the id we rename to is built from
        # exactly the stamp that is written onto the EDU
        stamp = tcache.get(span)
        new_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(edu, stamp)
        set_anno_author(edu, _AUTHOR)
        renames[old_id] = new_id

    for rel in doc.relations:
        if rel.span.t1 in renames:
            rel.span.t1 = renames[rel.span.t1]
        if rel.span.t2 in renames:
            rel.span.t2 = renames[rel.span.t2]
    for schema in doc.schemas:
        # rename every member in one pass: removing/adding one member at a
        # time gives order-dependent results when a new id equals another
        # member's old id, and fails on a repeated member
        schema.units = {renames.get(unit, unit) for unit in schema.units}
Пример #2
0
def _tweak_presplit(tcache, doc, spans):
    """
    What to do in case the split was already done manually
    (in the discourse section)

    For each span in `spans` (visited in sorted order), the first EDU in
    `doc.units` whose text span equals that span is handed to
    `set_anno_date` (with the stamp from `tcache`) and `set_anno_author`
    (with `_AUTHOR`). Relations and schemas in `doc` that mention the
    EDU's previous local id are then rewritten to use the id built from
    `(_AUTHOR, stamp)`.

    :param tcache: timestamp cache; `tcache.get(span)` supplies the stamp
    :param doc: document whose units, relations and schemas are updated
                in place
    :param spans: text spans of the manually split EDUs
    :raises Exception: if no EDU in the document covers one of the spans
    """
    renames = {}
    for span in sorted(spans):
        # only the first matching EDU is used, so stop at the first hit
        edu = next((x for x in doc.units
                    if x.text_span() == span and educe.stac.is_edu(x)),
                   None)
        if edu is None:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        old_id = edu.local_id()
        # fetch the stamp once so that the id we rename to is built from
        # exactly the stamp that is written onto the EDU
        stamp = tcache.get(span)
        new_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(edu, stamp)
        set_anno_author(edu, _AUTHOR)
        renames[old_id] = new_id

    for rel in doc.relations:
        if rel.span.t1 in renames:
            rel.span.t1 = renames[rel.span.t1]
        if rel.span.t2 in renames:
            rel.span.t2 = renames[rel.span.t2]
    for schema in doc.schemas:
        # rename every member in one pass: removing/adding one member at a
        # time gives order-dependent results when a new id equals another
        # member's old id, and fails on a repeated member
        schema.units = {renames.get(unit, unit) for unit in schema.units}
Пример #3
0
def absorb_emoticon(doc, stamp, penult, last):
    """
    Fold a trailing emoticon-only EDU into the EDU that precedes it.

    `penult` is the second to last EDU in a turn annotation and `last`
    is the emoticon-only EDU that follows it.

    Only `penult` is mutated: its span becomes the merge of its own text
    span with that of `last`, and it is passed to `set_anno_date` (with
    `stamp`) and `set_anno_author` (with "stacutil"). Nothing is returned.

    Because those updates are expected to change the id of `penult`,
    `retarget` is called with the id it had beforehand so that
    relations/schemas in `doc` point to the updated annotation.
    """
    # remember the id before anything about the annotation changes
    previous_id = penult.local_id()
    widened_span = penult.text_span().merge(last.text_span())
    penult.span = widened_span
    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")
    retarget(doc, previous_id, penult)
Пример #4
0
def absorb_emoticon(doc, stamp, penult, last):
    """
    Fold a trailing emoticon-only EDU into the EDU that precedes it.

    `penult` is the second to last EDU in a turn annotation and `last`
    is the emoticon-only EDU that follows it.

    Only `penult` is mutated: its span becomes the merge of its own text
    span with that of `last`, and it is passed to `set_anno_date` (with
    `stamp`) and `set_anno_author` (with "stacutil"). Nothing is returned.

    Because those updates are expected to change the id of `penult`,
    `retarget` is called with the id it had beforehand so that
    relations/schemas in `doc` point to the updated annotation.
    """
    # remember the id before anything about the annotation changes
    previous_id = penult.local_id()
    widened_span = penult.text_span().merge(last.text_span())
    penult.span = widened_span
    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")
    retarget(doc, previous_id, penult)
Пример #5
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a collection of edus and a document,
    replace the edus with a single merged edu in the document

    The merged edu is a deep copy of the first edu whose span is the
    merge of all the edus' text spans; it is stamped via `tcache` and
    authored by `_AUTHOR`. It takes the place of the first of the edus
    found in `doc.units`; the remaining ones are removed from `doc.units`.

    Anything that points to one of the EDUs should point
    instead to the new edu (done by calling `retarget` for each old id).

    In the 'units' stage the type and the features are merged: a value
    shared by all edus is kept as is, otherwise the distinct values are
    joined with "/" (in sorted order) behind `_MERGE_PREFIX`.

    NOTE(review): deleting (or signalling) annotations that point
    exclusively to the merged EDUs is not done in this function;
    presumably that is left to `retarget` -- confirm.

    Does nothing if `edus` is empty.
    """

    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        strs = [x for x in strs if x is not None]
        if len(strs) == 1:
            return strs[0]
        # callers pass sets, whose iteration order is arbitrary; sort so
        # that the merged label is the same from one run to the next
        return _MERGE_PREFIX + "/".join(sorted(strs))

    if not edus:
        return
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement: the merged edu takes the slot of the first of
    # the old edus encountered in the document
    for i, unit in enumerate(doc.units):
        if unit in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
Пример #6
0
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a collection of edus and a document,
    replace the edus with a single merged edu in the document

    The merged edu is a deep copy of the first edu whose span is the
    merge of all the edus' text spans; it is stamped via `tcache` and
    authored by `_AUTHOR`. It takes the place of the first of the edus
    found in `doc.units`; the remaining ones are removed from `doc.units`.

    Anything that points to one of the EDUs should point
    instead to the new edu (done by calling `retarget` for each old id).

    In the 'units' stage the type and the features are merged: a value
    shared by all edus is kept as is, otherwise the distinct values are
    joined with "/" (in sorted order) behind `_MERGE_PREFIX`.

    NOTE(review): deleting (or signalling) annotations that point
    exclusively to the merged EDUs is not done in this function;
    presumably that is left to `retarget` -- confirm.

    Does nothing if `edus` is empty.
    """
    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        strs = [x for x in strs if x is not None]
        if len(strs) == 1:
            return strs[0]
        # callers pass sets, whose iteration order is arbitrary; sort so
        # that the merged label is the same from one run to the next
        return _MERGE_PREFIX + "/".join(sorted(strs))

    if not edus:
        return
    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(x for edu in edus for x in edu.features.keys())
        for key in all_keys:
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement: the merged edu takes the slot of the first of
    # the old edus encountered in the document
    for i, unit in enumerate(doc.units):
        if unit in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
Пример #7
0
def _actually_split(tcache, doc, spans, edu):
    """
    Replace `edu` by one new EDU per span in `spans`, stamping each
    piece from the timestamp cache so that the same piece gets the
    same new id in every section

    Each piece is a deep copy of `edu` restricted to its span and is
    appended to `doc.units`. In the 'units' stage its type and every
    feature value are prefixed with `_SPLIT_PREFIX`.

    Discourse stage: a 'Complex_discourse_unit' schema grouping the
    pieces is built and passed to `retarget` in place of `edu`; it is
    only added to `doc.schemas` if `retarget` says it is wanted.
    `edu` itself is removed from `doc.units`.
    """
    pieces = {}
    for piece_span in sorted(spans):
        stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only values are replaced, so iterating the items is safe
            for key, value in piece.features.items():
                piece.features[key] = _SPLIT_PREFIX + value
        piece.span = piece_span
        pieces[piece_id] = piece
        doc.units.append(piece)

    # the grouping CDU is stamped from the span covering all the pieces
    whole_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, whole_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(whole_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    cdu_needed = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if not cdu_needed:
        return
    doc.schemas.append(cdu)
Пример #8
0
def _actually_split(tcache, doc, spans, edu):
    """
    Replace `edu` by one new EDU per span in `spans`, stamping each
    piece from the timestamp cache so that the same piece gets the
    same new id in every section

    Each piece is a deep copy of `edu` restricted to its span and is
    appended to `doc.units`. In the 'units' stage its type and every
    feature value are prefixed with `_SPLIT_PREFIX`.

    Discourse stage: a 'Complex_discourse_unit' schema grouping the
    pieces is built and passed to `retarget` in place of `edu`; it is
    only added to `doc.schemas` if `retarget` says it is wanted.
    `edu` itself is removed from `doc.units`.
    """
    pieces = {}
    for piece_span in sorted(spans):
        stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            # only values are replaced, so iterating the items is safe
            for key, value in piece.features.items():
                piece.features[key] = _SPLIT_PREFIX + value
        piece.span = piece_span
        pieces[piece_id] = piece
        doc.units.append(piece)

    # the grouping CDU is stamped from the span covering all the pieces
    whole_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, whole_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(whole_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    cdu_needed = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if not cdu_needed:
        return
    doc.schemas.append(cdu)
Пример #9
0
def _set(tcache, span, anno):
    """
    Stamp `anno` for `span` and move it onto that span

    The date comes from `tcache.get(span)`, the author is `_AUTHOR`,
    and `anno.span` is overwritten with `span`.
    """
    set_anno_date(anno, tcache.get(span))
    set_anno_author(anno, _AUTHOR)
    anno.span = span
Пример #10
0
def _set(tcache, span, anno):
    """
    Stamp `anno` for `span` and move it onto that span

    The date comes from `tcache.get(span)`, the author is `_AUTHOR`,
    and `anno.span` is overwritten with `span`.
    """
    set_anno_date(anno, tcache.get(span))
    set_anno_author(anno, _AUTHOR)
    anno.span = span