def absorb_emoticon(doc, stamp, penult, last):
    """
    Fold a trailing emoticon-only EDU into the EDU that precedes it.

    `penult` is the second-to-last EDU of a turn annotation and `last` is
    the emoticon-only EDU that follows it. `penult` is modified in place:
    its span becomes the merge of both text spans, and it is restamped with
    `stamp` as its date and "stacutil" as its author. `last` is only read,
    and nothing is returned.

    The identifier `penult` had before restamping is handed to `retarget`
    so that relations/schemas in `doc` which pointed at the old identifier
    can be pointed at the annotation under its new one.
    """
    # must be read before the date/author are touched below
    previous_id = penult.local_id()

    own_span = penult.text_span()
    emoticon_span = last.text_span()
    penult.span = own_span.merge(emoticon_span)

    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")

    retarget(doc, previous_id, penult)
def _actually_merge(tcache, edus, doc):
    """
    Replace a collection of EDUs in the document with a single merged EDU.

    The merged EDU is a deep copy of the first EDU, widened to the merge of
    all the EDUs' text spans, and stamped with `_AUTHOR` and the timestamp
    the cache returns for that span.

    In the 'units' stage, its type and features are combined from all of
    the EDUs: if there is exactly one distinct non-None value it is kept
    as is, otherwise the distinct values are joined with "/" behind
    `_MERGE_PREFIX`. Values (and newly added feature keys) are taken in
    order of first appearance in `edus`, so the result depends only on the
    inputs and not on set iteration order.

    The merged EDU takes the place of the first unit in `doc.units` that is
    one of the EDUs; any of the EDUs still in `doc.units` are then removed,
    and `retarget` is called for each old EDU so that anything which
    pointed at it can be pointed at the merged EDU instead.

    Does nothing if `edus` is empty. Returns None.

    NOTE(review): earlier notes on this function say that anything pointing
    exclusively to EDUs in the span should be deleted (or signaled?); that
    is not done here beyond whatever `retarget` does.
    """
    def unique(items):
        "Drop repeats, keeping the order of first appearance"
        seen = set()
        kept = []
        for item in items:
            if item not in seen:
                seen.add(item)
                kept.append(item)
        return kept

    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        strs = unique(x for x in strs if x is not None)
        if len(strs) == 1:
            return strs[0]
        return _MERGE_PREFIX + "/".join(strs)

    if not edus:
        return

    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(x.type for x in edus)
        # feature keys for all edus; an edu lacking a key contributes None,
        # which one_or_join ignores
        all_keys = unique(key for edu in edus for key in edu.features)
        for key in all_keys:
            new_edu.features[key] = one_or_join(x.features.get(key)
                                                for x in edus)

    # in-place replacement: the merged edu goes where the first of the old
    # edus was, so that the ordering of doc.units is otherwise undisturbed
    for i, unit in enumerate(doc.units):
        if unit in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
def _actually_split(tcache, doc, spans, edu):
    """
    Replace an EDU with one copy of it per span.

    Each copy gets its timestamp from the cache entry for its span, and its
    identifier is built from `_AUTHOR` and that timestamp; the aim is for
    the same new EDU to receive the same new ID across all sections. In
    the 'units' stage the type and every feature value of each copy are
    prefixed with `_SPLIT_PREFIX`. The copies are appended to `doc.units`
    in sorted span order and the original EDU is removed.

    Discourse stage: if the EDU is in any relations or CDUs, references to
    it are to be replaced with a new CDU encompassing the newly created
    EDUs. The CDU is always built and passed to `retarget`, but it is only
    added to `doc.schemas` when `retarget` returns a true value.
    """
    def split_copy(span):
        "Clone the EDU for one span; return (new id, clone)"
        stamp = tcache.get(span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            for key in piece.features:
                piece.features[key] = _SPLIT_PREFIX + piece.features[key]
        piece.span = span
        return piece_id, piece

    pieces = {}
    for span in sorted(spans):
        piece_id, piece = split_copy(span)
        pieces[piece_id] = piece
        doc.units.append(piece)

    # the CDU is stamped from the cache entry for the merged span
    whole_stamp = tcache.get(Span.merge_all(spans))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(whole_stamp)}
    cdu = educe.annotation.Schema(
        anno_id_from_tuple((_AUTHOR, whole_stamp)),
        frozenset(pieces),
        frozenset(),
        frozenset(),
        'Complex_discourse_unit',
        {},
        metadata=cdu_metadata)
    cdu.fleshout(pieces)

    cdu_is_wanted = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if cdu_is_wanted:
        doc.schemas.append(cdu)
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU into one new EDU per span.

    Every new EDU is a deep copy of the original, restamped with `_AUTHOR`
    and the timestamp the cache returns for its span. Its ID is derived
    from that same author/timestamp pair, with the aim of generating the
    same new ID for the same new EDU across all sections. In the 'units'
    stage, the type and each feature value of the copy are marked with
    `_SPLIT_PREFIX`. The new EDUs are added to `doc.units` (sorted by
    span) and the original EDU is removed from it.

    Discourse stage: a CDU grouping the new EDUs is built so that any
    relations or CDUs which referred to the original EDU can refer to it
    instead. `retarget` is asked to do the replacement, and the CDU is
    appended to `doc.schemas` only if `retarget` reports a true value.
    """
    fragments = {}
    for fragment_span in sorted(spans):
        fragment_stamp = tcache.get(fragment_span)
        fragment = copy.deepcopy(edu)
        fragment_id = anno_id_from_tuple((_AUTHOR, fragment_stamp))

        set_anno_date(fragment, fragment_stamp)
        set_anno_author(fragment, _AUTHOR)

        if doc.origin.stage == 'units':
            fragment.type = _SPLIT_PREFIX + fragment.type
            for name in fragment.features:
                old_value = fragment.features[name]
                fragment.features[name] = _SPLIT_PREFIX + old_value

        fragment.span = fragment_span
        fragments[fragment_id] = fragment
        doc.units.append(fragment)

    # timestamp for the CDU comes from the span covering all the pieces
    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    member_ids = frozenset(fragments)
    cdu = educe.annotation.Schema(cdu_id,
                                  member_ids,
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata={
                                      'author': _AUTHOR,
                                      'creation-date': str(cdu_stamp),
                                  })
    cdu.fleshout(fragments)

    keep_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if keep_cdu:
        doc.schemas.append(cdu)