def _tweak_presplit(tcache, doc, spans):
    """
    Handle the case where the split was already done manually
    (in the discourse section).

    For each span (visited in sorted order) the first unit of
    ``doc.units`` that covers exactly that span and passes
    ``educe.stac.is_edu`` is restamped with the date from the timestamp
    cache and the ``_AUTHOR`` author. Relation endpoints and schema
    members that referred to the old id are then rewritten to the new one.

    Raises ``Exception`` if some span has no matching EDU.

    Note that ``units`` on every schema in the document is replaced
    by a ``set``, whether or not any of its members were renamed.
    """
    id_map = {}
    for target in sorted(spans):
        candidates = []
        for unit in doc.units:
            if unit.text_span() == target and educe.stac.is_edu(unit):
                candidates.append(unit)
        if not candidates:
            raise Exception("No matches found for %s in %s" %
                            (target, doc.origin))
        chosen = candidates[0]
        # read the current id before the date/author are overwritten
        previous_id = chosen.local_id()
        fresh_id = anno_id_from_tuple((_AUTHOR, tcache.get(target)))
        set_anno_date(chosen, tcache.get(target))
        set_anno_author(chosen, _AUTHOR)
        id_map[previous_id] = fresh_id

    # relation endpoints
    for rel in doc.relations:
        for endpoint in ("t1", "t2"):
            current = getattr(rel.span, endpoint)
            if current in id_map:
                setattr(rel.span, endpoint, id_map[current])

    # schema members
    for schema in doc.schemas:
        updated = set(schema.units)
        for member in schema.units:
            if member not in id_map:
                continue
            updated.remove(member)
            updated.add(id_map[member])
        schema.units = updated
def _tweak_presplit(tcache, doc, spans):
    """
    Deal with a split that was already made by hand
    (in the discourse section).

    Rather than creating new EDUs, the existing EDU found at each span
    is given the date from the timestamp cache and the ``_AUTHOR``
    author; references to its former id in the document's relations
    and schemas are updated to the id built from that author and date.

    Raises ``Exception`` when no EDU in ``doc.units`` covers a span.
    Every schema's ``units`` ends up as a ``set``.
    """

    def edus_at(wanted):
        "EDUs of the document whose text span is exactly `wanted`"
        return [unit for unit in doc.units
                if unit.text_span() == wanted and educe.stac.is_edu(unit)]

    renamed = {}
    for span in sorted(spans):
        found = edus_at(span)
        if not found:
            raise Exception("No matches found for %s in %s" %
                            (span, doc.origin))
        edu = found[0]
        # the old id has to be read before the edu is restamped
        before = edu.local_id()
        after = anno_id_from_tuple((_AUTHOR, tcache.get(span)))
        set_anno_date(edu, tcache.get(span))
        set_anno_author(edu, _AUTHOR)
        renamed[before] = after

    for rel in doc.relations:
        if rel.span.t1 in renamed:
            rel.span.t1 = renamed[rel.span.t1]
        if rel.span.t2 in renamed:
            rel.span.t2 = renamed[rel.span.t2]

    for schema in doc.schemas:
        members = set(schema.units)
        for old in schema.units:
            if old in renamed:
                # swap the stale id for its replacement
                members.remove(old)
                members.add(renamed[old])
        schema.units = members
def absorb_emoticon(doc, stamp, penult, last):
    """
    Absorb an emoticon-only EDU into the EDU that precedes it.

    `penult` is the second to last EDU in a turn annotation and `last`
    is the emoticon-only EDU that follows it. The span of `penult` is
    widened to also cover `last`, and `penult` is restamped with
    `stamp` as its date and "stacutil" as its author.

    `last` itself is not touched here, and nothing is returned.

    Because restamping gives `penult` a new id, `retarget` is called so
    that anything in the document which pointed at the old id can be
    redirected to the updated annotation.
    """
    # remember the id as it was before the date/author change
    previous_id = penult.local_id()

    widened = penult.text_span().merge(last.text_span())
    penult.span = widened

    set_anno_date(penult, stamp)
    set_anno_author(penult, "stacutil")

    retarget(doc, previous_id, penult)
def _actually_merge(tcache, edus, doc):
    """
    Given a timestamp cache, a collection of edus and a document,
    replace the edus with a single merged edu in the document.

    The merged edu starts as a deep copy of the first edu; its span is
    the merge of all the edus' text spans and it is stamped with the
    date the cache gives for that span and with the ``_AUTHOR`` author.

    In the 'units' stage, the type and every feature found on any of
    the edus are merged: a value shared by all edus is kept as is,
    otherwise the distinct values are joined with "/" (in sorted order,
    so that the result is the same from one run to the next) and
    prefixed with ``_MERGE_PREFIX``.

    The merged edu takes the place of the first of the edus found in
    ``doc.units``; the edus are then removed and ``retarget`` is called
    for each of them so that references to the old ids can be pointed
    at the merged edu.

    Does nothing (and returns None) if `edus` is empty.
    """

    def one_or_join(strs):
        "Return element if singleton, otherwise moosh together"
        # Callers pass frozensets, whose iteration order for strings is
        # not stable across interpreter runs; sort to get a
        # reproducible label.
        present = sorted(x for x in strs if x is not None)
        if len(present) == 1:
            return present[0]
        return _MERGE_PREFIX + "/".join(present)

    if not edus:
        return

    new_edu = copy.deepcopy(edus[0])
    new_edu.span = Span.merge_all(x.text_span() for x in edus)
    stamp = tcache.get(new_edu.span)
    set_anno_date(new_edu, stamp)
    set_anno_author(new_edu, _AUTHOR)

    if doc.origin.stage == 'units':
        new_edu.type = one_or_join(frozenset(x.type for x in edus))
        # feature keys for all edus
        all_keys = frozenset(key for edu in edus for key in edu.features)
        # sorted so that keys new to the merged edu are always inserted
        # in the same order
        for key in sorted(all_keys):
            old_values = frozenset(x.features.get(key) for x in edus)
            new_edu.features[key] = one_or_join(old_values)

    # in-place replacement: the merged edu takes the slot of the first
    # of the old edus encountered in the document
    for i, unit in enumerate(doc.units):
        if unit in edus:
            doc.units[i] = new_edu
            break

    for edu in edus:
        if edu in doc.units:
            doc.units.remove(edu)
        retarget(doc, edu.local_id(), new_edu)
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU into one new EDU per span, trying to generate the
    same new ID for the same new EDU across all sections.

    Each new EDU is a deep copy of the original, stamped with the
    date the timestamp cache gives for its span and with the
    ``_AUTHOR`` author, and appended to ``doc.units``. In the 'units'
    stage its type and all of its feature values are additionally
    prefixed with ``_SPLIT_PREFIX``. The original EDU is removed from
    ``doc.units``.

    Discourse stage: if the EDU is in any relations or CDUs, replace
    any references to it with a new CDU encompassing the newly created
    EDUs. The CDU is only added to ``doc.schemas`` when ``retarget``
    returns a true value.
    """
    pieces = {}
    for piece_span in sorted(spans):
        piece_stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        piece_id = anno_id_from_tuple((_AUTHOR, piece_stamp))
        set_anno_date(piece, piece_stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            for key, value in piece.features.items():
                piece.features[key] = _SPLIT_PREFIX + value
        pieces[piece_id] = piece
        piece.span = piece_span
        doc.units.append(piece)

    # a CDU wrapping all the pieces, stamped for the merged span
    whole_span = Span.merge_all(spans)
    cdu_stamp = tcache.get(whole_span)
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    cdu_metadata = {
        'author': _AUTHOR,
        'creation-date': str(cdu_stamp),
    }
    # NOTE(review): the two empty frozensets and the empty dict are
    # presumably the CDU's relation members, schema members and
    # features; confirm against educe.annotation.Schema
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    want_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
def _actually_split(tcache, doc, spans, edu):
    """
    Replace an EDU by a series of smaller EDUs, one for each of the
    given spans, trying to generate the same new ID for the same new
    EDU across all sections.

    The new EDUs are deep copies of the original which differ in their
    span, date (taken from the timestamp cache) and author
    (``_AUTHOR``); when the document is in the 'units' stage, their
    type and feature values also gain the ``_SPLIT_PREFIX`` prefix.
    They are added to ``doc.units`` and the original is removed.

    Discourse stage: if the EDU is in any relations or CDUs, replace
    any references to it with a new CDU encompassing the newly created
    EDUs. Whether the CDU is kept in ``doc.schemas`` depends on the
    value returned by ``retarget``.
    """
    by_id = {}
    for span in sorted(spans):
        stamp = tcache.get(span)
        fragment = copy.deepcopy(edu)
        fragment_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(fragment, stamp)
        set_anno_author(fragment, _AUTHOR)

        if doc.origin.stage == 'units':
            # mark the copied type and feature values as resulting
            # from a split
            fragment.type = _SPLIT_PREFIX + fragment.type
            feats = fragment.features
            for name in feats:
                feats[name] = _SPLIT_PREFIX + feats[name]

        by_id[fragment_id] = fragment
        fragment.span = span
        doc.units.append(fragment)

    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu = educe.annotation.Schema(
        anno_id_from_tuple((_AUTHOR, cdu_stamp)),
        frozenset(by_id),  # ids of the fragments
        frozenset(),
        frozenset(),
        'Complex_discourse_unit',
        {},
        metadata={'author': _AUTHOR,
                  'creation-date': str(cdu_stamp)})
    cdu.fleshout(by_id)

    # the old id must be retargeted while the EDU is still in the doc
    old_id = edu.local_id()
    want_cdu = retarget(doc, old_id, cdu)
    doc.units.remove(edu)
    if want_cdu:
        doc.schemas.append(cdu)
def _set(tcache, span, anno):
    """
    Stamp an annotation according to the timestamp cache and move it
    onto the given span.

    The annotation's date becomes whatever the cache returns for
    `span`, its author becomes ``_AUTHOR``, and its ``span`` attribute
    is overwritten with `span`. Nothing is returned.
    """
    set_anno_date(anno, tcache.get(span))
    set_anno_author(anno, _AUTHOR)
    anno.span = span