def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document in the corpus selected by `args`: work on a deep
    copy, split the EDU designated by `args.spans`, print a small diff of
    the change to stderr and save the modified copy to the output
    directory. Unless `args.no_commit_msg` is set, a suggested commit
    message is printed for the last document processed.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    last_commit = None
    for key in corpus:
        before = corpus[key]
        after = copy.deepcopy(before)  # never touch the corpus copy
        whole_span = Span.merge_all(args.spans)
        _split_edu(tcache, key, after, args.spans)

        report = "\n".join(_mini_diff(key, before, after, whole_span))
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, after)

        # remembered for commit message generation; each iteration
        # overwrites the previous one, so only the last document counts
        last_commit = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=before,
                                 after=after,
                                 span=whole_span)

    show_message = bool(last_commit) and not args.no_commit_msg
    if show_message:
        print("-----8<------")
        print(commit_msg(last_commit))
    announce_output_dir(output_dir)
def merge_turn_stars(doc):
    """Return a copy of the document in which consecutive turns
    by the same speaker have been merged.

    Within each dialogue, the turns are ordered by span and grouped into
    runs of consecutive turns sharing a speaker. The first turn of each
    run is stretched to cover the whole run; the remaining turns are
    removed from the document's units and their prefix text (turn number
    and speaker) is blanked out of the document text.

    The input document is left untouched; all changes are made on a deep
    copy, which is returned.
    """
    def by_span(anno):
        "sort key: the text span of an annotation"
        return anno.text_span()

    def prefix_span(turn):
        "given a turn annotation, return the span of its prefix"
        turn_span = turn.text_span()
        prefix, _ = split_turn_text(doc.text(turn_span))
        begin = turn_span.char_start
        return begin, begin + len(prefix)

    # from here on `doc` (also as seen by prefix_span) is the working copy
    doc = copy.deepcopy(doc)
    dialogues = sorted((u for u in doc.units if is_dialogue(u)),
                       key=by_span)

    absorbed = []  # turns that were folded into a preceding turn
    for dialogue in dialogues:
        ordered_turns = sorted(turns_in_span(doc, dialogue.text_span()),
                               key=by_span)
        for _, run in itr.groupby(ordered_turns, anno_speaker):
            run = list(run)
            head, tail = run[0], run[1:]
            head.span = Span.merge_all(t.text_span() for t in run)
            absorbed.extend(tail)
            for turn in tail:
                doc.units.remove(turn)

    # pylint: disable=protected-access
    doc._text = _blank_out(doc._text, [prefix_span(t) for t in absorbed])
    # pylint: enable=protected-access
    return doc
def _enclosing_turn_span(doc, span):
    """
    Return the span for any turn annotations that enclose this span.
    If none are found, return the span itself

    The result is the merge of `span` with the text span of every unit in
    `doc.units` that is a turn and whose text span encloses `span`.
    """
    covering = [span]
    for unit in doc.units:
        if educe.stac.is_turn(unit) and unit.text_span().encloses(span):
            covering.append(unit.text_span())
    return Span.merge_all(covering)
def _recompute_spans(tree, context):
    """
    Recalculate tree node spans from the bottom up
    (helper for _align_with_context)

    Anything that is not a `Tree` is left alone. For a `Tree`, the
    children are processed first, then the node's span is set to the
    merge of its children's spans and its context to `context`.
    """
    if not isinstance(tree, Tree):
        return

    child_spans = []
    for subtree in tree:
        # children first, so that their spans are up to date when read
        _recompute_spans(subtree, context)
        child_spans.append(_tree_span(subtree))

    merged = Span.merge_all(child_spans)
    treenode(tree).span = merged
    treenode(tree).context = context
def _split_edu(tcache, k, doc, spans):
    """
    Find the edu covered by these spans and do the split

    The EDU sought is the unit whose text span equals the merge of
    `spans`. If one is found, the first match is split. If none is found
    the outcome depends on the stage of the key `k`: in the 'discourse'
    stage `_tweak_presplit` is called instead; in any other stage a
    message is printed to stderr and nothing is changed.
    """
    target = Span.merge_all(spans)
    candidates = [unit for unit in doc.units
                  if unit.text_span() == target
                  and educe.stac.is_edu(unit)]

    if candidates:
        _actually_split(tcache, doc, spans, candidates[0])
    elif k.stage == 'discourse':
        _tweak_presplit(tcache, doc, spans)
    else:
        print("No matches found in %s" % k, file=sys.stderr)
def _split_edu(tcache, k, doc, spans):
    """
    Find the edu covered by these spans and do the split

    An EDU matches when its text span is exactly the merge of `spans`.
    With a match, the first one is handed to `_actually_split`. Without
    one, documents in the 'discourse' stage (as given by `k.stage`) go
    through `_tweak_presplit`, and for every other stage a "no matches"
    message is written to stderr.
    """
    wanted = Span.merge_all(spans)
    found = [anno for anno in doc.units
             if anno.text_span() == wanted
             and educe.stac.is_edu(anno)]

    if found:
        _actually_split(tcache, doc, spans, found[0])
        return
    if k.stage == 'discourse':
        _tweak_presplit(tcache, doc, spans)
        return
    print("No matches found in %s" % k, file=sys.stderr)
def _actually_merge(tcache, edus, doc): """ Given a timestamp cache, a document and a collection of edus, replace the edus with a single merged edu in the document Anything that points to one of the EDUs should point instead to the new edu. Anything which points exclusively to EDUs in the span should be deleted (or signaled?) Annotations and features should be merged """ def one_or_join(strs): "Return element if singleton, otherwise moosh together" strs = [x for x in strs if x is not None] return list(strs)[0] if len(strs) == 1\ else _MERGE_PREFIX + "/".join(strs) if not edus: return new_edu = copy.deepcopy(edus[0]) new_edu.span = Span.merge_all(x.text_span() for x in edus) stamp = tcache.get(new_edu.span) set_anno_date(new_edu, stamp) set_anno_author(new_edu, _AUTHOR) if doc.origin.stage == 'units': new_edu.type = one_or_join(frozenset(x.type for x in edus)) # feature keys for all edus all_keys = frozenset(x for edu in edus for x in edu.features.keys()) for key in all_keys: old_values = frozenset(x.features.get(key) for x in edus) new_edu.features[key] = one_or_join(old_values) # in-place replacement for i, _ in enumerate(doc.units): if doc.units[i] in edus: doc.units[i] = new_edu break for edu in edus: if edu in doc.units: doc.units.remove(edu) retarget(doc, edu.local_id(), new_edu)
def _merge_edus(tcache, span, doc):
    """
    Find any EDUs within the given span in the document
    and merge them into a single one.

    The EDUs should stretch from the beginning to the end of
    the span (gaps OK).

    The output EDU should have the same ID in all documents

    Exits the program (via `sys.exit`) if there are no EDUs in the span,
    or if the merge of their spans is not equal to `span`.
    """
    found = edus_in_span(doc, span)
    if not found:
        sys.exit("No EDUs in span %s" % span)

    covered = Span.merge_all(edu.text_span() for edu in found)
    if covered != span:
        complaint = "EDUs in do not cover full span %s [only %s]"
        sys.exit(complaint % (span, covered))

    _actually_merge(tcache, found, doc)
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the
    same new EDU across all sections

    Discourse stage: If the EDU is in any relations or CDUs,
    replace any references to it with a new CDU encompassing
    the newly created EDUs

    One new EDU is created per span (in sorted span order) as a deep copy
    of `edu`, stamped with the cached timestamp for that span. In the
    'units' stage the copy's type and feature values get the split
    prefix. The original EDU is removed from the document; the new CDU is
    only added to the document's schemas if `retarget` returns a true
    value.
    """
    pieces = {}  # new annotation id -> new EDU
    for piece_span in sorted(spans):
        stamp = tcache.get(piece_span)
        piece = copy.deepcopy(edu)
        set_anno_date(piece, stamp)
        set_anno_author(piece, _AUTHOR)
        if doc.origin.stage == 'units':
            piece.type = _SPLIT_PREFIX + piece.type
            for key, value in list(piece.features.items()):
                piece.features[key] = _SPLIT_PREFIX + value
        piece.span = piece_span
        doc.units.append(piece)
        pieces[anno_id_from_tuple((_AUTHOR, stamp))] = piece

    # CDU grouping the pieces, stamped from the span covering all of them
    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu_id = anno_id_from_tuple((_AUTHOR, cdu_stamp))
    cdu_metadata = {'author': _AUTHOR,
                    'creation-date': str(cdu_stamp)}
    cdu = educe.annotation.Schema(cdu_id,
                                  frozenset(pieces),
                                  frozenset(),
                                  frozenset(),
                                  'Complex_discourse_unit',
                                  {},
                                  metadata=cdu_metadata)
    cdu.fleshout(pieces)

    needs_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if needs_cdu:
        doc.schemas.append(cdu)
def _actually_split(tcache, doc, spans, edu):
    """
    Split the EDU, trying to generate the same new ID for the
    same new EDU across all sections

    Discourse stage: If the EDU is in any relations or CDUs,
    replace any references to it with a new CDU encompassing
    the newly created EDUs

    Each span (taken in sorted order) yields a deep copy of `edu` with
    that span, dated with the timestamp the cache returns for the span.
    In the 'units' stage the type and every feature value of the copy are
    marked with the split prefix. Afterwards the original EDU is removed
    from the units, and the CDU built over the copies is appended to the
    schemas only when `retarget` returns a true value.
    """
    replacements = {}  # new annotation id -> new EDU
    for sub_span in sorted(spans):
        stamp = tcache.get(sub_span)
        fragment = copy.deepcopy(edu)
        fragment_id = anno_id_from_tuple((_AUTHOR, stamp))
        set_anno_date(fragment, stamp)
        set_anno_author(fragment, _AUTHOR)
        if doc.origin.stage == 'units':
            fragment.type = _SPLIT_PREFIX + fragment.type
            for name in list(fragment.features):
                fragment.features[name] = \
                    _SPLIT_PREFIX + fragment.features[name]
        fragment.span = sub_span
        replacements[fragment_id] = fragment
        doc.units.append(fragment)

    # the CDU is stamped from the span covering all of the fragments
    cdu_stamp = tcache.get(Span.merge_all(spans))
    cdu = educe.annotation.Schema(
        anno_id_from_tuple((_AUTHOR, cdu_stamp)),
        frozenset(replacements),
        frozenset(),
        frozenset(),
        'Complex_discourse_unit',
        {},
        metadata={'author': _AUTHOR,
                  'creation-date': str(cdu_stamp)})
    cdu.fleshout(replacements)

    keep_cdu = retarget(doc, edu.local_id(), cdu)
    doc.units.remove(edu)
    if keep_cdu:
        doc.schemas.append(cdu)
def _nudge_down(turn, dialogue, prev_turn, next_dialogue): """ Move last turn to next dialogue. (ie. shorten the right boundary of this dialogue and extend the left boundary of this dialogue) Return encompassing span to show what we've changed """ if not prev_turn: sys.exit("Can't move very first turn. " "Try `stac-util merge-dialogue` instead") elif not next_dialogue: sys.exit("Can't move from last dialogue." "Try `stac-util move` instead") elif turn.span.char_end != dialogue.span.char_end: sys.exit("Turn %d %s is not at the end of its dialogue %s" % (st.turn_id(turn), turn.span, dialogue.span)) offset = prev_turn.span.char_end - turn.span.char_end # take both dialogue boundaries down a bit (to next turn end) next_dialogue.span.char_start += offset dialogue.span.char_end += offset return Span.merge_all([dialogue.span, next_dialogue.span])
def _nudge_up(turn, dialogue, next_turn, prev_dialogue): """ Move first turn to previous dialogue (ie. extend the previous dialogue to incorporate this turn, and push this dialogue to exclude it) Return encompassing span to show what we've changed """ if not next_turn: sys.exit("Can't move very last turn. " "Try `stac-util merge-dialogue` instead") elif not prev_dialogue: sys.exit("Can't move from first dialogue. " "Try `stac-util move` instead") elif turn.span.char_start - 1 != dialogue.span.char_start: sys.exit("Turn %d %s is not at the start of its dialogue %s" % (st.turn_id(turn), turn.span, dialogue.span)) offset = next_turn.span.char_start - turn.span.char_start # take both dialogue boundaries up a bit (to prev turn end) prev_dialogue.span.char_end += offset dialogue.span.char_start += offset return Span.merge_all([prev_dialogue.span, dialogue.span])
def _nudge_up(turn, dialogue, next_turn, prev_dialogue): """ Move first turn to previous dialogue (ie. extend the previous dialogue to incorporate this turn, and push this dialogue to exclude it) Return encompassing span to show what we've changed """ if not next_turn: sys.exit("Can't move very last turn. " "Try `stac-util merge-dialogue` instead") elif not prev_dialogue: sys.exit("Can't move from first dialogue." "Try `stac-util move` instead") elif turn.span.char_start - 1 != dialogue.span.char_start: sys.exit("Turn %d %s is not at the start of its dialogue %s" % (st.turn_id(turn), turn.span, dialogue.span)) offset = next_turn.span.char_start - turn.span.char_start # take both dialogue boundaries up a bit (to prev turn end) prev_dialogue.span.char_end += offset dialogue.span.char_start += offset return Span.merge_all([prev_dialogue.span, dialogue.span])