def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For each document of the source corpus, move the portion of text
    delimited by `args.span` into the `args.target` subdocument, print a
    diff of both documents on stderr and save the two modified documents
    in the output directory.

    Raises ValueError if the target subdocument is not in the target
    corpus, or if the span neither starts at offset 0 nor ends at the
    end of the source text.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    start = args.span.char_start
    end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # the target key is a copy of the source key pointing at the
        # requested subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]

        # only spans anchored at the very start or the very end of the
        # source document are supported
        if start != 0 and end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if start == 0:
            # move up
            src_after, tgt_after = move_portion(
                renames, src_doc, tgt_doc,
                end,  # src_split
                tgt_split=-1)
        else:
            # move down
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:],
            # so we detach src_doc[start:] into a temporary doc,
            # then call move_portion on this temporary doc
            src_after, detached = split_doc(src_doc, start)
            _, tgt_after = move_portion(
                renames, detached, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between the kept part and the detached part
            # went to the detached part, so a new whitespace is appended
            # to the kept part
            evil_set_text(src_after, src_after.text() + ' ')

        # print diff for suggested commit message
        to_banner = "======= TO %s ========" % tgt_k
        tgt_diff = show_diff(tgt_doc, tgt_after)
        from_banner = "^------ FROM %s" % src_k
        src_diff = show_diff(src_doc, src_after)
        report = [to_banner, tgt_diff, from_banner, src_diff, ""]
        print("\n".join(report), file=sys.stderr)

        # dump the modified documents
        save_document(output_dir, src_k, src_after)
        save_document(output_dir, tgt_k, tgt_after)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For each key of the source corpus, move the portion of text
    delimited by `args.span` into the `args.target` subdocument, print a
    diff of both documents on stderr and save the two modified documents
    in the output directory.

    Exits through `sys.exit` with a message if the target subdocument is
    not in the target corpus, or if the span neither starts at offset 0
    nor ends at the end of the source document's text span.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    start = args.span.char_start
    end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k in src_corpus:
        # the target key is a copy of the source key pointing at the
        # requested subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_k)

        src_doc = src_corpus[src_k]
        tgt_doc = tgt_corpus[tgt_k]

        # only spans anchored at the very start or the very end of the
        # source document are supported
        from_start = start == 0
        if not from_start and end != src_doc.text_span().char_end:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        if from_start:
            src_after, tgt_after = move_portion(
                renames, src_doc, tgt_doc,
                src_split=end, tgt_split=-1)
        else:
            # detach the tail of the source document and move that
            # temporary document as a whole
            src_after, detached = split_doc(src_doc, start)
            _, tgt_after = move_portion(
                renames, detached, tgt_doc,
                src_split=-1, tgt_split=0)

        to_banner = "======= TO %s ========" % tgt_k
        tgt_diff = show_diff(tgt_doc, tgt_after)
        from_banner = "^------ FROM %s" % src_k
        src_diff = show_diff(src_doc, src_after)
        report = [to_banner, tgt_diff, from_banner, src_diff, ""]
        print("\n".join(report), file=sys.stderr)

        save_document(output_dir, src_k, src_after)
        save_document(output_dir, tgt_k, tgt_after)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Corpus keys are grouped into families sharing the same
    `(doc, subdoc)` pair. For each family, the turns to fix are computed
    from the 'discourse' stage document, then `merge_final_emoticons` is
    applied to a deep copy of every document in the family and the copy
    is saved to the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # group keys by (doc, subdoc) and remember each family's discourse key
    members = collections.defaultdict(list)
    discourse_key = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        members[family].append(key)
        if key.stage == 'discourse':
            discourse_key[family] = key

    for family in sorted(members):
        siblings = members[family]
        print(family_banner(family[0], family[1], siblings))
        disc_k = discourse_key[family]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])

        notes = []
        if warn_turns:
            notes.append("Note: These turns have emoticon-only EDUs that "
                         "I dare not touch because they either "
                         "participate in relations or CDUs: ")
            for turn in warn_turns:
                notes.append(" " + disc_doc.text(turn.text_span()))
            notes.append("If the "
                         "relations can be removed, or the CDUs reduced, "
                         "please do this by hand and re-run the script:")
        if not turns:
            notes.append("Skipping %s (and related); no offending emoticons"
                         % disc_k)
        print("\n".join(notes))
        if not turns:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for key in siblings:
            # work on a copy so that the original stays available for
            # the diff below
            fixed_doc = copy.deepcopy(corpus[key])
            merge_final_emoticons(tcache, turn_spans, fixed_doc,
                                  postags[key])
            # the diff is only shown for the discourse stage document
            if key == disc_k:
                for turn_span in turn_spans:
                    print(show_diff(corpus[key], fixed_doc, span=turn_span))
                    print()
            save_document(output_dir, key, fixed_doc)
        tcache.reset()
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Corpus keys are grouped into families sharing the same
    `(doc, subdoc)` pair. For each family, the turns to fix are computed
    from the 'discourse' stage document, then `merge_final_emoticons` is
    applied to a deep copy of every document in the family and the copy
    is saved to the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # group keys by (doc, subdoc) and remember each family's discourse key
    by_family = collections.defaultdict(list)
    disc_keys = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        by_family[family].append(key)
        if key.stage == 'discourse':
            disc_keys[family] = key

    for family in sorted(by_family):
        family_keys = by_family[family]
        print(family_banner(family[0], family[1], family_keys))
        disc_k = disc_keys[family]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])

        messages = []
        if warn_turns:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            messages.extend(" " + disc_doc.text(turn.text_span())
                            for turn in warn_turns)
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")
        nothing_to_do = not turns
        if nothing_to_do:
            messages.append(
                "Skipping %s (and related); no offending emoticons" % disc_k)
        print("\n".join(messages))
        if nothing_to_do:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for key in family_keys:
            # work on a copy so that the original stays available for
            # the diff below
            patched = copy.deepcopy(corpus[key])
            merge_final_emoticons(tcache, turn_spans, patched, postags[key])
            # the diff is only shown for the discourse stage document
            if key == disc_k:
                for turn_span in turn_spans:
                    print(show_diff(corpus[key], patched, span=turn_span))
                    print()
            save_document(output_dir, key, patched)
        tcache.reset()
    announce_output_dir(output_dir)
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the split at a
    turn affected the text

    Both documents are narrowed to `span` before being diffed. The
    result is a list made of a banner mentioning `args.turn` and `k`,
    the diff surrounded by "..." lines, and a trailing empty string.
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner = "======= SPLIT AT TURN {} in {} ========".format(args.turn, k)
    diff = show_diff(before, after)
    return [banner, "...", diff, "...", ""]
def _mini_diff(k, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the EDU merge
    affected the text

    Both documents are narrowed to `span` before being diffed. The
    result is a list made of a banner mentioning `k`, the diff
    surrounded by "..." lines, and a trailing empty string.
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner = "======= MERGE EDUS %s ========" % (k)
    diff = show_diff(before, after)
    return [banner, "...", diff, "...", ""]
def _mini_diff(k, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the EDU split
    affected the text

    Both documents are narrowed to `span` before being diffed. The
    result is a list made of a banner mentioning `k`, the diff
    surrounded by "..." lines, and a trailing empty string.
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    banner = "======= SPLIT EDU %s ========" % (k)
    diff = show_diff(before, after)
    return [banner, "...", diff, "...", ""]
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the split at a
    turn affected the text

    Both documents are narrowed to `span` before being diffed. The
    result is a list made of a banner mentioning `args.turn` and `k`,
    the diff surrounded by "..." lines, and a trailing empty string.
    """
    narrowed_old = narrow_to_span(old_doc, span)
    narrowed_new = narrow_to_span(new_doc, span)
    lines = ["======= SPLIT AT TURN {} in {} ========".format(args.turn, k)]
    lines.append("...")
    lines.append(show_diff(narrowed_old, narrowed_new))
    lines.append("...")
    lines.append("")
    return lines
def _mini_diff(k, args, old_doc, new_doc, span):
    """
    Return lines of text to be printed out, showing how the nudge
    affected the text

    Both documents are narrowed to `span` before being diffed. The
    result is a list made of a banner mentioning `args.turn`,
    `args.direction` and `k`, the diff surrounded by "..." lines, and a
    trailing empty string.
    """
    before = narrow_to_span(old_doc, span)
    after = narrow_to_span(new_doc, span)
    # args.turn goes through %d, so it has to be a number
    banner = ("======= NUDGE TURN %d %s in %s ========"
              % (args.turn, args.direction, k))
    diff = show_diff(before, after)
    return [banner, "...", diff, "...", ""]
def _mini_diff(k, old_doc_span, new_doc_span):
    """
    Return lines of text to be printed out, showing how the nudge
    affected the text

    `old_doc_span` and `new_doc_span` are (document, span) pairs. Both
    documents are narrowed to the span returned by
    `_enclosing_turn_span` for the old document and old span, then
    diffed. The result is a list made of a banner mentioning both spans
    and `k`, the diff surrounded by "..." lines, and a trailing empty
    string.
    """
    doc_before, span_before = old_doc_span
    doc_after, span_after = new_doc_span
    # the same window, computed from the old document, is applied to
    # both documents
    window = _enclosing_turn_span(doc_before, span_before)
    before = narrow_to_span(doc_before, window)
    after = narrow_to_span(doc_after, window)
    banner = ("======= NUDGE %s to %s in %s ========"
              % (span_before, span_after, k))
    diff = show_diff(before, after)
    return [banner, "...", diff, "...", ""]
def _mini_diff(k, old_doc_span, new_doc_span):
    """
    Return lines of text to be printed out, showing how the nudge
    affected the text

    `old_doc_span` and `new_doc_span` are (document, span) pairs. Both
    documents are narrowed to the span returned by
    `_enclosing_turn_span` for the old document and old span, then
    diffed. The result is a list made of a banner mentioning both spans
    and `k`, the diff surrounded by "..." lines, and a trailing empty
    string.
    """
    prev_doc, prev_span = old_doc_span
    next_doc, next_span = new_doc_span
    # the same window, computed from the old document, is applied to
    # both documents
    turn_span = _enclosing_turn_span(prev_doc, prev_span)
    narrowed_prev = narrow_to_span(prev_doc, turn_span)
    narrowed_next = narrow_to_span(next_doc, turn_span)
    lines = ["======= NUDGE %s to %s in %s ========"
             % (prev_span, next_span, k)]
    lines.append("...")
    lines.append(show_diff(narrowed_prev, narrowed_next))
    lines.append("...")
    lines.append("")
    return lines
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    For each requested document of the corpus at `args.corpus`, replace
    the text at `args.span` with `args.sub_text`, print a diff on stderr
    and save the modified document in the output directory. Unless
    `args.no_commit_msg` is set, a commit message built from the first
    processed document is then printed on stdout.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor

    # WIP new_span, depends on the offset
    # (it only depends on the arguments, so it is computed once rather
    # than for every document)
    offset = len(sub_text) - (span.char_end - span.char_start)
    new_span = Span(span.char_start, span.char_end + offset)
    # end WIP

    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = [
            "======= REPLACE TEXT IN %s ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
        ]
        # print the text itself: encoding it first makes Python 3 print
        # the repr of a bytes object instead of the diff
        print("\n".join(diffs), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message, built from the first document only
    if not annos_before:
        # no document was selected, so there is nothing to describe
        return
    first_k, _ = next(iter(tgt_corpus.items()))
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, annos_before[0], annos_after[0]))
def main(args):
    """Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.

    For each requested document of the corpus at `args.corpus`, replace
    the text at `args.span` with `args.sub_text`, print a diff on stderr
    and save the modified document in the output directory. Unless
    `args.no_commit_msg` is set, a commit message built from the first
    processed document is then printed on stdout.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor

    # WIP new_span, depends on the offset
    # (loop-invariant: it is derived from the arguments alone)
    offset = len(sub_text) - (span.char_end - span.char_start)
    new_span = Span(span.char_start, span.char_end + offset)
    # end WIP

    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        # no .encode() here: under Python 3, printing the encoded bytes
        # would show a b'...' repr instead of the diff
        print("\n".join(diffs), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)

    # commit message, built from the first document only; skipped when
    # no document was selected (indexing an empty corpus would raise)
    if annos_before:
        first_k, _ = next(iter(tgt_corpus.items()))
        if first_k and not args.no_commit_msg:
            print("-----8<------")
            print(commit_msg(first_k, annos_before[0], annos_after[0]))
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Read the single document found in the `args.insert` directory and
    insert it, through `move_portion`, at `args.start` in each requested
    document of the corpus at `args.corpus`. A diff is printed on stderr
    and each modified document is saved in the output directory.

    Exits through `sys.exit` with a message unless the insert directory
    yields exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())
    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))
    # exactly one document at this point; dict views cannot be indexed
    # under Python 3, so take the first value through an iterator
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k, tgt_doc in tgt_corpus.items():
        # only the modified target document is of interest here
        _, new_tgt_doc = move_portion(renames, src_doc, tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = [
            "======= INSERT IN %s ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
        ]
        # print the text itself: encoding it first makes Python 3 print
        # the repr of a bytes object instead of the diff
        print("\n".join(diffs), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Read the single document found in the `args.insert` directory and
    insert it, through `move_portion`, at `args.start` in each requested
    document of the corpus at `args.corpus`. A diff is printed on stderr
    and each modified document is saved in the output directory.

    Exits through `sys.exit` with a message unless the insert directory
    yields exactly one document.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())
    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))
    # the corpus holds exactly one document here; `.values()[0]` fails
    # under Python 3 because dict views are not subscriptable
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k, tgt_doc in tgt_corpus.items():
        # only the modified target document is of interest here
        _, new_tgt_doc = move_portion(renames, src_doc, tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        # no .encode() here: under Python 3, printing the encoded bytes
        # would show a b'...' repr instead of the diff
        print("\n".join(diffs), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For each document of the source corpus, move the portion of text
    delimited by `args.span` into the `args.target` subdocument, print a
    diff of both documents on stderr and save the two modified documents
    in the output directory.

    Raises ValueError if the target subdocument is not in the target
    corpus, or if the span neither starts at offset 0 nor ends at the
    end of the source text.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    start = args.span.char_start
    end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)
    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # retrieve target subdoc: same key as the source, other subdoc
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]

        # reject spans anchored neither at the start nor at the end of
        # the source document before touching anything
        moving_up = start == 0
        if not moving_up and end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # move up
            new_src, new_tgt = move_portion(
                renames, src_doc, tgt_doc,
                end,  # src_split
                tgt_split=-1)
        else:
            # move down
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:],
            # so we detach src_doc[start:] into a temporary doc,
            # then call move_portion on this temporary doc
            new_src, tail_doc = split_doc(src_doc, start)
            _, new_tgt = move_portion(
                renames, tail_doc, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between the kept part and the detached tail
            # went to the tail, so a new whitespace is appended to the
            # kept part
            evil_set_text(new_src, new_src.text() + ' ')

        # print diff for suggested commit message
        report = ["======= TO %s ========" % tgt_k]
        report.append(show_diff(tgt_doc, new_tgt))
        report.append("^------ FROM %s" % src_k)
        report.append(show_diff(src_doc, new_src))
        report.append("")
        print("\n".join(report), file=sys.stderr)

        # dump the modified documents
        save_document(output_dir, src_k, new_src)
        save_document(output_dir, tgt_k, new_tgt)
    announce_output_dir(output_dir)