def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Every document of the corpus is deep-copied, the copy is handed to
    `_split_edu` together with `args.spans`, the lines produced by
    `_mini_diff` are written to stderr and the modified copy is saved
    under the output directory.  Unless `args.no_commit_msg` is set, a
    commit message built from the last document processed is printed to
    stdout.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    latest_commit = None
    for key, original in corpus.items():
        edited = copy.deepcopy(original)
        # single span computed from all requested spans; used for the
        # diff and for the commit message
        merged_span = Span.merge_all(args.spans)
        _split_edu(tcache, key, edited, args.spans)

        report = "\n".join(_mini_diff(key, original, edited, merged_span))
        # NOTE(review): on Python 3 this prints a bytes repr; the encode
        # looks like a Python 2 idiom -- confirm the target version
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, edited)

        # remembered for commit message generation (last document wins)
        latest_commit = CommitInfo(key=key,
                                   annotator=args.annotator,
                                   before=original,
                                   after=edited,
                                   span=merged_span)

    if latest_commit:
        if not args.no_commit_msg:
            print("-----8<------")
            print(commit_msg(latest_commit))
    announce_output_dir(output_dir)
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Each document of the annotated corpus, visited in sorted key order,
    is combined with its augmented counterpart by `_weave_docs` and the
    result is saved under the output directory.  Progress and error
    messages are written to stderr.

    Raises `KeyError` if the augmented corpus has no entry for the
    unannotated key of a document; a diagnostic naming that key is
    written to stderr before the exception propagates.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    # iterate on annotated versions
    for key, tgt_doc in sorted(corpus.items()):
        print('<== weaving {} ==>'.format(key), file=sys.stderr)  # DEBUG
        # locate augmented version
        ukey = unannotated_key(key)
        try:
            src_doc = augmented[ukey]
        except KeyError:
            # report on stderr, like every other diagnostic emitted here,
            # so the error does not end up mixed into regular output
            print('Cannot find augmented version of {}'.format(str(ukey)),
                  file=sys.stderr)
            raise
        # weave
        new_tgt_doc = _weave_docs(renames, src_doc, tgt_doc)
        save_document(output_dir, key, new_tgt_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG
    announce_output_dir(output_dir)
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Every document of the corpus is deep-copied, the copy is handed to
    `_merge_edus` together with `args.span`, the lines produced by
    `_mini_diff` are written to stderr and the modified copy is saved
    under the output directory.  After the output directory has been
    announced, a commit message built from the last document processed
    is printed to stdout unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    latest_commit = None
    for key, original in corpus.items():
        edited = copy.deepcopy(original)
        _merge_edus(tcache, args.span, edited)

        report = "\n".join(_mini_diff(key, original, edited, args.span))
        # NOTE(review): on Python 3 this prints a bytes repr; the encode
        # looks like a Python 2 idiom -- confirm the target version
        print(report.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, edited)

        # remembered for commit message generation (last document wins)
        latest_commit = CommitInfo(key=key,
                                   annotator=args.annotator,
                                   before=original,
                                   after=edited,
                                   span=args.span)

    announce_output_dir(output_dir)
    if not latest_commit or args.no_commit_msg:
        return
    print("-----8<------")
    print(commit_msg(latest_commit))
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Each document of the annotated corpus, visited in sorted key order,
    is combined with its augmented counterpart by `_weave_docs` (which
    also receives `args.gen`) and the result is saved under the output
    directory.  Progress and error messages are written to stderr.

    Raises `KeyError` if the augmented corpus has no entry for the
    unannotated key of a document; a diagnostic naming that key is
    written to stderr before the exception propagates.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)
    # iterate on annotated versions
    for key, tgt_doc in sorted(corpus.items()):
        print('<== weaving {} ==>'.format(key), file=sys.stderr)  # DEBUG
        # locate augmented version
        ukey = unannotated_key(key)
        try:
            src_doc = augmented[ukey]
        except KeyError:
            # report on stderr, like every other diagnostic emitted here,
            # so the error does not end up mixed into regular output
            print('Cannot find augmented version of {}'.format(str(ukey)),
                  file=sys.stderr)
            raise
        # weave
        new_tgt_doc = _weave_docs(renames, src_doc, tgt_doc, args.gen)
        save_document(output_dir, key, new_tgt_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG
    announce_output_dir(output_dir)
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Corpus keys are grouped into families sharing the same
    `(doc, subdoc)` pair.  For each family, the turns returned by
    `turns_with_final_emoticons` for its 'discourse' stage document
    decide what happens: any warnings are printed, and if there are
    turns to process, `merge_final_emoticons` is applied to a deep copy
    of every document in the family, which is then saved under the
    output directory.  Diffs are printed for the discourse document
    only.

    Raises `KeyError` if a family has no 'discourse' stage document.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # group the keys by (doc, subdoc) and remember which key of each
    # group is the discourse stage
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        families[family].append(key)
        if key.stage == 'discourse':
            discourse_subcorpus[family] = key

    for family in sorted(families):
        members = families[family]
        doc_name, subdoc_name = family
        print(family_banner(doc_name, subdoc_name, members))

        disc_key = discourse_subcorpus[family]
        disc_doc = corpus[disc_key]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_key])

        notes = []
        if warn_turns:
            notes.append("Note: These turns have emoticon-only EDUs that "
                         "I dare not touch because they either "
                         "participate in relations or CDUs: ")
            notes.extend(" " + disc_doc.text(turn.text_span())
                         for turn in warn_turns)
            notes.append("If the "
                         "relations can be removed, or the CDUs reduced, "
                         "please do this by hand and re-run the script:")

        nothing_to_do = not turns
        if nothing_to_do:
            notes.append("Skipping %s (and related); no offending emoticons"
                         % disc_key)

        # printed unconditionally, so an empty list yields a blank line
        print("\n".join(notes))

        if nothing_to_do:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for key in members:
            before = corpus[key]
            after = copy.deepcopy(before)
            merge_final_emoticons(tcache, turn_spans, after, postags[key])
            if key == disc_key:
                # only the discourse stage gets a visible diff
                for turn_span in turn_spans:
                    print(show_diff(before, after, span=turn_span))
                    print()
            save_document(output_dir, key, after)
        # NOTE(review): reset placed once per family, after all of its
        # documents -- confirm against the original layout
        tcache.reset()
    announce_output_dir(output_dir)
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Documents are handled one `(doc, subdoc)` family at a time.  The
    family's 'discourse' stage document is given to
    `turns_with_final_emoticons`; warnings about turns that were left
    alone are printed, and when there are turns to process every
    document of the family is deep-copied, passed through
    `merge_final_emoticons` and saved under the output directory.  A
    diff per turn span is printed for the discourse document.

    Raises `KeyError` if a family has no 'discourse' stage document.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for file_id in corpus:
        fam_id = (file_id.doc, file_id.subdoc)
        families[fam_id].append(file_id)
        if file_id.stage == 'discourse':
            discourse_subcorpus[fam_id] = file_id

    for fam_id in sorted(families):
        siblings = families[fam_id]
        print(family_banner(fam_id[0], fam_id[1], siblings))

        # the discourse stage decides which turns get processed
        pivot = discourse_subcorpus[fam_id]
        pivot_doc = corpus[pivot]
        turns, warn_turns = turns_with_final_emoticons(pivot_doc,
                                                       postags[pivot])

        lines = []
        if warn_turns:
            lines.append("Note: These turns have emoticon-only EDUs that "
                         "I dare not touch because they either "
                         "participate in relations or CDUs: ")
            for skipped in warn_turns:
                lines.append(" " + pivot_doc.text(skipped.text_span()))
            lines.append("If the "
                         "relations can be removed, or the CDUs reduced, "
                         "please do this by hand and re-run the script:")

        has_work = bool(turns)
        if not has_work:
            lines.append(
                "Skipping %s (and related); no offending emoticons" % pivot)

        # always printed; with no lines this emits a blank line
        print("\n".join(lines))

        if has_work:
            turn_spans = [turn.text_span() for turn in turns]
            for file_id in siblings:
                source_doc = corpus[file_id]
                cleaned = copy.deepcopy(source_doc)
                merge_final_emoticons(tcache, turn_spans, cleaned,
                                      postags[file_id])
                if file_id == pivot:
                    for turn_span in turn_spans:
                        print(show_diff(source_doc, cleaned,
                                        span=turn_span))
                        print()
                save_document(output_dir, file_id, cleaned)
            # NOTE(review): reset placed once per family, after all of
            # its documents -- confirm against the original layout
            tcache.reset()
    announce_output_dir(output_dir)
def main(args):
    """
    Entry point for this subcommand.

    Normally reached through the parser set up by `config_argparser`
    rather than being called directly.

    Each corpus document is combined by `_weave_docs` with the augmented
    document stored under its unannotated key, and the result is saved
    under the output directory.  A missing augmented document surfaces
    as a `KeyError`.
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for key, tgt_doc in corpus.items():
        # the augmented corpus is indexed by unannotated keys
        src_doc = augmented[unannotated_key(key)]
        woven = _weave_docs(renames, src_doc, tgt_doc)
        save_document(output_dir, key, woven)

    announce_output_dir(output_dir)