def main(args):
    """
    Entry point of the subcommand.

    There is normally no reason to call this directly: it is meant to be
    reached through the parser set up by `config_argparser`.

    Corpus keys are grouped into families sharing the same (doc, subdoc)
    pair. For each family, the 'discourse' stage document is passed to
    `turns_with_final_emoticons`; if it reports any turns,
    `merge_final_emoticons` is applied to a deep copy of every document
    of the family and each copy is saved to the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # family (doc, subdoc) -> all keys; and -> its 'discourse' stage key
    members = collections.defaultdict(list)
    discourse_key = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        members[family].append(key)
        if key.stage == 'discourse':
            discourse_key[family] = key

    for family in sorted(members):
        doc_name, subdoc_name = family
        siblings = members[family]
        print(family_banner(doc_name, subdoc_name, siblings))

        # the discourse document decides which turns get processed
        disc_k = discourse_key[family]
        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])

        messages = []
        if warn_turns:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            for turn in warn_turns:
                messages.append(" " + disc_doc.text(turn.text_span()))
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")
        nothing_to_do = not turns
        if nothing_to_do:
            messages.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)
        # printed even when empty (yields a blank line)
        print("\n".join(messages))
        if nothing_to_do:
            continue

        spans = [turn.text_span() for turn in turns]
        for key in siblings:
            # work on a copy so the loaded document stays available
            # as the 'before' side of the diff
            updated = copy.deepcopy(corpus[key])
            merge_final_emoticons(tcache, spans, updated, postags[key])
            if key == disc_k:
                for span in spans:
                    print(show_diff(corpus[key], updated, span=span))
                    print()
            save_document(output_dir, key, updated)
        tcache.reset()

    announce_output_dir(output_dir)
def main(args):
    """
    Run the subcommand.

    Callers going through `config_argparser` get this wired up for them
    and should not need to invoke it by hand.

    Documents are handled one (doc, subdoc) family at a time: the
    'discourse' stage document of the family is given to
    `turns_with_final_emoticons`, and when turns are returned, every
    document in the family is deep-copied, passed to
    `merge_final_emoticons` and saved in the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    by_family = collections.defaultdict(list)
    discourse_of = {}
    for ckey in corpus:
        fam_id = (ckey.doc, ckey.subdoc)
        by_family[fam_id].append(ckey)
        if ckey.stage == 'discourse':
            discourse_of[fam_id] = ckey

    for fam_id in sorted(by_family):
        fam_keys = by_family[fam_id]
        print(family_banner(fam_id[0], fam_id[1], fam_keys))
        disc_k = discourse_of[fam_id]
        reference = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(reference,
                                                       postags[disc_k])

        print("\n".join(_final_emoticon_messages(reference, disc_k,
                                                 turns, warn_turns)))
        if not turns:
            continue

        turn_spans = [t.text_span() for t in turns]
        for ckey in fam_keys:
            revised = copy.deepcopy(corpus[ckey])
            ctags = postags[ckey]
            merge_final_emoticons(tcache, turn_spans, revised, ctags)
            # diffs are only shown for the discourse stage document
            if ckey == disc_k:
                for one_span in turn_spans:
                    print(show_diff(corpus[ckey], revised, span=one_span))
                    print()
            save_document(output_dir, ckey, revised)
        tcache.reset()
    announce_output_dir(output_dir)


def _final_emoticon_messages(doc, disc_k, turns, warn_turns):
    """
    Build the lines reported for one family before any merging is done:
    a note listing the text of each turn in `warn_turns`, and a
    "Skipping" line when `turns` is empty. May return an empty list.
    """
    lines = []
    if warn_turns:
        lines.append("Note: These turns have emoticon-only EDUs that "
                     "I dare not touch because they either "
                     "participate in relations or CDUs: ")
        lines.extend(" " + doc.text(wturn.text_span())
                     for wturn in warn_turns)
        lines.append("If the "
                     "relations can be removed, or the CDUs reduced, "
                     "please do this by hand and re-run the script:")
    if not turns:
        lines.append(
            "Skipping %s (and related); no offending emoticons" % disc_k)
    return lines
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every key of the corpus read from `args`: keep a deep copy of
    the document as the 'before' state, call `_split_dialogue` on the
    document with `args.turn`, print a mini diff to stderr and save the
    document to the output directory. Unless `args.no_commit_msg` is
    set, a commit message is printed for the last document processed.
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    # Must be bound before the loop: with an empty corpus the loop body
    # never runs and the check after the loop would otherwise raise
    # UnboundLocalError.
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # snapshot taken before the split; presumably `_split_dialogue`
        # modifies `new_doc` in place -- TODO confirm
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        # NOTE(review): printing encoded bytes looks like a Python 2
        # idiom; under Python 3 this shows the bytes repr -- confirm
        # the target interpreter before changing it
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is used)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
def main(args):
    """
    Entry point of the subcommand.

    It is normally reached through the parser configured by
    `config_argparser`, so calling it by hand should not be necessary.

    Each document of the corpus is deep-copied, the copy is handed to
    `_merge_edus` together with `args.span`, a mini diff between the
    original and the copy is printed to stderr, and the copy is saved
    to the output directory. A commit message for the last document is
    printed at the end unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # remembers the last processed document; None if the corpus is empty
    commit_info = None
    for key in corpus:
        before = corpus[key]
        after = copy.deepcopy(before)
        _merge_edus(tcache, args.span, after)

        report = "\n".join(_mini_diff(key, before, after, args.span))
        print(report.encode('utf-8'), file=sys.stderr)

        save_document(output_dir, key, after)
        commit_info = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=before,
                                 after=after,
                                 span=args.span)

    announce_output_dir(output_dir)

    if not commit_info or args.no_commit_msg:
        return
    print("-----8<------")
    print(commit_msg(commit_info))
def fix_dialogue_boundaries(dir_ling, dir_situ, doc, seg_path=None):
    """Fix dialogue boundaries in a woven game.

    Dialogue boundaries are adjusted in the woven version, so they are
    tighter around the dialogues that existed in the annotated version.

    The fixed documents are saved back into `dir_situ`.

    Parameters
    ----------
    dir_ling: filepath
        Path to the folder of the original version of the game.
    dir_situ: filepath
        Path to the folder of the woven version of the game.
    doc: string
        Name of the game.
    seg_path: optional
        Currently not used by this function.
    """
    def is_interesting(k):
        """Select files for this game whose annotator is GOLD or unset."""
        return (k.doc == doc
                and (k.annotator == 'GOLD' or k.annotator is None))

    # locate and load files of the original version
    dir_ling = os.path.abspath(dir_ling)
    reader_ling = Reader(dir_ling)
    files_ling = reader_ling.filter(reader_ling.files(), is_interesting)
    corpus_ling = reader_ling.slurp(cfiles=files_ling, verbose=True)

    # locate and load files of the woven version
    dir_situ = os.path.abspath(dir_situ)
    reader_situ = Reader(dir_situ)
    files_situ = reader_situ.filter(reader_situ.files(), is_interesting)
    corpus_situ = reader_situ.slurp(cfiles=files_situ, verbose=True)

    # need a TimestampCache to generate unit_id for new dialogues
    tcache = TimestampCache()
    for key, doc_situ in sorted(corpus_situ.items()):
        # the same key is expected in the original corpus
        # (KeyError otherwise)
        doc_ling = corpus_ling[key]
        print(key)
        doc_situ_fixed = _fix_dialogue_boundaries(tcache, doc_ling, doc_situ)
        # DEBUG
        dlgs = sorted((x for x in doc_situ_fixed.units if is_dialogue(x)),
                      key=lambda x: x.span)
        # build a real list: printing a bare zip() shows only an opaque
        # iterator object under Python 3
        dlg_bounds = [(x.span.char_start, x.span.char_end) for x in dlgs]
        print(dlg_bounds)
        # end DEBUG
        save_document(dir_situ, key, doc_situ_fixed)