Пример #1
0
def main(args):
    """
    Entry point of the subcommand.

    Normally invoked for you when the subcommand was set up through
    `config_argparser`; there should be no need to call it directly.

    Corpus keys are grouped into families sharing the same
    ``(doc, subdoc)`` pair.  For each family, the 'discourse' stage
    document is passed to `turns_with_final_emoticons`; if it reports
    any turns, every document of the family is deep-copied, run through
    `merge_final_emoticons` and saved to the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # family -> all its keys, and family -> its 'discourse' stage key
    members = collections.defaultdict(list)
    discourse_keys = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        members[family].append(key)
        if key.stage == 'discourse':
            discourse_keys[family] = key

    for family in sorted(members):
        family_keys = members[family]
        doc_name, subdoc_name = family
        print(family_banner(doc_name, subdoc_name, family_keys))

        # the discourse document decides which turns are affected
        disc_key = discourse_keys[family]
        disc_doc = corpus[disc_key]
        turns, untouched = turns_with_final_emoticons(disc_doc,
                                                      postags[disc_key])

        messages = []
        if untouched:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            for turn in untouched:
                messages.append(" " + disc_doc.text(turn.text_span()))
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        if not turns:
            # nothing to rewrite: report and move on to the next family
            messages.append("Skipping %s (and related); no offending emoticons"
                            % disc_key)
            print("\n".join(messages))
            continue
        print("\n".join(messages))

        spans = [turn.text_span() for turn in turns]
        for key in family_keys:
            original = corpus[key]
            # work on a copy so the original stays available for the diff
            updated = copy.deepcopy(original)
            merge_final_emoticons(tcache, spans, updated, postags[key])
            if key == disc_key:
                for span in spans:
                    print(show_diff(original, updated, span=span))
                    print()
            save_document(output_dir, key, updated)
        tcache.reset()
    announce_output_dir(output_dir)
Пример #2
0
def main(args):
    """
    Run the subcommand.

    There is usually no reason to call this by hand: it is what gets
    dispatched to when the subcommand is configured via
    `config_argparser`.

    Documents are handled one ``(doc, subdoc)`` family at a time.  The
    family's 'discourse' stage document is given to
    `turns_with_final_emoticons`; when that yields turns, each document
    in the family is copied, passed to `merge_final_emoticons` and
    written to the output directory.
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # index the corpus keys by family, remembering the discourse key
    by_family = collections.defaultdict(list)
    discourse_of = {}
    for ckey in corpus:
        group = (ckey.doc, ckey.subdoc)
        by_family[group].append(ckey)
        if ckey.stage == 'discourse':
            discourse_of[group] = ckey

    for group in sorted(by_family):
        group_keys = by_family[group]
        print(family_banner(group[0], group[1], group_keys))

        ref_key = discourse_of[group]
        ref_doc = corpus[ref_key]
        turns, skipped = turns_with_final_emoticons(ref_doc,
                                                    postags[ref_key])
        nothing_to_do = not turns

        report = []
        if skipped:
            report.append("Note: These turns have emoticon-only EDUs that "
                          "I dare not touch because they either "
                          "participate in relations or CDUs: ")
            report += [" " + ref_doc.text(t.text_span()) for t in skipped]
            report.append("If the "
                          "relations can be removed, or the CDUs reduced, "
                          "please do this by hand and re-run the script:")
        if nothing_to_do:
            report.append(
                "Skipping %s (and related); no offending emoticons" % ref_key)
        # printed even when empty, which yields a blank line
        print("\n".join(report))

        if nothing_to_do:
            continue

        affected = [t.text_span() for t in turns]
        for ckey in group_keys:
            before = corpus[ckey]
            # the corpus entry itself is left unmodified
            after = copy.deepcopy(before)
            merge_final_emoticons(tcache, affected, after, postags[ckey])
            if ckey == ref_key:
                # only the discourse document gets a visible diff
                for one_span in affected:
                    print(show_diff(before, after, span=one_span))
                    print()
            save_document(output_dir, ckey, after)
        tcache.reset()
    announce_output_dir(output_dir)
Пример #3
0
def main(args):
    """
    Subcommand main.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    For every document in the corpus, a pristine deep copy is kept,
    `_split_dialogue` is applied to the document at `args.turn`, a short
    diff is printed to stderr and the document is saved to the output
    directory.  Unless `args.no_commit_msg` is set, a suggested commit
    message is printed for the last document processed.
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # Must be bound before the loop: with an empty corpus the loop body
    # never runs and the check below would otherwise hit an unbound local.
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # snapshot taken before _split_dialogue is applied to new_doc
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is kept)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
Пример #4
0
def main(args):
    """
    Entry point of the subcommand.

    Called on your behalf when the subcommand is registered through
    `config_argparser`, so you should not need to invoke it directly.

    Each corpus document is deep-copied, `_merge_edus` is applied to the
    copy for `args.span`, a short diff is written to stderr and the copy
    is saved to the output directory.  A suggested commit message for the
    last processed document is printed unless `args.no_commit_msg` is set.
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    last_commit = None
    for key in corpus:
        before = corpus[key]
        after = copy.deepcopy(before)
        _merge_edus(tcache, args.span, after)

        diff_text = "\n".join(_mini_diff(key, before, after, args.span))
        print(diff_text.encode('utf-8'), file=sys.stderr)

        save_document(output_dir, key, after)
        # remembered so a commit message can be generated afterwards
        last_commit = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=before,
                                 after=after,
                                 span=args.span)

    announce_output_dir(output_dir)
    if not last_commit or args.no_commit_msg:
        return
    print("-----8<------")
    print(commit_msg(last_commit))
Пример #5
0
def fix_dialogue_boundaries(dir_ling, dir_situ, doc, seg_path=None):
    """Fix dialogue boundaries in a woven game.

    Dialogue boundaries are adjusted in the woven version, so they
    are tighter around the dialogues that existed in the annotated
    version.

    The fixed documents are saved back into `dir_situ`.

    Parameters
    ----------
    dir_ling: filepath
        Path to the folder of the original version of the game.
    dir_situ: filepath
        Path to the folder of the woven version of the game.
    doc: string
        Name of the game.
    seg_path: optional
        Currently unused by this function; kept so existing callers
        that pass it keep working.
    """
    def is_interesting(k):
        """Select files for this game only, annotator GOLD or unset."""
        return (k.doc == doc
                and (k.annotator == 'GOLD'
                     or k.annotator is None))

    # locate files
    dir_ling = os.path.abspath(dir_ling)
    reader_ling = Reader(dir_ling)
    files_ling = reader_ling.filter(reader_ling.files(), is_interesting)
    corpus_ling = reader_ling.slurp(cfiles=files_ling, verbose=True)

    dir_situ = os.path.abspath(dir_situ)
    reader_situ = Reader(dir_situ)
    files_situ = reader_situ.filter(reader_situ.files(), is_interesting)
    corpus_situ = reader_situ.slurp(cfiles=files_situ, verbose=True)
    # need a TimestampCache to generate unit_id for new dialogues
    tcache = TimestampCache()

    for key, doc_situ in sorted(corpus_situ.items()):
        # the same key is looked up in the original corpus
        doc_ling = corpus_ling[key]
        print(key)
        doc_situ_fixed = _fix_dialogue_boundaries(tcache, doc_ling, doc_situ)
        # DEBUG
        dlgs = sorted((x for x in doc_situ_fixed.units if is_dialogue(x)),
                      key=lambda x: x.span)
        # build an explicit list of (start, end) pairs: a bare zip() is a
        # lazy iterator on Python 3 and would print as "<zip object ...>"
        print([(x.span.char_start, x.span.char_end) for x in dlgs])
        # end DEBUG
        save_document(dir_situ, key, doc_situ_fixed)