示例#1
0
def main(args):
    """
    Subcommand main.

    Runs `_split_dialogue` (at turn `args.turn`) on every document of
    the corpus read from `args`, prints a mini diff of each change to
    stderr and saves the result in the output directory.  Unless
    `args.no_commit_msg` is set, a commit message built from the last
    processed document is printed at the end.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # must be bound even if the loop body never runs (empty corpus),
    # otherwise the check after the loop raises NameError
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # keep an untouched copy for the diff and the commit message;
        # new_doc is the one handed to _split_dialogue and saved
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is used)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
示例#2
0
文件: move.py 项目: irit-melodi/educe
def main(args):
    """
    Subcommand main.

    Moves the text covered by `args.span` out of every source document
    and into the document with the same key in subdoc `args.target`.
    Only spans starting at offset 0 or ending at the end of the source
    text are supported; anything else raises ValueError, as does a
    missing target document.  Diffs are printed to stderr and both
    modified documents are saved in the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)

    renames = compute_renames(tgt_corpus, src_corpus)

    for src_k, src_doc in src_corpus.items():
        # the target key is the source key with the subdoc swapped
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        if tgt_k not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_k)
        tgt_doc = tgt_corpus[tgt_k]

        # reject anything that is neither a prefix nor a suffix of the
        # source text before doing any work
        moving_up = span_start == 0
        if not moving_up and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # prefix of the source goes to the end of the target
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                span_end,  # src_split
                tgt_split=-1)
        else:
            # suffix of the source goes to the start of the target:
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:], so the
            # suffix is first detached into a temporary document which
            # is then moved as a whole
            new_src_doc, detached = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between the two halves went with the
            # detached part, so give the remaining source a new one
            padded_text = new_src_doc.text() + ' '
            evil_set_text(new_src_doc, padded_text)

        # diff for the suggested commit message
        report = "\n".join([
            "======= TO %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_k,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)
        # dump the modified documents
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
示例#3
0
def main(args):
    """
    Subcommand main.

    Applies `_merge_edus` over `args.span` to a copy of every document
    in the corpus, prints a mini diff to stderr and saves the copy in
    the output directory.  Unless `args.no_commit_msg` is set, a commit
    message built from the last processed document is printed.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)
    target_span = args.span

    commit_info = None
    for key, before in corpus.items():
        # work on a copy so the original stays available for diffing
        after = copy.deepcopy(before)
        _merge_edus(tcache, target_span, after)
        diff_text = "\n".join(_mini_diff(key, before, after, target_span))
        print(diff_text.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, after)
        # remembered for commit message generation
        commit_info = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=before,
                                 after=after,
                                 span=target_span)
    announce_output_dir(output_dir)

    wants_message = commit_info and not args.no_commit_msg
    if wants_message:
        print("-----8<------")
        print(commit_msg(commit_info))
示例#4
0
def main(args):
    """
    Subcommand main.

    For every document of the corpus (in sorted order), looks up its
    augmented counterpart under `unannotated_key(key)`, combines the
    two with `_weave_docs` and saves the result in the output
    directory.  A missing augmented document is reported on stdout and
    the KeyError is re-raised.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    # walk through the annotated versions
    for key, annotated_doc in sorted(corpus.items()):
        start_banner = '<== weaving {} ==>'.format(key)
        print(start_banner, file=sys.stderr)  # DEBUG
        # find the augmented version this document is woven with
        ukey = unannotated_key(key)
        try:
            augmented_doc = augmented[ukey]
        except KeyError:
            complaint = 'Cannot find augmented version of {}'.format(
                str(ukey))
            print(complaint)
            raise
        woven_doc = _weave_docs(renames, augmented_doc, annotated_doc)
        save_document(output_dir, key, woven_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG

    announce_output_dir(output_dir)
示例#5
0
def main(args):
    """
    Subcommand main.

    Runs `_split_dialogue` (at turn `args.turn`) on every document of
    the corpus read from `args`, prints a mini diff of each change to
    stderr and saves the result in the output directory.  Unless
    `args.no_commit_msg` is set, a commit message built from the last
    processed document is printed at the end.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, verbose=True)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # bound up front so that an empty corpus does not lead to a
    # NameError when commit_info is tested after the loop
    commit_info = None
    for key in corpus:
        print(key)
        new_doc = corpus[key]
        # untouched copy, used as the "before" side of the diff
        old_doc = copy.deepcopy(new_doc)
        span = _split_dialogue(tcache, new_doc, args.turn)
        diffs = _mini_diff(key, args, old_doc, new_doc, span)
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, new_doc)
        # for commit message generation (only the last one is used)
        commit_info = CommitInfo(key=key,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span,
                                 tid=args.turn)
    announce_output_dir(output_dir)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
示例#6
0
def main(args):
    """
    Subcommand main.

    Applies `_split_edu` with `args.spans` to a copy of every document
    in the corpus, prints a mini diff (over the merge of all spans) to
    stderr and saves the copy in the output directory.  Unless
    `args.no_commit_msg` is set, a commit message built from the last
    processed document is printed before the output directory is
    announced.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    commit_info = None
    for key, before in corpus.items():
        after = copy.deepcopy(before)
        # one span covering all requested spans, for display purposes
        covering_span = Span.merge_all(args.spans)
        _split_edu(tcache, key, after, args.spans)
        diff_lines = _mini_diff(key, before, after, covering_span)
        diff_text = "\n".join(diff_lines)
        print(diff_text.encode('utf-8'), file=sys.stderr)
        save_document(output_dir, key, after)
        # remembered for commit message generation
        commit_info = CommitInfo(key=key,
                                 annotator=args.annotator,
                                 before=before,
                                 after=after,
                                 span=covering_span)

    wants_message = commit_info and not args.no_commit_msg
    if wants_message:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
示例#7
0
def main(args):
    """
    Subcommand main.

    For every document of the corpus (in sorted order), looks up its
    augmented counterpart under `unannotated_key(key)`, combines the
    two with `_weave_docs` (passing `args.gen` along) and saves the
    result in the output directory.  A missing augmented document is
    reported on stdout and the KeyError is re-raised.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    # walk through the annotated versions
    for key, annotated_doc in sorted(corpus.items()):
        start_banner = '<== weaving {} ==>'.format(key)
        print(start_banner, file=sys.stderr)  # DEBUG
        # find the augmented version this document is woven with
        ukey = unannotated_key(key)
        try:
            augmented_doc = augmented[ukey]
        except KeyError:
            complaint = 'Cannot find augmented version of {}'.format(
                str(ukey))
            print(complaint)
            raise
        woven_doc = _weave_docs(renames, augmented_doc, annotated_doc,
                                args.gen)
        save_document(output_dir, key, woven_doc)
        print('<== done ==>', file=sys.stderr)  # DEBUG

    announce_output_dir(output_dir)
示例#8
0
文件: rename.py 项目: tjane/educe
def main(args):
    """
    Subcommand main.

    Validates the `--stage` / `--annotator` combination (exiting on an
    inconsistent one), then calls `_rename_in_doc` with `args.source`
    and the target computed by `_get_target` on every document of the
    corpus, saving each document in the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    if args.stage:
        is_unannotated = args.stage == 'unannotated'
        if not is_unannotated and not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if is_unannotated and args.annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    old_id = args.source
    new_id = _get_target(args, old_id, corpus)

    for key in corpus:
        print(key)
        document = corpus[key]
        _rename_in_doc(old_id, new_id, document)
        save_document(output_dir, key, document)

    summary = "Renamed from %s to %s" % (anno_id_from_tuple(old_id),
                                         anno_id_from_tuple(new_id))
    print(summary, file=sys.stderr)
    announce_output_dir(output_dir)
示例#9
0
def main(args):
    """
    Subcommand main.

    Groups the corpus into (doc, subdoc) families.  For each family the
    discourse-stage document is passed to `turns_with_final_emoticons`;
    if that yields any turns, `merge_final_emoticons` is applied to a
    copy of every document of the family and the copies are saved in
    the output directory.  Warnings and diffs are printed to stdout.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    # family -> all keys of the family / its discourse-stage key
    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for key in corpus:
        family = (key.doc, key.subdoc)
        families[family].append(key)
        if key.stage == 'discourse':
            discourse_subcorpus[family] = key

    for family in sorted(families):
        members = families[family]
        print(family_banner(family[0], family[1], members))
        disc_k = discourse_subcorpus[family]

        disc_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(disc_doc,
                                                       postags[disc_k])

        messages = []
        if warn_turns:
            messages.append("Note: These turns have emoticon-only EDUs that "
                            "I dare not touch because they either "
                            "participate in relations or CDUs: ")
            for turn in warn_turns:
                messages.append(" " + disc_doc.text(turn.text_span()))
            messages.append("If the "
                            "relations can be removed, or the CDUs reduced, "
                            "please do this by hand and re-run the script:")

        nothing_to_merge = not turns
        if nothing_to_merge:
            messages.append("Skipping %s (and related); no offending emoticons"
                            % disc_k)

        # printed unconditionally, so an empty list still emits a newline
        print("\n".join(messages))

        if nothing_to_merge:
            continue

        turn_spans = [turn.text_span() for turn in turns]
        for key in members:
            merged_doc = copy.deepcopy(corpus[key])
            merge_final_emoticons(tcache, turn_spans, merged_doc,
                                  postags[key])
            # only show diffs for the discourse-stage document
            if key == disc_k:
                for turn_span in turn_spans:
                    print(show_diff(corpus[key], merged_doc, span=turn_span))
                    print()
            save_document(output_dir, key, merged_doc)
        tcache.reset()
    announce_output_dir(output_dir)
示例#10
0
def main(args):
    """
    Subcommand main.

    Partitions the corpus by (doc, subdoc).  Within each partition the
    discourse-stage document determines, via
    `turns_with_final_emoticons`, which turns are processed; when there
    are some, every document of the partition is deep-copied, run
    through `merge_final_emoticons` and saved in the output directory.
    Warnings and diffs are written to stdout.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus_with_unannotated(args)
    postags = educe.stac.postag.read_tags(corpus, args.corpus)
    tcache = TimestampCache()
    output_dir = get_output_dir(args, default_overwrite=True)

    families = collections.defaultdict(list)
    discourse_subcorpus = {}
    for file_id in corpus:
        fam_id = (file_id.doc, file_id.subdoc)
        families[fam_id].append(file_id)
        if file_id.stage == 'discourse':
            discourse_subcorpus[fam_id] = file_id

    for fam_id in sorted(families):
        fam_keys = families[fam_id]
        doc_name, subdoc_name = fam_id[0], fam_id[1]
        print(family_banner(doc_name, subdoc_name, fam_keys))
        disc_k = discourse_subcorpus[fam_id]

        reference_doc = corpus[disc_k]
        turns, warn_turns = turns_with_final_emoticons(reference_doc,
                                                       postags[disc_k])

        notes = []
        if warn_turns:
            notes.append("Note: These turns have emoticon-only EDUs that "
                         "I dare not touch because they either "
                         "participate in relations or CDUs: ")
            notes.extend(" " + reference_doc.text(t.text_span())
                         for t in warn_turns)
            notes.append("If the "
                         "relations can be removed, or the CDUs reduced, "
                         "please do this by hand and re-run the script:")

        if not turns:
            notes.append(
                "Skipping %s (and related); no offending emoticons" % disc_k)

        # always printed; with no notes this is just an empty line
        print("\n".join(notes))

        if not turns:
            continue

        turn_spans = [t.text_span() for t in turns]
        for file_id in fam_keys:
            original = corpus[file_id]
            patched = copy.deepcopy(original)
            merge_final_emoticons(tcache, turn_spans, patched,
                                  postags[file_id])
            # diffs are displayed for the discourse-stage document only
            is_reference = file_id == disc_k
            if is_reference:
                for turn_span in turn_spans:
                    print(show_diff(corpus[file_id], patched,
                                    span=turn_span))
                    print()
            save_document(output_dir, file_id, patched)
        tcache.reset()
    announce_output_dir(output_dir)
示例#11
0
def main(args):
    """
    Subcommand main.

    Moves the text covered by `args.span` out of every source document
    and into the document with the same key in subdoc `args.target`.
    Only spans starting at offset 0 or ending at the end of the source
    text span are supported; the program exits otherwise, and also when
    the target document is missing.  Diffs are printed to stderr and
    both modified documents are saved in the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start = args.span.char_start
    span_end = args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)

    renames = compute_renames(tgt_corpus, src_corpus)
    for src_k in src_corpus:
        # the target key is the source key with the subdoc swapped
        tgt_k = copy.copy(src_k)
        tgt_k.subdoc = args.target
        print(src_k, tgt_k, file=sys.stderr)
        # sys.exit raises, so no else branch is needed after this guard
        if tgt_k not in tgt_corpus:
            sys.exit("Uh-oh! we don't have %s in the corpus" % tgt_k)

        src_doc = src_corpus[src_k]
        tgt_doc = tgt_corpus[tgt_k]
        if span_start == 0:
            # prefix of the source goes to the end of the target
            moved = move_portion(renames, src_doc, tgt_doc,
                                 src_split=span_end,
                                 tgt_split=-1)
            new_src_doc, new_tgt_doc = moved
        elif span_end == src_doc.text_span().char_end:
            # suffix of the source: detach it, then move it as a whole
            # to the start of the target
            new_src_doc, detached = split_doc(src_doc, span_start)
            moved = move_portion(renames, detached, tgt_doc,
                                 src_split=-1,
                                 tgt_split=0)
            _, new_tgt_doc = moved
        else:
            sys.exit("Sorry, can only move to the start or to the "
                     "end of a document at the moment")

        report = "\n".join([
            "======= TO %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_k,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)
        save_document(output_dir, src_k, new_src_doc)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
def fix_dialogue_boundaries(dir_ling, dir_situ, doc, seg_path=None):
    """Fix dialogue boundaries in a woven game.

    Dialogue boundaries are adjusted in the woven version, so they
    are tighter around the dialogues that existed in the annotated
    version.

    The fixed documents are saved back into `dir_situ`.

    Parameters
    ----------
    dir_ling: filepath
        Path to the folder of the original version of the game.
    dir_situ: filepath
        Path to the folder of the woven version of the game.
    doc: string
        Name of the game.
    seg_path: TODO
        Currently not used by this function.

    Raises
    ------
    KeyError
        If a document of the woven version has no counterpart with the
        same key in the original version.
    """
    def is_interesting(k):
        """Select files of this game whose annotator is GOLD or unset."""
        return (k.doc == doc
                and (k.annotator == 'GOLD'
                     or k.annotator is None))

    # locate files
    dir_ling = os.path.abspath(dir_ling)
    reader_ling = Reader(dir_ling)
    files_ling = reader_ling.filter(reader_ling.files(), is_interesting)
    corpus_ling = reader_ling.slurp(cfiles=files_ling, verbose=True)

    dir_situ = os.path.abspath(dir_situ)
    reader_situ = Reader(dir_situ)
    files_situ = reader_situ.filter(reader_situ.files(), is_interesting)
    corpus_situ = reader_situ.slurp(cfiles=files_situ, verbose=True)
    # need a TimestampCache to generate unit_id for new dialogues
    tcache = TimestampCache()

    for key, doc_situ in sorted(corpus_situ.items()):
        doc_ling = corpus_ling[key]
        print(key)
        doc_situ_fixed = _fix_dialogue_boundaries(tcache, doc_ling, doc_situ)
        # DEBUG
        dlgs = sorted((x for x in doc_situ_fixed.units if is_dialogue(x)),
                      key=lambda x: x.span)
        dlg_beg = [x.span.char_start for x in dlgs]
        dlg_end = [x.span.char_end for x in dlgs]
        # zip() is lazy on Python 3: materialise it so the boundaries
        # themselves are printed rather than "<zip object at ...>"
        print(list(zip(dlg_beg, dlg_end)))
        # end DEBUG
        save_document(dir_situ, key, doc_situ_fixed)
示例#13
0
文件: weave.py 项目: tjane/educe
def main(args):
    """
    Subcommand main.

    Combines every document of the corpus with its augmented
    counterpart (found under `unannotated_key(key)`) using
    `_weave_docs`, and saves the result in the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for key in corpus:
        augmented_doc = augmented[unannotated_key(key)]
        annotated_doc = corpus[key]
        woven_doc = _weave_docs(renames, augmented_doc, annotated_doc)
        save_document(output_dir, key, woven_doc)

    announce_output_dir(output_dir)
示例#14
0
文件: weave.py 项目: kowey/educe
def main(args):
    """
    Subcommand main.

    Each document of the corpus is woven (`_weave_docs`) with the
    augmented document stored under `unannotated_key(key)`; the woven
    document is saved in the output directory under the original key.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args)
    augmented = read_augmented_corpus(args)
    corpus = read_corpus_with_unannotated(args)
    renames = compute_renames(corpus, augmented)

    for tgt_key in corpus:
        src_key = unannotated_key(tgt_key)
        src_doc = augmented[src_key]
        tgt_doc = corpus[tgt_key]
        save_document(output_dir, tgt_key,
                      _weave_docs(renames, src_doc, tgt_doc))

    announce_output_dir(output_dir)
示例#15
0
def main(args):
    """Subcommand main.

    Replaces the text at `args.span` with `args.sub_text` in every
    requested document of the corpus at `args.corpus`, prints a diff to
    stderr and saves the new documents in the output directory.  Unless
    `args.no_commit_msg` is set, a commit message based on the first
    document is printed; nothing is printed for an empty selection.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(tgt_doc,
                                           span,
                                           sub_text,
                                           minor=minor)
        # WIP new_span, depends on the offset
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = [
            "======= REPLACE TEXT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
    # commit message, built from the first document only; with no
    # document at all there is nothing to index into, so stop here
    # instead of failing with IndexError
    if not annos_before:
        return
    first_k = next(iter(tgt_corpus))
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, anno_str_before, anno_str_after))
示例#16
0
def main(args):
    """
    Subcommand main.

    Re-saves every document of the corpus in the output directory.
    When `args.diff_friendly` is set, the units, relations and schemas
    of each document are first passed through `_diff_friendly`.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        if args.diff_friendly:
            # same treatment for each annotation collection, in turn
            for attr in ('units', 'relations', 'schemas'):
                annos = getattr(document, attr)
                setattr(document, attr, _diff_friendly(annos))
        save_document(output_dir, key, document)
    announce_output_dir(output_dir)
示例#17
0
def main(args):
    """
    Subcommand main.

    Writes every document of the corpus to the output directory,
    replacing its units, relations and schemas by their
    `_diff_friendly` versions when `args.diff_friendly` is set.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)
    for file_id in corpus:
        current = corpus[file_id]
        if not args.diff_friendly:
            save_document(output_dir, file_id, current)
            continue
        current.units = _diff_friendly(current.units)
        current.relations = _diff_friendly(current.relations)
        current.schemas = _diff_friendly(current.schemas)
        save_document(output_dir, file_id, current)
    announce_output_dir(output_dir)
示例#18
0
def main(args):
    """
    Subcommand main.

    For every EDU in the 'units' stage documents of the corpus, maps
    each of its types through `RENAMES`; when the resulting set of
    types differs, the EDU type is rewritten as the sorted new types
    joined by "/".  All documents are saved in the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """

    corpus = read_corpus(args, preselected={"stage": ["units"]})
    output_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        # collect the EDUs up front, before any type gets rewritten
        edus = [unit for unit in document.units if educe.stac.is_edu(unit)]
        for edu in edus:
            old_types = frozenset(educe.stac.split_type(edu))
            new_types = frozenset(RENAMES.get(name, name)
                                  for name in old_types)
            if old_types == new_types:
                continue
            edu.type = "/".join(sorted(new_types))
        save_document(output_dir, key, document)
    announce_output_dir(output_dir)
示例#19
0
def main(args):
    """Subcommand main.

    Replaces the text at `args.span` with `args.sub_text` in every
    requested document of the corpus at `args.corpus`, prints a diff to
    stderr and saves the new documents in the output directory.  Unless
    `args.no_commit_msg` is set, a commit message based on the first
    document is printed; nothing is printed for an empty selection.

    You shouldn't need to call this yourself if you're using
    `config_argparser`.
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    # locate insertion site: target document
    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    # TODO mark units with FIXME, optionally delete in/out relations
    span = args.span
    sub_text = args.sub_text
    minor = args.minor
    # store before/after
    annos_before = []
    annos_after = []
    for tgt_k, tgt_doc in tgt_corpus.items():
        annos_before.append(annotate_doc(tgt_doc, span=span))
        # process
        new_tgt_doc = replace_text_at_span(
            tgt_doc, span, sub_text, minor=minor)
        # WIP new_span, depends on the offset
        offset = len(sub_text) - (span.char_end - span.char_start)
        new_span = Span(span.char_start, span.char_end + offset)
        # end WIP
        annos_after.append(annotate_doc(new_tgt_doc, span=new_span))
        # show diff and save doc
        diffs = ["======= REPLACE TEXT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)
    announce_output_dir(output_dir)
    # commit message, built from the first document only; bail out on
    # an empty selection rather than raising IndexError below
    if not annos_before:
        return
    first_k = next(iter(tgt_corpus))
    anno_str_before = annos_before[0]
    anno_str_after = annos_after[0]
    if first_k and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(first_k, anno_str_before, anno_str_after))
示例#20
0
def main(args):
    """
    Subcommand main.

    Removes every schema without members from the 'discourse' and
    'units' stage documents of the corpus and saves the documents in
    the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    corpus = read_corpus(args, preselected={'stage': ['discourse', 'units']})
    output_dir = get_output_dir(args, default_overwrite=True)
    for key in corpus:
        document = corpus[key]
        # gather first, remove afterwards: the collection must not
        # change while it is being iterated
        empty_schemas = [schema for schema in document.schemas
                         if not schema.members]
        for schema in empty_schemas:
            document.schemas.remove(schema)
        save_document(output_dir, key, document)
    announce_output_dir(output_dir)
示例#21
0
def main(args):
    """
    Subcommand main.

    Rewrites EDU types in the 'units' stage documents of the corpus:
    each type of an EDU is looked up in `RENAMES` (unknown types are
    kept as they are) and, if anything changed, the EDU type becomes
    the sorted new types joined by "/".  Every document is saved in
    the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """

    selection = {"stage": ["units"]}
    corpus = read_corpus(args, preselected=selection)
    output_dir = get_output_dir(args, default_overwrite=True)
    for file_id in corpus:
        current = corpus[file_id]
        # materialise the EDU list before touching any type
        edus = [anno for anno in current.units if educe.stac.is_edu(anno)]
        for edu in edus:
            before = frozenset(educe.stac.split_type(edu))
            after = frozenset(RENAMES.get(label, label) for label in before)
            if before != after:
                renamed = sorted(after)
                edu.type = "/".join(renamed)
        save_document(output_dir, file_id, current)
    announce_output_dir(output_dir)
示例#22
0
def main(args):
    """
    Subcommand main.

    Drops schemas that have no members from each 'discourse' and
    'units' stage document of the corpus, then saves the document in
    the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    selection = {'stage': ['discourse', 'units']}
    corpus = read_corpus(args,
                         preselected=selection)
    output_dir = get_output_dir(args, default_overwrite=True)
    for file_id in corpus:
        current = corpus[file_id]
        # two passes, so that removal never happens during iteration
        doomed = [sch for sch in current.schemas if not sch.members]
        for sch in doomed:
            current.schemas.remove(sch)
        save_document(output_dir, file_id, current)
    announce_output_dir(output_dir)
示例#23
0
def main(args):
    """
    Subcommand main.

    Reads the single document found in `args.insert` and inserts it,
    via `move_portion`, at offset `args.start` of every requested
    document of the corpus at `args.corpus`.  A diff is printed to
    stderr for each target and the result is saved in the output
    directory.  Exits if the insert directory does not hold exactly one
    document.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # dict.values() is a view on Python 3 and cannot be indexed; taking
    # the first element through an iterator works on both 2 and 3
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # src_split=-1: the source document is passed with no explicit
        # split point; what remains of it is not needed here
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = [
            "======= INSERT IN %s   ========" % tgt_k,
            show_diff(tgt_doc, new_tgt_doc)
        ]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
示例#24
0
文件: nudge.py 项目: eipiplusun/educe
def main(args):
    """
    Subcommand main.

    In a copy of every document of the corpus, moves each unit whose
    span equals `args.span` to the span shifted by `args.nudge_start`
    and `args.nudge_end`.  A mini diff (or a warning when no unit
    matched) is printed to stderr and the copy is saved in the output
    directory.  Unless `args.no_commit_msg` is set, a commit message
    built from the last processed document is printed.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    # must be bound even if the loop body never runs (empty corpus),
    # otherwise the check after the loop raises NameError
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                # each unit gets its own span object
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        # the copy is saved whether or not anything was nudged
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
示例#25
0
文件: nudge.py 项目: moreymat/educe
def main(args):
    """
    Subcommand main.

    In a copy of every document of the corpus, moves each unit whose
    span equals `args.span` to the span shifted by `args.nudge_start`
    and `args.nudge_end`.  A mini diff (or a warning when no unit
    matched) is printed to stderr and the copy is saved in the output
    directory.  Unless `args.no_commit_msg` is set, a commit message
    built from the last processed document is printed.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    _screen_args(args)
    corpus = read_corpus(args, verbose=True)
    output_dir = get_output_dir(args, default_overwrite=True)

    old_span = args.span
    new_span = Span(old_span.char_start + args.nudge_start,
                    old_span.char_end + args.nudge_end)
    # bound up front so that an empty corpus does not lead to a
    # NameError when commit_info is tested after the loop
    commit_info = None
    for k in corpus:
        old_doc = corpus[k]
        new_doc = copy.deepcopy(old_doc)
        found = False
        for anno in new_doc.units:
            if anno.span == old_span:
                # each unit gets its own span object
                anno.span = copy.deepcopy(new_span)
                found = True
        if found:
            diffs = _mini_diff(k, (old_doc, old_span), (new_doc, new_span))
            print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        else:
            print("WARNING: No annotations found for %s in %s" % (old_span, k),
                  file=sys.stderr)
        # the copy is saved whether or not anything was nudged
        save_document(output_dir, k, new_doc)
        # for commit message generation
        span = old_span.merge(new_span)
        commit_info = CommitInfo(key=k,
                                 before=old_doc,
                                 after=new_doc,
                                 span=span)
    if commit_info and not args.no_commit_msg:
        print("-----8<------")
        print(commit_msg(commit_info))
    announce_output_dir(output_dir)
示例#26
0
def main(args):
    """
    Subcommand main.

    Reads the single document found in `args.insert` and inserts it,
    via `move_portion`, at offset `args.start` of every requested
    document of the corpus at `args.corpus`.  A diff is printed to
    stderr for each target and the result is saved in the output
    directory.  Exits if the insert directory does not hold exactly one
    document.

    You shouldn't need to call this yourself if you're using
    `config_argparser`
    """
    output_dir = get_output_dir(args, default_overwrite=True)

    src_reader = educe.stac.LiveInputReader(args.insert)
    src_corpus = src_reader.slurp(src_reader.files())

    if not src_corpus:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (none found)")
    elif len(src_corpus) > 1:
        sys.exit("Insert dir must have exactly one .aa/.ac pair (%d found)" %
                 len(src_corpus))

    # indexing dict.values() fails on Python 3 (it is a view, not a
    # list); pulling the first element from an iterator is portable
    src_doc = next(iter(src_corpus.values()))

    reader = educe.stac.Reader(args.corpus)
    tgt_files = reader.filter(reader.files(), is_requested(args))
    tgt_corpus = reader.slurp(tgt_files)

    renames = compute_renames(tgt_corpus, src_corpus)
    for tgt_k in tgt_corpus:
        tgt_doc = tgt_corpus[tgt_k]
        # src_split=-1: the source document is passed with no explicit
        # split point; what remains of it is not needed here
        _, new_tgt_doc = move_portion(renames,
                                      src_doc,
                                      tgt_doc,
                                      -1,
                                      tgt_split=args.start)
        diffs = ["======= INSERT IN %s   ========" % tgt_k,
                 show_diff(tgt_doc, new_tgt_doc)]
        print("\n".join(diffs).encode('utf-8'), file=sys.stderr)
        save_document(output_dir, tgt_k, new_tgt_doc)

    announce_output_dir(output_dir)
示例#27
0
def main(args):
    """
    Subcommand main: apply `_delete_in_doc` for `args.anno_id` to every
    document of the corpus and save the results.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Exits early with an error message when `--stage` and `--annotator`
    are combined inconsistently.
    """
    stage = args.stage
    if stage:
        # the unannotated stage has no annotator; every other stage
        # needs one
        if stage == 'unannotated':
            if args.annotator:
                sys.exit("--annotator is forbidden if --stage is unannotated")
        elif not args.annotator:
            sys.exit("--annotator is required unless --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    doomed = args.anno_id
    for key in corpus:
        print(key)
        doc = corpus[key]
        _delete_in_doc(doomed, doc)
        save_document(output_dir, key, doc)

    print("Deleted %s" % anno_id_from_tuple(doomed), file=sys.stderr)
    announce_output_dir(output_dir)
示例#28
0
def command_annotate(args):
    """
    Top-level command: load a dialogue act model together with its
    vocabulary and labels, run `annotate_edus` over the corpus inputs
    read from `args`, then write every document to `args.output`.
    """
    # fixed settings for reading the corpus inputs
    args.ignore_cdus, args.parsing, args.single = False, True, True

    inputs = stac_features.read_corpus_inputs(args)
    model = load_model(args.model)
    # feature name -> position in the vocabulary file
    vocab = dict((feat, idx)
                 for idx, feat in enumerate(load_vocab(args.vocabulary)))
    labels = load_labels(args.labels)

    # annotation happens in memory, on the documents held by `inputs`
    annotate_edus(model, vocab, labels, inputs)

    # write the in-memory documents out under their output keys
    corpus = inputs.corpus
    for key in corpus:
        save_document(args.output, _output_key(key), corpus[key])
示例#29
0
def main(args):
    """
    Subcommand main: run `_delete_in_doc` with `args.anno_id` on each
    corpus document, saving every document to the output directory.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    The program exits with an error message if `--annotator` is missing
    for an annotated stage, or given for the unannotated stage.
    """
    if args.stage:
        is_unannotated = args.stage == 'unannotated'
        has_annotator = bool(args.annotator)
        if not is_unannotated and not has_annotator:
            sys.exit("--annotator is required unless --stage is unannotated")
        if is_unannotated and has_annotator:
            sys.exit("--annotator is forbidden if --stage is unannotated")

    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    for doc_key in corpus:
        print(doc_key)
        document = corpus[doc_key]
        _delete_in_doc(args.anno_id, document)
        save_document(output_dir, doc_key, document)

    # report what was deleted, then where the results went
    print("Deleted %s" % anno_id_from_tuple(args.anno_id), file=sys.stderr)
    announce_output_dir(output_dir)
示例#30
0
def command_annotate(args):
    """
    Top-level command: load a dialogue act model (via joblib), its
    vocabulary and its labels, run `annotate_edus` over the corpus inputs
    read from `args`, then write every document to `args.output`.
    """
    # fixed settings for reading the corpus inputs
    args.ignore_cdus, args.parsing, args.single = False, True, True
    args.strip_mode = 'head'  # FIXME should not be specified here

    inputs = stac_features.read_corpus_inputs(args)
    model = joblib.load(args.model)
    # feature name -> position in the vocabulary file
    vocab = dict((feat, idx)
                 for idx, feat in enumerate(load_vocab(args.vocabulary)))
    labels = load_labels(args.labels)

    # annotation happens in memory, on the documents held by `inputs`
    annotate_edus(model, vocab, labels, inputs)

    # write the in-memory documents out under their output keys
    corpus = inputs.corpus
    for key in corpus:
        save_document(args.output, _output_key(key), corpus[key])
示例#31
0
def main(args):
    """
    Subcommand main: merge the sought dialogues in every corpus document.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    The dialogues come either from `args.dialogues` or, when `args.turns`
    is given, from `_dialogues_in_turns` applied to its first two items.
    Fewer than two dialogues is an error (the program exits).  Unless
    `args.no_commit_msg` is set, a commit message computed before the
    merge is printed at the end.
    """
    too_few = "Must specify at least two dialogues"
    turns = args.turns

    if not turns and len(args.dialogues) < 2:
        sys.exit(too_few)
    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if not turns:
        sought = args.dialogues
    else:
        try:
            sought = _dialogues_in_turns(corpus, turns[0], turns[1])
            if len(sought) < 2:
                sys.exit(too_few)
            pretty_ids = ", ".join(anno_id_from_tuple(x) for x in sought)
            print("Merging dialogues: " + pretty_ids, file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))

    wants_msg = bool(corpus) and not args.no_commit_msg
    cmsg = None
    if wants_msg:
        # compute this before we change things
        first_key = next(iter(corpus))
        cmsg = commit_msg(args, corpus, first_key, sought)

    for key in corpus:
        doc = corpus[key]
        _merge_dialogues_in_document(sought, doc)
        save_document(output_dir, key, doc)
    announce_output_dir(output_dir)

    if wants_msg:
        print("-----8<------")
        print(cmsg)
示例#32
0
def main(args):
    """
    Subcommand main: merge a set of dialogues in each document of the
    corpus and save the results.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Dialogues are taken from `args.dialogues`, unless `args.turns` is
    set, in which case they are looked up with `_dialogues_in_turns`.
    The program exits if fewer than two dialogues are available.  A
    commit message, computed before any document is modified, is printed
    last unless `args.no_commit_msg` is set.
    """
    min_dialogues_msg = "Must specify at least two dialogues"

    by_turns = args.turns
    if not by_turns and len(args.dialogues) < 2:
        sys.exit(min_dialogues_msg)
    output_dir = get_output_dir(args, default_overwrite=True)
    corpus = read_corpus(args, verbose=True)

    if by_turns:
        try:
            sought = _dialogues_in_turns(corpus, by_turns[0], by_turns[1])
            if len(sought) < 2:
                sys.exit(min_dialogues_msg)
            names = [anno_id_from_tuple(x) for x in sought]
            print("Merging dialogues: " + ", ".join(names),
                  file=sys.stderr)
        except GlozzException as oops:
            sys.exit(str(oops))
    else:
        sought = args.dialogues

    show_msg = bool(corpus) and not args.no_commit_msg
    if show_msg:
        # compute this before we change things
        cmsg = commit_msg(args, corpus, list(corpus)[0], sought)

    for doc_key in corpus:
        document = corpus[doc_key]
        _merge_dialogues_in_document(sought, document)
        save_document(output_dir, doc_key, document)
    announce_output_dir(output_dir)

    if show_msg:
        print("-----8<------")
        print(cmsg)
示例#33
0
文件: move.py 项目: moreymat/educe
def main(args):
    """
    Subcommand main: move the span `args.span` of each source document
    into the matching document of subdoc `args.target`.

    You shouldn't need to call this yourself if you're using
    `config_argparser`

    Only spans that begin at the start of the source document, or that
    finish at its end, are supported; anything else raises `ValueError`,
    as does a missing target document.  Diffs go to stderr and both
    modified documents are saved in the output directory.
    """
    output_dir = get_output_dir(args, default_overwrite=True)
    span_start, span_end = args.span.char_start, args.span.char_end

    src_corpus = read_source_corpus(args)
    tgt_corpus = read_target_corpus(args)

    renames = compute_renames(tgt_corpus, src_corpus)

    for src_key, src_doc in src_corpus.items():
        # the target has the same key as the source, bar the subdoc
        tgt_key = copy.copy(src_key)
        tgt_key.subdoc = args.target
        print(src_key, tgt_key, file=sys.stderr)
        if tgt_key not in tgt_corpus:
            raise ValueError("Uh-oh! we don't have %s in the corpus" % tgt_key)
        tgt_doc = tgt_corpus[tgt_key]

        moving_up = span_start == 0
        # src_doc.text_span().char_end was once considered instead of
        # len(src_doc.text())
        if not moving_up and span_end != len(src_doc.text()):
            raise ValueError("Sorry, can only move to the start or to the "
                             "end of a document at the moment")

        if moving_up:
            # the head of the source goes to the tail of the target
            new_src_doc, new_tgt_doc = move_portion(
                renames, src_doc, tgt_doc,
                span_end,  # src_split
                tgt_split=-1)
        else:
            # move down
            # move_portion inserts src_doc[0:src_split] between
            # tgt_doc[0:tgt_split] and tgt_doc[tgt_split:],
            # so the tail src_doc[span_start:] is first detached into a
            # temporary doc, which is then handed over to move_portion
            new_src_doc, detached = split_doc(src_doc, span_start)
            _, new_tgt_doc = move_portion(
                renames, detached, tgt_doc,
                -1,  # src_split
                tgt_split=0)
            # the whitespace between new_src_doc and the detached doc went
            # with the latter, so new_src_doc gets a fresh trailing one
            evil_set_text(new_src_doc, new_src_doc.text() + ' ')

        # print diff for suggested commit message
        report = "\n".join([
            "======= TO %s   ========" % tgt_key,
            show_diff(tgt_doc, new_tgt_doc),
            "^------ FROM %s" % src_key,
            show_diff(src_doc, new_src_doc),
            "",
        ])
        print(report, file=sys.stderr)

        # dump the modified documents
        save_document(output_dir, src_key, new_src_doc)
        save_document(output_dir, tgt_key, new_tgt_doc)

    announce_output_dir(output_dir)