Example #1
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.relations.sparse')
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
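These main_* entry points all take an argparse namespace. Below is a minimal sketch of the wiring this example assumes; the positional and flag names are inferred from the attributes main_pairs reads (args.corpus, args.output, args.parsing, args.vocabulary), and the real educe command line defines its own options and may add others consumed by read_corpus_inputs.

import argparse

# Hypothetical argument wiring; not the actual educe CLI definition.
parser = argparse.ArgumentParser()
parser.add_argument('corpus', help='path to the corpus directory')
parser.add_argument('output', help='directory to write feature files to')
parser.add_argument('--parsing', action='store_true',
                    help='extract features for live (unlabelled) data')
parser.add_argument('--vocabulary', default=None,
                    help='reuse the vocabulary from a previous extraction')
main_pairs(parser.parse_args())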
Example #2
File: extract.py Project: eipiplusun/educe
def main_single(args):
    """Extract feature vectors for single EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'unannotated' if args.parsing else 'units'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.dialogue-acts.sparse')
    instance_generator = lambda x: x.edus[1:]  # drop fake root

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_single_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    # TODO? just transform() if args.parsing or args.vocabulary?
    X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = DialogueActVectorizer(instance_generator, DIALOGUE_ACTS)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    # list dialogue acts
    comment = labels_comment(labtor.labelset_)

    # dump: EDUs, pairings, vectorized pairings with label
    edu_input_file = out_file + '.edu_input'
    dump_edu_input_file(dialogues, edu_input_file)
    dump_svmlight_file(X_gen, y_gen, out_file, comment=comment)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
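The .sparse file written here is, as its name suggests, in the svmlight/libsvm text format: one instance per line, an integer label followed by sparse index:value pairs. Assuming that format, it can be read back for training with scikit-learn; the path below is a hypothetical one built from args.output and args.corpus as in the example above.

from sklearn.datasets import load_svmlight_file

# X comes back as a scipy.sparse CSR matrix of features; y holds the
# integer-encoded dialogue act labels produced by DialogueActVectorizer.
X, y = load_svmlight_file('out/my_corpus.dialogue-acts.sparse')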
Example #3
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)

    corpus_name = fp.basename(args.corpus)
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse'.format(
            corpus_name=corpus_name))

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = fp.join(outdir,
                         '{corpus_name}.relations.sparse.vocab'.format(
                             corpus_name=corpus_name))
    dump_vocabulary(vzer.vocabulary_, vocab_file)
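The branch above follows the usual scikit-learn fit/transform split: fit_transform learns vocabulary_ from the training corpus, while transform reuses a previously saved vocabulary so that feature indices at parsing time line up with the trained model. The same convention, illustrated with scikit-learn's DictVectorizer standing in for the educe-specific KeyGroupVectorizer:

from sklearn.feature_extraction import DictVectorizer

train = [{'w=hello': 1.0}, {'w=bye': 1.0}]
test = [{'w=hello': 1.0}, {'w=unseen': 1.0}]

vzer = DictVectorizer()
X_train = vzer.fit_transform(train)  # learns vzer.vocabulary_
X_test = vzer.transform(test)        # reuses it; unseen features are dropped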
Example #4
File: extract.py Project: chloebt/educe
def main_parsing_pairs(args):
    """
    Main to call when live data are passed in (--parsing). Live data are data
    that we want to discourse parsing on, so we don't know if they are attached
    or what the label is.

    As of 2014-08-19, there must be an 'unannotated' stage and an optional
    'units' stage (for dialogue acts)
    """
    inputs = features.read_corpus_inputs(args, stage='units|unannotated')
    features_file = os.path.join(args.output, 'extracted-features.csv')
    with codecs.open(features_file, 'wb') as ofile:
        header = features.PairKeys(inputs)
        writer = mk_csv_writer(header, ofile)
        feats = features.extract_pair_features(inputs,
                                               args.window,
                                               live=True)
        for row, _ in feats:
            writer.writerow(row)
Example #5
File: extract.py Project: kowey/educe
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen,
             y_gen,
             out_file,
             labtor.labelset_,
             dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
Example #6
File: extract.py Project: chloebt/educe
def main_corpus_single(args):
    """
    The usual main. Extract feature vectors from the corpus
    (single edus only)
    """
    inputs = features.read_corpus_inputs(args)
    of_bn = os.path.join(args.output, os.path.basename(args.corpus))
    of_ext = '.csv'
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    just_edus_file = of_bn + '.just-edus' + of_ext
    with codecs.open(just_edus_file, 'wb') as ofile:
        gen = features.extract_single_features(inputs)
        try:
            _write_singles(gen, ofile)
        except StopIteration:
            # FIXME: I have a nagging feeling that we should properly
            # support this by just printing a CSV header and nothing
            # else, but I'm trying to minimise code paths and for now
            # failing in this corner case feels like a lesser evil :-/
            sys.exit("No features to extract!")
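The StopIteration handler above bails out when the feature generator is empty. If one did want to print a header and continue instead, a common pattern is to peek at the first item before writing anything; peek below is a hypothetical helper, not part of educe.

import itertools

def peek(gen):
    # Raises StopIteration when the generator is empty; otherwise returns
    # the first item together with an iterator that replays it.
    first = next(gen)
    return first, itertools.chain([first], gen)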
Example #7
def command_annotate(args):
    """
    Top-level command: given a dialogue act model, and a corpus with some
    Glozz documents, perform dialogue act annotation on them, and simple
    addressee detection, and dump Glozz documents in the output directory
    """
    args.ignore_cdus = False
    args.parsing = True
    args.single = True
    args.strip_mode = 'head'  # FIXME should not be specified here
    inputs = stac_features.read_corpus_inputs(args)
    model = joblib.load(args.model)
    vocab = {f: i for i, f in enumerate(load_vocab(args.vocabulary))}
    labels = load_labels(args.labels)

    # add dialogue acts and addressees
    annotate_edus(model, vocab, labels, inputs)

    # corpus has been modified in-memory, now save to disk
    for key in inputs.corpus:
        key2 = _output_key(key)
        doc = inputs.corpus[key]
        save_document(args.output, key2, doc)
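The model file loaded above is a classifier persisted with joblib. Here is a hypothetical round-trip showing where such a file could come from; the classifier choice, toy data, and path are illustrative, not what the STAC pipeline necessarily uses.

import joblib
import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy stand-in data; in practice X, y come from the dumped .sparse files.
X = np.array([[0.0, 1.0], [1.0, 0.0]])
y = np.array([0, 1])

clf = LogisticRegression().fit(X, y)
joblib.dump(clf, 'dialogue-acts.model')  # what args.model would point at
model = joblib.load('dialogue-acts.model')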
Example #8
def command_annotate(args):
    """
    Top-level command: given a dialogue act model, and a corpus with some
    Glozz documents, perform dialogue act annotation on them, and simple
    addressee detection, and dump Glozz documents in the output directory
    """
    args.ignore_cdus = False
    args.parsing = True
    args.single = True
    inputs = stac_features.read_corpus_inputs(args)
    model = load_model(args.model)
    vocab = {f: i for i, f in
             enumerate(load_vocab(args.vocabulary))}
    labels = load_labels(args.labels)

    # add dialogue acts and addressees
    annotate_edus(model, vocab, labels, inputs)

    # corpus has been modified in-memory, now save to disk
    for key in inputs.corpus:
        key2 = _output_key(key)
        doc = inputs.corpus[key]
        save_document(args.output, key2, doc)