Example #1
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output,
                       fp.basename(args.corpus) + '.relations.sparse')
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS + COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels, zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
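A minimal invocation sketch for this variant, assuming main_pairs() is driven by an argparse-based CLI. The attribute names below are exactly the ones read in the body above; the concrete values are purely illustrative.

# Hypothetical call site; only the attributes used by main_pairs() are shown,
# read_corpus_inputs() may expect additional ones.
import argparse

args = argparse.Namespace(
    corpus='data/stac-corpus',   # hypothetical corpus path
    output='out/features',       # directory that receives the .sparse dumps
    parsing=False,               # True => 'units' stage, zeroed labels
    vocabulary=None,             # path to a saved vocabulary, if reusing one
)
# main_pairs(args)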
Example #2
def main_pairs(args):
    """Extract feature vectors for pairs of EDUs in the corpus."""
    inputs = read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # X, y follow the naming convention in sklearn
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    # create directory structure
    outdir = args.output
    if not fp.exists(outdir):
        os.makedirs(outdir)

    corpus_name = fp.basename(args.corpus)
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(
        outdir,
        '{corpus_name}.relations.sparse'.format(
            corpus_name=corpus_name))

    dump_all(X_gen, y_gen, out_file, labtor.labelset_, dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = fp.join(outdir,
                         '{corpus_name}.relations.sparse.vocab'.format(
                             corpus_name=corpus_name))
    dump_vocabulary(vzer.vocabulary_, vocab_file)
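The vocabulary dumped at the end maps feature names to column indices so that a later --parsing run can reproduce the same feature space. The sketch below shows one way load_vocabulary/dump_vocabulary could round-trip such a mapping through a tab-separated file; the real functions and their file format are not shown in these snippets, so treat this purely as an assumption about the contract (a dict from feature name to column index).

# Hypothetical round-trip sketch, one "feature<TAB>index" entry per line.
def dump_vocabulary_sketch(vocabulary, vocab_file):
    """Write a feature-name -> column-index mapping to disk."""
    with open(vocab_file, 'w') as f:
        for feat, idx in sorted(vocabulary.items(), key=lambda kv: kv[1]):
            f.write('{}\t{}\n'.format(feat, idx))


def load_vocabulary_sketch(vocab_file):
    """Read the mapping back, preserving the original column indices."""
    vocab = {}
    with open(vocab_file) as f:
        for line in f:
            feat, idx = line.rstrip('\n').rsplit('\t', 1)
            vocab[feat] = int(idx)
    return vocab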
Example #3
def main_pairs(args):
    """
    The usual main. Extract feature vectors from the corpus
    """
    inputs = features.read_corpus_inputs(args)
    stage = 'units' if args.parsing else 'discourse'
    dialogues = list(mk_high_level_dialogues(inputs, stage))
    # these paths should go away once we switch to a proper dumper
    out_file = fp.join(args.output, fp.basename(args.corpus))
    out_file += '.relations.sparse'
    instance_generator = lambda x: x.edu_pairs()

    labels = frozenset(SUBORDINATING_RELATIONS +
                       COORDINATING_RELATIONS)

    # pylint: disable=invalid-name
    # scikit-convention
    feats = extract_pair_features(inputs, stage)
    vzer = KeyGroupVectorizer()
    if args.parsing or args.vocabulary:
        vzer.vocabulary_ = load_vocabulary(args.vocabulary)
        X_gen = vzer.transform(feats)
    else:
        X_gen = vzer.fit_transform(feats)
    # pylint: enable=invalid-name
    labtor = LabelVectorizer(instance_generator, labels,
                             zero=args.parsing)
    y_gen = labtor.transform(dialogues)

    if not fp.exists(args.output):
        os.makedirs(args.output)

    dump_all(X_gen,
             y_gen,
             out_file,
             labtor.labelset_,
             dialogues,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
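All three variants follow the same sklearn-style pattern: fit the vectorizer on training data to learn vocabulary_, but only transform at parsing time with a previously saved vocabulary, so the train and test matrices share column indices. The same idea with a stock sklearn DictVectorizer, for comparison only; the project's KeyGroupVectorizer is not shown here.

# Analogy with sklearn's DictVectorizer: learn the feature space once,
# then reuse it so later matrices are column-compatible.
from sklearn.feature_extraction import DictVectorizer

train_feats = [{'word_pair=hi_there': 1, 'dist': 2},
               {'word_pair=ok_bye': 1, 'dist': 1}]
test_feats = [{'word_pair=hi_there': 1, 'dist': 3, 'unseen_feat': 1}]

vzer = DictVectorizer()
X_train = vzer.fit_transform(train_feats)   # learns vocabulary_
X_test = vzer.transform(test_feats)         # reuses it; 'unseen_feat' is dropped
assert X_train.shape[1] == X_test.shape[1]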
Example #4
def main(args):
    "main for feature extraction mode"
    # retrieve parameters
    feature_set = args.feature_set
    live = args.parsing

    # RST data
    rst_reader = RstDtParser(args.corpus, args, coarse_rels=True)
    rst_corpus = rst_reader.corpus
    # TODO: change educe.corpus.Reader.slurp*() so that they return an object
    # which contains a *list* of FileIds and a *list* of annotations
    # (see sklearn's Bunch)
    # on creation of these lists, one can impose the list of names to be
    # sorted so that the order in which docs are iterated is guaranteed
    # to be always the same

    # PTB data
    ptb_parser = PtbParser(args.ptb)

    # align EDUs with sentences, tokens and trees from PTB
    def open_plus(doc):
        """Open and fully load a document

        doc is an educe.corpus.FileId
        """
        # create a DocumentPlus
        doc = rst_reader.decode(doc)
        # populate it with layers of info
        # tokens
        doc = ptb_parser.tokenize(doc)
        # syn parses
        doc = ptb_parser.parse(doc)
        # disc segments
        doc = rst_reader.segment(doc)
        # disc parse
        doc = rst_reader.parse(doc)
        # pre-compute the relevant info for each EDU
        doc = doc.align_with_doc_structure()
        # logical order is align with tokens, then align with trees
        # but aligning with trees first for the PTB enables
        # to get proper sentence segmentation
        doc = doc.align_with_trees()
        doc = doc.align_with_tokens()
        # dummy, fallback tokenization if there is no PTB gold or silver
        doc = doc.align_with_raw_words()

        return doc

    # generate DocumentPluses
    # TODO remove sorted() once educe.corpus.Reader is able
    # to iterate over a stable (sorted) list of FileIds
    docs = [open_plus(doc) for doc in sorted(rst_corpus)]
    # instance generator
    instance_generator = lambda doc: doc.all_edu_pairs()

    # extract vectorized samples
    if args.vocabulary is not None:
        vocab = load_vocabulary(args.vocabulary)
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       vocabulary=vocab)
        X_gen = vzer.transform(docs)
    else:
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       min_df=5)
        X_gen = vzer.fit_transform(docs)

    # extract class label for each instance
    if live:
        y_gen = itertools.repeat(0)
    elif args.labels is not None:
        labelset = load_labels(args.labels)
        labtor = DocumentLabelExtractor(instance_generator,
                                        labelset=labelset)
        labtor.fit(docs)
        y_gen = labtor.transform(docs)
    else:
        labtor = DocumentLabelExtractor(instance_generator)
        # y_gen = labtor.fit_transform(rst_corpus)
        # fit then transform enables to get classes_ for the dump
        labtor.fit(docs)
        y_gen = labtor.transform(docs)

    # dump instances to files
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # data file
    of_ext = '.sparse'
    if live:
        out_file = os.path.join(args.output, 'extracted-features' + of_ext)
    else:
        of_bn = os.path.join(args.output, os.path.basename(args.corpus))
        out_file = '{}.relations{}'.format(of_bn, of_ext)

    # dump
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, docs,
             instance_generator)

    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
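The dump_all call writes the instances in sparse svmlight/libsvm format (as the next example's comment notes). For readers unfamiliar with that format, here is a hedged sketch of the output step using sklearn's stock writer; the project's own dump_all additionally records which EDU pair each row corresponds to, which this simplified version omits.

# Simplified view of the "dump" step with sklearn's built-in svmlight writer.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.datasets import dump_svmlight_file

X = csr_matrix(np.array([[0., 1., 0.],
                         [2., 0., 3.]]))   # one row per EDU pair
y = np.array([1, 4])                       # integer-encoded relation labels
dump_svmlight_file(X, y, 'example.relations.sparse', zero_based=True)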
Example #5
def main(args):
    "main for feature extraction mode"
    # retrieve parameters
    feature_set = args.feature_set
    live = args.parsing

    # NEW lecsie features
    lecsie_data_dir = args.lecsie_data_dir

    # RST data
    # fileX docs are currently not supported by CoreNLP
    exclude_file_docs = args.corenlp_out_dir

    rst_reader = RstDtParser(args.corpus, args,
                             coarse_rels=args.coarse,
                             fix_pseudo_rels=args.fix_pseudo_rels,
                             exclude_file_docs=exclude_file_docs)
    rst_corpus = rst_reader.corpus
    # TODO: change educe.corpus.Reader.slurp*() so that they return an object
    # which contains a *list* of FileIds and a *list* of annotations
    # (see sklearn's Bunch)
    # on creation of these lists, one can impose the list of names to be
    # sorted so that the order in which docs are iterated is guaranteed
    # to be always the same

    # syntactic preprocessing
    if args.corenlp_out_dir:
        # get the precise path to CoreNLP parses for the corpus currently used
        # the folder layout of CoreNLP's output currently follows that of the
        # corpus: RSTtrees-main-1.0/{TRAINING,TEST}, RSTtrees-double-1.0
        # FIXME clean rewrite ; this could mean better modelling of the corpus
        # subparts/versions, e.g. RST corpus have "version: 1.0", annotators
        # "main" or "double"

        # find the suffix of the path name that starts with RSTtrees-*
        # FIXME find a cleaner way to do this ;
        # should probably use pathlib, included in the standard lib
        # for python >= 3.4
        try:
            rel_idx = (args.corpus).index('RSTtrees-WSJ-')
        except ValueError:
            # if no part of the path starts with "RSTtrees", keep the
            # entire path (no idea whether this is good)
            relative_corpus_path = args.corpus
        else:
            relative_corpus_path = args.corpus[rel_idx:]

        corenlp_out_dir = os.path.join(args.corenlp_out_dir,
                                       relative_corpus_path)
        csyn_parser = CoreNlpParser(corenlp_out_dir)
    else:
        # TODO improve switch between gold and predicted syntax
        # PTB data
        csyn_parser = PtbParser(args.ptb)
    # FIXME
    print('offline syntactic preprocessing: ready')

    # align EDUs with sentences, tokens and trees from PTB
    def open_plus(doc):
        """Open and fully load a document

        doc is an educe.corpus.FileId
        """
        # create a DocumentPlus
        doc = rst_reader.decode(doc)
        # populate it with layers of info
        # tokens
        doc = csyn_parser.tokenize(doc)
        # syn parses
        doc = csyn_parser.parse(doc)
        # disc segments
        doc = rst_reader.segment(doc)
        # disc parse
        doc = rst_reader.parse(doc)
        # pre-compute the relevant info for each EDU
        doc = doc.align_with_doc_structure()
        # logical order is align with tokens, then align with trees
        # but aligning with trees first for the PTB enables
        # to get proper sentence segmentation
        doc = doc.align_with_trees()
        doc = doc.align_with_tokens()
        # dummy, fallback tokenization if there is no PTB gold or silver
        doc = doc.align_with_raw_words()

        return doc

    # generate DocumentPluses
    # TODO remove sorted() once educe.corpus.Reader is able
    # to iterate over a stable (sorted) list of FileIds
    docs = [open_plus(doc) for doc in sorted(rst_corpus)]
    # instance generator
    instance_generator = lambda doc: doc.all_edu_pairs()
    split_feat_space = 'dir_sent'
    # extract vectorized samples
    if args.vocabulary is not None:
        vocab = load_vocabulary(args.vocabulary)
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       lecsie_data_dir=lecsie_data_dir,
                                       vocabulary=vocab,
                                       split_feat_space=split_feat_space)
        X_gen = vzer.transform(docs)
    else:
        vzer = DocumentCountVectorizer(instance_generator,
                                       feature_set,
                                       lecsie_data_dir=lecsie_data_dir,
                                       min_df=5,
                                       split_feat_space=split_feat_space)
        X_gen = vzer.fit_transform(docs)

    # extract class label for each instance
    if live:
        y_gen = itertools.repeat(0)
    elif args.labels is not None:
        labelset = load_labels(args.labels)
        labtor = DocumentLabelExtractor(instance_generator,
                                        labelset=labelset)
        labtor.fit(docs)
        y_gen = labtor.transform(docs)
    else:
        labtor = DocumentLabelExtractor(instance_generator)
        # y_gen = labtor.fit_transform(rst_corpus)
        # fit then transform enables to get classes_ for the dump
        labtor.fit(docs)
        y_gen = labtor.transform(docs)

    # dump instances to files
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # data file
    of_ext = '.sparse'
    if live:
        out_file = os.path.join(args.output, 'extracted-features' + of_ext)
    else:
        of_bn = os.path.join(args.output, os.path.basename(args.corpus))
        out_file = '{}.relations{}'.format(of_bn, of_ext)
    # dump EDUs and features in svmlight format
    dump_all(X_gen, y_gen, out_file, labtor.labelset_, docs,
             instance_generator)
    # dump vocabulary
    vocab_file = out_file + '.vocab'
    dump_vocabulary(vzer.vocabulary_, vocab_file)
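The FIXME about locating the RSTtrees-WSJ-* suffix points at pathlib; below is one possible rewrite of just that path computation, under the assumption that the CoreNLP output layout really does mirror the corpus layout. It keeps the same fallback as the original (use the whole corpus path when no component matches). Illustrative only, not part of the snippets above.

# Possible pathlib-based rewrite of the path-suffix computation.
import os
from pathlib import PurePath

def corenlp_dir_for(corpus, corenlp_out_dir):
    """Locate the CoreNLP output directory mirroring this corpus sub-path."""
    parts = PurePath(corpus).parts
    starts = [i for i, p in enumerate(parts)
              if p.startswith('RSTtrees-WSJ-')]
    if starts:
        relative = os.path.join(*parts[starts[0]:])
    else:
        # no component starts with 'RSTtrees-WSJ-': keep the entire path
        relative = corpus
    return os.path.join(corenlp_out_dir, relative)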