예제 #1
0
파일: features.py 프로젝트: fbuijs/educe
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)

    verbnet_entries = [
        VerbNetEntry(x, frozenset(vnet.lemmas(x))) for x in VERBNET_CLASSES
    ]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
예제 #2
0
파일: features.py 프로젝트: kowey/educe
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {} #_read_inquirer_lexicon(args)

    verbnet_entries = [VerbNetEntry(x, frozenset(vnet.lemmas(x)))
                       for x in VERBNET_CLASSES]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
예제 #3
0
def _read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    is_interesting = mk_is_interesting(args, preselected={"stage": ["units"]})
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(), is_interesting)
    corpus = reader.slurp(anno_files, verbose=True)

    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    LEXICON.read(args.resources)
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=[LEXICON],
                        pdtb_lex=None,
                        verbnet_entries=None,
                        inquirer_lex=None)
예제 #4
0
파일: res_nps.py 프로젝트: eipiplusun/educe
def _read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    is_interesting = mk_is_interesting(args,
                                       preselected={"stage": ["units"]})
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(), is_interesting)
    corpus = reader.slurp(anno_files, verbose=True)

    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    LEXICON.read(args.resources)
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=[LEXICON],
                        pdtb_lex=None,
                        verbnet_entries=None,
                        inquirer_lex=None)