def read_corpus_inputs(args):
    """Read and filter the part of the corpus we want features for.

    Slurps the annotation files selected by the command-line arguments,
    optionally flattens CDUs, attaches POS tags and CoreNLP parses, and
    loads the lexical resources needed downstream.

    :param args: parsed command-line arguments (corpus path, resource
        path, filtering and CDU-handling options)
    :rtype: FeatureInput
    """
    reader = educe.stac.Reader(args.corpus)
    interesting = mk_is_interesting(args, args.single)
    anno_files = reader.filter(reader.files(), interesting)
    corpus = reader.slurp(anno_files, verbose=True)
    # unless asked to keep them, replace CDUs according to the chosen mode
    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)
    # lexicons are loaded in place from the resources directory
    for lexicon in LEXICONS:
        lexicon.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args) -- currently disabled
    verbnet_entries = [VerbNetEntry(vclass, frozenset(vnet.lemmas(vclass)))
                       for vclass in VERBNET_CLASSES]
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
def _main_enclosure_graph(args):
    """Draw graphs showing which annotations' spans include the others.

    For every key in the corpus, builds an enclosure graph (optionally
    augmented with POS-tagged tokens and/or transitively reduced) and
    writes it out as a dot graph; empty graphs are skipped with a
    message on stderr.

    :param args: parsed command-line arguments (corpus path, output
        directory, ``tokens``/``reduce``/``draw`` flags)
    """
    corpus = _read_corpus(args)
    output_dir = get_output_dir(args)
    # POS tags are only loaded when token-level nodes were requested
    postags = postag.read_tags(corpus, args.corpus) if args.tokens else None
    for key in sorted(corpus):
        if postags:
            graph = stacgraph.EnclosureGraph(corpus[key], postags[key])
        else:
            graph = stacgraph.EnclosureGraph(corpus[key])
        if args.reduce:
            graph.reduce()
        dot_graph = stacgraph.EnclosureDotGraph(graph)
        if not dot_graph.get_nodes():
            # nothing to draw for this document
            print("Skipping %s (empty graph)" % key, file=sys.stderr)
            continue
        dot_graph.set("ratio", "compress")
        write_dot_graph(key, output_dir, dot_graph, run_graphviz=args.draw)
def read_corpus_inputs(args):
    """Read and filter the part of the corpus we want features for.

    Slurps the annotation files selected by the command-line arguments,
    optionally strips CDUs, attaches POS tags and CoreNLP parses, and
    loads the lexical resources needed downstream.

    :param args: parsed command-line arguments (corpus path, resource
        path, filtering and CDU-handling options)
    :rtype: FeatureInput
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)
    # CDUs are flattened away unless the caller asked to keep them
    if not args.ignore_cdus:
        strip_cdus(corpus)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)
    # lexicons are loaded in place from the resources directory
    for lexicon in LEXICONS:
        lexicon.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args) -- currently disabled
    verbnet_entries = [VerbNetEntry(vclass, frozenset(vnet.lemmas(vclass)))
                       for vclass in VERBNET_CLASSES]
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
def _read_corpus_inputs(args):
    """Read and filter the part of the corpus we want features for.

    A lighter-weight variant of the feature-input loader: restricted to
    the ``units`` annotation stage, with a single lexicon and without the
    PDTB/VerbNet/Inquirer resources.

    :param args: parsed command-line arguments (corpus path, resource
        path, filtering options)
    :rtype: FeatureInput
    """
    reader = educe.stac.Reader(args.corpus)
    # only the unit-level annotation stage is of interest here
    keep = mk_is_interesting(args, preselected={"stage": ["units"]})
    anno_files = reader.filter(reader.files(), keep)
    corpus = reader.slurp(anno_files, verbose=True)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    # the single lexicon is loaded in place from the resources directory
    LEXICON.read(args.resources)
    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=[LEXICON],
                        pdtb_lex=None,
                        verbnet_entries=None,
                        inquirer_lex=None)