def read_augmented_corpus(args, verbose=True):
    """
    Slurp the 'unannotated' stage of the augmented corpus.

    The corpus is read from `args.augmented`. Filtering is driven by a
    copy of `args` whose `annotator` attribute is reset to None, with
    the stage preselected to 'unannotated'.

    :param args: parsed command line arguments; must provide
                 `augmented` plus whatever `mk_is_interesting` reads
    :param verbose: passed through to the reader's `slurp`
    :return: whatever `educe.stac.Reader.slurp` returns for the
             selected files
    """
    # Work on a shallow copy so that resetting the annotator does not
    # overwrite the attribute on the caller's args object.
    filter_args = copy.copy(args)
    filter_args.annotator = None
    stage_choice = {'stage': ['unannotated']}
    keep = mk_is_interesting(filter_args, preselected=stage_choice)

    corpus_reader = educe.stac.Reader(args.augmented)
    wanted = corpus_reader.filter(corpus_reader.files(), keep)
    return corpus_reader.slurp(wanted, verbose)
def _read_corpus_inputs(args):
    """
    Gather the inputs needed for feature extraction.

    Reads the 'units' stage of the corpus under `args.corpus` (filtered
    according to `args`), then the POS tags and CoreNLP results for
    that corpus, and calls `LEXICON.read` on `args.resources`.

    :param args: parsed command line arguments; must provide `corpus`
                 and `resources`
    :return: a FeatureInput holding the corpus, POS tags, parses and
             the module-level LEXICON; the `pdtb_lex`,
             `verbnet_entries` and `inquirer_lex` fields are left
             as None
    """
    keep = mk_is_interesting(args, preselected={"stage": ["units"]})
    corpus_reader = educe.stac.Reader(args.corpus)
    wanted = corpus_reader.filter(corpus_reader.files(), keep)
    unit_corpus = corpus_reader.slurp(wanted, verbose=True)

    # tagger and parser output are looked up relative to the corpus dir
    tags = postag.read_tags(unit_corpus, args.corpus)
    parse_results = corenlp.read_results(unit_corpus, args.corpus)

    # LEXICON is a module-level object; read it before handing it over
    LEXICON.read(args.resources)

    return FeatureInput(
        corpus=unit_corpus,
        postags=tags,
        parses=parse_results,
        lexicons=[LEXICON],
        pdtb_lex=None,
        verbnet_entries=None,
        inquirer_lex=None,
    )
# ---------------------------------------------------------------------
# args
# ---------------------------------------------------------------------

arg_parser = argparse.ArgumentParser(description='Dump EDU text')
arg_parser.add_argument(
    'idir',
    metavar='DIR',
    help='Input directory',
)
educe_group = arg_parser.add_argument_group('corpus filtering arguments')
# only the 'doc' filter is exposed on the command line
util.add_corpus_filters(educe_group, fields=['doc'])
args = arg_parser.parse_args()

# the remaining filter fields are fixed here rather than user-supplied
args.subdoc = None
args.stage = 'unannotated'
args.annotator = None
is_interesting = util.mk_is_interesting(args)

# ---------------------------------------------------------------------
# main
# ---------------------------------------------------------------------

reader = educe.stac.Reader(args.idir)
anno_files = reader.filter(reader.files(), is_interesting)

# NOTE(review): board_id is not defined in this part of the file;
# presumably it is set elsewhere -- confirm
trello = tr.TrelloApi(secrets.apikey, secrets.token)
board = trello.boards.get(board_id)
columns = trello.boards.get_list(board_id)
cards = trello.boards.get_card(board_id)

# group the subdocument names of the selected files by document
subdocs = collections.defaultdict(list)
for k in anno_files:
    subdocs[k.doc].append(k.subdoc)
# # then visit in your web browser to approve
# # and paste in the resulting token

# ---------------------------------------------------------------------
# args
# ---------------------------------------------------------------------

# command line: one positional input directory plus the 'doc' filter
arg_parser = argparse.ArgumentParser(
    description='Dump EDU text',
)
arg_parser.add_argument(
    'idir',
    metavar='DIR',
    help='Input directory',
)
educe_group = arg_parser.add_argument_group(
    'corpus filtering arguments',
)
util.add_corpus_filters(
    educe_group,
    fields=['doc'],
)
args = arg_parser.parse_args()

# pin the filter fields that are not exposed on the command line
args.subdoc = None
args.stage = 'unannotated'
args.annotator = None
is_interesting = util.mk_is_interesting(args)

# ---------------------------------------------------------------------
# main
# ---------------------------------------------------------------------

# select the corpus files matching the filter built above
reader = educe.stac.Reader(args.idir)
anno_files = reader.filter(
    reader.files(),
    is_interesting,
)

# fetch the board, its lists and its cards from Trello
# NOTE(review): board_id does not appear in this part of the file;
# presumably defined elsewhere -- confirm
trello = tr.TrelloApi(secrets.apikey, secrets.token)
board = trello.boards.get(board_id)
columns = trello.boards.get_list(board_id)
cards = trello.boards.get_card(board_id)

# map each document to the list of its selected subdocuments
subdocs = collections.defaultdict(list)
for k in anno_files:
    subdocs[k.doc].append(k.subdoc)