def main():
    """Segment one document into EDUs and print them, one EDU per line."""
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('model_path',
                            help='crf++ model file created by '
                                 'tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    # Wrap the raw document text in the dictionary format the pipeline expects.
    doc_dict = {"doc_id": args.input_path,
                "raw_text": read_text_file(args.input_path)}

    # Syntactically parse the document (via a ZPar server if port/host given).
    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)

    # Derive per-sentence token-level views of the parse trees.
    preterminal_lists = [extract_preterminals(tree) for tree in trees]
    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    doc_dict["tokens"] = tokens_doc
    doc_dict["syntax_trees"] = [tree.pformat(margin=TREE_PRINT_MARGIN)
                                for tree in trees]
    doc_dict["token_tree_positions"] = [[node.treeposition() for node in sent]
                                        for sent in preterminal_lists]
    doc_dict["pos_tags"] = [[node.label() for node in sent]
                            for sent in preterminal_lists]

    # Run the discourse segmenter; 'edu_start_indices' is read from doc_dict
    # afterwards, so segment_document is expected to add it.
    segmenter = Segmenter(args.model_path)
    segmenter.segment_document(doc_dict)

    for edu_tokens in extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc):
        print(' '.join(edu_tokens))
def main():
    """Segment one document into EDUs and print them, one EDU per line.

    Reads the document at ``input_path``, parses it with ZPar, runs the
    CRF++ discourse segmenter given by ``model_path``, and prints each EDU
    as a space-joined token string.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_path',
                        help='crf++ model file created by '
                             'tune_segmentation_model.py.')
    parser.add_argument('input_path', help='document text file')
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = parser.parse_args()

    raw_text = read_text_file(args.input_path)
    doc_dict = {"doc_id": args.input_path, "raw_text": raw_text}

    parser = SyntaxParserWrapper(port=args.zpar_port,
                                 hostname=args.zpar_hostname)
    trees, _ = parser.parse_document(doc_dict)

    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]
    token_tree_positions = [[x.treeposition() for x in preterminals_sentence]
                            for preterminals_sentence in preterminals]
    pos_tags = [[x.label() for x in preterminals_sentence]
                for preterminals_sentence in preterminals]

    doc_dict["tokens"] = tokens_doc
    # BUG FIX: this previously called t.pprint(TREE_PRINT_MARGIN).  In
    # NLTK 3, Tree.pprint() prints to a stream and returns None (and does
    # not take a margin positionally), so doc_dict["syntax_trees"] did not
    # receive the tree strings.  Use pformat(margin=...) as the other
    # functions in this file do.
    doc_dict["syntax_trees"] = [t.pformat(margin=TREE_PRINT_MARGIN)
                                for t in trees]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    # Segment; 'edu_start_indices' is read from doc_dict below, so
    # segment_document is expected to add it.
    segmenter = Segmenter(args.model_path)
    segmenter.segment_document(doc_dict)

    edu_token_lists = extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc)
    for edu_tokens in edu_token_lists:
        print(' '.join(edu_tokens))
def batch_process(docs, output_path, zpar_model_directory, segmentation_model,
                  parsing_model):
    '''
    Segment and RST-parse every document in ``docs`` (a list or tuple of
    (doc_id, text) tuples), writing one JSON record per document, one per
    line, to ``output_path``.
    '''
    # Load all three models once, up front.
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)
    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            doc_dict = {"doc_id": doc_id, "raw_text": text}
            edu_tokens, complete_trees = segment_and_parse(
                doc_dict, syntax_parser, segmenter, rst_parser)
            # Serialize each scored tree as a string for the JSON record.
            scored = [{"score": tree["score"],
                       "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                      for tree in complete_trees]
            record = {"doc_id": doc_id,
                      "edu_tokens": edu_tokens,
                      "scored_rst_trees": scored}
            print(json.dumps(record), file=outfile)
def main():
    """Command-line entry point: segment and RST-parse one or more documents.

    For each input path, prints one JSON record containing the EDU tokens
    and the scored RST trees, and also writes each tree's repr to
    ``<input_path>_<i>.parentedtree``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.'
                             ' Paragraphs should be separated by two or more'
                             ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.', required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.', required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?', type=int,
                        default=1)
    parser.add_argument('-n', '--n_best', help='Number of parses to return',
                        type=int, default=1)
    # BUG FIX: the original used a backslash line continuation *inside* this
    # string literal, which embedded the source indentation whitespace into
    # the help text; adjacent literals produce the intended message.
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for '
                             'best-first search', type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert the verbose flag count to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port, hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'.format(
            input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}
        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, parser)
        # Materialize the (possibly lazy) parse results: they are iterated
        # twice below, and a generator can't be consumed twice.
        complete_trees = list(complete_trees)

        print(json.dumps(
            {"edu_tokens": edu_tokens,
             "scored_rst_trees":
                 [{"score": tree["score"],
                   "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                  for tree in complete_trees]}))

        # Also save each scored tree's repr to its own numbered file.
        for i, tree in enumerate(complete_trees, 1):
            ptree_str = repr(tree['tree']) + '\n'
            with codecs.open(input_path + '_{}.parentedtree'.format(i),
                             'w', 'utf-8') as ptree_file:
                ptree_file.write(ptree_str)
def main():
    """Evaluate the RST parser (and optionally the segmenter) on a dataset.

    Loads the evaluation-set JSON, predicts RST trees using either gold or
    automatic syntax/segmentation, and prints the evaluation results as
    sorted JSON.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set', help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    # BUG FIX: the help strings below were concatenated without separating
    # spaces, producing "specified,then" and "willbe" in --help output.
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model. If not specified, '
                             'then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.', required=True)
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t', '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will '
                             'be used.',
                        action='store_true')
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions to perform on each '
                             'state', type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for '
                             'best-first search', type=int, default=1)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every '
                             'additional time this flag is specified, '
                             'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # BUG FIX: was a bare `assert`, which is stripped under `python -O` and
    # raises an opaque AssertionError otherwise; report a usage error instead.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('Either gold syntax (-t) or a segmentation model (-g) '
                     'must be provided.')

    # Convert verbose flag to an actual logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'), level=log_level)
    logger = logging.getLogger(__name__)

    # Read the models.  With gold syntax there is no need for ZPar; with
    # no segmentation model, gold EDUs are used.
    logger.info('Loading models')
    # TODO add port, host, model args
    syntax_parser = SyntaxParserWrapper() if not args.use_gold_syntax else None
    segmenter = (Segmenter(args.segmentation_model)
                 if args.segmentation_model else None)
    rst_parser = Parser(max_acts=args.max_acts, max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = predict_and_evaluate_rst_trees(
        syntax_parser, segmenter, rst_parser, eval_data,
        use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))