def main():
    """Command-line entry point: segment one document into EDUs.

    Reads the document named by ``input_path``, parses it with the ZPar
    wrapper, attaches the token/tree metadata the segmenter expects, runs
    the CRF++ segmentation model, and prints each EDU as a space-joined
    token string, one per line.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('model_path',
                            help='crf++ model file created by '
                                 'tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    # The input path doubles as the document id.
    doc_dict = {"doc_id": args.input_path,
                "raw_text": read_text_file(args.input_path)}

    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    parse_trees, _ = syntax_parser.parse_document(doc_dict)

    # Collect per-sentence tokens and preterminal nodes from the parses.
    tokens_doc = []
    preterminal_lists = []
    for tree in parse_trees:
        tokens_doc.append(extract_converted_terminals(tree))
        preterminal_lists.append(extract_preterminals(tree))

    doc_dict["tokens"] = tokens_doc
    doc_dict["syntax_trees"] = [tree.pformat(margin=TREE_PRINT_MARGIN)
                                for tree in parse_trees]
    doc_dict["token_tree_positions"] = [
        [node.treeposition() for node in sentence]
        for sentence in preterminal_lists]
    doc_dict["pos_tags"] = [[node.label() for node in sentence]
                            for sentence in preterminal_lists]

    # segment_document() mutates doc_dict, adding 'edu_start_indices'.
    Segmenter(args.model_path).segment_document(doc_dict)

    for edu_tokens in extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc):
        print(' '.join(edu_tokens))
def main():
    """Command-line entry point: segment a document into EDUs and print them.

    Loads a CRF++ segmentation model, parses the input document with the
    ZPar wrapper, attaches the token/tree metadata the segmenter expects,
    and prints each EDU as a space-joined token string, one per line.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_path',
                        help='crf++ model file created by ' +
                        'tune_segmentation_model.py.')
    parser.add_argument('input_path', help='document text file')
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = parser.parse_args()

    raw_text = read_text_file(args.input_path)
    doc_dict = {"doc_id": args.input_path, "raw_text": raw_text}

    parser = SyntaxParserWrapper(port=args.zpar_port,
                                 hostname=args.zpar_hostname)
    trees, _ = parser.parse_document(doc_dict)
    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]
    token_tree_positions = [[x.treeposition() for x in preterminals_sentence]
                            for preterminals_sentence in preterminals]
    pos_tags = [[x.label() for x in preterminals_sentence]
                for preterminals_sentence in preterminals]
    doc_dict["tokens"] = tokens_doc
    # BUG FIX: Tree.pprint() prints to stdout and returns None in NLTK 3,
    # so the previous code stored a list of Nones here.  Tree.pformat()
    # returns the formatted tree string, which is what downstream code
    # expects (and what the sibling variant of this script already uses).
    doc_dict["syntax_trees"] = [t.pformat(margin=TREE_PRINT_MARGIN)
                                for t in trees]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    segmenter = Segmenter(args.model_path)
    # segment_document() mutates doc_dict, adding 'edu_start_indices'.
    segmenter.segment_document(doc_dict)
    edu_token_lists = extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc)
    for edu_tokens in edu_token_lists:
        print(' '.join(edu_tokens))
def main():
    """Command-line entry point: run the full RST parsing pipeline.

    For each input document: segment it into EDUs, parse the EDUs into
    RST trees, and print one JSON object per document to stdout with the
    EDU tokens and the scored trees.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?',
                        type=int, default=1)
    parser.add_argument('-n', '--n_best',
                        help='Number of parses to return',
                        type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        # Normalized: the old literal embedded a backslash
                        # continuation's raw indentation in the help text;
                        # argparse collapses whitespace, so display is same.
                        help='Maximum number of states to retain for '
                             'best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert the verbose flag count to an actual logging level (capped
    # at DEBUG for two or more -v flags).
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port,
        hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts,
                    max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'
                      .format(input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict,
                                                       syntax_parser,
                                                       segmenter, parser)
        # BUG FIX: Tree.pprint() prints to stdout and returns None in
        # NLTK 3, which put nulls in the JSON output; Tree.pformat()
        # returns the formatted tree string (consistent with the other
        # variant of this entry point in this file).
        print(json.dumps(
            {"edu_tokens": edu_tokens,
             "scored_rst_trees":
                 [{"score": tree["score"],
                   "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                  for tree in complete_trees]}))
def main():
    """Command-line entry point: run the full RST parsing pipeline.

    For each input document: segment it into EDUs, parse the EDUs into
    RST trees, print one JSON object per document to stdout with the EDU
    tokens and the scored trees, and additionally write each scored
    ParentedTree's repr to ``<input_path>_<i>.parentedtree``.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths', nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g', '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p', '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a', '--max_acts',
                        help='Maximum number of actions for...?',
                        type=int, default=1)
    parser.add_argument('-n', '--n_best',
                        help='Number of parses to return',
                        type=int, default=1)
    parser.add_argument('-s', '--max_states',
                        # Normalized: the old literal embedded a backslash
                        # continuation's raw indentation in the help text;
                        # argparse collapses whitespace, so display is same.
                        help='Maximum number of states to retain for '
                             'best-first search',
                        type=int, default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v', '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0, action='count')
    args = parser.parse_args()

    # Convert the verbose flag count to an actual logging level (capped
    # at DEBUG for two or more -v flags).
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]

    # Make warnings from the built-in warnings module get formatted nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'),
                        level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port,
        hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)
    parser = Parser(max_acts=args.max_acts,
                    max_states=args.max_states,
                    n_best=args.n_best)
    parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug('rst_parse input. doc_id = {}, text = {}'.format(
            input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict,
                                                       syntax_parser,
                                                       segmenter, parser)
        # Materialize once: segment_and_parse may return a generator, and
        # we iterate the trees twice below.
        complete_trees = list(complete_trees)

        print(json.dumps(
            {"edu_tokens": edu_tokens,
             "scored_rst_trees":
                 [{"score": tree["score"],
                   "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
                  for tree in complete_trees]}))

        # Also write each scored tree's repr (a ParentedTree) to its own
        # file next to the input, numbered from 1 by score rank.
        for i, tree in enumerate(complete_trees, 1):
            ptree_str = repr(tree['tree']) + '\n'
            with codecs.open('{}_{}.parentedtree'.format(input_path, i),
                             'w', 'utf-8') as ptree_file:
                ptree_file.write(ptree_str)
        # NOTE(review): this is the tail of what appears to be
        # ParagraphSplitter.find_paragraphs (its header is outside this
        # chunk) — `res` holds the candidate paragraph strings, `text` the
        # raw document, `doc_id` an identifier used for logging; confirm
        # against the full file.
        # Fallback: if no paragraph breaks were found but the long text has
        # a period followed by a newline, the document probably uses single
        # newlines as breaks, so split on runs of newlines instead.
        if len(res) == 1 and len(text) > 500 and re.search(r'\.\s*\n', text):
            logging.info('The text was over 500 characters, no indentation' +
                         ' or blank lines were found, and there is a period' +
                         ' followed by a newline. Falling back to splitting' +
                         ' by newlines. doc_id = {}'.format(doc_id))
            res = re.split(r'\n+', text)

        # Replace multiple spaces/newlines within a paragraph with one space.
        res = [re.sub(r'\s+', ' ', x) for x in res]

        # Make sure the number of non-whitespace characters is unchanged,
        # i.e., splitting and whitespace normalization lost no content.
        assert len(re.sub(r'\s', '', text)) \
            == len(''.join([re.sub(r'\s', '', x) for x in res]))

        logging.info('Number of paragraphs found: {}'.format(len(res)))
        return res


if __name__ == '__main__':
    # Small manual test harness: print the detected paragraphs of one file,
    # one paragraph per line.
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_text',
                        help='raw text to split into paragraphs')
    args = parser.parse_args()
    doc = read_text_file(args.input_text)
    paragraphs = ParagraphSplitter.find_paragraphs(doc, args.input_text)
    for paragraph in paragraphs:
        print(paragraph)