Пример #1
0
def main():
    """Segment a single text document into EDUs and print them.

    Reads the command-line arguments (segmentation model path, input
    document path, and the optional port/hostname handed to
    ``SyntaxParserWrapper``), syntactically parses the document, runs the
    segmenter on it, and prints the tokens of each EDU on one line,
    separated by single spaces.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        'model_path',
        help='crf++ model file created by tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    # The input path doubles as the document identifier.
    doc_dict = {"doc_id": args.input_path,
                "raw_text": read_text_file(args.input_path)}

    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)

    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]

    # Per sentence: the tree position and the label (POS tag) of each
    # preterminal node.
    token_tree_positions = []
    pos_tags = []
    for sentence_preterminals in preterminals:
        token_tree_positions.append(
            [node.treeposition() for node in sentence_preterminals])
        pos_tags.append([node.label() for node in sentence_preterminals])

    doc_dict["tokens"] = tokens_doc
    doc_dict["syntax_trees"] = [tree.pformat(margin=TREE_PRINT_MARGIN)
                                for tree in trees]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    # 'edu_start_indices' is read from doc_dict right after this call, so
    # segment_document() is expected to store it there.
    Segmenter(args.model_path).segment_document(doc_dict)

    for edu_tokens in extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc):
        print(' '.join(edu_tokens))
def main():
    """Segment a single text document into EDUs and print them.

    Command-line arguments: the path to the segmentation model, the path to
    the document text file, and the optional port and hostname passed to
    ``SyntaxParserWrapper``.  The document is syntactically parsed and
    segmented, and the tokens of every EDU are printed on one line each,
    joined by single spaces.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument('model_path',
                            help='crf++ model file created by ' +
                            'tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    raw_text = read_text_file(args.input_path)
    # The input path doubles as the document identifier.
    doc_dict = {"doc_id": args.input_path, "raw_text": raw_text}

    # Use a separate name for the syntax parser instead of rebinding the
    # argparse parser.
    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)
    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]
    token_tree_positions = [[x.treeposition() for x in preterminals_sentence]
                            for preterminals_sentence in preterminals]
    pos_tags = [[x.label() for x in preterminals_sentence]
                for preterminals_sentence in preterminals]

    doc_dict["tokens"] = tokens_doc
    # pformat() returns the formatted string.  Tree.pprint() in NLTK 3 prints
    # to stdout and returns None, which would store a list of None here.
    # NOTE(review): assumes the trees are NLTK 3 Tree objects, as suggested
    # by the label()/treeposition() calls above -- confirm.
    doc_dict["syntax_trees"] = [
        t.pformat(margin=TREE_PRINT_MARGIN) for t in trees
    ]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    segmenter = Segmenter(args.model_path)
    # 'edu_start_indices' is read from doc_dict below, so segment_document()
    # is expected to store it there.
    segmenter.segment_document(doc_dict)

    edu_token_lists = extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc)
    for edu_tokens in edu_token_lists:
        print(' '.join(edu_tokens))
Пример #3
0
def batch_process(docs, output_path, zpar_model_directory,
                  segmentation_model, parsing_model):
    """Segment and RST-parse every document, writing one JSON line each.

    ``docs`` is a list or tuple of (doc_id, text) tuples.  For each document
    a JSON object with the keys "doc_id", "edu_tokens" and
    "scored_rst_trees" is written to ``output_path`` on its own line.  The
    RST parser is created with max_acts=1, max_states=1 and n_best=1.
    """
    syntax_parser = SyntaxParserWrapper(zpar_model_directory)
    segmenter = Segmenter(segmentation_model)

    rst_parser = Parser(max_acts=1, max_states=1, n_best=1)
    rst_parser.load_model(parsing_model)

    with open(output_path, 'w') as outfile:
        for doc_id, text in docs:
            logging.info('doc_id: {}'.format(doc_id))
            edu_tokens, complete_trees = segment_and_parse(
                {"doc_id": doc_id, "raw_text": text},
                syntax_parser, segmenter, rst_parser)

            # Turn every scored tree into a JSON-serializable dict.
            scored_trees = []
            for scored in complete_trees:
                tree_str = scored["tree"].pformat(margin=TREE_PRINT_MARGIN)
                scored_trees.append({"score": scored["score"],
                                     "tree": tree_str})

            record = {"doc_id": doc_id,
                      "edu_tokens": edu_tokens,
                      "scored_rst_trees": scored_trees}
            print(json.dumps(record), file=outfile)
Пример #4
0
def main():
    """Segment and RST-parse each document given on the command line.

    For every input path this prints one JSON object to stdout (keys
    "edu_tokens" and "scored_rst_trees") and writes the ``repr`` of each
    returned tree to ``<input_path>_<i>.parentedtree``, where ``i`` counts
    the trees starting at 1.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('input_paths',
                        nargs='+',
                        help='A document to segment and parse.' +
                        ' Paragraphs should be separated by two or more' +
                        ' newline characters.')
    parser.add_argument('-g',
                        '--segmentation_model',
                        help='Path to segmentation model.',
                        required=True)
    parser.add_argument('-p',
                        '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    parser.add_argument('-a',
                        '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state',
                        type=int,
                        default=1)
    parser.add_argument('-n',
                        '--n_best',
                        help='Number of parses to return',
                        type=int,
                        default=1)
    # Concatenate instead of using a backslash inside the literal, which
    # would embed the source indentation in the help string.
    parser.add_argument('-s',
                        '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search',
                        type=int,
                        default=1)
    parser.add_argument('-zp', '--zpar_port', type=int)
    parser.add_argument('-zh', '--zpar_hostname', default=None)
    parser.add_argument('-zm', '--zpar_model_directory', default=None)
    parser.add_argument('-v',
                        '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0,
                        action='count')
    args = parser.parse_args()

    # Convert the verbose count to a logging level; anything beyond two
    # occurrences of the flag is clamped to DEBUG.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port, hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=args.n_best)
    rst_parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: %s', input_path)
        doc = read_text_file(input_path)

        # Lazy %-style arguments avoid formatting the whole document text
        # when debug logging is disabled.
        logging.debug('rst_parse input. doc_id = %s, text = %s',
                      input_path, doc)
        doc_dict = {"raw_text": doc, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, rst_parser)

        # Materialize the result: it is iterated twice below and a generator
        # can only be consumed once.
        complete_trees = list(complete_trees)

        scored_rst_trees = [
            {"score": tree["score"],
             "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
            for tree in complete_trees
        ]
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees": scored_rst_trees}))

        # Write each tree's repr to its own file next to the input.
        for i, tree in enumerate(complete_trees, 1):
            ptree_str = repr(tree['tree']) + '\n'
            with codecs.open(input_path + '_{}.parentedtree'.format(i),
                             'w', 'utf-8') as ptree_file:
                ptree_file.write(ptree_str)
Пример #5
0
def main():
    """Evaluate an RST parsing model on a dev or test set from the command line.

    Loads the JSON evaluation set, optionally a segmentation model, and the
    RST parsing model, calls ``predict_and_evaluate_rst_trees`` and prints
    the sorted result items as JSON to stdout.

    Exits with a usage error (status 2) when neither ``--use_gold_syntax``
    nor ``--segmentation_model`` is given.
    """
    import argparse
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('evaluation_set',
                        help='The dev or test set JSON file',
                        type=argparse.FileType('r'))
    parser.add_argument('-g',
                        '--segmentation_model',
                        help='Path to segmentation model.  If not specified,' +
                        ' then gold EDUs will be used.',
                        default=None)
    parser.add_argument('-p',
                        '--parsing_model',
                        help='Path to RST parsing model.',
                        required=True)
    # NOTE: --zpar_directory is accepted but not read anywhere in this
    # function (see the TODO where the syntax parser is created).
    parser.add_argument('-z', '--zpar_directory', default='zpar')
    parser.add_argument('-t',
                        '--use_gold_syntax',
                        help='If specified, then gold PTB syntax trees will' +
                        ' be used.',
                        action='store_true')
    parser.add_argument('-a',
                        '--max_acts',
                        help='Maximum number of actions to perform on each ' +
                        'state',
                        type=int,
                        default=1)
    parser.add_argument('-s',
                        '--max_states',
                        help='Maximum number of states to retain for ' +
                        'best-first search',
                        type=int,
                        default=1)
    parser.add_argument('-v',
                        '--verbose',
                        help='Print more status information. For every ' +
                        'additional time this flag is specified, ' +
                        'output gets more verbose.',
                        default=0,
                        action='count')
    args = parser.parse_args()
    # Validate with parser.error() rather than assert: asserts are stripped
    # under -O and would show a traceback instead of a usage message.
    if not (args.use_gold_syntax or args.segmentation_model):
        parser.error('at least one of -t/--use_gold_syntax or '
                     '-g/--segmentation_model must be specified')

    # Convert the verbose count to a logging level; anything beyond two
    # occurrences of the flag is clamped to DEBUG.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'),
                        level=log_level)
    logger = logging.getLogger(__name__)

    # read the models
    logger.info('Loading models')

    # TODO add port, host, model args
    # No syntax parser is created when gold syntax trees are used, and no
    # segmenter when no segmentation model was given.
    syntax_parser = None if args.use_gold_syntax else SyntaxParserWrapper()
    segmenter = (Segmenter(args.segmentation_model)
                 if args.segmentation_model else None)

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=1)
    rst_parser.load_model(args.parsing_model)

    eval_data = json.load(args.evaluation_set)

    results = predict_and_evaluate_rst_trees(
        syntax_parser, segmenter, rst_parser, eval_data,
        use_gold_syntax=args.use_gold_syntax)
    print(json.dumps(sorted(results.items())))