예제 #1
0
def main():
    """Segment one document into EDUs and print each EDU on its own line.

    Command-line arguments:
        model_path: crf++ model file created by tune_segmentation_model.py.
        input_path: document text file; also used as the document id.
        -zp/--zpar_port, -zh/--zpar_hostname: forwarded to
            SyntaxParserWrapper.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        'model_path',
        help='crf++ model file created by tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    doc_dict = {
        "doc_id": args.input_path,
        "raw_text": read_text_file(args.input_path),
    }

    # Syntactic parse of the whole document; the second return value is
    # not needed here.
    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)

    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]

    # Per sentence: the tree position and the label (POS tag) of every
    # preterminal node.
    token_tree_positions = []
    pos_tags = []
    for sentence_preterminals in preterminals:
        token_tree_positions.append(
            [node.treeposition() for node in sentence_preterminals])
        pos_tags.append([node.label() for node in sentence_preterminals])

    doc_dict.update({
        "tokens": tokens_doc,
        "syntax_trees": [
            tree.pformat(margin=TREE_PRINT_MARGIN) for tree in trees
        ],
        "token_tree_positions": token_tree_positions,
        "pos_tags": pos_tags,
    })

    # 'edu_start_indices' is read right after this call, so the segmenter
    # is presumably what adds it to doc_dict -- confirm against Segmenter.
    Segmenter(args.model_path).segment_document(doc_dict)

    for edu_tokens in extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc):
        print(' '.join(edu_tokens))
def main():
    """Segment one document into EDUs and print each EDU on its own line.

    Command-line arguments:
        model_path: crf++ model file created by tune_segmentation_model.py.
        input_path: document text file; also used as the document id.
        -zp/--zpar_port, -zh/--zpar_hostname: forwarded to
            SyntaxParserWrapper.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    arg_parser.add_argument('model_path',
                            help='crf++ model file created by ' +
                            'tune_segmentation_model.py.')
    arg_parser.add_argument('input_path', help='document text file')
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    args = arg_parser.parse_args()

    raw_text = read_text_file(args.input_path)
    doc_dict = {"doc_id": args.input_path, "raw_text": raw_text}

    # Syntactic parse of the whole document; the second return value is
    # not needed here.
    syntax_parser = SyntaxParserWrapper(port=args.zpar_port,
                                        hostname=args.zpar_hostname)
    trees, _ = syntax_parser.parse_document(doc_dict)
    tokens_doc = [extract_converted_terminals(tree) for tree in trees]
    preterminals = [extract_preterminals(tree) for tree in trees]

    # Per sentence: the tree position and the label (POS tag) of every
    # preterminal node.
    token_tree_positions = [[x.treeposition() for x in preterminals_sentence]
                            for preterminals_sentence in preterminals]
    pos_tags = [[x.label() for x in preterminals_sentence]
                for preterminals_sentence in preterminals]

    doc_dict["tokens"] = tokens_doc
    # Use pformat(), which returns the bracketed string.  With the NLTK 3
    # Tree API used here (label()), pprint() only accepts keyword arguments,
    # writes to a stream and returns None, so it cannot supply these values.
    doc_dict["syntax_trees"] = [
        t.pformat(margin=TREE_PRINT_MARGIN) for t in trees
    ]
    doc_dict["token_tree_positions"] = token_tree_positions
    doc_dict["pos_tags"] = pos_tags

    # 'edu_start_indices' is read right after this call, so the segmenter
    # is presumably what adds it to doc_dict -- confirm against Segmenter.
    segmenter = Segmenter(args.model_path)
    segmenter.segment_document(doc_dict)

    edu_token_lists = extract_edus_tokens(doc_dict['edu_start_indices'],
                                          tokens_doc)
    for edu_tokens in edu_token_lists:
        print(' '.join(edu_tokens))
예제 #3
0
def main():
    """Segment and RST-parse each input document, printing one JSON line each.

    For every path in ``input_paths`` the document is read, passed through
    ``segment_and_parse`` and reported on stdout as a JSON object with the
    keys ``edu_tokens`` and ``scored_rst_trees`` (a list of
    ``{"score": ..., "tree": ...}`` entries, the tree rendered as a string).

    Command-line arguments:
        input_paths: one or more documents to segment and parse.
        -g/--segmentation_model: path to the segmentation model (required).
        -p/--parsing_model: path to the RST parsing model (required).
        -a/--max_acts, -n/--n_best, -s/--max_states: integer settings
            forwarded to ``Parser`` (default 1 each).
        -zp/--zpar_port, -zh/--zpar_hostname, -zm/--zpar_model_directory:
            forwarded to ``SyntaxParserWrapper``.
        -v/--verbose: repeatable; raises the log level from WARNING to INFO
            and then DEBUG.
    """
    import argparse
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('input_paths',
                            nargs='+',
                            help='A document to segment and parse.' +
                            ' Paragraphs should be separated by two or more' +
                            ' newline characters.')
    arg_parser.add_argument('-g', '--segmentation_model',
                            help='Path to segmentation model.',
                            required=True)
    arg_parser.add_argument('-p', '--parsing_model',
                            help='Path to RST parsing model.',
                            required=True)
    arg_parser.add_argument('-a', '--max_acts',
                            help='Maximum number of actions for...?',
                            type=int, default=1)
    arg_parser.add_argument('-n', '--n_best',
                            help='Number of parses to return',
                            type=int, default=1)
    # NOTE: the backslash continuation keeps the next line's leading spaces
    # inside the help text; the literal is left exactly as it was.
    arg_parser.add_argument('-s', '--max_states',
                        help='Maximum number of states to retain for \
                              best-first search',
                            type=int, default=1)
    arg_parser.add_argument('-zp', '--zpar_port', type=int)
    arg_parser.add_argument('-zh', '--zpar_hostname', default=None)
    arg_parser.add_argument('-zm', '--zpar_model_directory', default=None)
    arg_parser.add_argument('-v', '--verbose',
                            help='Print more status information. For every ' +
                            'additional time this flag is specified, ' +
                            'output gets more verbose.',
                            default=0, action='count')
    args = arg_parser.parse_args()

    # Convert verbose flag to actually logging level.
    log_levels = [logging.WARNING, logging.INFO, logging.DEBUG]
    log_level = log_levels[min(args.verbose, 2)]
    # Make warnings from built-in warnings module get formatted more nicely.
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=log_level)

    # Read the models.
    logging.info('Loading models')
    syntax_parser = \
        SyntaxParserWrapper(port=args.zpar_port, hostname=args.zpar_hostname,
                            zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=args.n_best)
    rst_parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)

        logging.debug('rst_parse input. doc_id = {}, text = {}'
                      .format(input_path, doc))
        doc_dict = {"raw_text": doc, "doc_id": input_path}

        edu_tokens, complete_trees = segment_and_parse(doc_dict, syntax_parser,
                                                       segmenter, rst_parser)

        # pformat() returns the tree as a string.  pprint() in NLTK 3 prints
        # the tree to a stream and returns None, which would pollute stdout
        # and serialise every "tree" value as null.
        scored_rst_trees = [
            {"score": tree["score"],
             "tree": tree["tree"].pformat(margin=TREE_PRINT_MARGIN)}
            for tree in complete_trees
        ]
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees": scored_rst_trees}))
예제 #4
0
def main():
    """Segment and RST-parse each input document and report the results.

    For every path in ``input_paths`` this prints one JSON object to stdout
    (keys ``edu_tokens`` and ``scored_rst_trees``) and writes the ``repr`` of
    each returned tree to ``<input_path>_<i>.parentedtree`` (``i`` counted
    from 1, UTF-8 encoded).

    Command-line arguments:
        input_paths: one or more documents to segment and parse.
        -g/--segmentation_model: path to the segmentation model (required).
        -p/--parsing_model: path to the RST parsing model (required).
        -a/--max_acts, -n/--n_best, -s/--max_states: integer settings
            forwarded to ``Parser`` (default 1 each).
        -zp/--zpar_port, -zh/--zpar_hostname, -zm/--zpar_model_directory:
            forwarded to ``SyntaxParserWrapper``.
        -v/--verbose: repeatable; raises the log level from WARNING to INFO
            and then DEBUG.
    """
    import argparse

    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        'input_paths', nargs='+',
        help='A document to segment and parse.'
             ' Paragraphs should be separated by two or more'
             ' newline characters.')
    cli.add_argument('-g', '--segmentation_model', required=True,
                     help='Path to segmentation model.')
    cli.add_argument('-p', '--parsing_model', required=True,
                     help='Path to RST parsing model.')
    cli.add_argument('-a', '--max_acts', type=int, default=1,
                     help='Maximum number of actions for...?')
    cli.add_argument('-n', '--n_best', type=int, default=1,
                     help='Number of parses to return')
    # NOTE: the backslash continuation keeps the next line's leading spaces
    # inside the help text; the literal is reproduced exactly.
    cli.add_argument('-s', '--max_states', type=int, default=1,
                     help='Maximum number of states to retain for \
                              best-first search')
    cli.add_argument('-zp', '--zpar_port', type=int)
    cli.add_argument('-zh', '--zpar_hostname', default=None)
    cli.add_argument('-zm', '--zpar_model_directory', default=None)
    cli.add_argument(
        '-v', '--verbose', action='count', default=0,
        help='Print more status information. For every '
             'additional time this flag is specified, '
             'output gets more verbose.')
    args = cli.parse_args()

    # Map the -v count onto a logging level, capping at DEBUG.
    verbosity_levels = (logging.WARNING, logging.INFO, logging.DEBUG)
    chosen_level = verbosity_levels[min(args.verbose, 2)]
    # Route messages from the warnings module through logging as well.
    logging.captureWarnings(True)
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=chosen_level)

    # Load the models.
    logging.info('Loading models')
    syntax_parser = SyntaxParserWrapper(
        port=args.zpar_port,
        hostname=args.zpar_hostname,
        zpar_model_directory=args.zpar_model_directory)
    segmenter = Segmenter(args.segmentation_model)

    rst_parser = Parser(max_acts=args.max_acts,
                        max_states=args.max_states,
                        n_best=args.n_best)
    rst_parser.load_model(args.parsing_model)

    for input_path in args.input_paths:
        logging.info('rst_parse input file: {}'.format(input_path))
        doc = read_text_file(input_path)
        logging.debug(
            'rst_parse input. doc_id = {}, text = {}'.format(input_path, doc))

        doc_dict = {"raw_text": doc, "doc_id": input_path}
        edu_tokens, complete_trees = segment_and_parse(
            doc_dict, syntax_parser, segmenter, rst_parser)

        # Materialise the result: it is iterated twice below, which a
        # generator would not survive.
        complete_trees = list(complete_trees)

        scored_rst_trees = [
            {"score": scored["score"],
             "tree": scored["tree"].pformat(margin=TREE_PRINT_MARGIN)}
            for scored in complete_trees
        ]
        print(json.dumps({"edu_tokens": edu_tokens,
                          "scored_rst_trees": scored_rst_trees}))

        # One output file per returned tree, numbered from 1.
        for rank, scored in enumerate(complete_trees, 1):
            out_path = '{}_{}.parentedtree'.format(input_path, rank)
            with codecs.open(out_path, 'w', 'utf-8') as ptree_file:
                ptree_file.write(repr(scored['tree']) + '\n')
        if len(res) == 1 and len(text) > 500 and re.search(r'\.\s*\n', text):
            logging.info('The text was over 500 characters, no indentation' +
                         ' or blank lines were found, and there is a period' +
                         ' followed by a newline. Falling back to splitting' +
                         ' by newlines. doc_id = {}'.format(doc_id))
            res = re.split(r'\n+', text)

        # Replace multiple spaces/newlines within a paragraph with one space.
        res = [re.sub(r'\s+', ' ', x) for x in res]

        # Make sure the number of non-whitespace characters is unchanged.
        assert len(re.sub(r'\s', '', text)) \
            == len(''.join([re.sub(r'\s', '', x) for x in res]))

        logging.info('Number of paragraphs found: {}'.format(len(res)))

        return res


if __name__ == '__main__':
    # Script entry point: read one text file, split it into paragraphs and
    # print every paragraph with its own print() call.
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        'input_text',
        help='raw text to split into paragraphs',
    )
    args = parser.parse_args()

    doc = read_text_file(args.input_text)
    # The input path is passed as the second argument as well -- presumably
    # the document id used in the splitter's log messages; confirm against
    # ParagraphSplitter.find_paragraphs.
    paragraphs = ParagraphSplitter.find_paragraphs(doc, args.input_text)
    for paragraph in paragraphs:
        print(paragraph)