Пример #1
0
def save_grobid_training_tei_structured_document(
        filename, grobid_training_tei_structured_document):
    """Serialize the document's root element to XML and save it.

    Args:
        filename: Destination passed to ``save_file_content``.
        grobid_training_tei_structured_document: Object whose ``root``
            attribute is serialized with ``etree.tostring``.

    Raises:
        RuntimeError: If serializing the root fails for any reason; the
            original exception is chained as the cause.
    """
    try:
        xml_bytes = etree.tostring(
            grobid_training_tei_structured_document.root
        )
    except Exception as exc:
        message = 'failed to convert to xml for %s due to %s' % (
            filename, exc
        )
        raise RuntimeError(message) from exc
    else:
        # Errors while saving are deliberately not wrapped.
        save_file_content(filename, xml_bytes)
Пример #2
0
def main(argv=None):
    """Extract XML from a structured document and write it to a file.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` by default.
    """
    args = parse_args(argv)
    if args.debug:
        logging.getLogger().setLevel('DEBUG')

    xml_root = extract_structured_document_to_xml(
        load_structured_document(args.lxml_path),
        tag_scope=args.tag_scope
    )

    output_path = args.output_path
    get_logger().info('writing result to: %s', output_path)
    pretty_xml = etree.tostring(xml_root, pretty_print=True)
    save_file_content(output_path, pretty_xml)
Пример #3
0
def fix_jats_xml_file(input_file: str, output_file: str, log_file_enabled: bool = True):
    """Parse a JATS XML file, apply the node fixes and save the result.

    The file is first parsed with ``fix_xml=False``; if that raises a
    ``ValueError`` it is parsed again with ``fix_xml=True`` and the fact is
    passed on to ``add_fix_xml_meta_data``.

    Args:
        input_file: Source location, made available locally through
            ``auto_download_input_file``.
        output_file: Destination passed to ``save_file_content``.
        log_file_enabled: Log the processed file at info level when true,
            at debug level otherwise.
    """
    log_processing = LOGGER.info if log_file_enabled else LOGGER.debug
    log_processing('processing: %r -> %r', input_file, output_file)

    with auto_download_input_file(input_file) as local_input_file:
        try:
            tree = parse_xml(local_input_file, filename=input_file, fix_xml=False)
        except ValueError:
            needed_xml_fix = True
            tree = parse_xml(local_input_file, filename=input_file, fix_xml=True)
        else:
            needed_xml_fix = False

    root = tree.getroot()
    # Keep an untouched copy so the meta data step can see both versions.
    pristine_root = clone_node(root)
    fix_jats_xml_node(root)
    add_fix_xml_meta_data(root, pristine_root, fixed_malformatted_xml=needed_xml_fix)

    save_file_content(
        output_file,
        etree.tostring(tree, xml_declaration=True, encoding=tree.docinfo.encoding)
    )
Пример #4
0
def process_file(file_url: str,
                 simple_runner: SimplePipelineRunner,
                 get_output_file_for_source_url: Callable[[str], str],
                 session: requests.Session,
                 request_args: MultiDict = None):
    """Convert a single file with the runner and save the converted content.

    Args:
        file_url: Location of the source file to read.
        simple_runner: Runner whose ``convert`` method performs the conversion.
        get_output_file_for_source_url: Maps the source url to the output url.
        session: Requests session placed into the conversion context.
        request_args: Optional request arguments; only added to the context
            when truthy.
    """
    output_file_url = get_output_file_for_source_url(file_url)

    source_content = read_all_from_path(file_url)
    source_size = format_size(len(source_content))
    LOGGER.info('read source content: %s (%s)', file_url, source_size)

    data_type = guess_type(file_url)[0]
    LOGGER.debug('data_type: %s', data_type)
    LOGGER.debug('session: %s', session)

    context = {RequestsPipelineStep.REQUESTS_SESSION_KEY: session}
    if request_args:
        context['request_args'] = request_args

    converted = simple_runner.convert(
        source_content, file_url, data_type, context=context
    )
    LOGGER.debug('result.keys: %s', converted.keys())

    output_content = encode_if_text_type(converted[DataProps.CONTENT])
    save_file_content(output_file_url, output_content)
    output_size = format_size(len(output_content))
    LOGGER.info('saved output to: %s (%s)', output_file_url, output_size)
Пример #5
0
def configure_pipeline(p, opt, pipeline, config):
    """Wire up the Beam pipeline: read files, apply the steps, write output.

    Args:
        p: Pipeline (or PCollection root) the transforms are applied to.
        opt: Parsed options; reads ``base_data_path``, ``output_path``,
            ``output_suffix`` and ``resume``.
        pipeline: Object providing ``get_steps(config, opt)``.
        config: Configuration forwarded to ``pipeline.get_steps``.
    """
    def get_default_output_file_for_source_file(source_url):
        return get_output_file(
            source_url,
            opt.base_data_path,
            opt.output_path,
            opt.output_suffix
        )

    def output_file_not_exists(source_url):
        output_file = get_default_output_file_for_source_file(source_url)
        return not _file_exists(output_file)

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    input_urls = p | FileUrlSource(opt) | PreventFusion()
    if opt.resume:
        # When resuming, skip sources whose output file already exists.
        input_urls |= beam.Filter(output_file_not_exists)

    result = (
        input_urls
        | ReadFileContent()
        | "Determine Type" >> beam.Map(
            lambda d: extend_dict(d, {
                DataProps.TYPE: mimetypes.guess_type(
                    d[DataProps.SOURCE_FILENAME]
                )[0]
            })
        )
    )

    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = result | beam.Map(
        lambda x: LOGGER.info('result: %s (%s)', x.keys(), x[DataProps.TYPE])
    )

    _ = (  # noqa: F841
        result
        | "WriteOutput" >> TransformAndLog(
            beam.Map(
                lambda v: save_file_content(
                    get_default_output_file_for_source_file(
                        v[DataProps.SOURCE_FILENAME]
                    ),
                    encode_if_text_type(v[DataProps.CONTENT])
                )
            ),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)
        )
    )
Пример #6
0
def configure_pipeline(p, opt):
    """Build the pipeline that produces extracted XML and writes it out.

    Args:
        p: Pipeline the transforms are applied to.
        opt: Parsed options; selects the source of the extracted XML and
            provides ``base_data_path``, ``output_path`` and
            ``output_suffix`` for the output location.
    """
    def get_pipeline_output_file(source_url, ext):
        return get_output_file(
            source_url, opt.base_data_path, opt.output_path, ext
        )

    # Grobid reads the PDFs directly only when no other model or
    # lxml file list has been requested.
    grobid_only = (
        opt.use_grobid
        and not opt.crf_model
        and not opt.cv_model_export_dir
        and not opt.lxml_file_list
    )
    if grobid_only:
        extracted_xml = add_read_pdfs_to_grobid_xml_pipeline_steps(p, opt)
    else:
        extracted_xml = add_read_source_to_extracted_xml_pipeline_steps(
            p, opt, get_pipeline_output_file
        )

    _ = (  # flake8: noqa
        extracted_xml
        | "WriteXml" >> TransformAndLog(
            beam.Map(
                lambda v: save_file_content(
                    get_pipeline_output_file(
                        v[DataProps.SOURCE_FILENAME], opt.output_suffix
                    ),
                    v[DataProps.EXTRACTED_XML]
                )
            ),
            log_fn=lambda x: get_logger().info('saved xml to: %s', x)
        )
    )
Пример #7
0
def configure_pipeline(p, opt, pipeline, config):
    """Wire up the Beam pipeline for the remaining files of a file list.

    Nothing is added to the pipeline when there are no files left to process.

    Args:
        p: Pipeline the transforms are applied to.
        opt: Parsed options, used to resolve the file list and output files.
        pipeline: Object providing ``get_steps(config, opt)``.
        config: Configuration forwarded to ``pipeline.get_steps``.
    """
    output_file_for_source_file = get_output_file_for_source_file_fn(opt)
    remaining_files = get_remaining_file_list_for_args(opt)
    LOGGER.debug('file_list: %s', remaining_files)

    if not remaining_files:
        LOGGER.info('no files to process')
        return

    steps = pipeline.get_steps(config, opt)
    LOGGER.info('steps: %s', steps)

    result = (
        p
        | beam.Create(remaining_files)
        | PreventFusion()
        | ReadFileContent()
        | "Determine Type" >> beam.Map(
            lambda d: extend_dict(d, {
                DataProps.TYPE: mimetypes.guess_type(
                    d[DataProps.SOURCE_FILENAME]
                )[0]
            })
        )
    )

    for step in steps:
        LOGGER.debug('step: %s', step)
        result |= get_step_transform(step)

    _ = (  # noqa: F841
        result
        | "WriteOutput" >> TransformAndLog(
            beam.Map(
                lambda v: save_file_content(
                    output_file_for_source_file(v[DataProps.SOURCE_FILENAME]),
                    encode_if_text_type(v[DataProps.CONTENT])
                )
            ),
            log_fn=lambda x: get_logger().info('saved output to: %s', x)
        )
    )
def save_lxml_structured_document(filename, lxml_structured_document):
    """Serialize the document's root element to XML and save it to ``filename``."""
    xml_bytes = etree.tostring(lxml_structured_document.root)
    save_file_content(filename, xml_bytes)
Пример #9
0
def save_model(output_filename, model_bytes):
    """Log the destination, then write the model bytes to it.

    Args:
        output_filename: Destination passed to ``save_file_content``.
        model_bytes: Content to save.
    """
    LOGGER.info(
        'saving model to %s',
        output_filename
    )
    save_file_content(
        output_filename,
        model_bytes
    )
def save_model(output_filename, model_bytes):
    """Log the destination, then write the model bytes to it.

    Args:
        output_filename: Destination passed to ``save_file_content``.
        model_bytes: Content to save.
    """
    logger = get_logger()
    logger.info('saving model to %s', output_filename)
    save_file_content(
        output_filename,
        model_bytes
    )
Пример #11
0
def fix_source_file_to(source_url: str,
                       target_url: str,
                       encoding: str = 'utf-8'):
    """Read the source, pass it through ``get_fixed_xml_bytes`` and save it.

    Args:
        source_url: Location the XML bytes are read from.
        target_url: Location the fixed bytes are written to.
        encoding: Encoding forwarded to ``get_fixed_xml_bytes``.
    """
    save_file_content(
        target_url,
        get_fixed_xml_bytes(
            read_all_from_path(source_url),
            encoding=encoding
        )
    )
def fix_source_file_to(source_url: str, target_url: str):
    """Copy the source to the target, dropping orphaned ``</content>`` tags.

    The closing tags are only removed when the data contains ``</content>``
    but no ``<content>`` opening tag; otherwise the bytes are saved unchanged.

    Args:
        source_url: Location the bytes are read from.
        target_url: Location the (possibly modified) bytes are written to.
    """
    content = read_all_from_path(source_url)
    has_opening_tag = b'<content>' in content
    has_closing_tag = b'</content>' in content
    if has_closing_tag and not has_opening_tag:
        content = content.replace(b'</content>', b'')
    save_file_content(target_url, content)
def configure_pipeline(p, opt):
    """Assemble the Beam pipeline that annotates documents and writes outputs.

    The input is either pairs of LXML and XML files (``opt.lxml_path``) or
    pairs of PDF and XML files (``opt.pdf_path`` / ``opt.pdf_xml_file_list``);
    PDFs are converted to LXML first. Every pair is then converted to
    annotated SVG pages. Depending on the options, the pipeline additionally
    saves PNG pages, LXML, SVG pages, block PNG pages, TFRecords and an
    annotation evaluation CSV.

    Args:
        p: Pipeline the transforms are applied to.
        opt: Parsed options controlling inputs, outputs and limits.

    Raises:
        RuntimeError: If none of ``opt.lxml_path``, ``opt.pdf_path`` or
            ``opt.pdf_xml_file_list`` is set.
    """
    # An explicit image size is only used when both width and height are set.
    image_size = ((opt.image_width, opt.image_height)
                  if opt.image_width and opt.image_height else None)
    page_range = opt.pages
    # Page numbers written to the TFRecords start at the first requested page,
    # or at 1 when no page range was given.
    first_page = page_range[0] if page_range else 1
    xml_mapping = parse_xml_mapping(opt.xml_mapping_path)
    # Input option 1: existing LXML files paired with XML files.
    if opt.lxml_path:
        lxml_xml_file_pairs = (
            p | beam.Create(
                [[
                    join_if_relative_path(opt.base_data_path, s)
                    for s in [opt.lxml_path, opt.xml_path]
                ]]) | "FindFilePairs" >> TransformAndLog(
                    beam.FlatMap(lambda patterns: islice(
                        find_file_pairs_grouped_by_parent_directory_or_name(
                            patterns), opt.limit)),
                    log_prefix='file pairs: ',
                    log_level='debug') | PreventFusion()
            | "ReadFileContent" >> beam.Map(
                lambda filenames: {
                    'source_filename': filenames[0],
                    'xml_filename': filenames[1],
                    'lxml_content': read_all_from_path(filenames[0]),
                    'xml_content': read_all_from_path(filenames[1])
                }))
    # Input option 2: PDF files paired with XML files.
    elif opt.pdf_path or opt.pdf_xml_file_list:
        # The pairs come either from a CSV with 'source_url' and 'xml_url'
        # columns or from matching the pdf and xml path patterns.
        if opt.pdf_xml_file_list:
            pdf_xml_url_pairs = (
                p | "ReadFilePairUrls" >> ReadDictCsv(opt.pdf_xml_file_list,
                                                      limit=opt.limit)
                | "TranslateFilePairUrls" >>
                beam.Map(lambda row: (row['source_url'], row['xml_url'])))
        else:
            pdf_xml_url_pairs = (p | beam.Create([[
                join_if_relative_path(opt.base_data_path, s)
                for s in [opt.pdf_path, opt.xml_path]
            ]]) | "FindFilePairs" >> TransformAndLog(
                beam.FlatMap(lambda patterns: islice(
                    find_file_pairs_grouped_by_parent_directory_or_name(
                        patterns), opt.limit)),
                log_prefix='file pairs: ',
                log_level='debug'))
        # Read the content of both files of every pair.
        pdf_xml_file_pairs = (
            pdf_xml_url_pairs | PreventFusion()
            | "ReadFileContent" >> TransformAndCount(
                beam.Map(
                    lambda filenames: {
                        'source_filename': filenames[0],
                        'xml_filename': filenames[1],
                        'pdf_content': read_all_from_path(filenames[0]),
                        'xml_content': read_all_from_path(filenames[1])
                    }), MetricCounters.FILE_PAIR))

        # Add 'lxml_content' converted from the PDF; failures are logged as
        # warnings via log_fn (the message says the item is ignored).
        lxml_xml_file_pairs = (
            pdf_xml_file_pairs | "ConvertPdfToLxml" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'lxml_content':
                            convert_pdf_bytes_to_lxml(v['pdf_content'],
                                                      path=v['source_filename'
                                                             ],
                                                      page_range=page_range)
                        }),
                    # we don't need the pdf_content unless we are writing tf_records
                    None if opt.save_tfrecords else {'pdf_content'}),
                log_fn=lambda e, v: (get_logger().warning(
                    'caught exception (ignoring item): %s, pdf: %s, xml: %s',
                    e,
                    v['source_filename'],
                    v['xml_filename'],
                    exc_info=e)),
                error_count=MetricCounters.CONVERT_PDF_TO_LXML_ERROR))
    else:
        raise RuntimeError('either lxml-path or pdf-path required')

    # NOTE(review): `pdf_xml_file_pairs` is only assigned in the pdf branch
    # above, and the records built in the lxml branch have no 'pdf_content'
    # key. Combining opt.lxml_path with opt.save_png or opt.save_tfrecords
    # therefore looks unsupported - confirm whether that should be rejected
    # up front.
    if opt.save_png or opt.save_tfrecords:
        with_pdf_png_pages = (
            (lxml_xml_file_pairs if opt.save_tfrecords else pdf_xml_file_pairs)
            | "ConvertPdfToPng" >> MapOrLog(
                lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'pdf_png_pages':
                            list(
                                pdf_bytes_to_png_pages(v['pdf_content'],
                                                       dpi=opt.png_dpi,
                                                       image_size=image_size,
                                                       page_range=page_range))
                        }),
                    {'pdf_content'}  # we no longer need the pdf_content
                ),
                error_count=MetricCounters.CONVERT_PDF_TO_PNG_ERROR))

        if opt.save_png:
            _ = (with_pdf_png_pages | "SavePdfToPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.png.zip')),
                    '.png', v['pdf_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

    # Optionally save the LXML content next to the other outputs, using the
    # source path relative to the base data path with a '.lxml.gz' extension.
    if opt.save_lxml:
        _ = (lxml_xml_file_pairs
             | "SaveLxml" >> TransformAndLog(
                 beam.Map(lambda v: save_file_content(
                     FileSystems.join(
                         opt.output_path,
                         change_ext(
                             relative_path(opt.base_data_path, v[
                                 'source_filename']), None, '.lxml.gz')), v[
                                     'lxml_content'])),
                 log_fn=lambda x: get_logger().info('saved lxml: %s', x)))

    # Convert the LXML to annotated SVG pages. When writing TFRecords the
    # input is the collection that also carries 'pdf_png_pages'.
    annotation_results = ((
        with_pdf_png_pages if opt.save_tfrecords else lxml_xml_file_pairs
    ) | "ConvertLxmlToSvgAndAnnotate" >> TransformAndCount(
        MapOrLog(
            lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'svg_pages':
                        list(
                            convert_and_annotate_lxml_content(
                                v['lxml_content'],
                                v['xml_content'],
                                xml_mapping,
                                name=v['source_filename']))
                    }),
                # Won't need the XML anymore
                {'lxml_content', 'xml_content'}),
            log_fn=lambda e, v: (get_logger().warning(
                'caught exception (ignoring item): %s, source: %s, xml: %s',
                e,
                v['source_filename'],
                v['xml_filename'],
                exc_info=e)),
            error_count=MetricCounters.CONVERT_LXML_TO_SVG_ANNOT_ERROR),
        MetricCounters.PAGE,
        lambda v: len(v['svg_pages'])))

    if opt.save_svg:
        _ = (annotation_results | "SaveSvgPages" >> TransformAndLog(
            beam.Map(lambda v: save_svg_roots(
                FileSystems.join(
                    opt.output_path,
                    change_ext(
                        relative_path(opt.base_data_path, v['source_filename']
                                      ), None, '.svg.zip')), v['svg_pages'])),
            log_fn=lambda x: get_logger().info('saved result: %s', x)))

    # Evaluate the annotations per page. The SVG pages are kept only when a
    # minimum annotation percentage is set, because the block PNG step below
    # then reads 'svg_pages' from this collection.
    if opt.annotation_evaluation_csv or opt.min_annotation_percentage:
        annotation_evaluation_results = (
            annotation_results | "EvaluateAnnotations" >> TransformAndLog(
                beam.Map(lambda v: remove_keys_from_dict(
                    extend_dict(
                        v, {
                            'annotation_evaluation':
                            evaluate_document_by_page(
                                SvgStructuredDocument(v['svg_pages']))
                        }), None
                    if opt.min_annotation_percentage else {'svg_pages'})),
                log_fn=lambda x: get_logger().info(
                    'annotation evaluation result: %s: %s', x[
                        'source_filename'], x['annotation_evaluation'])))

    # Render every SVG page to a block PNG using the configured color map.
    if opt.save_block_png or opt.save_tfrecords:
        color_map = parse_color_map_from_file(opt.color_map)
        with_block_png_pages = (
            (annotation_evaluation_results
             if opt.min_annotation_percentage else annotation_results)
            | "GenerateBlockPng" >> beam.Map(lambda v: remove_keys_from_dict(
                extend_dict(
                    v, {
                        'block_png_pages': [
                            svg_page_to_blockified_png_bytes(
                                svg_page, color_map, image_size=image_size)
                            for svg_page in v['svg_pages']
                        ]
                    }), {'svg_pages'})))

        if opt.save_block_png:
            _ = (with_block_png_pages | "SaveBlockPng" >> TransformAndLog(
                beam.Map(lambda v: save_pages(
                    FileSystems.join(
                        opt.output_path,
                        change_ext(
                            relative_path(opt.base_data_path, v[
                                'source_filename']), None, '.block-png.zip')),
                    '.png', v['block_png_pages'])),
                log_fn=lambda x: get_logger().info('saved result: %s', x)))

        if opt.save_tfrecords:
            # With a minimum annotation percentage, the page lists are
            # filtered down to the page indices selected from the evaluation.
            if opt.min_annotation_percentage:
                filtered_pages = (
                    with_block_png_pages | "FilterPages" >> TransformAndCount(
                        beam.Map(lambda v: filter_list_props_by_indices(
                            v,
                            get_page_indices_with_min_annotation_percentage(
                                v['annotation_evaluation'], opt.
                                min_annotation_percentage),
                            {'pdf_png_pages', 'block_png_pages'})),
                        MetricCounters.FILTERED_PAGE,
                        lambda v: len(v['block_png_pages'])))
            else:
                filtered_pages = with_block_png_pages
            # One record per page, pairing the PDF page image with its block
            # image; the page number is first_page plus the list index.
            _ = (filtered_pages | "WriteTFRecords" >> WritePropsToTFRecord(
                FileSystems.join(opt.output_path, 'data'), lambda v: ({
                    'input_uri':
                    v['source_filename'] + '#page%d' % (first_page + i),
                    'input_image':
                    pdf_png_page,
                    'annotation_uri':
                    (v['source_filename'] + '.annot' + '#page%d' %
                     (first_page + i)),
                    'annotation_image':
                    block_png_page,
                    'page_no':
                    first_page + i
                } for i, pdf_png_page, block_png_page in zip(
                    range(len(v['pdf_png_pages'])), v['pdf_png_pages'], v[
                        'block_png_pages']))))

    # Write the flattened evaluation rows to CSV; the configured file name is
    # split so its extension is passed as the file name suffix.
    if opt.annotation_evaluation_csv:
        annotation_evaluation_csv_name, annotation_evaluation_ext = (
            os.path.splitext(opt.annotation_evaluation_csv))
        _ = (  # flake8: noqa
            annotation_evaluation_results | "FlattenAnotationEvaluationResults"
            >> beam.FlatMap(lambda v: to_annotation_evaluation_csv_dict_rows(
                v['annotation_evaluation'],
                document=basename(v['source_filename'])))
            | "WriteAnnotationEvaluationToCsv" >> WriteDictCsv(
                join_if_relative_path(opt.output_path,
                                      annotation_evaluation_csv_name),
                file_name_suffix=annotation_evaluation_ext,
                columns=DEFAULT_EVALUATION_COLUMNS))
Пример #14
0
def save_structured_document(filename, structured_document):
    """Save a structured document as pretty-printed XML.

    Args:
        filename: Destination passed to ``save_file_content``.
        structured_document: Document to save; must be an
            ``LxmlStructuredDocument``.

    Returns:
        The ``filename`` that was passed in.
    """
    # only support saving lxml for now
    assert isinstance(structured_document, LxmlStructuredDocument)
    pretty_xml = etree.tostring(structured_document.root, pretty_print=True)
    save_file_content(filename, pretty_xml)
    return filename