Example #1
def _pdf_pageinfo_concurrent(
    pdf, infile, progbar, max_workers, check_pages, detailed_analysis=False
):
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis) for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    exec_progress_pool(
        use_threads=use_threads,
        max_workers=n_workers,
        tqdm_kwargs=dict(
            total=total, desc="Scanning contents", unit='page', disable=not progbar
        ),
        task_initializer=partial(_pdf_pageinfo_sync_init, infile),
        task=_pdf_pageinfo_sync,
        task_arguments=contexts,
        task_finished=update_pageinfo,
    )
    return pages
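
All of the examples on this page funnel work through exec_progress_pool(), whose implementation is not shown here. As a rough mental model only (a sketch inferred from the keyword arguments used at these call sites, not the library's actual code), it can be thought of as a thin wrapper that fans tasks out to a thread or process pool and reports progress through tqdm:

from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

from tqdm import tqdm


def exec_progress_pool(
    *,
    use_threads,
    max_workers,
    tqdm_kwargs,
    task,
    task_arguments,
    task_initializer=None,
    task_finished=None,
):
    # Pick threads or processes, mirroring the use_threads flag in the examples.
    executor_class = ThreadPoolExecutor if use_threads else ProcessPoolExecutor
    with tqdm(**tqdm_kwargs) as pbar:
        with executor_class(
            max_workers=max_workers, initializer=task_initializer
        ) as executor:
            for result in executor.map(task, task_arguments):
                if task_finished is not None:
                    # Callbacks such as update_pageinfo() above receive the
                    # result and the progress bar.
                    task_finished(result, pbar)
                else:
                    pbar.update()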
Example #2
def exec_concurrent(context: PdfContext):
    """Execute the pipeline concurrently"""

    # Run exec_page_sync on every page context
    options = context.options
    max_workers = min(len(context.pdfinfo), options.jobs)
    if max_workers > 1:
        log.info("Start processing %d pages concurrently", max_workers)

    sidecars: List[Optional[Path]] = [None] * len(context.pdfinfo)
    ocrgraft = OcrGrafter(context)

    def update_page(result: PageResult, pbar):
        try:
            tls.pageno = result.pageno + 1
            sidecars[result.pageno] = result.text
            pbar.update()
            ocrgraft.graft_page(
                pageno=result.pageno,
                image=result.pdf_page_from_image,
                textpdf=result.ocr,
                autorotate_correction=result.orientation_correction,
            )
            pbar.update()
        finally:
            tls.pageno = None

    exec_progress_pool(
        use_threads=options.use_threads,
        max_workers=max_workers,
        tqdm_kwargs=dict(
            total=(2 * len(context.pdfinfo)),
            desc='OCR' if options.tesseract_timeout > 0 else 'Image processing',
            unit='page',
            unit_scale=0.5,
            disable=not options.progress_bar,
        ),
        task_initializer=partial(worker_init, PIL.Image.MAX_IMAGE_PIXELS),
        task=exec_page_sync,
        task_arguments=context.get_page_contexts(),
        task_finished=update_page,
    )

    # Output sidecar text
    if options.sidecar:
        text = merge_sidecars(sidecars, context)
        # Copy text file to destination
        copy_final(text, options.sidecar, context)

    # Merge layers to one single pdf
    pdf = ocrgraft.finalize()

    # PDF/A and metadata
    log.info("Postprocessing...")
    pdf = post_process(pdf, context)

    # Copy PDF file to destination
    copy_final(pdf, options.output_file, context)
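
Example #2 brackets each callback with assignments to tls.pageno. The definition of tls is not part of the example; the assumption below is that it is a module-level threading.local(), so logging filters or formatters running in the same thread can tag messages with the page currently being handled:

import threading

# Assumed definition (not shown in the example): a thread-local slot that the
# update_page() callback fills in so concurrent log output can be attributed
# to the right page.
tls = threading.local()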
Example #3
def _pdf_pageinfo_concurrent(pdf,
                             infile,
                             progbar,
                             max_workers,
                             check_pages,
                             detailed_analysis=False):
    global worker_pdf  # pylint: disable=global-statement
    pages = [None] * len(pdf.pages)

    def update_pageinfo(result, pbar):
        page = result
        if not page:
            raise InputFileError("Could not read a page in the PDF")
        pages[page.pageno] = page
        pbar.update()

    if max_workers is None:
        max_workers = available_cpu_count()

    total = len(pdf.pages)
    contexts = ((n, infile, check_pages, detailed_analysis)
                for n in range(total))

    use_threads = False  # No performance gain if threaded due to GIL
    n_workers = min(1 + len(pages) // 4, max_workers)
    if n_workers == 1:
        # But if we decided on only one worker, there is no point in using
        # a separate process.
        use_threads = True

    try:
        exec_progress_pool(
            use_threads=use_threads,
            max_workers=n_workers,
            tqdm_kwargs=dict(total=total,
                             desc="Scanning contents",
                             unit='page',
                             disable=not progbar),
            task_initializer=partial(_pdf_pageinfo_sync_init, infile,
                                     logging.getLogger('pdfminer').level),
            task=_pdf_pageinfo_sync,
            task_arguments=contexts,
            task_finished=update_pageinfo,
        )
    finally:
        if worker_pdf and use_threads:
            assert n_workers == 1, "Should have only one worker when threaded"
            # This is messy, but if we ran in a thread, close worker_pdf
            worker_pdf.close()
    return pages
Example #4
def _produce_jbig2_images(
    jbig2_groups: Dict[int, List[XrefExt]], root: Path, options
) -> None:
    """Produce JBIG2 images from their groups"""

    def jbig2_group_args(root: Path, groups: Dict[int, List[XrefExt]]):
        for group, xref_exts in groups.items():
            prefix = f'group{group:08d}'
            yield dict(
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix,
            )

    def jbig2_single_args(root, groups: Dict[int, List[XrefExt]]):
        for group, xref_exts in groups.items():
            prefix = f'group{group:08d}'
            # Second loop is to ensure multiple images per page are unpacked
            for n, xref_ext in enumerate(xref_exts):
                xref, ext = xref_ext
                yield dict(
                    cwd=fspath(root),
                    infile=img_name(root, xref, ext),
                    outfile=root / f'{prefix}.{n:04d}',
                )

    def convert_generic(fn, kwargs_dict):
        return fn(**kwargs_dict)

    if options.jbig2_page_group_size > 1:
        jbig2_args = jbig2_group_args
        jbig2_convert = partial(convert_generic, jbig2enc.convert_group)
    else:
        jbig2_args = jbig2_single_args
        jbig2_convert = partial(convert_generic, jbig2enc.convert_single)

    exec_progress_pool(
        use_threads=True,
        max_workers=options.jobs,
        tqdm_kwargs=dict(
            total=len(jbig2_groups),
            desc="JBIG2",
            unit='item',
            disable=not options.progress_bar,
        ),
        task=jbig2_convert,
        task_arguments=jbig2_args(root, jbig2_groups),
    )
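
A note on Example #4: exec_progress_pool() calls the task with one argument per item, so convert_generic() acts as a small adapter that unpacks each keyword-argument dict into the jbig2enc call. No task_finished callback is supplied, so presumably the pool advances the progress bar on its own and the encoders' return values are discarded; the JBIG2 output is written into root via out_prefix and outfile.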
Example #5
def transcode_pngs(
    pike: Pdf,
    images: Sequence[Xref],
    image_name_fn: Callable[[Path, Xref], Path],
    root: Path,
    options,
) -> None:
    modified: MutableSet[Xref] = set()
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )

        def pngquant_args():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                modified.add(xref)

        def pngquant_fn(args):
            pngquant.quantize(*args)

        exec_progress_pool(
            use_threads=True,
            max_workers=options.jobs,
            tqdm_kwargs=dict(
                desc="PNGs",
                total=len(images),
                unit='image',
                disable=not options.progress_bar,
            ),
            task=pngquant_fn,
            task_arguments=pngquant_args(),
        )

    for xref in modified:
        filename = png_name(root, xref)
        _transcode_png(pike, filename, xref)
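
Example #6 below is an earlier revision of the same function: the per-image Leptonica rewrite loop that Example #5 delegates to a _transcode_png() helper is still written out inline.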
Example #6
def transcode_pngs(pike, images, image_name_fn, root, options):
    modified = set()
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )

        def pngquant_args():
            for xref in images:
                log.debug(image_name_fn(root, xref))
                yield (
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )
                modified.add(xref)

        def pngquant_fn(args):
            pngquant.quantize(*args)

        exec_progress_pool(
            use_threads=True,
            max_workers=options.jobs,
            tqdm_kwargs=dict(
                desc="PNGs",
                total=len(images),
                unit='image',
                disable=not options.progress_bar,
            ),
            task=pngquant_fn,
            task_arguments=pngquant_args(),
        )

    for xref in modified:
        im_obj = pike.get_object(xref, 0)
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.mode == '1':
                compdata = pix.generate_pdf_ci_data(leptonica.lept.L_G4_ENCODE, 0)
            else:
                compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If the re-coded image is larger, don't use it. We test this here
        # because pngquant knows the size of its temporary output file but not
        # the size of the actual image object inside the PDF.
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}"
            )
            continue
        if compdata.type == leptonica.lept.L_FLATE_ENCODE:
            rewrite_png(pike, im_obj, compdata)
        elif compdata.type == leptonica.lept.L_G4_ENCODE:
            rewrite_png_as_g4(pike, im_obj, compdata)