def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a nul in DocumentInfo is not carried into the output XMP.

    Runs ``convert_to_pdfa`` on a copy of ``trivial.pdf`` and then scans the
    raw bytes of ``outdir / 'pdfa.pdf'`` between the XMP packet marker and the
    closing ``<?xpacket end`` for a literal or XML-escaped nul character.

    Args:
        resources: Path-like directory containing ``trivial.pdf``.
        outdir: Path-like scratch directory for the generated files.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')
    copyfile(resources / 'trivial.pdf', outdir / 'layers.rendered.pdf')

    # Inject a string with a trailing nul character into the DocumentInfo
    # dictionary of this PDF, as often occurs in practice.
    # NOTE(review): nothing here saves ``pike`` before the context exits, so
    # the file on disk looks unchanged by this step -- confirm whether a
    # save was intended.
    with pikepdf.open(outdir / 'layers.rendered.pdf') as pike:
        pike.Root.DocumentInfo = pikepdf.Dictionary(
            Title=b'String with trailing nul\x00'
        )

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    with open(outdir / 'pdfa.pdf', 'r+b') as f, mmap.mmap(f.fileno(), 0) as mm:
        # Since the XML may be invalid, we scan instead of actually feeding it
        # to a parser.
        XMP_MAGIC = b'W5M0MpCehiHzreSzNTczkc9d'
        xmp_start = mm.find(XMP_MAGIC)
        xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
        assert 0 < xmp_start < xmp_end
        # Ensure we did not carry the nul forward, neither as an XML numeric
        # character reference nor as a raw byte. The probe must be spelled in
        # ASCII: bytes literals cannot contain non-ASCII characters.
        assert mm.find(b'&#0;', xmp_start, xmp_end) == -1, "found escaped nul"
        assert mm.find(b'\x00', xmp_start, xmp_end) == -1
def test_metadata_fixup_warning(resources, outdir, caplog):
    """Check that ``metadata_fixup`` warns only when metadata cannot be copied.

    The first run uses an unmodified copy of ``graph.pdf`` and must not log
    any WARNING record. The second run uses a context whose origin file
    carries an extra ``prism2:publicationName`` XMP entry and must log at
    least one WARNING record.

    Args:
        resources: Path-like directory containing ``graph.pdf``.
        outdir: Path-like scratch directory for the generated files.
        caplog: Log-capture object exposing ``records``.
    """
    options = get_parser().parse_args(
        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')

    context = PdfContext(
        options, outdir, outdir / 'graph.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert all(record.levelname != 'WARNING' for record in caplog.records)

    # Now add some metadata that will not be copyable. Open the PDF as a
    # context manager so the file handle is released before the file is used
    # again as the working file below.
    with pikepdf.open(outdir / 'graph.pdf') as graph:
        with graph.open_metadata() as meta:
            meta['prism2:publicationName'] = 'OCRmyPDF Test'
        graph.save(outdir / 'graph_mod.pdf')

    # The context now refers to graph_mod.pdf while the working file is
    # still the original graph.pdf.
    context = PdfContext(
        options, outdir, outdir / 'graph_mod.pdf', None, get_plugin_manager([])
    )
    metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
    assert any(record.levelname == 'WARNING' for record in caplog.records)
def get_parser_options_plugins(
    args,
) -> (argparse.ArgumentParser, argparse.Namespace, pluggy.PluginManager):
    """Parse ``args`` with a parser that plugins have been allowed to extend.

    Args:
        args: Argument list handed to both parsing passes.

    Returns:
        A ``(parser, options, plugin_manager)`` tuple.
    """
    # First pass: a tolerant parse whose only purpose is to discover which
    # plugins were requested; unrecognized arguments are discarded.
    early_options, _ = plugins_only_parser.parse_known_args(args=args)
    manager = get_plugin_manager(early_options.plugins)

    # Second pass: let the plugins register their options on the full parser
    # and then parse the same arguments for real.
    full_parser = get_parser()
    manager.hook.add_options(parser=full_parser)  # pylint: disable=no-member

    return full_parser, full_parser.parse_args(args=args), manager
def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Create an options object and the matching plugin manager.

    Args:
        input_file: Input file name forwarded to ``create_options``.
        output_file: Output file name forwarded to ``create_options``.
        language: Forwarded as the ``language`` keyword unless it is ``None``,
            in which case any ``language`` already in ``kwargs`` is kept.
        **kwargs: Extra keywords forwarded to ``create_options``; the
            ``plugins`` entry (default: empty list) selects the plugins.

    Returns:
        A ``(options, plugin_manager)`` tuple.
    """
    if language is not None:
        kwargs['language'] = language

    option_parser = get_parser()
    manager = get_plugin_manager(kwargs.get('plugins', []))
    manager.hook.add_options(parser=option_parser)  # pylint: disable=no-member

    options = create_options(
        input_file=input_file,
        output_file=output_file,
        parser=option_parser,
        **kwargs,
    )
    return options, manager
def test_malformed_docinfo(caplog, resources, outdir):
    """Check that a malformed DocumentInfo block is reported in the log.

    Replaces the trailer's ``Info`` entry of ``trivial.pdf`` with a stream
    object, runs ``convert_to_pdfa`` on the result, and expects a log record
    whose message contains ``'malformed DocumentInfo block'``.

    Args:
        caplog: Log-capture object exposing ``records``.
        resources: Path-like directory containing ``trivial.pdf``.
        outdir: Path-like scratch directory for the generated files.
    """
    generate_pdfa_ps(outdir / 'pdfa.ps')

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        # Store a stream where the Info entry normally lives.
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)

    options = get_parser().parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    )
    pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
    context = PdfContext(
        options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, get_plugin_manager([])
    )
    convert_to_pdfa(
        str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
    )

    assert any(
        'malformed DocumentInfo block' in record.message for record in caplog.records
    )
def ocr(  # pylint: disable=unused-argument
    input_file: PathOrIO,
    output_file: PathOrIO,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[Union[str, Path]] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads: Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        input_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the input file. If the object
            appears to be a readable stream (with methods such as ``.read()``
            and ``.seek()``), the object will be read in its entirety and saved to
            a temporary file. If ``input_file`` is  ``"-"``, standard input will be
            read.
        output_file: If a :class:`pathlib.Path`, ``str`` or ``bytes``, this is
            interpreted as file system path to the output file. If the object
            appears to be a writable stream (with methods such as ``.write()`` and
            ``.seek()``), the output will be written to this stream. If
            ``output_file`` is ``"-"``, the output will be written to ``sys.stdout``
            (provided that standard output does not seem to be a terminal device).
            When a stream is used as output, whether via a writable object or
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
        plugins: Plugins to load. A single ``str`` or :class:`pathlib.Path` is
            treated as one plugin; any other iterable is converted to a list;
            a falsy value means no plugins.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` to a list: a lone str/Path is one plugin, not an
    # iterable of its characters/parts.
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, Path)):
        plugins = [plugins]
    else:
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Every local whose name does not start with '_' (the named parameters plus
    # ``parser``) is forwarded to create_options(). Locals that must NOT be
    # forwarded need a leading underscore, and adding or renaming a local above
    # this point changes what create_options() receives.
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    # Extra keyword arguments are merged in last, so they override same-named
    # entries harvested from locals().
    create_options_kwargs.update(kwargs)

    if 'verbose' in kwargs:
        warn("ocrmypdf.ocr(verbose=) is ignored. Use ocrmypdf.configure_logging().")

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
def ocr(  # pylint: disable=unused-argument
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    language: Iterable[str] = None,
    image_dpi: int = None,
    output_type=None,
    sidecar: os.PathLike = None,
    jobs: int = None,
    use_threads: bool = None,
    title: str = None,
    author: str = None,
    subject: str = None,
    keywords: str = None,
    rotate_pages: bool = None,
    remove_background: bool = None,
    deskew: bool = None,
    clean: bool = None,
    clean_final: bool = None,
    unpaper_args: str = None,
    oversample: int = None,
    remove_vectors: bool = None,
    threshold: bool = None,
    force_ocr: bool = None,
    skip_text: bool = None,
    redo_ocr: bool = None,
    skip_big: float = None,
    optimize: int = None,
    jpg_quality: int = None,
    png_quality: int = None,
    jbig2_lossy: bool = None,
    jbig2_page_group_size: int = None,
    pages: str = None,
    max_image_mpixels: float = None,
    tesseract_config: Iterable[str] = None,
    tesseract_pagesegmode: int = None,
    tesseract_oem: int = None,
    pdf_renderer=None,
    tesseract_timeout: float = None,
    rotate_pages_threshold: float = None,
    pdfa_image_compression=None,
    user_words: os.PathLike = None,
    user_patterns: os.PathLike = None,
    fast_web_view: float = None,
    plugins: Iterable[str] = None,
    keep_temporary_files: bool = None,
    progress_bar: bool = None,
    **kwargs,
):
    """Run OCRmyPDF on one PDF or image.

    For most arguments, see documentation for the equivalent command line parameter.
    A few specific arguments are discussed here:

    Args:
        use_threads (bool): Use worker threads instead of processes. This reduces
            performance but may make debugging easier since it is easier to set
            breakpoints.
        plugins: Plugins to load. A single ``str`` or path-like object is
            treated as one plugin; any other iterable is copied into a list;
            a falsy value means no plugins.

    Raises:
        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
            could not be read, or some other file type that is not a PDF.
        ocrmypdf.DpiError: If the input file is an image, but the resolution of the
            image is not credible (allowing it to proceed would cause poor OCR).
        ocrmypdf.OutputFileAccessError: If an attempt to write to the intended output
            file failed.
        ocrmypdf.PriorOcrFoundError: If the input PDF seems to have OCR or digital
            text already, and settings did not tell us to proceed.
        ocrmypdf.InputFileError: Any other problem with the input file.
        ocrmypdf.SubprocessOutputError: Any error related to executing a subprocess.
        ocrmypdf.EncryptedPdfError: If the input PDF is encrypted (password protected).
            OCRmyPDF does not remove passwords.
        ocrmypdf.TesseractConfigError: If Tesseract reported its configuration was not
            valid.

    Returns:
        :class:`ocrmypdf.ExitCode`
    """
    # Normalize ``plugins`` by rebinding the parameter itself: any *new* local
    # without a leading underscore would be swept up by locals() below and
    # forwarded to create_options().
    if not plugins:
        plugins = []
    elif isinstance(plugins, (str, os.PathLike)):
        # A lone string or path is one plugin, not an iterable of characters.
        plugins = [plugins]
    else:
        # Materialize once: the value is consumed by get_plugin_manager() and
        # again, via locals(), by create_options(); a one-shot iterable would
        # be exhausted after the first use.
        plugins = list(plugins)

    parser = get_parser()
    _plugin_manager = get_plugin_manager(plugins)
    _plugin_manager.hook.add_options(parser=parser)  # pylint: disable=no-member

    # Every local whose name does not start with '_' (the named parameters plus
    # ``parser``) is forwarded to create_options().
    create_options_kwargs = {
        k: v for k, v in locals().items() if not k.startswith('_') and k != 'kwargs'
    }
    # Extra keyword arguments are merged in last, so they override same-named
    # entries harvested from locals().
    create_options_kwargs.update(kwargs)

    options = create_options(**create_options_kwargs)
    check_options(options, _plugin_manager)
    return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)