def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    """Ask Tesseract's orientation/script detection for the page orientation.

    Tesseract is run with the ``osd`` language and ``--psm 0``, writing to
    stdout; the ``key: value`` lines it prints are then parsed.

    input_file -- image to analyze
    engine_mode -- engine mode forwarded to tess_base_args
    timeout -- timeout (seconds) for the subprocess

    Returns an OrientationConfidence. Angle 0 with confidence 0 is returned
    when the subprocess times out, or when it fails with output saying there
    were too few characters or the image was too large. Any other subprocess
    failure raises SubprocessOutputError.
    """
    command = tess_base_args(['osd'], engine_mode)
    command = command + ['--psm', '0', fspath(input_file), 'stdout']

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
        raw_output = completed.stdout
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        # These failures only mean that no orientation could be determined
        recoverable = (
            b'Too few characters. Skipping this page',
            b'Image too large',
        )
        if any(marker in e.output for marker in recoverable):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    # Only lines containing exactly one colon are treated as "key: value"
    fields = {}
    for raw_line in raw_output.decode().splitlines():
        stripped = raw_line.strip()
        if stripped.count(':') != 1:
            continue
        key, _, value = stripped.partition(':')
        fields[key.strip()] = value.strip()

    return OrientationConfidence(
        angle=int(fields.get('Orientation in degrees', 0)),
        confidence=float(fields.get('Orientation confidence', 0)),
    )
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to render a text-only PDF and a sidecar text file.

    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted from the command when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails reporting 'Image too large', the page
    is handed to use_skip_page instead. Any other Tesseract failure raises
    SubprocessOutputError.
    """
    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    command += ['-c', 'textonly_pdf=1']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    # Tesseract appends suffixes to the output base on its own
    prefix, _ = os.path.splitext(output_pdf)
    sidecar = prefix + '.txt'

    # Reminder: test suite tesseract test plugins might break after any changes
    # to the number of order parameters here
    command += [os.fspath(input_file), os.fspath(prefix), 'pdf', 'txt']
    command.extend(tessconfig)

    try:
        completed = run(
            command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True
        )
        captured = completed.stdout
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
    else:
        tesseract_log_output(captured)
def run(input_file, output_file, dpi, mode_args):
    """Run unpaper on an image and save the result with the given DPI.

    input_file -- image to process
    output_file -- where the processed image is saved
    dpi -- resolution; passed to unpaper rounded to 6 places and recorded
           in the saved image
    mode_args -- list of additional unpaper arguments

    Raises SubprocessOutputError if unpaper's output file cannot be opened
    or saved.
    """
    command = ['unpaper', '-v', '--dpi', str(round(dpi, 6))]
    command = command + mode_args

    with TemporaryDirectory() as workdir:
        source_pnm, result_pnm = _setup_unpaper_io(Path(workdir), input_file)

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        command += [os.fspath(source_pnm), os.fspath(result_pnm)]

        # unpaper writes logging output to stdout and stderr and cannot send
        # file output to stdout, so both streams are captured together
        external_run(
            command,
            close_fds=True,
            check=True,
            stderr=STDOUT,
            stdout=PIPE,
            cwd=workdir,
            logs_errors_to_stdout=True,
        )

        try:
            with Image.open(result_pnm) as cleaned:
                cleaned.save(output_file, dpi=(dpi, dpi))
        except (FileNotFoundError, OSError):
            message = (
                "unpaper: failed to produce the expected output file. "
                + " Called with: "
                + str(command)
            )
            raise SubprocessOutputError(message) from None
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    input_file -- PDF to rasterize
    output_file -- image file to write
    raster_device -- Ghostscript output device (-sDEVICE)
    raster_dpi -- resolution passed to Ghostscript, rounded to 6 places
    pageno -- page to rasterize (used as both first and last page)
    page_dpi -- DPI recorded in the saved image; defaults to raster_dpi
    rotation -- when 90, 180 or 270 the image is transposed accordingly;
                when rotation % 180 == 90 the axes of page_dpi are flipped
    filter_vector -- add -dFILTERVECTOR to the Ghostscript command line

    Raises SubprocessOutputError (chained to the CalledProcessError) when the
    Ghostscript subprocess fails.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
        ]
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly so the traceback names the subprocess failure as
        # the direct cause, matching the other subprocess wrappers
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    # The image data arrives on stdout because of "-o -"
    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
def run(input_file, output_file, dpi, mode_args):
    """Run unpaper on an image and save the result with the given DPI.

    The input is first written to a temporary directory in a PNM format
    chosen from its mode ('1', 'L' or 'RGB'); images in any other mode are
    converted to '1' (two-colour palette images) or 'RGB' beforehand.

    input_file -- image to process
    output_file -- where the processed image is saved
    dpi -- resolution passed to unpaper and recorded in the saved image
    mode_args -- list of additional unpaper arguments

    Raises MissingDependencyError if the image cannot be converted to a
    supported mode, CalledProcessError if unpaper fails, and
    SubprocessOutputError if unpaper's output file cannot be opened or saved.
    """
    args_unpaper = ['unpaper', '-v', '--dpi', str(dpi)] + mode_args

    SUFFIXES = {'1': '.pbm', 'L': '.pgm', 'RGB': '.ppm'}

    with TemporaryDirectory() as tmpdir, Image.open(input_file) as im:
        if im.mode not in SUFFIXES:
            log.info("Converting image to other colorspace")
            try:
                if im.mode == 'P' and len(im.getcolors()) == 2:
                    im = im.convert(mode='1')
                else:
                    im = im.convert(mode='RGB')
            except IOError as e:
                im.close()
                raise MissingDependencyError(
                    "Could not convert image with type " + im.mode
                ) from e

        try:
            suffix = SUFFIXES[im.mode]
        except KeyError as e:
            # Bind the KeyError here: no other exception is in scope at this
            # point, so chaining from an outer name would fail
            raise MissingDependencyError(
                "Failed to convert image to a supported format."
            ) from e

        input_pnm = Path(tmpdir) / f'input{suffix}'
        output_pnm = Path(tmpdir) / f'output{suffix}'
        im.save(input_pnm, format='PPM')

        # To prevent any shenanigans from accepting arbitrary parameters in
        # --unpaper-args, we:
        # 1) run with cwd set to a tmpdir with only unpaper's files
        # 2) forbid the use of '/' in arguments, to prevent changing paths
        # 3) append absolute paths for the input and output file
        # This should ensure that a user cannot clobber some other file with
        # their unpaper arguments (whether intentionally or otherwise)
        args_unpaper.extend([os.fspath(input_pnm), os.fspath(output_pnm)])

        try:
            proc = external_run(
                args_unpaper,
                check=True,
                close_fds=True,
                universal_newlines=True,
                stderr=STDOUT,
                cwd=tmpdir,
                stdout=PIPE,
            )
        except CalledProcessError as e:
            log.debug(e.output)
            # Re-raise unchanged; callers still see the CalledProcessError
            raise
        else:
            log.debug(proc.stdout)

        # unpaper sets dpi to 72; fix this
        try:
            with Image.open(output_pnm) as imout:
                imout.save(output_file, dpi=(dpi, dpi))
        except (FileNotFoundError, OSError):
            raise SubprocessOutputError(
                "unpaper: failed to produce the expected output file. "
                + " Called with: "
                + str(args_unpaper)
            ) from None
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Use Tesseract to produce an hOCR file and a sidecar text file.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate; its name without the suffix is
                   given to Tesseract as the output base
    output_text -- OCR text file
    languages -- list of languages to consider
    engine_mode -- engine mode forwarded to tess_base_args
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted from the command when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails reporting 'Image too large', a null
    hOCR is generated instead. Any other Tesseract failure raises
    SubprocessOutputError.
    """
    prefix = output_hocr.with_suffix('')
    # The sidecar is the output base with '.txt' appended. Build it by
    # appending to the name: with_suffix('.txt') would replace part of the
    # base name whenever that name itself contains a dot.
    sidecar = prefix.with_name(prefix.name + '.txt')

    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

    if user_words:
        args_tesseract.extend(['--user-words', user_words])

    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any changes
    # to the number of order parameters here
    args_tesseract.extend([fspath(input_file), fspath(prefix), 'hocr', 'txt'])
    args_tesseract.extend(tessconfig)

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if sidecar.exists():
            sidecar.replace(output_text)
def _gs_version_tuple(text):
    """Parse a dotted version string such as '9.53.3' into (9, 53, 3).

    Only the leading run of dot-separated numbers is used; an empty tuple is
    returned when the text does not start with a number.
    """
    import re  # local import: only needed for version parsing

    match = re.match(r'\s*([0-9]+(?:\.[0-9]+)*)', str(text))
    if not match:
        return ()
    return tuple(int(number) for number in match.group(1).split('.'))


def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
):
    """Run Ghostscript's pdfwrite device with PDF/A options over pdf_pages.

    pdf_pages -- iterable of input paths appended to the Ghostscript command
    output_file -- file that receives Ghostscript's output
    compression -- 'jpeg' forces DCTEncode, 'lossless' forces FlateEncode,
                   anything else lets Ghostscript choose the image filters
    pdf_version -- value for -dCompatibilityLevel
    pdfa_part -- value for -dPDFA

    Raises SubprocessOutputError (chained to the CalledProcessError) when the
    Ghostscript subprocess fails. Errors reported on stderr by a run that
    exits successfully are logged, with consecutive duplicates collapsed.
    """
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Compare versions numerically: as plain strings '10.0' sorts before
    # '9.19', which would select the legacy form on newer releases.
    gs_version = _gs_version_tuple(version())

    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    strategy = 'RGB' if gs_version >= (9, 19) else '/RGB'

    if gs_version == (9, 23):
        # 9.23: new feature JPEG passthrough is broken in some cases, best to
        # disable it always
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = (
        [
            GS,
            "-dQUIET",
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
        ]
        + compression_args
        + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
            "-sstdout=%stderr",
        ]
    )
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run(args_gs, stdout=output, stderr=PIPE, check=True)
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr.decode(errors='replace'))
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr.decode('utf-8', errors='replace')
        if _gs_error_reported(stderr):
            # Log the whole stderr, collapsing consecutive duplicate parts
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part == last_part:
                    repcount += 1
                    continue
                if repcount > 1:
                    log.error(
                        f"(previous error message repeated {repcount} times)"
                    )
                # Reset on every change so a count never leaks into a later
                # message's summary
                repcount = 0
                log.error(part)
                last_part = part
            if repcount > 1:
                # Repeats at the very end of stderr still get their summary
                log.error(f"(previous error message repeated {repcount} times)")
        elif 'overprint mode not set' in stderr:
            # Unless someone is going to print PDF/A documents on a
            # magical sRGB printer I can't see the removal of overprinting
            # being a problem....
            log.debug(
                "Ghostscript had to remove PDF 'overprinting' from the "
                "input file to complete PDF/A conversion. "
            )
def _gs_version_tuple(text):
    """Parse a dotted version string such as '9.53.3' into (9, 53, 3).

    Only the leading run of dot-separated numbers is used; an empty tuple is
    returned when the text does not start with a number.
    """
    import re  # local import: only needed for version parsing

    match = re.match(r'\s*([0-9]+(?:\.[0-9]+)*)', str(text))
    if not match:
        return ()
    return tuple(int(number) for number in match.group(1).split('.'))


def generate_pdfa(
    pdf_pages,
    output_file: os.PathLike,
    *,
    compression: str,
    pdf_version: str = '1.5',
    pdfa_part: str = '2',
    progressbar_class=None,
):
    """Run Ghostscript's pdfwrite device with PDF/A options over pdf_pages.

    pdf_pages -- iterable of input paths appended to the Ghostscript command
    output_file -- file that receives Ghostscript's output
    compression -- 'jpeg' forces DCTEncode, 'lossless' forces FlateEncode,
                   anything else lets Ghostscript choose the image filters
    pdf_version -- value for -dCompatibilityLevel
    pdfa_part -- value for -dPDFA
    progressbar_class -- passed to GhostscriptFollower, which is used as the
                         stderr polling callback

    Raises SubprocessOutputError (chained to the CalledProcessError) when the
    Ghostscript subprocess fails. Errors reported on stderr by a run that
    exits successfully are logged, with consecutive duplicates collapsed.
    """
    # Ghostscript's compression is all or nothing. We can either force all images
    # to JPEG, force all to Flate/PNG, or let it decide how to encode the images.
    # In most case it's best to let it decide.
    if compression == 'jpeg':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/DCTEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/DCTEncode",
        ]
    elif compression == 'lossless':
        compression_args = [
            "-dAutoFilterColorImages=false",
            "-dColorImageFilter=/FlateEncode",
            "-dAutoFilterGrayImages=false",
            "-dGrayImageFilter=/FlateEncode",
        ]
    else:
        compression_args = [
            "-dAutoFilterColorImages=true",
            "-dAutoFilterGrayImages=true",
        ]

    # Compare versions numerically: as plain strings '10.0' sorts before
    # '9.19', which would select the legacy form on newer releases.
    gs_version = _gs_version_tuple(version())

    strategy = 'LeaveColorUnchanged'
    # Older versions of Ghostscript expect a leading slash in
    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
    # git commit fe1c025d.
    if gs_version < (9, 19):
        strategy = '/' + strategy

    if gs_version == (9, 23):
        # 9.23: added JPEG passthrough as a new feature, but with a bug that
        # incorrectly formats some images. Fixed as of 9.24. So we disable this
        # feature for 9.23.
        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
        compression_args.append('-dPassThroughJPEGImages=false')

    # nb no need to specify ProcessColorModel when ColorConversionStrategy
    # is set; see:
    # https://bugs.ghostscript.com/show_bug.cgi?id=699392
    args_gs = (
        [
            GS,
            "-dBATCH",
            "-dNOPAUSE",
            "-dSAFER",
            "-dCompatibilityLevel=" + str(pdf_version),
            "-sDEVICE=pdfwrite",
            "-dAutoRotatePages=/None",
            "-sColorConversionStrategy=" + strategy,
        ]
        + compression_args
        + [
            "-dJPEGQ=95",
            "-dPDFA=" + pdfa_part,
            "-dPDFACompatibilityPolicy=1",
            "-o",
            "-",
            "-sstdout=%stderr",
        ]
    )
    args_gs.extend(fspath(s) for s in pdf_pages)  # Stringify Path objs

    try:
        with Path(output_file).open('wb') as output:
            p = run_polling_stderr(
                args_gs,
                stdout=output,
                stderr=PIPE,
                check=True,
                text=True,
                encoding='utf-8',
                errors='replace',
                callback=GhostscriptFollower(progressbar_class),
            )
    except CalledProcessError as e:
        # Ghostscript does not change return code when it fails to create
        # PDF/A - check PDF/A status elsewhere
        log.error(e.stderr)
        raise SubprocessOutputError('Ghostscript PDF/A rendering failed') from e
    else:
        stderr = p.stderr
        # If there is an error we log the whole stderr, except for filtering
        # duplicates.
        if _gs_error_reported(stderr):
            last_part = None
            repcount = 0
            for part in stderr.split('****'):
                if part == last_part:
                    repcount += 1
                    continue
                if repcount > 1:
                    log.error(
                        f"(previous error message repeated {repcount} times)"
                    )
                # Reset on every change so a count never leaks into a later
                # message's summary
                repcount = 0
                log.error(part)
                last_part = part
            if repcount > 1:
                # Repeats at the very end of stderr still get their summary
                log.error(f"(previous error message repeated {repcount} times)")