def get_orientation(input_file: Path, engine_mode: Optional[int], timeout: float):
    """Ask Tesseract for the page orientation of an image.

    Runs Tesseract with the ``osd`` language and ``--psm 0`` on *input_file*,
    sending the report to stdout, then parses the ``key: value`` lines of
    that report.

    Returns an OrientationConfidence built from the "Orientation in degrees"
    and "Orientation confidence" entries (each defaulting to 0 when absent).
    A zero angle with zero confidence is returned when the subprocess times
    out, or when Tesseract fails and its output contains "Too few characters.
    Skipping this page" or "Image too large".

    Raises SubprocessOutputError for any other Tesseract failure.
    """
    command = [
        *tess_base_args(['osd'], engine_mode),
        '--psm',
        '0',
        fspath(input_file),
        'stdout',
    ]

    try:
        proc = run(command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
    except TimeoutExpired:
        return OrientationConfidence(angle=0, confidence=0.0)
    except CalledProcessError as e:
        tesseract_log_output(e.stdout)
        tesseract_log_output(e.stderr)
        # These failures mean "nothing usable on this page", not a real error
        benign = (b'Too few characters. Skipping this page', b'Image too large')
        if any(marker in e.output for marker in benign):
            return OrientationConfidence(0, 0)
        raise SubprocessOutputError() from e

    report = {}
    for raw in proc.stdout.decode().splitlines():
        fields = raw.strip().split(':', maxsplit=2)
        if len(fields) != 2:
            # Only lines with exactly one colon are key/value entries
            continue
        key, value = fields
        report[key.strip()] = value.strip()

    return OrientationConfidence(
        angle=int(report.get('Orientation in degrees', 0)),
        confidence=float(report.get('Orientation confidence', 0)),
    )
def get_languages():
    """Return the set of languages reported by ``tesseract --list-langs``.

    The first line of the output is treated as a header and discarded; each
    remaining line, stripped of surrounding whitespace, becomes one entry in
    the returned set.

    Raises MissingDependencyError if Tesseract exits with an error, prints
    nothing at all, or prints any line that starts with "Error".
    """

    def lang_error(output):
        # Build the user-facing message, quoting whatever Tesseract printed
        return (
            "Tesseract failed to report available languages.\n"
            "Output from Tesseract:\n"
            "-----------\n"
        ) + (output or '')

    args_tess = ['tesseract', '--list-langs']
    try:
        proc = run(
            args_tess,
            text=True,
            stdout=PIPE,
            stderr=STDOUT,
            logs_errors_to_stdout=True,
            check=True,
        )
    except CalledProcessError as e:
        raise MissingDependencyError(lang_error(e.output)) from e

    output = proc.stdout
    lines = output.splitlines()
    # Empty output has no header line to discard; report it as a failure
    # rather than letting the unpacking below raise a bare ValueError.
    if not lines or any(line.startswith('Error') for line in lines):
        raise MissingDependencyError(lang_error(output))

    _header, *rest = lines
    return {lang.strip() for lang in rest}
def generate_pdf(
    *,
    input_file: Path,
    output_pdf: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Run Tesseract to produce a text-only PDF plus a sidecar text file.

    input_file -- image to analyze
    output_pdf -- PDF file to generate
    output_text -- where the OCR text file ends up
    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- extra tesseract configuration arguments
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails with "Image too large" in its output,
    use_skip_page(output_pdf, output_text) is called instead. Any other
    Tesseract failure raises SubprocessOutputError.
    """
    command = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        command += ['--psm', str(pagesegmode)]
    command += ['-c', 'textonly_pdf=1']
    if user_words:
        command += ['--user-words', user_words]
    if user_patterns:
        command += ['--user-patterns', user_patterns]

    # Tesseract appends the output suffixes to this base name itself
    prefix = os.path.splitext(output_pdf)[0]

    # Reminder: test suite tesseract test plugins might break after any
    # change to the number or order of the parameters here
    command += [input_file, prefix, 'pdf', 'txt'] + tessconfig

    try:
        proc = run(command, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        sidecar = prefix + '.txt'
        if os.path.exists(sidecar):
            shutil.move(sidecar, output_text)
    except TimeoutExpired:
        page_timedout(timeout)
        use_skip_page(output_pdf, output_text)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' not in e.output:
            raise SubprocessOutputError() from e
        use_skip_page(output_pdf, output_text)
    else:
        tesseract_log_output(proc.stdout)
def rasterize_pdf(
    input_file: os.PathLike,
    output_file: os.PathLike,
    *,
    raster_device: str,
    raster_dpi: Resolution,
    pageno: int = 1,
    page_dpi: Optional[Resolution] = None,
    rotation: Optional[int] = None,
    filter_vector: bool = False,
):
    """Rasterize one page of a PDF at resolution raster_dpi in canvas units.

    input_file -- PDF to rasterize
    output_file -- image file to write
    raster_device -- Ghostscript output device name (-sDEVICE)
    raster_dpi -- resolution passed to Ghostscript, rounded to 6 places
    pageno -- page used for both -dFirstPage and -dLastPage
    page_dpi -- DPI recorded in the saved image; raster_dpi when not given
    rotation -- if not None, 90/180/270 transposes the image before saving
    filter_vector -- add -dFILTERVECTOR to the Ghostscript command

    Raises SubprocessOutputError, chained to the CalledProcessError, when
    Ghostscript exits with an error.
    """
    raster_dpi = raster_dpi.round(6)
    if not page_dpi:
        page_dpi = raster_dpi

    args_gs = (
        [
            GS,
            '-dQUIET',
            '-dSAFER',
            '-dBATCH',
            '-dNOPAUSE',
            f'-sDEVICE={raster_device}',
            f'-dFirstPage={pageno}',
            f'-dLastPage={pageno}',
            f'-r{raster_dpi.x:f}x{raster_dpi.y:f}',
        ]
        + (['-dFILTERVECTOR'] if filter_vector else [])
        + [
            '-o',
            '-',
            '-sstdout=%stderr',
            '-dAutoRotatePages=/None',  # Probably has no effect on raster
            '-f',
            fspath(input_file),
        ]
    )

    try:
        p = run(args_gs, stdout=PIPE, stderr=PIPE, check=True)
    except CalledProcessError as e:
        log.error(e.stderr.decode(errors='replace'))
        # Chain explicitly so the Ghostscript failure is kept as the cause
        raise SubprocessOutputError('Ghostscript rasterizing failed') from e
    else:
        stderr = p.stderr.decode(errors='replace')
        if _gs_error_reported(stderr):
            log.error(stderr)

    # The image arrives on stdout because of '-o -' above
    with Image.open(BytesIO(p.stdout)) as im:
        if rotation is not None:
            log.debug("Rotating output by %i", rotation)
            # rotation is a clockwise angle and Image.ROTATE_* is
            # counterclockwise so this cancels out the rotation
            if rotation == 90:
                im = im.transpose(Image.ROTATE_90)
            elif rotation == 180:
                im = im.transpose(Image.ROTATE_180)
            elif rotation == 270:
                im = im.transpose(Image.ROTATE_270)
            # A quarter turn swaps the axes, so swap the x/y DPI to match
            if rotation % 180 == 90:
                page_dpi = page_dpi.flip_axis()
        im.save(fspath(output_file), dpi=page_dpi)
def generate_hocr(
    *,
    input_file: Path,
    output_hocr: Path,
    output_text: Path,
    languages: List[str],
    engine_mode: int,
    tessconfig: List[str],
    timeout: float,
    pagesegmode: int,
    user_words,
    user_patterns,
):
    """Run Tesseract to produce an hOCR file plus a sidecar text file.

    input_file -- image to analyze
    output_hocr -- hOCR file to generate
    output_text -- where the OCR text file ends up
    languages -- list of languages to consider
    engine_mode -- engine mode argument passed to tess_base_args
    tessconfig -- extra tesseract configuration arguments
    timeout -- timeout (seconds)
    pagesegmode -- value for --psm; omitted when None
    user_words -- value for --user-words; omitted when falsy
    user_patterns -- value for --user-patterns; omitted when falsy

    On timeout, or when Tesseract fails with "Image too large" in its output,
    _generate_null_hocr is called to produce the outputs instead. Any other
    Tesseract failure raises SubprocessOutputError.
    """
    prefix = output_hocr.with_suffix('')
    # Tesseract appends ".txt" to the output base. Append it here as well:
    # with_suffix('.txt') would instead replace the last dotted component of
    # a base name such as "page.1" and look for the wrong file.
    sidecar = prefix.with_name(prefix.name + '.txt')

    args_tesseract = tess_base_args(languages, engine_mode)
    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])
    if user_words:
        args_tesseract.extend(['--user-words', user_words])
    if user_patterns:
        args_tesseract.extend(['--user-patterns', user_patterns])

    # Reminder: test suite tesseract test plugins will break after any
    # change to the number or order of the parameters here
    args_tesseract.extend([input_file, prefix, 'hocr', 'txt'] + tessconfig)

    try:
        p = run(args_tesseract, stdout=PIPE, stderr=STDOUT, timeout=timeout, check=True)
        stdout = p.stdout
    except TimeoutExpired:
        # Generate a HOCR file with no recognized text if tesseract times out
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
            _generate_null_hocr(output_hocr, output_text, input_file)
            return
        raise SubprocessOutputError() from e
    else:
        tesseract_log_output(stdout)
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if sidecar.exists():
            shutil.move(sidecar, output_text)
def convert_group(*, cwd, infiles, out_prefix):
    """Run ``jbig2`` in symbol mode over a group of input files.

    cwd -- working directory for the subprocess
    infiles -- input file arguments appended to the command
    out_prefix -- value given to the -b option

    Returns the finished process object after check_returncode() has been
    called on it, so a nonzero exit status raises instead of returning.
    """
    # '-s' selects symbol mode (lossy). Refinement mode ('-r', lossless
    # symbol mode) is left out because it is currently disabled in jbig2.
    command = ['jbig2', '-b', out_prefix, '-s', '-p', *infiles]
    result = run(command, cwd=cwd, stdout=PIPE, stderr=PIPE)
    result.check_returncode()
    return result
def quantize(input_file: Path, output_file: Path, quality_min: int, quality_max: int):
    """Pipe an image through ``pngquant`` and save the result on success.

    The image is streamed to pngquant on stdin via input_as_png() and the
    quantized bytes are read back from stdout. output_file is written only
    when pngquant exits with status 0; otherwise nothing is written.
    """
    quality_range = f'{quality_min}-{quality_max}'
    command = [
        'pngquant',
        '--force',
        '--skip-if-larger',
        '--quality',
        quality_range,
        '--',  # pngquant: stop processing arguments
        '-',  # pngquant: stream input and output
    ]
    with input_as_png(input_file) as input_stream:
        result = run(command, stdin=input_stream, stdout=PIPE, stderr=PIPE, check=False)

    if result.returncode != 0:
        return
    # input_file could be the same as output_file, so the write is deferred
    # until the input stream has been closed
    output_file.write_bytes(result.stdout)
def convert_single(*, cwd, infile, outfile):
    """Run ``jbig2 -p`` on one input file, capturing its stdout in outfile.

    cwd -- working directory for the subprocess
    infile -- input file argument
    outfile -- path opened in binary write mode to receive stdout

    Returns the finished process object after check_returncode() has been
    called on it, so a nonzero exit status raises instead of returning.
    """
    command = ['jbig2', '-p', infile]
    with open(outfile, 'wb') as sink:
        result = run(command, cwd=cwd, stdout=sink, stderr=PIPE)
    result.check_returncode()
    return result
def cached_run(options, run_args, **run_kwargs):
    """Run a Tesseract command line through an on-disk result cache.

    options -- object whose input_file attribute names the source document
    run_args -- full command line; run_args[1:] is parsed with ``parser``
    run_kwargs -- keyword arguments for run(); must include a truthy 'check'

    Commands reading from stdin are passed straight to run() uncached. On a
    cache hit (stderr.bin exists in the cache folder) the cached output files
    are copied next to outputbase and a CompletedProcess built from the
    cached stdout/stderr is returned. On a miss the command is run with
    stdout and stderr captured, its results are stored in the cache, a line
    is appended to manifest.jsonl, and the real process result is returned.

    A CalledProcessError from the command is logged and re-raised; nothing
    is cached in that case.
    """
    # Only these configfile names are stored in the cache, so only these can
    # be restored from it. Other configfiles are skipped on both paths.
    cacheable_outputs = ('hocr', 'pdf', 'txt')

    run_args = [str(arg) for arg in run_args]  # flatten PosixPaths
    args = parser.parse_args(run_args[1:])

    if args.imagename in ('stdin', '-'):
        return run(run_args, **run_kwargs)

    source_file = options.input_file
    cache_folder = get_cache_folder(source_file, run_args, args)
    cache_folder.mkdir(parents=True, exist_ok=True)

    log.debug(f"Using Tesseract cache {cache_folder}")

    writes_files = args.outputbase != 'stdout'
    if writes_files and not args.configfiles:
        # With no configfile named, 'txt' is used as the output
        args.configfiles.append('txt')

    if (cache_folder / 'stderr.bin').exists():
        log.debug("Cache HIT")

        # Replicate stdout/err
        if writes_files:
            for configfile in args.configfiles:
                if configfile not in cacheable_outputs:
                    # Never stored on a miss, so there is nothing to copy
                    continue
                # cp cache -> output
                tessfile = args.outputbase + '.' + configfile
                shutil.copy(str(cache_folder / configfile) + '.bin', tessfile)
        return CompletedProcess(
            args=run_args,
            returncode=0,
            stdout=(cache_folder / 'stdout.bin').read_bytes(),
            stderr=(cache_folder / 'stderr.bin').read_bytes(),
        )

    log.debug("Cache MISS")

    # stdout/stderr are always captured here so they can be cached
    cache_kwargs = {
        k: v for k, v in run_kwargs.items() if k not in ('stdout', 'stderr')
    }
    assert cache_kwargs['check']
    try:
        p = run(run_args, stdout=PIPE, stderr=PIPE, **cache_kwargs)
    except CalledProcessError as e:
        log.exception(e)
        raise  # Pass exception onward

    # Update cache
    (cache_folder / 'stdout.bin').write_bytes(p.stdout)
    (cache_folder / 'stderr.bin').write_bytes(p.stderr)

    if writes_files:
        for configfile in args.configfiles:
            if configfile not in cacheable_outputs:
                continue
            # cp pwd/{outputbase}.{configfile} -> {cache}/{configfile}
            tessfile = args.outputbase + '.' + configfile
            shutil.copy(tessfile, str(cache_folder / configfile) + '.bin')

    manifest = {}
    manifest['tesseract_version'] = TesseractOcrEngine.version().replace('\n', ' ')
    manifest['platform'] = platform.platform()
    manifest['python'] = platform.python_version()
    manifest['argv_slug'] = cache_folder.name
    manifest['sourcefile'] = str(Path(source_file).relative_to(TESTS_ROOT))

    def clean_sys_argv():
        # Replace the temporary-folder part of each path with $TMPDIR
        for arg in run_args[1:]:
            yield re.sub(r'.*/ocrmypdf[.]io[.][^/]+[/](.*)', r'$TMPDIR/\1', arg)

    manifest['args'] = list(clean_sys_argv())
    with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f:
        json.dump(manifest, f)
        f.write('\n')
        f.flush()
    return p