def optimized_image2text(filename, lang='eng', verbose=False):
    """OCR an image after preprocessing it with ``scantailor-cli``.

    ``scantailor-cli`` is run on *filename* with a fresh temporary directory
    as its output directory (presumably to deskew the image, judging by the
    log messages -- confirm against the scantailor documentation).  Every
    regular file found in that directory afterwards is passed, in sorted
    filename order, to ``enhance_ocr.image2text`` and the non-empty results
    are joined with newlines.

    Args:
        filename: Path of the image handed to ``scantailor-cli``.
        lang: Language code forwarded to ``enhance_ocr.image2text``.
        verbose: If true, print a progress message and forward the flag to
            the OCR call.

    Returns:
        The concatenated OCR text, or ``False`` if no text was produced or
        ``scantailor-cli`` exited with a non-zero return code.

    Raises:
        OSError: If ``scantailor-cli`` cannot be started (for example when
            it is not installed).
    """
    ocr_txt = False

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_ocr_descew_")

    # try/finally guarantees the temporary directory is removed even when the
    # external program cannot be started or the OCR loop is interrupted.
    try:
        if verbose:
            print("Optimizing image {}".format(filename))

        # start external OCR Program
        returncode = subprocess.call(
            ['scantailor-cli', filename, ocr_temp_dirname])

        if returncode != 0:
            sys.stderr.write(
                "Error: Descewing images for OCR failed for {} "
                "with return code {}\n".format(filename, returncode))
            return ocr_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):
            imagefilename = os.path.join(ocr_temp_dirname, image)

            # ignore the cache directory of scantailor, only files in directory
            if not os.path.isfile(imagefilename):
                continue

            try:
                text = enhance_ocr.image2text(
                    imagefilename, lang, verbose=verbose)

                os.remove(imagefilename)

                if text:
                    if ocr_txt:
                        ocr_txt = ocr_txt + '\n' + text
                    else:
                        ocr_txt = text

            # Best effort per image: log and continue with the next one.
            # KeyboardInterrupt and SystemExit are not subclasses of
            # Exception, so they propagate with their original traceback.
            except Exception as e:
                sys.stderr.write(
                    "Exception while OCR descewed image of: {} - Maybe descewed image {} corrupt? Exception: {}\n"
                    .format(filename, imagefilename, e))
    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt
def optimized_image2text(filename, lang='eng', verbose=False):
    """OCR an image after preprocessing it with ``scantailor-cli``.

    ``scantailor-cli`` is run on *filename* with a fresh temporary directory
    as its output directory (presumably to deskew the image, judging by the
    log messages -- confirm against the scantailor documentation).  Every
    regular file found in that directory afterwards is passed, in sorted
    filename order, to ``enhance_ocr.image2text`` and the non-empty results
    are joined with newlines.

    Args:
        filename: Path of the image handed to ``scantailor-cli``.
        lang: Language code forwarded to ``enhance_ocr.image2text``.
        verbose: If true, print a progress message and forward the flag to
            the OCR call.

    Returns:
        The concatenated OCR text, or ``False`` if no text was produced or
        ``scantailor-cli`` exited with a non-zero return code.

    Raises:
        OSError: If ``scantailor-cli`` cannot be started (for example when
            it is not installed).
    """
    ocr_txt = False

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_ocr_descew_")

    # try/finally guarantees the temporary directory is removed even when the
    # external program cannot be started or the OCR loop is interrupted.
    try:
        if verbose:
            print("Optimizing image {}".format(filename))

        # start external OCR Program
        returncode = subprocess.call(
            ['scantailor-cli', filename, ocr_temp_dirname])

        if returncode != 0:
            sys.stderr.write(
                "Error: Descewing images for OCR failed for {} "
                "with return code {}\n".format(filename, returncode))
            return ocr_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):
            imagefilename = os.path.join(ocr_temp_dirname, image)

            # ignore the cache directory of scantailor, only files in directory
            if not os.path.isfile(imagefilename):
                continue

            try:
                text = enhance_ocr.image2text(
                    imagefilename, lang, verbose=verbose)

                os.remove(imagefilename)

                if text:
                    if ocr_txt:
                        ocr_txt = ocr_txt + '\n' + text
                    else:
                        ocr_txt = text

            # Best effort per image: log and continue with the next one.
            # KeyboardInterrupt and SystemExit are not subclasses of
            # Exception, so they propagate with their original traceback.
            except Exception as e:
                sys.stderr.write(
                    "Exception while OCR descewed image of: {} - Maybe descewed image {} corrupt? Exception: {}\n"
                    .format(filename, imagefilename, e))
    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt
def pdfimages2text(filename, lang='eng', verbose=False, pdf_ocr=True, pdf_ocr_descew=False):
    """OCR the images embedded in a PDF, grouped by page number.

    The images are extracted into a temporary directory with the
    ``pdfimages`` command line tool and then processed in sorted filename
    order.  The page number is parsed from each extracted filename.

    Args:
        filename: Path of the PDF file.
        lang: Language code forwarded to the OCR functions.
        verbose: Forwarded to the OCR functions.
        pdf_ocr: If true, run ``enhance_ocr.image2text`` on every image.
        pdf_ocr_descew: If true, additionally run
            ``enhance_ocr_descew.optimized_image2text`` on every image.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts mapping the page
        number (int) to the recognised text.  Several texts of one page are
        joined with ``'\\n'`` in the first dict and with ``'\\n\\n'`` in the
        second.  Both dicts are empty if ``pdfimages`` exits with a non-zero
        return code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (for example when it is
            not installed).
    """
    # Local import so this function does not depend on the module importing
    # shutil at file level.
    import shutil

    ocr_txt = {}
    ocr_descew_txt = {}

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_pdf_ocr_")

    # try/finally guarantees the temporary directory is removed on every
    # path, including a failed or unavailable pdfimages call.
    try:
        # Extract all images of the pdf to tempdir with commandline tool
        # "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        returncode = subprocess.call([
            'pdfimages', '-p', '-j', filename,
            os.path.join(ocr_temp_dirname, 'image')
        ])

        if returncode != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n".format(
                    filename, returncode))
            return ocr_txt, ocr_descew_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:
                try:
                    # extract page number from extracted image filename
                    # (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr.image2text(
                        filename=imagefilename, lang=lang, verbose=verbose)

                    if text:
                        if pagenumber in ocr_txt:
                            ocr_txt[pagenumber] += '\n' + text
                        else:
                            ocr_txt[pagenumber] = text

                # Best effort per image; KeyboardInterrupt and SystemExit
                # are not Exception subclasses and therefore propagate.
                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            if pdf_ocr_descew:
                try:
                    # extract page number from extracted image filename
                    # (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if text:
                        if pagenumber in ocr_descew_txt:
                            ocr_descew_txt[pagenumber] += '\n\n' + text
                        else:
                            ocr_descew_txt[pagenumber] = text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            # Remove each image as soon as it has been processed.
            os.remove(imagefilename)
    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt, ocr_descew_txt
def pdfimages2text(filename, lang='eng', verbose=False, pdf_ocr=True, pdf_ocr_descew=False):
    """OCR the images embedded in a PDF, grouped by page number.

    The images are extracted into a temporary directory with the
    ``pdfimages`` command line tool and then processed in sorted filename
    order.  The page number is parsed from each extracted filename.

    Args:
        filename: Path of the PDF file.
        lang: Language code forwarded to the OCR functions.
        verbose: Forwarded to the OCR functions.
        pdf_ocr: If true, run ``enhance_ocr.image2text`` on every image.
        pdf_ocr_descew: If true, additionally run
            ``enhance_ocr_descew.optimized_image2text`` on every image.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts mapping the page
        number (int) to the recognised text.  Several texts of one page are
        joined with ``'\\n'`` in the first dict and with ``'\\n\\n'`` in the
        second.  Both dicts are empty if ``pdfimages`` exits with a non-zero
        return code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (for example when it is
            not installed).
    """
    # Local import so this function does not depend on the module importing
    # shutil at file level.
    import shutil

    ocr_txt = {}
    ocr_descew_txt = {}

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_pdf_ocr_")

    # try/finally guarantees the temporary directory is removed on every
    # path, including a failed or unavailable pdfimages call.
    try:
        # Extract all images of the pdf to tempdir with commandline tool
        # "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        returncode = subprocess.call([
            'pdfimages', '-p', '-j', filename,
            os.path.join(ocr_temp_dirname, 'image')
        ])

        if returncode != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n".format(
                    filename, returncode))
            return ocr_txt, ocr_descew_txt

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:
                try:
                    # extract page number from extracted image filename
                    # (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr.image2text(
                        filename=imagefilename, lang=lang, verbose=verbose)

                    if text:
                        if pagenumber in ocr_txt:
                            ocr_txt[pagenumber] += '\n' + text
                        else:
                            ocr_txt[pagenumber] = text

                # Best effort per image; KeyboardInterrupt and SystemExit
                # are not Exception subclasses and therefore propagate.
                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            if pdf_ocr_descew:
                try:
                    # extract page number from extracted image filename
                    # (image-pagenumber-imagenumber.jpg)
                    pagenumber = int(image.split('-')[1])

                    text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if text:
                        if pagenumber in ocr_descew_txt:
                            ocr_descew_txt[pagenumber] += '\n\n' + text
                        else:
                            ocr_descew_txt[pagenumber] = text

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - maybe corrupt image: {} - exception: {}\n"
                        .format(filename, imagefilename, e))

            # Remove each image as soon as it has been processed.
            os.remove(imagefilename)
    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt, ocr_descew_txt
def pdfimages2text(filename, lang='eng', verbose=False, pdf_ocr=True,
                   pdf_ocr_descew=False, cache=None):
    """OCR the images embedded in a PDF, grouped by page number.

    If *cache* is given, ``load_cache`` is tried first and its result is
    returned directly; a ``FileNotFoundError`` or ``KeyError`` from it is
    treated as a cache miss.  Otherwise the images are extracted into a
    temporary directory with the ``pdfimages`` command line tool and
    processed in sorted filename order.

    NOTE(review): nothing in this function writes results back to the
    cache -- presumably that happens elsewhere; confirm.

    Args:
        filename: Path of the PDF file.
        lang: Language code forwarded to the OCR functions.
        verbose: Forwarded to the OCR functions; also enables the
            cache-miss message.
        pdf_ocr: If true, run ``enhance_ocr.image2text`` on every image.
        pdf_ocr_descew: If true, additionally run
            ``enhance_ocr_descew.optimized_image2text`` on every image.
        cache: Passed through to ``load_cache``; ``None`` skips the lookup.

    Returns:
        A tuple ``(ocr_txt, ocr_descew_txt)`` of dicts filled through
        ``append_page`` with the page number and recognised text, or
        ``({}, {})`` if ``pdfimages`` exits with a non-zero return code.

    Raises:
        OSError: If ``pdfimages`` cannot be started (for example when it is
            not installed).
    """
    # Local import so this function does not depend on the module importing
    # shutil at file level.
    import shutil

    ocr_txt = {}
    ocr_descew_txt = {}

    if cache is not None:
        try:
            return load_cache(filename, cache, lang, pdf_ocr, pdf_ocr_descew)
        except (FileNotFoundError, KeyError):
            if verbose:
                print('Not in OCR cache, starting OCR for {}'.format(filename))

    ocr_temp_dirname = tempfile.mkdtemp(prefix="opensemanticetl_pdf_ocr_")

    # try/finally guarantees the temporary directory is removed on every
    # path, including the early return when pdfimages fails.
    try:
        # Extract all images of the pdf to tempdir with commandline tool
        # "pdfimages" from poppler pdf toolbox
        # -j = export as JPEG
        # -p = write page name in image filename
        returncode = subprocess.call([
            'pdfimages', '-p', '-j', filename,
            os.path.join(ocr_temp_dirname, 'image')
        ])

        if returncode != 0:
            sys.stderr.write(
                "Error: Extracting images from PDF failed for {} {}\n".format(
                    filename, returncode))
            return {}, {}

        for image in sorted(os.listdir(ocr_temp_dirname)):

            imagefilename = os.path.join(ocr_temp_dirname, image)

            if pdf_ocr:
                try:
                    text = enhance_ocr.image2text(filename=imagefilename,
                                                  lang=lang,
                                                  verbose=verbose)

                    if text:
                        # extract page number from extracted image
                        # filename (image-pagenumber-imagenumber.jpg)
                        pagenumber = int(image.split('-')[1])
                        append_page(ocr_txt, pagenumber, text)

                # Best effort per image; KeyboardInterrupt and SystemExit
                # are not Exception subclasses and therefore propagate.
                except Exception as e:
                    sys.stderr.write(
                        "Exception while OCR of PDF: {} - "
                        "maybe corrupt image: {} - exception: {}\n".format(
                            filename, imagefilename, e))

            if pdf_ocr_descew:
                try:
                    text = enhance_ocr_descew.optimized_image2text(
                        imagefilename, lang, verbose=verbose)

                    if text:
                        # extract page number from extracted image
                        # filename (image-pagenumber-imagenumber.jpg)
                        pagenumber = int(image.split('-')[1])
                        # Store into the descew result dict, not into the
                        # enhance_ocr_descew module.
                        append_page(ocr_descew_txt, pagenumber, text)

                except Exception as e:
                    sys.stderr.write(
                        "Exception while optimized ocr pdf: {} - "
                        "maybe corrupt image: {} - exception: {}\n".format(
                            filename, imagefilename, e))

            # Remove each image as soon as it has been processed.
            os.remove(imagefilename)
    finally:
        shutil.rmtree(ocr_temp_dirname)

    return ocr_txt, ocr_descew_txt