class PyPDFOCR(object): """ The main clas. Performs the following functions: * Parses command line options * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step * Runs a single file conversion: * Runs ghostscript to get tiff/jpg * Runs Tesseract-OCR to do the actual OCR * Takes the HOCR from Tesseract and creates a new PDF with the text overlay * Files the OCR'ed file in the proper place if specified * Files the original file if specified * """ def __init__ (self): """ Initializes the GhostScript, Tesseract, and PDF helper classes. """ self.config = None self.gs = PyGs() self.ts = PyTesseract() self.pdf = PyPdf(self.gs) self.preprocess = PyPreprocess() """PDF read and generation class""" def _get_config_file(self, config_file): """ Read in the yaml config file :param config_file: Configuration file (YAML format) :type config_file: file :returns: dict of yaml file :rtype: dict """ with config_file: myconfig = yaml.load(config_file) return myconfig def get_options(self, argv): """ Parse the command-line options and set the following object properties: :param argv: usually just sys.argv[1:] :returns: Nothing :ivar debug: Enable logging debug statements :ivar verbose: Enable verbose logging :ivar enable_filing: Whether to enable post-OCR filing of PDFs :ivar pdf_filename: Filename for single conversion mode :ivar watch_dir: Directory to watch for files to convert :ivar config: Dict of the config file :ivar watch: Whether folder watching mode is turned on :ivar enable_evernote: Enable filing to evernote :ivar disable_preprocessing : Disable preprocessing step """ p = argparse.ArgumentParser( description = "Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.", epilog = "PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__, ) p.add_argument('-d', '--debug', action='store_true', default=False, dest='debug', help='Turn on debugging') p.add_argument('-v', '--verbose', action='store_true', default=False, dest='verbose', help='Turn on verbose mode') p.add_argument('-m', '--mail', action='store_true', default=False, dest='mail', help='Send email after conversion') p.add_argument('-l', '--lang', default='eng', dest='lang', help='Language(default eng)') p.add_argument('-p', '--preprocessing', action='store_true', default=False, dest='disable_preprocessing', help='Turn off preprocessing') #--------- # Single or watch mode #-------- single_or_watch_group = p.add_mutually_exclusive_group(required=True) # Positional argument for single file conversion single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR") # Watch directory for watch mode single_or_watch_group.add_argument('-w', '--watch', dest='watch_dir', help='Watch given directory and run ocr automatically until terminated') #----------- # Filing options #---------- filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument('-c', '--config', type = lambda x: open_file_with_timeout(p,x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', default=False, dest='enable_evernote', help='Enable filing to Evernote') filing_group.add_argument('-n', action='store_true', default=False, dest='match_using_filename', help='Use filename to match if contents did not match anything, before filing to default folder') args = p.parse_args(argv) self.debug = args.debug self.verbose = args.verbose self.pdf_filename = args.pdf_filename self.lang = args.lang self.watch_dir = args.watch_dir self.enable_email = args.mail self.match_using_filename = args.match_using_filename if self.debug: logging.basicConfig(level=logging.DEBUG, format='%(message)s') if self.verbose: logging.basicConfig(level=logging.INFO, format='%(message)s') # Parse configuration file (YAML) if specified if args.configfile: self.config = self._get_config_file(args.configfile) logging.debug("Read in configuration file") logging.debug(self.config) if args.disable_preprocessing: self.disable_preprocessing = True else: self.disable_preprocessing = False if args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False if args.enable_filing or args.enable_evernote: self.enable_filing = True if not args.configfile: p.error("Please specify a configuration file(CONFIGFILE) to enable filing") else: self.enable_filing = False self.watch = False if args.watch_dir: logging.debug("Starting to watch") self.watch = True if self.enable_email: if not args.configfile: p.error("Please specify a configuration file(CONFIGFILE) to enable email") def _clean_up_files(self, files): """ Helper function to delete files :param files: List of files to delete :type files: list :returns: None """ for f in files: try: os.remove(f) except: logging.info("Error removing file %s .... continuing" % file) def _setup_filing(self): """ Instance the proper PyFiler object (either :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`) TODO: Make this more generic to allow third-party plugin filing objects :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading :returns: Nothing """ # Look at self.config and create a self.pdf_filer object # -------------------------------------------------- # Some sanity checks # -------------------------------------------------- assert(self.config and self.enable_filing) for required in ['target_folder', 'default_folder']: if not required in self.config: error ("%s must be specified in config file" % required) else: # Make sure these required folders are in abspath format self.config[required] = os.path.abspath(self.config[required]) if 'original_move_folder' in self.config: # User wants to move the original after filing orig = 'original_move_folder' self.config[orig] = os.path.abspath(self.config[orig]) if not os.path.exists(self.config[orig]): os.makedirs(self.config[orig]) original_move_folder = self.config[orig] else: original_move_folder = None # -------------------------------------------------- # Start the filing object # -------------------------------------------------- if self.enable_evernote: self.filer = PyFilerEvernote(self.config['evernote_developer_token']) else: self.filer = PyFilerDirs() self.filer.target_folder = self.config['target_folder'] self.filer.default_folder = self.config['default_folder'] self.filer.original_move_folder = original_move_folder self.pdf_filer = PyPdfFiler(self.filer) if self.match_using_filename: print("Matching using filename as a fallback to pdf contents") self.pdf_filer.file_using_filename = True # ------------------------------ # Add all the folder names with associated keywords # to the filer object # ------------------------------ keyword_count = 0 folder_count = 0 if 'folders' in self.config: for folder, keywords in self.config['folders'].items(): folder_count +=1 keyword_count += len(keywords) # Make sure keywords are lower-cased before adding keywords = [x.lower() for x in keywords] self.filer.add_folder_target(folder, keywords) print ("Filing of PDFs is enabled") print (" - %d target filing folders" % (folder_count)) print (" - %d keywords" % (keyword_count)) def _setup_external_tools(self): """ Override the Tesseract and Ghostscript binary locations if the user specified them in the config file """ if not self.config: return programs = [("tesseract", self.ts), ("ghostscript", self.gs)] for (program, obj) in programs: if program in self.config and "binary" in self.config[program]: binary = self.config[program]["binary"] if os.name == 'nt': binary = '"%s"' % binary binary = binary.replace("\\", "\\\\") logging.info("Setting location for %s executable to %s" % (program, binary)) obj.binary = binary def run_conversion(self, pdf_filename): """ Does the following: - Convert the PDF using GhostScript to TIFF and JPG - Run Tesseract on the TIFF to extract the text into HOCR (html) - Use PDF generator to overlay the text on the JPG and output a new PDF - Clean up temporary image files :param pdf_filename: Scanned PDF :type pdf_filename: string :returns: OCR'ed PDF :rtype: filename string """ print ("Starting conversion of %s" % pdf_filename) conversion_format = "tiff" # Make the images for Tesseract img_dpi, glob_img_filename = self.gs.make_img_from_pdf(pdf_filename, conversion_format) fns = glob.glob(glob_img_filename) # Preprocess if self.disable_preprocessing: preprocess_imagefilenames = fns else: preprocess_imagefilenames = self.preprocess.preprocess(fns) # Run teserract self.ts.lang = self.lang hocr_filenames = self.ts.make_hocr_from_pnms(preprocess_imagefilenames) # Generate new pdf with overlayed text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename) ocr_pdf_filename = self.pdf.overlay_hocr_pages(img_dpi, hocr_filenames, pdf_filename) # Clean up the files if not self.debug: self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs self._clean_up_files(itertools.chain(fns, preprocess_imagefilenames)) print ("Completed conversion successfully to %s" % ocr_pdf_filename) return ocr_pdf_filename def file_converted_file(self, ocr_pdffilename, original_pdffilename): """ move the converted filename to its destiantion directory. Optionally also moves the original PDF. :param ocr_pdffilename: Converted PDF file :type ocr_pdffilename: filename string :param original_pdffilename: Original scanned PDF file :type original_pdffilename: filename string :returns: Target folder name "rtype: string """ filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename) print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))) tgt_path = self.pdf_filer.file_original(original_pdffilename) if tgt_path != original_pdffilename: print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))) return os.path.dirname(filed_path) def _send_email(self, infilename, outfilename, filing ): """ Send email using smtp """ print("Sending email status") from_addr = self.config["mail_from_addr"] to_addr_list = self.config["mail_to_list"] smtpserver = self.config["mail_smtp_server"] login = self.config["mail_smtp_login"] password = self.config["mail_smtp_password"] subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename)) header = 'From: %s\n' % login header += 'To: %s\n' % ','.join(to_addr_list) header += 'Subject: %s\n\n' % subject message = """ PyPDFOCR Conversion: -------------------- Original file: %s Converted file: %s Filing: %s """ % (infilename, outfilename, filing) message = header + message server = smtplib.SMTP(smtpserver) server.starttls() server.login(login,password) problems = server.sendmail(from_addr, to_addr_list, message) server.quit() def go(self, argv): """ The main entry point into PyPDFOCR #. Parses options #. If filing is enabled, call :func:`_setup_filing` #. If watch is enabled, start the watcher #. :func:`run_conversion` #. if filing is enabled, call :func:`file_converted_file` """ # Read the command line options self.get_options(argv) # Setup tesseract and ghostscript self._setup_external_tools() # Setup the pdf filing if enabled if self.enable_filing: self._setup_filing() if self.watch: py_watcher = PyPdfWatcher(self.watch_dir) for pdf_filename in py_watcher.start(): ocr_pdffilename = self.run_conversion(pdf_filename) filing = "None" if self.enable_filing: filing = self.file_converted_file(ocr_pdffilename, pdf_filename) if self.enable_email: self._send_email(pdf_filename, ocr_pdffilename, filing) else: ocr_pdffilename = self.run_conversion(self.pdf_filename) filing = "None" if self.enable_filing: filing = self.file_converted_file(ocr_pdffilename, self.pdf_filename) if self.enable_email: self._send_email(self.pdf_filename, ocr_pdffilename, filing)
class PyPDFOCR(object): """ The main clas. Performs the following functions: * Parses command line options * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step * Runs a single file conversion: * Runs ghostscript to get tiff/jpg * Runs Tesseract-OCR to do the actual OCR * Takes the HOCR from Tesseract and creates a new PDF with the text overlay * Files the OCR'ed file in the proper place if specified * Files the original file if specified * """ def __init__(self): """ Initializes the GhostScript, Tesseract, and PDF helper classes. """ self.config = {} def _get_config_file(self, config_file): """ Read in the yaml config file :param config_file: Configuration file (YAML format) :type config_file: file :returns: dict of yaml file :rtype: dict """ with config_file: myconfig = yaml.load(config_file) return myconfig def get_options(self, argv): """ Parse the command-line options and set the following object properties: :param argv: usually just sys.argv[1:] :returns: Nothing :ivar debug: Enable logging debug statements :ivar verbose: Enable verbose logging :ivar enable_filing: Whether to enable post-OCR filing of PDFs :ivar pdf_filename: Filename for single conversion mode :ivar watch_dir: Directory to watch for files to convert :ivar config: Dict of the config file :ivar watch: Whether folder watching mode is turned on :ivar enable_evernote: Enable filing to evernote """ p = argparse.ArgumentParser( description= "Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.", epilog="PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__, ) p.add_argument('-d', '--debug', action='store_true', default=False, dest='debug', help='Turn on debugging') p.add_argument('-v', '--verbose', action='store_true', default=False, dest='verbose', help='Turn on verbose mode') p.add_argument('-m', '--mail', action='store_true', default=False, dest='mail', help='Send email after conversion') p.add_argument('-l', '--lang', default='eng', dest='lang', help='Language(default eng)') p.add_argument( '--preprocess', action='store_true', default=False, dest='preprocess', help= 'Enable preprocessing. Not really useful now with improved Tesseract 3.04+' ) p.add_argument('--skip-preprocess', action='store_true', default=False, dest='skip_preprocess', help='DEPRECATED: always skips now.') #--------- # Single or watch mode #-------- single_or_watch_group = p.add_mutually_exclusive_group(required=True) # Positional argument for single file conversion single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR") # Watch directory for watch mode single_or_watch_group.add_argument( '-w', '--watch', dest='watch_dir', help= 'Watch given directory and run ocr automatically until terminated') #----------- # Filing options #---------- filing_group = p.add_argument_group(title="Filing optinos") filing_group.add_argument('-f', '--file', action='store_true', default=False, dest='enable_filing', help='Enable filing of converted PDFs') #filing_group.add_argument('-c', '--config', type = argparse.FileType('r'), filing_group.add_argument( '-c', '--config', type=lambda x: open_file_with_timeout(p, x), dest='configfile', help='Configuration file for defaults and PDF filing') filing_group.add_argument('-e', '--evernote', action='store_true', default=False, dest='enable_evernote', help='Enable filing to Evernote') filing_group.add_argument( '-n', action='store_true', default=False, dest='match_using_filename', help= 'Use filename to match if contents did not match anything, before filing to default folder' ) # Add flow option to single mode extract_images,preprocess,ocr,write args = p.parse_args(argv) self.debug = args.debug self.verbose = args.verbose self.pdf_filename = args.pdf_filename self.lang = args.lang self.watch_dir = args.watch_dir self.enable_email = args.mail self.match_using_filename = args.match_using_filename # Deprecating skip_preprocess to make skipping the default (always true). Tesseract 3.04 is so much better now # at handling non-ideal inputs and lines if args.skip_preprocess: print( "Warning: --skip_preprocess is not needed anymore (defaults to skipping preprocessing). If you want to enable preprocessing, use the new --preprocess option" ) self.skip_preprocess = True if args.preprocess: self.skip_preprocess = False if self.debug: logging.basicConfig(level=logging.DEBUG, format='%(message)s') if self.verbose: logging.basicConfig(level=logging.INFO, format='%(message)s') # Parse configuration file (YAML) if specified if args.configfile: self.config = self._get_config_file(args.configfile) logging.debug("Read in configuration file") logging.debug(self.config) if args.enable_evernote: self.enable_evernote = True else: self.enable_evernote = False if args.enable_filing or args.enable_evernote: self.enable_filing = True if not args.configfile: p.error( "Please specify a configuration file(CONFIGFILE) to enable filing" ) else: self.enable_filing = False self.watch = False if args.watch_dir: logging.debug("Starting to watch") self.watch = True if self.enable_email: if not args.configfile: p.error( "Please specify a configuration file(CONFIGFILE) to enable email" ) def _clean_up_files(self, files): """ Helper function to delete files :param files: List of files to delete :type files: list :returns: None """ for f in files: try: os.remove(f) except: logging.debug("Error removing file %s .... continuing" % f) def _setup_filing(self): """ Instance the proper PyFiler object (either :class:`pypdfocr.pypdfocr_filer_dirs.PyFilerDirs` or :class:`pypdfocr.pypdfocr_filer_evernote.PyFilerEvernote`) TODO: Make this more generic to allow third-party plugin filing objects :ivar filer: :class:`pypdfocr.pypdfocr_filer.PyFiler` PyFiler subclass object that is instantiated :ivar pdf_filer: :class:`pypdfocr.pypdfocr_pdffiler.PyPdfFiler` object to help with PDF reading :returns: Nothing """ # Look at self.config and create a self.pdf_filer object # -------------------------------------------------- # Some sanity checks # -------------------------------------------------- assert (self.config and self.enable_filing) for required in ['target_folder', 'default_folder']: if not required in self.config: error("%s must be specified in config file" % required) else: # Make sure these required folders are in abspath format self.config[required] = os.path.abspath(self.config[required]) if 'original_move_folder' in self.config: # User wants to move the original after filing orig = 'original_move_folder' self.config[orig] = os.path.abspath(self.config[orig]) if not os.path.exists(self.config[orig]): os.makedirs(self.config[orig]) original_move_folder = self.config[orig] else: original_move_folder = None # -------------------------------------------------- # Start the filing object # -------------------------------------------------- if self.enable_evernote: self.filer = PyFilerEvernote( self.config['evernote_developer_token']) else: self.filer = PyFilerDirs() self.filer.target_folder = self.config['target_folder'] self.filer.default_folder = self.config['default_folder'] self.filer.original_move_folder = original_move_folder self.pdf_filer = PyPdfFiler(self.filer) if self.match_using_filename: print("Matching using filename as a fallback to pdf contents") self.pdf_filer.file_using_filename = True # ------------------------------ # Add all the folder names with associated keywords # to the filer object # ------------------------------ keyword_count = 0 folder_count = 0 if 'folders' in self.config: for folder, keywords in self.config['folders'].items(): folder_count += 1 keyword_count += len(keywords) # Make sure keywords are lower-cased before adding keywords = [str(x).lower() for x in keywords] self.filer.add_folder_target(folder, keywords) print("Filing of PDFs is enabled") print(" - %d target filing folders" % (folder_count)) print(" - %d keywords" % (keyword_count)) def _setup_external_tools(self): """ Instantiate the external tool wrappers with their config dicts """ self.gs = PyGs(self.config.get('ghostscript', {})) self.ts = PyTesseract(self.config.get('tesseract', {})) self.pdf = PyPdf(self.gs) self.preprocess = PyPreprocess(self.config.get('preprocess', {})) return def run_conversion(self, pdf_filename): """ Does the following: - Convert the PDF using GhostScript to TIFF and JPG - Run Tesseract on the TIFF to extract the text into HOCR (html) - Use PDF generator to overlay the text on the JPG and output a new PDF - Clean up temporary image files :param pdf_filename: Scanned PDF :type pdf_filename: string :returns: OCR'ed PDF :rtype: filename string """ print("Starting conversion of %s" % pdf_filename) try: # Make the images for Tesseract img_dpi, glob_img_filename = self.gs.make_img_from_pdf( pdf_filename) fns = glob.glob(glob_img_filename) except Exception: raise try: # Preprocess if not self.skip_preprocess: preprocess_imagefilenames = self.preprocess.preprocess(fns) else: logging.info("Skipping preprocess step") preprocess_imagefilenames = fns # Run teserract self.ts.lang = self.lang hocr_filenames = self.ts.make_hocr_from_pnms( preprocess_imagefilenames) # Generate new pdf with overlayed text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename) ocr_pdf_filename = self.pdf.overlay_hocr_pages( img_dpi, hocr_filenames, pdf_filename) finally: # Clean up the files time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing if locals().has_key( "fns" ): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) if locals().has_key( "preprocess_imagefilenames" ): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files( preprocess_imagefilenames ) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: fns_to_remove = [ os.path.splitext(fn)[0] + ext for fn in preprocess_imagefilenames ] logging.info("Cleaning up %s" % fns_to_remove) self._clean_up_files( fns_to_remove ) # splat the hocr_filenames as it is a list of pairs # clean up the hocr input (jpg) and output (html) files #self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs # Seems like newer tessearct > 3.03 is now creating .txt files with the OCR text?/? #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames]) print("Completed conversion successfully to %s" % ocr_pdf_filename) return ocr_pdf_filename def file_converted_file(self, ocr_pdffilename, original_pdffilename): """ move the converted filename to its destiantion directory. Optionally also moves the original PDF. :param ocr_pdffilename: Converted PDF file :type ocr_pdffilename: filename string :param original_pdffilename: Original scanned PDF file :type original_pdffilename: filename string :returns: Target folder name "rtype: string """ filed_path = self.pdf_filer.move_to_matching_folder(ocr_pdffilename) print("Filed %s to %s as %s" % (ocr_pdffilename, os.path.dirname(filed_path), os.path.basename(filed_path))) tgt_path = self.pdf_filer.file_original(original_pdffilename) if tgt_path != original_pdffilename: print("Filed original file %s to %s as %s" % (original_pdffilename, os.path.dirname(tgt_path), os.path.basename(tgt_path))) return os.path.dirname(filed_path) def _send_email(self, infilename, outfilename, filing): """ Send email using smtp """ print("Sending email status") from_addr = self.config["mail_from_addr"] to_addr_list = self.config["mail_to_list"] smtpserver = self.config["mail_smtp_server"] login = self.config["mail_smtp_login"] password = self.config["mail_smtp_password"] subject = "PyPDFOCR converted: %s" % (os.path.basename(outfilename)) header = 'From: %s\n' % login header += 'To: %s\n' % ','.join(to_addr_list) header += 'Subject: %s\n\n' % subject message = """ PyPDFOCR Conversion: -------------------- Original file: %s Converted file: %s Filing: %s """ % (infilename, outfilename, filing) message = header + message server = smtplib.SMTP(smtpserver) server.starttls() server.login(login, password) problems = server.sendmail(from_addr, to_addr_list, message) server.quit() def go(self, argv): """ The main entry point into PyPDFOCR #. Parses options #. If filing is enabled, call :func:`_setup_filing` #. If watch is enabled, start the watcher #. :func:`run_conversion` #. if filing is enabled, call :func:`file_converted_file` """ # Read the command line options self.get_options(argv) # Setup tesseract and ghostscript self._setup_external_tools() # Setup the pdf filing if enabled if self.enable_filing: self._setup_filing() # Do the actual conversion followed by optional filing and email if self.watch: while True: # Make sure the watcher doesn't terminate try: py_watcher = PyPdfWatcher(self.watch_dir, self.config.get('watch')) for pdf_filename in py_watcher.start(): self._convert_and_file_email(pdf_filename) except KeyboardInterrupt: break except Exception as e: print traceback.print_exc(e) py_watcher.stop() else: self._convert_and_file_email(self.pdf_filename) def _convert_and_file_email(self, pdf_filename): """ Helper function to run the conversion, then do the optional filing, and optional emailing. """ ocr_pdffilename = self.run_conversion(pdf_filename) if self.enable_filing: filing = self.file_converted_file(ocr_pdffilename, pdf_filename) else: filing = "None" if self.enable_email: self._send_email(pdf_filename, ocr_pdffilename, filing)
class PyPDFOCR(object): """ The main clas. Performs the following functions: * Parses command line options * Optionally just watches a directory for new PDF's to OCR; once a file appears, it does the next step * Runs a single file conversion: * Runs ghostscript to get tiff/jpg * Runs Tesseract-OCR to do the actual OCR * Takes the HOCR from Tesseract and creates a new PDF with the text overlay * """ def __init__(self): """ Initializes the GhostScript, Tesseract, and PDF helper classes. """ self.config = {} def get_options(self, argv): """ Parse the command-line options and set the following object properties: :param argv: usually just sys.argv[1:] :returns: Nothing :ivar debug: Enable logging debug statements :ivar verbose: Enable verbose logging :ivar enable_filing: Whether to enable post-OCR filing of PDFs :ivar pdf_filename: Filename for single conversion mode :ivar watch_dir: Directory to watch for files to convert :ivar config: Dict of the config file :ivar watch: Whether folder watching mode is turned on :ivar enable_evernote: Enable filing to evernote """ p = argparse.ArgumentParser( description= "Convert scanned PDFs into their OCR equivalent. Depends on GhostScript and Tesseract-OCR being installed.", epilog="PyPDFOCR version %s (Copyright 2013 Virantha Ekanayake)" % __version__, ) p.add_argument('-d', '--debug', action='store_true', default=False, dest='debug', help='Turn on debugging') p.add_argument('-v', '--verbose', action='store_true', default=False, dest='verbose', help='Turn on verbose mode') #--------- # Single or watch mode #-------- single_or_watch_group = p.add_mutually_exclusive_group(required=True) # Positional argument for single file conversion single_or_watch_group.add_argument("pdf_filename", nargs="?", help="Scanned pdf file to OCR") # Watch directory for watch mode single_or_watch_group.add_argument( '-w', '--watch', dest='watch_dir', help= 'Watch given directory and run ocr automatically until terminated') # Add flow option to single mode extract_images,preprocess,ocr,write args = p.parse_args(argv) self.debug = args.debug self.verbose = args.verbose self.pdf_filename = args.pdf_filename if self.debug: logging.basicConfig(level=logging.DEBUG, format='%(message)s') if self.verbose: logging.basicConfig(level=logging.INFO, format='%(message)s') def _clean_up_files(self, files): """ Helper function to delete files :param files: List of files to delete :type files: list :returns: None """ for f in files: try: os.remove(f) except: logging.debug("Error removing file %s .... continuing" % f) def _setup_external_tools(self): """ Instantiate the external tool wrappers with their config dicts """ self.gs = PyGs(self.config.get('ghostscript', {})) self.ts = PyTesseract(self.config.get('tesseract', {})) self.pdf = PyPdf(self.gs) self.preprocess = PyPreprocess(self.config.get('preprocess', {})) return def run_conversion(self, pdf_filename): """ Does the following: - Convert the PDF using GhostScript to TIFF and JPG - Run Tesseract on the TIFF to extract the text into HOCR (html) - Use PDF generator to overlay the text on the JPG and output a new PDF - Clean up temporary image files :param pdf_filename: Scanned PDF :type pdf_filename: string :returns: OCR'ed PDF :rtype: filename string """ print("Starting conversion of %s" % pdf_filename) try: # Make the images for Tesseract img_dpi, glob_img_filename = self.gs.make_img_from_pdf( pdf_filename) fns = glob.glob(glob_img_filename) except Exception: raise try: # Preprocess if not self.skip_preprocess: preprocess_imagefilenames = self.preprocess.preprocess(fns) else: logging.info("Skipping preprocess step") preprocess_imagefilenames = fns # Run teserract self.ts.lang = self.lang hocr_filenames = self.ts.make_hocr_from_pnms( preprocess_imagefilenames) # Generate new pdf with overlayed text #ocr_pdf_filename = self.pdf.overlay_hocr(tiff_dpi, hocr_filename, pdf_filename) ocr_pdf_filename = self.pdf.overlay_hocr_pages( img_dpi, hocr_filenames, pdf_filename) finally: # Clean up the files time.sleep(1) if not self.debug: # Need to clean up the original image files before preprocessing if locals().has_key( "fns" ): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % fns) self._clean_up_files(fns) if locals().has_key( "preprocess_imagefilenames" ): # Have to check if this was set before exception raised logging.info("Cleaning up %s" % preprocess_imagefilenames) self._clean_up_files( preprocess_imagefilenames ) # splat the hocr_filenames as it is a list of pairs for ext in [".hocr", ".html", ".txt"]: fns_to_remove = [ os.path.splitext(fn)[0] + ext for fn in preprocess_imagefilenames ] logging.info("Cleaning up %s" % fns_to_remove) self._clean_up_files( fns_to_remove ) # splat the hocr_filenames as it is a list of pairs # clean up the hocr input (jpg) and output (html) files #self._clean_up_files(itertools.chain(*hocr_filenames)) # splat the hocr_filenames as it is a list of pairs # Seems like newer tessearct > 3.03 is now creating .txt files with the OCR text?/? #self._clean_up_files([x[1].replace(".hocr", ".txt") for x in hocr_filenames]) print("Completed conversion successfully to %s" % ocr_pdf_filename) return ocr_pdf_filename def go(self, argv): """ The main entry point into PyPDFOCR #. Parses options #. :func:`run_conversion` """ # Read the command line options self.get_options(argv) # Setup tesseract and ghostscript self._setup_external_tools() # Setup the pdf filing if enabled if self.enable_filing: self._setup_filing() # Do the actual conversion self._convert_and_file_email(self.pdf_filename) def _convert_and_file_email(self, pdf_filename): """ Helper function to run the conversion, then do the optional filing, and optional emailing. """ ocr_pdffilename = self.run_conversion(pdf_filename) filing = "None"