def do_document_ocr(queue_document):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling tesseract

    For each page that falls back to visual OCR, the page image is
    converted, passed through unpaper, converted to the OCR file format and
    handed to tesseract.  The cleaned-up text is stored in the page's
    ``content``, the page is labelled as OCR text and saved.

    queue_document: the queue entry; it supplies the document whose pages
    are processed and the transformation list applied before unpaper.

    Only ParserError and ParserUnknownFile trigger the OCR fallback; any
    other exception, including one raised by the OCR steps themselves,
    propagates to the caller after the intermediate files recorded so far
    have been removed.
    """
    for document_page in queue_document.document.pages.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
            # The second returned value is not used here.  It must not be
            # bound to ``_``: that name is the translation function called
            # further down.
            ocr_transformations, _warnings = (
                queue_document.get_transformation_list())

            document_filepath = document_page.document.get_image_cache_name(
                page=document_page.page_number)

            unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (
                document_page.document.uuid, document_page.page_number,
                os.extsep, UNPAPER_FILE_FORMAT)
            unpaper_output_filepath = os.path.join(
                TEMPORARY_DIRECTORY, unpaper_output_filename)

            # Every path that may end up on disk is recorded as soon as it
            # is known, so a failure in any step (not only tesseract) still
            # removes whatever was produced up to that point.
            temporary_filepaths = [document_filepath, unpaper_output_filepath]
            try:
                unpaper_input = convert(
                    document_filepath, file_format=UNPAPER_FILE_FORMAT,
                    transformations=ocr_transformations)
                temporary_filepaths.append(unpaper_input)

                execute_unpaper(
                    input_filepath=unpaper_input,
                    output_filepath=unpaper_output_filepath)

                # Convert to the file format used for the OCR step
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output_filepath,
                    file_format=DEFAULT_OCR_FILE_FORMAT)
                temporary_filepaths.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
                temporary_filepaths.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(
                    pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u'Text from OCR')
                document_page.save()
            finally:
                # Paths that never materialised (or were renamed away) are
                # skipped so cleanup is only asked to remove real files.
                for temporary_filepath in temporary_filepaths:
                    if os.path.exists(temporary_filepath):
                        cleanup(temporary_filepath)
def do_document_ocr(queue_document):
    """
    Try first to extract text from document pages using the registered
    parser, if the parser fails or if there is no parser registered for
    the document mimetype do a visual OCR by calling tesseract

    For each page that falls back to visual OCR, the page image is
    converted, passed through unpaper, converted to the OCR file format and
    handed to tesseract.  The cleaned-up text is stored in the page's
    ``content``, the page is labelled as OCR text and saved.

    queue_document: the queue entry; it supplies the document whose pages
    are processed and the transformation list applied before unpaper.

    Only ParserError and ParserUnknownFile trigger the OCR fallback; any
    other exception, including one raised by the OCR steps themselves,
    propagates to the caller after the intermediate files recorded so far
    have been removed.
    """
    for document_page in queue_document.document.documentpage_set.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
            # The second returned value is not used here.  It must not be
            # bound to ``_``: that name is the translation function called
            # further down.
            ocr_transformations, _warnings = queue_document.get_transformation_list()

            document_filepath = document_page.document.get_image_cache_name(
                page=document_page.page_number
            )

            unpaper_output_filename = u"%s_unpaper_out_page_%s%s%s" % (
                document_page.document.uuid,
                document_page.page_number,
                os.extsep,
                UNPAPER_FILE_FORMAT,
            )
            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)

            # Every path that may end up on disk is recorded as soon as it
            # is known, so a failure in any step (not only tesseract) still
            # removes whatever was produced up to that point.
            temporary_filepaths = [document_filepath, unpaper_output_filepath]
            try:
                unpaper_input = convert(
                    document_filepath,
                    file_format=UNPAPER_FILE_FORMAT,
                    transformations=ocr_transformations,
                )
                temporary_filepaths.append(unpaper_input)

                execute_unpaper(
                    input_filepath=unpaper_input,
                    output_filepath=unpaper_output_filepath,
                )

                # Convert to the file format used for the OCR step
                pre_ocr_filepath = convert(
                    input_filepath=unpaper_output_filepath,
                    file_format=DEFAULT_OCR_FILE_FORMAT,
                )
                temporary_filepaths.append(pre_ocr_filepath)

                # Tesseract needs an explicit file extension
                pre_ocr_filepath_w_ext = os.extsep.join(
                    [pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]
                )
                temporary_filepaths.append(pre_ocr_filepath_w_ext)
                os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)

                ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u"Text from OCR")
                document_page.save()
            finally:
                # Paths that never materialised (or were renamed away) are
                # skipped so cleanup is only asked to remove real files.
                for temporary_filepath in temporary_filepaths:
                    if os.path.exists(temporary_filepath):
                        cleanup(temporary_filepath)
def setUp(self):
    """
    Create a document type and a document, upload the sample PDF from the
    project's ``contrib`` directory as a new version and run text
    extraction on the first page of the latest version
    """
    from ocr.parsers import parse_document_page

    self.document_type = DocumentType(name='test doc type')
    self.document_type.save()

    self.document = Document(
        document_type=self.document_type,
        description='description',
    )
    self.document.save()

    sample_filepath = os.path.join(
        settings.PROJECT_ROOT, 'contrib', 'mayan_11_1.pdf')
    # A PDF is binary data, so it is read in binary mode; the with-block
    # closes the handle even when new_version() raises.
    with open(sample_filepath, 'rb') as file_object:
        self.document.new_version(
            file=File(file_object, name='mayan_11_1.pdf'))

    # Text extraction on the first page only
    parse_document_page(self.document.latest_version.pages.all()[0])
def setUp(self):
    """
    Create a document type and a document, upload the file at
    TEST_DOCUMENT_PATH as a new version and run text extraction on the
    first page of the latest version
    """
    from ocr.parsers import parse_document_page

    self.document_type = DocumentType(name='test doc type')
    self.document_type.save()

    self.document = Document(
        document_type=self.document_type,
        description='description',
    )
    self.document.save()

    # The upload is named as a PDF, i.e. binary data, so the file is read
    # in binary mode rather than the default text mode.
    with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
        self.document.new_version(
            file=File(file_object, name='mayan_11_1.pdf'))

    # Text extraction on the first page only
    parse_document_page(self.document.latest_version.pages.all()[0])