def get_text(self):
    """
    Return the text of the document, running OCR only when required.

    The result is memoised in ``self._text``. Unless
    ``settings.OCR_ALWAYS`` is set, a document for which
    ``self._is_ocred()`` is true has its text read straight from the PDF.
    Otherwise the page in the middle of the document is OCRed with the
    default language and used to guess the document language; the whole
    document is then OCRed with the guessed language when it is usable,
    or with the default language when it is not.

    Raises:
        ParseError: if there are no page images, or if OCR fails.
    """
    if self._text is not None:
        return self._text

    if not settings.OCR_ALWAYS and self._is_ocred():
        self.log("debug", "Skipping OCR, using Text from PDF")
        self._text = get_text_from_pdf(self.document_path)
        return self._text

    images = self._get_greyscale()
    if not images:
        raise ParseError("Empty document, nothing to do.")

    try:
        middle = len(images) // 2
        self.log(
            "debug",
            f"Attempting language detection on page "
            f"{middle + 1} of {len(images)}...")

        middle_text = self._ocr([images[middle]], settings.OCR_LANGUAGE)[0]
        language = self._guess_language(middle_text)

        # Decide whether the guessed language can be used; every reason
        # for rejecting it is logged separately.
        if not language or language not in ISO639:
            self.log("warning", "Language detection failed.")
            use_default_language = True
        elif ISO639[language] == settings.OCR_LANGUAGE:
            self.log(
                "debug",
                f"Detected language: {language} "
                f"(default language)")
            use_default_language = True
        elif (ISO639[language] not in
                pyocr.get_available_tools()[0].get_available_languages()):
            self.log(
                "warning",
                f"Detected language {language} is not available "
                f"on this system.")
            use_default_language = True
        else:
            self.log("debug", f"Detected language: {language}")
            use_default_language = False

        if use_default_language:
            # Hands over the already OCRed sample page together with
            # its index.
            pages = self._complete_ocr_default_language(
                images, middle, middle_text)
        else:
            pages = self._ocr(images, ISO639[language])

        self.log("debug", "OCR completed.")
        self._text = strip_excess_whitespace(" ".join(pages))
        return self._text
    except OCRError as e:
        raise ParseError(e)
def call_convert(input_file, output_file, **kwargs):
    """
    Forward a conversion request to ``run_convert``, rejecting PDF inputs.

    Any ``input_file`` that contains the substring ``".pdf"`` (anywhere,
    not only as a suffix) is refused with a ParseError. Everything else is
    passed to ``run_convert`` together with the extra keyword arguments.

    Raises:
        ParseError: if ``".pdf"`` occurs in ``input_file``.
    """
    refuses_input = ".pdf" in input_file
    if refuses_input:
        raise ParseError("Does not compute.")

    run_convert(input_file=input_file, output_file=output_file, **kwargs)
def get_thumbnail(self):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    ``convert`` is tried first on page 0 of ``self.document_path``. If
    that raises ParseError, Ghostscript renders the document to a PNG and
    ``convert`` is run again on that PNG.

    Returns:
        The path of the generated thumbnail inside ``self.tempdir``.

    Raises:
        ParseError: if Ghostscript exits with a non-zero status (or if the
            second ``run_convert`` call raises).
    """
    out_path = os.path.join(self.tempdir, "convert.png")

    # Options shared by the direct attempt and the Ghostscript fallback.
    convert_options = (
        "-scale", "500x5000",
        "-alpha", "remove",
        "-strip",
        "-trim",
    )

    # Run convert to get a decent thumbnail
    try:
        run_convert(self.CONVERT, *convert_options,
                    f"{self.document_path}[0]", out_path)
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        self.log(
            "warning",
            "Thumbnail generation with ImageMagick failed, "
            "falling back to Ghostscript.")
        gs_out_path = os.path.join(self.tempdir, "gs_out.png")
        cmd = [
            self.GHOSTSCRIPT,
            "-q",
            "-sDEVICE=pngalpha",
            "-o", gs_out_path,
            self.document_path,
        ]
        exit_status = subprocess.Popen(cmd).wait()
        if exit_status != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))

        # then run convert on the output from gs
        run_convert(self.CONVERT, *convert_options, gs_out_path, out_path)

    return out_path
def parse(self, document_path, mime_type):
    """
    Extract text and creation date via Tika and produce an archive PDF.

    On success ``self.text``, ``self.date`` and ``self.archive_path`` are
    set. Missing or malformed content yields an empty text, and a missing
    or unparsable creation date yields ``None``; neither aborts parsing.

    Args:
        document_path: path of the document to send to the Tika server.
        mime_type: accepted for interface compatibility; not used here.

    Raises:
        ParseError: if the Tika request fails with an HTTP error.
    """
    self.log("info", f"[TIKA_PARSE] Sending {document_path} to Tika server")

    try:
        parsed = parser.from_file(document_path)
    except requests.exceptions.HTTPError as err:
        raise ParseError(
            f"Could not parse {document_path} with tika server: {err}"
        ) from err

    # Best effort: any problem with the response shape results in empty
    # text. ``Exception`` (rather than a bare ``except``) keeps
    # KeyboardInterrupt and SystemExit propagating.
    try:
        content = parsed["content"].strip()
    except Exception:
        content = ""

    # Best effort as well: the date is optional metadata.
    try:
        creation_date = dateutil.parser.isoparse(
            parsed["metadata"]["Creation-Date"])
    except Exception:
        creation_date = None

    archive_path = os.path.join(self.tempdir, "convert.pdf")
    convert_to_pdf(self, document_path, archive_path)

    self.archive_path = archive_path
    self.date = creation_date
    self.text = content
def get_text(self):
    """
    OCR the greyscale page images of the document and return the result.

    Raises:
        ParseError: wrapping any OCRError raised by ``self._get_ocr``.
    """
    greyscale_pages = self._get_greyscale()

    try:
        recognised = self._get_ocr(greyscale_pages)
    except OCRError as ocr_failure:
        raise ParseError(ocr_failure)

    return recognised
def run_command(*args):
    """
    Run a command line through the shell with ImageMagick limits applied.

    The arguments are joined with single spaces and executed with
    ``shell=True``. ``MAGICK_MEMORY_LIMIT`` and ``MAGICK_TMPDIR`` are added
    to a copy of the current environment when the corresponding settings
    (``CONVERT_MEMORY_LIMIT``, ``CONVERT_TMPDIR``) are truthy.

    Raises:
        ParseError: if the command exits with a non-zero status.
    """
    # NOTE(review): the joined string is interpreted by the shell, so
    # arguments containing spaces or shell metacharacters are not passed
    # through literally - confirm that no caller passes untrusted values.
    command_line = ' '.join(args)

    child_env = dict(os.environ)
    memory_limit = settings.CONVERT_MEMORY_LIMIT
    if memory_limit:
        child_env["MAGICK_MEMORY_LIMIT"] = memory_limit
    tmpdir = settings.CONVERT_TMPDIR
    if tmpdir:
        child_env["MAGICK_TMPDIR"] = tmpdir

    exit_status = subprocess.Popen(
        command_line, env=child_env, shell=True).wait()
    if exit_status != 0:
        raise ParseError("Convert failed at {}".format(args))
def get_thumbnail(self, document_path, mime_type):
    """
    Render a PNG thumbnail from the first page of the archived PDF.

    The image is produced from ``self.archive_path``; ``document_path`` is
    only used in the log message. ``run_convert`` is tried first. If it
    raises ParseError, Ghostscript renders the archive to a PNG and
    ``run_convert`` is applied to that PNG instead.

    Args:
        document_path: path of the original document (logging only).
        mime_type: accepted for interface compatibility; not used here.

    Returns:
        The path of the generated thumbnail inside ``self.tempdir``.

    Raises:
        ParseError: if Ghostscript exits with a non-zero status (or if the
            second ``run_convert`` call raises).
    """
    # The original message lacked the space before the path.
    self.log("info",
             f"[TIKA_THUMB] Generating thumbnail for {document_path}")

    archive_path = self.archive_path
    out_path = os.path.join(self.tempdir, "convert.png")

    # Run convert to get a decent thumbnail
    try:
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            input_file="{}[0]".format(archive_path),
            output_file=out_path,
            logging_group=self.logging_group,
        )
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        self.log(
            "warning",
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!",
        )
        gs_out_path = os.path.join(self.tempdir, "gs_out.png")
        cmd = [
            settings.GS_BINARY,
            "-q",
            "-sDEVICE=pngalpha",
            "-o",
            gs_out_path,
            archive_path,
        ]
        if subprocess.Popen(cmd).wait() != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))

        # then run convert on the output from gs
        run_convert(
            density=300,
            scale="500x5000>",
            alpha="remove",
            strip=True,
            trim=False,
            input_file=gs_out_path,
            output_file=out_path,
            logging_group=self.logging_group,
        )

    return out_path
def get_text(self):
    """
    Return the document text, caching it in ``self.TEXT_CACHE``.

    A cached value is returned as is. Otherwise, unless ``self.OCR_ALWAYS``
    is set, a document for which ``self._is_ocred()`` is true has its text
    read from the PDF; all other documents are OCRed from their greyscale
    page images.

    Raises:
        ParseError: wrapping any OCRError raised by ``self._get_ocr``.
    """
    if self.TEXT_CACHE is not None:
        return self.TEXT_CACHE

    if not self.OCR_ALWAYS and self._is_ocred():
        self.log("info", "Skipping OCR, using Text from PDF")
        self.TEXT_CACHE = get_text_from_pdf(self.document_path)
        return self.TEXT_CACHE

    greyscale_pages = self._get_greyscale()

    try:
        self.TEXT_CACHE = self._get_ocr(greyscale_pages)
    except OCRError as ocr_failure:
        raise ParseError(ocr_failure)

    return self.TEXT_CACHE
def get_thumbnail(self, document_path, mime_type):
    """
    The thumbnail of a PDF is just a 500px wide image of the first page.

    ``run_convert`` is tried first on page 0 of ``document_path``. If that
    raises ParseError, Ghostscript renders the document to a PNG and
    ``run_convert`` is applied to that PNG instead.

    Args:
        document_path: path of the document to render.
        mime_type: accepted for interface compatibility; not used here.

    Returns:
        The path of the generated thumbnail inside ``self.tempdir``.

    Raises:
        ParseError: if Ghostscript exits with a non-zero status (or if the
            second ``run_convert`` call raises).
    """
    out_path = os.path.join(self.tempdir, "convert.png")

    # Options shared by the direct attempt and the Ghostscript fallback.
    shared_options = {
        "density": 300,
        "scale": "500x5000>",
        "alpha": "remove",
        "strip": True,
        "trim": False,
        "auto_orient": True,
    }

    # Run convert to get a decent thumbnail
    try:
        run_convert(**shared_options,
                    input_file=f"{document_path}[0]",
                    output_file=out_path,
                    logging_group=self.logging_group)
    except ParseError:
        # if convert fails, fall back to extracting
        # the first PDF page as a PNG using Ghostscript
        self.log(
            'warning',
            "Thumbnail generation with ImageMagick failed, falling back "
            "to ghostscript. Check your /etc/ImageMagick-x/policy.xml!")
        gs_out_path = os.path.join(self.tempdir, "gs_out.png")
        cmd = [
            settings.GS_BINARY,
            "-q",
            "-sDEVICE=pngalpha",
            "-o", gs_out_path,
            document_path,
        ]
        exit_status = subprocess.Popen(cmd).wait()
        if exit_status != 0:
            raise ParseError("Thumbnail (gs) failed at {}".format(cmd))

        # then run convert on the output from gs
        run_convert(**shared_options,
                    input_file=gs_out_path,
                    output_file=out_path,
                    logging_group=self.logging_group)

    return out_path
def convert_to_pdf(self, document_path, pdf_path):
    """
    Convert a document to PDF through a Gotenberg server.

    The server address is read from the ``PAPERLESS_GOTENBERG`` environment
    variable (default ``http://localhost:3000``). The converted PDF is
    written to ``convert.pdf`` inside ``self.tempdir``.

    Args:
        document_path: path of the document to upload.
        pdf_path: NOTE(review): this argument is ignored; the output always
            goes to ``<self.tempdir>/convert.pdf``, as before. Confirm
            whether callers expect their own path to be honoured.

    Raises:
        ParseError: if the request to the Gotenberg server fails, either
            because the server cannot be reached or because it answers
            with an error status.
    """
    pdf_path = os.path.join(self.tempdir, "convert.pdf")
    gotenberg_server = os.getenv("PAPERLESS_GOTENBERG",
                                 "http://localhost:3000")
    url = gotenberg_server + "/convert/office"

    self.log("info",
             f"[TIKA] Converting {document_path} to PDF as {pdf_path}")

    headers = {}

    # The context manager closes the upload handle even when the request
    # fails; the original never closed it.
    with open(document_path, "rb") as source:
        files = {"files": source}
        try:
            response = requests.post(url, files=files, headers=headers)
            response.raise_for_status()  # ensure we notice bad responses
        except requests.exceptions.RequestException as err:
            # RequestException also covers connection failures and
            # timeouts, which is what the message below describes;
            # HTTPError is one of its subclasses.
            raise ParseError(
                f"Could not contact gotenberg server at "
                f"{gotenberg_server}: {err}"
            ) from err

    with open(pdf_path, "wb") as target:
        target.write(response.content)
def test_parser_error(self, m):
    """
    When the patched callable ``m`` always raises ParseError, the
    migration retries, gives up, and leaves both documents without
    archive checksum and archive filename.
    """
    m.side_effect = ParseError()

    Document = self.apps.get_model("documents", "Document")
    doc1 = make_test_document(Document, "document", "image/png",
                              simple_png, "document.png", simple_pdf)
    doc2 = make_test_document(Document, "document", "application/pdf",
                              simple_jpg, "document.jpg", simple_pdf)

    for created in (doc1, doc2):
        self.assertIsNotNone(created.archive_checksum)

    def count_log_lines(lines, fragment):
        # Number of captured log lines that contain the given fragment.
        return sum(1 for line in lines if fragment in line)

    with self.assertLogs() as capture:
        self.performMigration()

    self.assertEqual(m.call_count, 6)
    self.assertEqual(
        count_log_lines(capture.output,
                        "Parse error, will try again in 5 seconds"),
        4)
    self.assertEqual(
        count_log_lines(capture.output,
                        "Unable to regenerate archive document for ID:"),
        2)

    # Fetch the model again before reloading both documents.
    Document = self.apps.get_model("documents", "Document")
    doc1 = Document.objects.get(id=doc1.id)
    doc2 = Document.objects.get(id=doc2.id)

    self.assertIsNone(doc1.archive_checksum)
    self.assertIsNone(doc2.archive_checksum)
    self.assertIsNone(doc1.archive_filename)
    self.assertIsNone(doc2.archive_filename)
def parse(self, document_path, mime_type, file_name=None):
    """
    Extract text and creation date via Tika and produce an archive PDF.

    Sets ``self.text``, ``self.date`` (only when a creation date can be
    parsed) and ``self.archive_path``.

    Args:
        document_path: path of the document to send to the Tika server.
        mime_type: accepted for interface compatibility; not used here.
        file_name: optional name forwarded to ``self.convert_to_pdf``.

    Raises:
        ParseError: if the Tika request fails for any reason.
    """
    self.log("info", f"Sending {document_path} to Tika server")
    tika_server = settings.PAPERLESS_TIKA_ENDPOINT

    try:
        parsed = parser.from_file(document_path, tika_server)
    except Exception as err:
        raise ParseError(
            f"Could not parse {document_path} with tika server at "
            f"{tika_server}: {err}") from err

    # A missing or empty ``content`` entry (presumably what Tika reports
    # for documents without extractable text - TODO confirm) results in
    # empty text instead of a KeyError/AttributeError.
    content = parsed.get("content")
    self.text = content.strip() if content else ""

    # The creation date is optional: failure to read it is only logged.
    try:
        self.date = dateutil.parser.isoparse(
            parsed["metadata"]["Creation-Date"])
    except Exception as e:
        self.log(
            "warning", f"Unable to extract date for document "
            f"{document_path}: {e}")

    self.archive_path = self.convert_to_pdf(document_path, file_name)
def convert_to_pdf(self, document_path, file_name):
    """
    Convert a document to PDF through the configured Gotenberg server.

    The document is uploaded to the ``/convert/office`` route of
    ``settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT`` and the response body
    is written to ``convert.pdf`` inside ``self.tempdir``.

    Args:
        document_path: path of the document to upload.
        file_name: name to report for the uploaded file; when falsy, the
            base name of ``document_path`` is used.

    Returns:
        The path of the written PDF.

    Raises:
        ParseError: if the request fails or the server answers with an
            error status.
    """
    pdf_path = os.path.join(self.tempdir, "convert.pdf")
    gotenberg_server = settings.PAPERLESS_TIKA_GOTENBERG_ENDPOINT
    url = gotenberg_server + "/convert/office"

    self.log("info", f"Converting {document_path} to PDF as {pdf_path}")

    upload_name = file_name or os.path.basename(document_path)
    headers = {}

    # The context manager closes the upload handle even when the request
    # fails; the original never closed it.
    with open(document_path, "rb") as source:
        files = {"files": (upload_name, source)}
        try:
            response = requests.post(url, files=files, headers=headers)
            response.raise_for_status()  # ensure we notice bad responses
        except Exception as err:
            raise ParseError(
                f"Error while converting document to PDF: {err}"
            ) from err

    with open(pdf_path, "wb") as target:
        target.write(response.content)

    return pdf_path
def get_text(self):
    """
    OCR the images of the document and return the result.

    Raises:
        ParseError: wrapping any exception raised while fetching the
            images or running OCR on them.
    """
    try:
        return self._ocr(self._get_images())
    except Exception as failure:
        raise ParseError(failure)
def parse(self, document_path, mime_type, file_name=None):
    """
    Run OCRmyPDF on a document and store the resulting text and archive.

    Outline:

    * For PDFs the existing text is extracted first. In
      ``skip_noarchive`` mode a PDF with more than 50 characters of text
      is accepted as is and OCRmyPDF is not called.
    * Otherwise OCRmyPDF is run with the configured parameters. On
      success ``self.archive_path`` and ``self.text`` are set.
    * An encrypted PDF falls back to the text of the original file.
    * When no text was found, or OCRmyPDF rejects the input file, a
      second run with ``safe_fallback=True`` is attempted; only its text
      is kept, not its archive file.
    * If there is still no text, the text of the original is used when it
      was long enough, otherwise ``self.text`` becomes ``""``.

    Args:
        document_path: path of the document to process.
        mime_type: MIME type of the document.
        file_name: accepted for interface compatibility; not used here.

    Raises:
        ParseError: for any other failure of the first run, and for any
            failure of the fallback run.
    """
    # This forces tesseract to use one core per page.
    os.environ['OMP_THREAD_LIMIT'] = "1"

    if mime_type != "application/pdf":
        text_original = None
        original_has_text = False
    else:
        text_original = self.extract_text(None, document_path)
        original_has_text = text_original and len(text_original) > 50

    if settings.OCR_MODE == "skip_noarchive" and original_has_text:
        self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
        self.text = text_original
        return

    # Imported only once OCR is actually going to run.
    import ocrmypdf
    from ocrmypdf import InputFileError, EncryptedPdfError

    archive_path = os.path.join(self.tempdir, "archive.pdf")
    sidecar_file = os.path.join(self.tempdir, "sidecar.txt")

    ocr_kwargs = self.construct_ocrmypdf_parameters(
        document_path, mime_type, archive_path, sidecar_file)

    try:
        self.log("debug", f"Calling OCRmyPDF with args: {ocr_kwargs}")
        ocrmypdf.ocr(**ocr_kwargs)

        self.archive_path = archive_path
        self.text = self.extract_text(sidecar_file, archive_path)

        if not self.text:
            raise NoTextFoundException(
                "No text was found in the original document")
    except EncryptedPdfError:
        self.log(
            "warning",
            "This file is encrypted, OCR is impossible. Using "
            "any text present in the original file.")
        if original_has_text:
            self.text = text_original
    except (NoTextFoundException, InputFileError) as e:
        self.log(
            "warning",
            f"Encountered an error while running OCR: {e}. "
            f"Attempting force OCR to get the text.")

        fallback_archive = os.path.join(self.tempdir,
                                        "archive-fallback.pdf")
        fallback_sidecar = os.path.join(self.tempdir,
                                        "sidecar-fallback.txt")

        # Attempt to run OCR with safe settings.
        fallback_kwargs = self.construct_ocrmypdf_parameters(
            document_path, mime_type, fallback_archive, fallback_sidecar,
            safe_fallback=True)

        try:
            self.log(
                "debug",
                f"Fallback: Calling OCRmyPDF with args: {fallback_kwargs}")
            ocrmypdf.ocr(**fallback_kwargs)

            # Don't return the archived file here, since this file
            # is bigger and blurry due to --force-ocr.
            self.text = self.extract_text(fallback_sidecar,
                                          fallback_archive)
        except Exception as e:
            # If this fails, we have a serious issue at hand.
            raise ParseError(f"{type(e).__name__}: {e}")
    except Exception as e:
        # Anything else is probably serious.
        raise ParseError(f"{type(e).__name__}: {e}")

    if self.text:
        return

    # As a last resort, if we still don't have any text for any reason,
    # try to extract the text from the original document.
    if original_has_text:
        self.text = text_original
    else:
        self.log(
            "warning",
            f"No text was found in {document_path}, the content will "
            f"be empty.")
        self.text = ""
def construct_ocrmypdf_parameters(self, input_file, mime_type,
                                  output_file, sidecar_file,
                                  safe_fallback=False):
    """
    Build the keyword arguments for a call to ``ocrmypdf.ocr``.

    Args:
        input_file: path of the document to OCR.
        mime_type: MIME type of the document; image types additionally
            need a DPI value.
        output_file: path the archive PDF is to be written to.
        sidecar_file: path of the sidecar text file; only used when no
            page limit is configured.
        safe_fallback: when true, OCR is forced regardless of the
            configured mode and the user supplied arguments are ignored.

    Returns:
        A dict of keyword arguments.

    Raises:
        ParseError: if the configured OCR mode is unknown, or if no DPI
            can be determined for an image.
    """
    mode = settings.OCR_MODE

    params = {
        'input_file': input_file,
        'output_file': output_file,
        # need to use threads, since this will be run in daemonized
        # processes by django-q.
        'use_threads': True,
        'jobs': settings.THREADS_PER_WORKER,
        'language': settings.OCR_LANGUAGE,
        'output_type': settings.OCR_OUTPUT_TYPE,
        'progress_bar': False,
    }

    # Mode selection.
    if mode == 'force' or safe_fallback:
        params['force_ocr'] = True
    elif mode in ('skip', 'skip_noarchive'):
        params['skip_text'] = True
    elif mode == 'redo':
        params['redo_ocr'] = True
    else:
        raise ParseError(f"Invalid ocr mode: {settings.OCR_MODE}")

    # Cleaning: in redo mode 'clean-final' is mapped to plain 'clean'.
    clean_setting = settings.OCR_CLEAN
    if clean_setting == 'clean':
        params['clean'] = True
    elif clean_setting == 'clean-final':
        if mode == 'redo':
            params['clean'] = True
        else:
            params['clean_final'] = True

    # Deskewing is not requested in redo mode.
    if settings.OCR_DESKEW and mode != 'redo':
        params['deskew'] = True

    if settings.OCR_ROTATE_PAGES:
        params['rotate_pages'] = True
        params['rotate_pages_threshold'] = \
            settings.OCR_ROTATE_PAGES_THRESHOLD

    if settings.OCR_PAGES > 0:
        params['pages'] = f"1-{settings.OCR_PAGES}"
    else:
        # sidecar is incompatible with pages
        params['sidecar'] = sidecar_file

    if self.is_image(mime_type):
        # DPI precedence: value detected in the image, then the
        # configured default, then the A4 based estimate.
        dpi = self.get_dpi(input_file)
        a4_dpi = self.calculate_a4_dpi(input_file)
        if dpi:
            self.log("debug",
                     f"Detected DPI for image {input_file}: {dpi}")
            params['image_dpi'] = dpi
        elif settings.OCR_IMAGE_DPI:
            params['image_dpi'] = settings.OCR_IMAGE_DPI
        elif a4_dpi:
            params['image_dpi'] = a4_dpi
        else:
            raise ParseError(
                f"Cannot produce archive PDF for image {input_file}, "
                f"no DPI information is present in this image and "
                f"OCR_IMAGE_DPI is not set.")

    if settings.OCR_USER_ARGS and not safe_fallback:
        # User arguments are JSON and override the values built above;
        # invalid input is logged and otherwise ignored.
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
            params = {**params, **user_args}
        except Exception as e:
            self.log(
                "warning",
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used. Error: {e}")

    return params
def run_unpaper(args):
    """
    Run the unpaper executable on a single PNM file.

    Args:
        args: a pair ``(unpaper, pnm)`` holding the executable and the
            path of the PNM file. The command's third argument is the same
            path with ``.pnm`` replaced by ``.unpaper.pnm`` (presumably
            the output file - confirm against the unpaper CLI).

    Raises:
        ParseError: if the command exits with a non-zero status.
    """
    unpaper, pnm = args
    processed_pnm = pnm.replace(".pnm", ".unpaper.pnm")
    command_args = (unpaper, pnm, processed_pnm)

    exit_status = subprocess.Popen(command_args).wait()
    if exit_status != 0:
        raise ParseError("Unpaper failed at {}".format(command_args))
def parse(self, document_path, mime_type):
    """
    Run OCRmyPDF on a document and store the resulting text and archive.

    Outline:

    * The existing text of the document is extracted first. In
      ``skip_noarchive`` mode a document with more than 50 characters of
      text is accepted as is and OCRmyPDF is not called.
    * In the ``skip`` modes a document without such text is processed in
      ``redo`` mode instead.
    * On success ``self.archive_path`` and ``self.text`` are set. When
      OCRmyPDF rejects the input file or the file is encrypted, the text
      of the original is used and no archive is produced.
    * If there is no text at the end, a warning is logged and
      ``self.text`` becomes ``""``.

    Args:
        document_path: path of the document to process.
        mime_type: MIME type of the document; image types additionally
            need a DPI value.

    Raises:
        ParseError: if the OCR mode is unknown, if no DPI can be
            determined for an image, if OCRmyPDF rejects the file and the
            original has no text either, or on any other OCR failure.
    """
    mode = settings.OCR_MODE

    text_original = get_text_from_pdf(document_path)
    has_text = text_original and len(text_original) > 50

    if mode == "skip_noarchive" and has_text:
        self.log("debug", "Document has text, skipping OCRmyPDF entirely.")
        self.text = text_original
        return

    if mode in ['skip', 'skip_noarchive'] and not has_text:
        # upgrade to redo, since there appears to be no text in the
        # document. This happens to some weird encrypted documents or
        # documents with failed OCR attempts for which OCRmyPDF will
        # still report that there actually is text in them.
        self.log(
            "debug", "No text was found in the document and skip is "
            "specified. Upgrading OCR mode to redo.")
        mode = "redo"

    archive_path = os.path.join(self.tempdir, "archive.pdf")

    ocr_args = {
        'input_file': document_path,
        'output_file': archive_path,
        'use_threads': True,
        'jobs': settings.THREADS_PER_WORKER,
        'language': settings.OCR_LANGUAGE,
        'output_type': settings.OCR_OUTPUT_TYPE,
        'progress_bar': False,
        'clean': True
    }

    if settings.OCR_PAGES > 0:
        ocr_args['pages'] = f"1-{settings.OCR_PAGES}"

    # Mode selection.
    if mode in ['skip', 'skip_noarchive']:
        ocr_args['skip_text'] = True
    elif mode == 'redo':
        ocr_args['redo_ocr'] = True
    elif mode == 'force':
        ocr_args['force_ocr'] = True
    else:
        raise ParseError(f"Invalid ocr mode: {mode}")

    if self.is_image(mime_type):
        # DPI precedence: value detected in the image, then the
        # configured default, then the A4 based estimate.
        dpi = self.get_dpi(document_path)
        a4_dpi = self.calculate_a4_dpi(document_path)
        if dpi:
            self.log("debug",
                     f"Detected DPI for image {document_path}: {dpi}")
            ocr_args['image_dpi'] = dpi
        elif settings.OCR_IMAGE_DPI:
            ocr_args['image_dpi'] = settings.OCR_IMAGE_DPI
        elif a4_dpi:
            ocr_args['image_dpi'] = a4_dpi
        else:
            raise ParseError(
                f"Cannot produce archive PDF for image {document_path}, "
                f"no DPI information is present in this image and "
                f"OCR_IMAGE_DPI is not set.")

    if settings.OCR_USER_ARGS:
        # User arguments are JSON and override the values built above;
        # invalid input is logged and otherwise ignored.
        try:
            user_args = json.loads(settings.OCR_USER_ARGS)
            ocr_args = {**ocr_args, **user_args}
        except Exception as e:
            self.log(
                "warning",
                f"There is an issue with PAPERLESS_OCR_USER_ARGS, so "
                f"they will not be used: {e}")

    # This forces tesseract to use one core per page.
    os.environ['OMP_THREAD_LIMIT'] = "1"

    try:
        self.log("debug", f"Calling OCRmyPDF with {str(ocr_args)}")
        ocrmypdf.ocr(**ocr_args)
        # success! announce results
        self.archive_path = archive_path
        self.text = get_text_from_pdf(archive_path)
    except (InputFileError, EncryptedPdfError) as e:
        self.log(
            "debug", f"Encountered an error: {e}. Trying to use text from "
            f"original.")
        # This happens with some PDFs when used with the redo_ocr option.
        # This is not the end of the world, we'll just use what we already
        # have in the document.
        self.text = text_original
        # Also, no archived file.
        if not self.text:
            # However, if we don't have anything, fail:
            raise ParseError(e) from e
    except Exception as e:
        # Anything else is probably serious.
        raise ParseError(e) from e

    if not self.text:
        # This may happen for files that don't have any text.
        # (The original message lacked the space between its first two
        # sentences.)
        self.log(
            'warning', f"Document {document_path} does not have any text. "
            f"This is probably an error or you tried to add an image "
            f"without text, or something is wrong with this document.")
        self.text = ""