def fetch(self):
    """Fetch the book pages, assemble them into a PDF and OCR that PDF.

    When the book file does not exist yet, the page images (plus a
    disclaimer and a blank page inserted at positions 1 and 2) are
    converted to a PDF with ``img2pdf``. The PDF is then OCRed in place;
    a PDF that already carries OCR text is accepted as-is. Any other OCR
    failure is reported on stdout and aborts before cleanup.
    """
    self.fetch_pages()

    if not os.path.exists(self.get_book_file_path()):
        with open(self.get_book_file_path(), "wb") as book_file:
            page_dir = self.get_dir_path()
            pages = []
            for number in range(self.get_page_count()):
                pages.append(os.path.join(page_dir, f"{number:03}.jpg"))
            # Equivalent to inserting the disclaimer at 1 and the blank at 2.
            pages[1:1] = [
                os.path.join(WORKSPACE, "assets", "disclaimer.jpg"),
                os.path.join(WORKSPACE, "assets", "blank.jpg"),
            ]
            pdf_bytes = img2pdf.convert(
                pages,
                title=self.name,
                creationdate=datetime(int(self.year), 1, 1),
                author="Comisión Nacional de Libros de Texto Gratuitos",
            )
            book_file.write(pdf_bytes)

    try:
        ocrmypdf.ocr(
            self.get_book_file_path(),
            self.get_book_file_path(),
            language='spa',
            clean=True,
            jobs=12,
            max_image_mpixels=900,
        )
    except ocrmypdf.exceptions.PriorOcrFoundError:
        # The PDF already has a text layer; nothing more to do.
        pass
    except Exception as error:
        print(f"Error este libro no pudo ser indexado: {error}")
        return None

    self.cleanup()
    print(f"Tu libro está listo en: {self.get_book_file_path()}\n")
def process(file, failure_path=None, ocr_output_dir=None, gcs_bucket=None, **kwargs):
    """Build a ``Screenplay`` from *file*, OCRing it first when it is a PDF.

    Args:
        file: Path of the input document. Names ending in ``.pdf`` are run
            through ``ocrmypdf`` before parsing.
        failure_path: Forwarded to ``Screenplay``.
        ocr_output_dir: Directory for the OCRed PDF; defaults to
            ``'ocr_output'``. It is created (including parents) if missing.
        gcs_bucket: When truthy, ``scr.data`` is uploaded to this bucket as
            ``"<title>.json"``.
        **kwargs: Forwarded to ``Screenplay``.

    Returns:
        The constructed ``Screenplay`` instance.
    """
    if file.endswith('.pdf'):
        ocr_output_dir = ocr_output_dir or 'ocr_output'
        # makedirs with exist_ok avoids the exists()/mkdir() race and also
        # supports nested output directories.
        os.makedirs(ocr_output_dir, exist_ok=True)
        ocr_output_path = os.path.join(ocr_output_dir, os.path.basename(file))
        ocrmypdf.ocr(
            input_file=file,
            output_file=ocr_output_path,
            deskew=True,
            use_threads=True,
            skip_text=True,
        )
        source = ocr_output_path
    else:
        source = file

    scr = Screenplay(source=source, failure_path=failure_path, **kwargs)

    if gcs_bucket:
        gcs_filename = "{}.json".format(scr.title)
        gcs_upload(scr.data, gcs_bucket, gcs_filename)

    return scr
def convert2images(unique_id, meet, seconds, custom_coordinates, ocr):
    """Turn a stored video into slide frames, a PDF and optionally an OCRed PDF.

    Args:
        unique_id: Name of the video under ``videos_dir``; also used as the
            slide sub-directory name and the PDF base name.
        meet: Forwarded to ``video_to_frames``.
        seconds: Forwarded to ``video_to_frames``.
        custom_coordinates: Forwarded to ``video_to_frames``.
        ocr: When truthy, the generated PDF is OCRed in place.

    Raises:
        OSError: Re-raised after the failure has been reported on stdout.
    """
    file = f"./{videos_dir}/{unique_id}"
    path = os.path.join(f"./{slides_dir}", unique_id)
    try:
        os.makedirs(path, exist_ok=True)
        print("Directory '%s' created successfully" % unique_id)
        frames = video_to_frames(
            video_path=file,
            frames_dir=f"./{slides_dir}",
            seconds=seconds,
            meet=meet,
            custom_coordinates=custom_coordinates,
        )
        # No frames may be produced for a poor-quality video; then there is
        # nothing to convert.
        if frames:
            convert2pdf(unique_id)
            if ocr:
                # Trade-off: OCR increases the PDF size and degrades image
                # quality.
                pdf_path = f"./{pdfs_dir}/{unique_id}.pdf"
                ocrmypdf.ocr(
                    pdf_path,
                    pdf_path,
                    deskew=True,
                    pdf_renderer="hocr",
                )
        freeUpSpace(unique_id)
    except OSError:
        # Report before re-raising; these messages used to sit after the
        # raise and could never run.
        print("Directory '{0}' can not be created".format(unique_id))
        print(file, " didn't complete successfully.")
        raise
def test_stream_api(resources):
    """OCR from an open binary stream into an in-memory buffer yields a PDF."""
    out = BytesIO()
    # Use a context manager so the input handle is closed even when the
    # OCR call or the assertion fails.
    with (resources / 'graph.pdf').open('rb') as in_:
        ocrmypdf.ocr(in_, out, tesseract_timeout=0.0)
    out.seek(0)
    assert b'%PDF' in out.read(1024)
def pdf_extraction_func(pdf_file_path: pathlib.Path, output_folder_path: pathlib.Path, ):
    """OCR a PDF and write its non-trivial text lines to ``<stem>.txt``.

    The PDF is copied under a sanitized name into a temporary directory,
    OCRed there with a text sidecar, and the sidecar lines holding more than
    one non-whitespace character are written to
    ``output_folder_path / (pdf_file_path.stem + '.txt')``.

    Args:
        pdf_file_path: Source PDF.
        output_folder_path: Existing directory that receives the text file.
    """
    # The context manager removes the temporary directory even if OCR fails.
    with tempfile.TemporaryDirectory() as temp_foldername:
        temp_dir = pathlib.Path(temp_foldername)
        # Replace whitespace, dots and hyphens, then collapse underscore runs.
        filename_no_suffix = re.sub(
            r'_{2,}', '_', re.sub(r'[\s.\-]', '_', pdf_file_path.stem))
        pdf_file_temp_path = temp_dir / (filename_no_suffix + pdf_file_path.suffix)
        shutil.copy(pdf_file_path, pdf_file_temp_path)
        pdf_txt_temp_path = temp_dir / (filename_no_suffix + '.txt')
        pdf_modified_file_path = temp_dir / (filename_no_suffix + '_modified.pdf')

        ocrmypdf.ocr(
            input_file=pdf_file_temp_path,
            sidecar=pdf_txt_temp_path,
            output_file=pdf_modified_file_path,
            language='eng+chi_tra',
            optimize=3,
            deskew=True,
            force_ocr=True,
            progress_bar=True,
            image_dpi=1200,
            tesseract_oem=1,
            tesseract_pagesegmode=3,
        )

        with open(pdf_txt_temp_path, 'r', encoding='utf-8') as f:
            lines = [line for line in f if len(line.strip()) > 1]

        pdf_txt_path = output_folder_path / (pdf_file_path.stem + '.txt')
        with open(pdf_txt_path, 'w', encoding='utf-8') as g:
            g.writelines(lines)
def convert(self, pdf_filename):
    """Create a searchable copy of a PDF stored in ``PDF_DIR``.

    Args:
        pdf_filename: File name (not path) of the PDF inside ``PDF_DIR``.

    Returns:
        A ``(success, new_filename, error_msg)`` tuple. On success
        ``new_filename`` is the generated file name and ``error_msg`` is
        empty; on failure ``new_filename`` is empty.
    """
    pdf_filepath = os.path.join(PDF_DIR, pdf_filename)
    if not os.path.isfile(pdf_filepath):
        return False, "", "File not found"

    now = datetime.now().strftime("%d%m%Y-%H%M%S")
    # splitext removes whatever extension is present instead of blindly
    # chopping the last four characters.
    stem = os.path.splitext(pdf_filename)[0]
    new_pdf_filename = stem + now + "_searchable.pdf"
    new_pdf_filepath = os.path.join(PDF_DIR, new_pdf_filename)

    try:
        ocr.ocr(input_file=pdf_filepath,
                output_file=new_pdf_filepath,
                skip_text=True)
    except Exception:
        return False, "", "OCR exception occured"

    if os.path.isfile(new_pdf_filepath):
        return True, new_pdf_filename, ""
    return False, "", "File not found"
def ocr(inputFile=r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample.pdf",
        outputFile=r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample_ocr.pdf",
        textFile=r"C:\D_Drive\wordpress-pdf-invoice-plugin-sample.txt",
        psm=None):
    '''
    OCR a scanned PDF and write both a searchable PDF and a text file.

    The defaults reproduce the sample paths this function was previously
    hard-coded to, so calling it without arguments behaves as before.

    :param inputFile: full path of the scanned PDF
    :param outputFile: full path for the OCRed PDF (may equal inputFile, in
        which case the scanned file is replaced)
    :param textFile: full path of the text file to create (passed as
        ``sidecar``)
    :param psm: Tesseract page segmentation mode (0-13), passed as
        ``tesseract_pagesegmode``; None leaves it unset

    Fixed options passed to ocrmypdf:
    force_ocr: OCR again even if the file already has a text layer
    deskew: correct the rotation of skewed pages
    rotate_pages: rotate wrongly oriented pages before OCR
    :return: None
    '''
    ocrmypdf.ocr(input_file=inputFile,
                 output_file=outputFile,
                 force_ocr=True,
                 deskew=True,
                 tesseract_pagesegmode=psm,
                 sidecar=textFile,
                 rotate_pages=True)
def add_ocr_to_pdf(update, context):
    """Add an OCR text layer to the user's stored PDF and send it back.

    The PDF referenced by ``context.user_data[PDF_INFO]`` is downloaded to a
    temporary file, OCRed into a temporary directory and sent with
    ``send_result_file``. If the PDF already has a text layer the user is
    told so instead.

    Returns:
        ``ConversationHandler.END`` in every case.
    """
    if not check_user_data(update, context, PDF_INFO):
        return ConversationHandler.END

    _ = set_lang(update, context)
    update.effective_message.reply_text(
        _("Adding an OCR text layer to your PDF file"),
        reply_markup=ReplyKeyboardRemove(),
    )

    with tempfile.NamedTemporaryFile() as tf:
        user_data = context.user_data
        file_id, file_name = user_data[PDF_INFO]
        pdf_file = context.bot.get_file(file_id)
        pdf_file.download(custom_path=tf.name)

        with tempfile.TemporaryDirectory() as dir_name:
            out_fn = os.path.join(
                dir_name, f"OCR_{os.path.splitext(file_name)[0]}.pdf")
            try:
                ocrmypdf.ocr(tf.name, out_fn, deskew=True, progress_bar=False)
                send_result_file(update, context, out_fn, "ocr")
            except PriorOcrFoundError:
                update.effective_message.reply_text(
                    _("Your PDF file already has a text layer"))

    # Clean up memory. The stored value is a (file_id, file_name) pair, so
    # compare its first element; comparing the whole pair with file_id never
    # matched and the entry was never removed. Skip removal if the entry was
    # replaced or dropped in the meantime.
    stored = user_data.get(PDF_INFO)
    if stored is not None and stored[0] == file_id:
        del user_data[PDF_INFO]

    return ConversationHandler.END
def pdf2txt(doc_path):
    """Extract the text of a PDF into a sibling ``.txt`` file.

    Plain extraction is tried first. If the resulting text file is too small
    (the PDF probably holds images), ``ocr_pdf`` is tried, and if that fails
    ``ocrmypdf`` is used with the text file as sidecar.

    Args:
        doc_path: ``pathlib.Path`` of the PDF.

    Returns:
        1 when an OCR fallback produced the text file, 0 in every other case
        (file too big, plain extraction sufficed, or an error occurred).
    """
    txt_path = doc_path.with_suffix('.txt')
    # NOTE: the "skip when the .txt already exists" shortcut is deliberately
    # disabled so that files are processed again.

    if file_is_too_big(doc_path):
        tqdm.write(f"File {doc_path} is too big. Skipping...")
        return 0

    try:
        P.extract_text(doc_path)  # writes text to /path/to/my_file.txt
        if file_is_too_small(txt_path):
            # Text file is very small, PDF has an image probably: OCR it.
            try:
                ocr_txt = ocr_pdf(doc_path)
                with open(txt_path, "w") as filo:
                    filo.write(ocr_txt)
                return 1
            except Exception:
                if file_is_too_small(txt_path):
                    try:
                        ocrmypdf.ocr(doc_path,
                                     doc_path.parent / "result.pdf",
                                     sidecar=txt_path)
                        return 1
                    except Exception as e:
                        print(
                            f"Could not ocr convert to txt file {doc_path}: {str(e)}"
                        )
                        return 0
    except Exception as e:
        print(f"Could not convert to txt file {doc_path}: {str(e)}")
        return 0

    # Explicit result for the paths that needed no OCR, so callers always
    # get an int instead of an implicit None.
    return 0
def ocr(self, pdf_file, destination):
    """Run ocrmypdf on *pdf_file* and write the result to *destination*.

    Uses German (``deu``) as OCR language with deskewing and an oversample
    value of 500.
    """
    self.log.debug(f'ocr for source={pdf_file}')
    ocr_options = {
        "deskew": True,
        "language": "deu",
        "oversample": 500,
    }
    ocrmypdf.ocr(pdf_file, destination, **ocr_options)
def ocr(book_title):
    """OCR ``<book_title>_IMG.pdf`` into ``<book_title>_IMG_OCR.pdf``.

    Does nothing unless the ``Do_OCR`` config entry is the string "True".
    """
    if config['Do_OCR'] != "True":
        return

    image_pdf_stem = book_title + '_IMG'
    source_pdf = image_pdf_stem + '.pdf'
    target_pdf = image_pdf_stem + "_OCR.pdf"
    ocrmypdf.ocr(source_pdf, target_pdf, use_threads=True)
def test_links(resources, outpdf):
    """Cross-page link annotations still point at the right pages after OCR."""
    ocrmypdf.ocr(
        resources / 'link.pdf',
        outpdf,
        redo_ocr=True,
        oversample=200,
        output_type='pdf',
    )
    # Open with a context manager so the output PDF handle is released even
    # when an assertion fails.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        # Each page's first annotation must target the other page.
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen
def handle(req):
    """OCR a base64-encoded PDF request body and print the extracted text.

    The request is saved as a PDF via ``save_image_from_base64``, OCRed to
    ``./tmp/output.pdf`` and the text extracted with ``textract`` is printed.
    """
    # NOTE(review): the decoded bytes are not used afterwards; the call is
    # kept because it can still raise on malformed input.
    raw_bytes = base64.decodebytes(bytes(req, 'utf-8'))

    ocr_output = './tmp/output.pdf'
    source_pdf = save_image_from_base64(req, 'pdf')
    ocrmypdf.ocr(source_pdf, ocr_output, deskew=True)

    extracted = textract.process(ocr_output)
    print(extracted)
def extractText(filename):
    """OCR *filename* into ``/tmp`` and return the path of the OCRed PDF.

    Args:
        filename: Path of the input PDF.

    Returns:
        The output path ``/tmp/<basename>`` on success, or ``None`` when OCR
        failed (the failure is logged with its traceback).
    """
    outputFilename = f"/tmp/{path.basename(filename)}"
    try:
        ocr(input_file=filename,
            output_file=outputFilename,
            force_ocr=True,
            progress_bar=False)
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate. logging.exception already records the
        # traceback.
        logging.exception(f"Cannot ocr {path.basename(filename)}.")
        return None
    return outputFilename
def test_no_glyphless_graft(resources, outdir):
    """OCR succeeds on a merged PDF when the page-replacement limit is 2."""
    # Keep all three documents open until the merged file has been saved,
    # then close them deterministically.
    with pikepdf.open(resources / 'francais.pdf') as pdf, \
            pikepdf.open(resources / 'aspect.pdf') as pdf_aspect, \
            pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(outdir / 'test.pdf')

    with patch('ocrmypdf._graft.MAX_REPLACE_PAGES', 2):
        ocrmypdf.ocr(outdir / 'test.pdf',
                     outdir / 'out.pdf',
                     deskew=True,
                     tesseract_timeout=0)
def execute_ocrmypdf(file_path):
    """OCR *file_path* into the configured output directory.

    With ``OUTPUT_DIRECTORY_YEAR_MONTH`` enabled the file goes into a
    ``<OUTPUT_DIRECTORY>/<year>/<month>`` sub-directory (created on demand);
    otherwise it goes directly into ``OUTPUT_DIRECTORY``. The output keeps
    the input's file name.
    """
    filename = Path(file_path).name

    if not OUTPUT_DIRECTORY_YEAR_MONTH:
        output_path = Path(OUTPUT_DIRECTORY) / filename
    else:
        today = datetime.today()
        dated_directory = Path(
            f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
        if not dated_directory.exists():
            dated_directory.mkdir(parents=True, exist_ok=True)
        output_path = dated_directory / filename

    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
    ocrmypdf.ocr(file_path, output_path)
def createWOFileTypeKeys():
    """Derive per-report-type keywords from sample PDFs and save them as CSV.

    For every sub-directory (report type) of the scoring directory, each PDF
    is converted to text with ``pdftotext``; the first 200 non-empty lines,
    with spaces, colons and dots removed, form that file's token set. Files
    yielding at most one token are OCRed in place and skipped for this run.
    A type's keywords are the tokens present in all of its usable files,
    minus tokens also found for another type among contracts/invoices/orders,
    restricted to lengths 5..22. The result is written to
    ``filetypekeywords.csv`` with columns ``pdftype`` and ``keyword``.
    """
    base_dir = '/media/andrew/F08C9B848C9B444E/analysis/tv/orderscoring/'
    strip_chars = str.maketrans('', '', ' :.')
    keys = {}

    for ftype in os.scandir(base_dir):
        op = []
        # Convert all pdfs to text and turn them into lists of tokens.
        for f in os.scandir(base_dir + ftype.name + '/'):
            text = subprocess.run(['pdftotext', f.path, '-'],
                                  stdout=subprocess.PIPE).stdout.decode()
            tokens = [line.translate(strip_chars) for line in text.split('\n')]
            tokens = [tok for tok in tokens if tok != '']
            tokens = list(set(tokens[0:200]))
            if len(tokens) > 1:
                op.append(tokens)
            else:
                # No usable text layer: OCR in place for a later run.
                ocrmypdf.ocr(f.path, f.path, deskew=True, rotate_pages=True)

        # Keep only tokens that exist in all files of this report type. The
        # first file seeds the set; an empty intersection stays empty rather
        # than being re-seeded from the next file.
        print(len(op))
        curSet = []
        for position, file_tokens in enumerate(op):
            if position == 0:
                curSet = file_tokens
            else:
                present = set(file_tokens)
                curSet = [x for x in curSet if x in present]
        keys[ftype.name] = curSet

    # Filter out keys that are non-unique to a report type. Compare against a
    # snapshot so the outcome does not depend on processing order, and
    # tolerate report types that have no directory.
    snapshot = {name: set(values) for name, values in keys.items()}
    for k in keys:
        others = [t for t in ['contracts', 'invoices', 'orders'] if t != k]
        for t in others:
            foreign = snapshot.get(t, set())
            keys[k] = [x for x in keys[k] if x not in foreign]
        # Get rid of nonspecific keys.
        keys[k] = [a for a in keys[k] if 4 < len(a) < 23]
        print(keys[k])

    rows = [(k, vv) for k, v in keys.items() for vv in v]
    frame = pd.DataFrame(rows, columns=['pdftype', 'keyword'])
    frame.to_csv('filetypekeywords.csv', index=False)
def test_no_glyphless_graft(resources, outdir):
    """OCR succeeds on a merged PDF with ``_OCRMYPDF_MAX_REPLACE_PAGES=2``."""
    # Keep all three documents open until the merged file has been saved,
    # then close them deterministically.
    with pikepdf.open(resources / 'francais.pdf') as pdf, \
            pikepdf.open(resources / 'aspect.pdf') as pdf_aspect, \
            pikepdf.open(resources / 'cmyk.pdf') as pdf_cmyk:
        pdf.pages.extend(pdf_aspect.pages)
        pdf.pages.extend(pdf_cmyk.pages)
        pdf.save(outdir / 'test.pdf')

    env = os.environ.copy()
    env['_OCRMYPDF_MAX_REPLACE_PAGES'] = '2'

    with os_environ(env):
        ocrmypdf.ocr(
            outdir / 'test.pdf',
            outdir / 'out.pdf',
            deskew=True,
            tesseract_timeout=0,
        )
def create_searchable_pdf(self, input_pdf, output_pdf):
    """OCR *input_pdf* into *output_pdf*, logging instead of raising on error.

    Pages are auto-rotated (threshold 13), deskewed and cleaned. Any
    exception raised by the OCR run is logged at error level and swallowed.
    """
    logging.debug("Working directory: %s", os.getcwd())

    ocr_options = dict(
        rotate_pages=True,
        rotate_pages_threshold=13,
        deskew=True,
        clean=True,
    )
    try:
        ocrmypdf.ocr(input_pdf, output_pdf, **ocr_options)
    except Exception as error:
        logging.error(error)
def test_limited_pages(resources, outpdf, spoof_tesseract_cache):
    """With ``pages='5-6'`` only the fifth and sixth pages receive text."""
    source_pdf = resources / 'multipage.pdf'
    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        pages='5-6',
        optimize=0,
        output_type='pdf',
        tesseract_env=spoof_tesseract_cache,
    )

    info = PdfInfo(outpdf)
    # Zero-based indexes: page 1 is untouched, pages 5 and 6 are OCRed.
    assert not info.pages[0].has_text
    for page_index in (4, 5):
        assert info.pages[page_index].has_text
def test_limited_pages(resources, outpdf):
    """With ``pages='5-6'`` only the fifth and sixth pages receive text."""
    source_pdf = resources / 'multipage.pdf'
    ocrmypdf.ocr(
        source_pdf,
        outpdf,
        pages='5-6',
        optimize=0,
        output_type='pdf',
        plugins=['tests/plugins/tesseract_cache.py'],
    )

    info = PdfInfo(outpdf)
    # Zero-based indexes: page 1 is untouched, pages 5 and 6 are OCRed.
    assert not info.pages[0].has_text
    for page_index in (4, 5):
        assert info.pages[page_index].has_text
def _get_text(inpdf, sesspath, language, unpaper_args, minwords):
    """OCR *inpdf* inside the session directory and return its text.

    ``_need_ocr`` decides between a forced OCR pass (with deskew, page
    rotation, background removal and cleaning) and a redo-OCR pass.

    Returns:
        A tuple ``(sidecar_text, prelim_text)`` where ``sidecar_text`` is
        read from ``<sesspath>/tmp.txt`` and ``prelim_text`` comes from
        ``_need_ocr``.
    """
    force_ocr, prelim_text = _need_ocr(inpdf, minwords)

    sidecar_path = f"{sesspath}/tmp.txt"
    ocr(
        inpdf,
        f"{sesspath}/tmp.pdf",
        sidecar=sidecar_path,
        language=language,
        deskew=force_ocr,
        rotate_pages=force_ocr,
        remove_background=force_ocr,
        clean=force_ocr,
        unpaper_args=unpaper_args,
        redo_ocr=(not force_ocr),
        force_ocr=force_ocr,
    )

    with open(sidecar_path, "rt") as sidecar:
        sidecar_text = sidecar.read()
    return sidecar_text, prelim_text
def test_masks(spoof_tesseract_noop, resources, outpdf):
    """OCR of ``masks.pdf`` finishes with ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf',
        outpdf,
        tesseract_env=spoof_tesseract_noop,
    )
    assert exit_code == ExitCode.ok
def execute_ocrmypdf(file_path):
    """Wait for a new file to settle, OCR it, and optionally delete the source.

    The output goes to ``OUTPUT_DIRECTORY`` or, with
    ``OUTPUT_DIRECTORY_YEAR_MONTH`` enabled, to a ``<year>/<month>``
    sub-directory of it (created on demand). The source file is deleted only
    when OCR returns exit code 0 and ``ON_SUCCESS_DELETE`` is set.
    """
    source = Path(file_path)
    filename = source.name

    if not OUTPUT_DIRECTORY_YEAR_MONTH:
        output_path = Path(OUTPUT_DIRECTORY) / filename
    else:
        today = datetime.today()
        dated_directory = Path(
            f'{OUTPUT_DIRECTORY}/{today.year}/{today.month}')
        if not dated_directory.exists():
            dated_directory.mkdir(parents=True, exist_ok=True)
        output_path = dated_directory / filename

    logger.info(f'New file: {file_path}. Waiting until fully loaded...')
    # Poll the size until it stops changing between two checks. Docker
    # sometimes publishes the watchdog event before the file is fully on
    # disk, which makes pikepdf fail.
    last_seen_size = None
    while last_seen_size != source.stat().st_size:
        last_seen_size = source.stat().st_size
        logger.debug(f'new_file current_size: {last_seen_size}')
        time.sleep(POLL_NEW_FILE_SECONDS)

    logger.info(f'Attempting to OCRmyPDF to: {output_path}')
    exit_code = ocrmypdf.ocr(input_file=file_path,
                             output_file=output_path,
                             deskew=DESKEW)

    delete_source = exit_code == 0 and ON_SUCCESS_DELETE
    if not delete_source:
        logger.info('Done')
    else:
        logger.info(f'Done. Deleting: {file_path}')
        source.unlink()
def test_masks(resources, outpdf):
    """OCR of ``masks.pdf`` with the no-op plugin finishes with ``ExitCode.ok``."""
    exit_code = ocrmypdf.ocr(
        resources / 'masks.pdf',
        outpdf,
        plugins=['tests/plugins/tesseract_noop.py'],
    )
    assert exit_code == ExitCode.ok
def process(self, pdfData, outputName=None, modificationTime=None):
    """Store, OCR and file a PDF document under ``self.destPath``.

    Args:
        pdfData: Passed to ``self.store``, which yields the input, output
            and sidecar paths.
        outputName: Target name relative to ``self.destPath``. When omitted
            the name is built from ``self.guess`` and made unique with a
            numeric suffix.
        modificationTime: When given and a newer file already exists at the
            destination, processing is skipped.

    PDFs that already contain OCR text or are encrypted are copied to the
    destination unprocessed. The stored input file is removed at the end.
    """
    inf, outf, sidef = self.store(pdfData)
    try:
        if outputName is not None:
            destName = os.path.join(self.destPath, outputName)
            os.makedirs(Path(destName).parent, exist_ok=True)
        else:
            yr, mt, name = self.guess(sidef)
            destName = os.path.join(self.destPath,
                                    "%s %s %s" % (yr, mt, name))
            # Append " 02", " 03", ... until the name is unused.
            orgName = destName
            idx = 2
            while os.path.exists(destName):
                destName = "%s %02d" % (orgName, idx)
                idx += 1

        newer_exists = (modificationTime is not None
                        and os.path.exists(destName)
                        and os.path.getmtime(destName) > modificationTime)
        if not newer_exists:
            ocrmypdf.ocr(inf,
                         outf,
                         deskew=self.deskew,
                         sidecar=sidef,
                         remove_background=self.removeBackground,
                         language=self.language)
            shutil.move(outf, destName)
            os.remove(sidef)
            self.log.info("Created & processed document %s" % destName)
        else:
            self.log.info(
                "Skipping processing, because newer file (%s) with same name exists"
                % destName)
    except ocrmypdf.exceptions.PriorOcrFoundError:
        # Not an error: copy the document as-is and note it in the log.
        shown_name = inf if outputName is None else outputName
        self.log.info(
            "Skipping processing (copy only), because of existing ocr: %s"
            % shown_name)
        shutil.copyfile(inf, destName)
    except ocrmypdf.exceptions.EncryptedPdfError:
        # Not an error: copy the document as-is and note it in the log.
        shown_name = inf if outputName is None else outputName
        self.log.warn(
            "Skipping processing (copy only), because PDF is encrypted: %s"
            % shown_name)
        shutil.copyfile(inf, destName)
    os.remove(inf)
def generate_searchable_pdf(self, pdf, tmp_path, separator):
    """
    Start from standard PDF, with no OCR, and create a searchable PDF, with OCR. Thanks to ocrmypdf python lib

    The resulting file's bytes are stored in ``self.searchablePdf``. When
    ``separator.convert_to_pdfa`` is the string "True", the PDF/A file
    produced by ``separator.convert_to_pdfa_function`` is read instead.

    :param pdf: Path to original pdf (not searchable, without OCR)
    :param tmp_path: Path to store the final pdf, searchable with OCR
    :param separator: Class Separator instance
    """
    try:
        output_file = tmp_path + '/result.pdf'
        ocrmypdf.ocr(pdf,
                     output_file,
                     language=self.lang,
                     skip_text=True,
                     progress_bar=False,
                     jobs=int(self.Config.cfg['GLOBAL']['nbthreads']))

        if separator.convert_to_pdfa == "True":
            output_file = tmp_path + '/result-pdfa.pdf'
            separator.convert_to_pdfa_function(output_file,
                                               tmp_path + '/result.pdf',
                                               self.Log)

        # Read through a context manager so the handle is closed right away
        # instead of being left to the garbage collector.
        with open(output_file, 'rb') as searchable_file:
            self.searchablePdf = searchable_file.read()
    except ocrmypdf.exceptions.PriorOcrFoundError as e:
        self.Log.error(e)
def do_POST(self):
    """OCR the PDF in the request body and respond with the resulting PDF.

    Query parameters:
        rotate_pages, deskew, remove_background (default true) and force_ocr
        (default false): booleans given as yes/true/1 or no/false/0.
        dpi: digits only, forwarded to unpaper; defaults to 200.

    Responses:
        200 with ``application/pdf`` on success, 400 when the document has
        already been OCRed, 500 on any other error (traceback on stdout).
    """
    truthy = ('yes', 'true', '1')
    falsy = ('no', 'false', '0')
    try:
        content_length = int(self.headers['Content-Length'])
        post_data = self.rfile.read(content_length)
        query = parse_qs(urlparse(self.path).query)

        def query_string(name, default):
            """Return the single value of *name*, else *default*."""
            raw = query.get(name, [])
            return raw[0] if len(raw) == 1 else default

        def query_boolean(name, default):
            """Parse *name* as a boolean; fall back to *default*."""
            raw = query.get(name, [])
            if len(raw) == 1:
                if raw[0] in truthy:
                    return True
                if raw[0] in falsy:
                    return False
            return default

        # The value comes from the client and is interpolated into the
        # unpaper argument string, so accept digits only.
        dpi = query_string('dpi', '200')
        if not dpi.isdigit():
            dpi = '200'

        with tempfile.NamedTemporaryFile() as temp, \
                tempfile.NamedTemporaryFile() as tempOut:
            temp.write(post_data)
            temp.seek(0)
            ocrmypdf.ocr(
                temp.name,
                tempOut.name,
                language='deu+eng',
                rotate_pages=query_boolean('rotate_pages', True),
                deskew=query_boolean('deskew', True),
                remove_background=query_boolean('remove_background', True),
                clean_final=True,
                force_ocr=query_boolean('force_ocr', False),
                unpaper_args='--dpi %s --post-size a4' % dpi,
                progress_bar=False,
            )
            self.send_response(200)
            self.send_header('Content-type', 'application/pdf')
            self.end_headers()
            self.wfile.write(tempOut.read())
    except ocrmypdf.exceptions.PriorOcrFoundError:
        self.send_response(400)
        self.send_header('Content-type', 'text/plain')
        self.end_headers()
        self.wfile.write(bytes('Document already has been OCRed', 'utf-8'))
    except Exception:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit can still stop the server.
        self.send_response(500)
        self.send_header('Content-type', 'text/plain')
        self.end_headers()
        self.wfile.write(bytes('Unknown error', 'utf-8'))
        traceback.print_exc(file=sys.stdout)
def ocr_pdf_if_not_searchable(filepath):
    """OCR a PDF only when it contains almost no text.

    When ``get_text_percentage(filepath)`` is below 0.01, the file is OCRed
    (``redo_ocr``) into ``<name>_OCR.pdf``, a relative path named after the
    input file without its extension.

    Args:
        filepath: Path of the PDF to inspect.
    """
    text_perc = get_text_percentage(filepath)
    if text_perc < 0.01:
        # Derive the output name with os.path rather than splitting on "/"
        # and chopping four characters.
        stem = os.path.splitext(os.path.basename(filepath))[0]
        ocrmypdf.ocr(filepath, stem + "_OCR.pdf", redo_ocr=True)
def OCR(choice, language):
    """OCR the current book PDF and report the outcome in the GUI label.

    Args:
        choice: When truthy the result is saved as ``<book_title>_ocr.pdf``;
            otherwise the original PDF path is used as the output too.
        language: Object whose ``get()`` returns the OCR language.
    """
    try:
        pdf_path = os.path.join(directory, f'{book_title}.pdf')
        save_path = (os.path.join(directory, f'{book_title}_ocr.pdf')
                     if choice else pdf_path)
        ocrmypdf.ocr(
            pdf_path,
            save_path,
            rotate_pages=True,
            remove_background=True,
            language=language.get(),
            deskew=True,
            force_ocr=True,
        )
    except Exception as error:
        print(error)
        lbl_output_2.config(fg='red')
        output_2_text.set("Failed to perform OCR")
    else:
        lbl_output_2.config(fg='green')
        output_2_text.set("OCR completed")