def __init__(
    self,
    infile,
    detailed_analysis: bool = False,
    progbar: bool = False,
    max_workers: int = None,
    check_pages=None,
):
    """Open *infile* with pikepdf and record page and document information.

    Args:
        infile: The PDF to examine; handed to ``pikepdf.open`` and to
            ``_pdf_pageinfo_concurrent``.
        detailed_analysis: Forwarded to ``_pdf_pageinfo_concurrent``.
        progbar: Forwarded to ``_pdf_pageinfo_concurrent``.
        max_workers: Forwarded to ``_pdf_pageinfo_concurrent``.
        check_pages: Pages to examine. ``None`` is replaced by
            ``range(0, 1_000_000_000)``.

    Raises:
        EncryptedPdfError: If pikepdf reports the opened file as encrypted.
    """
    self._infile = infile
    pages_to_check = (
        range(0, 1_000_000_000) if check_pages is None else check_pages
    )

    with pikepdf.open(infile) as pdf:
        if pdf.is_encrypted:
            # Triggered by encryption with empty passwd
            raise EncryptedPdfError()

        self._pages = _pdf_pageinfo_concurrent(
            pdf,
            infile,
            progbar,
            max_workers,
            check_pages=pages_to_check,
            detailed_analysis=detailed_analysis,
        )
        self._needs_rendering = pdf.root.get('/NeedsRendering', False)

        # A form is considered present when /AcroForm has at least one
        # entry in /Fields, or otherwise carries an /XFA key.
        self._has_acroform = False
        if '/AcroForm' in pdf.root:
            acroform = pdf.root.AcroForm
            if len(acroform.get('/Fields', [])) > 0 or '/XFA' in acroform:
                self._has_acroform = True
def _pdf_get_all_pageinfo(infile, detailed_analysis=False, log=None, progbar=False):
    """Build a ``PageInfo`` for every page of *infile*.

    Args:
        infile: The PDF to open with ``pikepdf.open``.
        detailed_analysis: When true, ``ghosttext.extract_text_xml`` is
            skipped and each ``PageInfo`` receives ``None`` as its page XML.
        log: Forwarded to ``ghosttext.extract_text_xml``.
        progbar: When false, the tqdm progress bar is disabled.

    Returns:
        A ``(pages, pdf)`` tuple: the list of ``PageInfo`` objects and the
        pikepdf document, which is intentionally left open. The caller is
        responsible for closing it.

    Raises:
        EncryptedPdfError: If pikepdf reports the opened file as encrypted.
    """
    pdf = pikepdf.open(infile)  # On success, do not close in this function
    try:
        if pdf.is_encrypted:
            raise EncryptedPdfError()  # Triggered by encryption with empty passwd

        if detailed_analysis:
            pages_xml = None
        else:
            pages_xml = ghosttext.extract_text_xml(infile, pdf, pageno=None, log=log)

        pages = []
        for n, _ in tqdm(
            enumerate(pdf.pages),
            total=len(pdf.pages),
            desc="Scan",
            unit='page',
            disable=not progbar,
        ):
            page_xml = pages_xml[n] if pages_xml else None
            pages.append(PageInfo(pdf, n, infile, page_xml, detailed_analysis))
    except BaseException:
        # The caller never receives the handle on failure, so release it
        # here instead of leaking the open document, then re-raise.
        pdf.close()
        raise
    return pages, pdf
def get_page_analysis(infile, pageno, pscript5_mode):
    """Run pdfminer's page interpreter on one page and return the tracker result.

    Args:
        infile: Path of the PDF; opened in binary mode.
        pageno: Page number passed to ``PDFPage.get_pages`` via ``pagenos``.
        pscript5_mode: When true, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are temporarily replaced by
            the PScript5 variants while the page is processed.

    Returns:
        Whatever ``TextPositionTracker.get_result()`` yields after the page
        has been processed.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer raises ``PDFTextExtractionNotAllowed``.
    """
    rman = pdfminer.pdfinterp.PDFResourceManager(caching=True)
    dev = TextPositionTracker(
        rman, laparams=LAParams(all_texts=True, detect_vertical=True)
    )
    interp = pdfminer.pdfinterp.PDFPageInterpreter(rman, dev)

    patcher = None
    if pscript5_mode:
        patcher = patch.multiple(
            'pdfminer.pdffont.PDFType3Font',
            spec=True,
            get_ascent=PDFType3Font__PScript5_get_ascent,
            get_descent=PDFType3Font__PScript5_get_descent,
            get_height=PDFType3Font__PScript5_get_height,
        )
        patcher.start()

    try:
        with Path(infile).open('rb') as f:
            page_iter = PDFPage.get_pages(f, pagenos=[pageno], maxpages=0)
            # A bare next() would leak StopIteration to callers when the
            # requested page is not produced; report it explicitly instead.
            page = next(page_iter, None)
            if page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interp.process_page(page)
    except PDFTextExtractionNotAllowed as e:
        raise EncryptedPdfError() from e
    finally:
        # Always undo the font patch, even if processing failed.
        if patcher is not None:
            patcher.stop()

    return dev.get_result()
def get_pdfinfo(
    input_file,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
):
    """Construct a ``PdfInfo`` for *input_file*, translating pikepdf errors.

    Args:
        input_file: Forwarded to ``PdfInfo`` as its first argument.
        detailed_analysis: Forwarded to ``PdfInfo``.
        progbar: Forwarded to ``PdfInfo``.
        max_workers: Forwarded to ``PdfInfo``.
        check_pages: Forwarded to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo`` instance.

    Raises:
        EncryptedPdfError: If pikepdf raises ``PasswordError``.
        InputFileError: If pikepdf raises any other ``PdfError``.
    """
    try:
        return PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
        )
    # PasswordError is handled first so it is not absorbed by the more
    # general PdfError handler below.
    except pikepdf.PasswordError as e:
        # Chain explicitly so the pikepdf error is recorded as the cause.
        raise EncryptedPdfError() from e
    except pikepdf.PdfError as e:
        raise InputFileError() from e
def get_page_analysis(infile, pageno, pscript5_mode):
    """Interpret a single page with pdfminer and return the tracker's result.

    Args:
        infile: Path of the PDF; opened in binary mode.
        pageno: Page number passed to ``PDFPage.get_pages`` via ``pagenos``
            (the error message describes it as counting from 0).
        pscript5_mode: When true, ``PDFType3Font``'s ``get_ascent``,
            ``get_descent`` and ``get_height`` are patched with the PScript5
            variants for the duration of the page processing.

    Returns:
        The value of ``TextPositionTracker.get_result()`` after processing.

    Raises:
        InputFileError: If pdfminer yields no page for *pageno*.
        EncryptedPdfError: If pdfminer raises ``PDFTextExtractionNotAllowed``.
    """
    resource_manager = pdfminer.pdfinterp.PDFResourceManager(caching=True)

    # Workaround for https://github.com/pdfminer/pdfminer.six/issues/395
    boxes_flow = 2 if pdfminer.__version__ < '20200402' else None
    layout_params = LAParams(
        all_texts=True, detect_vertical=True, boxes_flow=boxes_flow
    )
    tracker = TextPositionTracker(resource_manager, laparams=layout_params)
    interpreter = pdfminer.pdfinterp.PDFPageInterpreter(resource_manager, tracker)

    font_patch = None
    if pscript5_mode:
        overrides = {
            'get_ascent': PDFType3Font__PScript5_get_ascent,
            'get_descent': PDFType3Font__PScript5_get_descent,
            'get_height': PDFType3Font__PScript5_get_height,
        }
        font_patch = patch.multiple(
            'pdfminer.pdffont.PDFType3Font', spec=True, **overrides
        )
        font_patch.start()

    try:
        with Path(infile).open('rb') as stream:
            first_page = next(
                PDFPage.get_pages(stream, pagenos=[pageno], maxpages=0), None
            )
            if first_page is None:
                raise InputFileError(
                    f"pdfminer could not process page {pageno} (counting from 0)."
                )
            interpreter.process_page(first_page)
    except PDFTextExtractionNotAllowed as exc:
        raise EncryptedPdfError() from exc
    finally:
        # Undo the font patch whether or not processing succeeded.
        if font_patch is not None:
            font_patch.stop()

    return tracker.get_result()
def get_pdfinfo(
    input_file,
    *,
    executor: Executor,
    detailed_analysis=False,
    progbar=False,
    max_workers=None,
    check_pages=None,
) -> PdfInfo:
    """Build a ``PdfInfo`` for *input_file*, mapping pikepdf failures.

    Args:
        input_file: Forwarded to ``PdfInfo`` as its first argument.
        executor: Forwarded to ``PdfInfo`` (keyword-only).
        detailed_analysis: Forwarded to ``PdfInfo``.
        progbar: Forwarded to ``PdfInfo``.
        max_workers: Forwarded to ``PdfInfo``.
        check_pages: Forwarded to ``PdfInfo``.

    Returns:
        The constructed ``PdfInfo`` instance.

    Raises:
        EncryptedPdfError: If pikepdf raises ``PasswordError``.
        InputFileError: If pikepdf raises any other ``PdfError``.
    """
    try:
        info = PdfInfo(
            input_file,
            detailed_analysis=detailed_analysis,
            progbar=progbar,
            max_workers=max_workers,
            check_pages=check_pages,
            executor=executor,
        )
    # PasswordError is checked before the more general PdfError.
    except pikepdf.PasswordError as err:
        raise EncryptedPdfError() from err
    except pikepdf.PdfError as err:
        raise InputFileError() from err
    return info