def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, xmltext: str):
    """Build a dict of facts about page ``pageno`` of ``pdf``.

    Args:
        pdf: Opened PDF object; its ``pages`` and ``docinfo`` are read.
        pageno: Index into ``pdf.pages``.
        infile: Path of the PDF file, passed on to the text helpers.
        xmltext: When not ``None``, text blocks are obtained from it via
            ``ghosttext.page_get_textblocks``. When ``None``, the page is
            analysed with ``get_page_analysis`` instead.

    Returns:
        A dict with the keys ``pageno``, ``images``, ``has_text``,
        ``userunit``, ``width_inches``, ``height_inches``, ``rotate`` and
        ``has_vector``; either ``bboxes`` (``xmltext`` given) or
        ``textboxes`` (``xmltext`` is ``None``); and, only when the page
        has at least one image, ``xres``, ``yres``, ``width_pixels`` and
        ``height_pixels``.
    """
    info = {'pageno': pageno, 'images': []}
    page = pdf.pages[pageno]

    # MediaBox corners as Decimals: indices 0/1 and 2/3 are opposite corners.
    corners = list(map(Decimal, page.MediaBox.as_list()))
    width_pt = corners[2] - corners[0]
    height_pt = corners[3] - corners[1]

    if xmltext is None:
        creator = str(pdf.docinfo.get('/Creator'))
        miner = get_page_analysis(infile, pageno, creator.startswith('PScript5'))
        info['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
        bboxes = (textbox.bbox for textbox in info['textboxes'])
    else:
        bboxes = ghosttext.page_get_textblocks(
            fspath(infile), pageno, xmltext=xmltext, height=height_pt
        )
        info['bboxes'] = bboxes

    info['has_text'] = _page_has_text(bboxes, width_pt, height_pt)

    # /UserUnit scales the page size; it defaults to 1 and is kept as Decimal.
    raw_userunit = page.get('/UserUnit', Decimal(1.0))
    userunit = (
        raw_userunit if isinstance(raw_userunit, Decimal) else Decimal(raw_userunit)
    )
    info['userunit'] = userunit

    units_per_inch = Decimal(72.0)
    info['width_inches'] = width_pt * userunit / units_per_inch
    info['height_inches'] = height_pt * userunit / units_per_inch

    # A page without a /Rotate entry is recorded as rotation 0.
    try:
        rotation = int(page['/Rotate'])
    except KeyError:
        rotation = 0
    info['rotate'] = rotation

    contents = list(
        _process_content_streams(
            pdf=pdf,
            container=page,
            shorthand=(userunit, 0, 0, userunit, 0, 0),
        )
    )

    info['has_vector'] = any(isinstance(item, VectorInfo) for item in contents)
    info['images'] = [item for item in contents if isinstance(item, ImageInfo)]

    images = info['images']
    if images:
        # Page resolution is the highest resolution of any image on it.
        xres = Decimal(max(img.xres for img in images))
        yres = Decimal(max(img.yres for img in images))
        info['xres'] = xres
        info['yres'] = yres
        info['width_pixels'] = int(round(xres * info['width_inches']))
        info['height_pixels'] = int(round(yres * info['height_inches']))

    return info
def _pdf_get_pageinfo(
    pdf, pageno: int, infile: PathLike, check_pages, detailed_analysis: bool
):
    """Build a dict of facts about page ``pageno`` of ``pdf``.

    Args:
        pdf: Opened PDF object; its ``pages`` and ``docinfo`` are read.
        pageno: Index into ``pdf.pages``.
        infile: Path of the PDF file, passed to ``get_page_analysis``.
        check_pages: Container of page numbers whose content streams
            should be inspected.
        detailed_analysis: When true and the page is in ``check_pages``,
            also run ``get_page_analysis`` to collect text boxes.

    Returns:
        A dict with the keys ``pageno``, ``images``, ``textboxes``,
        ``has_text``, ``userunit``, ``width_inches``, ``height_inches``,
        ``rotate`` and ``has_vector``. For pages not in ``check_pages``,
        ``has_vector``, ``has_text`` and ``images`` are ``None``
        ("no information"). ``dpi``, ``width_pixels`` and
        ``height_pixels`` are present only when at least one image was
        found.

    Raises:
        NotImplementedError: If ``_process_content_streams`` yields an
            object that is not a ``VectorMarker``, ``TextMarker`` or
            ``ImageInfo``.
    """
    info: Dict[str, Any] = {'pageno': pageno, 'images': []}

    page = pdf.pages[pageno]

    # MediaBox corners as Decimals: indices 0/1 and 2/3 are opposite corners.
    corners = list(map(Decimal, page.MediaBox.as_list()))
    width_pt = corners[2] - corners[0]
    height_pt = corners[3] - corners[1]

    examine_page = pageno in check_pages

    if not (examine_page and detailed_analysis):
        info['textboxes'] = []
        info['has_text'] = None  # i.e. "no information"
    else:
        creator = str(pdf.docinfo.get('/Creator'))
        miner = get_page_analysis(infile, pageno, creator.startswith('PScript5'))
        info['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
        info['has_text'] = _page_has_text(
            (textbox.bbox for textbox in info['textboxes']), width_pt, height_pt
        )

    # /UserUnit scales the page size; it defaults to 1 and is kept as Decimal.
    raw_userunit = page.get('/UserUnit', Decimal(1.0))
    userunit = (
        raw_userunit if isinstance(raw_userunit, Decimal) else Decimal(raw_userunit)
    )
    info['userunit'] = userunit

    units_per_inch = Decimal(72.0)
    info['width_inches'] = width_pt * userunit / units_per_inch
    info['height_inches'] = height_pt * userunit / units_per_inch

    # A page without a /Rotate entry is recorded as rotation 0.
    try:
        rotation = int(page['/Rotate'])
    except KeyError:
        rotation = 0
    info['rotate'] = rotation

    if not examine_page:
        info['has_vector'] = None  # i.e. "no information"
        info['has_text'] = None
        info['images'] = None
    else:
        info['has_vector'] = False
        # NOTE(review): this resets the 'has_text' value computed by
        # _page_has_text above, so only TextMarker objects decide the final
        # result for checked pages -- confirm this is intended.
        info['has_text'] = False
        info['images'] = []
        for item in _process_content_streams(
            pdf=pdf,
            container=page,
            shorthand=(userunit, 0, 0, userunit, 0, 0),
        ):
            if isinstance(item, VectorMarker):
                info['has_vector'] = True
                continue
            if isinstance(item, TextMarker):
                info['has_text'] = True
                continue
            if isinstance(item, ImageInfo):
                info['images'].append(item)
                continue
            raise NotImplementedError()

    images = info['images']
    if images:
        # Page resolution is the per-axis maximum over all images found.
        dpi = Resolution(0.0, 0.0).take_max(img.dpi for img in images)
        info['dpi'] = dpi
        info['width_pixels'] = int(round(dpi.x * float(info['width_inches'])))
        info['height_pixels'] = int(round(dpi.y * float(info['height_inches'])))

    return info
def _gather_pageinfo(
    self,
    pdf: Pdf,
    pageno: int,
    infile: PathLike,
    check_pages: Container[int],
    detailed_analysis: bool,
):
    """Populate this object's private attributes for page ``pageno``.

    Args:
        pdf: Opened PDF object; its ``pages`` and ``docinfo`` are read.
        pageno: Index into ``pdf.pages``.
        infile: Path of the PDF file, passed to ``get_page_analysis``.
        check_pages: Container of page numbers whose content streams
            should be inspected.
        detailed_analysis: When true and the page is in ``check_pages``,
            also run ``get_page_analysis`` to collect text boxes.

    Sets ``_textboxes``, ``_has_text``, ``_userunit``, ``_width_inches``,
    ``_height_inches``, ``_rotate``, ``_has_vector``, ``_images`` and
    ``_dpi``. For pages not in ``check_pages``, ``_has_vector``,
    ``_has_text`` and ``_images`` are ``None`` ("no information").
    ``_width_pixels`` and ``_height_pixels`` are assigned only when at
    least one image was found; otherwise ``_dpi`` stays ``None``.

    Raises:
        NotImplementedError: If ``_process_content_streams`` yields an
            object that is not a ``VectorMarker``, ``TextMarker`` or
            ``ImageInfo``.
    """
    page = pdf.pages[pageno]

    # MediaBox corners as Decimals: indices 0/1 and 2/3 are opposite corners.
    corners = list(map(Decimal, page.MediaBox.as_list()))
    width_pt = corners[2] - corners[0]
    height_pt = corners[3] - corners[1]

    examine_page = pageno in check_pages

    if not (examine_page and detailed_analysis):
        self._textboxes = []
        self._has_text = None  # i.e. "no information"
    else:
        creator = str(pdf.docinfo.get('/Creator'))
        miner = get_page_analysis(infile, pageno, creator.startswith('PScript5'))
        self._textboxes = list(simplify_textboxes(miner, get_text_boxes))
        self._has_text = _page_has_text(
            (textbox.bbox for textbox in self._textboxes), width_pt, height_pt
        )

    # /UserUnit scales the page size; it defaults to 1 and is kept as Decimal.
    raw_userunit = page.get('/UserUnit', Decimal(1.0))
    userunit = (
        raw_userunit if isinstance(raw_userunit, Decimal) else Decimal(raw_userunit)
    )
    self._userunit = userunit

    units_per_inch = Decimal(72.0)
    self._width_inches = width_pt * userunit / units_per_inch
    self._height_inches = height_pt * userunit / units_per_inch

    # A page without a /Rotate entry is recorded as rotation 0.
    try:
        rotation = int(page['/Rotate'])
    except KeyError:
        rotation = 0
    self._rotate = rotation

    if not examine_page:
        self._has_vector = None  # i.e. "no information"
        self._has_text = None
        self._images = None
    else:
        self._has_vector = False
        # NOTE(review): this resets the _has_text value computed by
        # _page_has_text above, so only TextMarker objects decide the final
        # result for checked pages -- confirm this is intended.
        self._has_text = False
        self._images = []
        for item in _process_content_streams(
            pdf=pdf,
            container=page,
            shorthand=(userunit, 0, 0, userunit, 0, 0),
        ):
            if isinstance(item, VectorMarker):
                self._has_vector = True
                continue
            if isinstance(item, TextMarker):
                self._has_text = True
                continue
            if isinstance(item, ImageInfo):
                self._images.append(item)
                continue
            raise NotImplementedError()

    self._dpi = None
    if self._images:
        # Page resolution is the per-axis maximum over all images found.
        dpi = Resolution(0.0, 0.0).take_max(img.dpi for img in self._images)
        self._dpi = dpi
        self._width_pixels = int(round(dpi.x * float(self._width_inches)))
        self._height_pixels = int(round(dpi.y * float(self._height_inches)))