def clip_page(cls, page:fitz.Page, bbox:fitz.Rect, zoom:float=3.0):
    '''Render the ``bbox`` area of ``page`` and convert the result to a raw dict.

    Args:
        page (fitz.Page): Page to render.
        bbox (fitz.Rect): Area of the page to clip.
        zoom (float, optional): Scale factor applied to both axes when
            rendering, used to raise the pixmap resolution. Defaults to 3.0.

    Returns:
        The value produced by ``cls.to_raw_dict`` for the rendered pixmap
        and ``bbox``.
    '''
    # A scaling matrix raises the resolution of the rendered pixmap, see:
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scale = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=bbox, matrix=scale) # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, bbox)
def clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    '''Render ``page`` with its text hidden and return the clipped area as a raw dict.

    The page's content streams are rewritten in the parent document so that
    text is invisible before rendering; this modification is not undone here.

    Args:
        page (fitz.Page): Page to render.
        bbox (fitz.Rect, optional): Area to clip; it is intersected with the
            page rectangle. Defaults to None, i.e. the entire page.
        zoom (float, optional): Scale factor applied to both axes when
            rendering. Defaults to 3.0.

    Returns:
        The value produced by ``cls.to_raw_dict`` for the rendered pixmap
        and the effective clip rectangle.
    '''
    # Hide text so that only the image content is clipped.
    # `3 Tr` sets the text rendering mode to 3: neither fill nor stroke the
    # text, i.e. invisible. It is inserted after each of the operators below.
    # Read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    document = page.parent
    for xref in page._getContents():
        content = document._getXrefStream(xref)
        for operator in (b'BT', b'Tm', b'Td'):
            content = content.replace(operator, operator + b' 3 Tr')
        document._updateStream(xref, content)

    # Restrict the clip area to the page itself.
    if bbox is None:
        clip = page.rect
    else:
        clip = bbox & page.rect

    # A scaling matrix raises the resolution of the rendered pixmap, see:
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scale = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=clip, matrix=scale) # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, clip)
def clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    """Render a page area with text hidden and return it as a raw dict.

    The content streams of ``page`` are rewritten in its parent document to
    make text invisible before rendering; the change is not reverted here.

    Args:
        page (fitz.Page): pdf page to extract.
        bbox (fitz.Rect, optional): Target area to clip, intersected with the
            page rectangle. Defaults to None, i.e. entire page.
        zoom (float, optional): Scale factor for both axes, used to improve
            resolution. Defaults to 3.0.

    Returns:
        dict: Raw dict of the extracted pixmap.
    """
    # Text is hidden so only the non-text content ends up in the pixmap.
    # The inserted `3 Tr` selects text rendering mode 3 (neither fill nor
    # stroke -> invisible). Read more:
    # - https://github.com/pymupdf/PyMuPDF/issues/257
    # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
    hidden_mode = b' 3 Tr'
    parent = page.parent
    for xref in page.get_contents():
        data = parent.xrefStream(xref)
        for text_op in (b'BT', b'Tm', b'Td'):
            data = data.replace(text_op, text_op + hidden_mode)
        parent.updateStream(xref, data)

    # Never clip outside of the page.
    if bbox is None:
        area = page.rect
    else:
        area = bbox & page.rect

    # Zooming via the matrix improves resolution:
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    zoom_matrix = fitz.Matrix(zoom, zoom)
    pixmap = page.getPixmap(clip=area, matrix=zoom_matrix) # type: fitz.Pixmap
    return cls.to_raw_dict(pixmap, area)
def render_fitz_page(
    page: fitz.Page, zoom: float, pixel_ratio: float, clip: fitz.Rect = None
) -> QtGui.QImage:
    """Render a PDF page (or a clipped part of it) to a Qt image.

    Args:
        page (fitz.Page): Page to render.
        zoom (float): Logical zoom factor.
        pixel_ratio (float): Device pixel ratio. The page is rendered at
            ``zoom * pixel_ratio`` and the same ratio is set on the
            returned image.
        clip (fitz.Rect, optional): Area of the page to render. Defaults to
            None, which is passed through to ``page.getPixmap`` unchanged.

    Returns:
        QtGui.QImage: The ``ImageQt.ImageQt`` object wrapping the rendered
        page. Note this is a ``QImage`` subclass, not a ``QPixmap``.
    """
    # Render at physical resolution: logical zoom times device pixel ratio.
    scale_ratio = zoom * pixel_ratio
    pix = page.getPixmap(matrix=fitz.Matrix(scale_ratio, scale_ratio), clip=clip)

    # Choose the PIL mode according to whether the pixmap has an alpha channel.
    mode = "RGBA" if pix.alpha else "RGB"
    pil_image = Image.frombytes(mode, (pix.width, pix.height), pix.samples)

    # Tell Qt the image is already scaled for the device pixel ratio.
    qt_image = ImageQt.ImageQt(pil_image)
    qt_image.setDevicePixelRatio(pixel_ratio)
    return qt_image
def to_image(self, page: fitz.Page):
    '''Render the area covered by ``self.bbox`` on ``page`` as an image block dict.

    Args:
        page (fitz.Page): Page containing this object.

    Returns:
        dict: Image block dict with keys ``type`` (always 1), ``bbox``,
        ``ext`` (always ``'png'``), ``width``, ``height`` (both taken from
        the bbox) and ``image`` (PNG data of the rendered area).
    '''
    area = self.bbox
    pixmap = page.getPixmap(clip=area)

    # Geometry comes from the bbox; the image data comes from the pixmap.
    block = {
        'type': 1,
        'bbox': tuple(area),
        'ext': 'png',
        'width': area.width,
        'height': area.height,
    }
    block['image'] = pixmap.getImageData(output="png")
    return block
def extract_vector_graphics(cls, page: fitz.Page, exclude_areas: list, clip_image_res_ratio: float = 3.0):
    """Extract vector graphics by clipping the page areas where they are detected.

    Args:
        page (fitz.Page): pdf page to extract images.
        exclude_areas (list): A list of bbox-like ``(x0, y0, x1, y1)`` area to
            exclude, e.g. raster image area, table area.
        clip_image_res_ratio (float, optional): Resolution ratio of clipped
            bitmap, applied to both axes. Defaults to 3.0.

    Returns:
        list: One raw dict (from ``cls._to_raw_dict``) per group of contours.

    .. note::
        Contours for vector graphics are detected first with ``opencv-python``.
    """
    def intersection(a, b):
        # Grouping criterion handed to ``group``: the intersection of two bboxes.
        return a.bbox & b.bbox

    def clip_group(group):
        # Render the area covered by one group at the requested resolution.
        area = group.bbox
        scale = fitz.Matrix(clip_image_res_ratio, clip_image_res_ratio)
        pixmap = page.getPixmap(clip=area, matrix=scale)
        return cls._to_raw_dict(pixmap, area)

    # Detect contours, then merge them into groups.
    detected = cls._detect_svg_contours(page, exclude_areas)
    grouped = detected.group(intersection)

    # Clip one image per group.
    return [clip_group(group) for group in grouped]
def _clip_page(cls, page: fitz.Page, bbox: fitz.Rect = None, zoom: float = 3.0):
    """Render a page area as a pixmap after hiding the page text.

    Args:
        page (fitz.Page): pdf page to extract.
        bbox (fitz.Rect, optional): Target area to clip, intersected with the
            page rectangle. Defaults to None, i.e. entire page.
        zoom (float, optional): Scale factor for both axes, used to improve
            resolution. Defaults to 3.0.

    Returns:
        fitz.Pixmap: The extracted pixmap.
    """
    # Text is hidden first so it does not show up in the rendered pixmap.
    cls._hide_page_text(page)

    # Never clip outside of the page.
    if bbox is None:
        area = page.rect
    else:
        area = bbox & page.rect

    # Zooming via the matrix improves resolution:
    # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
    # - https://github.com/pymupdf/PyMuPDF/issues/181
    scale = fitz.Matrix(zoom, zoom)
    return page.getPixmap(clip=area, matrix=scale) # type: fitz.Pixmap