示例#1
0
 def clip_page(cls, page:fitz.Page, bbox:fitz.Rect, zoom:float=3.0):
     '''Render the ``bbox`` area of ``page`` and convert the pixmap to a raw dict.

     ``zoom`` is used as the scale factor on both axes of the rendering
     matrix. The rendered pixmap and ``bbox`` are passed to
     ``cls.to_raw_dict`` and its result is returned.
     '''
     # A uniform scaling matrix is used to improve the rendering resolution:
     # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
     # - https://github.com/pymupdf/PyMuPDF/issues/181
     scale = fitz.Matrix(zoom, zoom)
     pixmap = page.getPixmap(clip=bbox, matrix=scale) # type: fitz.Pixmap
     return cls.to_raw_dict(pixmap, bbox)
示例#2
0
    def clip_page(cls,
                  page: fitz.Page,
                  bbox: fitz.Rect = None,
                  zoom: float = 3.0):
        '''Render the page area ``bbox`` (entire page by default) with text hidden.

        The content streams of the page are rewritten through the parent
        document before rendering, so the text stays hidden in that document
        object afterwards. The pixmap and the effective clip area are passed
        to ``cls.to_raw_dict`` and its result is returned.
        '''
        # Text is hidden by appending ``3 Tr`` after the text operators below.
        # ``Tr`` sets the text rendering mode; mode 3 neither fills nor strokes
        # the text, i.e. it becomes invisible. Read more:
        # - https://github.com/pymupdf/PyMuPDF/issues/257
        # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
        hidden_text_patches = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent
        for xref in page._getContents():
            content = document._getXrefStream(xref)
            # plain byte substitution, applied in the listed order
            for operator, patched in hidden_text_patches:
                content = content.replace(operator, patched)
            document._updateStream(xref, content)

        # Restrict the requested area to the page itself.
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        # A uniform scaling matrix is used to improve the rendering resolution:
        # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        # - https://github.com/pymupdf/PyMuPDF/issues/181
        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
        return cls.to_raw_dict(pixmap, clip_area)
示例#3
0
    def clip_page(cls,
                  page: fitz.Page,
                  bbox: fitz.Rect = None,
                  zoom: float = 3.0):
        """Render a page area with its text hidden and convert it to a raw dict.

        The content streams of the page are rewritten through the parent
        document before rendering, so the text stays hidden in that document
        object afterwards.

        Args:
            page (fitz.Page): pdf page to extract.
            bbox (fitz.Rect, optional): Target area to clip; it is intersected
                with the page rectangle. Defaults to None, i.e. entire page.
            zoom (float, optional): Scale factor applied on both axes to
                improve resolution. Defaults to 3.0.

        Returns:
            dict: The result of ``cls.to_raw_dict`` for the rendered pixmap
            and the effective clip area.
        """
        # Text is hidden by appending ``3 Tr`` after the text operators below.
        # ``Tr`` sets the text rendering mode; mode 3 neither fills nor strokes
        # the text, i.e. it becomes invisible. Read more:
        # - https://github.com/pymupdf/PyMuPDF/issues/257
        # - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/pdf_reference_archives/PDFReference.pdf
        hidden_text_patches = (
            (b'BT', b'BT 3 Tr'),
            (b'Tm', b'Tm 3 Tr'),
            (b'Td', b'Td 3 Tr'),
        )
        document = page.parent
        for xref in page.get_contents():
            content = document.xrefStream(xref)
            # plain byte substitution, applied in the listed order
            for operator, patched in hidden_text_patches:
                content = content.replace(operator, patched)
            document.updateStream(xref, content)

        # Restrict the requested area to the page itself.
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        # A uniform scaling matrix is used to improve the rendering resolution:
        # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        # - https://github.com/pymupdf/PyMuPDF/issues/181
        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
        return cls.to_raw_dict(pixmap, clip_area)
示例#4
0
def render_fitz_page(
    page: fitz.Page, zoom: float, pixel_ratio: float, clip: fitz.Rect = None
) -> QtGui.QPixmap:
    """Render a page (or only its ``clip`` area) into a Qt image object.

    The page is rasterized with ``zoom * pixel_ratio`` as the scale factor on
    both axes. The pixmap samples are loaded into a PIL image, which is then
    wrapped with ``ImageQt.ImageQt`` and given ``pixel_ratio`` as its device
    pixel ratio.

    NOTE(review): the object returned is the ``ImageQt.ImageQt`` instance,
    while the annotation says ``QtGui.QPixmap`` -- confirm what callers expect.
    """
    factor = zoom * pixel_ratio
    scale = fitz.Matrix(factor, factor)
    pixmap = page.getPixmap(matrix=scale, clip=clip)

    # Pick the PIL mode that matches the presence of an alpha channel.
    if pixmap.alpha:
        mode = "RGBA"
    else:
        mode = "RGB"

    size = [pixmap.width, pixmap.height]
    pil_image = Image.frombytes(mode, size, pixmap.samples)
    qt_image = ImageQt.ImageQt(pil_image)
    qt_image.setDevicePixelRatio(pixel_ratio)
    return qt_image
示例#5
0
 def to_image(self, page: fitz.Page):
     '''Clip the area of this object from ``page`` and describe it as an image block dict.

     Intended for the case that this object represents vector graphic paths.
     The area ``self.bbox`` is rendered at the default resolution and stored
     as PNG data; ``width`` and ``height`` are taken from the bbox itself.
     '''
     area = self.bbox
     pixmap = page.getPixmap(clip=area)
     png_data = pixmap.getImageData(output="png")

     block = {
         'type': 1,
         'bbox': tuple(area),
         'ext': 'png',
         'width': area.width,
         'height': area.height,
         'image': png_data
     }
     return block
示例#6
0
    def extract_vector_graphics(cls,
                                page: fitz.Page,
                                exclude_areas: list,
                                clip_image_res_ratio: float = 3.0):
        """Extract vector graphics by clipping the page areas they occupy.

        Contours are obtained from ``cls._detect_svg_contours``, grouped by
        bbox intersection, and the bbox of every group is rendered as a bitmap.

        Args:
            page (fitz.Page): pdf page to extract images.
            exclude_areas (list): A list of bbox-like ``(x0, y0, x1, y1)`` area to exclude,
                e.g. raster image area, table area.
            clip_image_res_ratio (float, optional): Scale factor applied on both axes
                when rendering each clipped bitmap. Defaults to 3.0.

        Returns:
            list: One ``cls._to_raw_dict`` result per contour group.

        .. note::
            Contours for vector graphics are detected first with ``opencv-python``.
        """
        def intersection(a, b):
            # grouping criterion: the intersection of the two bboxes
            return a.bbox & b.bbox

        def clip_group(group):
            # render the area covered by a contour group and convert it
            area = group.bbox
            scale = fitz.Matrix(clip_image_res_ratio, clip_image_res_ratio)
            pixmap = page.getPixmap(clip=area, matrix=scale)
            return cls._to_raw_dict(pixmap, area)

        # detect contours, then merge those related by the criterion above
        detected = cls._detect_svg_contours(page, exclude_areas)
        merged = detected.group(intersection)

        # clip one image per group
        return [clip_group(group) for group in merged]
示例#7
0
    def _clip_page(cls,
                   page: fitz.Page,
                   bbox: fitz.Rect = None,
                   zoom: float = 3.0):
        """Render a page area as pixmap after hiding the page text.

        Args:
            page (fitz.Page): pdf page to extract.
            bbox (fitz.Rect, optional): Target area to clip; it is intersected
                with the page rectangle. Defaults to None, i.e. entire page.
            zoom (float, optional): Scale factor applied on both axes to
                improve resolution. Defaults to 3.0.

        Returns:
            fitz.Pixmap: The extracted pixmap.
        """
        # text is hidden first, so it does not show up in the rendered pixmap
        cls._hide_page_text(page)

        # restrict the requested area to the page itself
        if bbox is None:
            clip_area = page.rect
        else:
            clip_area = bbox & page.rect

        # A uniform scaling matrix is used to improve the rendering resolution:
        # - https://pymupdf.readthedocs.io/en/latest/faq.html#how-to-increase-image-resolution
        # - https://github.com/pymupdf/PyMuPDF/issues/181
        scale = fitz.Matrix(zoom, zoom)
        pixmap = page.getPixmap(clip=clip_area, matrix=scale)  # type: fitz.Pixmap
        return pixmap