def init(self, page: fitz.Page) -> Layout:
    '''Create the layout object of a page and collect its rectangle shapes.

        Text blocks come from the raw dict of the page. Rectangle shapes are gathered
        from two sources: the page content streams and the page annotations.
        ---
        Args:
        - page: fitz.Page, current page

        The created layout is stored in `self._layout` and returned as well.
    '''
    # text blocks: layout object built on the raw dict of the page
    page_dict = page.getText('rawdict')
    self._layout = Layout(page_dict)

    # shapes stored in the page source: they are generally converted from docx,
    # e.g. highlight, underline, and differ from PDF comments like highlight, rectangle.
    if not page._isWrapped:
        page._wrapContents()

    # matrix transforming PDF coordinates to PyMuPDF coordinates (PyMuPDF>=1.17.0)
    pdf_to_mupdf = page.transformationMatrix

    for xref in page.getContents():
        stream_text = self._doc_pdf._getXrefStream(xref).decode(
            encoding="ISO-8859-1")
        self._layout.rects.from_stream(stream_text, pdf_to_mupdf)

    # shapes stored as annotations (comment shapes): highlight, underline and
    # strike-through-line are the ones of interest.
    comment_shapes = page.annots()
    self._layout.rects.from_annotations(comment_shapes)

    # nothing to plot unless debugging
    if not self.debug_mode:
        return self._layout

    # start a new section of the debug document for the current pdf page
    new_page_section(
        self._doc_debug,
        self._layout.width,
        self._layout.height,
        f'Page {page.number}')

    # plot the initial layout: text blocks first, then rectangle shapes
    initial_plots = (
        ('Original Text Blocks', PlotControl.LAYOUT),
        ('Original Rectangle Shapes', PlotControl.SHAPE),
    )
    for title, plot_key in initial_plots:
        self._layout.plot(self._doc_debug, title, key=plot_key)

    return self._layout
def paths_from_annotations(page: fitz.Page):
    ''' Get shapes, e.g. Line, Square, Highlight, from annotations(comment shapes) in PDF page.
        ---
        Args:
        - page: fitz.Page, current page

        Returns a list of paths, i.e. the values returned by `_add_stroke_line`,
        `_add_stroke_rect` and `_add_fill_rect`, in annotation order.

        There are stroke and fill properties for each shape, representing border and filling
        area respectively. So, a square annotation with both stroke and fill contributes two
        paths here: one stroke rectangle (the border) and one fill rectangle (the filling area).

        Annotations of any other type than Line, Square, Highlight, Underline and StrikeOut
        are ignored.

        read more:
            - https://pymupdf.readthedocs.io/en/latest/annot.html
            - https://pymupdf.readthedocs.io/en/latest/vars.html#annotation-types
    '''
    # Considering the contributions to text format and table borders, only the
    # following annotation types (first item of `annot.type`) are processed.
    ANNOT_LINE = 3        # PDF_ANNOT_LINE
    ANNOT_SQUARE = 4      # PDF_ANNOT_SQUARE
    ANNOT_HIGHLIGHT = 8   # PDF_ANNOT_HIGHLIGHT
    ANNOT_UNDERLINE = 9   # PDF_ANNOT_UNDERLINE
    ANNOT_STRIKEOUT = 11  # PDF_ANNOT_STRIKEOUT

    res = []
    for annot in page.annots():

        # annot type, e.g. (8, 'Highlight')
        key = annot.type[0]

        # color, e.g. {'stroke': [1.0, 1.0, 0.0], 'fill': []}
        c = annot.colors
        sc = utils.RGB_value(c['stroke']) if c['stroke'] else None
        fc = utils.RGB_value(c['fill']) if c['fill'] else None

        # width
        w = annot.border.get('width', 1.0)  # width=-1 if not set
        w = 1.0 if w == -1 else w           # 1.0 by default

        # bbox
        rect = annot.rect

        # Line: a space of 1.5*w around each border
        #
        # +----------------------------+
        # |         space              |
        # |     +--------------+       |
        # |     |   border     | 1.5w  |
        # |     +--------------+       |
        # |         1.5w               |
        # +----------------------------+
        #
        # NOTE: the line is taken as the horizontal center line of the bbox.
        if key == ANNOT_LINE:
            x0 = rect.x0 + 1.5 * w
            x1 = rect.x1 - 1.5 * w
            y0 = y1 = (rect.y0 + rect.y1) / 2.0
            res.append(_add_stroke_line((x0, y0), (x1, y1), sc, w))

        # Square: a space of 0.5*w around each border
        # border rects and filling rects are to be extracted from original square
        #
        # +------------------------------------------+
        # |                space                     |
        # |      +----------------------------+      |
        # |      |         border             |      |
        # |      |     +--------------+       |      |
        # |      |     |     fill     |  w    | 0.5w |
        # |      |     +--------------+       |      |
        # |      |            w               |      |
        # |      +----------------------------+      |
        # |                  0.5w                    |
        # +------------------------------------------+
        #
        elif key == ANNOT_SQUARE:
            # stroke rectangle: center line of the border, i.e. 0.5w space + 0.5w half border
            if sc is not None:
                x0, y0 = rect.x0 + w, rect.y0 + w
                x1, y1 = rect.x1 - w, rect.y1 - w
                res.append(_add_stroke_rect((x0, y0), (x1, y1), sc, w))

            # fill rectangle: inside the border, i.e. 0.5w space + w border
            if fc is not None:
                d = 1.5 * w
                x0, y0 = rect.x0 + d, rect.y0 + d
                x1, y1 = rect.x1 - d, rect.y1 - d
                res.append(_add_fill_rect((x0, y0), (x1, y1), fc))

        # highlight, underline, strikethrough: no space
        # For these shapes, `annot.rect` is a combination of all sub-highlights, especially
        # when the highlight is continuous in multi-lines.
        # So, `annot.vertices` should be used here, i.e. vertices marked with `+` below.
        #          +------------------------+
        #          +------------------------+
        # +-----------+
        # +-----------+
        # NOTE: Though underline and strikethrough are just lines, the affected areas are same as
        # highlights, as illustrated above.
        #
        # https://github.com/pymupdf/PyMuPDF/issues/318
        #
        elif key in (ANNOT_HIGHLIGHT, ANNOT_UNDERLINE, ANNOT_STRIKEOUT):
            # guard against an annotation without vertices instead of failing on len(None)
            points = annot.vertices or []

            # four points in a group; an incomplete trailing group is ignored
            for i in range(len(points) // 4):
                # the code below treats them as upper-left, upper-right, lower-left, lower-right
                ul, ur, ll, lr = points[4 * i: 4 * i + 4]

                # highlight: whole bbox
                if key == ANNOT_HIGHLIGHT:
                    x0, y0 = ul
                    x1, y1 = lr

                    # NOTE: this is indeed a stroke for PyMuPDF -> no fill color but stroke color !!
                    res.append(_add_fill_rect((x0, y0), (x1, y1), sc))
                    continue

                # underline: bottom edge
                if key == ANNOT_UNDERLINE:
                    start, end = ll, lr

                # strikethrough: average of top and bottom edge
                else:
                    y_ = (ul[1] + ll[1]) / 2.0
                    start = ul[0], y_
                    end = ur[0], y_

                res.append(_add_stroke_line(start, end, sc, w))

    return res