def paths_from_stream(page: fitz.Page):
    ''' Get paths, e.g. highlight, underline and table borders, from page source contents.
        ---
        Args:
        - page: fitz.Page, current page

        Returns:
        - list: one item per painted path, in painting order; each item is the value
          returned by `_fill_rect_path()` (fill) or `_stroke_path()` (stroke).

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            426 630 425 630 422 630 c
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation,
           here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path
        - `c` draw cubic Bezier curve with given control points

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)
        - a Bezier curve with control points (249,322), (426,630), (425,630), (422,630)

        Malformed input is tolerated rather than raised on: `l`/`c`/`v`/`y` without
        a current path and `Q` without a matching `q` are ignored.

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    # NOTE: transformation matrix converts PDF CS to UNROTATED PyMuPDF page CS,
    #       so need further rotation transformation to the real page CS (applied in Object BBox)
    # https://github.com/pymupdf/PyMuPDF/issues/619
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really
    # hard work for now. So, consider device color space only like DeviceGray, DeviceRGB,
    # DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(page)

    # working stroking / filling colors, both black initially
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # working stroke width
    Wd = 0.0

    # working clipping path (recorded only, see the `W` operator below)
    Wcp = []

    # Graphics state stack for `q` / `Q`. A single stack of tuples
    # (CS matrix, fill color, stroke color, stroke width, clip path) keeps all
    # saved components at the same depth, so they cannot get out of step.
    saved_states = []

    # collecting paths: each path is a list of points
    paths = []

    # final results
    res = []

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    for line in xref_stream.splitlines():
        words = line.split()
        if not words:
            continue

        op = words[-1]        # operator always at the end after page.cleanContents()
        operands = words[:-1]

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #     color_space_name cs  # specify color space
        #     c1 c2 ... SC/SCN     # components under defined color space
        if op.upper() == 'CS':
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC':  # c1 c2 ... cn SC
            c = _RGB_from_color_components(operands, device_space)
            if op == 'sc':    # non-stroking color
                Wcf = c
            else:             # stroking color
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN':  # c1 c2 ... cn [name] SCN
            # drop the optional trailing pattern name; the emptiness check
            # avoids indexing past the start when no operand is given
            components = operands
            if components and not utils.is_number(components[-1]):
                components = components[:-1]
            c = _RGB_from_color_components(components, device_space)
            if op == 'scn':   # non-stroking color
                Wcf = c
            else:             # stroking color
                Wcs = c

        # - DeviceGray space, equal to:
        #     /DeviceGray cs
        #     c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            if op == 'g':     # nonstroking color, i.e. filling color here
                Wcf = utils.RGB_value((g, g, g))
            else:             # stroking color
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, operands)
            if op == 'rg':    # nonstroking color
                Wcf = utils.RGB_value((r, g, b))
            else:             # stroking color
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, operands)
            if op == 'k':     # nonstroking color
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            else:             # stroking color
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, operands))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, color, stroke width and clip path here
        elif op == 'q':  # save
            saved_states.append((fitz.Matrix(WCS), Wcf, Wcs, Wd, Wcp))

        elif op == 'Q':  # restore
            # an unbalanced `Q` has nothing to restore: keep the current state
            if saved_states:
                WCS, Wcf, Wcs, Wd, Wcp = saved_states.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #
            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, operands)
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, operands)
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            if not paths:
                continue  # no current point to draw from
            x0, y0 = map(float, operands)
            paths[-1].append((x0, y0))

        # path: c, v, y -> cubic Bezier curve with control points
        elif op in ('c', 'v', 'y'):
            if not paths:
                continue  # no current point to draw from
            coords = list(map(float, operands))
            P = [(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)]
            x0, y0 = paths[-1][-1]

            # x1 y1 x2 y2 x3 y3 c -> (x1,y1), (x2,y2) as control points
            if op == 'c':
                P.insert(0, (x0, y0))

            # x2 y2 x3 y3 v -> (x0,y0), (x2,y2) as control points
            elif op == 'v':
                P.insert(0, (x0, y0))
                P.insert(0, (x0, y0))

            # x1 y1 x3 y3 y -> (x1,y1), (x3,y3) as control points
            else:
                P.insert(0, (x0, y0))
                P.append(P[-1])

            # calculate points on Bezier curve with parametric equation
            bezier = _bezier_paths(P, segments=5)
            paths[-1].extend(bezier)

        # close the current subpath only (PDF Reference Table 4.9)
        elif op == 'h':
            if paths:
                _close_path(paths[-1])

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # (close and) stroke the path
        elif op.upper() == 'S':
            # `s` has the same effect as `h S`: close the current subpath only
            if op == 's' and paths:
                _close_path(paths[-1])

            # stroke path
            for path in paths:
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # fill the path
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

            # reset path
            paths = []

        # close, fill and stroke the path
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

                # stroke path
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # TODO: clip the path
        # https://stackoverflow.com/questions/17003171/how-to-identify-which-clip-paths-apply-to-a-path-or-fill-in-pdf-vector-graphics
        elif op in ('W', 'W*'):
            # the clipping path is only recorded, it is not applied to later paths
            Wcp = paths[-1] if paths else []
            paths = []

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return res
def rects_from_stream(doc: fitz.Document, page: fitz.Page):
    ''' Get rectangle shapes, e.g. highlight, underline, table borders, from page source contents.
        ---
        Args:
        - doc: fitz.Document representing the pdf file
        - page: fitz.Page, current page

        Returns:
        - list: the truthy results of `_fill_rect_path()` for filled paths plus the
          items returned by `_stroke_path()` for stroked paths, in painting order.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation,
           here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)

        Malformed input is tolerated rather than raised on: blank lines, `l` without
        a current path and `Q` without a matching `q` are ignored.

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really
    # hard work for now. So, consider device color space only like DeviceGray, DeviceRGB,
    # DeviceCMYK, and set black for all others.
    device_space = True
    color_spaces = _check_device_cs(doc, page)

    # working stroking / filling colors, both black initially
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # working stroke width
    Wd = 0.0

    # Graphics state stack for `q` / `Q`: `q` operators may nest, so every save
    # needs its own slot (CS matrix, fill color, stroke color, stroke width).
    saved_states = []

    # In addition to lines, rectangles are also processed with border path
    paths = []  # a list of path, each path is a list of points

    rects = []

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    for line in xref_stream.splitlines():
        words = line.split()
        if not words:
            continue  # blank line: no operator to read

        op = words[-1]        # operator always at the end after page.cleanContents()
        operands = words[:-1]

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #     color_space_name cs  # specify color space
        #     c1 c2 ... SC/SCN     # components under defined color space
        if op.upper() == 'CS':
            Wcs = utils.RGB_value((0.0, 0.0, 0.0))
            Wcf = utils.RGB_value((0.0, 0.0, 0.0))

            # Consider normal device cs only
            device_space = color_spaces.get(words[0], False)

        # - set color: color components under specified color space
        elif op.upper() == 'SC':  # c1 c2 ... cn SC
            c = _RGB_from_color_components(operands, device_space)
            if op == 'sc':    # nonstroking color
                Wcf = c
            else:             # stroking color
                Wcs = c

        # - set color: color components under specified color space
        elif op.upper() == 'SCN':  # c1 c2 ... cn [name] SCN
            # drop the optional trailing pattern name; the emptiness check
            # avoids indexing past the start when no operand is given
            components = operands
            if components and not utils.is_number(components[-1]):
                components = components[:-1]
            c = _RGB_from_color_components(components, device_space)
            if op == 'scn':   # nonstroking color
                Wcf = c
            else:             # stroking color
                Wcs = c

        # - DeviceGray space, equal to:
        #     /DeviceGray cs
        #     c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            if op == 'g':     # nonstroking color, i.e. filling color here
                Wcf = utils.RGB_value((g, g, g))
            else:             # stroking color
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, operands)
            if op == 'rg':    # nonstroking color
                Wcf = utils.RGB_value((r, g, b))
            else:             # stroking color
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, operands)
            if op == 'k':     # nonstroking color
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            else:             # stroking color
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, operands))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, color and stroke width here
        elif op == 'q':  # save
            saved_states.append((fitz.Matrix(WCS), Wcf, Wcs, Wd))

        elif op == 'Q':  # restore
            # an unbalanced `Q` has nothing to restore: keep the current state
            if saved_states:
                WCS, Wcf, Wcs, Wd = saved_states.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #
            # (x, y, w, h) before this line
            x0, y0, w, h = map(float, operands)
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, operands)
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            if not paths:
                continue  # no current point to draw from
            x0, y0 = map(float, operands)
            paths[-1].append((x0, y0))

        # close the current subpath only (PDF Reference Table 4.9)
        elif op == 'h':
            if paths:
                _close_path(paths[-1])

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # (close and) stroke the path
        elif op.upper() == 'S':
            # `s` has the same effect as `h S`: close the current subpath only
            if op == 's' and paths:
                _close_path(paths[-1])

            # stroke path
            for path in paths:
                rects_ = _stroke_path(path, WCS, Wcs, Wd, matrix)
                rects.extend(rects_)

            # reset path
            paths = []

        # fill the path
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                rect = _fill_rect_path(path, WCS, Wcf, matrix)
                if rect:
                    rects.append(rect)

            # reset path
            paths = []

        # close, fill and stroke the path
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                rect = _fill_rect_path(path, WCS, Wcf, matrix)
                if rect:
                    rects.append(rect)

                # stroke path
                rects_ = _stroke_path(path, WCS, Wcs, Wd, matrix)
                rects.extend(rects_)

            # reset path
            paths = []

        # TODO: clip the path
        elif op in ('W', 'W*'):
            pass

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return rects