示例#1
0
文件: pdf.py 项目: filips123/pdf2docx
def paths_from_stream(page: fitz.Page):
    ''' Get paths, e.g. highlight, underline and table borders, from page source contents.
        ---
        Args:
        - page: fitz.Page, current page

        Returns:
        - a list collecting, in drawing order, whatever `_stroke_path()` and
          `_fill_rect_path()` return for every painted path

        NOTE: `page.cleanContents()` is called first, i.e. the page contents are rewritten
        in place before they are read.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            426 630 425 630 422 630 c
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation, here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path
        - `c` draw cubic Bezier curve with given control points

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)
        - a Bezier curve with control points (249,322), (426,630), (425,630), (422,630)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    # NOTE: transformation matrix converts PDF CS to UNROTATED PyMuPDF page CS,
    #       so need further rotation transformation to the real page CS (applied in Object BBox)
    # https://github.com/pymupdf/PyMuPDF/issues/619
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really
    # hard work for now. So, consider device color space only like DeviceGray, DeviceRGB,
    # DeviceCMYK, and set black for all others.
    # Stroking and non-stroking operations each have their own current color space, so the
    # "is it a device space" flag is tracked separately for both.
    color_spaces = _check_device_cs(page)
    stroke_device_space = True
    fill_device_space = True

    # - working stroking / filling color
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # working stroke width
    Wd = 0.0

    # working clipping path (recorded only, clipping itself is not applied yet)
    Wcp = []

    # graphics state stack for `q` / `Q`: one tuple per saved state
    saved_states = []

    # collecting paths: each path is a list of points
    paths = []

    res = []  # final results

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    for line in xref_stream.splitlines():

        words = line.split()
        if not words:
            continue

        op = words[-1]  # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #   color_space_name CS/cs   # specify stroking / non-stroking color space
        #   c1 c2 ... SC/SCN         # components under defined color space
        if op.upper() == 'CS':
            # Consider normal device cs only
            is_device = color_spaces.get(words[0], False)

            # only the color belonging to the selected space is reset (to black);
            # the other one must survive, e.g. `1 0 0 RG` followed by `/Cs1 cs`
            if op == 'cs':
                Wcf = utils.RGB_value((0.0, 0.0, 0.0))
                fill_device_space = is_device
            else:
                Wcs = utils.RGB_value((0.0, 0.0, 0.0))
                stroke_device_space = is_device

        # - set color: color components under specified color space
        #   c1 c2 ... cn SC  or  c1 c2 ... cn [name] SCN
        elif op.upper() in ('SC', 'SCN'):
            operands = words[0:-1]

            # SCN may carry a trailing pattern name, which is not a color component
            if op.upper() == 'SCN' and operands and not utils.is_number(operands[-1]):
                operands = operands[0:-1]

            # non-stroking color
            if op in ('sc', 'scn'):
                Wcf = _RGB_from_color_components(operands, fill_device_space)
            # stroking color
            else:
                Wcs = _RGB_from_color_components(operands, stroke_device_space)

        # - DeviceGray space, equal to:
        # /DeviceGray cs
        # c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, words[0:-1])

            # nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, words[0:-1])
            # nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, color, line width and clip path here
        elif op == 'q':  # save
            saved_states.append((
                fitz.Matrix(WCS),  # copy as new matrix
                Wcf, Wcs, Wd, Wcp,
                fill_device_space, stroke_device_space,
            ))

        elif op == 'Q':  # restore
            # an unbalanced `Q` (nothing saved) is ignored rather than crashing
            if saved_states:
                (WCS, Wcf, Wcs, Wd, Wcp,
                 fill_device_space, stroke_device_space) = saved_states.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #
            x0, y0, w, h = map(float, words[0:-1])
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            # no current point defined (malformed stream): nothing to connect to
            if not paths:
                continue
            x0, y0 = map(float, words[0:-1])
            paths[-1].append((x0, y0))

        # path: c, v, y -> cubic Bezier curve with control points
        elif op in ('c', 'v', 'y'):
            # a curve needs a current point to start from
            if not paths or not paths[-1]:
                continue

            coords = list(map(float, words[0:-1]))
            P = [(coords[i], coords[i + 1]) for i in range(0, len(coords), 2)]
            start = paths[-1][-1]

            # x1 y1 x2 y2 x3 y3 c -> (x1,y1), (x2,y2) as control points
            if op == 'c':
                P.insert(0, start)

            # x2 y2 x3 y3 v -> (x0,y0), (x2,y2) as control points
            elif op == 'v':
                P.insert(0, start)
                P.insert(0, start)

            # x1 y1 x3 y3 y -> (x1,y1), (x3,y3) as control points
            else:
                P.insert(0, start)
                P.append(P[-1])

            # calculate points on Bezier curve with parametric equation
            bezier = _bezier_paths(P, segments=5)
            paths[-1].extend(bezier)

        # close the current subpath only; earlier subpaths stay as they are
        elif op == 'h':
            if paths:
                _close_path(paths[-1])

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # stroke the path; `s` is equivalent to `h S`, i.e. close the current subpath first
        elif op.upper() == 'S':
            if op == 's' and paths:
                _close_path(paths[-1])

            # stroke path
            for path in paths:
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # fill the path
        # NOTE: compare the parsed operator rather than the raw line, so surrounding
        # whitespace can not hide the operator
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

            # reset path
            paths = []

        # close, fill and stroke the path
        # NOTE(review): every subpath is closed before stroking, also for `B` / `B*`
        # which by specification do not close open subpaths -- kept from the original.
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                p = _fill_rect_path(path, WCS, Wcf, matrix)
                res.append(p)

                # stroke path
                p = _stroke_path(path, WCS, Wcs, Wd, matrix)
                res.append(p)

            # reset path
            paths = []

        # TODO: clip the path
        # https://stackoverflow.com/questions/17003171/how-to-identify-which-clip-paths-apply-to-a-path-or-fill-in-pdf-vector-graphics
        elif op in ('W', 'W*'):
            # Only record the clipping path. The current path is NOT consumed here: it is
            # terminated by the painting operator that follows (`n`, `f`, `S`, ...), so a
            # sequence like `re W f` still fills the rectangle.
            Wcp = paths[-1] if paths else []

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return res
示例#2
0
def rects_from_stream(doc: fitz.Document, page: fitz.Page):
    ''' Get rectangle shapes, e.g. highlight, underline, table borders, from page source contents.
        ---
        Args:
        - doc: fitz.Document representing the pdf file
        - page: fitz.Page, current page

        Returns:
        - a list of rectangles: the truthy results of `_fill_rect_path()` plus all items
          returned by `_stroke_path()`, in drawing order

        NOTE: `page.cleanContents()` is called first, i.e. the page contents are rewritten
        in place before they are read.

        The page source is represented as contents of stream object. For example,
        ```
            /P<</MCID 0>> BDC
            ...
            1 0 0 1 90.0240021 590.380005 cm
            ...
            1 1 0 rg # or 0 g
            ...
            285.17 500.11 193.97 13.44 re f*
            ...
            214 320 m
            249 322 l
            ...
            EMC
        ```
        where,
        - `cm` specify a coordinate system transformation, here (0,0) translates to (90.0240021 590.380005)
        - `q`/`Q` save/restores graphic status
        - `rg` / `g` specify color mode: rgb / grey
        - `re`, `f` or `f*`: fill rectangle path with pre-defined color
        - `m` (move to) and `l` (line to) defines a path

        In this case,
        - a rectangle with:
            - fill color is yellow (1,1,0)
            - lower left corner: (285.17 500.11)
            - width: 193.97
            - height: 13.44
        - a line from (214, 320) to (249, 322)

        Read more:
        - https://github.com/pymupdf/PyMuPDF/issues/263
        - https://github.com/pymupdf/PyMuPDF/issues/225
        - https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdf_reference_archive/pdf_reference_1-7.pdf
    '''
    # Each object in PDF has a cross-reference number (xref):
    # - to get its source contents: `doc.xrefObject()` or low level API `doc._getXrefString()`;
    #   but for stream objects, only the non-stream part is returned
    # - to get the stream data: `doc.xrefStream(xref)` or low level API `doc._getXrefStream(xref)`
    # - the xref for a page object itself: `page.xref`
    # - all stream xref contained in one page: `page.getContents()`
    # - combine all stream object contents together: `page.readContents()` with PyMuPDF>=1.17.0
    #
    # Clean contents first:
    # syntactically correct, standardize and pretty print the contents stream
    page.cleanContents()
    xref_stream = page.readContents().decode(encoding="ISO-8859-1")

    # transformation matrix for coordinate system conversion from pdf to fitz
    matrix = page.transformationMatrix

    # Graphic States: working CS is coincident with the absolute origin (0, 0)
    # Refer to PDF reference v1.7 4.2.3 Transformation Matrices
    #                        | a b 0 |
    # [a, b, c, d, e, f] =>  | c d 0 |
    #                        | e f 1 |
    WCS = fitz.Matrix(0.0)  # identity matrix

    # Graphics color:
    # - color space: PDF Reference Section 4.5 Color Spaces
    # NOTE: it should have to calculate color value under arbitrary color space, but it's really
    # hard work for now. So, consider device color space only like DeviceGray, DeviceRGB,
    # DeviceCMYK, and set black for all others.
    # Stroking and non-stroking operations each have their own current color space, so the
    # "is it a device space" flag is tracked separately for both.
    color_spaces = _check_device_cs(doc, page)
    stroke_device_space = True
    fill_device_space = True

    # - working stroking / filling color
    Wcs = utils.RGB_value((0.0, 0.0, 0.0))
    Wcf = utils.RGB_value((0.0, 0.0, 0.0))

    # working stroke width
    Wd = 0.0

    # graphics state stack for `q` / `Q`: states may be nested, so a single saved slot
    # is not enough -- each `q` pushes one tuple, each `Q` pops the most recent one
    saved_states = []

    # In addition to lines, rectangles are also processed with border path
    paths = []  # a list of path, each path is a list of points

    rects = []

    # Check line by line
    # Cleaned by `page.cleanContents()`, operator and operand are aligned in a same line;
    # otherwise, have to check stream contents word by word (line always changes)
    for line in xref_stream.splitlines():

        words = line.split()
        # blank line: there is no operator to read
        if not words:
            continue

        op = words[-1]  # operator always at the end after page.cleanContents()

        # -----------------------------------------------
        # Color Operators: PDF Reference Table 4.24
        # -----------------------------------------------
        # - set color space:
        #   color_space_name CS/cs   # specify stroking / non-stroking color space
        #   c1 c2 ... SC/SCN         # components under defined color space
        if op.upper() == 'CS':
            # Consider normal device cs only
            is_device = color_spaces.get(words[0], False)

            # only the color belonging to the selected space is reset (to black);
            # the other one must survive, e.g. `1 0 0 RG` followed by `/Cs1 cs`
            if op == 'cs':
                Wcf = utils.RGB_value((0.0, 0.0, 0.0))
                fill_device_space = is_device
            else:
                Wcs = utils.RGB_value((0.0, 0.0, 0.0))
                stroke_device_space = is_device

        # - set color: color components under specified color space
        #   c1 c2 ... cn SC  or  c1 c2 ... cn [name] SCN
        elif op.upper() in ('SC', 'SCN'):
            operands = words[0:-1]

            # SCN may carry a trailing pattern name, which is not a color component
            if op.upper() == 'SCN' and operands and not utils.is_number(operands[-1]):
                operands = operands[0:-1]

            # nonstroking color
            if op in ('sc', 'scn'):
                Wcf = _RGB_from_color_components(operands, fill_device_space)
            # stroking color
            else:
                Wcs = _RGB_from_color_components(operands, stroke_device_space)

        # - DeviceGray space, equal to:
        # /DeviceGray cs
        # c sc
        elif op.upper() == 'G':  # 0 g
            g = float(words[0])
            # nonstroking color, i.e. filling color here
            if op == 'g':
                Wcf = utils.RGB_value((g, g, g))
            # stroking color
            else:
                Wcs = utils.RGB_value((g, g, g))

        # - DeviceRGB space
        elif op.upper() == 'RG':  # 1 1 0 rg
            r, g, b = map(float, words[0:-1])

            # nonstroking color
            if op == 'rg':
                Wcf = utils.RGB_value((r, g, b))
            # stroking color
            else:
                Wcs = utils.RGB_value((r, g, b))

        # - DeviceCMYK space
        elif op.upper() == 'K':  # c m y k K
            c, m, y, k = map(float, words[0:-1])
            # nonstroking color
            if op == 'k':
                Wcf = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)
            # stroking color
            else:
                Wcs = utils.CMYK_to_RGB(c, m, y, k, cmyk_scale=1.0)

        # -----------------------------------------------
        # Graphics State Operators: PDF References Table 4.7
        # -----------------------------------------------
        # CS transformation: a b c d e f cm, e.g.
        # 0.05 0 0 -0.05 0 792 cm
        # refer to PDF Reference 4.2.2 Common Transformations for detail
        elif op == 'cm':
            # update working CS
            components = list(map(float, words[0:-1]))
            Mt = fitz.Matrix(*components)
            WCS = Mt * WCS  # M' = Mt x M

        # stroke width
        elif op == 'w':  # 0.5 w
            Wd = float(words[0])

        # save or restore graphics state:
        # only consider transformation, color and line width here
        elif op == 'q':  # save
            saved_states.append((
                fitz.Matrix(WCS),  # copy as new matrix
                Wcf, Wcs, Wd,
                fill_device_space, stroke_device_space,
            ))

        elif op == 'Q':  # restore
            # an unbalanced `Q` (nothing saved) is ignored: keep the working state
            if saved_states:
                (WCS, Wcf, Wcs, Wd,
                 fill_device_space, stroke_device_space) = saved_states.pop()

        # -----------------------------------------------
        # Path Construction Operators: PDF References Table 4.9
        # -----------------------------------------------
        # rectangle block:
        # x y w h re is equivalent to
        # x   y   m
        # x+w y   l
        # x+w y+h l
        # x   y+h l
        # h          # close the path
        elif op == 're':
            # ATTENTION:
            # top/bottom, left/right is relative to the positive direction of CS,
            # while a reverse direction may be performed, so be careful when calculating
            # the corner points.
            # Coordinates in the transformed PDF CS:
            #   y1 +----------+
            #      |          | h
            #   y0 +----w-----+
            #      x0        x1
            #
            x0, y0, w, h = map(float, words[0:-1])
            paths.append([
                (x0, y0),
                (x0 + w, y0),
                (x0 + w, y0 + h),
                (x0, y0 + h),
                (x0, y0),
            ])

        # path: m -> move to point to start a path
        elif op == 'm':  # x y m
            x0, y0 = map(float, words[0:-1])
            paths.append([(x0, y0)])

        # path: l -> straight line to point
        elif op == 'l':  # x y l
            # no current point defined (malformed stream): nothing to connect to
            if not paths:
                continue
            x0, y0 = map(float, words[0:-1])
            paths[-1].append((x0, y0))

        # close the current subpath only; earlier subpaths stay as they are
        elif op == 'h':
            if paths:
                _close_path(paths[-1])

        # -----------------------------------------------
        # Path-painting Operators: PDF Reference Table 4.10
        # -----------------------------------------------
        # stroke the path; `s` is equivalent to `h S`, i.e. close the current subpath first
        elif op.upper() == 'S':
            if op == 's' and paths:
                _close_path(paths[-1])

            # stroke path
            for path in paths:
                rects_ = _stroke_path(path, WCS, Wcs, Wd, matrix)
                rects.extend(rects_)

            # reset path
            paths = []

        # fill the path
        # NOTE: compare the parsed operator rather than the raw line, so surrounding
        # whitespace can not hide the operator
        elif op in ('f', 'F', 'f*'):
            for path in paths:
                # close the path implicitly
                _close_path(path)

                # fill path
                rect = _fill_rect_path(path, WCS, Wcf, matrix)
                if rect:
                    rects.append(rect)

            # reset path
            paths = []

        # close, fill and stroke the path
        # NOTE(review): every subpath is closed before stroking, also for `B` / `B*`
        # which by specification do not close open subpaths -- kept from the original.
        elif op.upper() in ('B', 'B*'):
            for path in paths:
                # close path
                _close_path(path)

                # fill path
                rect = _fill_rect_path(path, WCS, Wcf, matrix)
                if rect:
                    rects.append(rect)

                # stroke path
                rects_ = _stroke_path(path, WCS, Wcs, Wd, matrix)
                rects.extend(rects_)

            # reset path
            paths = []

        # TODO: clip the path
        # The current path is left untouched: it is terminated by the painting
        # operator that follows (`n`, `f`, `S`, ...).
        elif op in ('W', 'W*'):
            pass

        # end the path without stroking or filling
        elif op == 'n':
            paths = []

    return rects