Пример #1
0
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two-up and save the result as a new PDF.

    Pages are paired from both ends of ``pages``: for even ``i`` the pair is
    ``(pages[-i - 1], pages[i])``, for odd ``i`` it is
    ``(pages[i], pages[-i - 1])``.  The first page of each pair is drawn on
    the left and the second one immediately to its right.  Each output page
    is twice as wide as the wider page of the pair and as tall as the taller
    one.

    Args:
        pdfqueue: sequence whose items expose ``copyname`` (path of a source
            PDF); it is indexed with ``page.nfile - 1``.
        tmp_dir: directory passed to ``make_tmp_file`` for the output file.
        pages: page descriptors exposing ``nfile`` and ``npage`` (both
            1-based) and ``size_in_points()``.

    Returns:
        The file name the booklet was saved to.
    """
    import re
    from contextlib import ExitStack

    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})

    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    # The version is compared numerically: comparing the raw strings would
    # misorder e.g. '10.0.0' and '2.7.0'.  An unparsable version string falls
    # back to applying the workaround.
    version = re.match(r'(\d+)\.(\d+)', pikepdf.__version__)
    needs_indirect = (
        version is None or tuple(map(int, version.groups())) < (2, 7))

    # The source PDFs have to stay open until the output is saved; the
    # ExitStack then closes all of them, also when an error occurs.
    with ExitStack() as stack:
        source_files = {
            n: stack.enter_context(pikepdf.open(pdfqueue[n - 1].copyname))
            for n in {p.nfile for p in pages}
        }
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = _apply_geom_transform(
                file, file.copy_foreign(first_original), first)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = _apply_geom_transform(
                file, file.copy_foreign(second_original), second)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()
            # See PDF reference section 4.2.3 Transformation Matrices
            # Shift each page so that the lower-left corner of its MediaBox
            # lands at the origin (first) or right of the first page (second).
            tx1 = -first_foreign.MediaBox[0]
            ty1 = -first_foreign.MediaBox[1]
            tx2 = first_page_size[0] - float(second_foreign.MediaBox[0])
            ty2 = -second_foreign.MediaBox[1]
            content_txt = (f"q 1 0 0 1 {tx1} {ty1} cm /Page{i*2} Do Q "
                           f"q 1 0 0 1 {tx2} {ty2} cm /Page{i*2 + 1} Do Q ")

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    return filename
Пример #2
0
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two-up and save the result as a new PDF.

    Pages are paired from both ends of ``pages``: for even ``i`` the pair is
    ``(pages[-i - 1], pages[i])``, for odd ``i`` it is
    ``(pages[i], pages[-i - 1])``.  The first page of each pair is drawn at
    the origin and the second one shifted right by the first page's width.
    Each output page is twice as wide as the wider page of the pair and as
    tall as the taller one.

    Args:
        pdfqueue: sequence whose items expose ``copyname`` (path of a source
            PDF); it is indexed with ``page.nfile - 1``.
        tmp_dir: directory passed to ``make_tmp_file`` for the output file.
        pages: page descriptors exposing ``nfile`` and ``npage`` (both
            1-based) and ``size_in_points()``.

    Returns:
        The file name the booklet was saved to.
    """
    import re
    from contextlib import ExitStack

    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})

    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    # The version is compared numerically: comparing the raw strings would
    # misorder e.g. '10.0.0' and '2.7.0'.  An unparsable version string falls
    # back to applying the workaround.
    version = re.match(r'(\d+)\.(\d+)', pikepdf.__version__)
    needs_indirect = (
        version is None or tuple(map(int, version.groups())) < (2, 7))

    # The source PDFs have to stay open until the output is saved; the
    # ExitStack then closes all of them, also when an error occurs.
    with ExitStack() as stack:
        source_files = {
            n: stack.enter_context(pikepdf.open(pdfqueue[n - 1].copyname))
            for n in {p.nfile for p in pages}
        }
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = file.copy_foreign(first_original)
            _update_angle(first, first_original, first_foreign)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = file.copy_foreign(second_original)
            _update_angle(second, second_original, second_foreign)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()

            content_txt = (
                f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    return filename
Пример #3
0
def _scale(doc, page, factor):
    """Return ``page`` scaled by ``factor``.

    A factor of 1 returns ``page`` untouched.  Otherwise ``page`` is made
    indirect in ``doc``, turned into a form XObject and drawn, scaled, by a
    newly built page dictionary whose MediaBox is scaled accordingly.
    """
    if factor == 1:
        return page
    # The rotation is carried by the resulting page, so it is moved off the
    # input page before that page is converted to a form XObject.
    has_rotation = "/Rotate" in page
    rotation = page.Rotate if has_rotation else 0
    if has_rotation:
        page.Rotate = 0
    page = doc.make_indirect(page)
    xobject_name = f"/p{len(doc.pages)}"
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_ops = f"q {factor} 0 0 {factor} 0 0 cm {xobject_name} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    return pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_ops.encode()),
        Resources={'/XObject': {xobject_name: form}},
        Rotate=rotation,
    )
Пример #4
0
def test_draw_page(pal, monkeypatch):
    """Exercise the image/png mimebundle path with the renderer stubbed out.

    ``pikepdf._methods.run`` is replaced in both halves of the test, so the
    outcome does not depend on whether mudraw is installed.
    """

    def fail_with_missing_program(prog_args, *args, **kwargs):
        raise FileNotFoundError(prog_args[0])

    def succeed_with_tiny_png(prog_args, *args, **kwargs):
        # Hand back a valid 1x1 PNG as the fake renderer's stdout.
        buffer = BytesIO()
        Image.new('1', (1, 1)).save(buffer, format='PNG')
        return subprocess.CompletedProcess(
            prog_args, 0, stdout=buffer.getvalue(), stderr=b'')

    first_page = pikepdf.Page(pal.pages[0])

    monkeypatch.setattr(pikepdf._methods, 'run', fail_with_missing_program)
    failed_bundle = first_page._repr_mimebundle_(
        include=['image/png'], exclude=['application/pdf'])
    assert (
        'image/png' not in failed_bundle
    ), "Generated image/png when mudraw() was rigged to fail"

    monkeypatch.setattr(pikepdf._methods, 'run', succeed_with_tiny_png)
    rendered_bundle = first_page._repr_mimebundle_(
        include=['image/png'], exclude=['application/pdf'])
    assert (
        'image/png' in rendered_bundle
    ), "Did not generate image/png when mudraw() was rigged to succeed"
Пример #5
0
def _scale(doc, page, factor):
    """Return ``page`` scaled by ``factor``.

    A factor of 1 returns ``page`` untouched.  Otherwise ``page`` is turned
    into a form XObject and drawn, scaled, by a newly built page dictionary
    that is made indirect in ``doc`` before being returned.
    """
    if factor == 1:
        return page
    # The rotation is carried by the resulting page, so it is moved off the
    # input page before that page is converted to a form XObject.
    has_rotation = "/Rotate" in page
    rotation = page.Rotate if has_rotation else 0
    if has_rotation:
        page.Rotate = 0
    xobject_name = f"/p{len(doc.pages)}"
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_ops = f"q {factor} 0 0 {factor} 0 0 cm {xobject_name} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    scaled_page = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_ops.encode()),
        Resources={'/XObject': {xobject_name: form}},
        Rotate=rotation,
    )
    # Making the page indirect was needed for pikepdf <= 2.6.0, see
    # https://github.com/pikepdf/pikepdf/issues/174
    # It is needed with pikepdf 4.2 as well, otherwise as_form_xobject in
    # generate_booklet fails with:
    # RuntimeError: QPDFPageObjectHelper::getFormXObjectForPage called with a direct object
    return doc.make_indirect(scaled_page)
Пример #6
0
def test_filter_names(pal):
    """A name-collecting filter sees '/Im0', emits nothing, and the page's
    own contents stay non-empty."""
    first_page = pikepdf.Page(pal.pages[0])
    name_collector = FilterCollectNames()
    filtered = first_page.get_filtered_contents(name_collector)
    assert filtered == b''
    assert name_collector.names == ['/Im0']
    remaining = first_page.obj.Contents.read_bytes()
    assert remaining != b''
Пример #7
0
def test_image_scale0(resources, outpdf):
    """Draw a form XObject with an all-zero matrix and check the reported DPI.

    The first page of cmyk.pdf is placed on a blank 72x72 page through the
    content stream ``q 0 0 0 0 0 0 cm <name> Do Q``; PdfInfo must then report
    a non-finite DPI for the image and a page DPI of (0, 0).
    """
    with pikepdf.open(resources / 'cmyk.pdf') as source:
        form = pikepdf.Page(source.pages[0]).as_form_xobject()

        target = pikepdf.Pdf.new()
        target.add_blank_page(page_size=(72, 72))
        local_form = target.copy_foreign(form)
        resource_name = pikepdf.Page(target.pages[0]).add_resource(
            local_form, pikepdf.Name.XObject, pikepdf.Name.Im0)
        print(resource_name)
        draw_ops = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(resource_name)
        target.pages[0].Contents = pikepdf.Stream(target, draw_ops)
        target.save(outpdf)

    info = pdfinfo.PdfInfo(
        outpdf, detailed_analysis=True, progbar=False, max_workers=1)
    first_page_info = info.pages[0]
    assert not first_page_info._images[0].dpi.is_finite
    assert first_page_info.dpi == Resolution(0, 0)
Пример #8
0
def mergePage(layerPage, mainPage, pdf, name) -> None:
    """Add ``layerPage`` to ``mainPage`` as a form XObject drawn first.

    ``layerPage`` is converted to a form XObject, copied into ``pdf`` and
    registered under ``name`` in ``mainPage``'s /XObject resources (the
    sub-dictionary is created when missing or empty).  A content stream that
    draws the XObject is prepended to ``mainPage``'s contents, and
    ``mainPage`` takes over ``layerPage``'s MediaBox.

    Args:
        layerPage: page to be merged in.
        mainPage: page that receives the layer; modified in place.  It must
            already have a /Resources dictionary.
        pdf: the ``pikepdf.Pdf`` that owns ``mainPage``.
        name: resource name for the layer as a ``str``; it is written
            verbatim into the content stream, so presumably it includes the
            leading '/' -- verify against the caller.
    """
    layer_xobject = pdf.copy_foreign(
        pikepdf.Page(layerPage).as_form_xobject())
    draw_layer = b'q\n %s Do\nQ\n' % (name.encode())
    if not mainPage.Resources.get("/XObject"):
        mainPage.Resources["/XObject"] = pikepdf.Dictionary({})
    mainPage.Resources["/XObject"][name] = layer_xobject
    # Use the MediaBox from the merged page
    mainPage.MediaBox = layerPage.MediaBox
    layer_stream = pikepdf.Stream(pdf, draw_layer)
    if hasattr(pikepdf.Page, 'contents_add'):
        # pikepdf >= 2.14 adds this method and deprecates the one below
        pikepdf.Page(mainPage).contents_add(layer_stream, prepend=True)
    else:
        # pikepdf < 2.14
        mainPage.page_contents_add(contents=layer_stream, prepend=True)
Пример #9
0
def strip_invisible_text(pdf, page):
    """Replace ``page.Contents`` with a copy lacking render-mode-3 text.

    The page's content streams are coalesced and parsed.  Every BT ... ET
    text object whose last ``Tr`` operand equals 3 is dropped; all other
    instructions are re-serialised, one per line, into a new stream owned by
    ``pdf``.
    """
    op_begin_text = pikepdf.Operator('BT')
    op_end_text = pikepdf.Operator('ET')
    op_render_mode = pikepdf.Operator('Tr')
    op_inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending_text = []
    inside_text = False
    text_mode = 0

    helper = pikepdf.Page(page)
    helper.contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == op_render_mode:
                text_mode = operands[0]
            pending_text.append(instruction)
            if operator == op_end_text:
                # The text object is complete: keep it unless its render
                # mode ended up as 3.
                inside_text = False
                if text_mode != 3:
                    kept.extend(pending_text)
                pending_text.clear()
        elif operator == op_begin_text:
            inside_text = True
            text_mode = 0
            pending_text.append(instruction)
        else:
            kept.append(instruction)

    def as_bytes(operand):
        # Operands without an unparse() method are serialised through str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def serialise(operands, operator):
        if operator == op_inline_image:
            return operands[0].unparse()
        joined = b' '.join(as_bytes(operand) for operand in operands)
        return joined + b' ' + operator.unparse()

    new_contents = b'\n'.join(
        serialise(operands, operator) for operands, operator in kept)
    page.Contents = pikepdf.Stream(pdf, new_contents)
Пример #10
0
def _scale(doc, page, factor):
    """Return ``page`` scaled by ``factor``.

    A factor of 1 returns ``page`` untouched.  Otherwise ``page`` is made
    indirect in ``doc``, turned into a form XObject and drawn, scaled, by a
    newly built page dictionary whose MediaBox is scaled accordingly.
    """
    if factor == 1:
        return page
    page = doc.make_indirect(page)
    xobject_name = f"/p{len(doc.pages)}"
    scaled_box = [factor * float(coord) for coord in page.MediaBox]
    draw_ops = f"q {factor} 0 0 {factor} 0 0 cm {xobject_name} Do Q"
    form = pikepdf.Page(page).as_form_xobject()
    return pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=scaled_box,
        Contents=doc.make_stream(draw_ops.encode()),
        Resources={'/XObject': {xobject_name: form}},
    )
Пример #11
0
def main() -> None:
    """Strip all but the largest XObject(s) from every page of each input PDF.

    For every file named on the command line, each page's /Resources
    /XObject dictionary is reduced to the entries whose ``Length`` equals the
    maximum on that page.  The result is saved next to the input with the
    last four characters of the name replaced by ``XObjectsRemoved.pdf``.

    Raises:
        Whatever pikepdf raises when a page has no /Resources /XObject entry
        or when a file cannot be opened or saved.
    """
    # Initialize parser
    parser = init_argparse()
    args = parser.parse_args()

    for file in args.files:
        # The context manager closes the PDF even when processing fails.
        with pikepdf.Pdf.open(file) as current_file:
            for page in current_file.pages:
                pagehelper = pikepdf.Page(page)

                # Navigate to Resources/XObject (error if not found)
                cf_xobjects = pagehelper.resources.XObject

                # Read every length once, from a snapshot of the keys, so
                # the deletions below never touch a mapping that is being
                # iterated.  Another possible criterion would be whatever is
                # /DCTDecode.
                lengths = {
                    key: cf_xobjects.get(key).Length
                    for key in list(cf_xobjects.keys())
                }
                maxlength = max(lengths.values(), default=-1)

                # Delete through the dictionary itself; deleting a looked-up
                # object would only drop the local reference.
                for key, length in lengths.items():
                    if length != maxlength:
                        del cf_xobjects[key]

            # Remove unreferenced resources
            # Probably unnecessary as we're doing this backward
            current_file.remove_unreferenced_resources()

            # Save the pdf with compression
            save_filename = file[:-4] + "XObjectsRemoved.pdf"
            current_file.save(
                filename_or_stream=save_filename,
                object_stream_mode=pikepdf.ObjectStreamMode.generate,
                compress_streams=True,
                recompress_flate=True,
                encryption=False)
Пример #12
0
def test_display_rich_page(pal):
    """The page mimebundle offers application/pdf when it is requested."""
    first_page = pikepdf.Page(pal.pages[0])
    wanted = ['application/pdf']
    unwanted = ['application/malware']
    bundle = first_page._repr_mimebundle_(include=wanted, exclude=unwanted)
    assert 'application/pdf' in bundle
Пример #13
0
def test_invalid_handle_token(pal):
    """Filtering with FilterInvalid raises TypeError or PdfError."""
    first_page = pikepdf.Page(pal.pages[0])
    expected_errors = (TypeError, pikepdf.PdfError)
    with pytest.raises(expected_errors):
        first_page.get_filtered_contents(FilterInvalid())
Пример #14
0
def test_tokenfilter_is_abstract(pal):
    """If filtering with the bare TokenFilter raises PdfError, the first
    warning recorded on the Pdf must mention the pure virtual call."""
    first_page = pikepdf.Page(pal.pages[0])
    try:
        first_page.get_filtered_contents(pikepdf.TokenFilter())
    except pikepdf.PdfError:
        first_warning = pal.get_warnings()[0]
        assert 'Tried to call pure virtual' in first_warning
Пример #15
0
def graph_page(graph):
    """Wrap the first page of ``graph`` in a ``pikepdf.Page``."""
    first_page = graph.pages[0]
    return pikepdf.Page(first_page)
Пример #16
0
    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: pikepdf.Object,
        font_key: pikepdf.Object,
        procset: pikepdf.Object,
        text_rotation: int,
        strip_old_text: bool,
    ) -> None:
        """Insert the text layer from text page 0 on to pdf_base at page_num

        The content stream of page 0 of ``textpdf`` is wrapped in a form
        XObject, registered in the base page's /XObject resources under a
        random UUID name, and drawn by a new content stream that is prepended
        to the base page's contents.

        Args:
            page_num: page of ``self.pdf_base`` to modify, as accepted by
                ``pages.p()``.
            textpdf: path of the PDF holding the text layer; an empty
                (zero-byte) file makes this method return without changes.
            font: passed to ``_update_resources`` for both the new XObject
                and the base page.
            font_key: passed to ``_update_resources`` alongside ``font``.
            procset: passed to ``_update_resources`` for the base page only;
                the XObject always gets ``[Name.PDF]``.
            text_rotation: clockwise angle in degrees by which the text layer
                is rotated to match the page content.
            strip_old_text: when true, ``strip_invisible_text`` is run on the
                base page before the new layer is added.
        """

        log.debug("Grafting")
        if Path(textpdf).stat().st_size == 0:
            return

        with pikepdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            # The page of the base file that receives the text layer
            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions OCRmyPDF rotated the content layer to match the text.)
            # wt/ht: width and height of the text page
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # wp/hp: width and height of the base page. From here on
            # ``mediabox`` refers to the base page's MediaBox.
            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
            corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = pikepdf.PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            # For a quarter turn the text page's width and height trade places
            # before the scale factors are computed.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            # Wrap the text content stream in a form XObject and register it
            # on the base page under a unique (UUID-based) resource name.
            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name('/' + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            # The bounding box is the base page's MediaBox (see above).
            xobj.BBox = mediabox
            _update_resources(obj=xobj,
                              font=font,
                              font_key=font_key,
                              procset=[Name.PDF])

            # Content stream that applies the transform and draws the XObject
            pdf_draw_xobj = ((b'q %s cm\n' % ctm.encode()) +
                             (b'%s Do\n' % text_xobj_name) + b'\nQ\n')
            new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            if hasattr(pikepdf.Page, 'contents_add'):
                # pikepdf >= 2.14 adds this method and deprecates the one below
                pikepdf.Page(base_page).contents_add(new_text_layer,
                                                     prepend=True)
            else:
                # pikepdf < 2.14
                base_page.page_contents_add(new_text_layer,
                                            prepend=True)  # pragma: no cover

            _update_resources(obj=base_page,
                              font=font,
                              font_key=font_key,
                              procset=procset)
Пример #17
0
    def run(self, rows=0, cols=0, actually_trim=0):
        """Tile the pages in ``self.page_range`` onto one new page.

        Each selected page of ``self.in_doc`` is copied into a new document
        as a form XObject and placed on a ``rows`` x ``cols`` grid.  Trim,
        margin, rotation and ordering come from the instance attributes
        (``trim``, ``margin``, ``rotation``, ``col_major``, ``right_to_left``,
        ``bottom_to_top``).  Page numbers that are not positive leave their
        tile blank.

        Args:
            rows: number of grid rows; 0 means derive it.
            cols: number of grid columns; 0 means derive it.  When both are
                0 a square grid is attempted; when both are given, columns
                take priority and rows are recomputed.
            actually_trim: when 1, the TrimBox of each copied page is shrunk
                by the trim amounts (created from the MediaBox if missing).

        Returns:
            The new ``pikepdf.Pdf`` containing the tiled page, or ``None``
            when no input document is loaded.
        """
        if self.in_doc is None:
            print(_('Input document not loaded'))
            return

        if len(self.page_range) == 0:
            self.set_page_range()

        # initialize a new document and copy over the layer info (OCGs) if it exists
        new_doc = pikepdf.Pdf.new()

        if '/OCProperties' in self.in_doc.Root.keys():
            localRoot = new_doc.copy_foreign(self.in_doc.Root)
            new_doc.Root.OCProperties = localRoot.OCProperties

        content_dict = pikepdf.Dictionary({})
        page_names = []
        pw = None
        ph = None
        page_size_ref = 0

        page_count = len(self.in_doc.pages)
        trim = [self.units_to_px(t) for t in self.trim]

        for p in self.page_range:
            if p > page_count:
                print(
                    _('Only {} pages in document, skipping {}').format(
                        page_count, p))
                continue

            if p > 0:
                pagekey = f'/Page{p}'

                if pagekey not in content_dict.keys():
                    # copy the page over as an xobject
                    # pikepdf.pages is zero indexed, so subtract one
                    localpage = new_doc.copy_foreign(self.in_doc.pages[p - 1])

                    # set the trim box to cut off content if requested
                    if actually_trim == 1:
                        if '/TrimBox' not in localpage.keys():
                            localpage.TrimBox = copy.copy(localpage.MediaBox)

                        localpage.TrimBox[0] = float(
                            localpage.TrimBox[0]) + trim[0]
                        localpage.TrimBox[1] = float(
                            localpage.TrimBox[1]) + trim[3]
                        localpage.TrimBox[2] = float(
                            localpage.TrimBox[2]) - trim[1]
                        localpage.TrimBox[3] = float(
                            localpage.TrimBox[3]) - trim[2]

                    content_dict[pagekey] = pikepdf.Page(
                        localpage).as_form_xobject()

                    # only get the width/height for the first page
                    if pw is None:
                        pw = float(localpage.MediaBox[2])
                        ph = float(localpage.MediaBox[3])
                        page_size_ref = p
                    elif abs(pw - float(localpage.MediaBox[2])) > 1 or abs(
                            ph - float(localpage.MediaBox[3])) > 1:
                        # Format after the gettext lookup, so that the
                        # message template itself is what gets translated.
                        print(
                            _('Warning: page {} is a different size from {}, output may be unpredictable'
                              ).format(p, page_size_ref))

                page_names.append(pagekey)
            else:
                page_names.append(None)

        # NOTE: the tile size is taken from the first page copied above

        # create a new document with a page big enough to contain all the tiled pages, plus requested margin
        # figure out how big it needs to be based on requested columns/rows
        n_tiles = len(page_names)
        if cols == 0 and rows == 0:
            # try for square
            cols = math.ceil(math.sqrt(n_tiles))
            rows = cols

        # columns take priority if both are specified
        if cols > 0:
            rrows = rows
            rows = math.ceil(n_tiles / cols)
            if rrows != rows and rrows != 0:
                print(
                    _('Warning: requested {} columns and {} rows, but {} rows are needed with {} pages'
                      ).format(cols, rrows, rows, n_tiles))
        else:
            cols = math.ceil(n_tiles / rows)

        # convert the margin and trim options into pixels
        unitstr = 'cm' if self.units else 'in'
        margin = self.units_to_px(self.margin)

        rotstr = _('None')

        if self.rotation == 1:
            rotstr = _('Clockwise')

        if self.rotation == 2:
            rotstr = _('Counterclockwise')

        orderstr = _('Rows then columns')
        if self.col_major:
            orderstr = _('Columns then rows')

        lrstr = _('Left to right')
        if self.right_to_left:
            lrstr = _('Right to left')

        btstr = _('Top to bottom')
        if self.bottom_to_top:
            btstr = _('Bottom to top')

        print(_('Tiling with {} rows and {} columns').format(rows, cols))
        print(_('Options') + ':')
        print('    ' + _('Margins') + ': {} {}'.format(self.margin, unitstr))
        print('    ' + _('Trim') + ': {} {}'.format(self.trim, unitstr))
        print('    ' + _('Rotation') + ': {}'.format(rotstr))
        print('    ' + _('Page order') +
              ': {}, {}, {}'.format(orderstr, lrstr, btstr))

        # define the media box with the final grid + margins
        # run through the width/height combos to find the maximum required
        # R is the rotation matrix (default to identity)
        R = [1, 0, 0, 1]

        # We need to account for the shift in origin if page rotation is applied
        o_shift = [0, 0]

        if self.rotation != 0:
            # define the rotation transform and
            # swap the trim order
            if self.rotation == 1:
                R = [0, -1, 1, 0]
                o_shift = [0, pw]
                order = [3, 2, 0, 1]

            if self.rotation == 2:
                R = [0, 1, -1, 0]
                o_shift = [ph, 0]
                order = [2, 3, 1, 0]

            # swap width and height of pages
            tmp = ph
            ph = pw
            pw = tmp

            trim = [trim[o] for o in order]

        # define the output page media box
        width = (pw - trim[0] - trim[1]) * cols
        height = (ph - trim[2] - trim[3]) * rows
        media_box = [0, 0, width + 2 * margin, height + 2 * margin]

        content_txt = ''

        for i in range(n_tiles):
            if not page_names[i]:
                continue

            if self.col_major:
                c = math.floor(i / rows)
                r = i % rows
            else:
                r = math.floor(i / cols)
                c = i % cols

            if self.right_to_left:
                c = cols - c - 1

            if not self.bottom_to_top:
                r = rows - r - 1

            x0 = margin - trim[0] + c * (pw - trim[0] - trim[1])
            y0 = margin - trim[3] + r * (ph - trim[2] - trim[3])

            # don't scale, just shift and rotate
            # first shift to origin, then rotate, then shift to final destination
            content_txt += f'q {R[0]} {R[1]} {R[2]} {R[3]} {x0+o_shift[0]} {y0+o_shift[1]} cm '
            content_txt += f'{page_names[i]} Do Q '

        newpage = pikepdf.Dictionary(
            Type=pikepdf.Name.Page,
            MediaBox=media_box,
            Resources=pikepdf.Dictionary(XObject=content_dict),
            Contents=pikepdf.Stream(new_doc, content_txt.encode()))

        new_doc.pages.append(newpage)
        return new_doc
Пример #18
0
def test_filter_thru(pal, filter, expected):
    """After attaching an instance of ``filter``, the page contents read
    back as ``expected``."""
    first_page = pikepdf.Page(pal.pages[0])
    token_filter = filter()
    first_page.add_content_token_filter(token_filter)
    filtered_contents = first_page.obj.Contents.read_bytes()
    assert filtered_contents == expected
Пример #19
0
def test_tokenfilter_is_abstract(pal):
    """Filtering with the bare TokenFilter raises RuntimeError or PdfError."""
    first_page = pikepdf.Page(pal.pages[0])
    expected_errors = (RuntimeError, pikepdf.PdfError)
    with pytest.raises(expected_errors):
        first_page.get_filtered_contents(pikepdf.TokenFilter())
Пример #20
0
def test_invalid_tokenfilter(pal):
    """Passing a plain list instead of a token filter raises TypeError."""
    first_page = pikepdf.Page(pal.pages[0])
    not_a_filter = []
    with pytest.raises(TypeError):
        first_page.get_filtered_contents(not_a_filter)