Exemplo n.º 1
0
def set_layer_visibility(pdf, layers_to_show):
    """Make only the named optional-content layers visible in ``pdf``.

    The optional content groups (OCGs) are read from
    ``pdf.Root.OCProperties.OCGs`` and ``OCProperties`` is replaced by a new
    dictionary whose default configuration has ``BaseState`` ``/OFF`` and
    lists in ``ON`` only the groups whose ``Name`` is in ``layers_to_show``.

    Args:
        pdf: Opened PDF whose ``Root.OCProperties.OCGs`` lists the layers.
        layers_to_show: Container of layer names that stay visible.

    If the PDF has no ``OCProperties.OCGs`` entry an error is logged and the
    process exits with status 1.
    """
    try:
        ocgs = pdf.Root.OCProperties.OCGs
    except (AttributeError, KeyError):
        logger.error("Unable to locate layers in PDF.")
        sys.exit(1)

    ocgs_on = []
    for ocg in ocgs:
        if ocg.Name in layers_to_show:
            logger.info("Layer %s will be visible.", ocg.Name)
            ocgs_on.append(ocg)
        else:
            logger.info("Layer %s will be hidden.", ocg.Name)

    ocgs_config = pikepdf.Dictionary(
        BaseState=pikepdf.Name('/OFF'),
        ON=ocgs_on,
        Order=ocgs,
    )

    pdf.Root.OCProperties = pikepdf.Dictionary(
        D=ocgs_config,
        OCGs=ocgs,
    )

    # Needed for google-chrome (at least):
    for ocg in ocgs:
        # /Usage is an optional entry of a group; without this guard a layer
        # that lacks it would raise on the attribute lookup below.
        if '/Usage' not in ocg:
            continue
        if '/View' in ocg.Usage:
            del ocg.Usage.View
        if '/Print' in ocg.Usage:
            del ocg.Usage.Print
Exemplo n.º 2
0
def attach_notebook(pdf_in, pdf_out, notebook):
    """Embed a notebook as a file attachment and save the result to ``pdf_out``.

    Args:
        pdf_in: PDF opened as the base document.
        pdf_out: Destination handed to ``save``.
        notebook: Mapping with ``"contents"`` (data of the embedded stream)
            and ``"file_name"`` (name recorded in the file specification).
    """
    N = pikepdf.Name

    # The context manager closes the input PDF even when building the
    # attachment or saving fails.
    with pikepdf.open(pdf_in) as main_pdf:
        the_file = pikepdf.Stream(main_pdf, notebook["contents"])
        the_file[N("/Type")] = N("/EmbeddedFile")

        file_wrapper = pikepdf.Dictionary(F=the_file)

        fname = notebook["file_name"]
        embedded_file = pikepdf.Dictionary(Type=N("/Filespec"),
                                           UF=fname,
                                           F=fname,
                                           EF=file_wrapper)

        name_tree = pikepdf.Array([pikepdf.String(fname), embedded_file])

        embedded_files = pikepdf.Dictionary(Names=name_tree)

        # Only the EmbeddedFiles entry is replaced, so other name trees that
        # the document already stores under /Names are kept.
        if "/Names" in main_pdf.Root:
            main_pdf.Root.Names.EmbeddedFiles = embedded_files
        else:
            main_pdf.Root[N("/Names")] = pikepdf.Dictionary(
                EmbeddedFiles=embedded_files)

        main_pdf.save(pdf_out)
Exemplo n.º 3
0
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Build a booklet PDF that places two source pages side by side per sheet.

    For each index ``i`` below ``len(pages) // 2``, page ``i`` is paired with
    page ``-i - 1``; which of the two goes on the left alternates with the
    parity of ``i``. Each output page is twice as wide as the wider page of
    the pair and as tall as the taller one.

    Args:
        pdfqueue: Sequence indexed by ``nfile - 1`` whose items expose the
            ``copyname`` of the file to open.
        tmp_dir: Directory handed to ``make_tmp_file`` for the output file.
        pages: Page descriptors providing ``nfile``, ``npage`` (1-based) and
            ``size_in_points()``.

    Returns:
        The file name the booklet was saved to.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}

    # Compare the version numerically: as strings, '10.0.0' < '2.7.0' is true.
    version = tuple(
        int(part) for part in pikepdf.__version__.split('.')[:2]
        if part.isdigit())
    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    needs_indirect = version < (2, 7)

    source_files = {}
    try:
        for n in file_indexes:
            source_files[n] = pikepdf.open(pdfqueue[n - 1].copyname)

        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = _apply_geom_transform(
                file, file.copy_foreign(first_original), first)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = _apply_geom_transform(
                file, file.copy_foreign(second_original), second)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()
            # See PDF reference section 4.2.3 Transformation Matrices
            # Translate each form so its MediaBox origin lands at the left
            # edge (first page) or right after the first page (second page).
            tx1 = -first_foreign.MediaBox[0]
            ty1 = -first_foreign.MediaBox[1]
            tx2 = first_page_size[0] - float(second_foreign.MediaBox[0])
            ty2 = -second_foreign.MediaBox[1]
            content_txt = (f"q 1 0 0 1 {tx1} {ty1} cm /Page{i*2} Do Q "
                           f"q 1 0 0 1 {tx2} {ty2} cm /Page{i*2 + 1} Do Q ")

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    finally:
        # The copied pages may still read data from their source documents,
        # so the sources are only closed once saving is over.
        for source in source_files.values():
            source.close()
    return filename
Exemplo n.º 4
0
def generate_booklet(pdfqueue, tmp_dir, pages):
    """Build a booklet PDF that places two source pages side by side per sheet.

    For each index ``i`` below ``len(pages) // 2``, page ``i`` is paired with
    page ``-i - 1``; which of the two goes on the left alternates with the
    parity of ``i``. Each output page is twice as wide as the wider page of
    the pair and as tall as the taller one.

    Args:
        pdfqueue: Sequence indexed by ``nfile - 1`` whose items expose the
            ``copyname`` of the file to open.
        tmp_dir: Directory handed to ``make_tmp_file`` for the output file.
        pages: Page descriptors providing ``nfile``, ``npage`` (1-based) and
            ``size_in_points()``.

    Returns:
        The file name the booklet was saved to.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}

    # Compare the version numerically: as strings, '10.0.0' < '2.7.0' is true.
    version = tuple(
        int(part) for part in pikepdf.__version__.split('.')[:2]
        if part.isdigit())
    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    needs_indirect = version < (2, 7)

    source_files = {}
    try:
        for n in file_indexes:
            source_files[n] = pikepdf.open(pdfqueue[n - 1].copyname)

        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = file.copy_foreign(first_original)
            _update_angle(first, first_original, first_foreign)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = file.copy_foreign(second_original)
            _update_angle(second, second_original, second_foreign)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()

            # The first form is drawn at the origin, the second one shifted
            # right by the width of the first page.
            content_txt = (
                f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    finally:
        # The copied pages may still read data from their source documents,
        # so the sources are only closed once saving is over.
        for source in source_files.values():
            source.close()
    return filename
Exemplo n.º 5
0
def _scale(doc, page, factor):
    """ Scale a page """
    if factor == 1:
        return page
    rotate = 0
    if "/Rotate" in page:
        # We'll set the rotate attribute on the resulting page so we must
        # unset it on the input page before
        rotate = page.Rotate
        page.Rotate = 0
    page = doc.make_indirect(page)
    page_id = len(doc.pages)
    newmediabox = [factor * float(x) for x in page.MediaBox]
    content = "q {} 0 0 {} 0 0 cm /p{} Do Q".format(factor, factor, page_id)
    xobject = pikepdf.Page(page).as_form_xobject()
    new_page = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=newmediabox,
        Contents=doc.make_stream(content.encode()),
        Resources={'/XObject': {
            '/p{}'.format(page_id): xobject
        }},
        Rotate=rotate,
    )
    return new_page
Exemplo n.º 6
0
 def test_bad_name_set(self):
     """Assigning under a key without a leading '/' or under '/' raises KeyError."""
     target = pikepdf.Dictionary()
     target['/Slash'] = 'dot'

     missing_slash = pytest.raises(KeyError, match=r"must begin with '/'")
     with missing_slash:
         target['unslash'] = 'error'

     bare_slash = pytest.raises(KeyError, match=r"may not be '/'")
     with bare_slash:
         target['/'] = 'error'
Exemplo n.º 7
0
def metadata_fixup(input_files_groups, output_file, log, context):
    """Write ``output_file`` either as PDF/A or as a PDF with copied metadata.

    The flattened input files are searched by suffix for the repaired PDF,
    the rendered layers PDF and the PostScript file. When the requested
    output type starts with ``pdfa`` the layers file and the PostScript file
    are passed to ``ghostscript.generate_pdfa``; otherwise the pdfmark built
    from the repaired PDF is stored as the metadata of the layers PDF, which
    is then saved.
    """
    options = context.get_options()
    input_files = list(flatten_groups(input_files_groups))

    def first_with_suffix(suffix):
        # First input file whose name ends with suffix, or None.
        return next(
            (name for name in input_files if name.endswith(suffix)), None)

    metadata_file = first_with_suffix('.repaired.pdf')
    layers_file = first_with_suffix('layers.rendered.pdf')
    ps = first_with_suffix('.ps')

    if not options.output_type.startswith('pdfa'):
        metadata = pikepdf.open(metadata_file)
        pdfmark = get_pdfmark(metadata, options)
        pdf = pikepdf.open(layers_file)
        pdf.metadata = pdf.make_indirect(pikepdf.Dictionary(pdfmark))
        pdf.save(output_file, stream_data_mode=pikepdf.StreamDataMode.compress)
        return

    input_pdfinfo = context.get_pdfinfo()
    # The last character of the output type is the PDF/A part:
    # pdfa-1, pdfa-2, or pdfa-3.
    pdfa_part = options.output_type[-1]
    ghostscript.generate_pdfa(
        pdf_version=input_pdfinfo.min_version,
        pdf_pages=[layers_file, ps],
        output_file=output_file,
        compression=options.pdfa_image_compression,
        log=log,
        threads=options.jobs or 1,
        pdfa_part=pdfa_part,
    )
Exemplo n.º 8
0
def test_random_valid_docinfo(docinfo):
    """Load ``docinfo`` into fresh metadata and check the result parses as XML."""
    pdf = pikepdf.new()
    with pdf.open_metadata() as xmp:
        source = pikepdf.Dictionary(docinfo)
        xmp.load_from_docinfo(source, raise_failure=True)

        # Parsing the serialized metadata ensures it is well-formed XML.
        serialized = str(xmp)
        ET.fromstring(serialized)
Exemplo n.º 9
0
def _scale(doc, page, factor):
    """ Scale a page """
    if factor == 1:
        return page
    rotate = 0
    if "/Rotate" in page:
        # We'll set the rotate attribute on the resulting page so we must
        # unset it on the input page before
        rotate = page.Rotate
        page.Rotate = 0
    page_id = len(doc.pages)
    newmediabox = [factor * float(x) for x in page.MediaBox]
    content = "q {} 0 0 {} 0 0 cm /p{} Do Q".format(factor, factor, page_id)
    xobject = pikepdf.Page(page).as_form_xobject()
    new_page = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=newmediabox,
        Contents=doc.make_stream(content.encode()),
        Resources={'/XObject': {
            '/p{}'.format(page_id): xobject
        }},
        Rotate=rotate,
    )
    # This was needed for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    # It's also needed with pikepdf 4.2 else we get:
    # RuntimeError: QPDFPageObjectHelper::getFormXObjectForPage called with a direct object
    # when calling as_form_xobject in generate_booklet
    new_page = doc.make_indirect(new_page)
    return new_page
Exemplo n.º 10
0
def transcode_pngs(pike, pngs, root, log, options):
    """Recompress the PNG images listed in ``pngs`` and write them into ``pike``.

    When ``options.optimize`` is at least 2, every PNG is first quantized in
    place with ``pngquant.quantize`` using a quality range of
    ``options.png_quality`` minus/plus 10, clamped to 10..100.

    Each PNG is then loaded, re-encoded as Flate data and written back to the
    image object ``xref`` together with its size, bit depth and colour space.
    An image is left untouched when it cannot be loaded (the error is logged),
    when the new data is longer than the existing stream, or when its colour
    layout is not one this function can describe.

    Args:
        pike: PDF that owns the image objects; queried with ``get_object``.
        pngs: Iterable of object numbers of the images to replace.
        root: Location passed to ``png_name`` to find each PNG file.
        log: Logger used for error reporting.
        options: Provides ``optimize``, ``png_quality`` and ``jobs``.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10)
        )
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=options.jobs) as executor:
            for xref in pngs:
                # NOTE: the returned futures are not inspected, so an
                # exception raised by quantize is not reported here.
                executor.submit(
                    pngquant.quantize,
                    png_name(root, xref), png_name(root, xref),
                    png_quality[0], png_quality[1])

    for xref in pngs:
        im_obj = pike.get_object(xref, 0)

        # Open, transcode (!), package for PDF
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.depth == 1:
                pix = pix.invert()  # PDF assumes 1 is black for monochrome
            compdata = pix.generate_pdf_ci_data(
                leptonica.lept.L_FLATE_ENCODE, 0
            )
        except leptonica.LeptonicaError as e:
            log.error(e)
            continue

        # Opening the compressed data without transcoding, via
        # leptonica.CompressedData.open, is what should be done here, but it
        # shifts each pixel row by one for some reason.
        if len(compdata) > int(im_obj.stream_dict.Length):
            continue  # If we produced a larger image, don't use

        # Resolve the colour space before touching im_obj, so an unsupported
        # image is skipped untouched instead of reusing the colour space of
        # the previous image or failing on an unbound name.
        if compdata.ncolors > 0:
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [pikepdf.Name('/Indexed'), pikepdf.Name('/DeviceRGB'),
                  compdata.ncolors - 1, palette_stream]
        else:
            cs_names = {
                1: '/DeviceGray',
                3: '/DeviceRGB',
                4: '/DeviceCMYK',
            }
            cs_name = cs_names.get(compdata.spp)
            if cs_name is None:
                log.error(
                    "Unsupported samples per pixel %s for image %s",
                    compdata.spp, xref)
                continue
            cs = pikepdf.Name(cs_name)

        predictor = Null()
        if compdata.predictor > 0:
            predictor = pikepdf.Dictionary({'/Predictor': compdata.predictor})

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), pikepdf.Name('/FlateDecode'), predictor)
Exemplo n.º 11
0
def test_prevent_gs_invalid_xml(resources, outdir):
    """Check that a trailing nul in a title is not carried into the XMP packet.

    The file produced by ``convert_to_pdfa`` is scanned as raw bytes between
    the XMP packet id and the closing ``<?xpacket end`` marker.
    """
    ps_path = outdir / 'pdfa.ps'
    layers_path = outdir / 'layers.rendered.pdf'
    generate_pdfa_ps(ps_path)
    copyfile(resources / 'trivial.pdf', layers_path)

    # Put a string ending in a nul character into the DocumentInfo dictionary
    # of this PDF, as often occurs in practice.
    # NOTE(review): nothing in this block saves the modified PDF - confirm
    # that the change reaches the file on disk.
    with pikepdf.open(layers_path) as pike:
        nul_title = pikepdf.Dictionary(Title=b'String with trailing nul\x00')
        pike.Root.DocumentInfo = nul_title

    cli_args = ['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
    options = get_parser().parse_args(args=cli_args)
    pdfinfo = PdfInfo(layers_path)
    context = PdfContext(
        options, outdir, layers_path, pdfinfo, get_plugin_manager([])
    )

    convert_to_pdfa(str(layers_path), str(ps_path), context)

    with open(outdir / 'pdfa.pdf', 'r+b') as f, \
            mmap.mmap(f.fileno(), 0) as mm:
        # The XML may be invalid, so the bytes are scanned rather than fed
        # to a parser.
        packet_id = b'W5M0MpCehiHzreSzNTczkc9d'
        xmp_start = mm.find(packet_id)
        xmp_end = mm.rfind(b'<?xpacket end', xmp_start)
        assert 0 < xmp_start < xmp_end
        # Neither an escaped nor a raw nul may appear inside the packet.
        escaped_nul_at = mm.find(b'&#0;', xmp_start, xmp_end)
        assert escaped_nul_at == -1, "found escaped nul"
        raw_nul_at = mm.find(b'\x00', xmp_start, xmp_end)
        assert raw_nul_at == -1
Exemplo n.º 12
0
def _update_page_resources(*, page, font, font_key, procset):
    """Update this page's fonts with a reference to the Glyphless font"""

    if '/Resources' not in page:
        page['/Resources'] = pikepdf.Dictionary({})
    resources = page['/Resources']
    try:
        fonts = resources['/Font']
    except KeyError:
        fonts = pikepdf.Dictionary({})
    if font_key is not None and font_key not in fonts:
        fonts[font_key] = font
    resources['/Font'] = fonts

    # Reassign /ProcSet to one that just lists everything - ProcSet is
    # obsolete and doesn't matter but recommended for old viewer support
    resources['/ProcSet'] = procset
Exemplo n.º 13
0
def test_random_docinfo(docinfo):
    """Load ``docinfo`` into fresh metadata; accept a known failure or valid XML.

    A ``ValueError`` is tolerated only when its message mentions that a value
    could not be copied to XMP or names ``/Dummy``.
    """
    pdf = pikepdf.new()
    with pdf.open_metadata() as xmp:
        source = pikepdf.Dictionary(docinfo)

        try:
            xmp.load_from_docinfo(source, raise_failure=True)
        except ValueError as exc:
            message = str(exc)
            assert 'could not be copied to XMP' in message or '/Dummy' in message
        else:
            # Parsing the serialized metadata ensures it is well-formed XML.
            ET.fromstring(str(xmp))
Exemplo n.º 14
0
def test_failed_add_page_cleanup():
    """Appending a non-page dictionary fails and leaves the page list empty."""
    pdf = pikepdf.new()
    not_a_page = pikepdf.Dictionary(Type=pikepdf.Name.NotAPage)
    objects_before = len(pdf.objects)
    with pytest.raises(TypeError, match="only pages can be inserted"):
        pdf.pages.append(not_a_page)
    assert len(pdf.pages) == 0

    # A failed append is expected to leave exactly one new null object
    # handle behind, since QPDF does not remove the object outright.
    objects_after = len(pdf.objects)
    assert objects_after == objects_before + 1, "QPDF semantics changed"
    assert pdf.objects[-1] is None, "Left a stale object behind without deleting"

    # An object that already existed must survive the failed append.
    existing = pdf.make_indirect(
        pikepdf.Dictionary(Type=pikepdf.Name.StillNotAPage))
    with pytest.raises(TypeError, match="only pages can be inserted"):
        pdf.pages.append(existing)
    assert len(pdf.pages) == 0

    assert existing.same_owner_as(pdf.Root)
Exemplo n.º 15
0
def mergePage(layerPage, mainPage, pdf, name) -> None:
    """Draw ``layerPage`` on ``mainPage`` as a form XObject called ``name``.

    The layer is copied into ``pdf`` as a form XObject, registered in the
    main page's ``/XObject`` resources, and a content stream that draws it is
    prepended to the main page. The main page also takes over the layer's
    MediaBox.
    """
    layer_form = pikepdf.Page(layerPage).as_form_xobject()
    copied_form = pdf.copy_foreign(layer_form)
    draw_layer = b'q\n ' + name.encode() + b' Do\nQ\n'

    # Create the XObject dictionary when it is absent or empty.
    if not mainPage.Resources.get("/XObject"):
        mainPage.Resources["/XObject"] = pikepdf.Dictionary({})
    mainPage.Resources["/XObject"][name] = copied_form

    # The merged page dictates the page size.
    mainPage.MediaBox = layerPage.MediaBox
    layer_stream = pikepdf.Stream(pdf, draw_layer)
    mainPage.page_contents_add(contents=layer_stream, prepend=True)
Exemplo n.º 16
0
def test_not_convertible():
    """A plain Python object is rejected with RuntimeError wherever it is encoded.

    Covers ``encode``, ``Array`` construction and attribute assignment on a
    ``Dictionary``.
    """
    class PurePythonObj:
        def __repr__(self):
            return 'PurePythonObj()'

    unsupported = PurePythonObj()

    with pytest.raises(RuntimeError):
        encode(unsupported)

    with pytest.raises(RuntimeError):
        pikepdf.Array([1, 2, unsupported])

    holder = pikepdf.Dictionary()
    with pytest.raises(RuntimeError):
        holder.SomeKey = unsupported
Exemplo n.º 17
0
def test_issue_162(trivial, author):
    """Loading an Author over an existing ``dc:creator`` warns and replaces it.

    The document metadata is first set to an XMP packet that stores
    ``dc:creator`` as an attribute with the value "Foo".
    """
    xmp_packet = b"""
        <?xpacket begin="\xef\xbb\xbf" id="W5M0MpCehiHzreSzNTczkc9d"?>
        <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
                xmlns:xmp="http://ns.adobe.com/xap/1.0/"
                xmlns:dc="http://purl.org/dc/elements/1.1/">
        <rdf:Description rdf:about="" xmlns:dc="http://purl.org/dc/elements/1.1/" dc:creator="Foo"></rdf:Description>
        </rdf:RDF>
        <?xpacket end="w"?>"""
    trivial.Root.Metadata = Stream(trivial, xmp_packet)

    with trivial.open_metadata() as xmp:
        source = pikepdf.Dictionary(Author=author)
        with pytest.warns(UserWarning, match=r'Merging elements'):
            xmp.load_from_docinfo(source, raise_failure=True)
        assert xmp['dc:creator'] == [author]
Exemplo n.º 18
0
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """Convert groups of images to JBIG2 and insert them into the PDF.

    Images are converted a group at a time because JBIG2 works best with a
    symbol dictionary that spans multiple pages. Each image written back to
    the PDF must reference the symbol dictionary of its group, so every
    image stream is replaced together with decode parameters that point to
    that dictionary.

    If too many pages share the same dictionary, JBIG2 encoding becomes more
    expensive and less efficient.
    """
    def group_prefix(group):
        # File name prefix used for all outputs of one group.
        return f'group{group:08d}'

    pool = concurrent.futures.ThreadPoolExecutor(max_workers=options.jobs)
    with pool as executor:
        futures = [
            executor.submit(
                jbig2enc.convert_group,
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=group_prefix(group)
            )
            for group, xref_exts in jbig2_groups.items()
        ]
        for done in concurrent.futures.as_completed(futures):
            proc = done.result()
            log.debug(proc.stderr.decode())

    for group, xref_exts in jbig2_groups.items():
        prefix = group_prefix(group)
        symbol_data = (root / (prefix + '.sym')).read_bytes()
        jbig2_globals = pikepdf.Stream(pike, symbol_data)

        for n, (xref, _) in enumerate(xref_exts):
            image_data = (root / f'{prefix}.{n:04d}').read_bytes()
            im_obj = pike.get_object(xref, 0)
            decode_parms = pikepdf.Dictionary({
                '/JBIG2Globals': jbig2_globals
            })
            im_obj.write(
                image_data, pikepdf.Name('/JBIG2Decode'), decode_parms
            )
Exemplo n.º 19
0
def _scale(doc, page, factor):
    """ Scale a page """
    if factor == 1:
        return page
    page = doc.make_indirect(page)
    page_id = len(doc.pages)
    newmediabox = [factor * float(x) for x in page.MediaBox]
    content = "q {} 0 0 {} 0 0 cm /p{} Do Q".format(factor, factor, page_id)
    xobject = pikepdf.Page(page).as_form_xobject()
    new_page = pikepdf.Dictionary(
        Type=pikepdf.Name.Page,
        MediaBox=newmediabox,
        Contents=doc.make_stream(content.encode()),
        Resources={'/XObject': {
            '/p{}'.format(page_id): xobject
        }},
    )
    return new_page
Exemplo n.º 20
0
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """Convert images to JBIG2 and insert them into the PDF.

    When the JBIG2 page group size is > 1, several JBIG2 images are produced
    at once together with a symbol dictionary that spans several pages. Each
    JBIG2 image must reference its symbol dictionary. If too many pages share
    the same dictionary, JBIG2 encoding becomes more expensive and less
    efficient. The default value of 10 was determined through testing.
    Currently this must be lossy encoding since jbig2enc does not support
    refinement coding.

    When the JBIG2 symbolic coder is not used, each JBIG2 stands on its own
    and needs no dictionary. Currently this must be lossless JBIG2.

    Raises:
        FileNotFoundError: A group has no symbol file although the page
            group size is not 1.
    """

    _produce_jbig2_images(jbig2_groups, root, log, options)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        symbol_file = root / f'{prefix}.sym'

        if not symbol_file.exists():
            if options.jbig2_page_group_size != 1:
                raise FileNotFoundError(symbol_file)
            # Every image of this group stands on its own.
            decode_parms = None
        else:
            symbol_stream = pikepdf.Stream(pike, symbol_file.read_bytes())
            decode_parms = pikepdf.Dictionary(
                {'/JBIG2Globals': symbol_stream})

        for n, (xref, _) in enumerate(xref_exts):
            image_data = (root / f'{prefix}.{n:04d}').read_bytes()
            im_obj = pike.get_object(xref, 0)
            im_obj.write(
                image_data,
                filter=pikepdf.Name('/JBIG2Decode'),
                decode_parms=decode_parms,
            )
Exemplo n.º 21
0
 def test_bad_name(self):
     """Building a Dictionary from a key without a leading '/' raises ValueError."""
     mapping = {'/Slash': 'dot', 'unslash': 'error'}
     expect_error = pytest.raises(ValueError, match=r"must begin with '/'")
     with expect_error:
         pikepdf.Dictionary(mapping)
Exemplo n.º 22
0
 def test_nonpage(self):
     """``images`` and ``page_contents_add`` raise TypeError on a plain dictionary."""
     plain = pikepdf.Dictionary(A='a')

     with pytest.raises(TypeError):
         plain.images  # pylint: disable=pointless-statement

     with pytest.raises(TypeError):
         plain.page_contents_add(b'', True)
Exemplo n.º 23
0
 def test_get(self):
     """``get`` returns the stored value, or the default for a missing key."""
     sample = pikepdf.Dictionary(A='a')
     present = sample.get(Name.A)
     assert present == 'a'
     fallback = sample.get(Name.Resources, 42)
     assert fallback == 42
Exemplo n.º 24
0
 def test_attr(self):
     """Reading the attribute ``invalidname`` raises AttributeError."""
     sample = pikepdf.Dictionary(A='a')
     expect_error = pytest.raises(AttributeError)
     with expect_error:
         sample.invalidname  # pylint: disable=pointless-statement
Exemplo n.º 25
0
 def test_str(self):
     """Calling ``str`` on a Dictionary raises NotImplementedError."""
     sample = pikepdf.Dictionary(A='a')
     expect_error = pytest.raises(NotImplementedError)
     with expect_error:
         str(sample)
Exemplo n.º 26
0
 def test_items(self):
     """Iterating over ``items()`` of a Dictionary completes without error."""
     sample = pikepdf.Dictionary(A='a')
     for _pair in sample.items():
         continue
Exemplo n.º 27
0
 def test_iter(self):
     """Iterating a Dictionary yields its keys, which index back to the values."""
     sample = pikepdf.Dictionary(A='a')
     for key in sample:
         assert key == '/A'
         value = sample[key]
         assert value == 'a'
Exemplo n.º 28
0
 def test_kwargs(self):
     """Keyword arguments become '/'-prefixed keys that are also listed by ``dir``."""
     sample = pikepdf.Dictionary(A='a', B='b', C='c')
     assert '/B' in sample
     listed = dir(sample)
     assert 'B' in listed
Exemplo n.º 29
0
 def test_init(self):
     """Construction from a mapping and from keywords gives equal dictionaries."""
     from_mapping = pikepdf.Dictionary({'/Animal': 'Dog'})
     from_keywords = pikepdf.Dictionary(Animal='Dog')
     assert from_mapping == from_keywords
Exemplo n.º 30
0
 def test_none(self):
     """Assigning ``None`` to a key raises ValueError."""
     sample = pikepdf.Dictionary({'/One': 1, '/Two': 2})
     expect_error = pytest.raises(ValueError)
     with expect_error:
         sample['/Two'] = None