示例#1
0
def test_stack_abuse():
    """Exercise graphics-state stack abuse in ``_interpret_contents``.

    Three content streams are interpreted:

    * 35 unmatched ``q`` operators must emit a warning mentioning
      ``overflowed``;
    * more ``Q`` than ``q`` operators must emit a warning mentioning
      ``underflowed``;
    * 135 unmatched ``q`` operators must raise ``RuntimeError``.
    """
    # pytest.warns(None) was deprecated in pytest 7 and is rejected by
    # pytest 8, so warnings are recorded with the standard library instead.
    import warnings

    p = pikepdf.Pdf.new()

    stream = pikepdf.Stream(p, b'q ' * 35)
    with warnings.catch_warnings(record=True) as record:
        # Make sure no earlier filter hides the warning we want to inspect.
        warnings.simplefilter('always')
        pdfinfo.info._interpret_contents(stream)
    assert 'overflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q Q Q Q Q')
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter('always')
        pdfinfo.info._interpret_contents(stream)
    assert 'underflowed' in str(record[0].message)

    stream = pikepdf.Stream(p, b'q ' * 135)
    # Warnings may still be emitted before the error; capture and ignore them.
    with warnings.catch_warnings(record=True):
        warnings.simplefilter('always')
        with pytest.raises(RuntimeError):
            pdfinfo.info._interpret_contents(stream)
示例#2
0
def rewrite_png(pike: Pdf, im_obj: Object,
                compdata) -> None:  # pragma: no cover
    """Overwrite the image ``im_obj`` with the PNG data held in ``compdata``.

    The image's bit depth, width, height and colour space are replaced with
    the values reported by ``compdata`` and its stream is rewritten with a
    Flate filter.

    When a PNG is inserted into a PDF we more or less copy the PNG's IDAT
    section and transfer the remaining PNG headers to PDF image metadata.
    The decode parameters must tell the reader whether a predictor was
    applied before Flate encoding (typically one is). According to the
    Leptonica source, readers only need one of:
      1     - no predictor
      10-14 - there is a predictor
    ``compdata.predictor`` is only a true/false flag. For any value >= 10 the
    reader uses whatever the PNG data specifies. Leptonica would in practice
    use Paeth (14), but 15 seems to be the designated value for "optimal",
    so 15 is used here.

    See:
      - PDF RM 7.4.4.4 Table 10
      - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
    """
    has_predictor = compdata.predictor > 0
    dparms = Dictionary(Predictor=15 if has_predictor else 1)
    if has_predictor:
        # Redundant with the image dictionary, but expected alongside a
        # predictor.
        dparms.BitsPerComponent = compdata.bps
        dparms.Colors = compdata.spp
        dparms.Columns = compdata.w

    im_obj.BitsPerComponent = compdata.bps
    im_obj.Width = compdata.w
    im_obj.Height = compdata.h

    log.debug(
        f"PNG {im_obj.objgen}: palette={compdata.ncolors} spp={compdata.spp} bps={compdata.bps}"
    )

    if compdata.ncolors > 0:
        # ncolors counts palette entries, not the colours used by a true
        # colour image. The palette string is always RGB tuples, even for a
        # grayscale image; see
        # https://github.com/DanBloomberg/leptonica/blob/master/src/colormap.c#L2067
        parsed_palette = pikepdf.Object.parse(
            compdata.get_palette_pdf_string())
        palette_stream = pikepdf.Stream(pike, bytes(parsed_palette))
        colorspace = [
            Name.Indexed, Name.DeviceRGB, compdata.ncolors - 1, palette_stream
        ]
    elif compdata.spp == 1:
        # No palette: pick the colour space from the samples per pixel.
        colorspace = Name.DeviceGray
    elif compdata.spp == 4:
        colorspace = Name.DeviceCMYK
    else:  # spp == 3
        colorspace = Name.DeviceRGB

    im_obj.ColorSpace = colorspace
    im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=dparms)
示例#3
0
文件: page.py 项目: vorujack/z3c.rml
def mergePage(layerPage, mainPage, pdf, name) -> None:
    """Add ``layerPage`` to ``mainPage`` as a form XObject drawn first.

    ``layerPage`` is converted to a form XObject, copied into ``pdf`` and
    registered in the XObject resources of ``mainPage`` under ``name``. A
    small content stream that paints the XObject is prepended to the page's
    contents, and ``mainPage`` takes over the MediaBox of ``layerPage``.
    """
    layerXObject = pdf.copy_foreign(
        pikepdf.Page(layerPage).as_form_xobject())
    drawLayer = b'q\n %s Do\nQ\n' % (name.encode())

    # Use the MediaBox from the merged page
    mainPage.MediaBox = layerPage.MediaBox

    # Create the XObject resource dictionary on demand.
    if not mainPage.Resources.get("/XObject"):
        mainPage.Resources["/XObject"] = pikepdf.Dictionary({})
    mainPage.Resources["/XObject"][name] = layerXObject

    drawStream = pikepdf.Stream(pdf, drawLayer)
    mainPage.page_contents_add(contents=drawStream, prepend=True)
示例#4
0
def _release_tuple(version):
    """Return the leading numeric release fields of ``version`` as a tuple.

    Only the first three dot-separated fields are considered, and anything
    after the leading digits of a field (such as ``rc1``) is ignored, e.g.
    ``'2.10.0'`` -> ``(2, 10, 0)``.
    """
    release = []
    for field in version.split('.')[:3]:
        digits = ''
        for char in field:
            if not char.isdigit():
                break
            digits += char
        release.append(int(digits) if digits else 0)
    return tuple(release)


def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two-up and save the result to a temporary file.

    ``len(pages) // 2`` output pages are produced. Output page ``i`` pairs
    ``pages[i]`` with ``pages[-i - 1]``; on even ``i`` the page taken from
    the end is drawn on the left, on odd ``i`` the page taken from the start
    is. Each output page is twice as wide as the wider of the two pages and
    as tall as the taller one.

    Args:
        pdfqueue: Sequence of source documents; item ``nfile - 1`` must
            expose ``copyname``, the path opened with ``pikepdf.open``.
        tmp_dir: Passed to ``make_tmp_file`` to create the output document.
        pages: Page descriptors exposing ``nfile``, ``npage`` and
            ``size_in_points()``.

    Returns:
        The file name the booklet was saved to.

    NOTE(review): with an odd number of pages the middle page is not placed.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {
        n: pikepdf.open(pdfqueue[n - 1].copyname)
        for n in file_indexes
    }
    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    # The release is compared numerically: a plain string comparison would
    # rank '2.10.0' (and '10.0.0') below '2.7.0'.
    needs_indirect_page = _release_tuple(pikepdf.__version__) < (2, 7, 0)

    try:
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = _apply_geom_transform(
                file, file.copy_foreign(first_original), first)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = _apply_geom_transform(
                file, file.copy_foreign(second_original), second)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()
            # See PDF reference section 4.2.3 Transformation Matrices
            # Shift each page so its MediaBox origin lands at the left edge
            # of its slot; the second slot starts at the first page's width.
            tx1 = -first_foreign.MediaBox[0]
            ty1 = -first_foreign.MediaBox[1]
            tx2 = first_page_size[0] - float(second_foreign.MediaBox[0])
            ty2 = -second_foreign.MediaBox[1]
            content_txt = (f"q 1 0 0 1 {tx1} {ty1} cm /Page{i*2} Do Q "
                           f"q 1 0 0 1 {tx2} {ty2} cm /Page{i*2 + 1} Do Q ")

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect_page:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    finally:
        # The sources have to stay open until the copied objects have been
        # written out, so they are closed only after saving (or on error).
        for source in source_files.values():
            source.close()
    return filename
示例#5
0
def _release_tuple(version):
    """Return the leading numeric release fields of ``version`` as a tuple.

    Only the first three dot-separated fields are considered, and anything
    after the leading digits of a field (such as ``rc1``) is ignored, e.g.
    ``'2.10.0'`` -> ``(2, 10, 0)``.
    """
    release = []
    for field in version.split('.')[:3]:
        digits = ''
        for char in field:
            if not char.isdigit():
                break
            digits += char
        release.append(int(digits) if digits else 0)
    return tuple(release)


def generate_booklet(pdfqueue, tmp_dir, pages):
    """Impose ``pages`` two-up and save the result to a temporary file.

    ``len(pages) // 2`` output pages are produced. Output page ``i`` pairs
    ``pages[i]`` with ``pages[-i - 1]``; on even ``i`` the page taken from
    the end is drawn on the left, on odd ``i`` the page taken from the start
    is. The left page is drawn at the origin and the right page is shifted
    right by the width of the left page. Each output page is twice as wide
    as the wider of the two pages and as tall as the taller one.

    Args:
        pdfqueue: Sequence of source documents; item ``nfile - 1`` must
            expose ``copyname``, the path opened with ``pikepdf.open``.
        tmp_dir: Passed to ``make_tmp_file`` to create the output document.
        pages: Page descriptors exposing ``nfile``, ``npage`` and
            ``size_in_points()``; each is also handed to ``_update_angle``.

    Returns:
        The file name the booklet was saved to.

    NOTE(review): with an odd number of pages the middle page is not placed.
    """
    file, filename = make_tmp_file(tmp_dir)
    content_dict = pikepdf.Dictionary({})
    file_indexes = {p.nfile for p in pages}
    source_files = {
        n: pikepdf.open(pdfqueue[n - 1].copyname)
        for n in file_indexes
    }
    # workaround for pikepdf <= 2.6.0. See https://github.com/pikepdf/pikepdf/issues/174
    # The release is compared numerically: a plain string comparison would
    # rank '2.10.0' (and '10.0.0') below '2.7.0'.
    needs_indirect_page = _release_tuple(pikepdf.__version__) < (2, 7, 0)

    try:
        for i in range(len(pages) // 2):
            even = i % 2 == 0
            first = pages[-i - 1 if even else i]
            second = pages[i if even else -i - 1]

            second_page_size = second.size_in_points()
            first_page_size = first.size_in_points()
            page_size = [
                max(second_page_size[0], first_page_size[0]) * 2,
                max(second_page_size[1], first_page_size[1])
            ]

            first_original = source_files[first.nfile].pages[first.npage - 1]
            first_foreign = file.copy_foreign(first_original)
            _update_angle(first, first_original, first_foreign)

            second_original = source_files[second.nfile].pages[second.npage - 1]
            second_foreign = file.copy_foreign(second_original)
            _update_angle(second, second_original, second_foreign)

            content_dict[f'/Page{i*2}'] = pikepdf.Page(
                first_foreign).as_form_xobject()
            content_dict[f'/Page{i*2 + 1}'] = pikepdf.Page(
                second_foreign).as_form_xobject()

            content_txt = (
                f'q 1 0 0 1 0 0 cm /Page{i*2} Do Q'
                f' q 1 0 0 1 {first_page_size[0]} 0 cm /Page{i*2 + 1} Do Q ')

            newpage = pikepdf.Dictionary(
                Type=pikepdf.Name.Page,
                MediaBox=[0, 0, *page_size],
                Resources=pikepdf.Dictionary(XObject=content_dict),
                Contents=pikepdf.Stream(file, content_txt.encode()))

            if needs_indirect_page:
                newpage = file.make_indirect(newpage)
            file.pages.append(newpage)

        file.save(filename)
    finally:
        # The sources have to stay open until the copied objects have been
        # written out, so they are closed only after saving (or on error).
        for source in source_files.values():
            source.close()
    return filename
示例#6
0
def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    """
    Encode groups of images as JBIG2 and write them back into the PDF.

    Images are handled a group at a time because JBIG2 works best when one
    symbol dictionary spans several pages, and every JBIG2 image written to
    the PDF must reference the symbol dictionary it belongs to. Each group is
    encoded on a thread pool; afterwards every image stream in the group is
    replaced by its JBIG2 data together with decode parameters pointing at
    the group's dictionary.

    Sharing one dictionary between too many pages makes JBIG2 encoding more
    expensive and less efficient.

    """
    def group_prefix(group):
        return 'group{:08d}'.format(group)

    with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs) as executor:
        pending = [
            executor.submit(
                jbig2enc.convert_group,
                cwd=fspath(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=group_prefix(group)
            )
            for group, xref_exts in jbig2_groups.items()
        ]
        # Surface each encoder's stderr (and any exception) as it finishes.
        for finished in concurrent.futures.as_completed(pending):
            log.debug(finished.result().stderr.decode())

    for group, xref_exts in jbig2_groups.items():
        prefix = group_prefix(group)
        symbol_file = root / (prefix + '.sym')
        jbig2_globals = pikepdf.Stream(pike, symbol_file.read_bytes())

        for n, (xref, _) in enumerate(xref_exts):
            # Image n of the group is stored as <prefix>.<n, zero padded>.
            image_file = root / (prefix + '.{:04d}'.format(n))
            image_data = image_file.read_bytes()
            target = pike.get_object(xref, 0)
            target.write(
                image_data, pikepdf.Name('/JBIG2Decode'),
                pikepdf.Dictionary({
                    '/JBIG2Globals': jbig2_globals
                })
            )
示例#7
0
def strip_invisible_text(pdf, page):
    """Rewrite ``page.Contents`` without text objects drawn in render mode 3.

    The page's content streams are coalesced and parsed. Everything outside
    ``BT`` ... ``ET`` is kept. A text object is kept only if the render mode
    in effect when its ``ET`` is reached (0 at ``BT``, updated by each
    ``Tr``) is not 3. The surviving instructions are serialised into a new
    stream owned by ``pdf``.
    """
    begin_text = pikepdf.Operator('BT')
    end_text = pikepdf.Operator('ET')
    set_render_mode = pikepdf.Operator('Tr')
    inline_image = pikepdf.Operator('INLINE IMAGE')

    kept = []
    pending_text = []
    inside_text = False
    render_mode = 0

    rich_page = pikepdf.Page(page)
    rich_page.contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ''):
        instruction = (operands, operator)
        if inside_text:
            if operator == set_render_mode:
                render_mode = operands[0]
            pending_text.append(instruction)
            if operator == end_text:
                # The text object is complete: keep it or drop it as a whole.
                inside_text = False
                if render_mode != 3:
                    kept.extend(pending_text)
                pending_text.clear()
        elif operator == begin_text:
            inside_text = True
            render_mode = 0
            pending_text.append(instruction)
        else:
            kept.append(instruction)

    def to_bytes(operand):
        # Operands without unparse() are serialised through str().
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode('ascii')

    def serialise(operands, operator):
        if operator == inline_image:
            return operands[0].unparse()
        return b' '.join(map(to_bytes, operands)) + b' ' + operator.unparse()

    content_stream = b'\n'.join(
        serialise(operands, operator) for operands, operator in kept)
    page.Contents = pikepdf.Stream(pdf, content_stream)
示例#8
0
def strip_invisible_text(pdf, page):
    """Replace ``page.Contents`` with a copy lacking render-mode-3 text.

    After coalescing the page's content streams, the instructions are
    scanned once. Instructions outside a ``BT``/``ET`` pair always survive;
    a whole text object survives only when the render mode seen at its
    ``ET`` (reset to 0 at ``BT``, changed by ``Tr``) differs from 3. The
    survivors are unparsed, joined with newlines and stored in a new stream
    created in ``pdf``.
    """
    op_bt = pikepdf.Operator("BT")
    op_et = pikepdf.Operator("ET")
    op_tr = pikepdf.Operator("Tr")
    op_inline_image = pikepdf.Operator("INLINE IMAGE")

    survivors = []
    current_text = []
    in_text = False
    mode = 0

    page.page_contents_coalesce()
    for operands, operator in pikepdf.parse_content_stream(page, ""):
        entry = (operands, operator)

        if not in_text:
            if operator != op_bt:
                survivors.append(entry)
                continue
            # A new text object starts with the default render mode.
            in_text = True
            mode = 0
            current_text.append(entry)
            continue

        if operator == op_tr:
            mode = operands[0]
        current_text.append(entry)
        if operator == op_et:
            in_text = False
            if mode != 3:
                survivors.extend(current_text)
            current_text.clear()

    def as_bytes(operand):
        # Fall back to str() for operands that cannot unparse themselves.
        try:
            return operand.unparse()
        except AttributeError:
            return str(operand).encode("ascii")

    lines = []
    for operands, operator in survivors:
        if operator == op_inline_image:
            lines.append(operands[0].unparse())
            continue
        pieces = b" ".join(as_bytes(operand) for operand in operands)
        lines.append(pieces + b" " + operator.unparse())

    page.Contents = pikepdf.Stream(pdf, b"\n".join(lines))
示例#9
0
def test_image_scale0(resources, outpdf):
    """An XObject drawn with an all-zero matrix must give a non-finite DPI.

    The first page of ``cmyk.pdf`` is placed on a blank 72x72 page as a form
    XObject painted through ``0 0 0 0 0 0 cm``. ``PdfInfo`` must then report
    a non-finite image DPI and a page DPI of ``Resolution(0, 0)``.
    """
    with pikepdf.open(resources / 'cmyk.pdf') as cmyk:
        xobj = pikepdf.Page(cmyk.pages[0]).as_form_xobject()

        p = pikepdf.Pdf.new()
        p.add_blank_page(page_size=(72, 72))
        blank_page = pikepdf.Page(p.pages[0])
        objname = blank_page.add_resource(
            p.copy_foreign(xobj), pikepdf.Name.XObject, pikepdf.Name.Im0)
        print(objname)
        draw_scaled_to_zero = b"q 0 0 0 0 0 0 cm %s Do Q" % bytes(objname)
        p.pages[0].Contents = pikepdf.Stream(p, draw_scaled_to_zero)
        p.save(outpdf)

    pi = pdfinfo.PdfInfo(outpdf,
                         detailed_analysis=True,
                         progbar=False,
                         max_workers=1)
    first_page = pi.pages[0]
    assert not first_page._images[0].dpi.is_finite
    assert pi.pages[0].dpi == Resolution(0, 0)
示例#10
0
def convert_to_jbig2(
    pike: Pdf,
    jbig2_groups: Dict[int, List[XrefExt]],
    root: Path,
    options,
    executor: Executor,
) -> None:
    """Encode images as JBIG2 and write them back into the PDF.

    With a JBIG2 page group size above 1, several images are encoded together
    and share a symbol dictionary spanning several pages; each image must
    then reference that dictionary. Sharing a dictionary between too many
    pages makes encoding more expensive and less efficient. The default of
    10 was determined through testing. This mode is currently lossy because
    jbig2enc does not support refinement coding.

    Without the symbolic coder each JBIG2 image stands alone and needs no
    dictionary. This mode is currently lossless.

    Raises:
        FileNotFoundError: A group has no symbol file although the page
            group size is not 1.
    """

    _produce_jbig2_images(jbig2_groups, root, options, executor)

    for group, xref_exts in jbig2_groups.items():
        prefix = f'group{group:08d}'
        jbig2_symfile = root / (prefix + '.sym')

        if not jbig2_symfile.exists():
            # A missing symbol file is only acceptable for standalone images.
            if options.jbig2_page_group_size != 1:
                raise FileNotFoundError(jbig2_symfile)
            decode_parms = None
        else:
            symbol_stream = pikepdf.Stream(pike, jbig2_symfile.read_bytes())
            decode_parms = Dictionary(JBIG2Globals=symbol_stream)

        for n, (xref, _) in enumerate(xref_exts):
            image_path = root / (prefix + f'.{n:04d}')
            image_data = image_path.read_bytes()
            target = pike.get_object(xref, 0)
            target.write(image_data,
                         filter=Name.JBIG2Decode,
                         decode_parms=decode_parms)
示例#11
0
def test_malformed_docinfo(caplog, resources, outdir):
    """Converting a PDF whose trailer Info is a stream must log a warning.

    ``trivial.pdf`` is saved with ``/Info`` replaced by a stream, then run
    through ``convert_to_pdfa``; one of the captured log records has to
    mention ``malformed DocumentInfo block``.
    """
    pdfa_ps = outdir / 'pdfa.ps'
    rendered_pdf = outdir / 'layers.rendered.pdf'

    generate_pdfa_ps(pdfa_ps)

    with pikepdf.open(resources / 'trivial.pdf') as pike:
        pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
        pike.save(rendered_pdf, fix_metadata_version=False)

    options = parser.parse_args(
        args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'])
    pdfinfo = PdfInfo(rendered_pdf)
    context = PDFContext(options, outdir, rendered_pdf, pdfinfo)

    convert_to_pdfa(str(rendered_pdf), str(pdfa_ps), context)

    print(caplog.records)
    logged = [record.message for record in caplog.records]
    assert any('malformed DocumentInfo block' in message
               for message in logged)
示例#12
0
def attach_notebook(pdf_in, pdf_out, notebook):
    """Embed a notebook in a PDF as a file attachment and save the result.

    Args:
        pdf_in: Source PDF, opened with ``pikepdf.open``.
        pdf_out: Destination handed to ``Pdf.save``.
        notebook: Mapping with the keys ``"contents"`` (the data stored in
            the embedded file stream) and ``"file_name"`` (used as the
            attachment's name and file specification).

    Any existing ``/EmbeddedFiles`` name tree is replaced by one holding
    only this attachment; other entries of the document's ``/Names``
    dictionary are left in place.
    """
    N = pikepdf.Name

    # The context manager closes the input file even if saving fails.
    with pikepdf.open(pdf_in) as main_pdf:
        the_file = pikepdf.Stream(main_pdf, notebook["contents"])
        the_file[N("/Type")] = N("/EmbeddedFile")

        file_wrapper = pikepdf.Dictionary(F=the_file)

        fname = notebook["file_name"]
        embedded_file = pikepdf.Dictionary(
            Type=N("/Filespec"), UF=fname, F=fname, EF=file_wrapper
        )

        name_tree = pikepdf.Array([pikepdf.String(fname), embedded_file])

        embedded_files = pikepdf.Dictionary(Names=name_tree)

        if "/Names" in main_pdf.Root:
            # Keep other name trees (e.g. named destinations) intact instead
            # of overwriting the whole /Names dictionary.
            main_pdf.Root[N("/Names")][N("/EmbeddedFiles")] = embedded_files
        else:
            main_pdf.Root[N("/Names")] = pikepdf.Dictionary(
                EmbeddedFiles=embedded_files)

        main_pdf.save(pdf_out)
示例#13
0
    def run(self, rows=0, cols=0, actually_trim=0):
        """Tile the selected input pages onto one page of a new document.

        Every page listed in ``self.page_range`` is copied into a new PDF as
        a form XObject and drawn on a grid. A page number of zero or less
        leaves its grid cell empty; numbers beyond the end of the input are
        skipped with a message. Layer information (``/OCProperties``) is
        copied from the input document when present.

        The grid is laid out according to ``self.col_major``,
        ``self.right_to_left`` and ``self.bottom_to_top``; tiles are
        optionally rotated (``self.rotation``: 1 clockwise, 2
        counterclockwise), overlapped by ``self.trim`` and surrounded by
        ``self.margin``. The tile size is taken from the first placed page.

        Args:
            rows: Number of grid rows, or 0 to derive it.
            cols: Number of grid columns, or 0 to derive it. With both at 0
                a square grid is used; a non-zero ``cols`` takes priority
                and ``rows`` is recomputed from the number of tiles.
            actually_trim: When equal to 1, the TrimBox of each copied page
                is shrunk by the trim amounts before it becomes an XObject.

        Returns:
            The new ``pikepdf.Pdf`` containing the tiled page, or ``None``
            when no input document is loaded or no page could be placed.
        """
        if self.in_doc is None:
            print(_('Input document not loaded'))
            return

        if len(self.page_range) == 0:
            self.set_page_range()

        # initialize a new document and copy over the layer info (OCGs) if it exists
        new_doc = pikepdf.Pdf.new()

        if '/OCProperties' in self.in_doc.Root.keys():
            localRoot = new_doc.copy_foreign(self.in_doc.Root)
            new_doc.Root.OCProperties = localRoot.OCProperties

        content_dict = pikepdf.Dictionary({})
        page_names = []
        pw = None
        ph = None
        page_size_ref = 0

        page_count = len(self.in_doc.pages)
        # trim is used below as [left, right, top, bottom]
        trim = [self.units_to_px(t) for t in self.trim]

        for p in self.page_range:
            if p > page_count:
                print(
                    _('Only {} pages in document, skipping {}').format(
                        page_count, p))
                continue

            if p > 0:
                pagekey = f'/Page{p}'

                if pagekey not in content_dict.keys():
                    # copy the page over as an xobject
                    # pikepdf.pages is zero indexed, so subtract one
                    localpage = new_doc.copy_foreign(self.in_doc.pages[p - 1])

                    # set the trim box to cut off content if requested
                    if actually_trim == 1:
                        if '/TrimBox' not in localpage.keys():
                            localpage.TrimBox = copy.copy(localpage.MediaBox)

                        localpage.TrimBox[0] = float(
                            localpage.TrimBox[0]) + trim[0]
                        localpage.TrimBox[1] = float(
                            localpage.TrimBox[1]) + trim[3]
                        localpage.TrimBox[2] = float(
                            localpage.TrimBox[2]) - trim[1]
                        localpage.TrimBox[3] = float(
                            localpage.TrimBox[3]) - trim[2]

                    content_dict[pagekey] = pikepdf.Page(
                        localpage).as_form_xobject()

                    # only get the width/height for the first page
                    if pw is None:
                        pw = float(localpage.MediaBox[2])
                        ph = float(localpage.MediaBox[3])
                        page_size_ref = p
                    elif abs(pw - float(localpage.MediaBox[2])) > 1 or abs(
                            ph - float(localpage.MediaBox[3])) > 1:
                        # Format after translating so the message catalogue
                        # lookup sees the untouched template string.
                        print(
                            _('Warning: page {} is a different size from {}, output may be unpredictable'
                              ).format(p, page_size_ref))

                page_names.append(pagekey)
            else:
                # zero or negative page numbers reserve an empty grid cell
                page_names.append(None)

        if pw is None:
            # No page was copied, so there is no tile size to build a grid
            # from; bail out instead of failing in the arithmetic below.
            print(_('No pages to tile'))
            return

        # create a new document with a page big enough to contain all the tiled pages, plus requested margin
        # figure out how big it needs to be based on requested columns/rows
        n_tiles = len(page_names)
        if cols == 0 and rows == 0:
            # try for square
            cols = math.ceil(math.sqrt(n_tiles))
            rows = cols

        # columns take priority if both are specified
        if cols > 0:
            rrows = rows
            rows = math.ceil(n_tiles / cols)
            if rrows != rows and rrows != 0:
                print(
                    _('Warning: requested {} columns and {} rows, but {} rows are needed with {} pages'
                      ).format(cols, rrows, rows, n_tiles))
        else:
            cols = math.ceil(n_tiles / rows)

        # convert the margin into px; unitstr is only used for reporting
        unitstr = 'cm' if self.units else 'in'
        margin = self.units_to_px(self.margin)

        rotstr = _('None')

        if self.rotation == 1:
            rotstr = _('Clockwise')

        if self.rotation == 2:
            rotstr = _('Counterclockwise')

        orderstr = _('Rows then columns')
        if self.col_major:
            orderstr = _('Columns then rows')

        lrstr = _('Left to right')
        if self.right_to_left:
            lrstr = _('Right to left')

        btstr = _('Top to bottom')
        if self.bottom_to_top:
            btstr = _('Bottom to top')

        print(_('Tiling with {} rows and {} columns').format(rows, cols))
        print(_('Options') + ':')
        print('    ' + _('Margins') + ': {} {}'.format(self.margin, unitstr))
        print('    ' + _('Trim') + ': {} {}'.format(self.trim, unitstr))
        print('    ' + _('Rotation') + ': {}'.format(rotstr))
        print('    ' + _('Page order') +
              ': {}, {}, {}'.format(orderstr, lrstr, btstr))

        # define the media box with the final grid + margins
        # run through the width/height combos to find the maximum required
        # R is the rotation matrix (default to identity)
        R = [1, 0, 0, 1]

        # We need to account for the shift in origin if page rotation is applied
        o_shift = [0, 0]

        if self.rotation != 0:
            # define the rotation transform and
            # swap the trim order
            if self.rotation == 1:
                R = [0, -1, 1, 0]
                o_shift = [0, pw]
                order = [3, 2, 0, 1]

            if self.rotation == 2:
                R = [0, 1, -1, 0]
                o_shift = [ph, 0]
                order = [2, 3, 1, 0]

            # swap width and height of pages
            pw, ph = ph, pw

            trim = [trim[o] for o in order]

        # define the output page media box
        width = (pw - trim[0] - trim[1]) * cols
        height = (ph - trim[2] - trim[3]) * rows
        media_box = [0, 0, width + 2 * margin, height + 2 * margin]

        draw_commands = []

        for i, page_name in enumerate(page_names):
            if not page_name:
                continue

            if self.col_major:
                c = math.floor(i / rows)
                r = i % rows
            else:
                r = math.floor(i / cols)
                c = i % cols

            if self.right_to_left:
                c = cols - c - 1

            # PDF coordinates grow upwards, so top-to-bottom order means
            # filling the grid from the highest row index down.
            if not self.bottom_to_top:
                r = rows - r - 1

            x0 = margin - trim[0] + c * (pw - trim[0] - trim[1])
            y0 = margin - trim[3] + r * (ph - trim[2] - trim[3])

            # don't scale, just shift and rotate
            # first shift to origin, then rotate, then shift to final destination
            draw_commands.append(
                f'q {R[0]} {R[1]} {R[2]} {R[3]} {x0+o_shift[0]} {y0+o_shift[1]} cm ')
            draw_commands.append(f'{page_name} Do Q ')

        content_txt = ''.join(draw_commands)

        newpage = pikepdf.Dictionary(
            Type=pikepdf.Name.Page,
            MediaBox=media_box,
            Resources=pikepdf.Dictionary(XObject=content_dict),
            Contents=pikepdf.Stream(new_doc, content_txt.encode()))

        new_doc.pages.append(newpage)
        return new_doc
示例#14
0
def transcode_pngs(pike, images, image_name_fn, root, log, options):
    """Replace images in the PDF with re-encoded PNG data where it is smaller.

    When ``options.optimize`` is at least 2, every image is first run through
    ``pngquant.quantize`` on a thread pool. Each image is then loaded from
    its PNG file and, if the PNG data is not larger than the stream already
    in the PDF, written into the PDF object with a Flate filter.

    An image is skipped (left untouched) when its PNG cannot be opened, when
    the PNG is larger than the existing stream, or when it has no palette
    and a number of samples per pixel other than 1, 3 or 4.

    Args:
        pike: The PDF being modified.
        images: Iterable of xrefs (object numbers) of the images to process.
        image_name_fn: Called as ``image_name_fn(root, xref)`` to get the
            quantizer's input file.
        root: Working directory passed to the file-name helpers.
        log: Logger used for progress and problems.
        options: Object providing ``optimize``, ``png_quality`` and ``jobs``.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        # The futures are not inspected: a quantize run that produces no
        # output is detected below when the PNG file cannot be opened.
        with concurrent.futures.ThreadPoolExecutor(
                max_workers=options.jobs) as executor:
            for xref in images:
                log.debug(image_name_fn(root, xref))
                executor.submit(
                    pngquant.quantize,
                    image_name_fn(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in images:
        im_obj = pike.get_object(xref, 0)
        try:
            compdata = leptonica.CompressedData.open(png_name(root, xref))
        except leptonica.LeptonicaError as e:
            # Most likely this means file not found, i.e. quantize did not
            # produce an improved version
            log.error(e)
            continue

        # If re-coded image is larger don't use it - we test here because
        # pngquant knows the size of the temporary output file but not the actual
        # object in the PDF
        if len(compdata) > int(im_obj.stream_dict.Length):
            log.debug(
                f"pngquant: pngquant did not improve over original image "
                f"{len(compdata)} > {int(im_obj.stream_dict.Length)}")
            continue

        # Resolve the colour space before touching im_obj, so an unsupported
        # image is left completely unmodified and never inherits the colour
        # space chosen for a previous image.
        if compdata.ncolors > 0:
            # .ncolors is the number of colors in the palette, not the number of
            # colors used in a true color image
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            # PDF interprets binary-1 as black in 1bpp, but PNG sets
            # black to 0 for 1bpp. A palette describing that mapping would
            # be cleaner, but pikepdf needs to be patched to support it;
            # the Decode array below is used instead.
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            cs = Name.DeviceCMYK
        else:
            log.debug(
                f"pngquant: unsupported samples per pixel {compdata.spp}, "
                f"keeping original image")
            continue

        # When a PNG is inserted into a PDF, we more or less copy the IDAT section from
        # the PNG and transfer the rest of the PNG headers to PDF image metadata.
        # One thing we have to do is tell the PDF reader whether a predictor was used
        # on the image before Flate encoding. (Typically one is.)
        # According to Leptonica source, PDF readers don't actually need us
        # to specify the correct predictor, they just need a value of either:
        #   1 - no predictor
        #   10-14 - there is a predictor
        # Leptonica's compdata->predictor only tells TRUE or FALSE
        # From there the PNG decoder can infer the rest from the file.
        # In practice the predictor should be Paeth, 14, so we'll use that.
        # See:
        #   - PDF RM 7.4.4.4 Table 10
        #   - https://github.com/DanBloomberg/leptonica/blob/master/src/pdfio2.c#L757
        predictor = 14 if compdata.predictor > 0 else 1
        dparms = Dictionary(Predictor=predictor)
        if predictor > 1:
            dparms.BitsPerComponent = compdata.bps  # Yes, this is redundant
            dparms.Colors = compdata.spp
            dparms.Columns = compdata.w

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h

        if compdata.bps == 1:
            im_obj.Decode = [
                1, 0
            ]  # Bit of a kludge but this inverts photometric too
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(),
                     filter=Name.FlateDecode,
                     decode_parms=dparms)
示例#15
0
    def _graft_text_layer(
        self,
        *,
        page_num: int,
        textpdf: Path,
        font: pikepdf.Object,
        font_key: pikepdf.Object,
        procset: pikepdf.Object,
        text_rotation: int,
        strip_old_text: bool,
    ):
        """Insert the text layer from text page 0 on to pdf_base at page_num

        The content stream of the first page of ``textpdf`` is wrapped in a
        form XObject, registered in the base page's XObject resources under a
        random name, and painted by a small content stream that is prepended
        to the base page's contents. Nothing is done when ``textpdf`` is an
        empty (zero byte) file.

        Args:
            page_num: Page of ``self.pdf_base`` to graft onto, looked up with
                ``pages.p`` (presumably 1-based; confirm against pikepdf).
            textpdf: Path of the PDF holding the text layer.
            font: Font object handed to ``_update_resources``.
            font_key: Resource key for ``font``, handed to
                ``_update_resources``.
            procset: ProcSet applied to the base page's resources (the form
                XObject itself always gets ``[Name.PDF]``).
            text_rotation: Clockwise rotation in degrees to apply to the
                text layer.
            strip_old_text: When true, ``strip_invisible_text`` is run on the
                base page before the new layer is added.
        """

        if Path(textpdf).stat().st_size == 0:
            return

        with pikepdf.open(textpdf) as pdf_text:
            pdf_text_contents = pdf_text.pages[0].Contents.read_bytes()

            # This is a pointer indicating a specific page in the base file
            base_page = self.pdf_base.pages.p(page_num)

            # The text page always will be oriented up by this stage but the original
            # content may have a rotation applied. Wrap the text stream with a rotation
            # so it will be oriented the same way as the rest of the page content.
            # (Previous versions OCRmyPDF rotated the content layer to match the text.)
            # wt, ht: width and height of the text page
            mediabox = [float(pdf_text.pages[0].MediaBox[v]) for v in range(4)]
            wt, ht = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # wp, hp: width and height of the base page. From here on
            # ``mediabox`` refers to the base page, not the text page.
            mediabox = [float(base_page.MediaBox[v]) for v in range(4)]
            wp, hp = mediabox[2] - mediabox[0], mediabox[3] - mediabox[1]

            # translate uses the unrotated text page size, so it must be
            # built before wt and ht are swapped below.
            translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
            untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
            corner = pikepdf.PdfMatrix().translated(mediabox[0], mediabox[1])
            # -rotation because the input is a clockwise angle and this formula
            # uses CCW
            text_rotation = -text_rotation % 360
            rotate = pikepdf.PdfMatrix().rotated(text_rotation)

            # Because of rounding of DPI, we might get a text layer that is not
            # identically sized to the target page. Scale to adjust. Normally this
            # is within 0.998.
            # After a quarter turn the text page's width and height trade places.
            if text_rotation in (90, 270):
                wt, ht = ht, wt
            scale_x = wp / wt
            scale_y = hp / ht

            # log.debug('%r', scale_x, scale_y)
            scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

            # Translate the text so it is centered at (0, 0), rotate it there, adjust
            # for a size different between initial and text PDF, then untranslate, and
            # finally move the lower left corner to match the mediabox
            ctm = translate @ rotate @ scale @ untranslate @ corner

            # Register the text stream as a form XObject under a random,
            # collision-resistant name in the base page's resources.
            base_resources = _ensure_dictionary(base_page, Name.Resources)
            base_xobjs = _ensure_dictionary(base_resources, Name.XObject)
            text_xobj_name = Name("/" + str(uuid.uuid4()))
            xobj = self.pdf_base.make_stream(pdf_text_contents)
            base_xobjs[text_xobj_name] = xobj
            xobj.Type = Name.XObject
            xobj.Subtype = Name.Form
            xobj.FormType = 1
            # BBox is set to the base page's MediaBox (see reassignment above)
            xobj.BBox = mediabox
            _update_resources(
                obj=xobj, font=font, font_key=font_key, procset=[Name.PDF]
            )

            # Content stream that paints the XObject through ctm, isolated
            # from the rest of the page by q ... Q.
            pdf_draw_xobj = (
                (b"q %s cm\n" % ctm.encode()) + (b"%s Do\n" % text_xobj_name) + b"\nQ\n"
            )
            new_text_layer = pikepdf.Stream(self.pdf_base, pdf_draw_xobj)

            # Strip the old text before adding the new layer, so the new
            # layer itself is not subject to stripping.
            if strip_old_text:
                strip_invisible_text(self.pdf_base, base_page)

            # prepend=True places the text layer ahead of the existing contents
            base_page.page_contents_add(new_text_layer, prepend=True)

            _update_resources(
                obj=base_page, font=font, font_key=font_key, procset=procset
            )
示例#16
0
def _graft_text_layer(*, pdf_base, page_num, text, font, font_key, procset,
                      rotation, strip_old_text, log):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content stream of page 0 of ``text`` is wrapped in a ``q ... cm ... Q``
    block whose matrix rotates and scales it to fit the target page, and the
    result is prepended to the contents of the target page.

    Args:
        pdf_base: open PDF that receives the text layer.
        page_num: page number, passed to ``pdf_base.pages.p()``.
        text: path of the PDF holding the text layer. A zero-byte file means
            there is nothing to graft and the function returns immediately.
        font, font_key, procset: forwarded unchanged to
            ``_update_page_resources`` for the target page.
        rotation: clockwise rotation, in degrees, to apply to the text layer.
        strip_old_text: when true, ``strip_invisible_text`` is run on the
            target page before the new layer is added.
        log: logger used for debug output.

    Returns:
        None. ``pdf_base`` is modified in place.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    pdf_text = pikepdf.open(text)
    try:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()

        # The page in the base file that will receive the text layer
        base_page = pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the
        # original content may have a rotation applied. Wrap the text stream
        # with a rotation so it will be oriented the same way as the rest of
        # the page content.
        # (Previous versions OCRmyPDF rotated the content layer to match the
        # text.)
        text_box = [float(text_page.MediaBox[v]) for v in range(4)]
        wt, ht = text_box[2] - text_box[0], text_box[3] - text_box[1]

        page_box = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = page_box[2] - page_box[0], page_box[3] - page_box[1]

        translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)
        corner = pikepdf.PdfMatrix().translated(page_box[0], page_box[1])

        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        rotation = -rotation % 360
        rotate = pikepdf.PdfMatrix().rotated(rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally
        # this is within 0.998.
        if rotation in (90, 270):
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht
        scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there,
        # adjust for a size different between initial and text PDF, then
        # untranslate, and finally move the lower left corner to match the
        # mediabox
        ctm = translate @ rotate @ scale @ untranslate @ corner

        # q/Q save and restore the graphics state so the matrix only affects
        # the grafted text.
        pdf_text_contents = (
            b'q %s cm\n' % ctm.encode() + pdf_text_contents + b'\nQ\n'
        )
        new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

        if strip_old_text:
            strip_invisible_text(pdf_base, base_page)

        base_page.page_contents_add(new_text_layer, prepend=True)

        _update_page_resources(page=base_page,
                               font=font,
                               font_key=font_key,
                               procset=procset)
    finally:
        # Close the text PDF even when grafting fails part way through, so
        # the file handle is not leaked.
        pdf_text.close()
示例#17
0
def transcode_pngs(pike, pngs, root, log, options):
    """Replace PDF images with Flate-encoded data produced from PNG files.

    Args:
        pike: PDF whose image objects are rewritten in place.
        pngs: xrefs of the images to process. Iterated more than once, so it
            must be a re-iterable collection.
        root: forwarded to ``png_name(root, xref)`` to locate each PNG file.
        log: logger; problems with an individual image are reported with
            ``log.error`` and that image is skipped.
        options: object providing ``optimize``, and, when ``optimize >= 2``,
            ``png_quality`` and ``jobs``.

    Returns:
        None.
    """
    if options.optimize >= 2:
        png_quality = (
            max(10, options.png_quality - 10),
            min(100, options.png_quality + 10),
        )
        # Quantize every PNG in place (input and output name are the same).
        # The returned futures are not inspected, so an exception raised
        # inside a worker is not reported here.
        with concurrent.futures.ThreadPoolExecutor(
            max_workers=options.jobs
        ) as executor:
            for xref in pngs:
                executor.submit(
                    pngquant.quantize,
                    png_name(root, xref),
                    png_name(root, xref),
                    png_quality[0],
                    png_quality[1],
                )

    for xref in pngs:
        im_obj = pike.get_object(xref, 0)

        # Open, transcode (!), package for PDF
        try:
            pix = leptonica.Pix.open(png_name(root, xref))
            if pix.depth == 1:
                pix = pix.invert()  # PDF assumes 1 is black for monochrome
            compdata = pix.generate_pdf_ci_data(leptonica.lept.L_FLATE_ENCODE, 0)
        except leptonica.LeptonicaError as e:
            log.error(e)
            continue

        # This is what we should be doing: open the compressed data without
        # transcoding. However this shifts each pixel row by one for some
        # reason.
        # compdata = leptonica.CompressedData.open(png_name(root, xref))
        if len(compdata) > int(im_obj.stream_dict.Length):
            continue  # If we produced a larger image, don't use

        # Decide the color space before touching im_obj, so an image we
        # cannot describe is skipped untouched instead of being written with
        # a color space left over from a previous iteration.
        if compdata.ncolors > 0:
            palette_pdf_string = compdata.get_palette_pdf_string()
            palette_data = pikepdf.Object.parse(palette_pdf_string)
            palette_stream = pikepdf.Stream(pike, bytes(palette_data))
            cs = [
                Name.Indexed,
                Name.DeviceRGB,
                compdata.ncolors - 1,
                palette_stream,
            ]
        elif compdata.spp == 1:
            cs = Name.DeviceGray
        elif compdata.spp == 3:
            cs = Name.DeviceRGB
        elif compdata.spp == 4:
            # NOTE(review): 4 samples per pixel is treated as CMYK; confirm
            # this is what leptonica emits for these images.
            cs = Name.DeviceCMYK
        else:
            log.error(
                "%s: unsupported samples per pixel %r, image not replaced",
                png_name(root, xref),
                compdata.spp,
            )
            continue

        predictor = None
        if compdata.predictor > 0:
            predictor = Dictionary(Predictor=compdata.predictor)

        im_obj.BitsPerComponent = compdata.bps
        im_obj.Width = compdata.w
        im_obj.Height = compdata.h
        im_obj.ColorSpace = cs
        im_obj.write(compdata.read(), filter=Name.FlateDecode, decode_parms=predictor)
示例#18
0
def _weave_layers_graft(*, pdf_base, page_num, text, font, font_key, procset,
                        rotation, log):
    """Insert the text layer from text page 0 on to pdf_base at page_num.

    The content stream of page 0 of ``text`` is wrapped in a ``q ... cm ... Q``
    block whose matrix rotates and scales it to fit the target page, and the
    result is prepended to the contents of the target page.

    Args:
        pdf_base: open PDF that receives the text layer.
        page_num: page number, passed to ``pdf_base.pages.p()``.
        text: path of the PDF holding the text layer. A zero-byte file means
            there is nothing to graft and the function returns immediately.
        font, font_key, procset: forwarded unchanged to
            ``_update_page_resources`` for the target page.
        rotation: clockwise rotation, in degrees, to apply to the text layer.
        log: logger used for debug output.

    Returns:
        None. ``pdf_base`` is modified in place.
    """
    log.debug("Grafting")
    if Path(text).stat().st_size == 0:
        return

    pdf_text = pikepdf.open(text)
    try:
        text_page = pdf_text.pages[0]
        pdf_text_contents = text_page.Contents.read_bytes()

        if not tesseract.has_textonly_pdf():
            # If we don't have textonly_pdf, edit the stream to delete the
            # instruction to draw the image Tesseract generated, which we do
            # not use. Blank out only the first occurrence, with the same
            # number of bytes; if the instruction is absent the stream is
            # left exactly as it was.
            pattern = b'/Im1 Do'
            pdf_text_contents = pdf_text_contents.replace(
                pattern, b' ' * len(pattern), 1)

        # The page in the base file that will receive the text layer
        base_page = pdf_base.pages.p(page_num)

        # The text page always will be oriented up by this stage but the
        # original content may have a rotation applied. Wrap the text stream
        # with a rotation so it will be oriented the same way as the rest of
        # the page content.
        # (Previous versions OCRmyPDF rotated the content layer to match the
        # text.)
        text_box = [float(text_page.MediaBox[v]) for v in range(4)]
        wt, ht = text_box[2] - text_box[0], text_box[3] - text_box[1]

        page_box = [float(base_page.MediaBox[v]) for v in range(4)]
        wp, hp = page_box[2] - page_box[0], page_box[3] - page_box[1]

        translate = pikepdf.PdfMatrix().translated(-wt / 2, -ht / 2)
        untranslate = pikepdf.PdfMatrix().translated(wp / 2, hp / 2)

        # -rotation because the input is a clockwise angle and this formula
        # uses CCW
        rotation = -rotation % 360
        rotate = pikepdf.PdfMatrix().rotated(rotation)

        # Because of rounding of DPI, we might get a text layer that is not
        # identically sized to the target page. Scale to adjust. Normally
        # this is within 0.998.
        if rotation in (90, 270):
            wt, ht = ht, wt
        scale_x = wp / wt
        scale_y = hp / ht

        log.debug('%r', (scale_x, scale_y))
        scale = pikepdf.PdfMatrix().scaled(scale_x, scale_y)

        # Translate the text so it is centered at (0, 0), rotate it there,
        # adjust for a size different between initial and text PDF, then
        # untranslate
        ctm = translate @ rotate @ scale @ untranslate

        # q/Q save and restore the graphics state so the matrix only affects
        # the grafted text.
        pdf_text_contents = (b'q %s cm\n' % ctm.encode() + pdf_text_contents +
                             b'\nQ\n')

        new_text_layer = pikepdf.Stream(pdf_base, pdf_text_contents)

        base_page.page_contents_add(new_text_layer, prepend=True)

        _update_page_resources(page=base_page,
                               font=font,
                               font_key=font_key,
                               procset=procset)
    finally:
        # The text PDF is only a source of bytes; release it whether or not
        # grafting succeeded so the file handle is not leaked.
        pdf_text.close()