Exemplo n.º 1
0
    def __init__(self, stream, should_chop: bool = False):
        """
        Build a reader over ``stream`` and eagerly load the document structure.

        Construction reads the cross-reference data, flattens the page tree
        reachable from the trailer's root, and then loads the named
        destinations and the outlines.  This can take some time.

        Stability: Added in v1.0, will exist for all v1.x releases.

        stream - An object that supports the standard read
                 and seek methods similar to a file object.
        should_chop - When true, ``_chop_images()`` is called once the page
                      tree has been flattened.
        """
        # Containers and the stream are set up before the cross-reference
        # data is read.
        self._xref, self._xref_obj_stream = {}, {}
        self._resolved_objects = {}
        self._trailer = DictObject()
        self._stream: BufferedReader = stream
        self._read_cross_reference()

        self._override_encryption = False

        # Walk from the trailer's root entry down to the top of the page tree.
        self._flattened_pages = []
        catalog = self._trailer[_k.ROOT].get_object()
        page_tree_root = catalog[_k.PAGES].get_object()
        self._flatten_pages(page_tree_root, {})

        if should_chop:
            self._chop_images()

        self._named_dests = self.__load_named_destinations()
        self.__outlines = self.__load_outlines()
Exemplo n.º 2
0
def create_blank_page(_pdf=None, width=None, height=None):
    """
    Build and return a new, empty page.

    When either ``width`` or ``height`` is None, BOTH dimensions are taken
    from the media box of the last page of ``_pdf``.  If ``_pdf`` is None or
    has no pages in that situation, a PageSizeNotDefinedError is raised.

    _pdf -- PDF file the page belongs to
    width -- The width of the new page expressed in default user space units.
    height -- The height of the new page expressed in default user space units.
    """
    page = PageObject(_pdf)

    # Basic entries of a page object (cf PDF Reference 7.7.3.3)
    page[NameObject(_k.TYPE)] = NameObject(_k.PAGE)
    page[NameObject(b'/Parent')] = NullObject()
    page[NameObject(_k.RESOURCES)] = DictObject()

    size_missing = width is None or height is None
    if size_missing:
        can_inherit = _pdf is not None and _pdf.get_pages_count() > 0
        if not can_inherit:
            raise utils.PageSizeNotDefinedError()
        # Fall back to the dimensions of the document's last page.
        last_page = _pdf.get_page(_pdf.get_pages_count() - 1)
        width = last_page.media_box.get_width()
        height = last_page.media_box.get_height()

    media_box = RectangleObject([0, 0, width, height])
    page[NameObject(_k.MEDIA_BOX)] = media_box
    return page
Exemplo n.º 3
0
    def merge_page(self, page2, page2transformation=None):
        """Overlay the content of ``page2`` onto this page.

        The resource dictionaries of both pages are combined, and the content
        stream of ``page2`` is appended after this page's own content stream,
        so it is drawn after (i.e. "on top" of) the existing content.  This
        method only replaces this page's contents and resources entries.

        page2 - The page object whose content is merged into this one.
        page2transformation - Optional callable applied to the content stream
                              of page2 before merging.  It receives the
                              content stream of page2 and must return the new
                              content stream.  When omitted, the content
                              stream is used unmodified."""
        # Resources are merged first: the merge tells us which names used by
        # page2's content stream have to be rewritten.
        resource_kinds = (
            b'/ExtGState', b'/Font', b'/XObject', b'/ColorSpace',
            b'/Pattern', b'/Shading', b'/Properties',
        )
        merged_resources = DictObject()
        renames = {}
        own_resources = self[_k.RESOURCES].get_object()
        other_resources = page2[_k.RESOURCES].get_object()

        for kind in resource_kinds:
            merged, renamed = _merge_resources(own_resources,
                                               other_resources, kind)
            if not merged:
                continue
            merged_resources[NameObject(kind)] = merged
            renames.update(renamed)

        # /ProcSet is the union of both pages' procedure sets.
        own_proc_set = frozenset(
            own_resources.get(b'/ProcSet', ArrayObject()).get_object())
        other_proc_set = frozenset(
            other_resources.get(b'/ProcSet', ArrayObject()).get_object())
        merged_resources[NameObject(b'/ProcSet')] = ArrayObject(
            own_proc_set.union(other_proc_set))

        streams = ArrayObject()
        own_content = self.get_contents()
        if own_content is not None:
            streams.append(
                _push_pop_graphics_state(own_content, self.parent))

        overlay = page2.get_contents()
        if overlay is not None:
            if page2transformation is not None:
                overlay = page2transformation(overlay)
            # Apply the resource renames, then wrap the stream with
            # _push_pop_graphics_state like this page's own content.
            overlay = _content_stream_rename(overlay, renames, self.parent)
            overlay = _push_pop_graphics_state(overlay, self.parent)
            streams.append(overlay)

        self[NameObject(_k.CONTENT)] = _ContentStreamObject(
            streams, self.parent)
        self[NameObject(_k.RESOURCES)] = merged_resources
Exemplo n.º 4
0
 def add_bookmark(self, title: str, page_index: int, container_ref=None):
     """Register a bookmark entry and link it into an outline container.

     title - Text of the bookmark.
     page_index - Page the bookmark points to; the page reference is looked
                  up with ``page_index - 1``.
     container_ref - Reference to the container receiving the bookmark.
                     Defaults to this object's outlines reference.

     Returns the reference produced by ``_add_object`` for the new bookmark.
     """
     if container_ref is None:
         container_ref = self.__outlines

     bookmark_title = TextStringObject(title)
     bookmark = DictObject()
     page_ref = self.get_page_ref(page_index - 1)
     # The destination shows the target page with the /Fit mode.
     bookmark.update({
         NameObject(_k.TITLE): bookmark_title,
         NameObject(_k.PARENT): container_ref,
         NameObject(_k.DEST): ArrayObject([page_ref, NameObject(b'/Fit')]),
     })

     container = container_ref.get_object()
     bookmark_ref = self._add_object(bookmark)

     if _k.COUNT not in container:
         # First entry: initialise the container so that it points at the
         # new bookmark as both its first and last child.
         container[NameObject(_k.TYPE)] = NameObject(_k.OUTLINES)
         container[NameObject(_k.COUNT)] = NumberObject(1)
         container[NameObject(_k.FIRST)] = bookmark_ref
         container[NameObject(_k.LAST)] = bookmark_ref
         return bookmark_ref

     # Append after the current last entry and chain the two together.
     container[NameObject(_k.COUNT)] += 1
     previous_last = container[NameObject(_k.LAST)]
     container[_k.LAST] = bookmark_ref
     previous_last[_k.NEXT] = bookmark_ref
     return bookmark_ref
Exemplo n.º 5
0
 def _write_trailer_to(self, stream):
     """Write the ``trailer`` keyword and the trailer dictionary to ``stream``.

     The dictionary always carries the size, root and info entries; the ID
     and encryption entries are added only when the corresponding attribute
     is not None.
     """
     stream.write(b'trailer\n')

     entries = DictObject()
     entries.update({
         # Size is the number of stored objects plus one.
         NameObject(_k.SIZE): NumberObject(len(self._objects) + 1),
         NameObject(_k.ROOT): self._root,
         NameObject(_k.INFO): self._info,
     })

     optional_entries = (
         (_k.ID, self._id),
         (_k.ENCRYPT, self._encrypt),
     )
     for entry_key, entry_value in optional_entries:
         if entry_value is not None:
             entries[NameObject(entry_key)] = entry_value

     entries.write_to_stream(stream)
Exemplo n.º 6
0
def _merge_resources(res1, res2, resource):
    """Merge one resource category of two resource dictionaries.

    res1 - Resource dictionary of the first page; its entries keep their names.
    res2 - Resource dictionary of the second page.
    resource - Key of the category to merge (e.g. b'/Font').

    Returns a tuple ``(merged, renames)``:
    merged - A new dictionary with the entries of both pages.
    renames - Mapping from a page-2 key to the new name it was stored under,
              for every key that clashed with a different entry of page 1.

    A clashing key gets a ``b'renamed'`` suffix.  If that name is itself
    already taken (for instance by an earlier merge, or by another key of
    page 2), a numeric suffix is appended until an unused name is found, so
    no existing entry is ever overwritten.
    """
    new_res = DictObject()
    new_res.update(res1.get(resource, DictObject()).get_object())
    page2_res = res2.get(resource, DictObject()).get_object()
    rename_res = {}
    for key in page2_res.keys():
        if key not in new_res:
            # No clash: keep the raw (unresolved) entry under its own name.
            new_res[key] = page2_res.raw_get(key)
        elif new_res[key] != page2_res[key]:
            new_name = NameObject(key + b'renamed')
            attempt = 0
            # The candidate must not shadow an entry that is already merged,
            # nor a page-2 key that is still to be processed.
            while new_name in new_res or new_name in page2_res:
                attempt += 1
                new_name = NameObject(
                    key + b'renamed' + str(attempt).encode('ascii'))
            rename_res[key] = new_name
            new_res[new_name] = page2_res[key]
    return new_res, rename_res
Exemplo n.º 7
0
    def encrypt(self, user_pwd, owner_pwd=None, use_128bit=True):
        """Encrypt this PDF file with the PDF Standard encryption handler.

        Calling this sets the document ID, registers the encryption
        dictionary via ``_add_object`` and stores the encryption key on this
        object.  All permissions are granted (``/P`` is -1).

        user_pwd - The "user password", which allows for opening and reading
                the PDF file with the restrictions provided.
        owner_pwd - The "owner password", which allows for opening the PDF
                files without any restrictions.  By default, the owner password is the
                same as the user password.
        use_128bit - Boolean argument as to whether to use 128bit
                encryption.  When false, 40bit encryption will be used.  By default, this
                flag is on."""
        if owner_pwd is None:
            owner_pwd = user_pwd
        # The key length is counted in bytes and must be an int: floor
        # division is required because "/" yields a float in Python 3.
        if use_128bit:
            v = 2
            rev = 3
            keylen = 128 // 8
        else:
            v = 1
            rev = 2
            keylen = 40 // 8
        # permit everything:
        p = -1
        o = ByteStringObject(_u.algorithm_33(owner_pwd, user_pwd, rev, keylen))
        # The two halves of the document ID are MD5 digests of the current
        # time and of a random number.
        id_1 = _md5(bytes(repr(time.time()), _u.ENCODING_UTF8)).digest()
        id_2 = _md5(bytes(repr(random.random()), _u.ENCODING_UTF8)).digest()
        self._id = ArrayObject((ByteStringObject(id_1), ByteStringObject(id_2)))
        if rev == 2:
            u, key = _u.algorithm_34(user_pwd, o, p, id_1)
        else:
            assert rev == 3
            u, key = _u.algorithm_35(user_pwd, rev, keylen, o, p, id_1, False)
        encrypt = DictObject()
        encrypt[NameObject(b'/Filter')] = NameObject(b'/Standard')
        encrypt[NameObject(b'/V')] = NumberObject(v)
        if v == 2:
            # /Length is expressed in bits.
            encrypt[NameObject(b'/Length')] = NumberObject(keylen * 8)
        encrypt[NameObject(b'/R')] = NumberObject(rev)
        encrypt[NameObject(b'/O')] = ByteStringObject(o)
        encrypt[NameObject(b'/U')] = ByteStringObject(u)
        encrypt[NameObject(b'/P')] = NumberObject(p)
        self._encrypt = self._add_object(encrypt)
        self._encrypt_key = key
Exemplo n.º 8
0
 def __read_inline_image(self, stream):
     """Read an inline image from ``stream``.

     Reading is expected to start just after the "BI" (begin image)
     operator.  The settings are read first as key/value pairs, then the
     "ID" marker, then the image data.

     Returns a dict with the keys ``b'settings'`` (the settings dictionary)
     and ``b'data'`` (the image data).
     """
     settings = DictObject()
     # Key/value pairs follow until the token b'I' (start of "ID" - begin
     # of image data) is seen.
     while utils.seek_token(stream) != b'I':
         setting_key = read_object(stream, self.pdf)
         utils.seek_token(stream)
         settings[setting_key] = read_object(stream, self.pdf)

     # The stream is now positioned at the beginning of "ID"; three bytes
     # are read and the first two must be the marker itself.
     marker = stream.read(3)
     assert marker[:2] == b'ID'

     image_data = _read_image_data(stream)
     utils.debug(len(image_data))
     utils.seek_token(stream)
     return {b'settings': settings, b'data': image_data}
Exemplo n.º 9
0
    def __init__(self):
        """Set up an empty document.

        Four objects are registered through ``_add_object``, in this order:
        the page tree root (with no kids), the info dictionary, an empty
        outlines dictionary, and the catalog that references the page tree
        and the outlines.
        """
        self._objects = []  # array of indirect objects
        self._id = None
        self._encrypt = None
        self._encrypt_key = None

        # Root node of the page tree: no kids yet, count of zero.
        page_tree = DictObject()
        page_tree.update({
            NameObject(_k.TYPE): NameObject(_k.PAGES),
            NameObject(_k.COUNT): NumberObject(0),
            NameObject(_k.KIDS): ArrayObject(),
        })
        self._pages = self._add_object(page_tree)

        # Info dictionary, holding only the producer entry.
        metadata = DictObject()
        metadata.update({
            NameObject(b'/Producer'): create_string_object(b'PyPDF - Refactored by QXF')
        })
        self._info = self._add_object(metadata)

        # Outlines container followed by the catalog that ties it all together.
        self.__outlines = self._add_object(DictObject())
        catalog = DictObject()
        catalog.update({
            NameObject(_k.TYPE): NameObject(b'/Catalog'),
            NameObject(_k.PAGES): self._pages,
            NameObject(_k.OUTLINES): self.__outlines,
        })
        self._root = self._add_object(catalog)
Exemplo n.º 10
0
 def __init__(self, parent=None, indirect_ref=None):
     """Create an empty dictionary-based object.

     parent - Stored as-is on ``self.parent``.
     indirect_ref - The original indirect reference to this object in its
                    source PDF; stored as-is on ``self.indirect_ref``.
     """
     DictObject.__init__(self)
     # Keeps the original indirect reference to this object in its source PDF
     self.indirect_ref = indirect_ref
     self.parent = parent