예제 #1
0
def _create_pdf_attachment(attachment, url_fetcher):
    """
    Build the ``/Filespec`` dictionary describing one attachment.

    ``attachment`` may already be an ``Attachment``, may be a
    ``(url, description)`` tuple, or may be any other value, in which case it
    is handed to ``Attachment`` as its ``guess`` argument.

    :return:
        the ``PdfDict`` representing the ``/Filespec`` object, or :obj:`None`
        when loading the attachment raised ``URLFetchingError`` (the error is
        logged).
    """
    try:
        if isinstance(attachment, tuple):
            # Document links such as <link> or <a> can only be URLs; they
            # reach this function as (url, description) pairs.
            link_url, link_description = attachment
            attachment = Attachment(
                url=link_url,
                url_fetcher=url_fetcher,
                description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_source_type, payload, resolved_url, _):
            # Raw bytes are wrapped so the compressor can read them in chunks.
            readable = (
                io.BytesIO(payload) if isinstance(payload, bytes) else payload)
            embedded_file = _create_compressed_file_object(readable)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    filespec = PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=PdfString.encode(_get_filename_from_result(resolved_url, None)),
        EF=PdfDict(F=embedded_file),
        Desc=PdfString.encode(attachment.description or ''))
    return filespec
예제 #2
0
파일: text.py 프로젝트: ptwz/pdf-annotate
    def make_cid_system_info_object():
        """Build the CID System Info dictionary.

        The registry is ``(Adobe)``, the ordering is ``(UCS)`` and the
        supplement is ``0``.

        :returns IndirectPdfDict: the newly created CID System Info object.
        """
        cid_system_info = IndirectPdfDict(
            Registry=PdfString('(Adobe)'),
            Ordering=PdfString('(UCS)'),
            Supplement=0,
        )
        return cid_system_info
예제 #3
0
 def encode(self, value):
     """Encode ``value`` and check the generic and type-specific encoders agree.

     ``PdfString.encode`` must give the same result as ``from_unicode`` for
     text input and as ``from_bytes`` for anything else. Returns the encoded
     string.
     """
     encoded = PdfString.encode(value)
     is_text = isinstance(value, type(u''))
     specific_encoder = (
         PdfString.from_unicode if is_text else PdfString.from_bytes)
     self.assertEqual(encoded, specific_encoder(value))
     return encoded
예제 #4
0
 def __str__(self):
     """Serialize this token as a ``PdfString`` (``__str__`` is the serializer)."""
     unchanged = self.value == self.original_value
     if unchanged:
         # Untouched text: reuse the raw original bytes, skipping any
         # decode/encode round trip.
         payload = self.raw_original_value
     else:
         # Edited text: encode from Unicode using the font that is active at
         # the location of this token.
         payload = fromUnicode(self.value, self.font, fontcache)
     return PdfString.from_bytes(payload)
예제 #5
0
def transPdfString(v, translator):
    """Translate the text of a literal PDF string, if it is UTF-16BE.

    Returns a new ``PdfString`` holding ``translator``'s output when ``v`` is
    a ``PdfString`` written in literal ``(...)`` form whose decoded value
    starts with the FE FF marker; returns ``None`` in every other case.

    NOTE(review): ``v.decode()`` is expected to return a byte string that
    itself has ``.decode`` (Python 2 semantics) -- confirm before porting.
    """
    if not isinstance(v, PdfString):
        return None
    if v[0] != "(":
        return None

    raw = v.decode()
    # Alternative check once considered:
    # chardet.detect(raw)["encoding"] == "UTF-16BE"
    if not raw.startswith("\xfe\xff"):
        return None

    translated = translator(raw.decode("utf-16be", "ignore"))
    encoded = PdfString.encode(translated.encode("utf-16be"))
    return PdfString(encoded)
예제 #6
0
def update_annotation(annotation, options):
    """Run the configured content filters over an annotation's text fields.

    Every ``(pattern, function)`` pair in ``options.content_filters`` is
    applied, in order, via ``pattern.sub(function, value)`` to each populated
    string field, and the field is rewritten with the result. Any ``A`` / ``PA``
    actions are then passed to ``update_annotation_action`` and a ``Popup``
    annotation, if present, is processed recursively.

    :raises ValueError: if the annotation has a rich-text ``RC`` entry, which
        is not supported.
    """
    from pdfrw.objects import PdfString

    # Contents holds a plain-text representation of the annotation
    # content, such as for accessibility. All annotation types may
    # have a Contents. NM holds the "annotation name" which also
    # could have redactable text, I suppose. Markup annotations have
    # "T" fields that hold a title / text label. Subj holds a
    # comment subject. CA, RC, and AC are used in widget annotations.
    for string_field in ("Contents", "NM", "T", "Subj", "CA", "RC", "AC"):
        current = getattr(annotation, string_field)
        if not current:
            continue
        value = current.to_unicode()
        for pattern, function in options.content_filters:
            value = pattern.sub(function, value)
        setattr(annotation, string_field, PdfString.from_unicode(value))

    # A rich-text stream. Not implemented. Bail so that we don't
    # accidentally leak something that should be redacted.
    if annotation.RC:
        raise ValueError(
            "Annotation rich-text streams (Annot/RC) are not supported.")

    # An action, usually used for links.
    if annotation.A:
        update_annotation_action(annotation, annotation.A, options)
    if annotation.PA:
        update_annotation_action(annotation, annotation.PA, options)

    # If set, another annotation.
    if annotation.Popup:
        update_annotation(annotation.Popup, options)
예제 #7
0
def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    ``source`` is read in 4096-byte chunks until ``read`` returns ``b''``.
    While reading, the uncompressed size and MD5 digest are accumulated and
    stored in the ``/Params`` dictionary (``Size`` and ``CheckSum``).

    :param source:
        a binary file-like object exposing ``read(size)``
    :return:
        the object representing the compressed file stream object
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(Type=PdfName('EmbeddedFile'),
                              Filter=PdfName('FlateDecode'))

    size = 0
    # Collect compressed chunks and join once at the end; growing the stream
    # string with ``+=`` on every chunk re-copies everything read so far.
    compressed_chunks = []
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())),
        Size=size)
    return pdf_file_object
예제 #8
0
def create_bookmarks(bookmarks, pages, parent=None):
    """Recursively build the outline entries for a bookmark tree.

    ``bookmarks`` is an iterable of ``(label, target, children)`` triples;
    ``target`` is indexed as ``(page index, x, y)`` and ``children`` has the
    same shape as ``bookmarks``. Siblings are chained through ``Prev`` /
    ``Next``, children are attached through ``First`` / ``Last``, and
    ``Parent`` is set whenever ``parent`` is given.

    :return:
        a ``(objects, count)`` tuple: the list of entries created at this
        level and the number of entries at this level plus all nested levels.
    """
    total = len(bookmarks)
    siblings = []
    for label, target, children in bookmarks:
        destination = (
            pages[target[0]].indirect,
            PdfName('XYZ'),
            target[1],
            target[2],
            0,
        )
        entry = PdfDict(
            Title=PdfString.encode(label),
            A=PdfDict(
                Type=PdfName('Action'),
                S=PdfName('GoTo'),
                D=PdfArray(destination)))
        entry.indirect = True

        nested_entries, nested_total = create_bookmarks(
            children, pages, parent=entry)
        entry.Count = 1 + nested_total

        if siblings:
            # Link this entry with the previous one on the same level.
            previous = siblings[-1]
            entry.Prev = previous
            previous.Next = entry
        if nested_entries:
            entry.First = nested_entries[0]
            entry.Last = nested_entries[-1]
        if parent is not None:
            entry.Parent = parent

        total += nested_total
        siblings.append(entry)
    return siblings, total
예제 #9
0
def transPdfString(v, translator):
    """Return a translated copy of a UTF-16BE literal PDF string, else None.

    The translation only happens when ``v`` is a ``PdfString`` in literal
    ``(...)`` form and its decoded value begins with the FE FF marker.
    ``translator`` receives the text decoded as UTF-16BE and its result is
    re-encoded as UTF-16BE.

    NOTE(review): relies on ``v.decode()`` returning a byte string with its
    own ``.decode`` method (Python 2 behaviour) -- confirm before porting.
    """
    if isinstance(v, PdfString) and v[0] == "(":
        decoded = v.decode()
        # An alternative check that was considered:
        # chardet.detect(decoded)["encoding"] == "UTF-16BE"
        if decoded.startswith("\xfe\xff"):
            text = translator(decoded.decode("utf-16be", "ignore"))
            return PdfString(PdfString.encode(text.encode("utf-16be")))
    return None
예제 #10
0
 def pdfobjs(self):
     """Return the ``(start page, options)`` pair for a ``PageLabels.Nums`` entry.

     The options dictionary always carries the style (``S``); the prefix
     (``P``) and first page number (``St``) are only added when they differ
     from the module defaults.
     """
     start = PdfObject(self.startpage)
     label_options = PdfDict(S=styles[self.style])

     has_custom_prefix = self.prefix != defaults["prefix"]
     if has_custom_prefix:
         label_options.P = PdfString.encode(self.prefix)

     has_custom_first_number = self.firstpagenum != defaults["firstpagenum"]
     if has_custom_first_number:
         label_options.St = PdfObject(self.firstpagenum)

     return (start, label_options)
예제 #11
0
 def pdfobjs(self):
     """Build the two elements to insert into the ``PageLabels.Nums`` array.

     Returns a ``(page number object, options dictionary)`` tuple. ``P`` and
     ``St`` are written only for a non-default prefix or first page number.
     """
     first_element = PdfObject(self.startpage)
     second_element = PdfDict(S=styles[self.style])
     if self.prefix != defaults["prefix"]:
         encoded_prefix = PdfString.encode(self.prefix)
         second_element.P = encoded_prefix
     if self.firstpagenum != defaults["firstpagenum"]:
         first_number = PdfObject(self.firstpagenum)
         second_element.St = first_number
     return first_element, second_element
예제 #12
0
def fix_metadata(doc, title=None, creation_date=None):
    """Replace the document's metadata with a fixed set of Info entries.

    The XMP metadata stream on the document root is cleared, then
    ``Creator``, ``Keywords`` and ``ModDate`` (current time) are written to
    ``doc.Info``. ``Title`` and ``CreationDate`` are added only when the
    corresponding argument is not ``None``.
    """
    # Clear any existing XMP meta data
    doc.Root.Metadata = None

    info_entries = {
        'Creator': 'OffeneGesetze.de',
        'Keywords': 'Amtliches Werk nach §5 UrhG https://offenegesetze.de',
        'ModDate': make_pdf_date(datetime.now()),
    }
    if title is not None:
        info_entries['Title'] = title
    if creation_date is not None:
        info_entries['CreationDate'] = make_pdf_date(creation_date)

    for name, raw in info_entries.items():
        # Date entries are already PDF strings; everything else is plain text
        # that still has to be encoded.
        is_date_entry = 'Date' in name
        doc.Info[PdfName(name)] = (
            raw if is_date_entry else PdfString.from_unicode(raw))
예제 #13
0
def update_annotation_action(annotation, action, options):
    """Apply the link filters to an action's URI and to its follow-up actions.

    Each callable in ``options.link_filters`` is called as
    ``func(value, annotation)`` and its result feeds the next one. A final
    result of ``None`` clears the URI; any other result is stored back as a
    ``PdfString``. Actions reachable through ``Next`` are handled recursively.
    """
    from pdfrw.objects import PdfString

    if action.URI and options.link_filters:
        uri = action.URI.to_unicode()
        for link_filter in options.link_filters:
            uri = link_filter(uri, annotation)
        # A None result removes the annotation by suppressing the action.
        action.URI = None if uri is None else PdfString.from_unicode(uri)

    if action.Next:
        # May be an Action or array of Actions to execute next.
        if isinstance(action.Next, dict):
            followups = [action.Next]
        else:
            followups = action.Next
        for followup in followups:
            update_annotation_action(annotation, followup, options)
예제 #14
0
def make_pdf_date(value):
    """Format ``value`` (anything with ``strftime``) as a ``(D:...)`` PdfString.

    When the formatted timestamp is 19 characters long, i.e. it carries a
    ``%z`` offset such as ``+0100``, an apostrophe is inserted between the
    offset's hours and minutes.
    """
    stamp = value.strftime("%Y%m%d%H%M%S%z")
    if len(stamp) == 19:
        up_to_offset_hours, offset_minutes = stamp[:17], stamp[17:]
        stamp = up_to_offset_hours + "'" + offset_minutes
    return PdfString("(D:%s)" % stamp)
예제 #15
0
 def test_continuation(self):
     """Backslash-newline continuations decode the same as the plain string."""
     # See PDF 1.7 ref section 3.2 page 55
     plain = PdfString('(These two strings are the same.)')
     self.assertEqual(plain.decode(), plain[1:-1])

     # Line continuation with LF.
     with_lf = PdfString('(These \\\ntwo strings \\\nare the same.)')
     self.assertEqual(plain.decode(), with_lf.decode())

     # Same string with CR as the end-of-line marker.
     with_cr = PdfString(with_lf.replace('\n', '\r'))
     self.assertEqual(plain.decode(), with_cr.decode())

     # Same string with CR LF as the end-of-line marker.
     with_crlf = PdfString(with_cr.replace('\r', '\r\n'))
     self.assertEqual(plain.decode(), with_crlf.decode())
예제 #16
0
 def test_constructor(self):
     """Constructing a ``PdfString`` from a plain string must not raise."""
     PdfString('hello')
예제 #17
0
 def decode_bytes(self, decode_this, expected):
     """Assert ``decode_this`` decodes to ``convert_store(expected)`` as bytes."""
     actual = PdfString(decode_this).to_bytes()
     wanted = convert_store(expected)
     self.assertEqual(actual, wanted)
예제 #18
0
 def decode(self, value):
     """Decode ``value`` and check ``to_unicode`` and ``decode`` agree.

     Returns the decoded text.
     """
     pdf_string = PdfString(value)
     via_to_unicode = pdf_string.to_unicode()
     via_decode = pdf_string.decode()
     self.assertEqual(via_to_unicode, via_decode)
     return via_to_unicode
예제 #19
0
 def test_nullstring(self):
     """Empty hex (``<>``) and literal (``()``) strings decode to no bytes."""
     for empty_form in ('<>', '()'):
         self.assertEqual(PdfString(empty_form).to_bytes(), b'')
예제 #20
0
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Append to a seekable file-like object to add PDF metadata.

    The PDF already present in ``fileobj`` is parsed, enriched in memory and
    then written back over the same file object. The following are added:

    * the outline tree, when ``prepare_metadata`` returns bookmarks;
    * embedded files from ``metadata.attachments`` and ``attachments``;
    * per-page annotations for internal links, external links and
      attachment links;
    * ``Info`` entries (producer, title, subject, creator, author, keywords,
      creation and modification dates);
    * ``TrimBox`` and ``BleedBox`` for every page.

    :param document: object whose ``pages`` each expose a ``bleed`` mapping
        with ``left``, ``top``, ``right`` and ``bottom`` entries.
    :param fileobj: seekable file-like object holding the PDF; it is
        rewritten from position 0 and truncated.
    :param scale: passed through to ``prepare_metadata``.
    :param metadata: object providing ``attachments``, ``title``,
        ``description``, ``generator``, ``authors``, ``keywords``,
        ``created`` and ``modified``.
    :param attachments: extra attachments, or a falsy value for none.
    :param url_fetcher: passed through to ``_create_pdf_attachment``.
    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    # Outline (bookmark) tree.
    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(Type=PdfName('Outlines'),
                                        Count=count,
                                        First=bookmark_objects[0],
                                        Last=bookmark_objects[-1])

    # Document-level attachments: metadata attachments first, then the ones
    # given explicitly. Attachments that could not be created are skipped.
    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            if attachment_object is not None:
                # The names array alternates between a name and its object.
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(EmbeddedFiles=PdfDict(
                Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment((target, None),
                                                             url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Attachment links whose file could not be created fall back to
            # a plain URI link annotation.
            if link_type != 'attachment' or annot_files[target] is None:
                annotation = PdfDict(Type=PdfName('Annot'),
                                     Subtype=PdfName('Link'),
                                     Rect=PdfArray(rectangle),
                                     Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (target[0], PdfName('XYZ'), target[1],
                                   target[2], 0)
                    annotation.A = PdfDict(Type=PdfName('Action'),
                                           S=PdfName('GoTo'),
                                           D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(Type=PdfName('Action'),
                                           S=PdfName('URI'),
                                           URI=PdfString.encode(
                                               iri_to_uri(target)))
            else:
                assert annot_files[target] is not None
                ap = PdfDict(N=PdfDict(BBox=PdfArray(rectangle),
                                       Subtype=PdfName('Form'),
                                       Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(Type=PdfName('Annot'),
                                     Subtype=PdfName('FileAttachment'),
                                     T=PdfString.encode(''),
                                     Rect=PdfArray(rectangle),
                                     Border=PdfArray((0, 0, 0)),
                                     FS=annot_files[target],
                                     AP=ap)
            annotations.append(annotation)

        # Pages without any link keep no /Annots entry.
        if annotations:
            page.Annots = annotations

    # Document information dictionary. Only values that are not None are
    # written.
    trailer.Info.Producer = VERSION_STRING
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    # Multi-valued entries are joined into one comma-separated string.
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(getattr(metadata, attr)))
    # Dates are converted with w3c_date_to_pdf before being stored.
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    # Page boxes: the TrimBox is the MediaBox shrunk by the bleed on each side.
    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points
        bleed = {
            key: value * 0.75
            for key, value in document_page.bleed.items()
        }

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox) and
        # CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the file in place and drop any leftover bytes from the
    # previous content.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()