def _create_pdf_attachment(attachment, url_fetcher):
    """Build the ``/Filespec`` dictionary that embeds one attachment.

    :param attachment: an ``Attachment`` instance, a ``(url, description)``
        tuple, or any other value, which is handed to ``Attachment`` as
        its ``guess`` argument.
    :param url_fetcher: forwarded to ``Attachment`` when one has to be
        constructed here.
    :return: the object representing the ``/Filespec`` object or
        :obj:`None` if a ``URLFetchingError`` was raised while the
        attachment was being read.

    """
    try:
        if isinstance(attachment, tuple):
            # Document links (<link> or <a>) can only point to URLs and
            # reach this function as (url, description) tuples.
            link_url, link_description = attachment
            attachment = Attachment(
                url=link_url, url_fetcher=url_fetcher,
                description=link_description)
        elif not isinstance(attachment, Attachment):
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (_source_type, source, url, _):
            # The compressor reads from a file-like object, so wrap raw
            # bytes in an in-memory stream.
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            pdf_file_object = _create_compressed_file_object(source)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    filespec = PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=PdfString.encode(_get_filename_from_result(url, None)),
        EF=PdfDict(F=pdf_file_object),
        Desc=PdfString.encode(attachment.description or ''))
    return filespec
def make_cid_system_info_object():
    """Build the CID System Info dictionary.

    The dictionary carries the registry ``(Adobe)``, the ordering ``(UCS)``
    and a supplement of ``0``.

    :returns IndirectPdfDict: the CID System Info object.
    """
    cid_system_info = IndirectPdfDict(
        Registry=PdfString('(Adobe)'),
        Ordering=PdfString('(UCS)'),
        Supplement=0,
    )
    return cid_system_info
def encode(self, value):
    """Encode ``value`` and check the generic and explicit encoders agree.

    ``PdfString.encode`` is compared with ``PdfString.from_unicode`` for
    text input and with ``PdfString.from_bytes`` for anything else.

    :return: the result of ``PdfString.encode(value)``.
    """
    generic = PdfString.encode(value)
    # type(u'') is the text type on both Python 2 and Python 3.
    if isinstance(value, type(u'')):
        explicit_encoder = PdfString.from_unicode
    else:
        explicit_encoder = PdfString.from_bytes
    explicit = explicit_encoder(value)
    self.assertEqual(generic, explicit)
    return generic
def __str__(self):
    """Serialize this token; ``__str__`` is what serialization calls."""
    if self.value == self.original_value:
        # Nothing changed: reuse the raw original bytes so no
        # decode/encode round trip is involved.
        raw = self.raw_original_value
    else:
        # The text changed: encode it from Unicode according to the
        # encoding of the font that is active at the location of this
        # token.
        raw = fromUnicode(self.value, self.font, fontcache)
    return PdfString.from_bytes(raw)
def transPdfString(v, translator):
    """Translate the text of a UTF-16BE literal PDF string.

    Only ``PdfString`` values written in literal form (starting with
    ``(``) whose decoded content begins with the UTF-16BE byte order mark
    ``\\xfe\\xff`` are handled.

    :param v: the candidate object; anything that is not a ``PdfString``
        is ignored.
    :param translator: callable that receives the decoded text and returns
        the replacement text.
    :return: a new ``PdfString`` holding the translated text re-encoded as
        UTF-16BE, or ``None`` when ``v`` is not handled.
    """
    if not isinstance(v, PdfString):
        return None
    # startswith() instead of v[0] so that an empty PdfString is skipped
    # rather than raising IndexError.
    if not v.startswith("("):
        return None

    s0 = v.decode()
    # chardet.detect(s0)["encoding"] == "UTF-16BE" would be the general
    # check; the byte order mark is used as a cheap stand-in.
    if not s0.startswith("\xfe\xff"):
        return None

    # NOTE(review): calling .decode() on s0 implies it is a byte string
    # (Python 2 semantics) -- confirm before running under Python 3.
    s1 = translator(s0.decode("utf-16be", "ignore"))
    s2 = PdfString.encode(s1.encode("utf-16be"))
    return PdfString(s2)
def update_annotation(annotation, options):
    """Apply the content filters to an annotation's text fields.

    Every ``(pattern, function)`` pair in ``options.content_filters`` is
    applied, in order, to each non-empty text field of ``annotation``.
    The annotation's actions (``A`` and ``PA``) and its ``Popup``
    annotation are then processed as well.

    :raises ValueError: if the annotation has a rich-text ``RC`` entry,
        which is not supported.
    """
    # NOTE(review): ``re`` is imported but not referenced in this function.
    import re
    from pdfrw.objects import PdfString

    # Contents: plain-text representation of the annotation content (e.g.
    #   for accessibility); any annotation type may have one.
    # NM: the "annotation name", which could also hold redactable text.
    # T: title / text label of markup annotations.
    # Subj: comment subject.
    # CA, RC, AC: used in widget annotations.
    text_fields = ("Contents", "NM", "T", "Subj", "CA", "RC", "AC")
    for field_name in text_fields:
        if not getattr(annotation, field_name):
            continue
        text = getattr(annotation, field_name).to_unicode()
        for regex, replacement in options.content_filters:
            text = regex.sub(replacement, text)
        setattr(annotation, field_name, PdfString.from_unicode(text))

    # Rich-text streams are not implemented. Bail out so that nothing that
    # should have been redacted is leaked by accident.
    if annotation.RC:
        raise ValueError(
            "Annotation rich-text streams (Annot/RC) are not supported.")

    # Actions, usually used for links.
    if annotation.A:
        update_annotation_action(annotation, annotation.A, options)
    if annotation.PA:
        update_annotation_action(annotation, annotation.PA, options)

    # When present, Popup is itself another annotation.
    if annotation.Popup:
        update_annotation(annotation.Popup, options)
def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it
    with deflate.

    The source is consumed in 4096-byte reads. While it is read, its total
    size and MD5 digest are computed and stored in the ``/Params``
    dictionary as ``Size`` and ``CheckSum``.

    :param source: a binary file-like object; only ``read(size)`` is used.
    :return:
        the object representing the compressed file stream object
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()

    pdf_file_object = PdfDict(
        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))

    # Collect the compressed chunks and join them once at the end. Growing
    # ``pdf_file_object.stream`` with ``+=`` would copy the whole
    # accumulated string on every iteration.
    compressed_chunks = []
    size = 0
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_chunks.append(compress.compress(data))
    compressed_chunks.append(compress.flush(zlib.Z_FINISH))

    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_chunks).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
    return pdf_file_object
def create_bookmarks(bookmarks, pages, parent=None):
    """Recursively build the PDF outline objects for a bookmark tree.

    :param bookmarks: iterable of ``(label, target, children)`` triples,
        where ``target`` is indexed as ``(page index, x, y)`` and
        ``children`` has the same structure as ``bookmarks``.
    :param pages: sequence of page objects indexed by ``target[0]``.
    :param parent: the outline object owning this level, or ``None`` for
        the top level.
    :return: ``(objects, count)``: the list of outline objects of this
        level and the number of bookmarks at this level plus the counts
        returned for all nested levels.
    """
    total = len(bookmarks)
    siblings = []
    for label, target, children in bookmarks:
        page_index, x, y = target[0], target[1], target[2]
        destination = (pages[page_index].indirect, PdfName('XYZ'), x, y, 0)
        go_to = PdfDict(
            Type=PdfName('Action'), S=PdfName('GoTo'),
            D=PdfArray(destination))
        node = PdfDict(Title=PdfString.encode(label), A=go_to)
        node.indirect = True

        child_nodes, child_total = create_bookmarks(
            children, pages, parent=node)
        node.Count = 1 + child_total

        # Chain this entry with the previous sibling, if there is one.
        if siblings:
            previous = siblings[-1]
            node.Prev = previous
            previous.Next = node
        if child_nodes:
            node.First = child_nodes[0]
            node.Last = child_nodes[-1]
        if parent is not None:
            node.Parent = parent

        total += child_total
        siblings.append(node)
    return siblings, total
def transPdfString(v, translator):
    """Run ``translator`` over the text of a UTF-16BE literal PDF string.

    :param v: object to inspect; only ``PdfString`` instances in literal
        form (starting with ``(``) are considered.
    :param translator: callable mapping the decoded text to new text.
    :return: a ``PdfString`` with the translated text encoded as UTF-16BE,
        or ``None`` if ``v`` is not a literal ``PdfString`` whose decoded
        content starts with the byte order mark ``\\xfe\\xff``.
    """
    # startswith() also copes with an empty PdfString, where v[0] would
    # raise IndexError.
    if isinstance(v, PdfString) and v.startswith("("):
        s0 = v.decode()
        # Cheap replacement for chardet.detect(s0)["encoding"] == "UTF-16BE"
        if s0.startswith("\xfe\xff"):
            # NOTE(review): s0.decode() implies a byte string (Python 2
            # semantics) -- confirm before running under Python 3.
            s1 = translator(s0.decode("utf-16be", "ignore"))
            s2 = PdfString.encode(s1.encode("utf-16be"))
            return PdfString(s2)
    return None
def pdfobjs(self):
    """Return the two elements to insert in the PageLabels.Nums entry
    of a pdf.

    The first element is the start page; the second is a dictionary with
    the numbering style. ``P`` (prefix) and ``St`` (first page number)
    are only added when they differ from the values in ``defaults``.
    """
    start = PdfObject(self.startpage)
    label_options = PdfDict(S=styles[self.style])

    if self.prefix != defaults["prefix"]:
        label_options.P = PdfString.encode(self.prefix)

    if self.firstpagenum != defaults["firstpagenum"]:
        label_options.St = PdfObject(self.firstpagenum)

    return (start, label_options)
def pdfobjs(self):
    """Build the pair of objects for the PageLabels.Nums entry of a pdf.

    :return: ``(page number, options)`` where ``options`` always has the
        style ``S`` and carries ``P`` / ``St`` only for a non-default
        prefix / first page number.
    """
    index_object = PdfObject(self.startpage)
    options = PdfDict(S=styles[self.style])

    # Entries equal to the defaults are left out of the dictionary.
    if self.prefix != defaults["prefix"]:
        options.P = PdfString.encode(self.prefix)
    if self.firstpagenum != defaults["firstpagenum"]:
        options.St = PdfObject(self.firstpagenum)

    result = index_object, options
    return result
def fix_metadata(doc, title=None, creation_date=None):
    """Reset the document metadata to the project's standard Info entries.

    The XMP metadata stream is removed, then ``Creator``, ``Keywords`` and
    ``ModDate`` (the current time) are written to ``doc.Info``.

    :param doc: document object exposing ``Root`` and ``Info``.
    :param title: optional value for the ``Title`` entry.
    :param creation_date: optional value for the ``CreationDate`` entry;
        it is formatted by ``make_pdf_date``.
    """
    # Clear any existing XMP meta data
    doc.Root.Metadata = None

    info_fields = {
        'Creator': 'OffeneGesetze.de',
        'Keywords': 'Amtliches Werk nach §5 UrhG https://offenegesetze.de',
        'ModDate': make_pdf_date(datetime.now()),
    }
    if title is not None:
        info_fields['Title'] = title
    if creation_date is not None:
        info_fields['CreationDate'] = make_pdf_date(creation_date)

    for name, raw in info_fields.items():
        # Date entries were already produced by make_pdf_date; everything
        # else still has to be encoded.
        encoded = raw if 'Date' in name else PdfString.from_unicode(raw)
        doc.Info[PdfName(name)] = encoded
def update_annotation_action(annotation, action, options):
    """Apply the link filters to an action and to its chained actions.

    Each callable in ``options.link_filters`` is called as
    ``func(value, annotation)`` and returns the new URI text, or ``None``
    to remove the link. Filtering stops at the first ``None`` so that
    later filters are never handed ``None`` instead of a string.

    :param annotation: the annotation owning the action; passed to every
        filter.
    :param action: the action to update. Its ``URI`` is rewritten in
        place, or set to ``None`` when a filter asks for removal.
    :param options: object providing ``link_filters``.
    """
    if action.URI and options.link_filters:
        value = action.URI.to_unicode()
        for func in options.link_filters:
            value = func(value, annotation)
            if value is None:
                # Removal was requested; there is nothing left to filter.
                break

        if value is None:
            # Remove annotation by suppressing the action.
            action.URI = None
        else:
            # Imported here, on the only path that needs it.
            from pdfrw.objects import PdfString
            action.URI = PdfString.from_unicode(value)

    if action.Next:
        # May be an Action or array of Actions to execute next.
        next_action = action.Next
        if isinstance(next_action, dict):
            next_action = [next_action]
        for a in next_action:
            update_annotation_action(annotation, a, options)
def make_pdf_date(value):
    """Format ``value`` as a PDF date string wrapped in a ``PdfString``.

    :param value: object providing ``strftime`` (e.g. a ``datetime``).
    :return: ``PdfString`` of the form ``(D:YYYYMMDDHHMMSS...)``.
    """
    stamp = value.strftime("%Y%m%d%H%M%S%z")
    # 19 characters means 14 digits plus a 5-character UTC offset such as
    # "+0100": put an apostrophe between the offset's hours and minutes.
    if len(stamp) == 19:
        stamp = "'".join((stamp[:17], stamp[17:]))
    return PdfString("(D:%s)" % stamp)
def test_continuation(self):
    """A backslash followed by an end-of-line continues the string.

    See PDF 1.7 ref section 3.2 page 55. The LF, CR and CR+LF end-of-line
    forms are all checked.
    """
    plain = PdfString('(These two strings are the same.)')
    self.assertEqual(plain.decode(), plain[1:-1])

    continued = PdfString('(These \\\ntwo strings \\\nare the same.)')
    self.assertEqual(plain.decode(), continued.decode())

    # Rewrite the line endings: first LF -> CR, then CR -> CR+LF.
    for old_eol, new_eol in (('\n', '\r'), ('\r', '\r\n')):
        continued = PdfString(continued.replace(old_eol, new_eol))
        self.assertEqual(plain.decode(), continued.decode())
def test_constructor(self):
    """Constructing a PdfString from a plain string must not raise."""
    PdfString('hello')
def decode_bytes(self, decode_this, expected):
    """Check that ``decode_this`` decodes to the bytes for ``expected``.

    ``expected`` is passed through ``convert_store`` before comparison.
    """
    actual = PdfString(decode_this).to_bytes()
    wanted = convert_store(expected)
    self.assertEqual(actual, wanted)
def decode(self, value):
    """Decode ``value`` and check ``to_unicode`` and ``decode`` agree.

    :return: the result of ``to_unicode()``.
    """
    pdf_string = PdfString(value)
    as_unicode = pdf_string.to_unicode()
    as_decoded = pdf_string.decode()
    self.assertEqual(as_unicode, as_decoded)
    return as_unicode
def test_nullstring(self):
    """Empty hexadecimal and literal strings both decode to no bytes."""
    for empty in ('<>', '()'):
        self.assertEqual(PdfString(empty).to_bytes(), b'')
def write_pdf_metadata(document, fileobj, scale, metadata, attachments, url_fetcher):
    """Append to a seekable file-like object to add PDF metadata.

    The PDF already stored in ``fileobj`` is read, extended with bookmarks,
    embedded files, link annotations, document information and trim/bleed
    boxes, then written back to ``fileobj`` from its start.

    :param document: passed to ``prepare_metadata``; its ``pages`` are
        paired with the PDF pages to read each page's ``bleed`` mapping.
    :param fileobj: seekable file-like object holding the PDF; it is
        rewritten in place and truncated.
    :param scale: passed to ``prepare_metadata``.
    :param metadata: object providing ``attachments``, ``title``,
        ``description``, ``generator``, ``authors``, ``keywords``,
        ``created`` and ``modified``.
    :param attachments: extra attachments appended to
        ``metadata.attachments``; may be ``None``.
    :param url_fetcher: passed to ``_create_pdf_attachment``.

    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    # Document outline
    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(Type=PdfName('Outlines'), Count=count, First=bookmark_objects[0], Last=bookmark_objects[-1])

    # Document-level embedded files. Attachments that could not be loaded
    # (``None`` results) are skipped.
    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment, url_fetcher)
            if attachment_object is not None:
                # Names is a flat list of alternating name / object entries;
                # every entry uses the same name, 'attachment'.
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(EmbeddedFiles=PdfDict( Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment((target, None), url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Attachments that failed to load (``None`` in annot_files) are
            # handled by this branch too and end up as URI links.
            if link_type != 'attachment' or annot_files[target] is None:
                annotation = PdfDict(Type=PdfName('Annot'), Subtype=PdfName('Link'), Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (target[0], PdfName('XYZ'), target[1], target[2], 0)
                    annotation.A = PdfDict(Type=PdfName('Action'), S=PdfName('GoTo'), D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(Type=PdfName('Action'), S=PdfName('URI'), URI=PdfString.encode( iri_to_uri(target)))
            else:
                assert annot_files[target] is not None
                ap = PdfDict(N=PdfDict(BBox=PdfArray(rectangle), Subtype=PdfName('Form'), Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(Type=PdfName('Annot'), Subtype=PdfName('FileAttachment'), T=PdfString.encode(''), Rect=PdfArray(rectangle), Border=PdfArray((0, 0, 0)), FS=annot_files[target], AP=ap)
            annotations.append(annotation)
        if annotations:
            page.Annots = annotations

    # Document information dictionary. Entries whose metadata value is
    # ``None`` are left untouched.
    trailer.Info.Producer = VERSION_STRING
    for attr, key in (('title', 'Title'), ('description', 'Subject'), ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    # These two attributes hold several values, stored comma-separated.
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(getattr(metadata, attr)))
    # Dates are converted by w3c_date_to_pdf; a ``None`` result is skipped.
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    # Page boxes
    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points
        bleed = { key: value * 0.75 for key, value in document_page.bleed.items() }

        # The TrimBox is the MediaBox shrunk by the bleed on every side.
        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right, trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox) and
        # CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray( (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the file from its beginning and drop any bytes left over
    # after the position where writing ended.
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()