def __str__(self):
    """Serialize this token as a PDF string.

    __str__ is used for serialization. An untouched token is written back
    from its raw original bytes so that no decode/encode round trip can
    alter it; a modified token is encoded from Unicode according to the
    encoding of the font that is active at the location of this token.
    """
    unchanged = self.value == self.original_value
    payload = (
        self.raw_original_value
        if unchanged
        else fromUnicode(self.value, self.font, fontcache, options)
    )
    return PdfString.from_bytes(payload)
def update_annotation(annotation, options):
    """Run the content filters over an annotation's text fields and actions.

    Args:
        annotation: The annotation dictionary to update in place.
        options: Object providing ``content_filters`` (an iterable of
            ``(pattern, function)`` pairs applied with ``pattern.sub``) and
            whatever ``update_annotation_action`` reads from it.

    Raises:
        ValueError: If the annotation has an ``RC`` entry, since rich-text
            streams are not supported and must not be leaked unredacted.
    """
    from pdfrw.objects import PdfString

    # Contents holds a plain-text representation of the annotation
    # content, such as for accessibility. All annotation types may
    # have a Contents. NM holds the "annotation name" which also
    # could have redactable text, I suppose. Markup annotations have
    # "T" fields that hold a title / text label. Subj holds a
    # comment subject. CA, RC, and AC are used in widget annotations.
    for string_field in ("Contents", "NM", "T", "Subj", "CA", "RC", "AC"):
        current = getattr(annotation, string_field)
        if not current:
            continue
        value = current.to_unicode()
        for pattern, function in options.content_filters:
            value = pattern.sub(function, value)
        setattr(annotation, string_field, PdfString.from_unicode(value))

    # A rich-text stream. Not implemented. Bail so that we don't
    # accidentally leak something that should be redacted.
    if annotation.RC:
        raise ValueError(
            "Annotation rich-text streams (Annot/RC) are not supported.")

    # An action, usually used for links.
    if annotation.A:
        update_annotation_action(annotation, annotation.A, options)
    if annotation.PA:
        update_annotation_action(annotation, annotation.PA, options)

    # If set, another annotation.
    if annotation.Popup:
        update_annotation(annotation.Popup, options)
def update_annotation_action(annotation, action, options):
    """Apply the link filters to an action's URI and to any chained actions.

    Args:
        annotation: The annotation owning the action; passed through to each
            link filter as its second argument.
        action: The action dictionary to update in place.
        options: Object providing ``link_filters``, an iterable of callables
            ``func(url, annotation)`` returning the new URL, or ``None`` to
            drop the link.
    """
    from pdfrw.objects import PdfString

    if action.URI and options.link_filters:
        value = action.URI.to_unicode()
        for func in options.link_filters:
            value = func(value, annotation)
            if value is None:
                # A filter asked for the link to be dropped. Stop here so
                # that later filters are not handed None in place of a URL.
                break
        if value is None:
            # Remove annotation by suppressing the action.
            action.URI = None
        else:
            action.URI = PdfString.from_unicode(value)

    if action.Next:
        # May be an Action or array of Actions to execute next.
        next_action = action.Next
        if isinstance(next_action, dict):
            next_action = [next_action]
        for a in next_action:
            update_annotation_action(annotation, a, options)
def update_metadata(trailer, options):
    """Run the metadata filters over the Document Information Dictionary.

    The dictionary contains keys like Title, Author, Subject, Keywords,
    Creator, Producer, CreationDate, and ModDate (the latter two containing
    Date values, the rest strings).

    Args:
        trailer: The PDF trailer; ``trailer.Info`` is created if missing and
            updated in place.
        options: Object providing ``metadata_filters``, a mapping from field
            name (or the special names "DEFAULT" and "ALL") to a list of
            filter functions. Each function receives the current value and
            returns a string, a datetime, or None to delete the field.

    Raises:
        ValueError: If a filter returns a value of any other type.
    """
    from pdfrw.objects import PdfDict, PdfName, PdfString

    # Create the metadata dict if it doesn't exist, since the caller may be adding fields.
    if not trailer.Info:
        trailer.Info = PdfDict()

    filters = options.metadata_filters

    # Get a list of all metadata fields that exist in the PDF plus any fields
    # that there are metadata filters for (since they may insert field values).
    keys = {str(k)[1:] for k in trailer.Info.keys()} \
        | {k for k in filters.keys() if k not in ("DEFAULT", "ALL")}

    # Update each metadata field.
    for key in keys:
        # Get the functions to apply to this field.
        functions = filters.get(key)
        if functions is None:
            # If nothing is defined for this field, use the DEFAULT functions.
            functions = filters.get("DEFAULT", [])

        # Append the ALL functions. Build a new list instead of using "+=":
        # in-place extension would modify the caller's filter list, so the
        # ALL functions would accumulate on it and run again for later keys.
        functions = list(functions) + list(filters.get("ALL", []))

        # Run the functions on any existing values.
        value = trailer.Info[PdfName(key)]
        for f in functions:
            # Before passing to the function, convert from a PdfString to a Python string.
            if isinstance(value, PdfString):
                # decode from PDF's "(...)" syntax.
                value = value.decode()

            # Filter the value.
            value = f(value)

            # Convert Python data type to PdfString.
            if isinstance(value, str) or (sys.version_info < (3, ) and isinstance(value, unicode)):
                # Convert string to a PdfString instance.
                value = PdfString.from_unicode(value)
            elif isinstance(value, datetime):
                # Convert datetime into a PDF "D" string format.
                value = value.strftime("%Y%m%d%H%M%S%z")
                if len(value) == 19:
                    # If TZ info was included, add an apostrophe between the hour/minutes offsets.
                    value = value[:17] + "'" + value[17:]
                value = PdfString("(D:%s)" % value)
            elif value is None:
                # delete the metadata value
                pass
            else:
                # Not every callable has a __name__ (e.g. functools.partial).
                raise ValueError(
                    "Invalid type of value returned by metadata_filter function. %s was returned by %s."
                    % (repr(value), getattr(f, "__name__", None) or "anonymous function"))

            # Replace value.
            trailer.Info[PdfName(key)] = value
def _build_font():
    """Build and return the Type0 font dictionary for "GlyphLessFont".

    The font program is read from FONT_FILENAME and the ToUnicode CMap from
    UNICODE_CMAP_FILENAME. The returned dictionary has a single
    CIDFontType2 descendant whose CIDToGIDMap maps everything to glyph 1.
    """

    def _deflate(raw):
        # Stream payloads are zlib-compressed at level 9 and stored as
        # latin-1 text.
        return zlib.compress(raw, 9).decode("latin-1")

    with open(FONT_FILENAME, "rb") as font_file:
        font_program = font_file.read()
    with open(UNICODE_CMAP_FILENAME, "rb") as cmap_file:
        to_unicode_source = cmap_file.read()
    # Map everything to glyph 1
    gid_table = b"\0\1" * (1 << 16)

    font_file_obj = PdfDict()
    font_file_obj.indirect = True
    font_file_obj.Filter = [PdfName.FlateDecode]
    font_file_obj.stream = _deflate(font_program)
    font_file_obj.Length1 = len(font_program)

    descriptor = PdfDict()
    descriptor.indirect = True
    descriptor.Ascent = 1000
    descriptor.CapHeight = 1000
    descriptor.Descent = -1
    descriptor.Flags = 5  # FixedPitch + Symbolic
    descriptor.FontBBox = PdfArray([0, 0, 1000, 500])
    descriptor.FontFile2 = font_file_obj
    descriptor.FontName = PdfName.GlyphLessFont
    descriptor.ItalicAngle = 0
    descriptor.StemV = 80
    descriptor.Type = PdfName.FontDescriptor

    gid_map = PdfDict()
    gid_map.indirect = True
    gid_map.Filter = [PdfName.FlateDecode]
    gid_map.stream = _deflate(gid_table)
    gid_map.Length1 = len(gid_table)

    system_info = PdfDict()
    system_info.Ordering = PdfString.from_unicode("Identity")
    system_info.Registry = PdfString.from_unicode("Adobe")
    system_info.Supplement = 0

    descendant = PdfDict()
    descendant.indirect = True
    descendant.CIDToGIDMap = gid_map
    descendant.BaseFont = PdfName.GlyphLessFont
    descendant.CIDSystemInfo = system_info
    descendant.FontDescriptor = descriptor
    descendant.Subtype = PdfName.CIDFontType2
    descendant.Type = PdfName.Font
    descendant.DW = 500

    to_unicode = PdfDict()
    to_unicode.indirect = True
    to_unicode.Filter = [PdfName.FlateDecode]
    to_unicode.stream = _deflate(to_unicode_source)

    type0_font = PdfDict()
    type0_font.indirect = True
    type0_font.BaseFont = PdfName.GlyphLessFont
    type0_font.DescendantFonts = PdfArray([descendant])
    type0_font.Encoding = PdfName("Identity-H")
    type0_font.Subtype = PdfName.Type0
    type0_font.ToUnicode = to_unicode
    type0_font.Type = PdfName.Font
    return type0_font
def make_page(page, pdf_page, psem):
    """Fill ``pdf_page`` with the images, text and link annotations of ``page``.

    Generator-based coroutine (it uses ``yield from``). Besides its
    parameters it reads ``self``, ``pdf_group``, ``default_rgb_colorspace``,
    ``pdf_pages``, ``pdf_font_mapping`` and ``progress_cb`` from the
    enclosing scope and increments the enclosing ``finished_pages`` counter,
    so it is only usable as a nested function.

    Args:
        page: Source page; its thumbnail, background, foreground, color,
            text, width and height attributes are read.
        pdf_page: The page dictionary that is populated in place.
        psem: Passed unchanged to every image/mask/thumbnail coroutine.
            NOTE(review): presumably a semaphore limiting worker processes;
            confirm against those coroutines.
    """
    # Prepare everything in parallel
    @asyncio.coroutine
    def get_pdf_thumbnail(psem):
        if page.thumbnail is None:
            return None
        return (yield from page.thumbnail.pdf_thumbnail(psem))

    @asyncio.coroutine
    def get_pdf_background(psem):
        if page.background is None:
            return None
        return (yield from page.background.pdf_image(psem))

    @asyncio.coroutine
    def get_pdf_mask(foreground, psem):
        # Only foregrounds without a color get a mask.
        if foreground.color is not None:
            return None
        return (yield from foreground.pdf_mask(psem))

    # The two inner gathers keep the foreground images and masks in the
    # same order as page.foreground, so they can be zipped together below.
    pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
        yield from asyncio.gather(
            get_pdf_thumbnail(psem),
            get_pdf_background(psem),
            asyncio.gather(
                *[fg.pdf_image(psem) for fg in page.foreground]),
            asyncio.gather(
                *[get_pdf_mask(fg, psem) for fg in page.foreground])))
    pdf_page.MediaBox = PdfArray(
        [0, 0, PdfNumber(page.width), PdfNumber(page.height)])
    pdf_page.Group = pdf_group
    pdf_resources = PdfDict()
    pdf_colorspace = PdfDict()
    pdf_colorspace.DefaultRGB = default_rgb_colorspace
    pdf_resources.ColorSpace = pdf_colorspace
    pdf_xobject = PdfDict()
    if pdf_thumbnail is not None:
        pdf_page.Thumb = pdf_thumbnail
    # Running index used to name the XObjects Im0, Im1, ...
    im_index = 0
    # Save graphics state and scale unity rectangle to page size
    matrix = TransformationMatrix()
    matrix.scale(page.width, page.height)
    before_graphics = ("q\n" +
                       "%s cm\n" % matrix.to_pdf())
    after_graphics = "\nQ\n"
    contents = ""
    graphics = ""
    # Last fill color written to the stream; avoids repeating "rg".
    current_color = None
    if page.color != self._factory.WHITE:
        if current_color != page.color:
            current_color = page.color
            graphics += page.color.to_pdf() + " rg "
        # Fill the unit rectangle (scaled to the page) with the page color.
        graphics += ("0 0 1 1 re " +
                     "f\n")
    if pdf_background is not None:
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    for foreground, pdf_foreground, pdf_mask in zip(
            page.foreground, pdf_foregrounds, pdf_masks):
        if pdf_mask is not None:
            # The mask is registered as an XObject but never drawn with "Do".
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
            im_index += 1
        pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
        if (foreground.color is not None and current_color
                != foreground.color):
            current_color = foreground.color
            graphics += foreground.color.to_pdf() + " rg "
        graphics += "/Im%d Do\n" % im_index
        im_index += 1
    if graphics:
        contents += (before_graphics +
                     graphics.rstrip(" \n") +
                     after_graphics)
    current_color = None
    # "/F1" is the font registered in pdf_font_mapping; "3 Tr" selects text
    # rendering mode 3, which the PDF specification defines as invisible.
    before_text = ("BT\n" +
                   "/F1 1 Tf 3 Tr\n")
    after_text = "\nET\n"
    text = ""
    pdf_annots = []
    for t in page.text:
        if t.text:
            # Text matrix that stretches the string over the t.x/t.y/
            # t.width/t.height box.
            matrix = TransformationMatrix()
            # Glyph size is 0.5 x 1
            matrix.scale(2 / len(t.text), 1)
            matrix.translate(-0.5, -0.5)
            if t.direction == "ltr":
                pass
            elif t.direction == "rtl":
                matrix.translate(0, -1)
            elif t.direction == "ttb":
                matrix.rotate(90)
            matrix.rotate(-t.rotation)
            matrix.translate(0.5, 0.5)
            matrix.scale(t.width, t.height)
            matrix.translate(t.x, t.y)
            # The text is written as a hex string of its UTF-16-BE bytes.
            text += "%s Tm %s Tj\n" % (
                matrix.to_pdf(),
                PdfString().from_bytes(
                    t.text.encode("utf-16-be"),
                    bytes_encoding="hex"))
        # NOTE(review): the source this was recovered from had lost its
        # indentation; the link handling is assumed to apply to every text
        # entry, not only to those with non-empty text - confirm.
        if t.external_link is not None or t.internal_link is not None:
            pdf_annot = PdfDict()
            pdf_annots.append(pdf_annot)
            pdf_annot.Type = PdfName.Annot
            pdf_annot.Subtype = PdfName.Link
            pdf_annot.Border = [0, 0, 0]
            pdf_annot.Rect = [
                PdfNumber(t.x), PdfNumber(t.y),
                PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)
            ]
            if t.external_link is not None:
                pdf_a = PdfDict()
                pdf_annot.A = pdf_a
                pdf_a.Type = PdfName.Action
                pdf_a.S = PdfName.URI
                # external_link is bytes here; it is decoded as latin-1.
                pdf_a.URI = t.external_link.decode("latin-1")
            if t.internal_link is not None:
                # internal_link is (page index, (x, y)).
                pdf_target_page = pdf_pages[t.internal_link[0]]
                target_x, target_y = t.internal_link[1]
                pdf_annot.Dest = [
                    pdf_target_page, PdfName.XYZ,
                    PdfNumber(target_x), PdfNumber(target_y), 0
                ]
    text = text.rstrip(" \n")
    if text:
        # The font resource is only attached when the page has text.
        pdf_resources.Font = pdf_font_mapping
        contents += (before_text +
                     text +
                     after_text)
    contents = contents.rstrip(" \n")
    if contents:
        pdf_contents = PdfDict()
        pdf_contents.indirect = True
        pdf_page.Contents = pdf_contents
        if COMPRESS_PAGE_CONTENTS:
            pdf_contents.Filter = [PdfName.FlateDecode]
            pdf_contents.stream = zlib.compress(
                contents.encode("latin-1"), 9).decode("latin-1")
        else:
            pdf_contents.stream = contents
    if pdf_annots:
        pdf_page.Annots = pdf_annots
    if pdf_xobject:
        pdf_resources.XObject = pdf_xobject
    if pdf_resources:
        pdf_page.Resources = pdf_resources
    # Report progress
    nonlocal finished_pages
    finished_pages += 1
    if progress_cb:
        # Called with the fraction of pages finished so far.
        progress_cb(finished_pages / len(self._pages))
def write_async(self, outfile, process_semaphore, progress_cb=None):
    """Assemble the PDF for ``self._pages`` and write it to ``outfile``.

    Generator-based coroutine (it uses ``yield from``). The steps are:

    1. Create a ``PdfWriter`` (version "1.5") with one page dictionary per
       entry of ``self._pages``, plus the shared transparency group, font
       mapping and ICC-based default RGB colorspace.
    2. Fill all pages concurrently with the nested ``make_page`` coroutine.
    3. Set the trailer ID, MarkInfo, StructTreeRoot and XMP metadata.
    4. Write the result to a temporary directory and run the command
       ``QPDF_CMD`` on it to produce ``outfile``.

    Args:
        outfile: Destination path; made absolute before being passed to the
            external command.
        process_semaphore: Passed unchanged to the page coroutines and to
            ``run_command_async``. NOTE(review): presumably limits the
            number of concurrent worker processes; confirm.
        progress_cb: Optional callable; called after each finished page with
            ``finished_pages / len(self._pages)``.
    """
    pdf_writer = PdfWriter(version="1.5")
    # Transparency group shared by every page.
    pdf_group = PdfDict()
    pdf_group.indirect = True
    pdf_group.CS = PdfName.DeviceRGB
    pdf_group.I = PdfBool(True)
    pdf_group.S = PdfName.Transparency
    # Font resources shared by every page that has text ("/F1").
    pdf_font_mapping = PdfDict()
    pdf_font_mapping.indirect = True
    pdf_font_mapping.F1 = self._build_font()
    for _ in self._pages:
        pdf_page = PdfDict()
        pdf_page.Type = PdfName.Page
        pdf_writer.addpage(pdf_page)
    # pdfrw makes an internal copy of the pages
    # use the copy so that references to pages in links are correct
    pdf_pages = list(pdf_writer.pagearray)
    srgb_colorspace = PdfDict()
    srgb_colorspace.indirect = True
    srgb_colorspace.N = 3  # Number of components (red, green, blue)
    with open(SRGB_ICC_FILENAME, "rb") as f:
        srgb_colorspace_stream = f.read()
    srgb_colorspace.Filter = [PdfName.FlateDecode]
    srgb_colorspace.stream = zlib.compress(
        srgb_colorspace_stream, 9).decode("latin-1")
    srgb_colorspace.Length1 = len(srgb_colorspace_stream)
    default_rgb_colorspace = PdfArray([PdfName.ICCBased, srgb_colorspace])
    default_rgb_colorspace.indirect = True

    # Handle all pages in parallel
    @asyncio.coroutine
    def make_page(page, pdf_page, psem):
        # Fills pdf_page in place from page and bumps finished_pages.
        # Prepare everything in parallel
        @asyncio.coroutine
        def get_pdf_thumbnail(psem):
            if page.thumbnail is None:
                return None
            return (yield from page.thumbnail.pdf_thumbnail(psem))

        @asyncio.coroutine
        def get_pdf_background(psem):
            if page.background is None:
                return None
            return (yield from page.background.pdf_image(psem))

        @asyncio.coroutine
        def get_pdf_mask(foreground, psem):
            # Only foregrounds without a color get a mask.
            if foreground.color is not None:
                return None
            return (yield from foreground.pdf_mask(psem))

        # The two inner gathers keep the foreground images and masks in the
        # same order as page.foreground, so they can be zipped together
        # below.
        pdf_thumbnail, pdf_background, pdf_foregrounds, pdf_masks = (
            yield from asyncio.gather(
                get_pdf_thumbnail(psem),
                get_pdf_background(psem),
                asyncio.gather(
                    *[fg.pdf_image(psem) for fg in page.foreground]),
                asyncio.gather(
                    *[get_pdf_mask(fg, psem) for fg in page.foreground])))
        pdf_page.MediaBox = PdfArray(
            [0, 0, PdfNumber(page.width),
             PdfNumber(page.height)])
        pdf_page.Group = pdf_group
        pdf_resources = PdfDict()
        pdf_colorspace = PdfDict()
        pdf_colorspace.DefaultRGB = default_rgb_colorspace
        pdf_resources.ColorSpace = pdf_colorspace
        pdf_xobject = PdfDict()
        if pdf_thumbnail is not None:
            pdf_page.Thumb = pdf_thumbnail
        # Running index used to name the XObjects Im0, Im1, ...
        im_index = 0
        # Save graphics state and scale unity rectangle to page size
        matrix = TransformationMatrix()
        matrix.scale(page.width, page.height)
        before_graphics = ("q\n" +
                           "%s cm\n" % matrix.to_pdf())
        after_graphics = "\nQ\n"
        contents = ""
        graphics = ""
        # Last fill color written to the stream; avoids repeating "rg".
        current_color = None
        if page.color != self._factory.WHITE:
            if current_color != page.color:
                current_color = page.color
                graphics += page.color.to_pdf() + " rg "
            # Fill the unit rectangle (scaled to the page) with the page
            # color.
            graphics += ("0 0 1 1 re " +
                         "f\n")
        if pdf_background is not None:
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_background
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        for foreground, pdf_foreground, pdf_mask in zip(
                page.foreground, pdf_foregrounds, pdf_masks):
            if pdf_mask is not None:
                # The mask is registered as an XObject but never drawn with
                # "Do".
                pdf_xobject[PdfName("Im%d" % im_index)] = pdf_mask
                im_index += 1
            pdf_xobject[PdfName("Im%d" % im_index)] = pdf_foreground
            if (foreground.color is not None and current_color !=
                    foreground.color):
                current_color = foreground.color
                graphics += foreground.color.to_pdf() + " rg "
            graphics += "/Im%d Do\n" % im_index
            im_index += 1
        if graphics:
            contents += (before_graphics +
                         graphics.rstrip(" \n") +
                         after_graphics)
        current_color = None
        # "/F1" is the font registered in pdf_font_mapping; "3 Tr" selects
        # text rendering mode 3, which the PDF specification defines as
        # invisible.
        before_text = ("BT\n" +
                       "/F1 1 Tf 3 Tr\n")
        after_text = "\nET\n"
        text = ""
        pdf_annots = []
        for t in page.text:
            if t.text:
                # Text matrix that stretches the string over the t.x/t.y/
                # t.width/t.height box.
                matrix = TransformationMatrix()
                # Glyph size is 0.5 x 1
                matrix.scale(2 / len(t.text), 1)
                matrix.translate(-0.5, -0.5)
                if t.direction == "ltr":
                    pass
                elif t.direction == "rtl":
                    matrix.translate(0, -1)
                elif t.direction == "ttb":
                    matrix.rotate(90)
                matrix.rotate(-t.rotation)
                matrix.translate(0.5, 0.5)
                matrix.scale(t.width, t.height)
                matrix.translate(t.x, t.y)
                # The text is written as a hex string of its UTF-16-BE
                # bytes.
                text += "%s Tm %s Tj\n" % (
                    matrix.to_pdf(),
                    PdfString().from_bytes(
                        t.text.encode("utf-16-be"),
                        bytes_encoding="hex"))
            # NOTE(review): the source this was recovered from had lost its
            # indentation; the link handling is assumed to apply to every
            # text entry, not only to those with non-empty text - confirm.
            if t.external_link is not None or t.internal_link is not None:
                pdf_annot = PdfDict()
                pdf_annots.append(pdf_annot)
                pdf_annot.Type = PdfName.Annot
                pdf_annot.Subtype = PdfName.Link
                pdf_annot.Border = [0, 0, 0]
                pdf_annot.Rect = [
                    PdfNumber(t.x), PdfNumber(t.y),
                    PdfNumber(t.x + t.width), PdfNumber(t.y + t.height)
                ]
                if t.external_link is not None:
                    pdf_a = PdfDict()
                    pdf_annot.A = pdf_a
                    pdf_a.Type = PdfName.Action
                    pdf_a.S = PdfName.URI
                    # external_link is bytes here; it is decoded as latin-1.
                    pdf_a.URI = t.external_link.decode("latin-1")
                if t.internal_link is not None:
                    # internal_link is (page index, (x, y)).
                    pdf_target_page = pdf_pages[t.internal_link[0]]
                    target_x, target_y = t.internal_link[1]
                    pdf_annot.Dest = [
                        pdf_target_page, PdfName.XYZ,
                        PdfNumber(target_x), PdfNumber(target_y), 0
                    ]
        text = text.rstrip(" \n")
        if text:
            # The font resource is only attached when the page has text.
            pdf_resources.Font = pdf_font_mapping
            contents += (before_text +
                         text +
                         after_text)
        contents = contents.rstrip(" \n")
        if contents:
            pdf_contents = PdfDict()
            pdf_contents.indirect = True
            pdf_page.Contents = pdf_contents
            if COMPRESS_PAGE_CONTENTS:
                pdf_contents.Filter = [PdfName.FlateDecode]
                pdf_contents.stream = zlib.compress(
                    contents.encode("latin-1"), 9).decode("latin-1")
            else:
                pdf_contents.stream = contents
        if pdf_annots:
            pdf_page.Annots = pdf_annots
        if pdf_xobject:
            pdf_resources.XObject = pdf_xobject
        if pdf_resources:
            pdf_page.Resources = pdf_resources
        # Report progress
        nonlocal finished_pages
        finished_pages += 1
        if progress_cb:
            # Called with the fraction of pages finished so far.
            progress_cb(finished_pages / len(self._pages))

    # Counter shared with make_page through its nonlocal declaration.
    finished_pages = 0
    yield from asyncio.gather(*[
        make_page(page, pdf_page, process_semaphore)
        for page, pdf_page in zip(self._pages, pdf_pages)
    ])

    trailer = pdf_writer.trailer
    # Both entries of the ID array are the same 16 random bytes.
    document_id = PdfString().from_bytes(os.urandom(16))
    trailer.ID = [document_id, document_id]
    mark_info = PdfDict()
    mark_info.Marked = PdfBool(True)
    trailer.Root.MarkInfo = mark_info
    # The structure tree root carries only its Type entry.
    struct_tree_root = PdfDict()
    struct_tree_root.Type = PdfName.StructTreeRoot
    trailer.Root.StructTreeRoot = struct_tree_root
    metadata = PdfDict()
    metadata.indirect = True
    metadata.Type = PdfName.Metadata
    metadata.Subtype = PdfName.XML
    # XMP packet with "part" = "2" and "conformance" = "A" in the
    # XMP_NS_PDFA_ID namespace.
    xmp = XMPMeta()
    xmp.set_property(XMP_NS_PDFA_ID, "part", "2")
    xmp.set_property(XMP_NS_PDFA_ID, "conformance", "A")
    metadata_stream = xmp.serialize_to_str().encode("utf-8")
    metadata.Filter = [PdfName.FlateDecode]
    metadata.stream = zlib.compress(metadata_stream, 9).decode("latin-1")
    metadata.Length1 = len(metadata_stream)
    trailer.Root.Metadata = metadata

    # pdfrw writes into a temporary directory; the external command then
    # produces the final file at outfile.
    with TemporaryDirectory(prefix="djpdf-") as temp_dir:
        pdf_writer.write(path.join(temp_dir, "temp.pdf"))
        cmd = [
            QPDF_CMD, "--stream-data=preserve",
            "--object-streams=preserve", "--normalize-content=n",
            "--newline-before-endstream"
        ]
        if LINEARIZE_PDF:
            cmd.extend(["--linearize"])
        cmd.extend([
            path.abspath(path.join(temp_dir, "temp.pdf")),
            path.abspath(outfile)
        ])
        yield from run_command_async(cmd, process_semaphore)
def _build_font():
    """Return the Type0 "GlyphLessFont" dictionary used for page text.

    Reads the font program from FONT_FILENAME and the ToUnicode CMap from
    UNICODE_CMAP_FILENAME, and wires up the descriptor, the CIDFontType2
    descendant and its CIDToGIDMap (which maps everything to glyph 1).
    """

    def _stream_dict(payload, record_length1):
        # Indirect FlateDecode stream holding `payload`, compressed at
        # level 9 and stored as latin-1 text; Length1 is set on request.
        holder = PdfDict()
        holder.indirect = True
        holder.Filter = [PdfName.FlateDecode]
        holder.stream = zlib.compress(payload, 9).decode("latin-1")
        if record_length1:
            holder.Length1 = len(payload)
        return holder

    with open(FONT_FILENAME, "rb") as handle:
        program_bytes = handle.read()
    program_stream = _stream_dict(program_bytes, True)

    fd = PdfDict()
    fd.indirect = True
    fd.Ascent = 1000
    fd.CapHeight = 1000
    fd.Descent = -1
    fd.Flags = 5  # FixedPitch + Symbolic
    fd.FontBBox = PdfArray([0, 0, 1000, 500])
    fd.FontFile2 = program_stream
    fd.FontName = PdfName.GlyphLessFont
    fd.ItalicAngle = 0
    fd.StemV = 80
    fd.Type = PdfName.FontDescriptor

    # Map everything to glyph 1
    glyph_map_stream = _stream_dict(b"\0\1" * (1 << 16), True)

    registry_info = PdfDict()
    registry_info.Ordering = PdfString.from_unicode("Identity")
    registry_info.Registry = PdfString.from_unicode("Adobe")
    registry_info.Supplement = 0

    cid = PdfDict()
    cid.indirect = True
    cid.CIDToGIDMap = glyph_map_stream
    cid.BaseFont = PdfName.GlyphLessFont
    cid.CIDSystemInfo = registry_info
    cid.FontDescriptor = fd
    cid.Subtype = PdfName.CIDFontType2
    cid.Type = PdfName.Font
    cid.DW = 500

    with open(UNICODE_CMAP_FILENAME, "rb") as handle:
        cmap_bytes = handle.read()
    # The ToUnicode stream carries no Length1 entry.
    cmap_stream = _stream_dict(cmap_bytes, False)

    result = PdfDict()
    result.indirect = True
    result.BaseFont = PdfName.GlyphLessFont
    result.DescendantFonts = PdfArray([cid])
    result.Encoding = PdfName("Identity-H")
    result.Subtype = PdfName.Type0
    result.ToUnicode = cmap_stream
    result.Type = PdfName.Font
    return result