def meta(input_pdf, output_pdf, value):
    """Copy a PDF and stamp extra entries into its document information.

    Every page of ``input_pdf`` is copied into a new writer.  The writer's
    info dictionary receives a fixed ``/Version`` entry, then every entry of
    the source document information, and finally ``/Key`` set to ``value``.
    Because the source entries are copied after ``/Version``, a ``/Version``
    entry present in the source overrides the fixed one.

    :param input_pdf: value handed to ``PdfFileReader`` (the source PDF).
    :param output_pdf: path of the file to create; opened in ``'wb'`` mode.
    :param value: string stored under the ``/Key`` info entry.
    """
    pdf_reader = PdfFileReader(input_pdf)
    pdf_writer = PdfFileWriter()
    for page_number in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_number))

    info_dict = pdf_writer._info.getObject()
    info_dict.update({NameObject('/Version'): createStringObject(u'234ds2')})

    # documentInfo is None when the source PDF has no /Info dictionary;
    # iterating it unguarded would raise TypeError.
    source_info = pdf_reader.documentInfo
    if source_info is not None:
        for key in source_info:
            info_dict.update(
                {NameObject(key): createStringObject(source_info[key])})

    info_dict.update({NameObject('/Key'): createStringObject(value)})

    with open(output_pdf, 'wb') as fh:
        pdf_writer.write(fh)
def _create_attachment_object(self, attachment):
    ''' Build the file specification object for one embedded file.

    The file content is stored in an /EmbeddedFile stream carrying an MD5
    checksum, a modification date taken from ``datetime.now()`` and the
    content length.  That stream is registered through ``self._addObject``
    and referenced from a /Filespec dictionary, which is registered the
    same way.

    :param attachment: A dictionary containing:
        * filename: The name of the file to embed (required)
        * content: The bytes of the file to embed (required)
        * subtype: The mime-type of the file to embed (optional)
        * description: Text stored as the /Desc entry (optional)
    :return: whatever ``self._addObject`` returns for the /Filespec
        dictionary.
    '''
    content = attachment['content']

    stream = DecodedStreamObject()
    stream.setData(content)

    params = DictionaryObject({
        NameObject('/CheckSum'): createStringObject(md5(content).hexdigest()),
        NameObject('/ModDate'): createStringObject(
            datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT)),
        NameObject('/Size'): NameObject(str(len(content))),
    })
    stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params,
    })

    subtype = attachment.get('subtype')
    if subtype:
        stream.update({NameObject("/Subtype"): NameObject(subtype)})

    stream_ref = self._addObject(stream)
    name = createStringObject(attachment['filename'])

    # Both /F and /UF point at the same stream and the same file name.
    embedded = DictionaryObject({
        NameObject("/F"): stream_ref,
        NameObject('/UF'): stream_ref,
    })
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name,
        NameObject("/EF"): embedded,
        NameObject("/UF"): name,
    })

    description = attachment.get('description')
    if description:
        filespec.update({
            NameObject("/Desc"): createStringObject(description),
        })

    return self._addObject(filespec)
def meta(pa, ti, au, fi, loo, ho, loi):
    """Rewrite ``<ti>.pdf`` with title/author metadata, then run cpdf on it.

    Steps, in order:

    1. ``<ti>.pdf`` is opened relative to the current working directory and
       all its pages are copied into a new writer whose info dictionary
       carries ``/Title`` and ``/Author``.
    2. The working directory is changed to ``ho + '/' + loo`` when that is
       a directory, otherwise to ``loo``, and the new ``<ti>.pdf`` is
       written there with a ``/SinglePage`` page layout.
    3. The working directory is changed to ``loi`` when that is a
       directory, otherwise to ``ho + '/' + loi``, and ``<ti>.pdf`` is
       removed from it.
    4. The bundled ``cpdf`` tool is invoked with ``-fit-window true``.

    :param pa: unused.
    :param ti: document title; also the base name of the PDF file.
    :param au: author; ``None`` is stored as an empty string.
    :param fi: unused.
    :param loo: output directory (see step 2).
    :param ho: base directory used to resolve ``loo``, ``loi`` and cpdf.
    :param loi: directory from which ``<ti>.pdf`` is removed (see step 3).
    """
    pdf_name = ti + '.pdf'
    if au is None:
        au = ''

    output = PdfFileWriter()
    info_dict = output._info.getObject()
    info_dict.update({
        NameObject('/Title'): createStringObject(ti),
        NameObject('/Author'): createStringObject(str(au)),
    })

    # The source handle stays open until the writer has produced its
    # output, and is closed afterwards even when writing fails.
    source = open(pdf_name, 'rb')
    try:
        reader = PdfFileReader(source)
        for page in range(reader.getNumPages()):
            output.addPage(reader.getPage(page))

        out_dir = ho + '/' + loo
        os.chdir(out_dir if os.path.isdir(out_dir) else loo)

        output.setPageLayout('/SinglePage')
        with open(pdf_name, 'wb') as output_stream:
            output.write(output_stream)
    finally:
        source.close()

    os.chdir(loi if os.path.isdir(loi) else ho + '/' + loi)
    os.remove(pdf_name)

    # NOTE(review): the command lines below are unchanged.  They are built
    # by string concatenation and run through the shell, so a title or path
    # containing spaces or shell metacharacters will be misinterpreted; the
    # target also has no '.pdf' suffix and no separator between `loo` and
    # `ti` -- confirm this matches the directory layout in use.
    target = ho + '/' + loo + ti
    if os.name == 'posix':
        subprocess.call(ho + 'cpdf/mac/cpdf.sh -fit-window true ' + target +
                        ' -o ' + target, shell=True)
    elif os.name == 'nt':
        subprocess.call(ho + 'cpdf/win/cpdf.exe -fit-window true ' + target +
                        ' -o ' + target, shell=True)
def embed_hidden_data_into_pdf(inpdf, indata):
    """Embed the text of ``indata`` into the info dictionary of ``inpdf``.

    The data file is read as cp850 text, base64-encoded (from its UTF-8
    bytes) and stored under ``PDF_DATA_TAG`` in a rewritten copy of the PDF.
    The copy is written to a temporary file and read back with
    ``get_hidden_data_from_pdf``; only when the round trip matches is the
    original PDF replaced and the data file deleted.

    :param inpdf: path of the PDF to modify in place.
    :param indata: path of the data file to embed; removed on success.
    :raises AssertionError: if the data read back from the temporary PDF
        differs from the data that was embedded.
    """
    with open(indata, "r", encoding="cp850") as f1:
        mydata = f1.read()

    # Encode the data so it can be stored as a plain info string.
    mydata_enc = base64.b64encode(mydata.encode("utf-8"))
    logging.debug(mydata_enc)
    logging.debug(type(mydata_enc))
    logging.debug(base64.b64decode(mydata_enc).decode("utf-8"))

    with open(inpdf, "rb") as p1:
        invoice = PdfFileReader(p1)
        output_pdf = PdfFileWriter()
        infodict = output_pdf._info.getObject()

        # documentInfo is None for PDFs without an /Info dictionary.
        source_info = invoice.documentInfo
        if source_info is None:
            source_info = {}
        for k, v in source_info.items():
            infodict.update({NameObject(k): createStringObject(v)})
        infodict.update(
            {NameObject(PDF_DATA_TAG): createStringObject(mydata_enc.decode("utf-8"))}
        )
        for k, v in source_info.items():
            logging.debug("{} {}".format(k, v))

        for i in range(invoice.getNumPages()):
            output_pdf.addPage(invoice.getPage(i))

        # Write while the source is still open; delete=False keeps the
        # file on disk after the handle is closed.
        with tempfile.NamedTemporaryFile(
            mode="w+b", delete=False, suffix=".pdf"
        ) as tempfile_pdf:
            output_pdf.write(tempfile_pdf)

    persistant_tempfile = tempfile_pdf.name
    logging.info("Using tempfile {}".format(persistant_tempfile))

    logging.info("validating temp file")
    validated_data = get_hidden_data_from_pdf(persistant_tempfile)
    logging.debug(validated_data)
    # Raised explicitly (not via `assert`) so the check also runs under
    # `python -O`; the files below are only touched after it passes.
    if mydata != validated_data:
        raise AssertionError("embedded data does not match")

    logging.info("{} {} {}".format(inpdf, indata, persistant_tempfile))
    backup_pdf = "{}.bak".format(inpdf)
    shutil.move(inpdf, backup_pdf)
    shutil.move(persistant_tempfile, inpdf)
    # The data file and the backup are removed only once the new PDF is in
    # place, so a failed move leaves both recoverable.
    os.remove(indata)
    os.remove(backup_pdf)
def createAnnotPdf(geom_type, myShapePdf):
    """Create ``annot.pdf`` holding an annotation shaped like a drawn geometry.

    The path coordinates are parsed from the content stream of the first
    page of ``myShapePdf`` and written into the first annotation of a
    template PDF (``annot_poly`` or ``annot_line``) as ``/Vertices``; the
    annotation's ``/Rect`` becomes the bounding box of those coordinates.

    :param geom_type: ``'POLYGON'`` or ``'POLYLINE'`` (case-insensitive).
    :param myShapePdf: path of the PDF containing the geometry.
    :return: path of the written file, ``<scratch>/annot.pdf``.
    :raises ValueError: if ``geom_type`` is not one of the supported values
        or a coordinate token is not numeric.
    """
    # part 1: read geometry pdf to get the vertices and rectangle to use
    with open(myShapePdf, 'rb') as shape_file:
        geomPage = PdfFileReader(shape_file).getPage(0)
        mystr = geomPage.getObject()['/Contents'].getData()

    # to pinpoint the string part: 1.19997 791.75999 m 1.19997 0.19466 l 611.98627 0.19466 l 611.98627 791.75999 l 1.19997 791.75999 l
    # the format seems to follow x1 y1 m x2 y2 l x3 y3 l x4 y4 l x5 y5 l
    geomString = mystr.split('S\r\n')[0].split('M\r\n')[1]
    coordsString = [
        value for value in geomString.split(' ')
        if value not in ['m', 'l', '']
    ]
    # Converting with float() replaces the former exec() of a source string
    # assembled from these tokens: the values are the same, but text coming
    # from the PDF can no longer be executed as code.
    coords = [float(value) for value in coordsString]

    # part 2: update geometry in the map
    kind = geom_type.upper()
    if kind == 'POLYGON':
        template_path = annot_poly
    elif kind == 'POLYLINE':
        template_path = annot_line
    else:
        raise ValueError("unsupported geom_type: %r" % (geom_type,))

    # x and y alternate; slicing keeps every value, including the final y
    # that the previous index loop (range(len - 1)) left out.
    xcoords = coords[0::2]
    ycoords = coords[1::2]

    annotPdf = os.path.join(scratch, "annot.pdf")
    # The template stays open until the output is written because the
    # writer pulls page objects from the reader at write time.
    with open(template_path, 'rb') as template_file:
        page_geom = PdfFileReader(template_file).getPage(0)
        annot_obj = page_geom['/Annots'][0].getObject()

        annot_obj.update({
            NameObject('/Vertices'):
            ArrayObject([FloatObject(value) for value in coords])
        })
        # below rect seems to be geom bounding box coordinates: xmin, ymin, xmax,ymax
        annot_obj.update({
            NameObject('/Rect'):
            ArrayObject([
                FloatObject(min(xcoords)),
                FloatObject(min(ycoords)),
                FloatObject(max(xcoords)),
                FloatObject(max(ycoords))
            ])
        })
        annot_obj.pop('/AP')  # this is to get rid of the ghost shape
        annot_obj.update({NameObject('/T'): createStringObject(u'ERIS')})

        output = PdfFileWriter()
        output.addPage(page_geom)
        with open(annotPdf, "wb") as outputStream:
            output.write(outputStream)

    return annotPdf
def pdf_suffix_fields(self, page, sfx):
    """Append ``sfx`` to the ``/T`` entry of every annotation on ``page``.

    Pages without an ``/Annots`` entry and annotations without a ``/T``
    entry are left untouched instead of raising.

    :param page: page dictionary whose annotations are updated in place.
    :param sfx: string appended to each existing ``/T`` value.
    """
    if '/Annots' not in page:
        return
    for annot_ref in page['/Annots']:
        writer_annot = annot_ref.getObject()
        field_name = writer_annot.get('/T')
        if field_name is None:
            # No /T value to extend on this annotation.
            continue
        writer_annot.update({
            NameObject("/T"): createStringObject(field_name + sfx)
        })
def add_update_pdf_metadata(filename, update_dictionary):
    """Rewrite ``filename`` with its document information updated.

    The pages and existing document information of the PDF are copied into
    a new writer, the entries of ``update_dictionary`` are applied on top
    (each key gets a leading ``/``), the result is written to a temporary
    file and that file is moved over ``filename``.

    :param filename: path of the PDF to update in place.
    :param update_dictionary: mapping of info key (without the leading
        ``/``) to string value.
    :raises AssertionError: if any resulting metadata value is ``None``.
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member
    import os  # local import: only needed to close the mkstemp descriptor

    def add_prefix(value):
        return '/' + value

    full_update_dictionary = {add_prefix(k): v for k, v in update_dictionary.items()}

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()

        # documentInfo is None when the source has no /Info dictionary.
        info = pdf_input.documentInfo
        if info is None:
            info = {}

        # Later items win, so the requested updates override existing keys.
        full_update_dictionary = dict(chain(info.items(), full_update_dictionary.items()))

        for key, value in full_update_dictionary.items():
            # Raised explicitly so the check survives `python -O`.
            if value is None:
                raise AssertionError("metadata value for %s is None" % key)
            info_dict.update({NameObject(key): createStringObject(value)})

        os_file_out, temp_file_name = tempfile.mkstemp(
            prefix="email2pdf_add_update_pdf_metadata", suffix=".pdf")
        # mkstemp returns an open descriptor; close it right away so it is
        # not leaked and so the path can be reopened below on Windows.
        os.close(os_file_out)

        # Written while the input is still open.
        with open(temp_file_name, 'wb') as file_out:
            pdf_output.write(file_out)

    shutil.move(temp_file_name, filename)
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict, file_dict, file_bin):
    """Embed one additional attachment and record its file specification.

    The bytes in ``file_bin`` are stored in an /EmbeddedFile stream whose
    /Subtype is the guessed MIME type of the file name (falling back to
    ``application/octet-stream``) with ``/`` written as ``#2f``.  The stream
    and the /Filespec dictionary referencing it are both registered through
    ``pdf_filestream._addObject``.

    :param pdf_filestream: object providing ``_addObject``.
    :param name_arrayobj_cdict: dict updated in place; the file-name string
        object is mapped to the registered /Filespec object.
    :param file_dict: dict with ``filename`` and ``mod_date`` keys and an
        optional ``desc`` key.
    :param file_bin: content of the attachment.
    """
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    mod_date_pdf = _get_pdf_timestamp(file_dict['mod_date'])

    params_dict = DictionaryObject({
        NameObject('/CheckSum'): createStringObject(hashlib.md5(file_bin).hexdigest()),
        NameObject('/ModDate'): createStringObject(mod_date_pdf),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    # Fall back to a generic type when the extension is not recognised.
    file_mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
    subtype_name = '/' + file_mimetype.replace('/', '#2f')

    stream = DecodedStreamObject()
    stream.setData(file_bin)
    stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        NameObject("/Subtype"): NameObject(subtype_name),
    })
    stream_ref = pdf_filestream._addObject(stream)

    fname_obj = createStringObject(filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): DictionaryObject({
            NameObject("/F"): stream_ref,
        }),
        NameObject("/UF"): fname_obj,
    })
    name_arrayobj_cdict[fname_obj] = pdf_filestream._addObject(filespec_dict)
def write_pdf(filename, chapters, meta):
    """Concatenate chapter PDFs into ``filename`` with bookmarks and metadata.

    :param filename: path of the PDF to write.
    :param chapters: iterable of sequences whose first item is the path of
        a chapter PDF and whose second item is the bookmark title, or
        ``None`` for no bookmark.  Each bookmark points at the first page
        of its chapter in the merged document.
    :param meta: mapping with ``'title'`` and ``'author'`` entries.
    """
    output = PyPDF2.PdfFileWriter()

    info = output._info.getObject()
    info.update({
        NameObject('/Title'): createStringObject(uni(meta['title'])),
        NameObject('/Author'): createStringObject(uni(meta['author'])),
        NameObject('/Creator'): createStringObject('springer.py')
    })

    # Chapter files stay open until the output has been written, then all
    # of them are closed, also when an error occurs.
    sources = []
    try:
        page = 0
        for chapter in chapters:
            source = open(chapter[0], 'rb')
            sources.append(source)
            inp = PyPDF2.PdfFileReader(source)
            num_pages = inp.getNumPages()
            for i in range(num_pages):
                output.addPage(inp.getPage(i))
            if chapter[1] is not None:
                output.addBookmark(uni(chapter[1]), page)
            page += num_pages

        # open() instead of the Python 2-only file() builtin.
        with open(filename, 'wb') as fp:
            output.write(fp)
    finally:
        for source in sources:
            source.close()
def crea_cuaderno(nombre, listapdf, ruta='', generar=True):
    """Assemble a notebook PDF from a cover, objectives, sheets and notes.

    Page order: first page of the cover (``tapa_<nombre lowercased>.pdf``),
    first page of ``objetivos.pdf``, the first page of every PDF named in
    ``listapdf``, then the first page of ``observaciones.pdf`` twice.  The
    document information of the first listed PDF is copied and ``/Title``
    is set to ``nombre``.

    :param nombre: notebook name; used for the title, the cover file name
        and the output file name.
    :param listapdf: base names (without ``.pdf``) of the PDFs under
        ``ruta + '/pdf/'``; must not be empty.
    :param ruta: base path prepended to the input and output locations.
    :param generar: when true the PDF is written to
        ``ruta + 'documentos/Cuaderno <nombre>.pdf'`` and ``None`` is
        returned; otherwise the PDF is returned as bytes.
    :return: the PDF content as bytes when ``generar`` is false, else
        ``None``.
    """
    fins = [PdfFileReader(ruta + '/pdf/' + i + '.pdf') for i in listapdf]
    tapa = PdfFileReader(ruta + '/documentos/' + 'tapa_' + nombre.lower() + '.pdf')
    objetivos = PdfFileReader(ruta + '/documentos/' + 'objetivos.pdf')
    observaciones = PdfFileReader(ruta + '/documentos/' + 'observaciones.pdf')

    # metadata handling
    info_old = fins[0].getDocumentInfo()
    # NOTE(review): the result was never used; the call is kept in case
    # crea_nombre_pdf has side effects -- confirm and remove if it has none.
    crea_nombre_pdf(nombre)
    fo = PdfFileWriter()
    info_dict = fo._info.getObject()
    # getDocumentInfo() returns None when the PDF has no /Info dictionary.
    if info_old is not None:
        for key in info_old:
            info_dict.update({NameObject(key): createStringObject(info_old[key])})
    info_dict.update({NameObject('/Title'): createStringObject(nombre)})

    # build the page sequence
    # The former bare `fo.addPage()` call is gone: addPage requires a page
    # argument and raised TypeError before any page was added.
    fo.addPage(tapa.getPage(0))  # cover
    fo.addPage(objetivos.getPage(0))  # objectives
    for i in fins:
        fo.addPage(i.getPage(0))
    fo.addPage(observaciones.getPage(0))
    fo.addPage(observaciones.getPage(0))

    if generar:
        # NOTE(review): unlike the input paths there is no '/' between
        # `ruta` and 'documentos/'; path left unchanged -- confirm intent.
        with open(ruta + 'documentos/Cuaderno ' + nombre + '.pdf', 'wb') as destino:
            fo.write(destino)
        return None

    salida = BytesIO()
    fo.write(salida)
    pdf = salida.getvalue()
    salida.close()
    return pdf
def main(path, new_name):
    """Replace the ``/Title`` of the PDF at ``path`` with ``new_name``.

    The document is copied (pages and existing document information) into
    a new file ``path + 'out.pdf'`` with the new title, then the original
    is deleted and the new file is renamed to ``path``.

    :param path: path of the PDF to retitle in place.
    :param new_name: new title; converted to text before being stored.
    """
    import os

    # `unicode` only exists on Python 2; fall back to `str` elsewhere.
    try:
        text_type = unicode  # noqa: F821
    except NameError:
        text_type = str

    out_path = path + 'out.pdf'

    # open() replaces the Python 2-only file() builtin; the context
    # managers close both handles even if reading or writing fails.
    with open(path, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()

        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))

        info_dict = writer._info.getObject()

        # documentInfo is None when the PDF has no /Info dictionary.
        info = pdf_in.documentInfo
        if info is not None:
            for key in info:
                info_dict.update({NameObject(key): createStringObject(info[key])})

        # rename
        info_dict.update(
            {NameObject('/Title'): createStringObject(text_type(new_name))})

        # It does not appear possible to alter in place.
        with open(out_path, 'wb') as fout:
            writer.write(fout)

    os.unlink(path)
    os.rename(out_path, path)
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Append an attachment to a PDF.

    An /EmbeddedFile stream holding ``fdata`` and a /Filespec dictionary
    named ``fname`` are created and appended to the ``/Names`` array of the
    catalog's ``/Names`` -> ``/EmbeddedFiles`` entry.  Missing ``/Names``
    or ``/EmbeddedFiles`` dictionaries are created first, so the call also
    works when the catalog already has a ``/Names`` dictionary holding
    only other entries.

    :param writer: writer whose catalog receives the attachment.
    :param fname: file name recorded for the attachment.
    :param fdata: content of the attachment.
    :raises KeyError: if an existing ``/EmbeddedFiles`` dictionary has no
        ``/Names`` array.
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    ef_entry = DictionaryObject()
    ef_entry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): ef_entry,
    })

    root = writer._root_object
    if "/Names" not in root:
        # No name dictionary yet: create the root entry.
        root.update({NameObject("/Names"): DictionaryObject()})
    names_dictionary = root["/Names"]

    if "/EmbeddedFiles" not in names_dictionary:
        # No files attached yet (the name dictionary may still hold other
        # entries): start an empty embedded-files name array.
        embedded_files = DictionaryObject()
        embedded_files.update({NameObject("/Names"): ArrayObject()})
        names_dictionary.update({NameObject("/EmbeddedFiles"): embedded_files})

    # The array alternates file name and file specification.
    names_dictionary["/EmbeddedFiles"]["/Names"].extend(
        [createStringObject(fname), filespec])
def add_update_pdf_metadata(filename, update_dictionary):
    """Rewrite ``filename`` with its document information updated.

    The pages and existing document information of the PDF are copied into
    a new writer, the entries of ``update_dictionary`` are applied on top
    (each key gets a leading ``/``), the result is written to a temporary
    file and that file is moved over ``filename``.

    :param filename: path of the PDF to update in place.
    :param update_dictionary: mapping of info key (without the leading
        ``/``) to string value.
    :raises AssertionError: if any resulting metadata value is ``None``.
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member

    def add_prefix(value):
        return '/' + value

    full_update_dictionary = {
        add_prefix(k): v
        for k, v in update_dictionary.items()
    }

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()

        # documentInfo is None when the source has no /Info dictionary.
        info = pdf_input.documentInfo
        if info is None:
            info = {}

        # Later items win, so the requested updates override existing keys.
        full_update_dictionary = dict(
            chain(info.items(), full_update_dictionary.items()))

        for key, value in full_update_dictionary.items():
            # Raised explicitly so the check survives `python -O`.
            if value is None:
                raise AssertionError("metadata value for %s is None" % key)
            info_dict.update({NameObject(key): createStringObject(value)})

        os_file_out, temp_file_name = tempfile.mkstemp(
            prefix="email2pdf_add_update_pdf_metadata", suffix=".pdf")
        # Immediately close the file as created to work around issue on
        # Windows where file cannot be opened twice.
        os.close(os_file_out)

        # Written while the input is still open.
        with open(temp_file_name, 'wb') as file_out:
            pdf_output.write(file_out)

    shutil.move(temp_file_name, filename)
def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname, fdata):
    '''Embed the ZUGFeRD XML in the PDF and update the catalog and metadata.

    This method is inspired from the code of the addAttachment() method of
    the PyPDF2 lib.  It adds an /EmbeddedFile stream holding ``fdata``, a
    /Filespec dictionary named ``fname``, an XMP metadata stream built by
    ``self._prepare_pdf_metadata()`` and sets /AF, /Metadata and /Names on
    the catalog.  The document information from
    ``self._prepare_pdf_info()`` is passed to ``addMetadata``.

    :param pdf_filestream: writer object that is updated in place.
    :param fname: file name recorded for the embedded XML.
    :param fdata: content of the embedded XML file.
    '''
    # The entry for the file
    moddate = DictionaryObject()
    moddate.update({
        NameObject('/ModDate'): createStringObject(self._get_pdf_timestamp())
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): moddate,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_entry = DictionaryObject()
    ef_entry.update({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    fname_obj = createStringObject(fname)
    filespec = DictionaryObject()
    filespec.update({
        NameObject("/AFRelationship"): NameObject("/Alternative"),
        NameObject("/Desc"): createStringObject("ZUGFeRD Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_entry,
        NameObject("/UF"): fname_obj,
    })
    # Registered once and shared: the name array and /AF must reference the
    # same file specification, not two separate copies of it.
    filespec_obj = pdf_filestream._addObject(filespec)

    embedded_files_names_dictionary = DictionaryObject()
    embedded_files_names_dictionary.update({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj])
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dictionary = DictionaryObject()
    embedded_files_dictionary.update(
        {NameObject("/EmbeddedFiles"): embedded_files_names_dictionary})

    # Update the root
    metadata_xml_str = self._prepare_pdf_metadata()
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    # Mark the stream as an XML metadata stream.
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_value = pdf_filestream._addObject(metadata_file_entry)
    af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value,
        NameObject("/Metadata"): metadata_value,
        NameObject("/Names"): embedded_files_dictionary,
    })

    info_dict = self._prepare_pdf_info()
    pdf_filestream.addMetadata(info_dict)
def _blank_core_properties(package, meta_fields):
    """Blank the text core properties of ``package`` and reset its dates."""
    for meta_field in meta_fields:
        setattr(package.core_properties, meta_field, '')
    setattr(package.core_properties, 'created', DEFAULT_DATE)
    setattr(package.core_properties, 'modified', DEFAULT_DATE)
    setattr(package.core_properties, 'last_printed', DEFAULT_DATE)
    setattr(package.core_properties, 'revision', 1)


def _scrub_core_xml(xml):
    """Return the ``docProps/core.xml`` text with its metadata blanked."""
    replacements = (
        (r'<dc:title>[^<]{1,1000}</dc:title>',
         '<dc:title></dc:title>'),
        (r'<dc:subject>[^<]{1,500}</dc:subject>',
         '<dc:subject></dc:subject>'),
        (r'<dc:creator>[^<]{1,300}</dc:creator>',
         '<dc:creator></dc:creator>'),
        (r'<dc:description>[^<]{1,2500}</dc:description>',
         '<dc:description></dc:description>'),
        (r'<cp:keywords>[^<]{1,1000}</cp:keywords>',
         '<cp:keywords></cp:keywords>'),
        (r'<cp:lastModifiedBy>[^<]{1,300}</cp:lastModifiedBy>',
         '<cp:lastModifiedBy></cp:lastModifiedBy>'),
        (r'<cp:category>[^<]{1,300}</cp:category>',
         '<cp:category></cp:category>'),
        (r'<cp:contentStatus>[^<]{1,100}</cp:contentStatus>',
         '<cp:contentStatus></cp:contentStatus>'),
        # The closing tag used to be emitted without its '>', which left
        # core.xml malformed.
        (r'<cp:revision>[^<]{1,10}</cp:revision>',
         '<cp:revision></cp:revision>'),
        # replace all date-time fields with the default date
        (r':W3CDTF">[^<]{1,25}</dcterms:',
         ':W3CDTF">2001-01-01T00:00:00Z</dcterms:'),
    )
    for pattern, replacement in replacements:
        xml = re.sub(pattern, replacement, xml)
    return xml


def clear_metadata(src, dst):
    """Copy ``src`` to ``dst`` with the document metadata removed.

    Both paths are prefixed with ``settings.LEADING_SLASH``.  The handling
    is selected by the extension of ``dst``: ``.docx`` and ``.pptx`` get
    their core properties blanked, ``.pdf`` is rewritten with empty title,
    author, subject and creator, and ``.xlsx`` is copied entry by entry
    with ``docProps/core.xml`` scrubbed.  Other extensions do nothing.
    Any exception is reported through ``log_message`` and not re-raised.

    :param src: path of the source file (without the leading slash).
    :param dst: path of the cleaned copy (without the leading slash).
    """
    src = settings.LEADING_SLASH + src
    dst = settings.LEADING_SLASH + dst
    ext = os.path.splitext(dst)[1]  # assumed to be in lower case!
    meta_fields = [
        'author', 'category', 'comments', 'content_status', 'identifier',
        'keywords', 'last_modified_by', 'language', 'subject', 'title',
        'version'
    ]
    try:
        if ext in ['.docx']:
            with open(src, 'rb') as f:
                doc = Document(f)
            _blank_core_properties(doc, meta_fields)
            doc.save(dst)
            clean_xml_in_zip(dst)
        elif ext in ['.pptx']:
            prs = Presentation(src)
            _blank_core_properties(prs, meta_fields)
            prs.save(dst)
            clean_xml_in_zip(dst)
        elif ext == '.pdf':
            # open() replaces the Python 2-only file() builtin; the source
            # stays open until the copy has been written.
            with open(src, 'rb') as fin:
                inp = PdfFileReader(fin)
                outp = PdfFileWriter()
                for page in range(inp.getNumPages()):
                    outp.addPage(inp.getPage(page))
                info_dict = outp._info.getObject()
                info_dict.update({
                    NameObject('/Title'): createStringObject(u''),
                    NameObject('/Author'): createStringObject(u''),
                    NameObject('/Subject'): createStringObject(u''),
                    NameObject('/Creator'): createStringObject(u'')
                })
                with open(dst, 'wb') as fout:
                    outp.write(fout)
        elif ext == '.xlsx':
            file_to_clear = 'docProps/core.xml'
            # create a copy of the Excel file while "cleaning" docProps/core.xml
            # NOTE(review): on Python 3 ZipFile.read() returns bytes while
            # the patterns are str, so re.sub raises TypeError there (it is
            # caught and logged below) -- confirm the target interpreter.
            with ZipFile(src, 'r') as src_zip:
                with ZipFile(dst, 'w') as dst_zip:
                    dst_zip.comment = src_zip.comment  # preserve the comment (if any)
                    for item in src_zip.infolist():
                        data = src_zip.read(item.filename)
                        if item.filename == file_to_clear:
                            data = _scrub_core_xml(data)
                        dst_zip.writestr(item, data)
    except Exception as e:
        log_message('Exception while removing metadata from a %s file: %s' % (ext, str(e)))
def create_pdf(self):
    """Regenerate ``self.pdf`` from the pages when it is missing or stale.

    Nothing is rendered unless the object is published and either has no
    ``pdf_filename`` or ``pdf_updated`` is older than ``published``; an
    unpublished object gets ``self.pdf = None``.  Pages are drawn with
    cairo (the vector image when the page has one, otherwise the bitmap),
    then the result is copied through PyPdf to set the title and author.
    """
    if not self.published:
        self.pdf = None
        return
    if self.pdf_filename and not (self.pdf_updated < self.published):
        # The existing PDF is up to date.
        return

    with tempfile.SpooledTemporaryFile(SPOOL_LIMIT) as rendered:
        # The initial surface size is arbitrary: it is set again for each
        # page before that page is drawn.
        surface = cairo.PDFSurface(rendered, 144.0, 144.0)
        context = cairo.Context(surface)

        current = self.first_page
        while current:
            context.save()
            try:
                if current.vector_filename:
                    # Vector image: load the SVG and size the page from
                    # its own dimensions and resolution.
                    svg = Rsvg.Handle()
                    shutil.copyfileobj(current.vector, svg)
                    svg.close()
                    surface.set_size(
                        svg.props.width / svg.props.dpi_x * 72.0,
                        svg.props.height / svg.props.dpi_y * 72.0)
                    context.scale(
                        72.0 / svg.props.dpi_x,
                        72.0 / svg.props.dpi_y)
                    svg.render_cairo(context)
                else:
                    # Bitmap image (NOTE all bitmaps are assumed to be
                    # 96dpi here).
                    img = cairo.ImageSurface.create_from_png(current.bitmap)
                    surface.set_size(
                        img.get_width() / 96.0 * 72.0,
                        img.get_height() / 96.0 * 72.0)
                    context.scale(72.0 / 96.0, 72.0 / 96.0)
                    context.set_source_surface(img)
                    context.paint()
                context.show_page()
            finally:
                context.restore()
            current = current.next_page
        surface.finish()

        # cairo provides no PDF metadata manipulation, so PyPdf is used to
        # build a new PDF with new metadata and the same pages.
        rendered.seek(0)
        pdf_in = PdfFileReader(rendered)
        pdf_out = PdfFileWriter()

        title = '%s - Issue #%d - %s' % (
            self.comic.title,
            self.issue_number,
            self.title,
        )
        author = self.comic.author.name if self.comic.author else 'Anonymous'

        pdf_info = pdf_out._info.getObject()
        pdf_info.update(pdf_in.documentInfo)
        pdf_info.update({
            NameObject('/Title'): createStringObject(title),
            NameObject('/Author'): createStringObject(author),
        })

        for index in range(pdf_in.getNumPages()):
            pdf_out.addPage(pdf_in.getPage(index))

        # Written while the rendered file is still open.
        # NOTE(review): presumably assigning self.pdf consumes the file
        # before this block closes it -- confirm against the attribute.
        with tempfile.SpooledTemporaryFile(SPOOL_LIMIT) as rewritten:
            pdf_out.write(rewritten)
            rewritten.seek(0)
            self.pdf = rewritten
def convert_to_pdfa(self):
    """
    Transform the opened PDF file into a PDF/A compliant file

    The method replaces the file header, sets a document ID, registers an
    sRGB output intent, rewrites the glyph width array of every descendant
    font (only when ``TTFont`` is available), sets the outline count and
    the creator/producer metadata, and finally sets ``self.is_pdfa``.
    """
    # Set the PDF version to 1.7 (as PDF/A-3 is based on version 1.7) and make it PDF/A compliant.
    # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1

    # " The file header shall begin at byte zero and shall consist of "%PDF-1.n" followed by a single EOL marker,
    # where 'n' is a single digit number between 0 (30h) and 7 (37h) "
    # " The aforementioned EOL marker shall be immediately followed by a % (25h) character followed by at least four
    # bytes, each of whose encoded byte values shall have a decimal value greater than 127 "
    self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

    # Add a document ID to the trailer. This is only needed when using encryption with regular PDF, but is required
    # when using PDF/A
    pdf_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
    # The first string is based on the content at the time of creating the file, while the second is based on the
    # content of the file when it was last updated. When creating a PDF, both are set to the same value.
    self._ID = ArrayObject((pdf_id, pdf_id))

    # Read the ICC colour profile and compress it for embedding.
    with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_profile:
        icc_profile_file_data = compress(icc_profile.read())

    # The stream data is already compressed, hence the /FlateDecode filter.
    icc_profile_stream_obj = DecodedStreamObject()
    icc_profile_stream_obj.setData(icc_profile_file_data)
    icc_profile_stream_obj.update({
        NameObject("/Filter"): NameObject("/FlateDecode"),
        NameObject("/N"): NumberObject(3),
        NameObject("/Length"): NameObject(str(len(icc_profile_file_data))),
    })

    icc_profile_obj = self._addObject(icc_profile_stream_obj)

    # Output intent referencing the embedded profile.
    output_intent_dict_obj = DictionaryObject()
    output_intent_dict_obj.update({
        NameObject("/S"): NameObject("/GTS_PDFA1"),
        NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
        NameObject("/DestOutputProfile"): icc_profile_obj,
        NameObject("/Type"): NameObject("/OutputIntent"),
    })

    output_intent_obj = self._addObject(output_intent_dict_obj)
    self._root_object.update({
        NameObject("/OutputIntents"): ArrayObject([output_intent_obj]),
    })

    pages = self._root_object['/Pages']['/Kids']

    # PDF/A needs the glyphs width array embedded in the pdf to be consistent with the ones from the font file.
    # But it seems like it is not the case when exporting from wkhtmltopdf.
    if TTFont:
        fonts = {}
        # First browse through all the pages of the pdf file, to get a reference to all the fonts used in the PDF.
        # NOTE(review): every page is expected to have /Resources -> /Font and every font /DescendantFonts;
        # a page or font without them raises KeyError here.
        for page in pages:
            for font in page.getObject()['/Resources']['/Font'].values():
                for descendant in font.getObject()['/DescendantFonts']:
                    # Keyed by object number so a font shared between pages is processed once.
                    fonts[descendant.idnum] = descendant.getObject()

        # Then for each font, rewrite the width array with the information taken directly from the font file.
        # The new width are calculated such as width = round(1000 * font_glyph_width / font_units_per_em)
        # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
        for font in fonts.values():
            font_file = font['/FontDescriptor']['/FontFile2']
            stream = io.BytesIO(decompress(font_file._data))
            ttfont = TTFont(stream)
            font_upm = ttfont['head'].unitsPerEm
            glyphs = ttfont.getGlyphSet()._hmtx.metrics
            glyph_widths = []
            for key, values in glyphs.items():
                # Only entries whose name starts with 'glyph' contribute a width.
                if key[:5] == 'glyph':
                    glyph_widths.append(NumberObject(round(1000.0 * values[0] / font_upm)))

            font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(glyph_widths)])
            stream.close()
    else:
        _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')

    # NOTE(review): raises KeyError when the catalog has no /Outlines entry.
    outlines = self._root_object['/Outlines'].getObject()
    outlines[NameObject('/Count')] = NumberObject(1)

    # Set odoo as producer
    self.addMetadata({
        '/Creator': "Odoo",
        '/Producer': "Odoo",
    })
    self.is_pdfa = True
# Fetch the properties of the original document (not essential to how the
# script works).
props = pdf_in.documentInfo

# Define the writer.
writer = PdfFileWriter()

# Copy the pages (content) into the writer.
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

# Reference the protected member of the writer class (_info): the info
# dictionary, i.e. the property dictionary.
infoDict = writer._info.getObject()

# Copy the properties of the old document into the writer.  documentInfo
# is None when the source has no /Info dictionary, so guard the loop.
if props is not None:
    for key in props:
        infoDict.update({NameObject(key): createStringObject(props[key])})

# Assign a title in the properties.
infoDict.update({NameObject('/Title'): createStringObject('test')})

# Open the output file and write everything to it; the context manager
# closes the new file even if writing fails.
with open(output_file, 'wb') as pdf_out:
    writer.write(pdf_out)

# Close the old file.
file.close()

# Delete the original document and rename the new document.
def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
    '''Embed the Factur-X XML in the PDF and update the catalog and metadata.

    This method is inspired from the code of the addAttachment() method of
    the PyPDF2 lib.  It adds an /EmbeddedFile stream holding
    ``self.factx.xml_str``, its /Filespec dictionary, the output intents
    and an XMP metadata stream, then sets /AF, /Metadata, /Names and
    /PageMode on the catalog and passes the textual metadata to
    ``addMetadata``.

    :param pdf_metadata: metadata handed to ``_prepare_pdf_metadata_xml``
        and ``_prepare_pdf_metadata_txt``.
    :param output_intents: iterable of ``(output_intent_dict,
        dest_output_profile_dict)`` pairs; each ``output_intent_dict`` is
        updated in place with its /DestOutputProfile reference.
    '''
    # The entry for the file
    facturx_xml_str = self.factx.xml_str
    # The checksum must cover the embedded XML; it used to be the MD5 of
    # empty input, i.e. the same constant for every file.
    if isinstance(facturx_xml_str, bytes):
        checksum_input = facturx_xml_str
    else:
        checksum_input = facturx_xml_str.encode('utf-8')
    md5sum = hashlib.md5(checksum_input).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = self._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    xmp_filename = self.factx.flavor.details['xmp_filename']
    fname_obj = createStringObject(xmp_filename)
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = self._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}

    # TODO: add back additional attachments?
    logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
    name_arrayobj_content_sort = list(
        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)

    # The name array alternates file name and file specification; /AF
    # lists the same file specifications.
    name_arrayobj_content_final = []
    af_list = []
    for (fname_obj, filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [fname_obj, filespec_obj]
        af_list.append(filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })

    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    logger.debug('output_intents=%s', output_intents)
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = self._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = self._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)

    # Update the root
    xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
    xmp_template = self.factx.flavor.get_xmp_xml()
    metadata_xml_str = _prepare_pdf_metadata_xml(
        xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = self._addObject(metadata_file_entry)
    af_value_obj = self._addObject(ArrayObject(af_list))
    self._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    logger.debug('res_output_intents=%s', res_output_intents)
    if res_output_intents:
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    self.addMetadata(metadata_txt_dict)
props = pdf_in.documentInfo

# Define the writer.
writer = PdfFileWriter()

# Copy the pages (content) into the writer.
print(range(pdf_in.getNumPages()))
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

# Reference the protected member of the writer class (_info): the info
# dictionary, i.e. the property dictionary.
infoDict = writer._info.getObject()

# Copy the properties of the old document into the writer.  documentInfo
# is None when the source has no /Info dictionary, so guard the loop.
if props is not None:
    for key in props:
        infoDict.update({NameObject(key): createStringObject(props[key])})

# Assign the title and keywords in the properties; a value of None leaves
# the corresponding property as copied.  Other keys ('/Subject',
# '/Category', '/Comments') can be set the same way.
title = 'titletest'
keywords = None
if title is not None:
    infoDict.update({NameObject('/Title'): createStringObject(title)})
if keywords is not None:
    infoDict.update({NameObject('/Keywords'): createStringObject(keywords)})

# Open the output file.
from PyPDF2 import PdfFileWriter, PdfFileReader
from PyPDF2.generic import NameObject, createStringObject

# Source PDF. Raw strings keep the Windows backslashes literal on every
# Python version (a plain "\U..." is a syntax error on Python 3).
# filename=raw_input("Pdf file hyperlink?")
filename = r"M:\Engineering\ESo\Standard Plan\Updated\OCS\Typical Pushoff Cantilever Assemblies Layout1 (1).pdf"
# The handle is deliberately left open: the pages added to the writer below
# still refer to this reader.
fin = open(filename, 'rb')
pdf_in = PdfFileReader(fin)

# Read the comma-separated attribute rows from the .txt file produced by
# Attext.
# AttFile=raw_input("File with attributes?")
AttFile = r"C:\Users\jli\Documents\CurrentConfiguration\Typical Pushoff Cantilever Assemblies.txt"
with open(AttFile, 'r') as AttFiletemp:
    Attributes = [row.strip().split(',') for row in AttFiletemp]

# The title is built from fields 0, 1 and 3 of the first row, joined with
# ';' and stripped of single quotes.
first_row = Attributes[0]
AttributesTitle = ';'.join((first_row[0], first_row[1], first_row[3]))
AttributesTitle = AttributesTitle.replace("'", '')

# New PDF holding a copy of every page of the source.
writer = PdfFileWriter()
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

infoDict = writer._info.getObject()

# Copy the existing document information, then set the title. The title is
# written unconditionally so it is also set when the source PDF carries no
# metadata at all.
info = pdf_in.documentInfo
if info is not None:
    for key in info:
        try:
            infoDict.update({NameObject(key): createStringObject(info[key])})
        except TypeError:
            # createStringObject only accepts string values; skip others.
            pass
infoDict.update({NameObject('/Title'): createStringObject(AttributesTitle)})

print(infoDict)
raise Exception("No files found in the PDF metadata directory. No work to be done.") print "Found", files_found, "files" # for each metadata PDF for file_no in range(files_found): current_filename = metadata_files[file_no] # Load the original PDF metadata if os.path.isfile(os.path.join(metadata_path,current_filename)): pdf_metadata_input = PdfFileReader(open(os.path.join(metadata_path,current_filename), "rb")) pdf_metadata = pdf_metadata_input.getDocumentInfo() # If there is a Title field grab it if pdf_metadata.title != None: pdf_metadata.update({ NameObject('/Title'): createStringObject(titlecase(pdf_metadata.title)) }) pdf_title = pdf_metadata.title else: pdf_title = '' # If there is a Producer field set it to "" if pdf_metadata.producer != None: pdf_metadata.update({ NameObject('/Producer'): createStringObject(u'') }) # If the same file name exists in the PDF directory load it if os.path.isfile(os.path.join(pdf_path,current_filename)): pdf_input = PdfFileReader(open(os.path.join(pdf_path,current_filename), "rb"))
# Metadata for the output PDF: a copy of the image PDF's document info with
# this tool appended to (or set as) the producer.
info_dict_output = {}
ipdf_info = imagepdf.documentInfo

# Our signature as a producer
our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)"
PRODUCER_KEY = "/Producer"
read_producer = False

if ipdf_info is not None:
    for key in ipdf_info:
        value = ipdf_info[key]
        if key == PRODUCER_KEY:
            # Keep the original producer and append our own signature.
            value += "; " + our_name
            read_producer = True
        try:
            # Probe only: check that the pypdf API accepts this value.
            createStringObject(value)
        except TypeError:
            # This can happen with some array properties.
            print("Warning: property " + key + " not copied to final PDF")
        else:
            info_dict_output[key] = value

# No producer entry in the source: we are the only producer.
if not read_producer:
    info_dict_output[PRODUCER_KEY] = our_name

output.addMetadata(info_dict_output)

with open(sys.argv[3], 'wb') as f:
    output.write(f)
# print("Img:", imagepage.mediaBox.upperRight) # print("Text:", textpage.mediaBox.upperRight) factor_x = textpage.mediaBox.upperRight[0] / imagepage.mediaBox.upperRight[0] factor_y = textpage.mediaBox.upperRight[1] / imagepage.mediaBox.upperRight[1] # print(factor_x, factor_y) imagepage.scale(float(factor_x), float(factor_y)) textpage.mergePage(imagepage) # imagepage stay on top textpage.compressContentStreams() output.addPage(textpage) # info_dict_output = output._info.getObject() ipdf_info = imagepdf.documentInfo # Our signature as a producer our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)" read_producer = False PRODUCER_KEY = "/Producer" for key in ipdf_info: value = ipdf_info[key] if key == PRODUCER_KEY: value = value + "; " + our_name read_producer = True # info_dict_output.update({NameObject(key): createStringObject(value)}) # if not read_producer: info_dict_output.update({NameObject(PRODUCER_KEY): createStringObject(our_name)}) # with open(sys.argv[3], 'wb') as f: output.write(f) #
def _facturx_update_metadata_add_attachment(
        pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
        output_intents):
    """Embed the Factur-X XML in the PDF writer and update its metadata.

    The XML is added as an embedded file named ``ZUGFeRD-invoice.xml``,
    referenced from the document root through ``/AF`` and
    ``/Names /EmbeddedFiles``. The root also receives an XML ``/Metadata``
    stream, ``/PageMode /UseAttachments`` and, when any are given,
    ``/OutputIntents``. Finally the document info is set through
    ``addMetadata``.

    :param pdf_filestream: writer object the objects are added to (its
        ``_addObject``, ``_root_object`` and ``addMetadata`` are used)
    :param facturx_xml_str: content of the XML file to embed; it is passed
        to ``hashlib.md5`` and ``setData`` as is
    :param pdf_metadata: passed to ``_prepare_pdf_metadata_xml`` and
        ``_prepare_pdf_metadata_txt``
    :param facturx_level: passed to ``_prepare_pdf_metadata_xml``
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each
        ``output_intent_dict`` is modified in place (``/DestOutputProfile``
        is set)
    :return: None
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): createStringObject(md5sum),
        # PDF date strings use the form D:YYYYMMDDHHmmSS, not ISO 8601
        NameObject('/ModDate'): createStringObject(
            datetime.datetime.now().strftime("D:%Y%m%d%H%M%S")),
        # NOTE(review): the size is wrapped in a NameObject, presumably so
        # the digits are written out verbatim - confirm
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)

    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })
    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)

    # The names array alternates file name and Filespec reference; there is
    # a single attachment, so no sorting is needed.
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject([fname_obj, filespec_obj]),
    })
    af_list = [filespec_obj]

    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })

    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        res_output_intents.append(
            pdf_filestream._addObject(output_intent_dict))

    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })

    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
0] factor_y = textpage.mediaBox.upperRight[1] / imagepage.mediaBox.upperRight[ 1] # print(factor_x, factor_y) imagepage.scale(float(factor_x), float(factor_y)) textpage.mergePage(imagepage) # imagepage stay on top textpage.compressContentStreams() output.addPage(textpage) # info_dict_output = output._info.getObject() ipdf_info = imagepdf.documentInfo # Our signature as a producer our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)" read_producer = False PRODUCER_KEY = "/Producer" for key in ipdf_info: value = ipdf_info[key] if key == PRODUCER_KEY: value = value + "; " + our_name read_producer = True # info_dict_output.update({NameObject(key): createStringObject(value)}) # if not read_producer: info_dict_output.update( {NameObject(PRODUCER_KEY): createStringObject(our_name)}) # with open(sys.argv[3], 'wb') as f: output.write(f) #
def main():
    """Write a copy of every PDF in a folder with its metadata replaced.

    The folder is taken from the single optional command-line argument and
    defaults to the current directory. For each PDF returned by
    ``find_ext`` the original title, author, subject, producer and creator
    are printed, and a copy with fixed placeholder metadata is written to
    ``<folder>/results/<name>-sanitized.pdf``.

    Exits with status -1 when more than one argument is given.
    """
    # TODO: parse the command line with argparse/getopt instead
    my_path = "."  # default path is the current folder
    if len(sys.argv) == 2:
        my_path = sys.argv[1]
    elif len(sys.argv) > 2:
        print("Something went wrong. Check your arguments.")
        sys.exit(-1)

    # create results directory if not exists
    results_dir = os.path.join(my_path, "results")
    os.makedirs(results_dir, exist_ok=True)
    print("Path given: ", my_path, "\n")

    # get all pdf files from specified folder
    all_pdf_files = find_ext(my_path, "pdf")

    for c, pdf in enumerate(all_pdf_files, start=1):
        # The input must stay open until the output is written, because the
        # pages added to the writer still refer to the reader; the `with`
        # block then closes it instead of leaking one handle per file.
        with open(pdf, "rb") as input_stream:
            # strict=False for Windows support - PdfReadWarning:
            # Superfluous whitespace found in object header
            inputPdf = PdfFileReader(input_stream, strict=False)
            # None when the PDF has no document information
            docInfo = inputPdf.getDocumentInfo()

            # create metadata patch
            output = PdfFileWriter()
            infoDict = output._info.getObject()
            infoDict.update({
                NameObject('/Title'): createStringObject(u'title removed'),
                NameObject('/Author'): createStringObject(u'author removed'),
                NameObject('/Subject'): createStringObject(u'subject removed'),
                NameObject('/Creator'):
                    createStringObject(u'software quality script'),
                NameObject('/Producer'):
                    createStringObject(u'software quality script'),
                NameObject('/Keywords'):
                    createStringObject(u'software, quality, sanitized'),
            })

            # get filename from path and remove the extension
            filename = os.path.basename(pdf)
            filename_noext = os.path.splitext(filename)[0]

            # print some information; getattr prints None for every field
            # when the document has no info instead of raising
            print(c, ". Processing file: ", pdf)
            print("filename: ", filename)
            print("Title: ", getattr(docInfo, "title", None))
            print("Author: ", getattr(docInfo, "author", None))
            print("Subject: ", getattr(docInfo, "subject", None))
            print("Producer: ", getattr(docInfo, "producer", None))
            print("Creator: ", getattr(docInfo, "creator", None))

            # create new pdf
            for page in range(inputPdf.getNumPages()):
                output.addPage(inputPdf.getPage(page))
            print("\n")

            # write new file in results subfolder
            # the new file name will be the old one extended with
            # "-sanitized.pdf"
            output_path = os.path.join(
                results_dir, filename_noext + "-sanitized.pdf")
            with open(output_path, 'wb') as outputStream:
                output.write(outputStream)
def add_background_to_pdf(
    filename_in,
    filename_out=None,
    filename_letterhead=None,
    filename_background=None,
    new_title="",
    new_author="",
):
    """Merge a one-page letterhead onto a PDF and set its title and author.

    The first page of ``filename_in`` is merged onto the first page of
    ``filename_letterhead``; every following page is merged onto the first
    page of ``filename_background``. The document info of the input is
    copied to the result, with ``/Title`` and ``/Author`` replaced.

    :param filename_in: path of the PDF to decorate
    :param filename_out: path of the result; when empty, the result is
        written to a temporary file which then replaces ``filename_in``
    :param filename_letterhead: path of the letterhead PDF; when empty the
        function does nothing
    :param filename_background: path of the PDF used behind pages 2..n;
        defaults to the letterhead
    :param new_title: value written to ``/Title``
    :param new_author: value written to ``/Author``
    :return: None
    """
    if not filename_letterhead:
        return

    use_tmpfile = not filename_out
    if use_tmpfile:
        # Only the name is needed; close the handle right away so it is not
        # leaked and the path can be reopened for writing below.
        with tempfile.NamedTemporaryFile(
            mode="w+b", delete=False, suffix=".pdf"
        ) as tmp_file:
            filename_out = tmp_file.name

    if not filename_background:
        filename_background = filename_letterhead

    with open(filename_in, "rb") as pdf_in, open(
        filename_background, "rb"
    ) as pdf_lb, open(filename_letterhead, "rb") as pdf_lh:
        input_pdf = PdfFileReader(pdf_in)
        output_pdf = PdfFileWriter()

        # metadata
        # noinspection PyProtectedMember
        infodict = output_pdf._info.getObject()
        source_info = input_pdf.documentInfo
        # documentInfo is None for a PDF without an /Info dictionary
        if source_info is not None:
            for key in source_info:
                # Item access (unlike .items()) resolves indirect
                # references to the actual value.
                infodict.update(
                    {NameObject(key): createStringObject(source_info[key])}
                )
        infodict.update({NameObject("/Title"): createStringObject(new_title)})
        infodict.update({NameObject("/Author"): createStringObject(new_author)})

        # add first page
        # get the first invoice page, merge with letterhead
        letterhead = PdfFileReader(pdf_lh).getPage(0)
        letterhead.mergePage(input_pdf.getPage(0))
        output_pdf.addPage(letterhead)

        # add other pages
        for i in range(1, input_pdf.getNumPages()):
            # mergePage modifies the page it is called on, so a fresh copy
            # of the background page is read for every page
            background = PdfFileReader(pdf_lb).getPage(0)
            background.mergePage(input_pdf.getPage(i))
            output_pdf.addPage(background)

        # save pdf
        with open(filename_out, "wb") as pdf_out:
            output_pdf.write(pdf_out)

    if use_tmpfile:
        # Replace the input with the result, keeping the original as a
        # backup until the swap has succeeded.
        backup_pdf = "{}.bak".format(filename_in)
        shutil.move(filename_in, backup_pdf)
        shutil.move(filename_out, filename_in)
        os.remove(backup_pdf)