def editPDF(filename):
    """Write a copy of a PDF with ``/Tags`` and ``/Keywords`` metadata added.

    Every page of ``filename`` is copied into a fresh document whose info
    dictionary receives the value stored under ``filename`` in the
    module-level ``tag_dict``.  The copy is written into the module-level
    ``directory`` under the input name with its extension replaced by
    ``_updated.pdf``.

    Args:
        filename: Path of the PDF to read; also used as the key into
            ``tag_dict``.

    Raises:
        KeyError: If ``filename`` is missing from ``tag_dict`` (assuming
            ``tag_dict`` is a plain dict -- it is defined outside this
            function).
    """
    # Drop whatever extension is present instead of assuming it is exactly
    # four characters long.
    output_name = os.path.splitext(filename)[0] + '_updated.pdf'

    # Resolve the tags before any file is opened so that a missing entry
    # cannot leave a dangling handle behind.
    tags = tag_dict[filename]

    writer = PdfFileWriter()
    info = writer._info.getObject()

    # These two entries are only examples; add whichever metadata keys are
    # actually needed.
    info.update({
        NameObject('/Tags'): createStringObject(tags),
        NameObject('/Keywords'): createStringObject(tags),
    })

    # The reader is used while the output is written, so the input must stay
    # open until the write is finished.
    with open(filename, 'rb') as fin:
        reader = PdfFileReader(fin)
        for page_number in range(reader.getNumPages()):
            writer.addPage(reader.getPage(page_number))

        with open(os.path.join(directory, output_name), 'wb') as fout:
            writer.write(fout)
def __iter__(self):
    """Yield the serialized PDF document as consecutive string chunks.

    Mostly based on ``PdfFileWriter.write``, but the output is produced
    incrementally: the header is yielded before the indirect references are
    swept, so a consumer (e.g. a download) starts receiving data even when
    ``_sweepIndirectReferences`` takes a long time.

    Chunks are yielded in this order: the header line, one
    ``"<n> 0 obj ... endobj"`` chunk per entry of ``self._objects``, the
    cross-reference table, the trailer dictionary, and finally the
    ``startxref`` / ``%%EOF`` footer.

    DeprecationWarning is ignored while the generator runs.  The filter is
    set back to ``'default'`` when iteration ends for any reason, including
    an exception or the consumer abandoning the generator early.
    """
    warnings.simplefilter('ignore', DeprecationWarning)
    try:
        # Offset of every object from the start of the output, needed for
        # the xref table.
        object_positions = []

        header = self._header + "\n"
        yield header
        length = len(header)

        externalReferenceMap = {}
        self.set = set()
        self._sweepIndirectReferences(externalReferenceMap, self._root)
        del self.set

        stream = StringIO.StringIO()
        for index, obj in enumerate(self._objects):
            object_positions.append(length)
            opening = str(index + 1) + " 0 obj\n"
            obj.writeToStream(stream, None)
            body = stream.getvalue() + "\nendobj\n"
            yield opening + body
            length += len(opening) + len(body)
            # Empty the scratch buffer for the next object.  seek(0) is used
            # instead of reset() because only cStringIO objects provide
            # reset(); seek(0) works for both StringIO flavours.
            stream.seek(0)
            stream.truncate()

        # xref table
        xref_location = length
        yield "xref\n"
        yield "0 %s\n" % (len(self._objects) + 1)
        yield "0000000000 65535 f \n"
        for offset in object_positions:
            yield "%010d 00000 n \n" % offset

        # trailer
        yield "trailer\n"
        trailer = DictionaryObject()
        trailer.update({
            NameObject("/Size"): NumberObject(len(self._objects) + 1),
            NameObject("/Root"): self._root,
            NameObject("/Info"): self._info,
        })
        if hasattr(self, "_ID"):
            trailer[NameObject("/ID")] = self._ID
        if hasattr(self, "_encrypt"):
            trailer[NameObject("/Encrypt")] = self._encrypt
        trailer.writeToStream(stream, None)
        yield stream.getvalue()

        # eof
        yield "\nstartxref\n%s\n%%%%EOF\n" % (xref_location)
    finally:
        # Runs on normal completion, on errors, and on generator close, so
        # the 'ignore' filter never outlives the iteration.
        warnings.simplefilter('default', DeprecationWarning)
def addCopyrightToPDF(pdf_file_location, pdf_file_destination, copyrightText,
                      drawText=True, title="", authors=""):
    """Stamp copyright text on the last page of a PDF and set its metadata.

    A one-page overlay is rendered with ReportLab (letter page size,
    Times-Roman at 7pt) and merged onto the last page of the source
    document.  All pages are then written to ``pdf_file_destination`` with
    ``/Title`` and ``/Author`` set in the info dictionary.

    Args:
        pdf_file_location: Path of the PDF to read.
        pdf_file_destination: Path the stamped PDF is written to.
        copyrightText: Text to stamp.  Only its first two newline-separated
            lines are used: the first is drawn at y=40, the second (when
            present) at y=50.
        drawText: When false, an empty string is drawn instead, so the
            overlay page carries no visible text.
        title: Value stored under ``/Title``.
        authors: Value stored under ``/Author``.
    """
    packet = StringIO.StringIO()

    # Create the overlay page with ReportLab.
    can = canvas.Canvas(packet, pagesize=letter)
    can.setFont("Times-Roman", 7)
    if drawText:
        ctext = copyrightText.split("\n")
        can.drawString(30, 40, ctext[0])
        # A single-line copyright text has no second line to draw; indexing
        # it unconditionally would raise IndexError.
        if len(ctext) > 1:
            can.drawString(30, 50, ctext[1])
    else:
        can.drawString(30, 50, "")
    can.save()

    # Move to the beginning of the buffer so the overlay can be read back.
    packet.seek(0)
    new_pdf = PdfFileReader(packet)

    output = PdfFileWriter()
    infoDict = output._info.getObject()
    infoDict.update({
        NameObject('/Title'): createStringObject(title),
        NameObject('/Author'): createStringObject(authors),
    })

    # The source handle has to stay open until the output is written, since
    # the writer is handed page objects that belong to the reader.
    with open(pdf_file_location, "rb") as source:
        existing_pdf = PdfFileReader(source)
        last_index = existing_pdf.getNumPages() - 1
        for i in range(last_index + 1):
            page = existing_pdf.getPage(i)
            if i == last_index:
                # Only the last page receives the copyright overlay.
                page.mergePage(new_pdf.getPage(0))
            output.addPage(page)

        # Finally, write "output" to a real file.
        with open(pdf_file_destination, "wb") as outputStream:
            output.write(outputStream)
def replace_text(cls, page, text, replace):
    """Return a new page in which ``text`` is replaced inside ``Tj`` operands.

    The page's content stream is parsed, every ``Tj`` (show text) operation
    has ``text`` substituted by ``replace`` in its first operand, and the
    edited stream is installed on a blank page that the original page has
    been merged into.

    Args:
        page: The pyPdf page object to take the content from.
        text: Substring to look for in each ``Tj`` operand.
        replace: Replacement for every occurrence of ``text``.

    Returns:
        The newly created page object carrying the edited content stream.
    """
    # HACK: depends on pyPdf internals, imported locally on purpose.
    from pyPdf.pdf import ContentStream, PageObject
    from pyPdf.generic import TextStringObject, NameObject

    stream = ContentStream(page["/Contents"].getObject(), page.pdf)
    for operands, operator in stream.operations:
        if operator != 'Tj':
            continue
        # The operand list is edited in place, which updates the stream.
        substituted = operands[0].replace(text, replace)
        operands[0] = TextStringObject(substituted)

    rebuilt = PageObject.createBlankPage(page.pdf)
    rebuilt.mergePage(page)
    rebuilt[NameObject('/Contents')] = stream
    return rebuilt
def pdf_add_content(content_string, page, scale=1, offsetx=0, offsety=0):
    """Append drawing commands to the content stream of a PDF page.

    Inputs:
        content_string  The PDF drawing commands to add, as one string.
        page            The pyPdf.pdf.PageObject to add the content to; it
                        is modified in place.
        scale           Uniform scale factor applied to the coordinate
                        system before the added content is drawn.
        offsetx         Horizontal translation applied together with the
                        scale.
        offsety         Vertical translation applied together with the
                        scale.
    """
    # "a 0 0 d e f cm" sets the transformation matrix: uniform scale plus
    # translation, each value written with two decimals.
    matrix = '%.2f 0 0 %.2f %.2f %.2f cm' % (scale, scale, offsetx, offsety)

    # The leading 'Q' closes the 'q' that is put in front of the existing
    # content below; the added commands then run inside their own q ... Q
    # pair.
    appended = '\n'.join(['Q', 'q', matrix, content_string, 'Q'])

    try:
        existing = page['/Contents'].getObject()
    except KeyError:
        # The page has no content yet: start from an empty stream.
        existing = ArrayObject([])

    merged = ContentStream(existing, page.pdf)
    # Existing content may not restore the graphics state at its end, so it
    # is bracketed with a save/restore pair.
    merged.operations.insert(0, [[], 'q'])
    merged.operations.append([[], appended])
    page[NameObject('/Contents')] = merged
from pyPdf import PdfFileWriter, PdfFileReader
from pyPdf.generic import NameObject, createStringObject

# Script: copy every page of INPUT into OUTPUT while setting the Title,
# Author, Subject and Creator entries of the document information
# dictionary, then reopen OUTPUT for reading.

OUTPUT = 'ml1.pdf'
INPUT = 'NOFO.pdf'

# There is no interface through pyPdf with which to set this other than
# getting your hands dirty like so:
output = PdfFileWriter()
infoDict = output._info.getObject()

# Show the info dictionary before and after the update.
print(infoDict)
infoDict.update({
    NameObject('/Title'): createStringObject(u'title'),
    NameObject('/Author'): createStringObject(u'author'),
    NameObject('/Subject'): createStringObject(u'subject'),
    NameObject('/Creator'): createStringObject(u'a script')
})
print(infoDict)

# The input is kept open until the output has been written, because the
# writer is handed page objects that belong to the reader.  Both handles are
# closed afterwards, even if copying or writing fails.
with open(INPUT, 'rb') as fin:
    pdf_in = PdfFileReader(fin)
    for page in range(pdf_in.getNumPages()):
        output.addPage(pdf_in.getPage(page))

    with open(OUTPUT, 'wb') as outputStream:
        output.write(outputStream)

# Reopen the freshly written file; the handle is deliberately left open
# because the reader needs it for as long as ``pdf`` is in use.
pdf = PdfFileReader(open(OUTPUT, 'rb'))
import sys

# Script: copy the pages of a PDF into a new file whose document information
# is rebuilt from scratch, asking interactively for a title and an author.

parser = argparse.ArgumentParser(
    description=u'Limpia los metadatos de un PDF y opcionalmente añade título y autor')
parser.add_argument("input", help="fichero pdf origen")
parser.add_argument("output", help="fichero pdf destino")
args = parser.parse_args()

# sys.stdin.encoding is None when stdin is not a terminal (e.g. piped
# input); decoding with None would raise TypeError, so fall back to UTF-8.
stdin_encoding = sys.stdin.encoding or 'utf-8'

# The input has to stay open until the output is written, because the writer
# is handed page objects that belong to the reader.
with open(args.input, 'rb') as fin:
    pdfIn = PdfFileReader(fin)
    pdfOut = PdfFileWriter()
    for page in range(pdfIn.getNumPages()):
        pdfOut.addPage(pdfIn.getPage(page))

    info = pdfOut._info.getObject()
    # Drop the producer entry of the new document; tolerate its absence.
    info.pop(NameObject('/Producer'), None)

    title = raw_input("Titulo:").decode(stdin_encoding)
    author = raw_input("Autor:").decode(stdin_encoding)

    # Only non-empty answers are stored, which is what makes the title and
    # the author optional, as the description states.
    if title:
        info[NameObject('/Title')] = createStringObject(title)
    if author:
        info[NameObject('/Author')] = createStringObject(author)

    with open(args.output, 'wb') as fout:
        pdfOut.write(fout)
# Script: copy a PDF to 'outputFile.pdf', carrying over its document
# information and then blanking the identifying entries.

inpfn = raw_input('Enter PDF path : ')

# The input has to stay open until the output is written, because the writer
# is handed page objects that belong to the reader.
with open(inpfn, 'rb') as fin:
    pdf_in = PdfFileReader(fin)
    writer = PdfFileWriter()
    for page in range(pdf_in.getNumPages()):
        writer.addPage(pdf_in.getPage(page))

    infoDict = writer._info.getObject()

    # Carry over the metadata of the source document.  pyPdf reports None
    # for a document without an /Info dictionary, so fall back to nothing.
    info = pdf_in.documentInfo or {}
    for key in info:
        infoDict.update({NameObject(key): createStringObject(info[key])})

    # These entries are overwritten with an empty string rather than removed.
    list_of_data_to_delete = [
        '/CreationDate', '/Author', '/Creator', '/ModDate', '/Producer',
        '/Title'
    ]
    for item in list_of_data_to_delete:
        try:
            infoDict.update({NameObject(item): createStringObject(u'')})
        except Exception:
            # Best effort: report the entry and carry on with the others.
            print("can't delete : %s" % item)

    # Closing the output is what guarantees the written data is flushed.
    with open('outputFile.pdf', 'wb') as fout:
        writer.write(fout)