def meta(input_pdf, output_pdf, value):
    """Copy a PDF and stamp extra entries into its document-information dictionary.

    Every page of ``input_pdf`` is copied into a new document. The new
    document's info dictionary receives, in this order:

    1. a hard-coded ``/Version`` entry,
    2. every entry of the source document's info dictionary (so a source
       ``/Version`` overrides the hard-coded one),
    3. a ``/Key`` entry holding ``value``.

    :param input_pdf: whatever ``PdfFileReader`` accepts as its source
    :param output_pdf: path of the file to write (opened in ``'wb'`` mode)
    :param value: text stored under the ``/Key`` info entry
    """
    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(input_pdf)

    for page in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page))

    infoDict = pdf_writer._info.getObject()
    infoDict.update({NameObject('/Version'): createStringObject(u'234ds2')})

    # documentInfo is None when the source PDF carries no /Info dictionary;
    # treat that as "nothing to copy" instead of failing on iteration.
    info = pdf_reader.documentInfo or {}
    for key in info:
        infoDict.update({NameObject(key): createStringObject(info[key])})

    infoDict.update({NameObject('/Key'): createStringObject(value)})

    with open(output_pdf, 'wb') as fh:
        pdf_writer.write(fh)
Пример #2
0
    def _create_attachment_object(self, attachment):
        ''' Build and register the Filespec object describing one embedded file.

        Two objects are registered through ``self._addObject``: first the
        embedded-file stream holding the bytes, then the Filespec dictionary
        that points at it.

        :param attachment: A dictionary containing:
            * filename: The name of the file to embed (required)
            * content:  The bytes of the file to embed (required)
            * subtype: The mime-type of the file to embed (optional)
            * description: Text stored as the Filespec /Desc (optional)
        :return: the value returned by ``self._addObject`` for the Filespec
        '''
        content = attachment['content']

        # Stream object carrying the raw bytes of the attachment.
        embedded_file = DecodedStreamObject()
        embedded_file.setData(content)

        # /Params: checksum, modification date and size of the payload.
        params = DictionaryObject({
            NameObject('/CheckSum'):
            createStringObject(md5(content).hexdigest()),
            NameObject('/ModDate'):
            createStringObject(
                datetime.now().strftime(DEFAULT_PDF_DATETIME_FORMAT)),
            NameObject('/Size'):
            NameObject(str(len(content))),
        })
        embedded_file.update({
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Params"): params,
        })

        subtype = attachment.get('subtype')
        if subtype:
            embedded_file.update({
                NameObject("/Subtype"): NameObject(subtype),
            })

        embedded_file_ref = self._addObject(embedded_file)

        # /F and /UF of the /EF dictionary both reference the same stream.
        stream_refs = DictionaryObject({
            NameObject("/F"): embedded_file_ref,
            NameObject('/UF'): embedded_file_ref,
        })
        name_string = createStringObject(attachment['filename'])
        filespec = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Data"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): name_string,
            NameObject("/EF"): stream_refs,
            NameObject("/UF"): name_string,
        })

        description = attachment.get('description')
        if description:
            filespec.update({
                NameObject("/Desc"): createStringObject(description)
            })

        return self._addObject(filespec)
Пример #3
0
def meta(pa, ti, au, fi, loo, ho, loi):
    """Rewrite ``<ti>.pdf`` with title/author metadata, then run cpdf on it.

    Steps, in order:

    1. read ``<ti>.pdf`` from the current working directory,
    2. change into the output directory (``ho/loo`` if it exists, else
       ``loo``) and write a copy there with ``/Title`` and ``/Author`` set
       and the page layout set to ``/SinglePage``,
    3. change into the input directory (``loi`` if it exists, else
       ``ho/loi``) and delete ``<ti>.pdf`` there,
    4. on POSIX or Windows, invoke the bundled cpdf with
       ``-fit-window true``.

    The process working directory is changed as a side effect.

    :param pa: unused by this function
    :param ti: document title; also the base name of the PDF file
    :param au: author; ``None`` is stored as an empty string
    :param fi: unused by this function
    :param loo: output directory
    :param ho: base directory used to resolve ``loo``/``loi`` and to find cpdf
    :param loi: input directory
    """
    pdf_name = ti + '.pdf'
    author = '' if au is None else str(au)

    output = PdfFileWriter()
    infoDict = output._info.getObject()
    infoDict.update({
        NameObject('/Title'): createStringObject(ti),
        NameObject('/Author'): createStringObject(author),
    })

    # The source handle has to stay open until write() has finished (the
    # reader keeps using the stream), and must be closed afterwards: an open
    # handle makes the os.remove() below fail on Windows.
    source = open(pdf_name, 'rb')
    try:
        reader = PdfFileReader(source)
        for page in range(reader.getNumPages()):
            output.addPage(reader.getPage(page))

        out_dir = ho + '/' + loo
        os.chdir(out_dir if os.path.isdir(out_dir) else loo)

        output.setPageLayout('/SinglePage')
        # NOTE(review): if the output directory is also the directory the
        # source was opened from, this truncates the source while it is
        # still being read -- confirm callers never pass the same directory.
        with open(pdf_name, 'wb') as outputStream:
            output.write(outputStream)
    finally:
        source.close()

    os.chdir(loi if os.path.isdir(loi) else ho + '/' + loi)
    os.remove(pdf_name)

    if os.name == 'posix':
        cpdf = ho + 'cpdf/mac/cpdf.sh'
    elif os.name == 'nt':
        cpdf = ho + 'cpdf/win/cpdf.exe'
    else:
        return

    # NOTE(review): the command is assembled by string concatenation and run
    # through the shell, so paths with spaces or shell metacharacters will
    # break or be interpreted. The target path also has no separator between
    # ``loo`` and ``ti`` and no '.pdf' suffix -- kept as-is, confirm intent.
    target = ho + '/' + loo + ti
    subprocess.call(cpdf + ' -fit-window true ' + target + ' -o ' + target,
                    shell=True)
Пример #4
0
def embed_hidden_data_into_pdf(inpdf, indata):
    """Embed the text of ``indata`` into ``inpdf`` as a hidden info entry.

    The data file is read as cp850 text, re-encoded to UTF-8, base64-encoded
    and stored in the PDF's document-information dictionary under
    ``PDF_DATA_TAG``. The result is first written to a temporary file and
    read back through ``get_hidden_data_from_pdf``; only when the round trip
    matches is ``inpdf`` replaced and ``indata`` deleted.

    :param inpdf: path of the PDF to modify in place
    :param indata: path of the text file to embed; deleted on success
    :raises AssertionError: if the data read back from the temporary PDF
        differs from the data that was embedded
    """
    with open(indata, "r", encoding="cp850") as f1:
        mydata = f1.read()

    # Base64 keeps the payload safe inside a PDF string object.
    mydata_enc = base64.b64encode(mydata.encode("utf-8"))
    logging.debug(mydata_enc)
    logging.debug(type(mydata_enc))
    logging.debug(base64.b64decode(mydata_enc).decode("utf-8"))

    with open(inpdf, "rb") as p1:
        invoice = PdfFileReader(p1)
        output_pdf = PdfFileWriter()

        # documentInfo is None when the source has no /Info dictionary.
        source_info = invoice.documentInfo or {}

        infodict = output_pdf._info.getObject()
        for k, v in source_info.items():
            infodict.update({NameObject(k): createStringObject(v)})

        infodict.update(
            {NameObject(PDF_DATA_TAG): createStringObject(mydata_enc.decode("utf-8"))}
        )

        for k, v in source_info.items():
            logging.debug("{} {}".format(k, v))

        for i in range(invoice.getNumPages()):
            output_pdf.addPage(invoice.getPage(i))

        # delete=False: the file must outlive the handle so it can be
        # validated and then moved over the original.
        with tempfile.NamedTemporaryFile(
            mode="w+b", delete=False, suffix=".pdf"
        ) as tempfile_pdf:
            output_pdf.write(tempfile_pdf)
            persistant_tempfile = tempfile_pdf.name

    logging.info("Using tempfile {}".format(persistant_tempfile))
    logging.info("validating temp file")

    validated_data = get_hidden_data_from_pdf(persistant_tempfile)
    logging.debug(validated_data)
    # Explicit raise instead of `assert`: the check guards the deletions
    # below and must not disappear when Python runs with -O.
    if mydata != validated_data:
        raise AssertionError("embedded data does not match")
    logging.info("{} {} {}".format(inpdf, indata, persistant_tempfile))

    backup_pdf = "{}.bak".format(inpdf)
    shutil.move(inpdf, backup_pdf)
    os.remove(indata)
    shutil.move(persistant_tempfile, inpdf)
    os.remove(backup_pdf)
Пример #5
0
def createAnnotPdf(geom_type, myShapePdf):
    """Create ``annot.pdf`` in ``scratch`` with an annotation shaped like a geometry.

    The vertices are parsed from the content stream of the first page of
    ``myShapePdf`` and written into the first annotation of a template PDF
    (``annot_poly`` or ``annot_line``, chosen by ``geom_type``).

    :param geom_type: ``'POLYGON'`` or ``'POLYLINE'`` (case-insensitive)
    :param myShapePdf: path of the PDF whose first page draws the geometry
    :return: path of the written annotation PDF
    :raises ValueError: if ``geom_type`` is neither polygon nor polyline
    """
    kind = geom_type.upper()
    if kind == 'POLYGON':
        template_path = annot_poly
    elif kind == 'POLYLINE':
        template_path = annot_line
    else:
        raise ValueError('unsupported geometry type: %r' % (geom_type,))

    # part 1: read geometry pdf to get the vertices and rectangle to use
    with open(myShapePdf, 'rb') as shape_file:
        geomPage = PdfFileReader(shape_file).getPage(0)
        mystr = geomPage.getObject()['/Contents'].getData()
    # to pinpoint the string part: 1.19997 791.75999 m 1.19997 0.19466 l 611.98627 0.19466 l 611.98627 791.75999 l 1.19997 791.75999 l
    # the format seems to follow x1 y1 m x2 y2 l x3 y3 l x4 y4 l x5 y5 l
    geomString = mystr.split('S\r\n')[0].split('M\r\n')[1]
    coordsString = [
        value for value in geomString.split(' ')
        if value not in ('m', 'l', '')
    ]

    # part 2: update geometry in the map
    # The template stays open until the output has been written.
    with open(template_path, 'rb') as template_file:
        page_geom = PdfFileReader(template_file).getPage(0)
        annot = page_geom['/Annots'][0].getObject()

        # Build the array directly. The tokens come from a PDF file, so they
        # must never be spliced into source text and exec()'d.
        annot.update({
            NameObject('/Vertices'):
            ArrayObject([FloatObject(value) for value in coordsString])
        })

        # Tokens alternate x, y. Every token is used, so the last vertex
        # also contributes to the bounding box.
        xcoords = [float(value) for value in coordsString[0::2]]
        ycoords = [float(value) for value in coordsString[1::2]]

        # below rect seems to be geom bounding box coordinates: xmin, ymin, xmax,ymax
        annot.update({
            NameObject('/Rect'):
            ArrayObject([
                FloatObject(min(xcoords)),
                FloatObject(min(ycoords)),
                FloatObject(max(xcoords)),
                FloatObject(max(ycoords))
            ])
        })
        annot.pop('/AP')  # this is to get rid of the ghost shape

        annot.update({NameObject('/T'): createStringObject(u'ERIS')})

        output = PdfFileWriter()
        output.addPage(page_geom)
        annotPdf = os.path.join(scratch, "annot.pdf")
        with open(annotPdf, "wb") as outputStream:
            output.write(outputStream)

    return annotPdf
Пример #6
0
 def pdf_suffix_fields(self, page, sfx):
     """Append ``sfx`` to the ``/T`` entry of each annotation on ``page``.

     Pages without an ``/Annots`` array are left untouched, and annotations
     that carry no ``/T`` entry are skipped.

     :param page: page object whose annotations are renamed in place
     :param sfx: text appended to each existing ``/T`` value
     """
     if '/Annots' not in page:
         return
     for annot_ref in page['/Annots']:
         writer_annot = annot_ref.getObject()
         current_name = writer_annot.get('/T')
         if current_name is None:
             # Nothing to rename on this annotation.
             continue
         writer_annot.update({
             NameObject("/T"):
             createStringObject(current_name + sfx)
         })
Пример #7
0
def add_update_pdf_metadata(filename, update_dictionary):
    """Add or overwrite document-information entries of a PDF in place.

    The PDF is re-written page by page into a temporary file whose info
    dictionary holds the original entries merged with ``update_dictionary``
    (the update wins on conflicts); the temporary file then replaces
    ``filename``.

    :param filename: path of the PDF to modify
    :param update_dictionary: mapping of info keys *without* the leading
        ``/`` (e.g. ``'Title'``) to their new, non-None values
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member

    def add_prefix(value):
        return '/' + value

    full_update_dictionary = {add_prefix(k): v for k, v in update_dictionary.items()}

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()

        # documentInfo is None when the source has no /Info dictionary.
        info = pdf_input.documentInfo or {}

        full_update_dictionary = dict(chain(info.items(), full_update_dictionary.items()))

        for key, value in full_update_dictionary.items():
            assert value is not None
            info_dict.update({NameObject(key): createStringObject(value)})

        # NamedTemporaryFile hands back an open file object, so no raw file
        # descriptor is left dangling; delete=False keeps the file for the
        # move below.
        with tempfile.NamedTemporaryFile(prefix="email2pdf_add_update_pdf_metadata",
                                         suffix=".pdf", delete=False) as file_out:
            temp_file_name = file_out.name
            pdf_output.write(file_out)

    shutil.move(temp_file_name, filename)
Пример #8
0
def _filespec_additional_attachments(pdf_filestream, name_arrayobj_cdict,
                                     file_dict, file_bin):
    """Register one extra attachment and record its Filespec by file name.

    An embedded-file stream and a Filespec dictionary are added to
    ``pdf_filestream``; the Filespec reference is stored in
    ``name_arrayobj_cdict`` keyed by the file-name string object.

    :param pdf_filestream: writer exposing ``_addObject``
    :param name_arrayobj_cdict: mapping updated in place (name -> Filespec)
    :param file_dict: dict with ``filename`` and ``mod_date``; ``desc`` is
        optional and defaults to an empty string
    :param file_bin: bytes of the file to embed
    """
    filename = file_dict['filename']
    logger.debug('_filespec_additional_attachments filename=%s', filename)
    pdf_timestamp = _get_pdf_timestamp(file_dict['mod_date'])

    # /Params: checksum, modification date and size of the payload.
    checksum = createStringObject(hashlib.md5(file_bin).hexdigest())
    params = DictionaryObject({
        NameObject('/CheckSum'): checksum,
        NameObject('/ModDate'): createStringObject(pdf_timestamp),
        NameObject('/Size'): NameObject(str(len(file_bin))),
    })

    stream = DecodedStreamObject()
    stream.setData(file_bin)

    # The '/' inside the mime type is written as the escape '#2f' so the
    # whole type fits into a single PDF name.
    mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream'
    subtype = NameObject('/' + mimetype.replace('/', '#2f'))
    stream.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params,
        NameObject("/Subtype"): subtype,
    })
    stream_ref = pdf_filestream._addObject(stream)

    embedded = DictionaryObject({NameObject("/F"): stream_ref})
    name_string = createStringObject(filename)
    filespec = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Unspecified"),
        NameObject("/Desc"): createStringObject(file_dict.get('desc', '')),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): name_string,
        NameObject("/EF"): embedded,
        NameObject("/UF"): name_string,
    })
    name_arrayobj_cdict[name_string] = pdf_filestream._addObject(filespec)
Пример #9
0
def write_pdf(filename, chapters, meta):
    """Concatenate chapter PDFs into one file with metadata and bookmarks.

    :param filename: path of the PDF to write
    :param chapters: iterable of ``(pdf_path, bookmark_title)`` pairs; a
        bookmark pointing at the chapter's first page is added unless the
        title is ``None``
    :param meta: mapping providing ``'title'`` and ``'author'``
    """
    output = PyPDF2.PdfFileWriter()
    info = output._info.getObject()
    info.update({
        NameObject('/Title') : createStringObject(uni(meta['title'])),
        NameObject('/Author') : createStringObject(uni(meta['author'])),
        NameObject('/Creator') : createStringObject('springer.py')
    })

    # Chapter files must stay open until the output has been written; they
    # are all closed afterwards, also when something fails half-way.
    sources = []
    try:
        page = 0
        for chapter in chapters:
            source = open(chapter[0], 'rb')
            sources.append(source)
            inp = PyPDF2.PdfFileReader(source)
            num_pages = inp.getNumPages()
            for i in range(num_pages):
                output.addPage(inp.getPage(i))
            if chapter[1] is not None:
                output.addBookmark(uni(chapter[1]), page)
            page += num_pages

        # open() instead of the Python-2-only file() builtin.
        with open(filename, 'wb') as fp:
            output.write(fp)
    finally:
        for source in sources:
            source.close()
Пример #10
0
def crea_cuaderno(nombre, listapdf, ruta='', generar=True):
    """Assemble a notebook PDF: cover, objectives, content pages, observations.

    The first page of each of these files is used, in this order: the cover
    (``tapa_<nombre>.pdf``), ``objetivos.pdf``, every PDF named in
    ``listapdf``, and ``observaciones.pdf`` twice. Document info is copied
    from the first PDF of ``listapdf`` and ``/Title`` is set to ``nombre``.

    :param nombre: notebook name; used for the title, the cover file name
        (lower-cased) and the output file name
    :param listapdf: base names (without ``.pdf``) of the content PDFs under
        ``<ruta>/pdf/``; must contain at least one entry
    :param ruta: base path prefix
    :param generar: when true, write ``Cuaderno <nombre>.pdf`` to disk and
        return ``None``; otherwise return the PDF as bytes
    :return: the PDF bytes when ``generar`` is false, else ``None``
    """
    fins = [PdfFileReader(ruta+'/pdf/' + i+'.pdf') for i in listapdf]
    tapa = PdfFileReader(ruta +'/documentos/' + 'tapa_' + nombre.lower() + '.pdf')
    objetivos = PdfFileReader(ruta +'/documentos/'+'objetivos.pdf')
    observaciones = PdfFileReader(ruta +'/documentos/'+'observaciones.pdf')

    # metadata handling
    # getDocumentInfo() returns None when the PDF has no /Info dictionary.
    info_old = fins[0].getDocumentInfo() or {}
    # NOTE(review): the result was never used; the call is kept in case
    # crea_nombre_pdf has side effects -- confirm and drop if it has none.
    crea_nombre_pdf(nombre)
    fo = PdfFileWriter()
    info_dict = fo._info.getObject()

    for key in info_old:
        info_dict.update({NameObject(key): createStringObject(info_old[key])})
    info_dict.update({
        NameObject('/Title'): createStringObject(nombre)
    })

    # assemble the pages: cover and objectives first
    fo.addPage(tapa.getPage(0))
    fo.addPage(objetivos.getPage(0))

    for i in fins:
        fo.addPage(i.getPage(0))

    # the observations page is added twice
    fo.addPage(observaciones.getPage(0))
    fo.addPage(observaciones.getPage(0))

    if generar:
        # NOTE(review): unlike the input paths, no '/' is inserted after
        # ``ruta`` here -- confirm callers pass a trailing separator.
        # The context manager makes sure the file is flushed and closed.
        with open(ruta + 'documentos/Cuaderno ' + nombre + '.pdf', 'wb') as salida:
            fo.write(salida)
    else:
        buffer = BytesIO()
        fo.write(buffer)
        pdf = buffer.getvalue()
        buffer.close()
        return pdf
Пример #11
0
def main(path, new_name):
    """Set the ``/Title`` of the PDF at ``path``, rewriting the file in place.

    The document is copied (pages and existing info entries) into
    ``<path>out.pdf`` with the new title, after which the original is
    removed and the copy renamed to ``path``.

    :param path: path of the PDF to retitle
    :param new_name: new title; converted with ``unicode()``
    """
    import os

    out_path = path + 'out.pdf'

    # open() instead of the Python-2-only file() builtin; the context
    # managers close both files even if copying fails.
    with open(path, 'rb') as fin:
        pdf_in = PdfFileReader(fin)
        writer = PdfFileWriter()
        for page in range(pdf_in.getNumPages()):
            writer.addPage(pdf_in.getPage(page))
        infoDict = writer._info.getObject()
        # documentInfo is None when the source has no /Info dictionary.
        info = pdf_in.documentInfo or {}
        for key in info:
            infoDict.update({NameObject(key): createStringObject(info[key])})
        # rename
        infoDict.update({NameObject('/Title'): createStringObject(unicode(new_name))})
        # It does not appear possible to alter in place.
        with open(out_path, 'wb') as fout:
            writer.write(fout)

    os.unlink(path)
    os.rename(out_path, path)
Пример #12
0
def append_attachment(writer: PdfFileWriter, fname: str, fdata: bytes):
    """Append an embedded-file attachment to the document being written.

    A Filespec for ``fdata`` is added to the catalog's
    ``/Names -> /EmbeddedFiles -> /Names`` array, creating the intermediate
    dictionaries when they are missing. A catalog that already has a
    ``/Names`` dictionary for other purposes but no ``/EmbeddedFiles`` entry
    is handled as well.

    :param writer: the writer whose catalog is updated in place
    :param fname: name under which the file is attached
    :param fdata: content of the attached file
    """
    # The entry for the file
    file_entry = DecodedStreamObject()
    file_entry.setData(fdata)
    file_entry.update({NameObject("/Type"): NameObject("/EmbeddedFile")})

    # The Filespec entry
    efEntry = DictionaryObject()
    efEntry.update({NameObject("/F"): file_entry})

    filespec = DictionaryObject()
    filespec.update({
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): createStringObject(fname),
        NameObject("/EF"): efEntry,
    })

    root = writer._root_object

    if "/Names" not in root:
        # No name dictionary at all yet.
        root.update({NameObject("/Names"): DictionaryObject()})
    names_dictionary = root["/Names"]

    if "/EmbeddedFiles" not in names_dictionary:
        # No files attached yet: start an empty name array for them.
        names_dictionary.update({
            NameObject("/EmbeddedFiles"):
            DictionaryObject({NameObject("/Names"): ArrayObject()})
        })

    # NOTE(review): an existing /EmbeddedFiles tree that uses /Kids instead
    # of a flat /Names array still raises KeyError here, as before.
    embedded_names = names_dictionary["/EmbeddedFiles"]["/Names"]
    embedded_names.append(createStringObject(fname))
    embedded_names.append(filespec)
Пример #13
0
def add_update_pdf_metadata(filename, update_dictionary):
    """Add or overwrite document-information entries of a PDF in place.

    The PDF is re-written page by page into a temporary file whose info
    dictionary holds the original entries merged with ``update_dictionary``
    (the update wins on conflicts); the temporary file then replaces
    ``filename``.

    :param filename: path of the PDF to modify
    :param update_dictionary: mapping of info keys *without* the leading
        ``/`` (e.g. ``'Title'``) to their new, non-None values
    """
    # This seems to be the only way to modify the existing PDF metadata.
    #
    # pylint: disable=protected-access, no-member

    def add_prefix(value):
        return '/' + value

    full_update_dictionary = {
        add_prefix(k): v
        for k, v in update_dictionary.items()
    }

    with open(filename, 'rb') as input_file:
        pdf_input = PdfFileReader(input_file)
        pdf_output = PdfFileWriter()

        for page in range(pdf_input.getNumPages()):
            pdf_output.addPage(pdf_input.getPage(page))

        info_dict = pdf_output._info.getObject()
        # documentInfo is None when the source has no /Info dictionary.
        info = pdf_input.documentInfo or {}
        full_update_dictionary = dict(
            chain(info.items(), full_update_dictionary.items()))

        for key, value in full_update_dictionary.items():
            assert value is not None
            info_dict.update({
                NameObject(key):
                createStringObject(value)
            })

        os_file_out, temp_file_name = tempfile.mkstemp(
            prefix="email2pdf_add_update_pdf_metadata", suffix=".pdf")
        # Immediately close the file as created to work around issue on
        # Windows where file cannot be opened twice.
        os.close(os_file_out)

        try:
            with open(temp_file_name, 'wb') as file_out:
                pdf_output.write(file_out)
        except Exception:
            # Do not leave a half-written temporary file behind.
            os.remove(temp_file_name)
            raise

    shutil.move(temp_file_name, filename)
Пример #14
0
    def zugferd_update_metadata_add_attachment(self, pdf_filestream, fname,
                                               fdata):
        '''Attach the invoice XML to the PDF and update the catalog metadata.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        The Filespec is registered once, and that single reference is used
        both in the embedded-files name array and in the catalog's /AF
        array, so both entries point at the same object.

        :param pdf_filestream: writer exposing ``_addObject``,
            ``_root_object`` and ``addMetadata``
        :param fname: file name of the attachment
        :param fdata: content of the attachment
        '''
        # The entry for the file
        moddate = DictionaryObject()
        moddate.update({
            NameObject('/ModDate'):
            createStringObject(self._get_pdf_timestamp())
        })
        file_entry = DecodedStreamObject()
        file_entry.setData(fdata)
        file_entry.update({
            NameObject("/Type"):
            NameObject("/EmbeddedFile"),
            NameObject("/Params"):
            moddate,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"):
            NameObject("/text#2Fxml"),
        })
        file_entry_obj = pdf_filestream._addObject(file_entry)
        # The Filespec entry
        efEntry = DictionaryObject()
        efEntry.update({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
        })

        fname_obj = createStringObject(fname)
        filespec = DictionaryObject()
        filespec.update({
            NameObject("/AFRelationship"):
            NameObject("/Alternative"),
            NameObject("/Desc"):
            createStringObject("ZUGFeRD Invoice"),
            NameObject("/Type"):
            NameObject("/Filespec"),
            NameObject("/F"):
            fname_obj,
            NameObject("/EF"):
            efEntry,
            NameObject("/UF"):
            fname_obj,
        })
        # Register the Filespec exactly once; adding it a second time would
        # write a duplicate object and make /AF and /Names disagree.
        filespec_obj = pdf_filestream._addObject(filespec)

        embeddedFilesNamesDictionary = DictionaryObject()
        embeddedFilesNamesDictionary.update({
            NameObject("/Names"):
            ArrayObject([fname_obj, filespec_obj])
        })
        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embeddedFilesDictionary = DictionaryObject()
        embeddedFilesDictionary.update(
            {NameObject("/EmbeddedFiles"): embeddedFilesNamesDictionary})
        # Update the root
        metadata_xml_str = self._prepare_pdf_metadata()
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        metadata_value = pdf_filestream._addObject(metadata_file_entry)
        af_value = pdf_filestream._addObject(ArrayObject([filespec_obj]))
        pdf_filestream._root_object.update({
            NameObject("/AF"):
            af_value,
            NameObject("/Metadata"):
            metadata_value,
            NameObject("/Names"):
            embeddedFilesDictionary,
        })
        info_dict = self._prepare_pdf_info()
        pdf_filestream.addMetadata(info_dict)
Пример #15
0
def clear_metadata(src, dst):
    """Write a copy of ``src`` to ``dst`` with identifying metadata blanked.

    The format is chosen by the extension of ``dst``:

    * ``.docx`` / ``.pptx``: core properties are emptied, dates are set to
      ``DEFAULT_DATE`` and the revision to 1, then ``clean_xml_in_zip`` is
      run on the result;
    * ``.pdf``: pages are copied into a new document whose title, author,
      subject and creator are empty;
    * ``.xlsx``: the archive is copied while the text fields and dates in
      ``docProps/core.xml`` are blanked.

    Other extensions are ignored. Any exception is logged through
    ``log_message`` and not re-raised.

    :param src: source path, relative to ``settings.LEADING_SLASH``
    :param dst: destination path, relative to ``settings.LEADING_SLASH``
    """
    src = settings.LEADING_SLASH + src
    dst = settings.LEADING_SLASH + dst
    ext = os.path.splitext(dst)[1]  # assumed to be in lower case!
    meta_fields = [
        'author', 'category', 'comments', 'content_status', 'identifier',
        'keywords', 'last_modified_by', 'language', 'subject', 'title',
        'version'
    ]
    # (element name, maximum length of the text that gets blanked)
    xml_text_fields = [
        ('dc:title', 1000),
        ('dc:subject', 500),
        ('dc:creator', 300),
        ('dc:description', 2500),
        ('cp:keywords', 1000),
        ('cp:lastModifiedBy', 300),
        ('cp:category', 300),
        ('cp:contentStatus', 100),
        ('cp:revision', 10),
    ]

    def reset_core_properties(props):
        # Shared by .docx and .pptx: both expose the same core properties.
        for meta_field in meta_fields:
            setattr(props, meta_field, '')
        props.created = DEFAULT_DATE
        props.modified = DEFAULT_DATE
        props.last_printed = DEFAULT_DATE
        props.revision = 1

    try:
        if ext == '.docx':
            with open(src, 'rb') as f:
                doc = Document(f)
            reset_core_properties(doc.core_properties)
            doc.save(dst)
            clean_xml_in_zip(dst)
        elif ext == '.pptx':
            prs = Presentation(src)
            reset_core_properties(prs.core_properties)
            prs.save(dst)
            clean_xml_in_zip(dst)
        elif ext == '.pdf':
            # The source must stay open until the copy has been written.
            with open(src, 'rb') as fin:
                inp = PdfFileReader(fin)
                outp = PdfFileWriter()
                for page in range(inp.getNumPages()):
                    outp.addPage(inp.getPage(page))
                infoDict = outp._info.getObject()
                infoDict.update({
                    NameObject('/Title'): createStringObject(u''),
                    NameObject('/Author'): createStringObject(u''),
                    NameObject('/Subject'): createStringObject(u''),
                    NameObject('/Creator'): createStringObject(u'')
                })
                with open(dst, 'wb') as fout:
                    outp.write(fout)
        elif ext == '.xlsx':
            file_to_clear = 'docProps/core.xml'
            # create a copy of the Excel file while "cleaning" docProps/core.xml
            with ZipFile(src, 'r') as src_zip:
                with ZipFile(dst, 'w') as dst_zip:
                    dst_zip.comment = src_zip.comment  # preserve the comment (if any)
                    for item in src_zip.infolist():
                        content = src_zip.read(item.filename)
                        if item.filename == file_to_clear:
                            # Blank each text element. The replacement is a
                            # complete, well-formed empty element.
                            for tag, max_len in xml_text_fields:
                                content = re.sub(
                                    r'<%s>[^<]{1,%d}</%s>' % (tag, max_len, tag),
                                    '<%s></%s>' % (tag, tag), content)
                            # replace all date-time fields with the default date
                            content = re.sub(
                                r':W3CDTF">[^<]{1,25}</dcterms:',
                                ':W3CDTF">2001-01-01T00:00:00Z</dcterms:',
                                content)
                        dst_zip.writestr(item, content)
    except Exception as e:
        log_message('Exception while removing metadata from a %s file: %s' %
                    (ext, str(e)))
Пример #16
0
 def create_pdf(self):
     """Render the pages into a PDF and assign it to ``self.pdf``.

     When ``self.published`` is falsy, ``self.pdf`` is set to ``None``.
     Otherwise the PDF is regenerated only if there is no ``pdf_filename``
     yet or ``pdf_updated`` is older than ``published``; in every other
     case nothing happens.
     """
     if not self.published:
         self.pdf = None
     elif (not self.pdf_filename or self.pdf_updated < self.published):
         with tempfile.SpooledTemporaryFile(SPOOL_LIMIT) as temp:
             # Create an output PDF surface with an arbitrary size (the
             # size doesn't matter as we'll set it independently for each
             # page below)
             surface = cairo.PDFSurface(temp, 144.0, 144.0)
             context = cairo.Context(surface)
             # Walk the linked list of pages starting at first_page
             page = self.first_page
             while page:
                 # save()/restore() keep the per-page scale from leaking
                 # into the next page
                 context.save()
                 try:
                     # Render the page's vector image if it has one
                     if page.vector_filename:
                         svg = Rsvg.Handle()
                         shutil.copyfileobj(page.vector, svg)
                         svg.close()
                         surface.set_size(
                             svg.props.width / svg.props.dpi_x * 72.0,
                             svg.props.height / svg.props.dpi_y * 72.0)
                         context.scale(
                             72.0 / svg.props.dpi_x,
                             72.0 / svg.props.dpi_y)
                         svg.render_cairo(context)
                     # Otherwise, render the page's bitmap image (NOTE we
                     # assume all bitmaps are 96dpi here)
                     else:
                         img = cairo.ImageSurface.create_from_png(page.bitmap)
                         surface.set_size(
                             img.get_width() / 96.0 * 72.0,
                             img.get_height() / 96.0 * 72.0)
                         context.scale(72.0 / 96.0, 72.0 / 96.0)
                         context.set_source_surface(img)
                         context.paint()
                     context.show_page()
                 finally:
                     context.restore()
                 page = page.next_page
             surface.finish()
             # Use PyPdf to rewrite the metadata on the file (cairo provides
             # no PDF metadata manipulation). This involves generating a new
             # PDF with new metadata and copying the pages over
             temp.seek(0)
             pdf_in = PdfFileReader(temp)
             pdf_out = PdfFileWriter()
             pdf_info = pdf_out._info.getObject()
             pdf_info.update(pdf_in.documentInfo)
             pdf_info.update({
                 NameObject('/Title'): createStringObject('%s - Issue #%d - %s' % (
                     self.comic.title,
                     self.issue_number,
                     self.title,
                     )),
                 NameObject('/Author'): createStringObject(
                     self.comic.author.name if self.comic.author else
                     'Anonymous'
                     ),
                 })
             # NOTE: ``page`` is reused here as an integer page index
             for page in range(pdf_in.getNumPages()):
                 pdf_out.addPage(pdf_in.getPage(page))
             # NOTE: this inner ``temp`` shadows the outer one; pdf_in keeps
             # its own reference to the outer file, which is still open here
             with tempfile.SpooledTemporaryFile(SPOOL_LIMIT) as temp:
                 pdf_out.write(temp)
                 temp.seek(0)
                 # NOTE(review): ``temp`` is closed when this block exits, so
                 # this presumably relies on ``pdf`` being a property whose
                 # setter consumes the stream immediately -- confirm
                 self.pdf = temp
Пример #17
0
    def convert_to_pdfa(self):
        """
        Turn the currently opened PDF into a PDF/A compliant document.

        This rewrites the header, sets a document ID, embeds the sRGB ICC
        profile as output intent, rewrites the glyph widths of the embedded
        fonts (when fonttools is available), fixes the outline count and
        sets Odoo as creator/producer.
        """
        # PDF/A-3 is based on PDF 1.7, so the header announces that version.
        # See https://github.com/veraPDF/veraPDF-validation-profiles/wiki/PDFA-Parts-2-and-3-rules#rule-612-1
        # The header must start at byte zero with "%PDF-1.n" followed by a
        # single EOL, and that EOL must be followed by '%' plus at least four
        # bytes whose values are all greater than 127.
        self._header = b"%PDF-1.7\n%\xFF\xFF\xFF\xFF"

        # A document ID in the trailer is only needed for encrypted regular
        # PDFs, but PDF/A requires it. The first element identifies the file
        # as created, the second the file as last updated; for a freshly
        # created PDF both are the same value.
        document_id = ByteStringObject(md5(self._reader.stream.getvalue()).digest())
        self._ID = ArrayObject((document_id, document_id))

        with file_open('tools/data/files/sRGB2014.icc', mode='rb') as icc_file:
            compressed_profile = compress(icc_file.read())

        profile_stream = DecodedStreamObject()
        profile_stream.setData(compressed_profile)
        profile_stream.update({
            NameObject("/Filter"): NameObject("/FlateDecode"),
            NameObject("/N"): NumberObject(3),
            NameObject("/Length"): NameObject(str(len(compressed_profile))),
        })
        profile_ref = self._addObject(profile_stream)

        output_intent = DictionaryObject()
        output_intent.update({
            NameObject("/S"): NameObject("/GTS_PDFA1"),
            NameObject("/OutputConditionIdentifier"): createStringObject("sRGB"),
            NameObject("/DestOutputProfile"): profile_ref,
            NameObject("/Type"): NameObject("/OutputIntent"),
        })
        output_intent_ref = self._addObject(output_intent)
        self._root_object.update({
            NameObject("/OutputIntents"): ArrayObject([output_intent_ref]),
        })

        pages = self._root_object['/Pages']['/Kids']

        # PDF/A wants the embedded glyph-width array to agree with the font
        # file itself, which does not seem to hold for wkhtmltopdf exports.
        if not TTFont:
            _logger.warning('The fonttools package is not installed. Generated PDF may not be PDF/A compliant.')
        else:
            # Collect every descendant font used on any page, keyed by its
            # object number so that each font is processed once.
            fonts = {
                descendant.idnum: descendant.getObject()
                for page in pages
                for font in page.getObject()['/Resources']['/Font'].values()
                for descendant in font.getObject()['/DescendantFonts']
            }

            # Rewrite each width array from the font file itself:
            # width = round(1000 * font_glyph_width / font_units_per_em)
            # See: http://martin.hoppenheit.info/blog/2018/pdfa-validation-and-inconsistent-glyph-width-information/
            for font in fonts.values():
                font_file = font['/FontDescriptor']['/FontFile2']
                stream = io.BytesIO(decompress(font_file._data))
                ttfont = TTFont(stream)
                units_per_em = ttfont['head'].unitsPerEm
                metrics_by_glyph = ttfont.getGlyphSet()._hmtx.metrics
                glyph_widths = [
                    NumberObject(round(1000.0 * metrics[0] / units_per_em))
                    for glyph_name, metrics in metrics_by_glyph.items()
                    if glyph_name.startswith('glyph')
                ]

                font[NameObject('/W')] = ArrayObject([NumberObject(1), ArrayObject(glyph_widths)])
                stream.close()

        outlines = self._root_object['/Outlines'].getObject()
        outlines[NameObject('/Count')] = NumberObject(1)

        # Set odoo as producer
        self.addMetadata({
            '/Creator': "Odoo",
            '/Producer': "Odoo",
        })
        self.is_pdfa = True
Пример #18
0
# Copy a PDF into a new writer, carrying over its document properties and
# setting a new title, then write the result to `output_file`.

# Fetch the properties of the original document (not essential for the script to work).
props = pdf_in.documentInfo

# Create the writer.
writer = PdfFileWriter()

# Copy the pages (content) into the writer.
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

# Reference the protected member of the writer class (_info): its document-information dictionary.
infoDict = writer._info.getObject()

# Copy the properties of the old document into the writer.
# `documentInfo` can be None when the source has no info dictionary, so fall back to an empty mapping.
for key in (props or {}):
    infoDict.update({NameObject(key): createStringObject(props[key])})

# Assign a title in the properties.
infoDict.update({NameObject('/Title'): createStringObject('test')})

# Write everything to the output file. The context manager closes the new file
# and the `finally` closes the old one, even when writing fails.
try:
    with open(output_file, 'wb') as pdf_out:
        writer.write(pdf_out)
finally:
    file.close()

# Delete the original document and rename the new document.
Пример #19
0
    def _update_metadata_add_attachment(self, pdf_metadata, output_intents):
        '''Embed the Factur-X XML as an attachment and update the PDF metadata.

        This method is inspired from the code of the addAttachment()
        method of the PyPDF2 lib.

        :param pdf_metadata: metadata values forwarded to
            ``_prepare_pdf_metadata_xml`` (for the ``/Metadata`` stream) and
            to ``_prepare_pdf_metadata_txt`` (for ``addMetadata``).
        :param output_intents: iterable of
            ``(output_intent_dict, dest_output_profile_dict)`` pairs. Each
            profile is added as an object and referenced from its intent
            through ``/DestOutputProfile``.
        :return: None. The root object and the document info of ``self``
            are updated in place.
        '''

        # The entry for the file
        facturx_xml_str = self.factx.xml_str
        # The checksum must be computed over the embedded payload; hashing
        # nothing would always yield the MD5 of the empty string.
        checksum_input = facturx_xml_str
        if not isinstance(checksum_input, bytes):
            # NOTE(review): assumes text payloads are written as UTF-8 - confirm.
            checksum_input = checksum_input.encode('utf-8')
        md5sum = hashlib.md5(checksum_input).hexdigest()
        md5sum_obj = createStringObject(md5sum)
        params_dict = DictionaryObject({
            NameObject('/CheckSum'): md5sum_obj,
            NameObject('/ModDate'): createStringObject(_get_pdf_timestamp()),
            NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
            })
        file_entry = DecodedStreamObject()
        file_entry.setData(facturx_xml_str)  # here we integrate the file itself
        file_entry.update({
            NameObject("/Type"): NameObject("/EmbeddedFile"),
            NameObject("/Params"): params_dict,
            # 2F is '/' in hexadecimal
            NameObject("/Subtype"): NameObject("/text#2Fxml"),
            })
        file_entry_obj = self._addObject(file_entry)
        # The Filespec entry
        ef_dict = DictionaryObject({
            NameObject("/F"): file_entry_obj,
            NameObject('/UF'): file_entry_obj,
            })

        xmp_filename = self.factx.flavor.details['xmp_filename']
        fname_obj = createStringObject(xmp_filename)
        filespec_dict = DictionaryObject({
            NameObject("/AFRelationship"): NameObject("/Data"),
            NameObject("/Desc"): createStringObject("Factur-X Invoice"),
            NameObject("/Type"): NameObject("/Filespec"),
            NameObject("/F"): fname_obj,
            NameObject("/EF"): ef_dict,
            NameObject("/UF"): fname_obj,
            })
        filespec_obj = self._addObject(filespec_dict)
        name_arrayobj_cdict = {fname_obj: filespec_obj}

        # TODO: add back additional attachments?
        logger.debug('name_arrayobj_cdict=%s', name_arrayobj_cdict)
        # Entries are sorted by file name before being flattened into the
        # [name, filespec, name, filespec, ...] array stored under /Names.
        name_arrayobj_content_sort = list(
            sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
        logger.debug('name_arrayobj_content_sort=%s', name_arrayobj_content_sort)
        name_arrayobj_content_final = []
        af_list = []
        for (entry_name_obj, entry_filespec_obj) in name_arrayobj_content_sort:
            name_arrayobj_content_final += [entry_name_obj, entry_filespec_obj]
            af_list.append(entry_filespec_obj)
        embedded_files_names_dict = DictionaryObject({
            NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
            })

        # Then create the entry for the root, as it needs a
        # reference to the Filespec
        embedded_files_dict = DictionaryObject({
            NameObject("/EmbeddedFiles"): embedded_files_names_dict,
            })
        res_output_intents = []
        logger.debug('output_intents=%s', output_intents)
        for output_intent_dict, dest_output_profile_dict in output_intents:
            dest_output_profile_obj = self._addObject(
                dest_output_profile_dict)
            # TODO detect if there are no other objects in output_intent_dest_obj
            # than /DestOutputProfile
            output_intent_dict.update({
                NameObject("/DestOutputProfile"): dest_output_profile_obj,
                })
            output_intent_obj = self._addObject(output_intent_dict)
            res_output_intents.append(output_intent_obj)

        # Update the root
        xmp_level_str = self.factx.flavor.details['levels'][self.factx.flavor.level]['xmp_str']
        xmp_template = self.factx.flavor.get_xmp_xml()
        metadata_xml_str = _prepare_pdf_metadata_xml(xmp_level_str, xmp_filename, xmp_template, pdf_metadata)
        metadata_file_entry = DecodedStreamObject()
        metadata_file_entry.setData(metadata_xml_str)
        metadata_file_entry.update({
            NameObject('/Subtype'): NameObject('/XML'),
            NameObject('/Type'): NameObject('/Metadata'),
            })
        metadata_obj = self._addObject(metadata_file_entry)
        af_value_obj = self._addObject(ArrayObject(af_list))
        self._root_object.update({
            NameObject("/AF"): af_value_obj,
            NameObject("/Metadata"): metadata_obj,
            NameObject("/Names"): embedded_files_dict,
            # show attachments when opening PDF
            NameObject("/PageMode"): NameObject("/UseAttachments"),
            })
        logger.debug('res_output_intents=%s', res_output_intents)
        # Only set /OutputIntents when at least one intent was supplied.
        if res_output_intents:
            self._root_object.update({
                NameObject("/OutputIntents"): ArrayObject(res_output_intents),
            })
        metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
        self.addMetadata(metadata_txt_dict)
Пример #20
0
# Document properties of the source PDF; copied into the writer below.
props = pdf_in.documentInfo

# Create the writer.
writer = PdfFileWriter()

# Copy the pages (content) into the writer.
print(range(pdf_in.getNumPages()))
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))

# Reference the protected member of the writer class (_info): its document-information dictionary.
infoDict = writer._info.getObject()

# Copy the properties of the old document into the writer.
for key in props:
    infoDict.update({NameObject(key): createStringObject(props[key])})

# Properties to assign; a value of None leaves the corresponding entry untouched.
title = 'titletest'
keywords = None

if title is not None:
    infoDict.update({NameObject('/Title'): createStringObject(title)})
if keywords is not None:
    infoDict.update({NameObject('/Keywords'): createStringObject(keywords)})

# Other entries can be set the same way, for example:
# infoDict.update({NameObject('/Title'): createStringObject('testtest')})
# infoDict.update({NameObject('/Subject'): createStringObject('subtest')})
# infoDict.update({NameObject('/Keywords'): createStringObject('keytest')})
# infoDict.update({NameObject('/Category'): createStringObject('categorytest')})
# infoDict.update({NameObject('/Comments'): createStringObject('Commentstest')})

# Open the output file.
# Build a title from the first row of an attribute text file and store it as the
# '/Title' entry of a copy of the input PDF.
# NOTE(review): this fragment uses Python 2-only constructs (the `file()` builtin and the
# `print` statement), and the non-raw Windows paths contain `\U`, which Python 3 rejects
# in a normal string literal.
from PyPDF2 import PdfFileWriter, PdfFileReader
# filename=raw_input("Pdf file hyperlink?")
# Hard-coded path of the source PDF.
filename = "M:\Engineering\ESo\Standard Plan\Updated\OCS\Typical Pushoff Cantilever Assemblies Layout1 (1).pdf"
# NOTE(review): `fin` is not closed anywhere in the visible code.
fin = file(filename, 'rb')  # Open the source PDF in binary mode
pdf_in = PdfFileReader(fin)  # use PyPDF2 file reader

# Grabbing and processing information from Attext produced .txt
# AttFile=raw_input("File with attributes?")
AttFile = "C:\Users\jli\Documents\CurrentConfiguration\Typical Pushoff Cantilever Assemblies.txt"
# NOTE(review): `AttFiletemp` is not closed anywhere in the visible code.
AttFiletemp = open(AttFile, 'r')
# Each row of the file becomes a list of its comma-separated fields.
Attributes = []
for row in AttFiletemp:
    Attributes.append(row.strip().split(','))
# print Attributes

# The title is fields 0, 1 and 3 of the first row joined by ';', with single quotes removed.
# Raises IndexError when the file is empty or the first row has fewer than four fields.
AttributesTitle = Attributes[0][0] + ';' + Attributes[0][1] + ';' + Attributes[
    0][3]
AttributesTitle = AttributesTitle.replace("'", '')

from PyPDF2.generic import NameObject, createStringObject
writer = PdfFileWriter()  # New, empty PDF writer
# Copy every page of the source PDF into the writer.
for page in range(pdf_in.getNumPages()):
    writer.addPage(pdf_in.getPage(page))
infoDict = writer._info.getObject()  # Document-information dictionary of the writer
info = pdf_in.documentInfo
# NOTE(review): the loop body never uses `key`; it sets the same '/Title' once per existing
# info entry (and not at all when `info` is empty). Other snippets copy `info[key]` here -
# confirm whether that was the intent.
for key in info:
    infoDict.update(
        {NameObject('/Title'): createStringObject(AttributesTitle)})
# This also sets the title on the reader's in-memory info object, not only on the writer.
info.update({NameObject('/Title'): createStringObject(AttributesTitle)})
print infoDict
    raise Exception("No files found in the PDF metadata directory. No work to be done.")
# Walk over the metadata PDFs, adjust their Title/Producer info entries in memory and open
# the PDF of the same name from `pdf_path`.
# NOTE(review): the `print` statement below is Python 2 syntax; the fragment is cut off
# after the last line, so what happens with `pdf_input` is not visible here.
print "Found", files_found, "files"

# for each metadata PDF
for file_no in range(files_found):
    current_filename = metadata_files[file_no]
    
    # Load the original PDF metadata
    if os.path.isfile(os.path.join(metadata_path,current_filename)):
        # NOTE(review): the file object passed to the reader is never closed in the visible code.
        pdf_metadata_input = PdfFileReader(open(os.path.join(metadata_path,current_filename), "rb"))
        pdf_metadata = pdf_metadata_input.getDocumentInfo()
        
        # If there is a Title field, replace it with titlecase(title) and remember the title
        # (presumably `titlecase` normalises capitalisation - defined outside this fragment).
        # NOTE(review): `!= None` could be `is not None`; `pdf_metadata` itself is not
        # checked for None before `.title` is read.
        if pdf_metadata.title != None:
            pdf_metadata.update({
                NameObject('/Title'): createStringObject(titlecase(pdf_metadata.title))
            })
            # Read back after the update above.
            pdf_title = pdf_metadata.title
        else:
            pdf_title = ''
        
        # If there is a Producer field set it to ""
        if pdf_metadata.producer != None:
            pdf_metadata.update({
                NameObject('/Producer'): createStringObject(u'')
            })
        
        # If the same file name exists in the PDF directory load it 
        if os.path.isfile(os.path.join(pdf_path,current_filename)):
            # NOTE(review): as above, this file object is not closed in the visible code.
            pdf_input =  PdfFileReader(open(os.path.join(pdf_path,current_filename), "rb"))
    
# Collect the document-info entries of the image PDF that can be written to the
# final PDF, add our producer signature, then write the output file.
info_dict_output = {}
ipdf_info = imagepdf.documentInfo
# Signature appended to an existing producer entry, or used as the producer itself.
our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)"
read_producer = False
PRODUCER_KEY = "/Producer"
# Nothing to copy when the image PDF has no document info.
for key in (ipdf_info if ipdf_info is not None else ()):
    value = ipdf_info[key]
    if key == PRODUCER_KEY:
        value = value + "; " + our_name
        read_producer = True
    try:
        # Probe only: checks that the pypdf API accepts the value; the converted
        # object is not used afterwards in this fragment.
        testConversion = createStringObject(value)
    except TypeError:
        # This can happen with some array properties.
        print("Warning: property " + key + " not copied to final PDF")
    else:
        info_dict_output[key] = value

# No producer entry was seen: our signature becomes the producer.
if not read_producer:
    info_dict_output[PRODUCER_KEY] = our_name

output.addMetadata(info_dict_output)

# The output path is the third command-line argument.
with open(sys.argv[3], 'wb') as f:
    output.write(f)
#
#
Пример #24
0
    # print("Img:", imagepage.mediaBox.upperRight)
    # print("Text:", textpage.mediaBox.upperRight)
    factor_x = textpage.mediaBox.upperRight[0] / imagepage.mediaBox.upperRight[0]
    factor_y = textpage.mediaBox.upperRight[1] / imagepage.mediaBox.upperRight[1]
    # print(factor_x, factor_y)
    imagepage.scale(float(factor_x), float(factor_y))
    textpage.mergePage(imagepage)  # imagepage stay on top
    textpage.compressContentStreams()
    output.addPage(textpage)
#
# Copy the document info of the image PDF into the output's info dictionary,
# appending our signature to the producer entry, then write the output file.
info_dict_output = output._info.getObject()
ipdf_info = imagepdf.documentInfo
# Our signature as a producer
our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)"
read_producer = False
PRODUCER_KEY = "/Producer"
# documentInfo can be None (no info dictionary in the image PDF); iterating it would raise.
if ipdf_info is not None:
    for key in ipdf_info:
        value = ipdf_info[key]
        try:
            if key == PRODUCER_KEY:
                value = value + "; " + our_name
            string_value = createStringObject(value)
        except TypeError:
            # Values that are not strings (for example array properties) cannot be
            # converted; skip them instead of aborting the whole run.
            print("Warning: property " + key + " not copied to final PDF")
            continue
        info_dict_output.update({NameObject(key): string_value})
        if key == PRODUCER_KEY:
            read_producer = True

# No usable producer entry was copied: our signature becomes the producer.
if not read_producer:
    info_dict_output.update({NameObject(PRODUCER_KEY): createStringObject(our_name)})

# The output path is the third command-line argument.
with open(sys.argv[3], 'wb') as f:
    output.write(f)
#
Пример #25
0
def _facturx_update_metadata_add_attachment(pdf_filestream, facturx_xml_str, pdf_metadata, facturx_level,
                                            output_intents):
    """Embed the Factur-X XML in ``pdf_filestream`` and update its metadata.

    :param pdf_filestream: PDF writer that is modified in place; must provide
        ``_addObject``, ``_root_object`` and ``addMetadata``.
    :param facturx_xml_str: content of the XML file to embed; it is hashed
        with MD5, so it must be acceptable to ``hashlib.md5``.
    :param pdf_metadata: metadata values forwarded to
        ``_prepare_pdf_metadata_xml`` and ``_prepare_pdf_metadata_txt``.
    :param facturx_level: level forwarded to ``_prepare_pdf_metadata_xml``.
    :param output_intents: iterable of
        ``(output_intent_dict, dest_output_profile_dict)`` pairs; each profile
        is added as an object and referenced from its intent through
        ``/DestOutputProfile``.
    :return: None.
    """
    md5sum = hashlib.md5(facturx_xml_str).hexdigest()
    md5sum_obj = createStringObject(md5sum)
    # /ModDate must be a PDF date string (D:YYYYMMDDHHmmSS...), not ISO 8601.
    # Local time without an offset, as the original call to now() produced.
    mod_date = datetime.datetime.now().strftime("D:%Y%m%d%H%M%S")
    params_dict = DictionaryObject({
        NameObject('/CheckSum'): md5sum_obj,
        NameObject('/ModDate'): createStringObject(mod_date),
        NameObject('/Size'): NameObject(str(len(facturx_xml_str))),
    })
    file_entry = DecodedStreamObject()
    file_entry.setData(facturx_xml_str)  # here we integrate the file itself
    file_entry.update({
        NameObject("/Type"): NameObject("/EmbeddedFile"),
        NameObject("/Params"): params_dict,
        # 2F is '/' in hexadecimal
        NameObject("/Subtype"): NameObject("/text#2Fxml"),
    })
    file_entry_obj = pdf_filestream._addObject(file_entry)
    # The Filespec entry
    ef_dict = DictionaryObject({
        NameObject("/F"): file_entry_obj,
        NameObject('/UF'): file_entry_obj,
    })

    fname_obj = createStringObject("ZUGFeRD-invoice.xml")
    filespec_dict = DictionaryObject({
        NameObject("/AFRelationship"): NameObject("/Data"),
        NameObject("/Desc"): createStringObject("Factur-X Invoice"),
        NameObject("/Type"): NameObject("/Filespec"),
        NameObject("/F"): fname_obj,
        NameObject("/EF"): ef_dict,
        NameObject("/UF"): fname_obj,
    })
    filespec_obj = pdf_filestream._addObject(filespec_dict)
    name_arrayobj_cdict = {fname_obj: filespec_obj}
    # Entries are sorted by file name before being flattened into the
    # [name, filespec, name, filespec, ...] array stored under /Names.
    name_arrayobj_content_sort = list(
        sorted(name_arrayobj_cdict.items(), key=lambda x: x[0]))
    name_arrayobj_content_final = []
    af_list = []
    for (entry_name_obj, entry_filespec_obj) in name_arrayobj_content_sort:
        name_arrayobj_content_final += [entry_name_obj, entry_filespec_obj]
        af_list.append(entry_filespec_obj)
    embedded_files_names_dict = DictionaryObject({
        NameObject("/Names"): ArrayObject(name_arrayobj_content_final),
    })
    # Then create the entry for the root, as it needs a
    # reference to the Filespec
    embedded_files_dict = DictionaryObject({
        NameObject("/EmbeddedFiles"): embedded_files_names_dict,
    })
    res_output_intents = []
    for output_intent_dict, dest_output_profile_dict in output_intents:
        dest_output_profile_obj = pdf_filestream._addObject(
            dest_output_profile_dict)
        # TODO detect if there are no other objects in output_intent_dest_obj
        # than /DestOutputProfile
        output_intent_dict.update({
            NameObject("/DestOutputProfile"): dest_output_profile_obj,
        })
        output_intent_obj = pdf_filestream._addObject(output_intent_dict)
        res_output_intents.append(output_intent_obj)
    # Update the root
    metadata_xml_str = _prepare_pdf_metadata_xml(facturx_level, pdf_metadata)
    metadata_file_entry = DecodedStreamObject()
    metadata_file_entry.setData(metadata_xml_str)
    metadata_file_entry.update({
        NameObject('/Subtype'): NameObject('/XML'),
        NameObject('/Type'): NameObject('/Metadata'),
    })
    metadata_obj = pdf_filestream._addObject(metadata_file_entry)
    af_value_obj = pdf_filestream._addObject(ArrayObject(af_list))
    pdf_filestream._root_object.update({
        NameObject("/AF"): af_value_obj,
        NameObject("/Metadata"): metadata_obj,
        NameObject("/Names"): embedded_files_dict,
        # show attachments when opening PDF
        NameObject("/PageMode"): NameObject("/UseAttachments"),
    })
    # Only set /OutputIntents when at least one intent was supplied.
    if res_output_intents:
        pdf_filestream._root_object.update({
            NameObject("/OutputIntents"): ArrayObject(res_output_intents),
        })
    metadata_txt_dict = _prepare_pdf_metadata_txt(pdf_metadata)
    pdf_filestream.addMetadata(metadata_txt_dict)
Пример #26
0
        0]
    factor_y = textpage.mediaBox.upperRight[1] / imagepage.mediaBox.upperRight[
        1]
    # print(factor_x, factor_y)
    imagepage.scale(float(factor_x), float(factor_y))
    textpage.mergePage(imagepage)  # imagepage stay on top
    textpage.compressContentStreams()
    output.addPage(textpage)
#
# Transfer the image PDF's document info to the output's info dictionary, with our
# signature appended to the producer entry, then write the output file.
info_dict_output = output._info.getObject()
ipdf_info = imagepdf.documentInfo
# Our signature as a producer
our_name = "PDF2PDFOCR(github.com/LeoFCardoso/pdf2pdfocr)"
read_producer = False
PRODUCER_KEY = "/Producer"
# documentInfo can be None (no info dictionary in the image PDF); iterating it would raise.
if ipdf_info is not None:
    for key in ipdf_info:
        value = ipdf_info[key]
        try:
            if key == PRODUCER_KEY:
                value = value + "; " + our_name
            string_value = createStringObject(value)
        except TypeError:
            # Values that are not strings (for example array properties) cannot be
            # converted; skip them instead of aborting the whole run.
            print("Warning: property " + key + " not copied to final PDF")
            continue
        info_dict_output.update({NameObject(key): string_value})
        if key == PRODUCER_KEY:
            read_producer = True

# No usable producer entry was copied: our signature becomes the producer.
if not read_producer:
    info_dict_output.update(
        {NameObject(PRODUCER_KEY): createStringObject(our_name)})

# The output path is the third command-line argument.
with open(sys.argv[3], 'wb') as f:
    output.write(f)
#
Пример #27
0
def main():
    """Write metadata-sanitized copies of every PDF found in a folder.

    The folder is the single optional command-line argument and defaults to
    the current directory. Every file returned by ``find_ext(folder, "pdf")``
    is copied page by page into ``<folder>/results/<name>-sanitized.pdf``
    with fixed placeholder values for title, author, subject, creator,
    producer and keywords. The original metadata is printed for information.

    Calls ``sys.exit(-1)`` when more than one argument is given.
    """
    # path of all pdf all_pdf_files
    # very bad implementation. todo: use getopt
    my_path = "."  # default path is the current folder
    if len(sys.argv) == 2:
        my_path = sys.argv[1]
    elif len(sys.argv) > 2:
        print("Something went wrong. Check your arguments.")
        sys.exit(-1)

    # create results directory if not exists
    os.makedirs(my_path + "/results", exist_ok=True)

    print("Path given: ", my_path, "\n")

    # get all pdf files from specified folder
    all_pdf_files = find_ext(my_path, "pdf")

    for c, pdf in enumerate(all_pdf_files, start=1):
        # Keep the source open until the copy has been written (the writer still
        # refers to the reader's pages), then close it instead of leaking one
        # handle per processed file.
        with open(pdf, "rb") as inputStream:
            # strict=False for Windows support - PdfReadWarning: Superfluous whitespace found in object header
            inputPdf = PdfFileReader(inputStream, strict=False)
            docInfo = inputPdf.getDocumentInfo()

            # create metadata patch
            output = PdfFileWriter()
            infoDict = output._info.getObject()
            infoDict.update({
                NameObject('/Title'):
                createStringObject(u'title removed'),
                NameObject('/Author'):
                createStringObject(u'author removed'),
                NameObject('/Subject'):
                createStringObject(u'subject removed'),
                NameObject('/Creator'):
                createStringObject(u'software quality script'),
                NameObject('/Producer'):
                createStringObject(u'software quality script'),
                NameObject('/Keywords'):
                createStringObject(u'software, quality, sanitized')
            })

            # get filename from path
            filename = os.path.basename(pdf)
            # remove the extension
            filename_noext = os.path.splitext(filename)[0]

            # print some information; docInfo is None for a PDF without an info
            # dictionary, in which case every field is reported as None
            print(c, ". Processing file: ", pdf)
            print("filename: ", filename)
            print("Title: ", getattr(docInfo, "title", None))
            print("Author: ", getattr(docInfo, "author", None))
            print("Subject: ", getattr(docInfo, "subject", None))
            print("Producer: ", getattr(docInfo, "producer", None))
            print("Creator: ", getattr(docInfo, "creator", None))

            # create new pdf
            for page in range(inputPdf.getNumPages()):
                output.addPage(inputPdf.getPage(page))

            print("\n")

            # write new file in results subfolder
            # the new file name will be the old one extended with "-sanitized.pdf"
            sanitized_path = my_path + "/results/" + filename_noext + "-sanitized.pdf"
            with open(sanitized_path, 'wb') as outputStream:
                output.write(outputStream)
Пример #28
0
def add_background_to_pdf(
    filename_in,
    filename_out=None,
    filename_letterhead=None,
    filename_background=None,
    new_title="",
    new_author="",
):
    """Merge a one-page letterhead into a PDF and set its title and author.

    The first page of ``filename_in`` is merged onto page 0 of the letterhead;
    every following page is merged onto page 0 of the background. The source
    document info is copied and ``/Title`` and ``/Author`` are then overwritten.

    :param filename_in: path of the PDF to decorate.
    :param filename_out: path of the result. When empty, the result is written
        to a temporary file which then replaces ``filename_in``.
    :param filename_letterhead: path of the PDF whose first page backs the
        first page. When empty, the function returns without doing anything.
    :param filename_background: path of the PDF whose first page backs all
        other pages; defaults to ``filename_letterhead``.
    :param new_title: value stored as ``/Title``.
    :param new_author: value stored as ``/Author``.
    :return: None.
    """

    if not filename_letterhead:
        return

    use_tmpfile = False

    if not filename_out:
        # Only the name is needed: delete=False keeps the file on disk, and the
        # with-block closes the handle so it is not left open while the path is
        # reopened for writing further down.
        with tempfile.NamedTemporaryFile(
            mode="w+b", delete=False, suffix=".pdf"
        ) as tmp_file:
            filename_out = tmp_file.name
        use_tmpfile = True

    if not filename_background:
        filename_background = filename_letterhead

    with open(filename_in, "rb") as pdf_in, open(
        filename_background, "rb"
    ) as pdf_lb, open(filename_letterhead, "rb") as pdf_lh:

        input_pdf = PdfFileReader(pdf_in)
        output_pdf = PdfFileWriter()

        # metadata
        # noinspection PyProtectedMember
        infodict = output_pdf._info.getObject()
        # documentInfo is None for a PDF without an info dictionary.
        source_info = input_pdf.documentInfo or {}
        for k, v in source_info.items():
            infodict.update({NameObject(k): createStringObject(v)})
        infodict.update({NameObject("/Title"): createStringObject(new_title)})
        infodict.update({NameObject("/Author"): createStringObject(new_author)})

        # add first page
        # get the first invoice page, merge with letterhead
        letterhead = PdfFileReader(pdf_lh).getPage(0)
        letterhead.mergePage(input_pdf.getPage(0))
        output_pdf.addPage(letterhead)
        # add other pages
        for i in range(1, input_pdf.getNumPages()):
            # A new reader per page, as in the original code - presumably so that
            # every merge starts from an unmodified background page.
            background = PdfFileReader(pdf_lb).getPage(0)
            background.mergePage(input_pdf.getPage(i))
            output_pdf.addPage(background)
        # save pdf
        with open(filename_out, "wb") as pdf_out:
            output_pdf.write(pdf_out)

    if use_tmpfile:
        # Replace the input with the result, keeping a backup until the swap is done.
        backup_pdf = "{}.bak".format(filename_in)
        shutil.move(filename_in, backup_pdf)
        shutil.move(filename_out, filename_in)
        os.remove(backup_pdf)