def pdf_encrypt(filename, password):
    """Encrypt the PDF at ``filename`` in place by running pdftk.

    ``password`` may be a scalar (used as both the owner and the user
    password), a list whose first two items are the owner and user
    passwords, or a dict with optional ``owner`` and ``user`` keys (each
    falls back to the literal string ``'password'``).  Printing is the only
    permission granted on the encrypted file.

    Raises DAError when the password has an unsupported type or when pdftk
    exits with an error.
    """
    # isinstance() also accepts subclasses, which exact type() checks reject.
    if isinstance(password, (str, unicode, bool, int, float)):
        owner_password = unicode(password).strip()
        user_password = unicode(password).strip()
    elif isinstance(password, list):
        owner_password = unicode(password[0]).strip()
        user_password = unicode(password[1]).strip()
    elif isinstance(password, dict):
        owner_password = unicode(password.get('owner', 'password')).strip()
        user_password = unicode(password.get('user', 'password')).strip()
    else:
        raise DAError("pdf_encrypt: invalid password")
    outfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    # Only the reserved path is needed; close our own handle so it is not
    # left open while pdftk writes to that path.
    outfile.close()
    commands = ['pdftk', filename, 'output', outfile.name]
    # pdftk only needs owner_pw when it differs from the user password.
    if owner_password != user_password:
        commands.extend(['owner_pw', owner_password])
    commands.extend(['user_pw', user_password, 'allow', 'printing'])
    try:
        subprocess.check_output(commands, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        raise DAError("pdf_encrypt: error running pdftk. " + err.output)
    # Replace the original file with the encrypted copy.
    shutil.move(outfile.name, filename)
def concatenate_files(path_list, pdfa=False, password=None):
    """Concatenate the given files into one new PDF and return its path.

    Images are converted to PDF with ``convert``, word-processing documents
    with ``docassemble.base.pandoc.word_to_pdf``, and PDFs are used as-is.
    Files of any other (or undeterminable) type are skipped, as are images
    that fail to convert.  The result is optionally converted to PDF/A and
    optionally encrypted.

    Raises DAError when no usable input remains or when pdftk fails.
    """
    word_extensions = {
        'application/rtf': 'rtf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.oasis.opendocument.text': 'odt',
    }
    pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
    # Only the reserved path is used; external tools write the content.
    pdf_file.close()
    subprocess_arguments = [PDFTK_PATH]
    new_path_list = []
    for path in path_list:
        mimetype, encoding = mimetypes.guess_type(path)
        if mimetype is None:
            # guess_type() returns None for unknown extensions; treat such
            # files like any other unsupported type instead of crashing.
            continue
        if mimetype.startswith('image'):
            new_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
            new_pdf_file.close()
            args = ["convert", path, new_pdf_file.name]
            result = call(args)
            if result != 0:
                logmessage("failed to convert image to PDF: " + " ".join(args))
                continue
            new_path_list.append(new_pdf_file.name)
        elif mimetype in word_extensions:
            new_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
            new_pdf_file.close()
            ext = word_extensions[mimetype]
            docassemble.base.pandoc.word_to_pdf(path, ext, new_pdf_file.name, pdfa=False)
            new_path_list.append(new_pdf_file.name)
        elif mimetype == 'application/pdf':
            new_path_list.append(path)
    if len(new_path_list) == 0:
        raise DAError("concatenate_files: no valid files to concatenate")
    subprocess_arguments.extend(new_path_list)
    subprocess_arguments.extend(['cat', 'output', pdf_file.name])
    result = call(subprocess_arguments)
    if result != 0:
        logmessage("Failed to concatenate PDF files")
        raise DAError("Call to pdftk failed for concatenation where arguments were " + " ".join(subprocess_arguments))
    if pdfa:
        pdf_to_pdfa(pdf_file.name)
    if password:
        pdf_encrypt(pdf_file.name, password)
    return pdf_file.name
def get_modtime(self, **kwargs):
    """Return the modification time of a file belonging to this object.

    The file defaults to ``self.filename`` and can be overridden with the
    ``filename`` keyword.  When cloud storage is configured and the object
    has not been fixed, the time comes from the cloud key; otherwise it
    comes from the file in ``self.directory``.

    Raises DAError when the file cannot be found in the chosen location.
    """
    filename = kwargs.get('filename', self.filename)
    if cloud is None or self.fixed:
        # Local copy is authoritative.
        local_path = os.path.join(self.directory, filename)
        if os.path.isfile(local_path):
            return os.path.getmtime(local_path)
        raise DAError("get_modtime: file " + filename + " in " + self.section + " did not exist")
    key_name = '/'.join([str(self.section), str(self.file_number), path_to_key(filename)])
    key = cloud.search_key(key_name)
    if key is not None and key.does_exist:
        return key.get_epoch_modtime()
    raise DAError("get_modtime: file " + filename + " in " + self.section + " did not exist")
def concatenate_files(path_list, pdfa=False, password=None):
    """Concatenate the given files into one new PDF and return its path.

    Images are converted with the configured ImageMagick binary,
    word-processing documents with ``word_to_pdf``, and PDFs are used
    as-is.  Files of any other (or undeterminable) type are skipped, as are
    images that fail to convert.  External commands are limited to 60
    seconds each.  The result is optionally converted to PDF/A, then passed
    to ``replicate_js_and_calculations`` together with the first input.

    Raises DAError when no usable input remains or when pdftk fails.
    """
    word_extensions = {
        'application/rtf': 'rtf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/msword': 'doc',
        'application/vnd.oasis.opendocument.text': 'odt',
    }
    pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
    # Only the reserved path is used; external tools write the content.
    pdf_file.close()
    subprocess_arguments = [PDFTK_PATH]
    new_path_list = []
    for path in path_list:
        mimetype, encoding = mimetypes.guess_type(path)
        if mimetype is None:
            # guess_type() returns None for unknown extensions; treat such
            # files like any other unsupported type instead of crashing.
            continue
        if mimetype.startswith('image'):
            new_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
            new_pdf_file.close()
            args = [daconfig.get('imagemagick', 'convert'), path, new_pdf_file.name]
            try:
                result = subprocess.run(args, timeout=60, check=False).returncode
            except subprocess.TimeoutExpired:
                logmessage("concatenate_files: convert took too long")
                result = 1
            if result != 0:
                logmessage("failed to convert image to PDF: " + " ".join(args))
                continue
            new_path_list.append(new_pdf_file.name)
        elif mimetype in word_extensions:
            new_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
            new_pdf_file.close()
            ext = word_extensions[mimetype]
            word_to_pdf(path, ext, new_pdf_file.name, pdfa=False)
            new_path_list.append(new_pdf_file.name)
        elif mimetype == 'application/pdf':
            new_path_list.append(path)
    if len(new_path_list) == 0:
        raise DAError("concatenate_files: no valid files to concatenate")
    subprocess_arguments.extend(new_path_list)
    subprocess_arguments.extend(['cat', 'output', pdf_file.name])
    try:
        result = subprocess.run(subprocess_arguments, timeout=60, check=False).returncode
    except subprocess.TimeoutExpired:
        result = 1
        logmessage("concatenate_files: call to cat took too long")
    if result != 0:
        logmessage("Failed to concatenate PDF files")
        raise DAError("Call to pdftk failed for concatenation where arguments were " + " ".join(subprocess_arguments))
    if pdfa:
        pdf_to_pdfa(pdf_file.name)
    replicate_js_and_calculations(new_path_list[0], pdf_file.name, password)
    return pdf_file.name
def concatenate_files(path_list):
    """Join word-processing documents into one .docx and return its path.

    RTF, DOC and ODT inputs are first converted to .docx through
    ``docassemble.base.pandoc.convert_file``; .docx inputs are used as-is;
    anything else is ignored.  A single usable input is returned directly
    without composing a new file.

    Raises DAError when no usable input is found.
    """
    source_formats = {
        'application/rtf': 'rtf',
        'application/msword': 'doc',
        'application/vnd.oasis.opendocument.text': 'odt',
    }
    docx_mimetype = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
    docx_paths = []
    for path in path_list:
        mimetype = mimetypes.guess_type(path)[0]
        if mimetype in source_formats:
            converted = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".docx", delete=False)
            docassemble.base.pandoc.convert_file(path, converted.name, source_formats[mimetype], 'docx')
            docx_paths.append(converted.name)
        elif mimetype == docx_mimetype:
            docx_paths.append(path)
    if not docx_paths:
        raise DAError("concatenate_files: no valid files to concatenate")
    if len(docx_paths) == 1:
        return docx_paths[0]
    # The first document is the base; the others are appended in order.
    composer = Composer(docx.Document(docx_paths[0]))
    for later_path in docx_paths[1:]:
        composer.append(docx.Document(later_path))
    docx_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".docx", delete=False)
    composer.save(docx_file.name)
    return docx_file.name
def pdf_encrypt(filename, password):
    """Encrypt the PDF at ``filename`` in place by running pdftk.

    The owner and user passwords are derived from ``password`` by
    ``get_passwords``.  Printing is the only permission granted on the
    encrypted file.

    Raises DAError, including pdftk's output, when pdftk exits with an error.
    """
    (owner_password, user_password) = get_passwords(password)
    outfile = tempfile.NamedTemporaryFile(prefix="datemp", suffix=".pdf", delete=False)
    # Only the reserved path is needed; close our own handle so it is not
    # left open while pdftk writes to that path.
    outfile.close()
    commands = ['pdftk', filename, 'output', outfile.name]
    # pdftk only needs owner_pw when it differs from the user password.
    if owner_password != user_password:
        commands.extend(['owner_pw', owner_password])
    commands.extend(['user_pw', user_password, 'allow', 'printing'])
    try:
        subprocess.check_output(commands, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        output = err.output
        # check_output() captures bytes; concatenating them to a str would
        # raise TypeError and hide the real pdftk error.
        if isinstance(output, bytes):
            output = output.decode('utf-8', 'replace')
        raise DAError("pdf_encrypt: error running pdftk. " + (output or '')) from err
    # Replace the original file with the encrypted copy.
    shutil.move(outfile.name, filename)
def apply_qpdf(filename):
    """Repair the PDF at ``filename`` in place if pypdf cannot read it.

    Nothing happens when pypdf opens the file without a PdfReadError.
    Otherwise qpdf rewrites the file to a temporary path (60 second limit),
    the rewritten file is checked with pypdf, and it is then copied over
    the original.

    Raises DAError when qpdf fails or the rewritten file is still unreadable.
    """
    try:
        # The reader is only used as a validity probe, so the stream can be
        # closed as soon as construction succeeds.
        with open(filename, 'rb') as pdf_stream:
            pypdf.PdfFileReader(pdf_stream, overwriteWarnings=False)
    except pypdf.utils.PdfReadError:
        pass  # unreadable: fall through to the repair attempt
    else:
        return
    try:
        new_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
        new_file.close()
        qpdf_subprocess_arguments = [QPDF_PATH, filename, new_file.name]
        try:
            result = subprocess.run(qpdf_subprocess_arguments, timeout=60, check=False).returncode
        except subprocess.TimeoutExpired:
            result = 1
            logmessage("apply_qpdf: call to qpdf took too long")
        if result != 0:
            logmessage("Failed to convert PDF " + str(filename))
            logmessage("Call to qpdf failed for " + str(filename) + " where arguments were " + " ".join(qpdf_subprocess_arguments))
            raise Exception("qpdf error")
        with open(new_file.name, 'rb') as repaired_stream:
            pypdf.PdfFileReader(repaired_stream, overwriteWarnings=False)
    except Exception as err:
        # Any failure in the repair attempt is reported uniformly; the
        # original exception is kept as the cause for debugging.
        raise DAError("Could not fix PDF") from err
    shutil.copyfile(new_file.name, filename)
def finalize(self):
    """Synchronize the files in ``self.directory`` with cloud storage.

    Does nothing when no cloud backend is configured.  Raises DAError if
    the object has not been fixed.  Local files are uploaded unless their
    modification time matches the recorded one, and cloud keys whose file
    is no longer listed in the directory are deleted.
    """
    if cloud is None:
        return
    if not self.fixed:
        raise DAError("SavedFile: finalize called before fix")
    # Names seen locally; used below to find keys that should be deleted.
    existing_files = list()
    for filename in os.listdir(self.directory):
        existing_files.append(filename)
        fullpath = os.path.join(self.directory, filename)
        #logmessage("Found " + fullpath)
        if os.path.isfile(fullpath):
            save = True
            if filename in self.keydict:
                key = self.keydict[filename]
                # Unchanged since it was recorded: no upload needed.
                if self.modtimes[filename] == os.path.getmtime(fullpath):
                    save = False
            else:
                # No known key for this file yet: obtain one and set its
                # content type from the file name.
                key = cloud.get_key(
                    str(self.section) + '/' + str(self.file_number) + '/' +
                    str(filename))
                # The primary file may be stored without its extension, so
                # the extension is appended before guessing the MIME type.
                if self.extension is not None and filename == self.filename:
                    extension, mimetype = get_ext_and_mimetype(
                        filename + '.' + self.extension)
                else:
                    extension, mimetype = get_ext_and_mimetype(filename)
                key.content_type = mimetype
            if save:
                key.set_contents_from_filename(fullpath)
    for filename, key in self.keydict.iteritems():
        if filename not in existing_files:
            #logmessage("Deleting filename " + str(filename) + " from cloud")
            key.delete()
    return
def alchemy_url(db_config):
    """Build an SQLAlchemy connection URL from ``daconfig[db_config]``.

    The configuration entry must be a dict containing ``name``; ``user``,
    ``password``, ``host``, ``port`` and ``prefix`` are optional.  The host
    defaults to ``localhost`` when a user is given without a host, and the
    prefix defaults to ``postgresql+psycopg2://``.  The database name is
    not appended for prefixes starting with ``oracle``.

    NOTE(review): user and password are inserted verbatim, without URL
    escaping.

    Raises Exception when the configuration entry is missing or invalid,
    and DAError when the configured database name is None.
    """
    if db_config not in daconfig or (not isinstance(daconfig[db_config], dict)) or 'name' not in daconfig[db_config]:
        raise Exception("alchemy_connection_string: missing or invalid configuration for " + db_config)
    settings = daconfig[db_config]
    dbuser = settings.get('user', None)
    dbpassword = settings.get('password', None)
    dbhost = settings.get('host', None)
    if dbhost is None and dbuser is not None:
        dbhost = 'localhost'
    dbport = settings.get('port', None)
    dbprefix = settings.get('prefix', 'postgresql+psycopg2://')
    dbname = settings['name']
    alchemy_connect_string = ""
    if dbprefix is not None:
        alchemy_connect_string += dbprefix
    if dbuser is not None:
        alchemy_connect_string += dbuser
        if dbpassword is not None:
            alchemy_connect_string += ":" + dbpassword
        else:
            alchemy_connect_string += ":"
        if dbhost is not None:
            alchemy_connect_string += '@' + dbhost
            if dbport is not None:
                alchemy_connect_string += ':' + str(dbport)
    else:
        alchemy_connect_string += '@'
    # The prefix may be explicitly configured as None (handled above), so
    # guard before calling a str method on it.
    if dbprefix is None or not dbprefix.startswith('oracle'):
        if dbname is not None:
            alchemy_connect_string += "/" + dbname
        else:
            raise DAError("No database name provided")
    return alchemy_connect_string
def __init__(self, section='', project='default'):
    """Set up a Playground object for the logged-in user.

    Records the user's id and the current thread info, stores ``section``
    and ``project``, and refreshes the file list.

    Raises DAError when the current user is anonymous.
    """
    thread_info = docassemble.base.functions.this_thread.current_info
    if thread_info['user']['is_anonymous']:
        raise DAError("Users must be logged in to create Playground objects")
    self.user_id = thread_info['user']['theid']
    self.current_info = thread_info
    self.section = section
    self.project = project
    self._update_file_list()
def size_in_bytes(self, **kwargs):
    """Return the size in bytes of a file belonging to this object.

    The file defaults to ``self.filename`` and can be overridden with the
    ``filename`` keyword.  When cloud storage is configured and the object
    has not been fixed, the size comes from the cloud key; otherwise it
    comes from the file in ``self.directory``.

    Raises DAError when the cloud key cannot be found.
    """
    filename = kwargs.get('filename', self.filename)
    if cloud is None or self.fixed:
        local_path = os.path.join(self.directory, filename)
        return os.path.getsize(local_path)
    key_name = '/'.join([str(self.section), str(self.file_number), path_to_key(filename)])
    key = cloud.search_key(key_name)
    if key is not None and key.does_exist:
        return key.size
    raise DAError("size_in_bytes: file " + filename + " in " + self.section + " did not exist")
def fill_template(template, data_strings=None, data_names=None, hidden=None, readonly=None, images=None, pdf_url=''):
    """Fill the PDF form ``template`` and return the path of the new PDF.

    The field data is written to an FDF file with ``fdfgen.forge_fdf`` and
    applied (and flattened) with pdftk.  Each ``(field, file_info)`` pair
    in ``images`` is then stamped onto the page of the ``/Sig`` field with
    that name: the image at ``file_info['fullpath']`` is trimmed, turned
    into an overlay PDF with ``convert``, and merged into the page.
    Images whose field is unknown or whose conversion fails are skipped.

    Raises DAError when the pdftk call fails.
    """
    # None defaults avoid sharing one mutable list across calls.
    if data_strings is None:
        data_strings = []
    if data_names is None:
        data_names = []
    if hidden is None:
        hidden = []
    if readonly is None:
        readonly = []
    if images is None:
        images = []
    fdf = fdfgen.forge_fdf(pdf_url, data_strings, data_names, hidden, readonly)
    fdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".fdf", delete=False)
    fdf_file.write(fdf)
    fdf_file.close()
    pdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf", delete=False)
    subprocess_arguments = [PDFTK_PATH, template, 'fill_form', fdf_file.name, 'output', pdf_file.name, 'flatten']
    result = call(subprocess_arguments)
    if result != 0:
        logmessage("Failed to fill PDF form " + str(template))
        raise DAError("Call to pdftk failed for template " + str(template) + " where arguments were " + " ".join(subprocess_arguments))
    if len(images):
        # Signature fields are the only valid image targets.
        fields = {}
        for field, default, pageno, rect, field_type in read_fields(template):
            if str(field_type) == '/Sig':
                fields[field] = {'pageno': pageno, 'rect': rect}
        for field, file_info in images:
            if field not in fields:
                logmessage("field name " + str(field) + " not found in PDF file")
                continue
            logmessage("Need to put image on page " + str(fields[field]['pageno']))
            temp_png = tempfile.NamedTemporaryFile(mode="wb", suffix=".png")
            args = ["convert", file_info['fullpath'], "-trim", "+repage", temp_png.name]
            result = call(args)
            # Any non-zero exit status means the conversion failed.
            if result != 0:
                logmessage("failed to trim file: " + " ".join(args))
                continue
            im = Image.open(temp_png.name)
            width, height = im.size
            xone, yone, xtwo, ytwo = fields[field]['rect']
            # Use the larger of the two pixels-per-unit ratios so the image
            # fits inside the field rectangle in both dimensions.
            dppx = width/(xtwo-xone)
            dppy = height/(ytwo-yone)
            if dppx > dppy:
                dpp = dppx
            else:
                dpp = dppy
            extent_x, extent_y = xone*dpp+width, yone*dpp+height
            overlay_pdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf")
            args = ["convert", temp_png.name, "-background", "none", "-density", str(int(dpp*72)), "-gravity", "NorthEast", "-extent", str(int(extent_x)) + 'x' + str(int(extent_y)), overlay_pdf_file.name]
            result = call(args)
            if result != 0:
                logmessage("failed to make overlay: " + " ".join(args))
                continue
            new_pdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf")
            # Both input streams must stay open until the writer has
            # finished writing the merged pages.
            with open(pdf_file.name, "rb") as inFile, open(overlay_pdf_file.name, "rb") as overlay:
                original = pypdf.PdfFileReader(inFile)
                background = original.getPage(fields[field]['pageno']-1)
                foreground = pypdf.PdfFileReader(overlay).getPage(0)
                background.mergePage(foreground)
                writer = pypdf.PdfFileWriter()
                for i in range(original.getNumPages()):
                    page = original.getPage(i)
                    writer.addPage(page)
                with open(new_pdf_file.name, "wb") as outFile:
                    writer.write(outFile)
            shutil.copyfile(new_pdf_file.name, pdf_file.name)
    return pdf_file.name
def fetch_url_post(self, url, post_args, **kwargs):
    """POST ``post_args`` to ``url`` and store the response body as a file.

    The body is written to ``self.directory`` under ``self.filename``, or
    under the ``filename`` keyword when given.  ``self.fix()`` is called
    before the request and ``self.save()`` after the file is written.

    Raises DAError when the response status is not 200.
    """
    filename = kwargs.get('filename', self.filename)
    self.fix()
    r = requests.post(url_sanitize(url), data=post_args)
    if r.status_code != 200:
        # A space now separates the URL from "failed" in the message.
        raise DAError('fetch_url_post: retrieval from ' + url + ' failed')
    with open(os.path.join(self.directory, filename), 'wb') as fp:
        for block in r.iter_content(1024):
            fp.write(block)
    self.save()
def overlay_pdf_multi(main_file, logo_file, out_file):
    """Stamp ``logo_file`` onto ``main_file`` with pdftk ``multistamp``.

    The result is written to ``out_file``.  The pdftk call is limited to
    60 seconds; a timeout is logged and treated as a failure.

    Raises DAError when pdftk fails.
    """
    command = [PDFTK_PATH, main_file, 'multistamp', logo_file, 'output', out_file]
    try:
        completed = subprocess.run(command, timeout=60, check=False)
    except subprocess.TimeoutExpired:
        exit_code = 1
        logmessage("overlay_pdf_multi: call to pdftk took too long")
    else:
        exit_code = completed.returncode
    if exit_code == 0:
        return
    logmessage("Failed to overlay PDF")
    raise DAError("Call to pdftk failed for overlay where arguments were " + " ".join(command))
def flatten_pdf(filename):
    """Flatten the form fields of the PDF at ``filename`` in place.

    pdftk writes the flattened copy to a temporary file, which then
    replaces the original.

    Raises DAError when pdftk fails.
    """
    outfile = tempfile.NamedTemporaryFile(prefix="datemp", suffix=".pdf", delete=False)
    subprocess_arguments = [PDFTK_PATH, filename, 'output', outfile.name, 'flatten']
    result = call(subprocess_arguments)
    if result != 0:
        # Report the file being flattened; the previous code referenced an
        # undefined name here and raised NameError instead of DAError.
        logmessage("Failed to flatten PDF form " + str(filename))
        raise DAError("Call to pdftk failed for template " + str(filename) + " where arguments were " + " ".join(subprocess_arguments))
    shutil.move(outfile.name, filename)
def pdf_to_pdfa(filename):
    """Convert the PDF at ``filename`` to PDF/A in place with Ghostscript.

    Ghostscript runs in a scratch working directory and writes to a
    temporary file, which then replaces the original.  The scratch
    directory is removed afterwards whether or not the conversion worked.

    Raises DAError, including Ghostscript's output, when ``gs`` fails.
    """
    logmessage("pdf_to_pdfa: running")
    outfile = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
    # Only the reserved path is needed; Ghostscript writes the content.
    outfile.close()
    directory = tempfile.mkdtemp()
    commands = ['gs', '-dPDFA', '-dBATCH', '-dNOPAUSE', '-sProcessColorModel=DeviceCMYK', '-sDEVICE=pdfwrite', '-sPDFACompatibilityPolicy=1', '-sOutputFile=' + outfile.name, filename]
    try:
        output = subprocess.check_output(commands, cwd=directory, stderr=subprocess.STDOUT)
    except subprocess.CalledProcessError as err:
        output = err.output
        # Captured output is bytes on Python 3 and cannot be concatenated
        # to a str directly.
        if isinstance(output, bytes):
            output = output.decode('utf-8', 'replace')
        raise DAError("pdf_to_pdfa: error running ghostscript. " + (output or ''))
    finally:
        # mkdtemp() never cleans up after itself.
        shutil.rmtree(directory, ignore_errors=True)
    if isinstance(output, bytes):
        output = output.decode('utf-8', 'replace')
    logmessage(output)
    shutil.move(outfile.name, filename)
def __init__(self, section=''):
    """Set up a Playground object for the logged-in user.

    Records the user's id and the current thread info, stores ``section``,
    creates the backing ``SavedFile`` area named ``'playground'`` plus the
    section, and refreshes the file list.

    Raises DAError when the current user is anonymous.
    """
    thread_info = docassemble.base.functions.this_thread.current_info
    if thread_info['user']['is_anonymous']:
        raise DAError("Users must be logged in to create Playground objects")
    self.user_id = thread_info['user']['theid']
    self.current_info = thread_info
    self.section = section
    area_section = 'playground' + self.section
    self.area = SavedFile(self.user_id, fix=True, section=area_section)
    self._update_file_list()
def flatten_pdf(filename):
    """Flatten the form fields of the PDF at ``filename`` in place.

    pdftk (limited to 60 seconds) writes the flattened copy to a temporary
    file, which then replaces the original.  A timeout is logged and
    treated as a failure.

    Raises DAError when pdftk fails.
    """
    flattened = tempfile.NamedTemporaryFile(prefix="datemp", suffix=".pdf", delete=False)
    command = [PDFTK_PATH, filename, 'output', flattened.name, 'flatten']
    try:
        completed = subprocess.run(command, timeout=60, check=False)
    except subprocess.TimeoutExpired:
        exit_code = 1
        logmessage("flatten_pdf: call to pdftk took too long")
    else:
        exit_code = completed.returncode
    if exit_code != 0:
        logmessage("Failed to flatten PDF form")
        raise DAError("Call to pdftk failed for template where arguments were " + " ".join(command))
    shutil.move(flattened.name, filename)
def finalize(self):
    """Synchronize the files in ``self.directory`` with cloud storage.

    Does nothing when no cloud backend is configured.  Raises DAError if
    the object has not been fixed.  Local files are uploaded unless their
    modification time matches the recorded one; after an upload the
    recorded time is refreshed from the key.  Cloud keys whose local file
    no longer exists are deleted on a best-effort basis.
    """
    if cloud is None:
        return
    if not self.fixed:
        raise DAError("SavedFile: finalize called before fix")
    key_prefix = str(self.section) + '/' + str(self.file_number) + '/'
    for filename in listfiles(self.directory):
        fullpath = os.path.join(self.directory, filename)
        if os.path.isfile(fullpath):
            save = True
            if filename in self.keydict:
                key = self.keydict[filename]
                # Unchanged since it was recorded: no upload needed.
                if self.modtimes[filename] == os.path.getmtime(fullpath):
                    save = False
            else:
                key = cloud.get_key(key_prefix + path_to_key(filename))
            if save:
                # The primary file may be stored without its extension, so
                # the extension is appended before guessing the MIME type.
                if self.extension is not None and filename == self.filename:
                    extension, mimetype = get_ext_and_mimetype(filename + '.' + self.extension)
                else:
                    extension, mimetype = get_ext_and_mimetype(filename)
                key.content_type = mimetype
                # The file can vanish between listing and upload.
                if not os.path.isfile(fullpath):
                    continue
                try:
                    key.set_contents_from_filename(fullpath)
                    self.modtimes[filename] = key.get_epoch_modtime()
                except FileNotFoundError:
                    sys.stderr.write("finalize: error while saving " + key_prefix + str(filename) + "; path " + str(fullpath) + " disappeared\n")
    for filename, key in self.keydict.items():
        if not os.path.isfile(os.path.join(self.directory, filename)):
            sys.stderr.write("finalize: deleting " + key_prefix + path_to_key(filename) + "\n")
            try:
                key.delete()
            except Exception as err:
                # Deletion stays best-effort, but the failure is recorded
                # and interpreter-exit signals are no longer swallowed.
                sys.stderr.write("finalize: could not delete " + key_prefix + str(filename) + ": " + str(err) + "\n")
    return
def safe_pypdf_reader(filename):
    """Return a pypdf reader for ``filename``, repairing the file if needed.

    When pypdf raises PdfReadError on the file, qpdf rewrites it to a
    temporary path (60 second limit) and a reader for the rewritten copy
    is returned instead.  The returned reader holds an open file object.

    Raises DAError when qpdf fails.
    """
    stream = open(filename, 'rb')
    try:
        return pypdf.PdfFileReader(stream, overwriteWarnings=False)
    except pypdf.utils.PdfReadError:
        # The unreadable original is not used again; do not leak its handle.
        stream.close()
    new_filename = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False)
    new_filename.close()
    qpdf_subprocess_arguments = [QPDF_PATH, filename, new_filename.name]
    try:
        result = subprocess.run(qpdf_subprocess_arguments, timeout=60, check=False).returncode
    except subprocess.TimeoutExpired:
        result = 1
        logmessage("safe_pypdf_reader: call to qpdf took too long")
    if result != 0:
        logmessage("Failed to convert PDF template " + str(filename))
        raise DAError("Call to qpdf failed for template " + str(filename) + " where arguments were " + " ".join(qpdf_subprocess_arguments))
    # This stream is intentionally left open: it is handed to the reader
    # that is returned to the caller.
    return pypdf.PdfFileReader(open(new_filename.name, 'rb'), overwriteWarnings=False)
def get_passwords(password):
    """Normalize a password specification into ``(owner, user)`` strings.

    ``None`` yields ``(None, None)``.  A scalar (str, bool, int, float) is
    used for both passwords.  A list or tuple supplies the owner password
    first and the user password second.  A dict supplies them under the
    ``owner`` and ``user`` keys, each falling back to the literal string
    ``'password'``.  All values are converted to str and stripped.

    Raises DAError for any other type, or for a sequence with fewer than
    two items.
    """
    if password is None:
        return (None, None)
    if isinstance(password, (str, bool, int, float)):
        owner_password = user_password = str(password).strip()
    elif isinstance(password, (list, tuple)):
        # A short sequence is a malformed specification, not an index error.
        if len(password) < 2:
            raise DAError("get_passwords: invalid password")
        owner_password = str(password[0]).strip()
        user_password = str(password[1]).strip()
    elif isinstance(password, dict):
        owner_password = str(password.get('owner', 'password')).strip()
        user_password = str(password.get('user', 'password')).strip()
    else:
        raise DAError("get_passwords: invalid password")
    return (owner_password, user_password)
def get_passwords(password):
    """Normalize a password specification into ``(owner, user)`` strings.

    ``None`` yields ``(None, None)``.  A scalar (str, unicode, bool, int,
    float) is used for both passwords.  A list or tuple supplies the owner
    password first and the user password second.  A dict supplies them
    under the ``owner`` and ``user`` keys, each falling back to the literal
    string ``'password'``.  All values are converted with ``text_type`` and
    stripped.

    Raises DAError for any other type, or for a sequence with fewer than
    two items.
    """
    if password is None:
        return (None, None)
    # isinstance() also accepts subclasses, which exact type() checks reject.
    if isinstance(password, (str, unicode, bool, int, float)):
        owner_password = user_password = text_type(password).strip()
    elif isinstance(password, (list, tuple)):
        # A short sequence is a malformed specification, not an index error.
        if len(password) < 2:
            raise DAError("get_passwords: invalid password")
        owner_password = text_type(password[0]).strip()
        user_password = text_type(password[1]).strip()
    elif isinstance(password, dict):
        owner_password = text_type(password.get('owner', 'password')).strip()
        user_password = text_type(password.get('user', 'password')).strip()
    else:
        raise DAError("get_passwords: invalid password")
    return (owner_password, user_password)
dbpassword = daconfig['db'].get('password', None) dbhost = daconfig['db'].get('host', None) if dbhost is None and dbuser is not None: dbhost = 'localhost' dbport = daconfig['db'].get('port', None) dbprefix = daconfig['db'].get('prefix', 'postgresql+psycopg2://') dbname = daconfig['db'].get('name', 'docassemble') dbtableprefix = daconfig['db'].get('table prefix', None) if not dbtableprefix: dbtableprefix = '' connect_string = "" if dbname is not None: connect_string += "dbname=" + dbname else: raise DAError("No database name provided") if dbuser is not None: connect_string += " user="******" password="******"" if dbprefix is not None: alchemy_connect_string += dbprefix if dbuser is not None: alchemy_connect_string += dbuser if dbpassword is not None: alchemy_connect_string += ":" + dbpassword else: alchemy_connect_string += ":" if dbhost is not None:
def _run_ocr_tool(tool, params):
    """Run one external OCR command, raising if it fails or exceeds an hour."""
    try:
        result = subprocess.run(params, timeout=60 * 60, check=False).returncode
    except subprocess.TimeoutExpired:
        result = 1
        logmessage("ocr_pdf: call to " + tool + " took too long")
    if result != 0:
        raise Exception("ocr_pdf: failed to run " + tool + " with command " + " ".join(params))


def ocr_pdf(*pargs, target=None, filename=None, lang=None, psm=6, dafilelist=None, preserve_color=False):
    """OCR the given documents into a searchable PDF stored in ``target``.

    Positional arguments may be DAFileList, DAFileCollection, DAStaticFile,
    DAFile or str objects; when none are given, ``target`` itself is the
    input.  Multiple inputs, or any input not known to be a PDF, are first
    combined with ``docassemble.base.util.pdf_concatenate``.  A PDF input
    is rasterized with Ghostscript at 600 dpi (grayscale unless
    ``preserve_color``) before tesseract runs; image inputs go to tesseract
    directly.  ``psm`` defaults to 6 and ``filename`` to ``'file.pdf'``.

    Returns ``target`` after re-initializing it with the OCR output, or
    None when the combined document has no ``extension`` attribute.

    Raises DAError for an invalid ``target``, a DAFileCollection without a
    ``pdf`` attribute, or an unsupported file extension; raises Exception
    when gs or tesseract fails or times out.
    """
    if preserve_color:
        device = 'tiff48nc'
    else:
        device = 'tiffgray'
    docs = []
    all_pdf = True
    if not isinstance(target, DAFile):
        raise DAError("ocr_pdf: target must be a DAFile")
    for other_file in pargs:
        if isinstance(other_file, DAFileList):
            for other_file_sub in other_file.elements:
                # Check each element, not the enclosing list.
                if not other_file_sub._is_pdf():
                    all_pdf = False
                docs.append(other_file_sub)
        elif isinstance(other_file, DAFileCollection):
            if not hasattr(other_file, 'pdf'):
                raise DAError('ocr_pdf: DAFileCollection object did not have pdf attribute.')
            docs.append(other_file.pdf)
        elif isinstance(other_file, DAStaticFile):
            if not other_file._is_pdf():
                all_pdf = False
            docs.append(other_file)
        elif isinstance(other_file, (str, DAFile)):
            all_pdf = False
            docs.append(other_file)
    if len(docs) == 0:
        if not target._is_pdf():
            all_pdf = False
        docs.append(target)
    if len(docs) > 1 or not all_pdf:
        import docassemble.base.util
        doc = docassemble.base.util.pdf_concatenate(docs)
    else:
        doc = docs[0]
    if psm is None:
        psm = 6
    if filename is None:
        filename = 'file.pdf'
    if not hasattr(doc, 'extension'):
        return None
    if doc.extension not in ['pdf', 'png', 'jpg', 'gif']:
        raise DAError("ocr_pdf: not a readable image file")
    path = doc.path()
    pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", delete=False)
    pdf_file.close()
    # Without a language, let tesseract use its own default instead of
    # passing the literal string "None".
    if lang is None:
        lang_args = []
    else:
        lang_args = ['-l', str(lang)]
    if doc.extension == 'pdf':
        tiff_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".tiff", delete=False)
        _run_ocr_tool('gs', ['gs', '-q', '-dNOPAUSE', '-sDEVICE=' + device, '-r600', '-sOutputFile=' + tiff_file.name, path, '-c', 'quit'])
        ocr_input = tiff_file.name
    else:
        ocr_input = path
    _run_ocr_tool('tesseract', ['tesseract', ocr_input, pdf_file.name] + lang_args + ['--psm', str(psm), 'pdf'])
    target.initialize(filename=filename, extension='pdf', mimetype='application/pdf', reinitialize=True)
    # The OCR result is read from the output base name plus '.pdf'.
    shutil.copyfile(pdf_file.name + '.pdf', target.file_info['path'])
    del target.file_info
    target._make_pdf_thumbnail(1, both_formats=True)
    target.commit()
    target.retrieve()
    return target
def fill_template(template, data_strings=None, data_names=None, hidden=None, readonly=None, images=None, pdf_url=None, editable=True, pdfa=False, password=None, template_password=None, default_export_value=None): if data_strings is None: data_strings = [] if data_names is None: data_names = [] if hidden is None: hidden = [] if readonly is None: readonly = [] if images is None: images = [] if pdf_url is None: pdf_url = 'file.pdf' if not pdf_url.endswith('.pdf'): pdf_url += '.pdf' the_fields = read_fields(template) if len(the_fields) == 0: raise DAError("PDF template has no fields in it.") export_values = {} for field, default, pageno, rect, field_type, export_value in the_fields: field_type = re.sub(r'[^/A-Za-z]', '', str(field_type)) if field_type in ('/Btn', "/'Btn'"): export_values[ field] = export_value or default_export_value or 'Yes' if len(export_values) > 0: new_data_strings = [] for key, val in data_strings: if key in export_values: if str(val) in ('Yes', 'yes', 'True', 'true', 'On', 'on', export_values[key]): val = export_values[key] else: if export_values[key] == 'On': val = 'Off' elif export_values[key] == 'on': val = 'off' elif export_values[key] == 'yes': val = 'no' else: val = 'No' new_data_strings.append((key, val)) data_strings = new_data_strings data_dict = {} for key, val in data_strings: data_dict[key] = val fdf = Xfdf(pdf_url, data_dict) #fdf = fdfgen.forge_fdf(pdf_url, data_strings, data_names, hidden, readonly) fdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".xfdf", delete=False) #fdf_file.write(fdf) fdf_file.close() fdf.write_xfdf(fdf_file.name) # if False: # fdf_dict = {} # for key, val in data_strings: # fdf_dict[key] = val # xfdf_temp_filename = pypdftk.gen_xfdf(fdf_dict) # xfdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=\ # ".xfdf", delete=False) # shutil.copyfile(xfdf_temp_filename, xfdf_file.name) pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", 
delete=False) if template_password is not None: template_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False) qpdf_subprocess_arguments = [ QPDF_PATH, '--decrypt', '--password='******'fill_form', fdf_file.name, 'output', pdf_file.name ] #logmessage("Arguments are " + str(subprocess_arguments)) if editable or len(images) > 0: subprocess_arguments.append('need_appearances') else: subprocess_arguments.append('flatten') try: result = subprocess.run(subprocess_arguments, timeout=600, check=False).returncode except subprocess.TimeoutExpired: result = 1 logmessage("fill_template: call to pdftk fill_form took too long") if result != 0: logmessage("Failed to fill PDF form " + str(template)) raise DAError("Call to pdftk failed for template " + str(template) + " where arguments were " + " ".join(subprocess_arguments)) if len(images) > 0: fields = {} for field, default, pageno, rect, field_type, export_value in the_fields: if str(field_type) in ('/Sig', "/'Sig'"): fields[field] = {'pageno': pageno, 'rect': rect} image_todo = [] for field, file_info in images: if field not in fields: logmessage("field name " + str(field) + " not found in PDF file") continue #logmessage("Need to put image on page " + str(fields[field]['pageno'])) temp_png = tempfile.NamedTemporaryFile(mode="wb", suffix=".png") args = [ daconfig.get('imagemagick', 'convert'), file_info['fullpath'], "-trim", "+repage", "+profile", '*', '-density', '0', temp_png.name ] try: result = subprocess.run(args, timeout=60, check=False).returncode except subprocess.TimeoutExpired: logmessage("fill_template: convert took too long") result = 1 if result == 1: logmessage("failed to trim file: " + " ".join(args)) continue im = Image.open(temp_png.name) width, height = im.size xone, yone, xtwo, ytwo = fields[field]['rect'] dppx = width / (xtwo - xone) dppy = height / (ytwo - yone) if dppx > dppy: dpp = dppx else: dpp = dppy extent_x, extent_y = xone * dpp + width, yone * dpp + height 
overlay_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False) args = [ daconfig.get('imagemagick', 'convert'), temp_png.name, "-background", "none", "-density", str(int(dpp * 72)), "-gravity", "NorthEast", "-extent", str(int(extent_x)) + 'x' + str(int(extent_y)), overlay_pdf_file.name ] try: result = subprocess.run(args, timeout=60, check=False).returncode except subprocess.TimeoutExpired: result = 1 logmessage("fill_template: call to convert took too long") if result == 1: logmessage("failed to make overlay: " + " ".join(args)) continue image_todo.append({ 'overlay_file': overlay_pdf_file.name, 'pageno': fields[field]['pageno'] }) if len(image_todo) > 0: new_pdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") original = safe_pypdf_reader(pdf_file.name) original.idnum_to_page = get_page_hash(original.trailer) catalog = original.trailer["/Root"] writer = DAPdfFileWriter() tree = {} for part in pdf_parts: if part in catalog: tree[part] = catalog[part] for i in range(original.getNumPages()): for item in image_todo: if (item['pageno'] - 1) == i: page = original.getPage(i) foreground_file = safe_pypdf_reader( item['overlay_file']) foreground_page = foreground_file.getPage(0) page.mergePage(foreground_page) for i in range(original.getNumPages()): newpage = original.getPage(i) writer.addPage(newpage) for key, val in tree.items(): writer._root_object.update( {pypdf.generic.NameObject(key): val}) writer.page_list = [] recursive_get_pages(writer._root_object['/Pages'], writer.page_list) try: recursive_add_bookmark(original, writer, original.getOutlines()) except: pass with open(new_pdf_file.name, "wb") as outFile: writer.write(outFile) shutil.copyfile(new_pdf_file.name, pdf_file.name) if (not editable) and len(images) > 0: flatten_pdf(pdf_file.name) if pdfa: pdf_to_pdfa(pdf_file.name) if editable: replicate_js_and_calculations(template, pdf_file.name, password) elif password: pdf_encrypt(pdf_file.name, password) return 
pdf_file.name
def fill_template(template, data_strings=[], data_names=[], hidden=[], readonly=[], images=[], pdf_url=None, editable=True, pdfa=False, password=None, template_password=None): if pdf_url is None: pdf_url = '' fdf = fdfgen.forge_fdf(pdf_url, data_strings, data_names, hidden, readonly) fdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".fdf", delete=False) fdf_file.write(fdf) fdf_file.close() if False: fdf_dict = dict() for key, val in data_strings: fdf_dict[key] = val xfdf_temp_filename = pypdftk.gen_xfdf(fdf_dict) xfdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=\ ".xfdf", delete=False) shutil.copyfile(xfdf_temp_filename, xfdf_file.name) pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False) if template_password is not None: template_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False) qpdf_subprocess_arguments = [ QPDF_PATH, '--decrypt', '--password='******'fill_form', fdf_file.name, 'output', pdf_file.name ] #logmessage("Arguments are " + str(subprocess_arguments)) if editable or len(images): subprocess_arguments.append('need_appearances') else: subprocess_arguments.append('flatten') result = call(subprocess_arguments) if result != 0: logmessage("Failed to fill PDF form " + str(template)) raise DAError("Call to pdftk failed for template " + str(template) + " where arguments were " + " ".join(subprocess_arguments)) if len(images): fields = dict() for field, default, pageno, rect, field_type in read_fields(template): if str(field_type) in ('/Sig', "/u'Sig'", "/'Sig'"): fields[field] = {'pageno': pageno, 'rect': rect} image_todo = list() for field, file_info in images: if field not in fields: logmessage("field name " + str(field) + " not found in PDF file") continue #logmessage("Need to put image on page " + str(fields[field]['pageno'])) temp_png = tempfile.NamedTemporaryFile(mode="wb", suffix=".png") args = [ "convert", 
file_info['fullpath'], "-trim", "+repage", temp_png.name ] result = call(args) if result == 1: logmessage("failed to trim file: " + " ".join(args)) continue im = Image.open(temp_png.name) width, height = im.size xone, yone, xtwo, ytwo = fields[field]['rect'] dppx = width / (xtwo - xone) dppy = height / (ytwo - yone) if (dppx > dppy): dpp = dppx else: dpp = dppy extent_x, extent_y = xone * dpp + width, yone * dpp + height overlay_pdf_file = tempfile.NamedTemporaryFile(prefix="datemp", mode="wb", suffix=".pdf", delete=False) args = [ "convert", temp_png.name, "-background", "none", "-density", str(int(dpp * 72)), "-gravity", "NorthEast", "-extent", str(int(extent_x)) + 'x' + str(int(extent_y)), overlay_pdf_file.name ] result = call(args) if result == 1: logmessage("failed to make overlay: " + " ".join(args)) continue image_todo.append({ 'overlay_stream': open(overlay_pdf_file.name, "rb"), 'pageno': fields[field]['pageno'] }) if len(image_todo): new_pdf_file = tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") with open(pdf_file.name, "rb") as inFile: original = pypdf.PdfFileReader(inFile) original.idnum_to_page = get_page_hash(original.trailer) catalog = original.trailer["/Root"] writer = DAPdfFileWriter() tree = dict() for part in pdf_parts: if part in catalog: tree[part] = catalog[part] for i in range(original.getNumPages()): for item in image_todo: if (item['pageno'] - 1) == i: page = original.getPage(i) foreground_file = pypdf.PdfFileReader( item['overlay_stream']) foreground_page = foreground_file.getPage(0) page.mergePage(foreground_page) for i in range(original.getNumPages()): newpage = original.getPage(i) writer.addPage(newpage) for key, val in tree.items(): writer._root_object.update( {pypdf.generic.NameObject(key): val}) writer.page_list = list() recursive_get_pages(writer._root_object['/Pages'], writer.page_list) recursive_add_bookmark(original, writer, original.getOutlines()) with open(new_pdf_file.name, "wb") as outFile: writer.write(outFile) 
shutil.copyfile(new_pdf_file.name, pdf_file.name) for item in image_todo: item['overlay_stream'].close() if (not editable) and len(images): flatten_pdf(pdf_file.name) if pdfa: pdf_to_pdfa(pdf_file.name) if editable: replicate_js_and_calculations(template, pdf_file.name, password) elif password: pdf_encrypt(pdf_file.name, password) return pdf_file.name