def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None): """ - Parse HTML and get miniDOM - Extract CSS informations, add default CSS, parse CSS - Handle the document DOM itself and build reportlab story - Return Context object """ global CSSAttrCache CSSAttrCache = {} if xhtml: #TODO: XHTMLParser doesn't see to exist... parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom")) else: parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) if type(src) in StringTypes: if type(src) is unicode: # If an encoding was provided, do not change it. if not encoding: encoding = "utf-8" src = src.encode(encoding) src = pisaTempFile(src, capacity=context.capacity) # Test for the restrictions of html5lib if encoding: # Workaround for html5lib<0.11.1 if hasattr(inputstream, "isValidEncoding"): if encoding.strip().lower() == "utf8": encoding = "utf-8" if not inputstream.isValidEncoding(encoding): log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding) else: if inputstream.codecName(encoding) is None: log.error("%r is not a valid encoding", encoding) document = parser.parse( src, encoding=encoding) if xml_output: if encoding: xml_output.write(document.toprettyxml(encoding=encoding)) else: xml_output.write(document.toprettyxml(encoding="utf8")) if default_css: context.addDefaultCSS(default_css) pisaPreLoop(document, context) #try: context.parseCSS() #except: # context.cssText = DEFAULT_CSS # context.parseCSS() # context.debug(9, pprint.pformat(context.css)) pisaLoop(document, context) return context
def test_unicode(self): """Asserts bytes generated by reportlab are returned""" src = pisaTempFile() value = b'%PDF-1.4\r\n%\x93\x8c\x8b\x9e ReportLab Generated PDF document http://www.reportlab.com' try: src.write(value) except UnicodeDecodeError as error: self.fail(error)
def pisaErrorDocument(dest, c): out = pisaTempFile(capacity=c.capacity) out.write("<p style='background-color:red;'><strong>%d error(s) occured:</strong><p>" % c.err) for mode, line, msg, _ in c.log: if mode=="error": out.write("<pre>%s in line %d: %s</pre>" % (mode, line, cgi.escape(msg))) out.write("<p><strong>%d warning(s) occured:</strong><p>" % c.warn) for mode, line, msg, _ in c.log: if mode=="warning": out.write("<p>%s in line %d: %s</p>" % (mode, line, cgi.escape(msg))) return pisaDocument(out.getvalue(), dest, raise_exception=False)
def join(self, file=None): output = PyPDF2.PdfFileWriter() for pdffile in self.files: input = PyPDF2.PdfFileReader(pdffile) for pageNumber in xrange(input.getNumPages()): output.addPage(input.getPage(pageNumber)) if file is not None: output.write(file) return file out = pisaTempFile(capacity=self.capacity) output.write(out) return out.getvalue()
def join(self, file=None): import pyPdf # TODO: Why is this in the middle of everything? if pyPdf: output = pyPdf.PdfFileWriter() for pdffile in self.files: input = pyPdf.PdfFileReader(pdffile) for pageNumber in xrange(input.getNumPages()): output.addPage(input.getPage(pageNumber)) if file is not None: output.write(file) return file out = pisaTempFile(capacity=self.capacity) output.write(out) return out.getvalue()
def pisaDocument(src, dest=None, path=None, link_callback=None, debug=0, default_css=None, xhtml=False, encoding=None, xml_output=None, raise_exception=True, capacity=100 * 1024, **kw): log.debug( "pisaDocument options:\n src = %r\n dest = %r\n path = %r\n link_callback = %r\n xhtml = %r", src, dest, path, link_callback, xhtml) # Prepare simple context context = pisaContext(path, debug=debug, capacity=capacity) context.pathCallback = link_callback # Build story context = pisaStory(src, path, link_callback, debug, default_css, xhtml, encoding, context=context, xml_output=xml_output) # Buffer PDF into memory out = pisaTempFile(capacity=context.capacity) doc = PmlBaseDoc(out, pagesize=context.pageSize, author=context.meta["author"].strip(), subject=context.meta["subject"].strip(), keywords=[ x.strip() for x in context.meta["keywords"].strip().split(",") if x ], title=context.meta["title"].strip(), showBoundary=0, allowSplitting=1) # Prepare templates and their frames if "body" in context.templateList: body = context.templateList["body"] del context.templateList["body"] else: x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize) body = PmlPageTemplate(id="body", frames=[ Frame(x, y, w, h, id="body", leftPadding=0, rightPadding=0, bottomPadding=0, topPadding=0) ], pagesize=context.pageSize) doc.addPageTemplates([body] + context.templateList.values()) # Use multibuild e.g. if a TOC has to be created if context.multiBuild: doc.multiBuild(context.story) else: doc.build(context.story) # Add watermarks if pyPdf: for bgouter in context.pisaBackgroundList: # If we have at least one background, then lets do it if bgouter: istream = out output = pyPdf.PdfFileWriter() input1 = pyPdf.PdfFileReader(istream) ctr = 0 # TODO: Why do we loop over the same list again? # see bgouter at line 137 for bg in context.pisaBackgroundList: page = input1.getPage(ctr) if (bg and not bg.notFound() and (bg.mimetype == "application/pdf")): bginput = pyPdf.PdfFileReader(bg.getFile()) pagebg = bginput.getPage(0) pagebg.mergePage(page) page = pagebg else: log.warn( context.warning("Background PDF %s doesn't exist.", bg)) output.addPage(page) ctr += 1 out = pisaTempFile(capacity=context.capacity) output.write(out) # data = sout.getvalue() # Found a background? So leave loop after first occurence break else: log.warn(context.warning("pyPDF not installed!")) # Get the resulting PDF and write it to the file object # passed from the caller if dest is None: # No output file was passed - Let's use a pisaTempFile dest = pisaTempFile(capacity=context.capacity) context.dest = dest data = out.getvalue( ) # TODO: That load all the tempfile in RAM - Why bother with a swapping tempfile then? context.dest.write(data) # TODO: context.dest is a tempfile as well... return context
def pisaDocument(src, dest=None, path=None, link_callback=None, debug=0, default_css=None, xhtml=False, encoding=None, xml_output=None, raise_exception=True, capacity=100*1024, **kw): log.debug("pisaDocument options:\n src = %r\n dest = %r\n path = %r\n link_callback = %r\n xhtml = %r", src, dest, path, link_callback, xhtml) # Prepare simple context context = pisaContext(path, debug=debug, capacity=capacity) context.pathCallback = link_callback # Build story context = pisaStory(src, path, link_callback, debug, default_css, xhtml, encoding, context=context, xml_output=xml_output) # Buffer PDF into memory out = pisaTempFile(capacity=context.capacity) doc = PmlBaseDoc( out, pagesize=context.pageSize, author=context.meta["author"].strip(), subject=context.meta["subject"].strip(), keywords=[x.strip() for x in context.meta["keywords"].strip().split(",") if x], title=context.meta["title"].strip(), showBoundary=0, allowSplitting=1) # Prepare templates and their frames if "body" in context.templateList: body = context.templateList["body"] del context.templateList["body"] else: x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize) body = PmlPageTemplate( id="body", frames=[ Frame(x, y, w, h, id="body", leftPadding=0, rightPadding=0, bottomPadding=0, topPadding=0)], pagesize = context.pageSize) doc.addPageTemplates([body] + context.templateList.values()) # Use multibuild e.g. if a TOC has to be created if context.multiBuild: doc.multiBuild(context.story) else: doc.build(context.story) # Add watermarks if pyPdf: for bgouter in context.pisaBackgroundList: # If we have at least one background, then lets do it if bgouter: istream = out output = pyPdf.PdfFileWriter() input1 = pyPdf.PdfFileReader(istream) ctr = 0 # TODO: Why do we loop over the same list again? # see bgouter at line 137 for bg in context.pisaBackgroundList: page = input1.getPage(ctr) if (bg and not bg.notFound() and (bg.mimetype=="application/pdf")): bginput = pyPdf.PdfFileReader(bg.getFile()) pagebg = bginput.getPage(0) pagebg.mergePage(page) page = pagebg else: log.warn(context.warning( "Background PDF %s doesn't exist.", bg)) output.addPage(page) ctr += 1 out = pisaTempFile(capacity=context.capacity) output.write(out) # data = sout.getvalue() # Found a background? So leave loop after first occurence break else: log.warn(context.warning("pyPDF not installed!")) # Get the resulting PDF and write it to the file object # passed from the caller if dest is None: # No output file was passed - Let's use a pisaTempFile dest = pisaTempFile(capacity=context.capacity) context.dest = dest data = out.getvalue() # TODO: That load all the tempfile in RAM - Why bother with a swapping tempfile then? context.dest.write(data) # TODO: context.dest is a tempfile as well... return context
def addFromString(self, data): self.files.append(pisaTempFile(data, capacity=self.capacity))
def pisaDocument(src, dest=None, path=None, link_callback=None, debug=0, default_css=None, xhtml=False, encoding=None, xml_output=None, raise_exception=True, capacity=100 * 1024, context_meta=None, **kw): log.debug( "pisaDocument options:\n src = %r\n dest = %r\n path = %r\n link_callback = %r\n xhtml = %r\n context_meta = %r", src, dest, path, link_callback, xhtml, context_meta) # Prepare simple context context = pisaContext(path, debug=debug, capacity=capacity) if context_meta is not None: context.meta.update(context_meta) context.pathCallback = link_callback # Build story context = pisaStory(src, path, link_callback, debug, default_css, xhtml, encoding, context=context, xml_output=xml_output) # Buffer PDF into memory out = io.BytesIO() doc = PmlBaseDoc(out, pagesize=context.pageSize, author=context.meta["author"].strip(), subject=context.meta["subject"].strip(), keywords=[ x.strip() for x in context.meta["keywords"].strip().split(",") if x ], title=context.meta["title"].strip(), showBoundary=0, allowSplitting=1) # Prepare templates and their frames multi_template_list = False if "body" in context.templateList: body = context.templateList["body"] del context.templateList["body"] else: x, y, w, h = getBox("1cm 1cm -1cm -1cm", context.pageSize) body = PmlPageTemplate(id="body", frames=[ Frame(x, y, w, h, id="body", leftPadding=0, rightPadding=0, bottomPadding=0, topPadding=0) ], pagesize=context.pageSize) ptl = build_grid_templates(doc, context) if ptl == []: doc.addPageTemplates([body] + list(context.templateList.values())) if ptl != []: if out_grid == []: doc.addPageTemplates(ptl) else: doc.addPageTemplates([body] + ptl) # Use multibuild e.g. if a TOC has to be created if context.multiBuild: doc.multiBuild(context.story) else: doc.build(context.story) # Add watermarks if PyPDF2: file_handler = None for bgouter in context.pisaBackgroundList: # If we have at least one background, then lets do it if bgouter: istream = out output = PyPDF2.PdfFileWriter() input1 = PyPDF2.PdfFileReader(istream) ctr = 0 # TODO: Why do we loop over the same list again? # see bgouter at line 137 for bg in context.pisaBackgroundList: page = input1.getPage(ctr) if (bg and not bg.notFound() and (bg.mimetype == "application/pdf")): file_handler = open(bg.uri, 'rb') bginput = PyPDF2.PdfFileReader(file_handler) pagebg = bginput.getPage(0) pagebg.mergePage(page) page = pagebg # Todo: the else-statement doesn't make a lot of sense to me; it's just throwing warnings # on unittesting \tests. Probably we have to rewrite the whole "background-image" stuff # to deal with cases like: # Page1 .jpg background # Page1 .pdf background # Page1 .jpg background, Page2 no background # Page1 .pdf background, Page2 no background # Page1 .jpg background, Page2 .pdf background # Page1 .pdf background, Page2 .jpg background # etc. # Right now it's kind of confusing. (fbernhart) # else: # log.warning(context.warning( # "Background PDF %s doesn't exist.", bg)) output.addPage(page) ctr += 1 out = pisaTempFile(capacity=context.capacity) output.write(out) if file_handler: file_handler.close() # data = sout.getvalue() # Found a background? So leave loop after first occurence break else: log.warning(context.warning("PyPDF2 not installed!")) # Get the resulting PDF and write it to the file object # passed from the caller if dest is None: # No output file was passed - Let's use a pisaTempFile dest = io.BytesIO() context.dest = dest data = out.getvalue() context.dest.write(data) # TODO: context.dest is a tempfile as well... return context
def pisaParser(src, context, default_css="", xhtml=False, encoding=None, xml_output=None): """ - Parse HTML and get miniDOM - Extract CSS informations, add default CSS, parse CSS - Handle the document DOM itself and build reportlab story - Return Context object """ global CSSAttrCache CSSAttrCache = {} if xhtml: # TODO: XHTMLParser doesn't seem to exist... parser = html5lib.XHTMLParser(tree=treebuilders.getTreeBuilder("dom")) else: parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom")) parser_kwargs = {} if isinstance(src, six.text_type): # If an encoding was provided, do not change it. if not encoding: encoding = "utf-8" src = src.encode(encoding) src = pisaTempFile(src, capacity=context.capacity) # To pass the encoding used to convert the text_type src to binary_type # on to html5lib's parser to ensure proper decoding parser_kwargs['transport_encoding'] = encoding # # Test for the restrictions of html5lib # if encoding: # # Workaround for html5lib<0.11.1 # if hasattr(inputstream, "isValidEncoding"): # if encoding.strip().lower() == "utf8": # encoding = "utf-8" # if not inputstream.isValidEncoding(encoding): # log.error("%r is not a valid encoding e.g. 'utf8' is not valid but 'utf-8' is!", encoding) # else: # if inputstream.codecName(encoding) is None: # log.error("%r is not a valid encoding", encoding) document = parser.parse(src, **parser_kwargs) # encoding=encoding) if xml_output: if encoding: xml_output.write(document.toprettyxml(encoding=encoding)) else: xml_output.write(document.toprettyxml(encoding="utf8")) if default_css: context.addDefaultCSS(default_css) pisaPreLoop(document, context) # try: context.parseCSS() # except: # context.cssText = DEFAULT_CSS # context.parseCSS() # context.debug(9, pprint.pformat(context.css)) pisaLoop(document, context) return context
def pisaDocument( src, dest = None, path = None, link_callback = None, debug = 0, show_error_as_pdf = False, default_css = None, xhtml = False, encoding = None, xml_output = None, raise_exception = True, capacity = 100 * 1024, # -1, **kw): c = None if show_error_as_pdf: raise_exception = False try: log.debug("pisaDocument options:\n src = %r\n dest = %r\n path = %r\n link_callback = %r\n xhtml = %r", src, dest, path, link_callback, xhtml) # Prepare simple context c = pisaContext(path, debug=debug, capacity=capacity) c.pathCallback = link_callback if dest is None: dest = pisaTempFile(capacity=c.capacity) c.dest = dest # Build story c = pisaStory(src, path, link_callback, debug, default_css, xhtml, encoding, c=c, xml_output=xml_output) # Buffer PDF into memory out = pisaTempFile(capacity=c.capacity) doc = PmlBaseDoc( out, pagesize = c.pageSize, author = c.meta["author"].strip(), subject = c.meta["subject"].strip(), keywords = [x.strip() for x in c.meta["keywords"].strip().split(",") if x], title = c.meta["title"].strip(), showBoundary = 0, allowSplitting = 1) # XXX It is not possible to access PDF info, because it is private in canvas # doc.info.producer = "pisa <http://www.holtwick.it>" # Prepare templates and their frames if c.templateList.has_key("body"): body = c.templateList["body"] del c.templateList["body"] else: x, y, w, h = getBox("1cm 1cm -1cm -1cm", c.pageSize) body = PmlPageTemplate( id="body", frames=[ Frame(x, y, w, h, id = "body", leftPadding = 0, rightPadding = 0, bottomPadding = 0, topPadding = 0)], pagesize = c.pageSize) # print body.frames # print [body] + c.templateList.values() doc.addPageTemplates([body] + c.templateList.values()) doc._pisa_page_counter = 0 def _page_counter(page_no): doc._pisa_page_counter += 1 doc.setPageCallBack(_page_counter) # Use multibuild e.g. if a TOC has to be created if c.multiBuild: doc.multiBuild(c.story) else: doc.build(c.story) c._pisa_page_counter = doc._pisa_page_counter # Add watermarks if pyPdf: for bgouter in c.pisaBackgroundList: # If we have at least one background, then lets do it if bgouter: istream = out try: output = pyPdf.PdfFileWriter() input1 = pyPdf.PdfFileReader(istream) ctr = 0 for bg in c.pisaBackgroundList: page = input1.getPage(ctr) if bg and not bg.notFound() and (bg.mimetype=="application/pdf"): bginput = pyPdf.PdfFileReader(bg.getFile()) pagebg = bginput.getPage(0) pagebg.mergePage(page) page = pagebg else: log.warn(c.warning("Background PDF %s doesn't exist.", bg)) output.addPage(page) ctr += 1 out = pisaTempFile(capacity=c.capacity) output.write(out) # data = sout.getvalue() except Exception: log.exception(c.error("pyPDF error")) if raise_exception: raise # Found a background? So leave loop after first occurence break else: log.warn(c.warning("pyPDF not installed!")) # In web frameworks for debugging purposes maybe an output of # errors in a PDF is preferred if show_error_as_pdf and c and c.err: return pisaErrorDocument(c.dest, c) # Get the resulting PDF and write it to the file object # passed from the caller data = out.getvalue() c.dest.write(data) c.dest.close() except: # TODO: Kill catch-all! # log.exception(c.error("Document error")) log.exception("Document error") c.err += 1 if raise_exception: raise if raise_exception and c.err: raise Exception("Errors occured, please see log files for more informations") return c