def cleanData(data):
    """Run *data* through JTidy and return the resulting markup as a string.

    Tidy is set up for XHTML with UTF-8 as both input and output encoding,
    body-only printing and ampersand quoting switched on, and with
    make-clean, drop-empty-paras and trim-empty-elements switched off.
    The input is encoded to UTF-8 bytes before parsing and the captured
    output is decoded as UTF-8 on return.
    """
    from org.w3c.tidy import Tidy

    cleaner = Tidy()
    cleaner.setXHTML(True)
    cleaner.setInputEncoding("UTF-8")
    cleaner.setOutputEncoding("UTF-8")
    cleaner.setMakeClean(False)
    cleaner.setDropEmptyParas(False)
    cleaner.setPrintBodyOnly(True)
    cleaner.setQuoteAmpersand(True)
    cleaner.setTrimEmptyElements(False)

    # Tidy reads from / writes to Java streams, so bridge through byte arrays.
    utf8Bytes = String(data).getBytes("UTF-8")
    sink = ByteArrayOutputStream()
    cleaner.parseDOM(ByteArrayInputStream(utf8Bytes), sink)
    return sink.toString("UTF-8")
def __tidy(self, content):
    """Parse *content* with JTidy and return ``(markup, doc)``.

    ``markup`` is the text Tidy wrote to the output stream, decoded as
    UTF-8; ``doc`` is the object returned by ``Tidy.parseDOM``.

    Tidy is configured with indentation and line wrapping disabled
    (wrap length 0), body-only printing, XHTML off and numeric entities on.
    """
    tidy = Tidy()
    # The bytes handed to Tidy below are UTF-8, so say so explicitly;
    # otherwise Tidy decodes them with whatever input encoding it is
    # configured with, which need not match.
    tidy.setInputEncoding("UTF-8")
    tidy.setIndentAttributes(False)
    tidy.setIndentContent(False)
    tidy.setPrintBodyOnly(True)
    tidy.setSmartIndent(False)
    tidy.setWraplen(0)
    tidy.setXHTML(False)
    tidy.setNumEntities(True)

    out = ByteArrayOutputStream()
    # getBytes() with no argument uses the JVM's platform-default charset,
    # which made the parse result platform-dependent for non-ASCII input.
    source = ByteArrayInputStream(String(content).getBytes("UTF-8"))
    doc = tidy.parseDOM(source, out)
    content = out.toString("UTF-8")
    return content, doc
def __getContent(self, oid): slash = oid.rfind("/") pid = os.path.splitext(oid[slash+1:])[0] + ".htm" payload = Services.storage.getObject(oid).getPayload(pid) tidy = Tidy() tidy.setIndentAttributes(False) tidy.setIndentContent(False) tidy.setPrintBodyOnly(True) tidy.setSmartIndent(False) tidy.setWraplen(0) tidy.setXHTML(False) tidy.setNumEntities(True) out = ByteArrayOutputStream() try: doc = tidy.parseDOM(payload.getInputStream(), out) content = out.toString("UTF-8") content = self.__processMedia(oid, doc, content) #print "[\n%s\n]" % content except Exception, e: print " * blog.py: Failed to get content: %s" % e.getMessage()