Пример #1
0
def cleanData(data):
    """Run ``data`` through JTidy and return the serialized result.

    ``data`` is wrapped in a Java ``String``, encoded to UTF-8 bytes and fed
    to a freshly configured ``Tidy`` instance. Whatever Tidy writes to the
    output stream is decoded as UTF-8 and returned.

    NOTE(review): ``ByteArrayInputStream``, ``ByteArrayOutputStream`` and
    ``String`` are not imported in this function; they are presumably
    brought into scope elsewhere in the file (java.io / java.lang) --
    confirm before moving this function.
    """
    # Imported at call time rather than at module level, as in the original.
    from org.w3c.tidy import Tidy

    # One charset name is used for input bytes, Tidy's I/O and the result.
    charset = "UTF-8"

    cleaner = Tidy()
    cleaner.setXHTML(True)
    cleaner.setInputEncoding(charset)
    cleaner.setOutputEncoding(charset)

    # Tidy option flags; exact semantics are defined by JTidy's
    # configuration -- see its API docs rather than inferring from names.
    cleaner.setMakeClean(False)
    cleaner.setDropEmptyParas(False)
    cleaner.setPrintBodyOnly(True)
    cleaner.setQuoteAmpersand(True)
    cleaner.setTrimEmptyElements(False)

    rawBytes = String(data).getBytes(charset)
    source = ByteArrayInputStream(rawBytes)
    sink = ByteArrayOutputStream()

    # The value returned by parseDOM is ignored; only the bytes written
    # to ``sink`` are used.
    cleaner.parseDOM(source, sink)

    return sink.toString(charset)