def normalize_html(html_text):
    """Re-serialize *html_text* fragment-by-fragment and return a tidied unicode string.

    The text is parsed with antimarkdown, each fragment is serialized back
    to markup, every '>' is followed by a newline so each tag sits on its
    own line, per-line whitespace is stripped, and the result is run
    through tidy with pretty-printing before being decoded from UTF-8.
    """
    fragments = antimarkdown.parse_fragments(html_text)
    serialized = u'\n'.join(
        html.tostring(fragment, encoding=unicode) for fragment in fragments
    )
    # Explode the markup so each tag ends a line, then trim each line
    # before handing the whole thing to tidy.
    exploded = nodes.whitespace(serialized.strip()).replace(u'>', u'>\n')
    trimmed = u'\n'.join(piece.strip() for piece in exploded.splitlines())
    return tidy(trimmed, pretty_print=True).decode('utf-8')
def _fix_document(self, doc, use_soup=False):
    """Return a cleaned-up version of *doc*.

    With use_soup=True the document is round-tripped through
    BeautifulSoup; otherwise it is passed to tidy().
    """
    if not use_soup:
        return tidy(doc)
    soup = BeautifulSoup(doc)
    # NOTE(review): prettify()'s return value is discarded here, matching
    # the original code — the pretty-printed string is not what is
    # returned; unicode(soup) is. Confirm whether that was intentional.
    soup.prettify()
    return unicode(soup)
def dump(self, item):
    """Fetch *item* from its URL and write the response body under self.folder.

    The destination path is self.folder joined with item.filename +
    item.extension; intermediate directories are created as needed.
    When CAN_TIDY is true the response stream is passed through tidy()
    before being written; otherwise the raw bytes are written.

    Fix: the urllib2 response object was never closed, leaking the
    underlying socket on every call — it is now closed in a finally block.
    """
    fullpath = item.filename + item.extension
    dirname, basename = os.path.split(fullpath)
    target_dir = os.path.join(self.folder, dirname)
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    target_file = os.path.join(target_dir, basename)

    request = urllib2.Request(item.url, item.data, item.headers)
    with open(target_file, 'wb') as target_stream:
        input_stream = urllib2.urlopen(request)
        try:
            if CAN_TIDY:
                data = tidy(input_stream, pretty_print=True, encoding="utf-8")
            else:
                data = input_stream.read()
        finally:
            # Ensure the HTTP response (and its socket) is released even
            # if tidy()/read() raises.
            input_stream.close()
        target_stream.write(data)
def html5tidy(src):
    """Tidy the HTML fragment *src* and mark the result as template-safe."""
    cleaned = tidy(src, fragment=True)
    return mark_safe(cleaned)