def convert(self, filename, encoding=None, mimetype=None):
    """Convert the HTML file at *filename* to plain text.

    The file content is decoded to unicode, passed through
    ``convert_entities`` and ``html2text``, and the result is re-encoded
    as UTF-8.

    Parameters:
        filename: path of the file to read.
        encoding: encoding used to decode the file content.  When it is
            empty or None, the encoding is taken from the first group of
            a ``charset_reg`` match against the content; if nothing
            matches, 'utf-8' is used.
        mimetype: accepted for interface compatibility; not used here.

    Returns:
        A ``(stream, 'utf-8')`` tuple where *stream* is a
        ``StringIO.StringIO`` holding the UTF-8 encoded text.
    """
    # XXX: dont read entire file into memory
    # try/finally (rather than relying on garbage collection) guarantees the
    # file handle is released even if read() fails.
    fp = open(filename, 'r')
    try:
        doc = fp.read()
    finally:
        fp.close()

    # convert to unicode
    if not encoding:
        mo = charset_reg.search(doc)
        if mo is not None:
            encoding = mo.group(1)
        else:
            # No charset found in the document: fall back to UTF-8 instead
            # of failing on ``None.group``.  Undecodable bytes are replaced
            # below, so this cannot raise a decode error.
            encoding = 'utf-8'
    doc = unicode(doc, encoding, 'replace')
    doc = convert_entities(doc)
    result = html2text(doc)

    # convert back to utf-8
    return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
def extract_text_from_html(text):
    """Return the result of running *text* through the HTML-to-text helpers.

    *text* may be a unicode object or a byte string; byte strings are
    decoded as UTF-8, with undecodable bytes replaced.  The unicode value
    is then passed to ``convert_entities`` and the outcome to
    ``html2text``, whose return value is returned unchanged.
    """
    if isinstance(text, unicode):
        decoded = text
    else:
        decoded = unicode(text, 'utf-8', 'replace')
    without_entities = convert_entities(decoded)
    return html2text(without_entities)