예제 #1
0
    def convert(self, filename, encoding=None, mimetype=None):
        """Convert the HTML file at ``filename`` to UTF-8 encoded text.

        The file is read in full, decoded to unicode, passed through
        ``convert_entities`` and then ``html2text``.

        :param filename: path of the file to read.
        :param encoding: codec name used to decode the file contents.  When
            falsy, the encoding is taken from the first group of a
            ``charset_reg`` match against the raw contents; if nothing
            matches, UTF-8 is assumed.
        :param mimetype: accepted but not used by this method.
        :returns: a ``(stream, 'utf-8')`` tuple where ``stream`` is a
            ``StringIO.StringIO`` holding the UTF-8 encoded result.
        """
        # XXX: dont read entire file into memory
        # The context manager closes the handle even if read() raises.
        with open(filename, 'r') as source:
            doc = source.read()

        # convert to unicode
        if not encoding:
            mo = charset_reg.search(doc)
            # A document without a detectable charset declaration used to
            # crash on ``None.group``; fall back to UTF-8 instead.
            encoding = mo.group(1) if mo else 'utf-8'
        # 'replace' substitutes undecodable bytes rather than raising.
        doc = unicode(doc, encoding, 'replace')
        doc = convert_entities(doc)
        result = html2text(doc)

        # convert back to utf-8
        return StringIO.StringIO(result.encode('utf-8')), 'utf-8'
예제 #2
0
def extract_text_from_html(text):
    """Return the ``html2text`` rendering of ``text``.

    ``text`` that is not already a ``unicode`` instance is first decoded as
    UTF-8, with undecodable bytes replaced rather than raising.  The unicode
    value is passed through ``convert_entities`` before ``html2text``.
    """
    if isinstance(text, unicode):
        decoded = text
    else:
        decoded = unicode(text, 'utf-8', 'replace')
    without_entities = convert_entities(decoded)
    return html2text(without_entities)