def convert2(self, doc, encoding, mimetype):
    """Convert *doc* via ``self.convert`` and return it as UTF-8.

    :param doc: the document, either a byte string or a ``unicode`` object.
    :param encoding: codec name used to decode *doc* when it is a byte
        string; ignored when *doc* is already ``unicode``.
    :param mimetype: accepted for interface compatibility; not used here.
    :returns: a ``(data, 'utf-8')`` tuple where *data* is the result of
        ``self.convert`` encoded as UTF-8.
    """
    # Work on unicode internally; undecodable bytes are substituted rather
    # than raising ('replace' error handler).
    if isinstance(doc, unicode):
        text = doc
    else:
        text = unicode(doc, encoding, 'replace')

    # convert_entities is defined elsewhere in the file -- presumably it
    # expands character entities; confirm against its definition.
    converted = self.convert(convert_entities(text))

    # Hand the result back as UTF-8 bytes together with the encoding name.
    return converted.encode('utf-8'), 'utf-8'
def convert2(self, doc, encoding, mimetype):
    """Normalise *doc* to UTF-8 and pass it through ``self.convert``.

    :param doc: the document, either a byte string or a ``unicode`` object.
    :param encoding: codec name used to decode a byte-string *doc*;
        overridden when ``encoding_reg`` finds a match in the document.
    :param mimetype: accepted for interface compatibility; not used here.
    :returns: a ``(data, 'utf-8')`` tuple where *data* is whatever
        ``self.convert`` returns for the UTF-8 encoded document.
    """
    # An encoding captured from the document itself (the XML preamble,
    # per the original comment) takes precedence over the caller's value.
    declared = encoding_reg.search(doc)
    if declared:
        encoding = declared.group(1)

    # Decode byte strings; undecodable bytes are substituted, not raised.
    if isinstance(doc, unicode):
        text = doc
    else:
        text = unicode(doc, encoding, 'replace')

    # convert_entities is defined elsewhere in the file -- presumably it
    # expands character entities; confirm against its definition.
    utf8_doc = convert_entities(text).encode('utf-8')
    return self.convert(utf8_doc), 'utf-8'
def convert(self, doc, encoding=None, mimetype=None,
            logError=False, raiseException=False):
    """Run *doc* through ``html2text`` and return the result as UTF-8.

    :param doc: the document, either a byte string or a ``unicode`` object.
    :param encoding: codec name for decoding a byte-string *doc*.  When
        empty, the first group matched by ``charset_reg`` in the document
        is used, and ``'ascii'`` if there is no match.
    :param mimetype: accepted for interface compatibility; not used here.
    :param logError: accepted for interface compatibility; not used here.
    :param raiseException: accepted for interface compatibility; not used
        here.
    :returns: a ``(data, 'utf-8')`` tuple where *data* is the output of
        ``html2text`` encoded as UTF-8.
    """
    if isinstance(doc, unicode):
        # Already decoded: the encoding argument is irrelevant.
        text = doc
    else:
        charset = encoding
        if not charset:
            # No encoding supplied: look for one declared in the document.
            found = charset_reg.search(doc)
            if found is None:
                charset = 'ascii'  # guess
            else:
                charset = found.group(1)
        # Undecodable bytes are substituted rather than raising.
        text = unicode(doc, charset, 'replace')

    # convert_entities and html2text are defined elsewhere in the file;
    # presumably entity expansion followed by markup removal -- confirm.
    plain = html2text(convert_entities(text))

    # Hand the result back as UTF-8 bytes together with the encoding name.
    return plain.encode('utf-8'), 'utf-8'
def convert(self, doc, encoding, mimetype,
            logError=False, raiseException=False):
    """Feed *doc* through ``StripTagParser`` and return the result.

    :param doc: the document, either a byte string or a ``unicode`` object.
    :param encoding: codec name for decoding a byte-string *doc*.  It is
        overridden when ``encoding_reg`` matches in the document, and
        replaced by ``default_encoding`` when still empty afterwards.
    :param mimetype: accepted for interface compatibility; not used here.
    :param logError: accepted for interface compatibility; not used here.
    :param raiseException: accepted for interface compatibility; not used
        here.
    :returns: a ``(data, 'utf-8')`` tuple where *data* is ``str()`` of the
        parser after the UTF-8 encoded document has been fed to it.
    """
    # An encoding captured from the document itself (the XML preamble,
    # per the original comment) takes precedence over the caller's value.
    declared = encoding_reg.search(doc)
    if declared:
        encoding = declared.group(1)
    encoding = encoding or default_encoding

    # Decode byte strings; undecodable bytes are substituted, not raised.
    if isinstance(doc, unicode):
        text = doc
    else:
        text = unicode(doc, encoding, 'replace')

    # convert_entities is defined elsewhere in the file -- presumably it
    # expands character entities; confirm against its definition.
    utf8_doc = convert_entities(text).encode('utf-8')

    # The parser receives UTF-8 bytes; its string form is the result.
    parser = StripTagParser()
    parser.feed(utf8_doc)
    parser.close()
    return str(parser), 'utf-8'