class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p.

    Collects the textual content of an HTML document.  Every handler pushes
    a piece of text onto ``self.data``; ``getText()`` joins the pieces.

    ``tagReplace`` is the string emitted in place of every start and end
    tag.  When it is ``None`` (the default), tags listed in
    ``_block_elements`` are replaced by a single space and all other tags
    by the empty string.
    """
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self, tagReplace=None):
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

    def append(self, data):
        """Record one piece of output text."""
        self.data.append(data)

    def getTagReplace(self, tag):
        """Return the text that stands in for ``tag`` in the output."""
        if self.tagReplace is not None:
            return self.tagReplace
        # No explicit replacement configured: only block-level tags
        # produce a separating space.
        if tag in _block_elements:
            return ' '
        return ''

    def handle_starttag(self, tag, attr):
        self.append(self.getTagReplace(tag))

    def handle_endtag(self, tag):
        self.append(self.getTagReplace(tag))

    def handle_data(self, data):
        self.append(data)

    def handle_entityref(self, data):
        """Append the character for a named entity (``&name;``).

        Names missing from ``name2codepoint`` are appended as received,
        after decoding them to text when they arrive as a byte string.
        """
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)

    def getText(self):
        """Return the collected text, stripped and whitespace-normalized."""
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

    def handle_charref(self, name):
        """Append the character for a numeric reference (``&#NNN;``)."""
        try:
            # HTMLParser.unescape() was removed in Python 3.9;
            # html.unescape() is its documented replacement.
            from html import unescape
        except ImportError:
            # html.unescape is unavailable (e.g. Python 2): use the
            # parser's own method.
            unescape = self.unescape
        self.append(unescape('&#%s;' % name))
class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p.

    Gathers the text of an HTML document into ``self.data``.  Each start
    and end tag contributes ``tagReplace`` (a single space by default);
    ``getText()`` returns the joined result.
    """
    # Private copy of the entity table, with &nbsp; mapped to a plain space.
    entitydefs = dict(entitydefs, nbsp=' ')

    def __init__(self, tagReplace=' '):
        self.tagReplace = tagReplace
        self.data = []
        super(HtmlToText, self).__init__()

    def _push(self, piece):
        # Single place where output pieces are accumulated.
        self.data.append(piece)

    def handle_starttag(self, tag, attr):
        self._push(self.tagReplace)

    def handle_endtag(self, tag):
        self._push(self.tagReplace)

    def handle_data(self, data):
        self._push(data)

    def handle_entityref(self, data):
        """Emit the character for a named entity (``&name;``).

        Names absent from ``name2codepoint`` are emitted as received,
        decoded to text first when they arrive as a byte string.
        """
        if minisix.PY3:
            if data in name2codepoint:
                piece = chr(name2codepoint[data])
            elif isinstance(data, bytes):
                piece = data.decode()
            else:
                piece = data
        else:
            if data in name2codepoint:
                piece = unichr(name2codepoint[data])
            elif isinstance(data, str):
                piece = data.decode('utf8', errors='replace')
            else:
                piece = data
        self._push(piece)

    def getText(self):
        """Return the gathered text, stripped and whitespace-normalized."""
        joined = ''.join(self.data)
        return normalizeWhitespace(joined.strip())
d[k] = re_names.sub(ur"'\1' (qv)", v) if re_titles: d[k] = re_titles.sub(ur'_\1_ (qv)', v) if re_characters: d[k] = re_characters.sub(ur'#\1# (qv)', v) elif isinstance(v, (list, dict)): _putRefs(d[k], re_titles, re_names, re_characters, lastKey=lastKey) # Handle HTML/XML/SGML entities. from htmlentitydefs import entitydefs entitydefs = entitydefs.copy() entitydefsget = entitydefs.get entitydefs['nbsp'] = ' ' sgmlentity = { 'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\'', 'ndash': '-' } sgmlentityget = sgmlentity.get _sgmlentkeys = sgmlentity.keys() entcharrefs = {}
if isinstance(v, (unicode, str)): if lastKey in _modify_keys: if re_names: d[k] = re_names.sub(ur"'\1' (qv)", v) if re_titles: d[k] = re_titles.sub(ur'_\1_ (qv)', v) if re_characters: d[k] = re_characters.sub(ur'#\1# (qv)', v) elif isinstance(v, (list, dict)): _putRefs(d[k], re_titles, re_names, re_characters, lastKey=lastKey) # Handle HTML/XML/SGML entities. from htmlentitydefs import entitydefs entitydefs = entitydefs.copy() entitydefsget = entitydefs.get entitydefs['nbsp'] = ' ' sgmlentity = {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''} sgmlentityget = sgmlentity.get _sgmlentkeys = sgmlentity.keys() entcharrefs = {} entcharrefsget = entcharrefs.get for _k, _v in entitydefs.items(): if _k in _sgmlentkeys: continue if _v[0:2] == '&#': dec_code = _v[1:-1] _v = unichr(int(_v[2:-1])) entcharrefs[dec_code] = _v