Пример #1
0
class HtmlToText(HTMLParser, object):
    """Reduce HTML markup to its text content.

    Taken from some eff-bot code on c.l.p.

    Text and resolved entities are accumulated in ``self.data``; call
    ``getText()`` after feeding the parser to obtain the joined result.
    Each start/end tag contributes ``tagReplace``; when ``tagReplace`` is
    ``None``, tags found in ``_block_elements`` contribute one space and
    every other tag contributes nothing.
    """
    # Modify a copy so the imported ``entitydefs`` table stays untouched.
    # NOTE(review): handle_entityref() below resolves names through
    # ``name2codepoint`` and does not read this table -- confirm whether
    # these two overrides are still meant to take effect.
    entitydefs = entitydefs.copy()
    entitydefs['nbsp'] = ' '
    entitydefs['apos'] = '\''

    def __init__(self, tagReplace=None):
        """Create an empty collector.

        ``tagReplace`` is the string emitted in place of every tag, or
        ``None`` to pick the replacement per tag (see ``getTagReplace``).
        """
        self.data = []
        self.tagReplace = tagReplace
        super(HtmlToText, self).__init__()

    def append(self, data):
        """Add one text fragment to the collected output."""
        self.data.append(data)

    def getTagReplace(self, tag):
        """Return the string that stands in for ``tag`` in the output."""
        if self.tagReplace is not None:
            return self.tagReplace
        # No explicit replacement: keep a separator only where the tag is
        # one of the known block elements.
        if tag in _block_elements:
            return ' '
        return ''

    def handle_starttag(self, tag, attr):
        """Replace an opening tag; its attributes are ignored."""
        self.append(self.getTagReplace(tag))

    def handle_endtag(self, tag):
        """Replace a closing tag."""
        self.append(self.getTagReplace(tag))

    def handle_data(self, data):
        """Collect a run of plain text unchanged."""
        self.append(data)

    def handle_entityref(self, data):
        """Collect the character for the named entity ``data``.

        Names present in ``name2codepoint`` become the matching character.
        Any other name is appended as given, after decoding it when it is
        a byte string.
        """
        if minisix.PY3:
            if data in name2codepoint:
                self.append(chr(name2codepoint[data]))
            elif isinstance(data, bytes):
                self.append(data.decode())
            else:
                self.append(data)
        else:
            if data in name2codepoint:
                self.append(unichr(name2codepoint[data]))
            elif isinstance(data, str):
                self.append(data.decode('utf8', errors='replace'))
            else:
                self.append(data)

    def getText(self):
        """Return everything collected so far as one normalized string."""
        text = ''.join(self.data).strip()
        return normalizeWhitespace(text)

    def handle_charref(self, name):
        """Collect the character for a numeric reference.

        ``name`` is the text placed between ``&#`` and ``;`` when the
        reference is rebuilt (for example ``'65'`` or ``'x41'``).
        """
        reference = '&#%s;' % name
        # HTMLParser.unescape() no longer exists on Python 3.9+; use it
        # when the parser still provides it, otherwise html.unescape().
        unescape = getattr(self, 'unescape', None)
        if unescape is None:
            from html import unescape
        self.append(unescape(reference))
Пример #2
0
class HtmlToText(HTMLParser, object):
    """Taken from some eff-bot code on c.l.p.

    Parser that gathers the textual pieces of an HTML document in
    ``self.data``; every start and end tag is swapped for ``tagReplace``.
    ``getText()`` joins and normalizes what was gathered.
    """
    # Local copy of the entity table with the nbsp entry overridden.
    entitydefs = dict(entitydefs)
    entitydefs.update({'nbsp': ' '})

    def __init__(self, tagReplace=' '):
        """Start with no collected text; ``tagReplace`` stands in for tags."""
        self.tagReplace = tagReplace
        self.data = []
        super(HtmlToText, self).__init__()

    def _collect(self, piece):
        """Store one output fragment."""
        self.data.append(piece)

    def handle_starttag(self, tag, attr):
        """Emit the replacement string for an opening tag."""
        self._collect(self.tagReplace)

    def handle_endtag(self, tag):
        """Emit the replacement string for a closing tag."""
        self._collect(self.tagReplace)

    def handle_data(self, data):
        """Keep plain text exactly as received."""
        self._collect(data)

    def handle_entityref(self, data):
        """Emit the character for entity ``data``, or the name itself.

        Names not found in ``name2codepoint`` are kept as given, decoded
        first when they arrive as a byte string.
        """
        if not minisix.PY3:
            if data in name2codepoint:
                piece = unichr(name2codepoint[data])
            elif isinstance(data, str):
                piece = data.decode('utf8', errors='replace')
            else:
                piece = data
        elif data in name2codepoint:
            piece = chr(name2codepoint[data])
        elif isinstance(data, bytes):
            piece = data.decode()
        else:
            piece = data
        self._collect(piece)

    def getText(self):
        """Join the collected fragments, strip, and normalize whitespace."""
        return normalizeWhitespace(''.join(self.data).strip())
Пример #3
0
                        d[k] = re_names.sub(ur"'\1' (qv)", v)
                    if re_titles:
                        d[k] = re_titles.sub(ur'_\1_ (qv)', v)
                    if re_characters:
                        d[k] = re_characters.sub(ur'#\1# (qv)', v)
            elif isinstance(v, (list, dict)):
                _putRefs(d[k],
                         re_titles,
                         re_names,
                         re_characters,
                         lastKey=lastKey)


# Handle HTML/XML/SGML entities.
from htmlentitydefs import entitydefs
# Private copy of the imported entity table, with 'nbsp' mapped to a
# plain space; the bound ``get`` is kept as a module-level shortcut.
entitydefs = dict(entitydefs)
entitydefs.update({'nbsp': ' '})
entitydefsget = entitydefs.get

# Hand-written replacements for a small set of entity names.
sgmlentity = dict((
    ('lt', '<'),
    ('gt', '>'),
    ('amp', '&'),
    ('quot', '"'),
    ('apos', '\''),
    ('ndash', '-'),
))
sgmlentityget = sgmlentity.get
_sgmlentkeys = sgmlentity.keys()

# Starts empty here.
entcharrefs = dict()
Пример #4
0
            if isinstance(v, (unicode, str)):
                if lastKey in _modify_keys:
                    if re_names:
                        d[k] = re_names.sub(ur"'\1' (qv)", v)
                    if re_titles:
                        d[k] = re_titles.sub(ur'_\1_ (qv)', v)
                    if re_characters:
                        d[k] = re_characters.sub(ur'#\1# (qv)', v)
            elif isinstance(v, (list, dict)):
                _putRefs(d[k], re_titles, re_names, re_characters,
                        lastKey=lastKey)


# Handle HTML/XML/SGML entities.
from htmlentitydefs import entitydefs
# Take a copy of the imported entity table before overriding 'nbsp' with
# a plain space, then expose its ``get`` as a module-level shortcut.
entitydefs = entitydefs.copy()
entitydefs['nbsp'] = ' '
entitydefsget = entitydefs.get

# Hand-written replacements for a small set of entity names.
sgmlentity = {
    'lt': '<',
    'gt': '>',
    'amp': '&',
    'quot': '"',
    'apos': '\'',
}
sgmlentityget, _sgmlentkeys = sgmlentity.get, sgmlentity.keys()

# Empty at this point; ``entcharrefsget`` is the shortcut for lookups in it.
entcharrefs = {}
entcharrefsget = entcharrefs.get
for _k, _v in entitydefs.items():
    if _k in _sgmlentkeys: continue
    if _v[0:2] == '&#':
        dec_code = _v[1:-1]
        _v = unichr(int(_v[2:-1]))
        entcharrefs[dec_code] = _v