def htmlentityreplace_errors(exc):
    """Codec error handler that emits HTML entities for the failing text.

    For ``UnicodeEncodeError`` and ``UnicodeTranslateError`` the slice
    ``exc.object[exc.start:exc.end]`` is converted to a list of code points
    and each one is rendered either as a named reference looked up in
    ``encode_entity_map`` or, when the map has no (truthy) entry, as a
    hexadecimal numeric character reference.

    Any other exception is handed to ``xmlcharrefreplace_errors`` unchanged.

    Returns a ``(replacement, resume_position)`` tuple where the resume
    position is ``exc.end``.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    failed = exc.object[exc.start:exc.end]

    # Pass 1: turn the failing slice into code points.  Two code units that
    # utils.isSurrogatePair accepts are merged into a single code point and
    # the cursor steps over both of them.
    codepoints = []
    offset = 0
    while offset < len(failed):
        index = offset + exc.start
        # Never look past exc.end when probing for a pair.
        if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
            codepoints.append(
                utils.surrogatePairToCodepoint(exc.object[index:index+2]))
            offset += 2
        else:
            codepoints.append(ord(failed[offset]))
            offset += 1

    # Pass 2: render every code point as an entity reference.
    pieces = []
    for cp in codepoints:
        name = encode_entity_map.get(cp)
        if not name:
            # No named entity known: fall back to a hex numeric reference.
            pieces.append("&#x%s;"%(hex(cp)[2:]))
            continue
        pieces.append("&")
        pieces.append(name)
        # Some entity names already carry their trailing semicolon.
        if not name.endswith(";"):
            pieces.append(";")
    return (u"".join(pieces), exc.end)
def htmlentityreplace_errors(exc):
    """Replace unencodable characters with HTML entity references.

    Intended for use as a codec error callback.  When ``exc`` is a
    ``UnicodeEncodeError`` or ``UnicodeTranslateError`` the offending range
    ``exc.object[exc.start:exc.end]`` is rewritten: code points present in
    ``encode_entity_map`` become ``&name;`` and everything else becomes a
    hexadecimal ``&#x...;`` reference.  The result is returned together with
    ``exc.end`` as a ``(text, position)`` tuple.

    Other exception types are delegated to ``xmlcharrefreplace_errors``.
    """
    if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        segment = exc.object[exc.start:exc.end]
        codepoints = []

        # Walk the segment through an explicit iterator so that the second
        # half of a surrogate pair can be consumed without a flag variable.
        positions = iter(range(len(segment)))
        for i in positions:
            index = i + exc.start
            # Probe at most two code units, clipped to the failing range.
            window = exc.object[index:min([exc.end, index + 2])]
            if utils.isSurrogatePair(window):
                codepoints.append(
                    utils.surrogatePairToCodepoint(
                        exc.object[index:index + 2]))
                next(positions, None)
            else:
                codepoints.append(ord(segment[i]))

        res = []
        for cp in codepoints:
            e = encode_entity_map.get(cp)
            if e:
                # Only add the terminator when the entity name lacks one.
                terminator = u"" if e.endswith(u";") else u";"
                res.extend((u"&", e, terminator))
            else:
                # Unknown code point: hexadecimal numeric reference.
                res.append(u"&#x%s;" % (hex(cp)[2:]))
        return (u"".join(res), exc.end)
    else:
        return xmlcharrefreplace_errors(exc)
except ImportError: unicode_encode_errors = u"strict" else: unicode_encode_errors = u"htmlentityreplace" from html5lib.constants import entities encode_entity_map = {} is_ucs4 = len(u"\U0010FFFF") == 1 for k, v in list(entities.items()): #skip multi-character entities if ((is_ucs4 and len(v) > 1) or (not is_ucs4 and len(v) > 2)): continue if v != u"&": if len(v) == 2: v = utils.surrogatePairToCodepoint(v) else: try: v = ord(v) except: print v raise if not v in encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. encode_entity_map[v] = k def htmlentityreplace_errors(exc): if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): res = [] codepoints = [] skip = False
unicode_encode_errors = "strict" else: unicode_encode_errors = "htmlentityreplace" from html5lib.constants import entities encode_entity_map = {} is_ucs4 = len(u"\U0010FFFF") == 1 for k, v in entities.items(): #skip multi-character entities if ((is_ucs4 and len(v) > 1) or (not is_ucs4 and len(v) > 2)): continue if v != "&": if len(v) == 2: v = utils.surrogatePairToCodepoint(v) else: try: v = ord(v) except: print(v) raise if not v in encode_entity_map or k.islower(): # prefer < over < and similarly for &, >, etc. encode_entity_map[v] = k def htmlentityreplace_errors(exc): if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): res = [] codepoints = [] skip = False