def htmlentityreplace_errors(exc):
    """Codec error callback that writes unencodable text as HTML entities.

    For a ``UnicodeEncodeError`` or ``UnicodeTranslateError`` the failing
    span ``exc.object[exc.start:exc.end]`` is first turned into a list of
    code points; two adjacent units accepted by ``utils.isSurrogatePair``
    are merged through ``utils.surrogatePairToCodepoint``.  Each code point
    is then emitted as ``&name;`` when ``encode_entity_map`` has a name for
    it, or as a hexadecimal reference ``&#x...;`` otherwise.

    Returns a ``(replacement, exc.end)`` tuple.  Every other exception type
    is passed on to ``xmlcharrefreplace_errors``.
    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    # Pass 1: collect the code points of the failing span.
    codepoints = []
    resume_at = 0  # offsets below this were consumed as the tail of a pair
    for offset, char in enumerate(exc.object[exc.start:exc.end]):
        if offset < resume_at:
            continue
        position = exc.start + offset
        # Inspect at most two units, never looking beyond exc.end.
        window = exc.object[position:min([exc.end, position + 2])]
        if utils.isSurrogatePair(window):
            codepoints.append(
                utils.surrogatePairToCodepoint(
                    exc.object[position:position + 2]))
            resume_at = offset + 2
        else:
            codepoints.append(ord(char))

    # Pass 2: render each code point as a named or numeric reference.
    pieces = []
    for cp in codepoints:
        name = encode_entity_map.get(cp)
        if not name:
            pieces.append("&#x%s;" % (hex(cp)[2:]))
            continue
        pieces.append("&")
        pieces.append(name)
        # Names in the map may or may not already carry the terminator.
        if not name.endswith(";"):
            pieces.append(";")
    return (u"".join(pieces), exc.end)
예제 #2
0
 def htmlentityreplace_errors(exc):
     """Replace the unencodable part of a string with HTML references.

     Handles ``UnicodeEncodeError`` and ``UnicodeTranslateError``: the text
     between ``exc.start`` and ``exc.end`` is reduced to code points (units
     that ``utils.isSurrogatePair`` reports as a pair are combined with
     ``utils.surrogatePairToCodepoint``), and each code point becomes either
     a named reference looked up in ``encode_entity_map`` or a hexadecimal
     numeric reference.  The result is ``(replacement_text, exc.end)``.

     Other exception types are delegated to ``xmlcharrefreplace_errors``.
     """
     if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
         text, start, end = exc.object, exc.start, exc.end

         codepoints = []
         pair_tail = False  # truthy while the next unit belongs to a pair
         for offset, char in enumerate(text[start:end]):
             if pair_tail:
                 pair_tail = False
                 continue
             here = start + offset
             # The probe window is clipped so it never extends past ``end``.
             pair_tail = utils.isSurrogatePair(
                 text[here:min([end, here + 2])])
             if pair_tail:
                 codepoints.append(
                     utils.surrogatePairToCodepoint(text[here:here + 2]))
             else:
                 codepoints.append(ord(char))

         out = []
         for cp in codepoints:
             name = encode_entity_map.get(cp)
             if name:
                 # Add the closing semicolon only when the name lacks one.
                 if name.endswith(u";"):
                     out.extend([u"&", name])
                 else:
                     out.extend([u"&", name, u";"])
             else:
                 out.append(u"&#x%s;" % (hex(cp)[2:]))
         return (u"".join(out), end)
     else:
         return xmlcharrefreplace_errors(exc)
예제 #3
0
except ImportError:
    unicode_encode_errors = u"strict"
else:
    unicode_encode_errors = u"htmlentityreplace"

    from html5lib.constants import entities

    # Reverse lookup used by the "htmlentityreplace" error handler:
    # maps an integer code point to an entity name taken from ``entities``.
    encode_entity_map = {}
    # True when a character outside the BMP has length 1 on this build;
    # otherwise such a character occupies two units.
    is_ucs4 = len(u"\U0010FFFF") == 1
    for k, v in entities.items():
        # Skip multi-character entities: more than one unit on a UCS-4
        # build, more than two units otherwise.
        if ((is_ucs4 and len(v) > 1) or (not is_ucs4 and len(v) > 2)):
            continue
        # The ampersand itself is never entered into the map.
        if v != u"&":
            if len(v) == 2:
                # NOTE(review): a two-unit value is treated as a surrogate
                # pair without checking -- confirm that is always the case.
                v = utils.surrogatePairToCodepoint(v)
            else:
                try:
                    v = ord(v)
                except TypeError:
                    # ord() only accepts a single character; show the
                    # offending value before letting the error propagate.
                    print(v)
                    raise
            if v not in encode_entity_map or k.islower():
                # When several names share a code point, an all-lowercase
                # name replaces whatever was stored earlier.
                encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            codepoints = []
            skip = False
예제 #4
0
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from html5lib.constants import entities

    # Table from integer code point to entity name, built from ``entities``
    # for use by the "htmlentityreplace" error handler.
    encode_entity_map = {}
    # Detect whether a non-BMP character is a single unit on this build.
    is_ucs4 = len(u"\U0010FFFF") == 1
    for k, v in entities.items():
        # Skip multi-character entities (longer than one unit on UCS-4
        # builds, longer than two units on other builds).
        if ((is_ucs4 and len(v) > 1) or
            (not is_ucs4 and len(v) > 2)):
            continue
        # "&" is deliberately left out of the map.
        if v != "&":
            if len(v) == 2:
                # NOTE(review): assumes any two-unit value is a surrogate
                # pair -- confirm against the entity table.
                v = utils.surrogatePairToCodepoint(v)
            else:
                try:
                    v = ord(v)
                except TypeError:
                    # Print the value ord() rejected, then re-raise.
                    print(v)
                    raise
            if v not in encode_entity_map or k.islower():
                # Prefer an all-lowercase entity name over any name that
                # was recorded earlier for the same code point.
                encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            codepoints = []
            skip = False