def hexescape(builder, s, pos, digits, encoding, errorhandler, message, errors):
    """Decode the hex payload of an escape sequence and append it to ``builder``.

    ``digits`` characters of ``s`` starting at ``pos`` are parsed as a
    base-16 number.  On success the matching character is appended to
    ``builder`` (as a surrogate pair when the value is above MAXUNICODE but
    still <= 0x10ffff) and ``pos`` is advanced by ``digits``.  On failure
    ``errorhandler`` is called as
    ``errorhandler(errors, encoding_name, message, s, start, end)``; it must
    return ``(replacement, new_pos)``.  The replacement is appended to
    ``builder`` and ``new_pos`` becomes the result.

    Returns the position from which the caller should continue.
    """
    from rpython.rlib.rarithmetic import r_uint
    from rpython.rlib.runicode import MAXUNICODE, UNICHR

    # Error spans start at ``pos - 2``; presumably that covers the two-character
    # escape prefix preceding the digits -- confirm against the callers.
    if pos + digits > len(s):
        # Fewer than ``digits`` characters are left in the input.
        # NOTE(review): this branch reports the hard-coded name
        # "unicodeescape" instead of ``encoding``; kept as-is because callers
        # may rely on it.
        message = "end of string in escape sequence"
        res, pos = errorhandler(errors, "unicodeescape", message, s,
                                pos - 2, len(s))
        builder.append(res)
        return pos

    try:
        code = r_uint(int(str(s[pos:pos + digits]), 16))
    except ValueError:
        # Find where the run of hex digits stops so the error span can be
        # reported.  The scan is bounded by len(s) so it cannot index past the
        # end of the input (e.g. digits == 0 with only hex digits remaining).
        endinpos = pos
        while endinpos < len(s) and s[endinpos] in hexdigits:
            endinpos += 1
        res, pos = errorhandler(errors, encoding, message, s,
                                pos - 2, endinpos + 1)
        builder.append(res)
        return pos

    # When we get here, ``code`` is a 32-bit unicode code point.
    if code <= MAXUNICODE:
        builder.append(UNICHR(code))
        pos += digits
    elif code <= 0x10ffff:
        # Not representable as a single unit on this build: emit a
        # high/low surrogate pair instead.
        code -= 0x10000
        builder.append(unichr(0xD800 + (code >> 10)))
        builder.append(unichr(0xDC00 + (code & 0x03FF)))
        pos += digits
    else:
        message = "illegal Unicode character"
        res, pos = errorhandler(errors, encoding, message, s,
                                pos - 2, pos + digits)
        builder.append(res)
    return pos
def hex_to_utf8(state, token, s):
    """Return the UTF-8 encoding of the character whose hex code point is ``s``.

    If the conversion fails with ValueError or UnicodeDecodeError, the
    object produced by ``errorhandler(state, token, msg=...)`` is raised
    instead.
    """
    try:
        uchr = UNICHR(int(s, 16))
        encoded = unicode_encode_utf_8(uchr, len(uchr), 'strict')
    except (ValueError, UnicodeDecodeError):
        # XXX better error message
        raise errorhandler(state, token, msg="Error encoding %s" % s)
    return encoded
def chr(space, code):
    "Return a Unicode string of one character with the given ordinal."
    # The docstring above is kept verbatim: it may be exposed to callers.
    try:
        char = UNICHR(code)
    except ValueError:
        # UNICHR rejected the ordinal; report it through the object space.
        raise oefmt(space.w_ValueError, "chr() arg out of range")
    else:
        return space.newunicode(char)
def chr(space, code):
    "Return a Unicode string of one character with the given ordinal."
    # The docstring above is kept verbatim: it may be exposed to callers.
    try:
        char = UNICHR(code)
    except ValueError:
        # UNICHR rejected the ordinal; raise a wrapped ValueError.
        w_exc_type = space.w_ValueError
        w_message = space.wrap("chr() arg out of range")
        raise OperationError(w_exc_type, w_message)
    return space.wrap(char)
def hex_to_utf8(s):
    """Return the UTF-8 encoding of the character whose hex code point is ``s``.

    No exception is caught here: failures from int(), UNICHR or the
    encoder propagate to the caller.
    """
    code_point = int(s, 16)
    uchr = UNICHR(code_point)
    return unicode_encode_utf_8(uchr, len(uchr), 'strict')
def f(x):
    """Return ord(UNICHR(x)), or -42 if that raises ValueError."""
    try:
        char = UNICHR(x)
        result = ord(char)
    except ValueError:
        result = -42
    return result
def str_decode_utf_32_helper(s, size, errors, final=True,
                             errorhandler=None,
                             byteorder="native",
                             public_encoding_name='utf32',
                             allow_surrogates=True):
    """Decode the UTF-32 byte string ``s`` into unicode.

    ``byteorder`` selects the byte order: 'native' inspects the first four
    bytes for a BOM (compared against BOM32_DIRECT / BOM32_REVERSE) and
    skips it when found; 'little' forces little endian; any other value
    forces big endian.

    ``errorhandler`` (default: default_unicode_error_decode) is called as
    ``errorhandler(errors, public_encoding_name, message, s, start, end)``
    and must return ``(replacement, new_pos)``.  It is used for truncated
    input (only when ``final`` is true), for code points >= 0x110000 and,
    when ``allow_surrogates`` is false, for code points in
    0xD800..0xDFFF.

    Returns ``(decoded, pos, bo)`` where ``pos`` is the position reached in
    ``s`` and ``bo`` is -1 for little endian, 1 for big endian, or 0 when
    native order was requested and no BOM was found.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode

    native_is_little = BYTEORDER == 'little'
    # byte_index[k] is the offset, within a 4-byte unit, of the byte that
    # contributes bits 8*k .. 8*k+7 of the code point.
    if native_is_little:
        byte_index = [0, 1, 2, 3]
    else:
        byte_index = [3, 2, 1, 0]

    # Check for BOM marks (U+FEFF) in the input and adjust the byte order
    # setting accordingly.  In native mode a leading BOM is skipped; in all
    # other modes it is copied to the output as-is (giving a ZWNBSP
    # character).
    bo = 0
    pos = 0
    if byteorder == 'native':
        if size >= 4:
            bom = intmask((ord(s[byte_index[3]]) << 24) |
                          (ord(s[byte_index[2]]) << 16) |
                          (ord(s[byte_index[1]]) << 8) |
                          ord(s[byte_index[0]]))
            # A direct BOM confirms the native order, a reversed one
            # selects the opposite order.
            native_bo = -1 if native_is_little else 1
            if bom == BOM32_DIRECT:
                pos = 4
                bo = native_bo
            elif bom == BOM32_REVERSE:
                pos = 4
                bo = -native_bo
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1

    if size == 0:
        return u'', 0, bo

    if bo == -1:
        # force little endian
        byte_index = [0, 1, 2, 3]
    elif bo == 1:
        # force big endian
        byte_index = [3, 2, 1, 0]

    result = UnicodeBuilder(size // 4)

    while pos < size:
        # Remaining bytes at the end?  (size should be divisible by 4)
        if len(s) - pos < 4:
            if not final:
                break
            r, pos = errorhandler(errors, public_encoding_name,
                                  "truncated data", s, pos, len(s))
            result.append(r)
            if len(s) - pos < 4:
                break
            continue

        ch = ((ord(s[pos + byte_index[3]]) << 24) |
              (ord(s[pos + byte_index[2]]) << 16) |
              (ord(s[pos + byte_index[1]]) << 8) |
              ord(s[pos + byte_index[0]]))

        # Both kinds of invalid code point are reported over the same
        # 4-byte span; only the message differs.
        problem = None
        if not allow_surrogates and 0xD800 <= ch <= 0xDFFF:
            problem = ("code point in surrogate code point "
                       "range(0xd800, 0xe000)")
        elif ch >= 0x110000:
            problem = "codepoint not in range(0x110000)"
        if problem is not None:
            r, pos = errorhandler(errors, public_encoding_name, problem,
                                  s, pos, pos + 4)
            result.append(r)
            continue

        if MAXUNICODE < 65536 and ch >= 0x10000:
            # Narrow build: emit a high/low surrogate pair.
            offset = ch - 0x10000
            result.append(unichr(0xD800 + (offset >> 10)))
            result.append(unichr(0xDC00 + (offset & 0x03FF)))
        else:
            result.append(UNICHR(ch))
        pos += 4

    return result.build(), pos, bo
def unichr(space, code):
    "Return a Unicode string of one character with the given ordinal."
    # The docstring above is kept verbatim: it may be exposed to callers.
    # Reject ordinals outside 0..0x10FFFF before building the character.
    if not (0 <= code <= 0x10FFFF):
        raise oefmt(space.w_ValueError, "unichr() arg out of range")
    return space.newunicode(UNICHR(code))
def str_decode_utf_16_helper(s, size, errors, final=True, errorhandler=None, byteorder="native", public_encoding_name='utf16'):
    """Decode the UTF-16 byte string ``s`` into unicode.

    ``byteorder`` selects the byte order: 'native' inspects the first two
    bytes for a BOM (0xFEFF / 0xFFFE) and skips it when found; 'little'
    forces little endian; any other value forces big endian.

    ``errorhandler`` (default: default_unicode_error_decode) is called as
    ``errorhandler(errors, public_encoding_name, message, s, start, end)``
    and must return ``(replacement, new_pos)``.  When ``final`` is false,
    incomplete trailing data stops the decoding instead of being reported.

    Returns ``(decoded, pos, bo)`` where ``pos`` is the position reached in
    ``s`` and ``bo`` is -1 for little endian, 1 for big endian, or 0 when
    native order was requested and no BOM was found.
    """
    if errorhandler is None:
        errorhandler = default_unicode_error_decode
    bo = 0

    # ihi / ilo are the offsets, within a 2-byte unit, of the high and low
    # byte; start with the machine's native layout.
    if BYTEORDER == 'little':
        ihi = 1
        ilo = 0
    else:
        ihi = 0
        ilo = 1

    # Check for BOM marks (U+FEFF) in the input and adjust current
    # byte order setting accordingly. In native mode, the leading BOM
    # mark is skipped, in all other modes, it is copied to the output
    # stream as-is (giving a ZWNBSP character).
    pos = 0
    if byteorder == 'native':
        if size >= 2:
            bom = (ord(s[ihi]) << 8) | ord(s[ilo])
            if BYTEORDER == 'little':
                if bom == 0xFEFF:
                    pos += 2
                    bo = -1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = 1
            else:
                if bom == 0xFEFF:
                    pos += 2
                    bo = 1
                elif bom == 0xFFFE:
                    pos += 2
                    bo = -1
    elif byteorder == 'little':
        bo = -1
    else:
        bo = 1
    if size == 0:
        return u'', 0, bo
    if bo == -1:
        # force little endian
        ihi = 1
        ilo = 0
    elif bo == 1:
        # force big endian
        ihi = 0
        ilo = 1

    result = UnicodeBuilder(size // 2)

    #XXX I think the errors are not correctly handled here
    while pos < size:
        # remaining bytes at the end? (size should be even)
        if len(s) - pos < 2:
            if not final:
                break
            r, pos = errorhandler(errors, public_encoding_name, "truncated data", s, pos, len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
            # No ``continue`` here: if the handler left at least two bytes,
            # decoding proceeds from the new ``pos`` in this same iteration.
        ch = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
        pos += 2
        if ch < 0xD800 or ch > 0xDFFF:
            # Not a surrogate: a complete character on its own.
            result.append(unichr(ch))
            continue
        # UTF-16 code pair:
        if len(s) - pos < 2:
            # The second unit is missing; step back so the reported span
            # starts at the surrogate that was just read.
            pos -= 2
            if not final:
                break
            errmsg = "unexpected end of data"
            r, pos = errorhandler(errors, public_encoding_name, errmsg, s, pos, len(s))
            result.append(r)
            if len(s) - pos < 2:
                break
        elif 0xD800 <= ch <= 0xDBFF:
            # High surrogate: read the unit that should be its low surrogate.
            ch2 = (ord(s[pos + ihi]) << 8) | ord(s[pos + ilo])
            pos += 2
            if 0xDC00 <= ch2 <= 0xDFFF:
                if MAXUNICODE < 65536:
                    # Narrow build: keep the pair as two separate units.
                    result.append(unichr(ch))
                    result.append(unichr(ch2))
                else:
                    # Combine the pair into a single code point.
                    result.append( UNICHR((((ch & 0x3FF) << 10) | (ch2 & 0x3FF)) + 0x10000))
                continue
            else:
                # Both units were consumed, but only the first one
                # (pos - 4 .. pos - 2) is reported as the error span.
                r, pos = errorhandler(errors, public_encoding_name, "illegal UTF-16 surrogate", s, pos - 4, pos - 2)
                result.append(r)
        else:
            # ch is in 0xDC00..0xDFFF: a low surrogate with no preceding
            # high surrogate.
            r, pos = errorhandler(errors, public_encoding_name, "illegal encoding", s, pos - 2, pos)
            result.append(r)
    return result.build(), pos, bo