def backslashreplace_errors(space, w_exc):
    """Codec error callback that escapes the offending range.

    For UnicodeEncodeError / UnicodeTranslateError the ``object`` attribute
    is unwrapped as unicode; for UnicodeDecodeError it is unwrapped as
    bytes.  Every item of ``object[start:end]`` is handed, as an ordinal,
    to ``raw_unicode_escape_helper_unicode`` which appends to the builder.

    Returns a wrapped tuple ``(replacement, end)``.  Any other exception
    type raises an app-level TypeError.
    """
    check_exception(space, w_exc)

    handles_text = (
        space.isinstance_w(w_exc, space.w_UnicodeEncodeError) or
        space.isinstance_w(w_exc, space.w_UnicodeTranslateError))
    if handles_text:
        text = space.realunicode_w(
            space.getattr(w_exc, space.newtext('object')))
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        out = UnicodeBuilder()
        for index in range(start, end):
            raw_unicode_escape_helper_unicode(out, ord(text[index]))
        return space.newtuple([space.newunicode(out.build()), w_end])

    if space.isinstance_w(w_exc, space.w_UnicodeDecodeError):
        data = space.bytes_w(space.getattr(w_exc, space.newtext('object')))
        start = space.int_w(space.getattr(w_exc, space.newtext('start')))
        w_end = space.getattr(w_exc, space.newtext('end'))
        end = space.int_w(w_end)
        out = UnicodeBuilder()
        for index in range(start, end):
            raw_unicode_escape_helper_unicode(out, ord(data[index]))
        return space.newtuple([space.newunicode(out.build()), w_end])

    raise oefmt(space.w_TypeError,
                "don't know how to handle %T in error callback", w_exc)
def test_unicode_builder():
    """Exercise append, append_slice, append_multiple_char and build."""
    builder = UnicodeBuilder()
    for piece in (u'a', u'abc'):
        builder.append(piece)
    # u'abcdef'[1:2] contributes the single character u'b'
    builder.append_slice(u'abcdef', 1, 2)
    assert builder.getlength() == len('aabcb')

    builder.append_multiple_char(u'd', 4)
    # build() is called twice on purpose: once for the value, once for
    # the type of what it returns.
    assert builder.build() == 'aabcbdddd'
    assert isinstance(builder.build(), unicode)
def xmlcharrefreplace_errors(space, w_exc):
    """Codec error callback emitting ``&#<decimal>;`` references.

    Only UnicodeEncodeError is accepted; anything else raises an app-level
    TypeError.  Each code unit of ``object[start:end]`` becomes one
    reference.  When MAXUNICODE is 0xffff, a high surrogate immediately
    followed (inside the range) by a low surrogate is merged into a single
    code point first.

    Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    text = space.realunicode_w(w_text)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)

    out = UnicodeBuilder()
    index = start
    while index < end:
        code = ord(text[index])
        index += 1
        # index now points at the unit following the one just read
        if MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and index < end:
            low = ord(text[index])
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + (((code & 0x03FF) << 10) | (low & 0x03FF))
                index += 1  # the low surrogate has been consumed too
        out.append(u"&#")
        out.append(unicode(str(code)))
        out.append(u";")
    return space.newtuple([space.newunicode(out.build()), w_end])
def string_append(args):
    """Concatenate a list of W_String values into a new W_String.

    Starts on an ASCII fast path; the first ``ValueError`` coming out of
    ``as_str_ascii()`` switches to a unicode builder, seeded with what was
    collected so far, and processing resumes at the argument that failed.
    Raises SchemeException if any argument is not a W_String.
    """
    if not args:
        return W_String.fromascii("")

    ascii_builder = StringBuilder()
    ascii_idx = 0
    try:
        for ascii_idx in range(len(args)):
            w_piece = args[ascii_idx]
            if not isinstance(w_piece, W_String):
                raise SchemeException("string-append: expected a string")
            ascii_builder.append(w_piece.as_str_ascii())
    except ValueError:
        # args[ascii_idx] was not appended yet, so restart from it
        unibuilder = UnicodeBuilder()
        unibuilder.append(unicode(ascii_builder.build()))
        for position in range(ascii_idx, len(args)):
            w_piece = args[position]
            if not isinstance(w_piece, W_String):
                raise SchemeException("string-append: expected a string")
            unibuilder.append(w_piece.as_unicode())
        return W_String.fromunicode(unibuilder.build())
    return W_String.fromascii(ascii_builder.build())
def rawwcharp2unicoden(wcp, maxlen):
    """Build a unicode string from at most ``maxlen`` items of ``wcp``.

    Copying stops early at the first item whose value, cast to a signed
    integer, is zero.  The result is passed through ``assert_str0``.
    """
    builder = UnicodeBuilder(maxlen)
    for index in range(maxlen):
        if rffi.cast(lltype.Signed, wcp[index]) == 0:
            break
        builder.append(code_to_unichr(wcp[index]))
    return assert_str0(builder.build())
def upper(self, w_str):
    """Return a new W_MutableString holding the uppercased characters.

    The storage of ``w_str`` is unerased, each character is mapped through
    ``unicodedb.toupper`` and the result is stored as an erased list.
    """
    chars = self.unerase(w_str.get_storage())
    uppercased = UnicodeBuilder(len(chars))
    for ch in chars:
        code = unicodedb.toupper(ord(ch))
        uppercased.append(unichr(code))
    storage = self.erase(list(uppercased.build()))
    return W_MutableString(self, storage)
def xmlcharrefreplace_errors(space, w_exc):
    """Codec error callback emitting ``&#<decimal>;`` references.

    Only UnicodeEncodeError is accepted; anything else raises an app-level
    TypeError.  Each code unit of ``object[start:end]`` becomes one
    reference.  When MAXUNICODE is 0xffff, a high surrogate immediately
    followed (inside the range) by a low surrogate is merged into a single
    code point first.

    Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    text = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
    start = space.int_w(space.getattr(w_exc, space.wrap('start')))
    w_end = space.getattr(w_exc, space.wrap('end'))
    end = space.int_w(w_end)

    out = UnicodeBuilder()
    index = start
    while index < end:
        code = ord(text[index])
        index += 1
        # index now points at the unit following the one just read
        if MAXUNICODE == 0xffff and 0xD800 <= code <= 0xDBFF and index < end:
            low = ord(text[index])
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + (((code & 0x03FF) << 10) | (low & 0x03FF))
                index += 1  # the low surrogate has been consumed too
        out.append(u"&#")
        out.append(unicode(str(code)))
        out.append(u";")
    return space.newtuple([space.wrap(out.build()), w_end])
def rawwcharp2unicoden(wcp, maxlen):
    """Build a unicode string from at most ``maxlen`` items of ``wcp``.

    Copying stops early at the first item whose value, cast to a signed
    integer, is zero.  The result is passed through ``assert_str0``.
    """
    builder = UnicodeBuilder(maxlen)
    for index in range(maxlen):
        if rffi.cast(lltype.Signed, wcp[index]) == 0:
            break
        builder.append(code_to_unichr(wcp[index]))
    return assert_str0(builder.build())
def string_append(args):
    """Concatenate a list of W_String values into a new W_String.

    When the JIT sees ``len(args)`` as a constant the work is delegated to
    ``string_append_fastpath``.  Otherwise an ASCII builder is tried first;
    the first ``ValueError`` coming out of ``as_str_ascii()`` switches to a
    unicode builder, seeded with what was collected so far, and processing
    resumes at the argument that failed.
    Raises SchemeException if any argument is not a W_String.
    """
    if jit.isconstant(len(args)):
        return string_append_fastpath(args)
    if not args:
        return W_String.fromascii("")

    count = len(args)
    ascii_builder = StringBuilder(count)
    ascii_idx = 0
    try:
        for ascii_idx in range(count):
            w_piece = args[ascii_idx]
            if not isinstance(w_piece, W_String):
                raise SchemeException("string-append: expected a string")
            ascii_builder.append(w_piece.as_str_ascii())
    except ValueError:
        # args[ascii_idx] was not appended yet, so restart from it
        unibuilder = UnicodeBuilder(count)
        unibuilder.append(unicode(ascii_builder.build()))
        for position in range(ascii_idx, count):
            w_piece = args[position]
            if not isinstance(w_piece, W_String):
                raise SchemeException("string-append: expected a string")
            unibuilder.append(w_piece.as_unicode())
        return W_String.fromunicode(unibuilder.build())
    return W_String.fromascii(ascii_builder.build())
def escape_string(string):
    """Return ``string`` as a double-quoted, backslash-escaped literal.

    Characters in 0x20..0x7E and everything above 0xFF are copied as they
    are, except that backslash and double quote get a leading backslash.
    All other characters are written as a backslash followed by the entry
    from ``character_escapes`` for that ordinal, falling back to ``xHH``
    (two lowercase hex digits).
    """
    hexdigits = u"0123456789abcdef"
    out = UnicodeBuilder()
    out.append(u'"')
    for ch in string:
        n = ord(ch)
        # Code points above 0xFF are deliberately passed through verbatim;
        # no \uXXXX escape is generated for them.
        passthrough = (0x20 <= n and n <= 0x7E) or 0xFF < n
        if not passthrough:
            fallback = u'x' + hexdigits[n >> 4 & 15] + hexdigits[n & 15]
            out.append(u'\\' + character_escapes.get(n, fallback))
        elif ch == u'\\':
            out.append(u'\\\\')
        elif ch == u'"':
            out.append(u'\\"')
        else:
            out.append(ch)
    out.append(u'"')
    return out.build()
def char_utf_8_length(char):
    """Return, as a W_Fixnum, the byte length of ``char`` encoded as UTF-8.

    Equivalent to (bytes-length (string->bytes/utf-8 (string char))).
    """
    one_char = UnicodeBuilder()
    one_char.append(char.value)
    w_string = W_String.fromunicode(one_char.build())
    utf8_chars = w_string.as_charlist_utf8()
    w_encoded = values.W_Bytes.from_charlist(utf8_chars)
    return values.W_Fixnum(w_encoded.length())
def upper(self, w_str):
    """Return a new W_MutableString holding the uppercased characters.

    The storage of ``w_str`` is unerased, each character is mapped through
    ``unicodedb.toupper`` and the result is stored as an erased list.
    """
    chars = self.unerase(w_str.get_storage())
    uppercased = UnicodeBuilder(len(chars))
    for ch in chars:
        code = unicodedb.toupper(ord(ch))
        uppercased.append(unichr(code))
    storage = self.erase(list(uppercased.build()))
    return W_MutableString(self, storage)
def backslashreplace_errors(space, w_exc):
    """Codec error callback that writes backslash escapes.

    Only UnicodeEncodeError is accepted; anything else raises an app-level
    TypeError.  Each item of ``object[start:end]`` becomes ``\\xHH``
    (ordinal below 0x100), ``\\uHHHH`` (below 0x10000) or ``\\UHHHHHHHH``,
    zero-padded to the stated number of hex digits.

    Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    text = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
    start = space.int_w(space.getattr(w_exc, space.wrap('start')))
    w_end = space.getattr(w_exc, space.wrap('end'))
    end = space.int_w(w_end)

    out = UnicodeBuilder()
    for index in range(start, end):
        codepoint = ord(text[index])
        hexrepr = hex(codepoint)  # carries a leading '0x'
        if codepoint >= 0x10000:
            out.append(u"\\U")
            width = 8
        elif codepoint >= 0x100:
            out.append(u"\\u")
            width = 4
        else:
            out.append(u"\\x")
            width = 2
        total = len(hexrepr)
        padding = width + 2 - total  # +2 accounts for the '0x' prefix
        if padding > 0:
            out.append_multiple_char(u'0', padding)
        out.append_slice(unicode(hexrepr), 2, total)
    return space.newtuple([space.wrap(out.build()), w_end])
def read_w(self, space, w_size=None):
    """Read and return decoded text as a wrapped unicode object.

    With a negative converted size, everything left in the underlying
    buffer is read, decoded with the final flag set, and appended to the
    characters still pending in ``self.decoded``; the snapshot is dropped.
    Otherwise chunks are collected until ``size`` characters have been
    gathered or ``_ensure_data`` reports that no more data is available.

    Raises an app-level IOError ("not readable") when there is no decoder.
    """
    self._check_attached(space)
    self._check_closed(space)
    if not self.w_decoder:
        raise oefmt(space.w_IOError, "not readable")

    size = convert_size(space, w_size)
    self._writeflush(space)

    if size >= 0:
        wanted = size
        chunks = UnicodeBuilder(size)
        # Keep pulling chunks until enough characters were collected
        while wanted > 0 and self._ensure_data(space):
            piece = self.decoded.get_chars(wanted)
            chunks.append(piece)
            wanted -= len(piece)
        return space.newunicode(chunks.build())

    # Negative size: read everything
    w_bytes = space.call_method(self.w_buffer, "read")
    w_decoded = space.call_method(self.w_decoder, "decode",
                                  w_bytes, space.w_True)
    check_decoded(space, w_decoded)
    w_pending = space.newunicode(self.decoded.get_chars(-1))
    w_everything = space.add(w_pending, w_decoded)
    self.snapshot = None
    return w_everything
class W_UnicodeBuilder(W_Root):
    """Interpreter-level object holding a UnicodeBuilder.

    The ``descr_*`` methods forward to the wrapped builder: append,
    append_slice (with bounds validation), build and length.
    """

    def __init__(self, space, size):
        # A negative size means that no initial size is passed on.
        if size >= 0:
            self.builder = UnicodeBuilder(size)
        else:
            self.builder = UnicodeBuilder()

    @unwrap_spec(size=int)
    def descr__new__(space, w_subtype, size=-1):
        """Create a W_UnicodeBuilder; ``size`` defaults to -1."""
        return W_UnicodeBuilder(space, size)

    @unwrap_spec(s=unicode)
    def descr_append(self, space, s):
        """Append the unicode string ``s``."""
        self.builder.append(s)

    @unwrap_spec(s=unicode, start=int, end=int)
    def descr_append_slice(self, space, s, start, end):
        """Append ``s[start:end]``.

        Raises an app-level ValueError unless
        ``0 <= start <= end <= len(s)``.
        """
        if not 0 <= start <= end <= len(s):
            raise oefmt(space.w_ValueError, "bad start/stop")
        self.builder.append_slice(s, start, end)

    def descr_build(self, space):
        """Return the accumulated text as a wrapped unicode object."""
        # The builder stays usable after build(): more strings can be
        # appended to it.  This is supported since 2ff5087aca28 in RPython.
        return space.newunicode(self.builder.build())

    def descr_len(self, space):
        """Return the current length of the builder as a wrapped int."""
        builder = self.builder
        # NOTE(review): nothing in this class sets the builder to None;
        # the guard is kept as it was.
        if builder is None:
            raise oefmt(space.w_ValueError, "no length of built builder")
        return space.newint(builder.getlength())
def backslashreplace_errors(space, w_exc):
    """Codec error callback that writes backslash escapes.

    Only UnicodeEncodeError is accepted; anything else raises an app-level
    TypeError.  Each item of ``object[start:end]`` becomes ``\\xHH``
    (ordinal below 0x100), ``\\uHHHH`` (below 0x10000) or ``\\UHHHHHHHH``,
    zero-padded to the stated number of hex digits.

    Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    text = space.realunicode_w(space.getattr(w_exc, space.wrap('object')))
    start = space.int_w(space.getattr(w_exc, space.wrap('start')))
    w_end = space.getattr(w_exc, space.wrap('end'))
    end = space.int_w(w_end)

    out = UnicodeBuilder()
    for index in range(start, end):
        codepoint = ord(text[index])
        hexrepr = hex(codepoint)  # carries a leading '0x'
        if codepoint >= 0x10000:
            out.append(u"\\U")
            width = 8
        elif codepoint >= 0x100:
            out.append(u"\\u")
            width = 4
        else:
            out.append(u"\\x")
            width = 2
        total = len(hexrepr)
        padding = width + 2 - total  # +2 accounts for the '0x' prefix
        if padding > 0:
            out.append_multiple_char(u'0', padding)
        out.append_slice(unicode(hexrepr), 2, total)
    return space.newtuple([space.wrap(out.build()), w_end])
def namereplace_errors(space, w_exc):
    """Codec error callback that writes ``\\N{NAME}`` escapes.

    Only UnicodeEncodeError is accepted; anything else raises an app-level
    TypeError.  For each item of ``object[start:end]`` the name is looked
    up with ``unicodedb.name``; when the lookup raises KeyError the item is
    handed to ``raw_unicode_escape_helper_unicode`` instead.

    Returns a wrapped tuple ``(replacement, end)``.
    """
    check_exception(space, w_exc)
    if not space.isinstance_w(w_exc, space.w_UnicodeEncodeError):
        raise oefmt(space.w_TypeError,
                    "don't know how to handle %T in error callback", w_exc)

    w_text = space.getattr(w_exc, space.newtext('object'))
    text = space.realunicode_w(w_text)
    start = space.int_w(space.getattr(w_exc, space.newtext('start')))
    w_end = space.getattr(w_exc, space.newtext('end'))
    end = space.int_w(w_end)

    out = UnicodeBuilder()
    for index in range(start, end):
        codepoint = ord(text[index])
        try:
            name = unicodedb.name(codepoint)
        except KeyError:
            # no name known for this code point
            raw_unicode_escape_helper_unicode(out, codepoint)
            continue
        out.append(u'\\N{')
        out.append(unicode(name))
        out.append(u'}')
    return space.newtuple([space.newunicode(out.build()), w_end])
def char_utf_8_length(char):
    """Return, as a W_Fixnum, the byte length of ``char`` encoded as UTF-8.

    Equivalent to (bytes-length (string->bytes/utf-8 (string char))).
    """
    one_char = UnicodeBuilder()
    one_char.append(char.value)
    w_string = W_String.fromunicode(one_char.build())
    utf8_chars = w_string.as_charlist_utf8()
    w_encoded = values.W_Bytes.from_charlist(utf8_chars)
    return values.W_Fixnum(w_encoded.length())
def func():
    """Build a 51-character unicode string with a builder sized at 32.

    The appended pieces total 1 + 3 + 6 + 1 + 40 characters, which is more
    than the initial size given to the builder.
    """
    builder = UnicodeBuilder(32)
    builder.append(u'a')
    builder.append(u'abc')
    builder.append(u'abcdef')
    builder.append_slice(u'abc', 1, 2)  # contributes u'b'
    builder.append_multiple_char(u'u', 40)
    built = builder.build()
    return built
def f(n):
    # Runs n iterations; each one creates a fresh UnicodeBuilder and checks
    # that building it without any append gives the empty unicode string.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        if sb.build() != u"":
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def toLowerCase(self):
    """Return ``self._s`` with every character mapped via unicodedb.tolower."""
    source = self._s
    # The current length serves as the size hint: ideal when characters
    # map one-to-one, and otherwise only a small overestimate.
    lowered = UnicodeBuilder(len(source))
    for ch in source:
        code = unicodedb.tolower(ord(ch))
        lowered.append(unichr(code))
    return lowered.build()
def toLowerCase(self):
    """Return ``self._s`` with every character mapped via unicodedb.tolower."""
    source = self._s
    # The current length serves as the size hint: ideal when characters
    # map one-to-one, and otherwise only a small overestimate.
    lowered = UnicodeBuilder(len(source))
    for ch in source:
        code = unicodedb.tolower(ord(ch))
        lowered.append(unichr(code))
    return lowered.build()
def func():
    """Build a 51-character unicode string with a builder sized at 32.

    The appended pieces total 1 + 3 + 6 + 1 + 40 characters, which is more
    than the initial size given to the builder.
    """
    builder = UnicodeBuilder(32)
    builder.append(u'a')
    builder.append(u'abc')
    builder.append(u'abcdef')
    builder.append_slice(u'abc', 1, 2)  # contributes u'b'
    builder.append_multiple_char(u'u', 40)
    built = builder.build()
    return built
def f(n):
    # Runs n iterations; each one appends an empty unicode string to a
    # fresh UnicodeBuilder and checks that the built result has length 0.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        sb.append(u"")
        s = sb.build()
        if len(s) != 0:
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def _parse_plain_flags(source):
    """Collect the characters read from ``source`` up to a ``:``.

    The ``:`` is consumed but not included in the returned unicode string.
    If the input runs out before a ``:`` is seen, scanning stops and what
    was collected so far is returned, rather than looping forever on
    empty reads.
    """
    b = UnicodeBuilder(4)
    while True:
        ch = source.get()
        # NOTE(review): assumes source.get() yields an empty value once the
        # input is exhausted, as the "not ch" check in _parse_name
        # suggests -- confirm against the source object's class.
        if not ch or ch == u":":
            break
        b.append(ch)
    return b.build()
def unicode_capitalize__Unicode(space, w_self):
    """Return a W_UnicodeObject with the first character uppercased and
    every following character lowercased (via unicodedb).

    An empty value yields the shared ``W_UnicodeObject.EMPTY``.
    """
    value = w_self._value
    length = len(value)
    if length == 0:
        return W_UnicodeObject.EMPTY

    out = UnicodeBuilder(length)
    first = unicodedb.toupper(ord(value[0]))
    out.append(unichr(first))
    for index in range(1, length):
        lowered = unicodedb.tolower(ord(value[index]))
        out.append(unichr(lowered))
    return W_UnicodeObject(out.build())
def _parse_plain_flags(source):
    """Collect the characters read from ``source`` up to a ``:``.

    The ``:`` is consumed but not included in the returned unicode string.
    If the input runs out before a ``:`` is seen, scanning stops and what
    was collected so far is returned, rather than looping forever on
    empty reads.
    """
    b = UnicodeBuilder(4)
    while True:
        ch = source.get()
        # NOTE(review): assumes source.get() yields an empty value once the
        # input is exhausted, as the "not ch" check in _parse_name
        # suggests -- confirm against the source object's class.
        if not ch or ch == u":":
            break
        b.append(ch)
    return b.build()
def ll_decode_utf8(self, llvalue):
    """Decode a low-level string as strict UTF-8.

    The value is converted with ``hlstr``, decoded into a UnicodeBuilder by
    ``self.rstr_decode_utf_8`` (final=True, surrogates not allowed, errors
    routed to ``self.ll_raise_unicode_exception_decode``) and returned
    through ``self.ll.llunicode``.
    """
    from rpython.rtyper.annlowlevel import hlstr

    source = hlstr(llvalue)
    assert source is not None
    size = len(source)
    decoded = UnicodeBuilder(size)
    self.rstr_decode_utf_8(
        source, size, 'strict',
        final=True,
        errorhandler=self.ll_raise_unicode_exception_decode,
        allow_surrogates=False,
        result=decoded)
    return self.ll.llunicode(decoded.build())
def to_upper_case(this, args):
    """Return ``this.to_string()`` with each character mapped through
    ``unicodedb.toupper``.  ``args`` is not used.
    """
    from rpython.rlib.unicodedata import unicodedb

    text = this.to_string()
    uppercased = UnicodeBuilder(len(text))
    for ch in text:
        code = unicodedb.toupper(ord(ch))
        uppercased.append(unichr(code))
    return uppercased.build()
def test_unicode_builder():
    """Exercise append, append_slice, append_multiple_char and build."""
    builder = UnicodeBuilder()
    for piece in (u'a', u'abc'):
        builder.append(piece)
    # u'abcdef'[1:2] contributes the single character u'b'
    builder.append_slice(u'abcdef', 1, 2)
    assert builder.getlength() == len('aabcb')

    builder.append_multiple_char(u'd', 4)
    built = builder.build()
    assert built == 'aabcbdddd'
    assert isinstance(built, unicode)
def from_char_code(this, args):
    """Build a unicode string with one character per argument.

    Each argument is converted with ``ToInt16()`` and the result is used
    as the character's code unit.  ``this`` is not used.
    """
    builder = UnicodeBuilder(len(args))
    for arg in args:
        # ECMAScript's String.fromCharCode applies ToUint16 to each
        # argument.  Masking to 16 bits gives that value even if ToInt16()
        # returns a negative (signed) number, which unichr() would reject;
        # values already in 0..0xFFFF are unchanged.
        # NOTE(review): ToInt16 is defined elsewhere -- confirm its range.
        code_unit = arg.ToInt16() & 0xFFFF
        builder.append(unichr(code_unit))
    return builder.build()
def test_unicode_builder():
    """Exercise append, append_slice, append_multiple_char and build."""
    builder = UnicodeBuilder()
    for piece in (u"a", u"abc"):
        builder.append(piece)
    # u"abcdef"[1:2] contributes the single character u"b"
    builder.append_slice(u"abcdef", 1, 2)
    assert builder.getlength() == len("aabcb")

    builder.append_multiple_char(u"d", 4)
    built = builder.build()
    assert built == "aabcbdddd"
    assert isinstance(built, unicode)
def from_char_code(this, args):
    """Build a unicode string with one character per argument.

    Each argument is converted with ``ToInt16()`` and the result is used
    as the character's code unit.  ``this`` is not used.
    """
    builder = UnicodeBuilder(len(args))
    for arg in args:
        # ECMAScript's String.fromCharCode applies ToUint16 to each
        # argument.  Masking to 16 bits gives that value even if ToInt16()
        # returns a negative (signed) number, which unichr() would reject;
        # values already in 0..0xFFFF are unchanged.
        # NOTE(review): ToInt16 is defined elsewhere -- confirm its range.
        code_unit = arg.ToInt16() & 0xFFFF
        builder.append(unichr(code_unit))
    return builder.build()
def f(n):
    # Runs n iterations; each one appends two slices of a ten-character
    # literal to a fresh UnicodeBuilder and checks the resulting length.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        # [1:n] gives n - 1 characters and [0:n] gives n characters, as
        # long as n does not exceed the ten characters of the literal
        sb.append_slice(u"abcdefghij", 1, n)
        sb.append_slice(u"abcdefghij", 0, n)
        s = sb.build()
        if len(s) != 2 * n - 1:
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def to_upper_case(this, args):
    """Return ``this.to_string()`` with each character mapped through
    ``unicodedb.toupper``.  ``args`` is not used.
    """
    from rpython.rlib.unicodedata import unicodedb

    text = this.to_string()
    uppercased = UnicodeBuilder(len(text))
    for ch in text:
        code = unicodedb.toupper(ord(ch))
        uppercased.append(unichr(code))
    return uppercased.build()
def readline_w(self, space, w_limit=None):
    """Read one line and return it as a wrapped unicode object.

    ``w_limit`` is converted with ``convert_size``; when the resulting
    limit is non-negative, reading stops once that many characters have
    been collected.  Reading also stops when ``_scan_line_ending`` reports
    a line ending or when ``_ensure_data`` reports no more data.
    """
    self._check_attached(space)
    self._check_closed(space)
    self._writeflush(space)

    limit = convert_size(space, w_limit)
    # characters left over from the previous chunk, not yet in the builder
    remnant = None
    builder = UnicodeBuilder()
    while True:
        # First, get some data if necessary
        has_data = self._ensure_data(space)
        if not has_data:
            # end of file
            if remnant:
                builder.append(remnant)
            break

        if remnant:
            # A remnant is only expected in untranslated mode with
            # '\r\n' as the line ending, and with a fresh chunk.
            assert not self.readtranslate and self.readnl == u'\r\n'
            assert self.decoded.pos == 0
            if remnant == u'\r' and self.decoded.text[0] == u'\n':
                # the line ending was split across two chunks
                builder.append(u'\r\n')
                self.decoded.pos = 1
                remnant = None
                break
            else:
                builder.append(remnant)
                remnant = None
                continue

        if limit >= 0:
            remaining = limit - builder.getlength()
            assert remaining >= 0
        else:
            # negative value: no limit is passed to the scan
            remaining = -1
        start = self.decoded.pos
        assert start >= 0
        found = self._scan_line_ending(remaining)
        end_scan = self.decoded.pos
        if end_scan > start:
            # copy what the scan moved past
            s = self.decoded.text[start:end_scan]
            builder.append(s)

        if found or (limit >= 0 and builder.getlength() >= limit):
            break

        # There may be some remaining chars we'll have to prepend to the
        # next chunk of data
        if not self.decoded.exhausted():
            remnant = self.decoded.get_chars(-1)
        # We have consumed the buffer
        self.decoded.reset()

    result = builder.build()
    return space.newunicode(result)
def f(n):
    # Runs n iterations; each one appends u"ab" to a fresh UnicodeBuilder
    # and checks both the length and the characters of the built string.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        sb.append(u"ab")
        s = sb.build()
        if len(s) != 2:
            raise ValueError
        if s[0] != u"a":
            raise ValueError
        if s[1] != u"b":
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def _parse_count(source):
    """Collect the run of digit characters at the current position.

    Characters are read from ``source`` while ``is_digit`` accepts them.
    The first non-digit is not consumed: ``source.pos`` is restored to the
    position before it.  Returns the digits as a unicode string, possibly
    empty.
    """
    b = UnicodeBuilder(2)
    while True:
        here = source.pos
        ch = source.get()
        # Guard the ch[0] access: an empty read must end the run instead
        # of indexing into an empty string.
        # NOTE(review): assumes source.get() yields an empty value at end
        # of input, as the "not ch" check in _parse_name suggests.
        if ch and is_digit(ord(ch[0])):
            b.append(ch)
        else:
            source.pos = here
            break
    return b.build()
def _parse_count(source):
    """Collect the run of digit characters at the current position.

    Characters are read from ``source`` while ``is_digit`` accepts them.
    The first non-digit is not consumed: ``source.pos`` is restored to the
    position before it.  Returns the digits as a unicode string, possibly
    empty.
    """
    b = UnicodeBuilder(2)
    while True:
        here = source.pos
        ch = source.get()
        # Guard the ch[0] access: an empty read must end the run instead
        # of indexing into an empty string.
        # NOTE(review): assumes source.get() yields an empty value at end
        # of input, as the "not ch" check in _parse_name suggests.
        if ch and is_digit(ord(ch[0])):
            b.append(ch)
        else:
            source.pos = here
            break
    return b.build()
def string(args):
    """Build a W_String from a list of W_Character values.

    An empty list gives the empty ASCII string.  Raises SchemeException if
    any element is not a W_Character.
    """
    count = len(args)
    if count == 0:
        return W_String.fromascii("")
    assert count > 0
    # XXX could do one less copy in the ascii case
    chars = UnicodeBuilder()
    for w_char in args:
        if not isinstance(w_char, values.W_Character):
            raise SchemeException("string: expected a character")
        chars.append(w_char.value)
    return W_String.fromunicode(chars.build())
def configured_stringify(obj, config):
    """Stringify ``obj``, optionally driven by ``config``.

    Without a config, ``quick_stringify`` writes straight into a
    UnicodeBuilder.  With one, a Scanner is set up from the "indent"
    (default 2) and "sort_keys" (default false) entries and ``stringify``
    is used; the text is taken from the scanner's printer result.
    """
    if config is not None:
        scanner = Scanner()
        indent = get_config(config, u"indent", space.Integer(2))
        scanner.indent = space.to_int(indent)
        sort_keys = get_config(config, u"sort_keys", space.false)
        scanner.sort_keys = space.is_true(sort_keys)
        stringify(scanner, obj)
        scanner.finish()
        return scanner.printer.result.build()

    out = UnicodeBuilder()
    quick_stringify(out, obj)
    return out.build()
def f(n):
    # Runs n iterations; each one appends the slice [1:3] of u"fOo!" to a
    # fresh UnicodeBuilder and checks that the built string is u"Oo".
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        sb.append_slice(u"fOo!", 1, 3)
        s = sb.build()
        if len(s) != 2:
            raise ValueError
        if s[0] != u"O":
            raise ValueError
        if s[1] != u"o":
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def string(args):
    """Build a W_String from a list of W_Character values.

    An empty list gives the empty ASCII string.  Raises SchemeException if
    any element is not a W_Character.
    """
    count = len(args)
    if count == 0:
        return W_String.fromascii("")
    assert count > 0
    # XXX could do one less copy in the ascii case
    chars = UnicodeBuilder()
    for w_char in args:
        if not isinstance(w_char, values.W_Character):
            raise SchemeException("string: expected a character")
        chars.append(w_char.value)
    return W_String.fromunicode(chars.build())
def f(n):
    # Runs n iterations; each one appends s1 and two newlines to a
    # UnicodeBuilder created with size 32 and checks the total length.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.

    # s1 is computed once from the initial n: sixteen copies of str(n).
    # The length check below only passes when that is 32 characters, i.e.
    # when str(n) is two characters long.
    s1 = unicode(str(n) * 16)
    while n > 0:
        jitdriver.jit_merge_point(n=n, s1=s1)
        sb = UnicodeBuilder(32)
        sb.append(s1)
        sb.append(u"\n\n")
        s = sb.build()
        if len(s) != 34:
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def configured_stringify(obj, config):
    """Stringify ``obj``, optionally driven by ``config``.

    Without a config, ``quick_stringify`` writes straight into a
    UnicodeBuilder.  With one, a Scanner is set up from the "indent"
    (default 2) and "sort_keys" (default false) entries and ``stringify``
    is used; the text is taken from the scanner's printer result.
    """
    if config is not None:
        scanner = Scanner()
        indent = get_config(config, u"indent", space.Integer(2))
        scanner.indent = space.to_int(indent)
        sort_keys = get_config(config, u"sort_keys", space.false)
        scanner.sort_keys = space.is_true(sort_keys)
        stringify(scanner, obj)
        scanner.finish()
        return scanner.printer.result.build()

    out = UnicodeBuilder()
    quick_stringify(out, obj)
    return out.build()
def list_to_string(w_list):
    """Build a W_String from a proper list of W_Character values.

    Raises SchemeException when ``w_list`` is not a proper list or when an
    element is not a W_Character.  A proper list that is not a W_Cons
    gives the empty ASCII string.
    """
    if not w_list.is_proper_list():
        raise SchemeException("list->string: expected proper list")
    if not isinstance(w_list, values.W_Cons):
        return W_String.fromascii("")

    chars = UnicodeBuilder()
    w_rest = w_list
    while isinstance(w_rest, values.W_Cons):
        w_item = w_rest.car()
        w_rest = w_rest.cdr()
        if not isinstance(w_item, values.W_Character):
            raise SchemeException("list->string: expected list of characters")
        chars.append(w_item.value)
    return W_String.fromunicode(chars.build())
def unicode_swapcase__Unicode(space, w_self):
    """Return a W_UnicodeObject with the case of each character swapped.

    Lowercase characters (per unicodedb) are uppercased, uppercase ones
    are lowercased, and everything else is copied unchanged.
    """
    value = w_self._value
    out = UnicodeBuilder(len(value))
    for ch in value:
        code = ord(ch)
        if unicodedb.islower(code):
            out.append(unichr(unicodedb.toupper(code)))
        elif unicodedb.isupper(code):
            out.append(unichr(unicodedb.tolower(code)))
        else:
            out.append(ch)
    return W_UnicodeObject(out.build())
def list_to_string(w_list):
    """Build a W_String from a proper list of W_Character values.

    Raises SchemeException when ``w_list`` is not a proper list or when an
    element is not a W_Character.  A proper list that is not a W_Cons
    gives the empty ASCII string.
    """
    if not w_list.is_proper_list():
        raise SchemeException("list->string: expected proper list")
    if not isinstance(w_list, values.W_Cons):
        return W_String.fromascii("")

    chars = UnicodeBuilder()
    w_rest = w_list
    while isinstance(w_rest, values.W_Cons):
        w_item = w_rest.car()
        w_rest = w_rest.cdr()
        if not isinstance(w_item, values.W_Character):
            raise SchemeException("list->string: expected list of characters")
        chars.append(w_item.value)
    return W_String.fromunicode(chars.build())
def f(n):
    # Runs n iterations; each one appends u"x" 35 times in a single
    # append_multiple_char call and checks the length and every character
    # of the built string.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        sb.append_multiple_char(u"x", 35)
        s = sb.build()
        if len(s) != 35:
            raise ValueError
        for c in s:
            if c != u"x":
                raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def _parse_name(source):
    """Collect characters from ``source`` up to a ``)`` or ``>``.

    The terminating character is not consumed: ``source.pos`` is restored
    to the position before it.  A falsy read also ends the scan.  Returns
    the collected characters as a unicode string.
    """
    name = UnicodeBuilder(5)
    while True:
        mark = source.pos
        ch = source.get()
        if ch in u")>":
            source.pos = mark  # leave the terminator for the caller
            return name.build()
        if not ch:
            return name.build()
        name.append(ch)
def _parse_name(source):
    """Collect characters from ``source`` up to a ``)`` or ``>``.

    The terminating character is not consumed: ``source.pos`` is restored
    to the position before it.  A falsy read also ends the scan.  Returns
    the collected characters as a unicode string.
    """
    name = UnicodeBuilder(5)
    while True:
        mark = source.pos
        ch = source.get()
        if ch in u")>":
            source.pos = mark  # leave the terminator for the caller
            return name.build()
        if not ch:
            return name.build()
        name.append(ch)
def f(n):
    # Runs n iterations; each one appends u"x" 5 times in a single
    # append_multiple_char call and checks the length and each of the five
    # characters individually.
    # NOTE(review): presumably a JIT test kernel whose exact statements are
    # inspected elsewhere -- confirm before restructuring.
    while n > 0:
        jitdriver.jit_merge_point(n=n)
        sb = UnicodeBuilder()
        sb.append_multiple_char(u"x", 5)
        s = sb.build()
        if len(s) != 5:
            raise ValueError
        if s[0] != u"x":
            raise ValueError
        if s[1] != u"x":
            raise ValueError
        if s[2] != u"x":
            raise ValueError
        if s[3] != u"x":
            raise ValueError
        if s[4] != u"x":
            raise ValueError
        n -= 1
    # 0 after a countdown from a positive integer; otherwise n unchanged
    return n
def entry_point(argv): b = UnicodeBuilder(32) for x in to_do: if x < 1500: print "``%s''" % str(b.build()) if x < 1000: b = UnicodeBuilder(32) elif x < 20000: b.append(unichr(32 + (x & 63))) elif x < 30000: b.append_multiple_char(unichr(32 + (x & 63)), x % 93) else: b.append(unicode(str(x))) return 0