Exemplo n.º 1
0
def test_encode_utf8():
    """Check encode_utf8 on ASCII, BMP, lone-surrogate and paired input."""
    space = FakeSpace()
    # (input, expected encoded result) for inputs that are plain literals
    cases = [
        (u"abc", "abc"),
        (u"\u1234", "\xe1\x88\xb4"),
        (u"\ud800", "\xed\xa0\x80"),
        (u"\udc00", "\xed\xb0\x80"),
    ]
    for text, expected in cases:
        assert encode_utf8(space, text) == expected
    # The surrogate pair is assembled at run time on purpose: CPython's
    # optimizer and .pyc file storage would otherwise collapse the two
    # surrogates into a single character.
    low_surrogate = u"\udc00"
    pair = u"\ud800" + low_surrogate
    assert encode_utf8(space, pair) == "\xf0\x90\x80\x80"
Exemplo n.º 2
0
def unicode_to_decimal_w(space, w_unistr, allow_surrogates=False):
    """Return the UTF-8 encoding of w_unistr after decimal conversion.

    The unwrapped value of ``w_unistr`` is passed through
    ``_rpy_unicode_to_decimal_w`` and the result is encoded with
    ``unicodehelper.encode_utf8``; ``allow_surrogates`` is forwarded to the
    encoder unchanged.

    Raises an app-level TypeError when ``w_unistr`` is not a
    W_UnicodeObject.
    """
    if isinstance(w_unistr, W_UnicodeObject):
        converted = _rpy_unicode_to_decimal_w(space, w_unistr._value)
        return unicodehelper.encode_utf8(
            space, converted, allow_surrogates=allow_surrogates)
    raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
Exemplo n.º 3
0
def unicode_to_decimal_w(space, w_unistr):
    """Return w_unistr as UTF-8 with non-ASCII spaces and digits simplified.

    Code points up to 127 are copied unchanged.  Above that, a character
    that ``unicodedb.isspace`` accepts becomes ' ', and a character for
    which ``unicodedb.decimal`` yields a value becomes the matching ASCII
    digit; every other character is kept as it is.

    Raises an app-level TypeError when ``w_unistr`` is not a
    W_UnicodeObject.
    """
    if not isinstance(w_unistr, W_UnicodeObject):
        raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
    source = w_unistr._value
    length = len(source)
    converted = [u'\0'] * length
    for pos in xrange(length):
        code = ord(source[pos])
        if code <= 127:
            # ASCII is never rewritten
            converted[pos] = unichr(code)
            continue
        if unicodedb.isspace(code):
            converted[pos] = ' '
            continue
        try:
            code = ord(u'0') + unicodedb.decimal(code)
        except KeyError:
            # no decimal value for this character: keep it unchanged
            pass
        converted[pos] = unichr(code)
    return unicodehelper.encode_utf8(space, u''.join(converted))
Exemplo n.º 4
0
def unicode_to_decimal_w(space, w_unistr):
    """Encode w_unistr as UTF-8, mapping non-ASCII spaces/digits to ASCII.

    For each code point above 127: if ``unicodedb.isspace`` is true the
    output gets ' '; otherwise, if ``unicodedb.decimal`` knows the character
    (no KeyError), the output gets the corresponding digit '0'..'9'.
    All remaining characters are passed through untouched.

    Raises an app-level TypeError when ``w_unistr`` is not a
    W_UnicodeObject.
    """
    if not isinstance(w_unistr, W_UnicodeObject):
        raise oefmt(space.w_TypeError, "expected unicode, got '%T'", w_unistr)
    text = w_unistr._value
    out = [u'\0'] * len(text)
    for index in xrange(len(text)):
        codepoint = ord(text[index])
        non_ascii = codepoint > 127
        if non_ascii and unicodedb.isspace(codepoint):
            out[index] = ' '
        else:
            if non_ascii:
                try:
                    codepoint = ord(u'0') + unicodedb.decimal(codepoint)
                except KeyError:
                    # not a decimal digit: leave the code point alone
                    pass
            out[index] = unichr(codepoint)
    return unicodehelper.encode_utf8(space, u''.join(out))
Exemplo n.º 5
0
    def Parse(self, space, w_data, isfinal=False):
        """Parse(data[, isfinal])
Parse XML data.  `isfinal' should be true at end of input."""
        if space.isinstance_w(w_data, space.w_unicode):
            u = w_data.unicode_w(space)
            data = encode_utf8(space, w_data.unicode_w(space))
            # Explicitly set UTF-8 encoding. Return code ignored.
            XML_SetEncoding(self.itself, "utf-8")
        else:
            data = space.bufferstr_w(w_data)
        res = XML_Parse(self.itself, data, len(data), isfinal)
        if self._exc_info:
            e = self._exc_info
            self._exc_info = None
            raise e
        elif res == 0:
            exc = self.set_error(space, XML_GetErrorCode(self.itself))
            raise exc
        self.flush_character_buffer(space)
        return space.wrap(res)
Exemplo n.º 6
0
 def decode_escape_sequence_unicode(self, i, builder):
     """Decode a four-hex-digit unicode escape and append it as UTF-8.

     ``i`` is the index of the first hex digit.  The decoded character is
     encoded with ``unicodehelper.encode_utf8`` and appended to ``builder``.
     Returns the index just past everything that was consumed.  Malformed
     hex digits are reported through ``self._raise``.
     """
     # at this point we are just after the 'u' of the \u1234 sequence.
     start = i
     i += 4
     hexdigits = self.getslice(start, i)
     try:
         val = int(hexdigits, 16)
         if (val & 0xfc00) == 0xd800:
             # surrogate pair: val is in 0xd800..0xdbff, so let
             # decode_surrogate_pair combine it with what follows
             val = self.decode_surrogate_pair(i, val)
             # presumably skips the 6 characters of the second escape
             i += 6
     except ValueError:
         # The backslash is escaped explicitly so the literal does not
         # depend on byte-string semantics for an unrecognised "\u".
         self._raise("Invalid \\uXXXX escape (char %d)", i - 1)
         return # help the annotator to know that we'll never go beyond
                # this point
     #
     uchr = runicode.code_to_unichr(val)     # may be a surrogate pair again
     utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
     builder.append(utf8_ch)
     return i
Exemplo n.º 7
0
 def decode_escape_sequence_unicode(self, i, builder):
     """Decode a four-hex-digit unicode escape and append it as UTF-8.

     ``i`` points at the first of the four hex digits.  The resulting
     character is encoded via ``unicodehelper.encode_utf8`` and appended to
     ``builder``; the return value is the index after the consumed input.
     Invalid hex digits are reported through ``self._raise``.
     """
     # at this point we are just after the 'u' of the \u1234 sequence.
     start = i
     i += 4
     hexdigits = self.getslice(start, i)
     try:
         val = int(hexdigits, 16)
         if (val & 0xfc00) == 0xd800:
             # surrogate pair: the mask matches 0xd800..0xdbff only, and
             # decode_surrogate_pair merges it with the following input
             val = self.decode_surrogate_pair(i, val)
             # presumably steps over the 6 characters of the second escape
             i += 6
     except ValueError:
         # Explicitly escaped backslash: the message text is the same, but
         # the literal no longer relies on "\u" being left alone by the
         # byte-string parser.
         self._raise("Invalid \\uXXXX escape (char %d)", i - 1)
         return  # help the annotator to know that we'll never go beyond
         # this point
     #
     uchr = runicode.code_to_unichr(val)  # may be a surrogate pair again
     utf8_ch = unicodehelper.encode_utf8(self.space, uchr)
     builder.append(utf8_ch)
     return i
Exemplo n.º 8
0
    def Parse(self, space, w_data, isfinal=False):
        """Parse(data[, isfinal])
Parse XML data.  `isfinal' should be true at end of input."""
        if space.isinstance_w(w_data, space.w_unicode):
            data = encode_utf8(space, w_data.unicode_w(space))
            # Explicitly set UTF-8 encoding. Return code ignored.
            XML_SetEncoding(self.itself, "utf-8")
        else:
            data = space.charbuf_w(w_data)
        isfinal = bool(isfinal)
        res = XML_Parse(self.itself, data, len(data), isfinal)
        if self._exc_info:
            e = self._exc_info
            self._exc_info = None
            raise e
        elif res == 0:
            exc = self.set_error(space, XML_GetErrorCode(self.itself))
            raise exc
        self.flush_character_buffer(space)
        return space.newint(res)
Exemplo n.º 9
0
 def decode_escape_sequence_unicode(self, i, builder):
     """Decode a four-hex-digit unicode escape and append it as UTF-8.

     ``i`` is the index of the first hex digit.  On builds where
     ``sys.maxunicode`` exceeds 65535, a surrogate value that is directly
     followed by another backslash-u escape is combined with it through
     ``decode_surrogate_pair``.  Surrogates are allowed when encoding.
     Returns the index after the consumed input; raises DecoderError when
     the hex digits are invalid.
     """
     # at this point we are just after the 'u' of the \u1234 sequence.
     first_digit = i
     i += 4
     digits = self.getslice(first_digit, i)
     try:
         val = int(digits, 16)
         if (sys.maxunicode > 65535 and 0xd800 <= val <= 0xdfff
                 and self.ll_chars[i] == '\\'
                 and self.ll_chars[i + 1] == 'u'):
             # surrogate pair
             val = self.decode_surrogate_pair(i, val)
             i += 6
     except ValueError:
         raise DecoderError("Invalid \\uXXXX escape", i - 1)
     #
     # code_to_unichr may produce a surrogate pair again
     decoded = runicode.code_to_unichr(val)
     encoded = unicodehelper.encode_utf8(
         self.space, decoded, allow_surrogates=True)
     builder.append(encoded)
     return i
Exemplo n.º 10
0
def marshal_w__Unicode(space, w_unicode, m):
    """Write w_unicode to marshaller m as a TYPE_UNICODE atom in UTF-8."""
    unistr = space.unicode_w(w_unicode)
    utf8 = unicodehelper.encode_utf8(space, unistr)
    m.atom_str(TYPE_UNICODE, utf8)
Exemplo n.º 11
0
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to marshaller m as a TYPE_UNICODE atom in UTF-8.

    Objects that are not W_UnicodeObject instances are reported through
    ``raise_exception`` as unmarshallable.
    """
    is_unicode = isinstance(w_unicode, W_UnicodeObject)
    if not is_unicode:
        raise_exception(space, "unmarshallable object")
    unistr = space.unicode_w(w_unicode)
    utf8 = unicodehelper.encode_utf8(space, unistr)
    m.atom_str(TYPE_UNICODE, utf8)
Exemplo n.º 12
0
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to marshaller m as a TYPE_UNICODE atom in UTF-8.

    Surrogates are allowed when encoding.
    """
    unistr = space.unicode_w(w_unicode)
    utf8 = unicodehelper.encode_utf8(space, unistr, allow_surrogates=True)
    m.atom_str(TYPE_UNICODE, utf8)
Exemplo n.º 13
0
def marshal_unicode(space, w_unicode, m):
    """Write w_unicode to marshaller m as a TYPE_UNICODE atom in UTF-8."""
    unistr = space.unicode_w(w_unicode)
    encoded = unicodehelper.encode_utf8(space, unistr)
    m.atom_str(TYPE_UNICODE, encoded)
Exemplo n.º 14
0
def _gettmarg(space, w_tup, allowNone=True):
    """Convert a wrapped time tuple (or None) into a C ``tm`` buffer.

    When ``w_tup`` is None the current local time is used (or a TypeError
    is raised if ``allowNone`` is false) and the buffer returned by
    ``c_localtime`` is handed back with its year field increased by 1900.

    Otherwise ``w_tup`` must unpack to at least 9 integers; they are stored
    into the module-level ``glob_buf``, which is returned.  Zero values for
    month, day of month and day of year are replaced by 1.  The stored
    month and day of year are made zero-based, the weekday is shifted by
    one modulo 7, and the year is stored as given.  With HAS_TM_ZONE,
    optional items 9 and 10 supply the zone name and the GMT offset.

    Raises TypeError for a too-short sequence and ValueError when the
    weekday is below -1 or when ``c_localtime`` fails.
    """
    if space.is_none(w_tup):
        if not allowNone:
            raise oefmt(space.w_TypeError, "tuple expected")
        # default to the current local time
        tt = rffi.cast(rffi.TIME_T, pytime.time())
        t_ref = lltype.malloc(rffi.TIME_TP.TO, 1, flavor='raw')
        t_ref[0] = tt
        pbuf = c_localtime(t_ref)
        lltype.free(t_ref, flavor='raw')
        # Check for failure before touching the result: reading or writing
        # c_tm_year through a NULL pointer would crash instead of raising.
        if not pbuf:
            raise OperationError(space.w_ValueError,
                                 space.newunicode(_get_error_msg()))
        rffi.setintfield(pbuf, "c_tm_year",
                         rffi.getintfield(pbuf, "c_tm_year") + 1900)
        return pbuf

    tup_w = space.fixedview(w_tup)
    if len(tup_w) < 9:
        raise oefmt(space.w_TypeError,
                    "argument must be sequence of at least length 9, not %d",
                    len(tup_w))

    y = space.c_int_w(tup_w[0])
    # A zero month, day of month or day of year is replaced by 1.
    tm_mon = space.c_int_w(tup_w[1])
    if tm_mon == 0:
        tm_mon = 1
    tm_mday = space.c_int_w(tup_w[2])
    if tm_mday == 0:
        tm_mday = 1
    tm_yday = space.c_int_w(tup_w[7])
    if tm_yday == 0:
        tm_yday = 1
    rffi.setintfield(glob_buf, 'c_tm_mon', tm_mon)
    rffi.setintfield(glob_buf, 'c_tm_mday', tm_mday)
    rffi.setintfield(glob_buf, 'c_tm_hour', space.c_int_w(tup_w[3]))
    rffi.setintfield(glob_buf, 'c_tm_min', space.c_int_w(tup_w[4]))
    rffi.setintfield(glob_buf, 'c_tm_sec', space.c_int_w(tup_w[5]))
    rffi.setintfield(glob_buf, 'c_tm_wday', space.c_int_w(tup_w[6]))
    rffi.setintfield(glob_buf, 'c_tm_yday', tm_yday)
    rffi.setintfield(glob_buf, 'c_tm_isdst', space.c_int_w(tup_w[8]))
    #
    if HAS_TM_ZONE:
        old_tm_zone = glob_buf.c_tm_zone
        glob_buf.c_tm_zone = lltype.nullptr(rffi.CCHARP.TO)
        rffi.setintfield(glob_buf, 'c_tm_gmtoff', 0)
        if len(tup_w) >= 10:
            # NOTE this is not cleanly solved!
            # it saves the string that is later deleted when this
            # function is called again. A refactoring of this module
            # could remove this
            # NOTE(review): the previous string is only freed on this
            # path; with fewer than 10 items the old pointer is dropped
            # without being freed -- confirm whether that can leak.
            tm_zone = encode_utf8(space,
                                  space.unicode_w(tup_w[9]),
                                  allow_surrogates=True)
            malloced_str = rffi.str2charp(tm_zone, track_allocation=False)
            if old_tm_zone != lltype.nullptr(rffi.CCHARP.TO):
                rffi.free_charp(old_tm_zone, track_allocation=False)
            glob_buf.c_tm_zone = malloced_str
        if len(tup_w) >= 11:
            rffi.setintfield(glob_buf, 'c_tm_gmtoff', space.c_int_w(tup_w[10]))

    # tm_wday does not need checking of its upper-bound since taking "%
    #  7" in _gettmarg() automatically restricts the range.
    if rffi.getintfield(glob_buf, 'c_tm_wday') < -1:
        raise oefmt(space.w_ValueError, "day of week out of range")

    # Final adjustments: year as given, month and day of year zero-based,
    # weekday shifted by one and wrapped into 0..6.
    rffi.setintfield(glob_buf, 'c_tm_year', y)
    rffi.setintfield(glob_buf, 'c_tm_mon',
                     rffi.getintfield(glob_buf, 'c_tm_mon') - 1)
    rffi.setintfield(glob_buf, 'c_tm_wday',
                     (rffi.getintfield(glob_buf, 'c_tm_wday') + 1) % 7)
    rffi.setintfield(glob_buf, 'c_tm_yday',
                     rffi.getintfield(glob_buf, 'c_tm_yday') - 1)

    return glob_buf