class NamePrep: """ Implements preparation of internationalized domain names. This class implements preparing internationalized domain names using the rules defined in RFC 3491, section 4 (Conversion operations). We do not perform step 4 since we deal with unicode representations of domain names and do not convert from or to ASCII representations using punycode encoding. When such a conversion is needed, the C{idna} standard library provides the C{ToUnicode()} and C{ToASCII()} functions. Note that C{idna} itself assumes UseSTD3ASCIIRules to be false. The following steps are performed by C{prepare()}: - Split the domain name in labels at the dots (RFC 3490, 3.1) - Apply nameprep proper on each label (RFC 3491) - Enforce the restrictions on ASCII characters in host names by assuming STD3ASCIIRules to be true. (STD 3) - Rejoin the labels using the label separator U+002E (full stop). """ # Prohibited characters. prohibiteds = [ unichr(n) for n in chain(range(0x00, 0x2c + 1), range(0x2e, 0x2f + 1), range(0x3a, 0x40 + 1), range(0x5b, 0x60 + 1), range(0x7b, 0x7f + 1)) ] def prepare(self, string): result = [] labels = idna.dots.split(string) if labels and len(labels[-1]) == 0: trailing_dot = u'.' del labels[-1] else: trailing_dot = u'' for label in labels: result.append(self.nameprep(label)) return u".".join(result) + trailing_dot def check_prohibiteds(self, string): for c in string: if c in self.prohibiteds: raise UnicodeError("Invalid character %s" % repr(c)) def nameprep(self, label): label = idna.nameprep(label) self.check_prohibiteds(label) if label[0] == u'-': raise UnicodeError("Invalid leading hyphen-minus") if label[-1] == u'-': raise UnicodeError("Invalid trailing hyphen-minus") return label
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') HAS_UTF8 = re.compile(r'[\x80-\xff]') ESCAPE_DCT = { '\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t', } for i in range(0x20): #ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i)) ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i,)) for i in [0x2028, 0x2029]: ESCAPE_DCT.setdefault(unichr(i), '\\u%04x' % (i,)) FLOAT_REPR = repr def encode_basestring(s, _PY3=PY3, _q=u('"')): """Return a JSON representation of a Python string """ if _PY3: if isinstance(s, binary_type): s = s.decode('utf-8') else: if isinstance(s, str) and HAS_UTF8.search(s) is not None: s = s.decode('utf-8') def replace(match): return ESCAPE_DCT[match.group(0)]
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match, _join=u('').join, _PY3=PY3, _maxunicode=sys.maxunicode): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError( "Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: if not _PY3 and not isinstance(content, text_type): content = text_type(content, encoding) _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: msg = "Invalid control character %r at" raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError( "Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\X escape sequence %r" raise JSONDecodeError(msg, s, end) end += 1 else: # Unicode escape sequence msg = "Invalid \\uXXXX escape sequence" esc = s[end + 1:end + 5] escX = esc[1:2] if len(esc) != 4 or escX == 'x' or escX == 'X': raise JSONDecodeError(msg, s, end - 1) try: uni = int(esc, 16) except ValueError: raise JSONDecodeError(msg, s, end - 1) end += 5 # Check for surrogate pair on UCS-4 systems # Note that this will join high/low surrogate pairs # but will also pass unpaired surrogates through if (_maxunicode > 65535 and uni & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u'): esc2 = s[end + 2:end + 6] escX = esc2[1:2] if len(esc2) == 4 and not (escX == 'x' or escX == 'X'): try: uni2 = int(esc2, 16) except ValueError: raise JSONDecodeError(msg, s, end) if uni2 & 0xfc00 == 0xdc00: uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) end += 6 char = unichr(uni) # Append the unescaped character _append(char) return _join(chunks), end
def py_scanstring(s, end, encoding=None, strict=True, _b=BACKSLASH, _m=STRINGCHUNK.match, _join=u('').join, _PY3=PY3, _maxunicode=sys.maxunicode): """Scan the string s for a JSON string. End is the index of the character in s after the quote that started the JSON string. Unescapes all valid JSON string escape sequences and raises ValueError on attempt to decode an invalid string. If strict is False then literal control characters are allowed in the string. Returns a tuple of the decoded string and the index of the character in s after the end quote.""" if encoding is None: encoding = DEFAULT_ENCODING chunks = [] _append = chunks.append begin = end - 1 while 1: chunk = _m(s, end) if chunk is None: raise JSONDecodeError("Unterminated string starting at", s, begin) end = chunk.end() content, terminator = chunk.groups() # Content is contains zero or more unescaped string characters if content: if not _PY3 and not isinstance(content, text_type): content = text_type(content, encoding) _append(content) # Terminator is the end of string, a literal control character, # or a backslash denoting that an escape sequence follows if terminator == '"': break elif terminator != '\\': if strict: msg = "Invalid control character %r at" raise JSONDecodeError(msg, s, end) else: _append(terminator) continue try: esc = s[end] except IndexError: raise JSONDecodeError("Unterminated string starting at", s, begin) # If not a unicode escape sequence, must be in the lookup table if esc != 'u': try: char = _b[esc] except KeyError: msg = "Invalid \\X escape sequence %r" raise JSONDecodeError(msg, s, end) end += 1 else: # Unicode escape sequence msg = "Invalid \\uXXXX escape sequence" esc = s[end + 1:end + 5] escX = esc[1:2] if len(esc) != 4 or escX == 'x' or escX == 'X': raise JSONDecodeError(msg, s, end - 1) try: uni = int(esc, 16) except ValueError: raise JSONDecodeError(msg, s, end - 1) end += 5 # Check for surrogate pair on UCS-4 systems # Note that this will join high/low surrogate pairs # but will also pass unpaired surrogates through if (_maxunicode > 65535 and uni & 0xfc00 == 0xd800 and s[end:end + 2] == '\\u'): esc2 = s[end + 2:end + 6] escX = esc2[1:2] if len(esc2) == 4 and not (escX == 'x' or escX == 'X'): try: uni2 = int(esc2, 16) except ValueError: raise JSONDecodeError(msg, s, end) if uni2 & 0xfc00 == 0xdc00: uni = 0x10000 + (((uni - 0xd800) << 10) | (uni2 - 0xdc00)) end += 6 char = unichr(uni) # Append the unescaped character _append(char) return _join(chunks), end
ESCAPE_ASCII = re.compile(r'([\\"]|[^\ -~])') HAS_UTF8 = re.compile(r'[\x80-\xff]') ESCAPE_DCT = { '\\': '\\\\', '"': '\\"', '\b': '\\b', '\f': '\\f', '\n': '\\n', '\r': '\\r', '\t': '\\t', } for i in range(0x20): #ESCAPE_DCT.setdefault(chr(i), '\\u{0:04x}'.format(i)) ESCAPE_DCT.setdefault(chr(i), '\\u%04x' % (i, )) for i in [0x2028, 0x2029]: ESCAPE_DCT.setdefault(unichr(i), '\\u%04x' % (i, )) FLOAT_REPR = repr def encode_basestring(s, _PY3=PY3, _q=u('"')): """Return a JSON representation of a Python string """ if _PY3: if isinstance(s, binary_type): s = s.decode('utf-8') else: if isinstance(s, str) and HAS_UTF8.search(s) is not None: s = s.decode('utf-8')