def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Scan the next literal chunk of an f-string and return it decoded.

    Scanning starts at 'fstr.current_index' inside 'fstr.unparsed' and
    stops either at the end of the text or at a brace that terminates the
    literal; 'fstr.current_index' is then set to that stopping position.
    Unlike CPython, doubled braces are collapsed right here.

    'rec' is the nesting level: doubled braces and the "single '}'" error
    are only considered when it is 0.  'atom_node' is only handed to
    'astbuilder.error' when that error is reported.
    """
    text = fstr.unparsed
    total = len(text)
    start = fstr.current_index
    inside_name = False     # True while within the braces of a \N{...}

    pos = start
    out = StringBuilder()
    while pos < total:
        char = text[pos]
        if (char == '{' and not inside_name and pos - start >= 2
                and text[pos - 2] == '\\' and text[pos - 1] == 'N'):
            # Opening brace of a unicode name escape: part of the literal.
            inside_name = True
        elif char == '}' and inside_name:
            # Closing brace of the unicode name escape.
            inside_name = False
        elif char == '{' or char == '}':
            if rec != 0:
                # Nested level (e.g. a format spec): braces are never
                # doubled there, otherwise f'{0:{3}}' would break on its
                # two closing braces.  Any brace ends the literal.
                break
            if pos + 1 < total and text[pos + 1] == char:
                # Doubled brace: step over the first one so that only a
                # single brace is emitted below.
                pos += 1
            elif char == '}':
                # A lone '{' opens an expression, but a lone '}' at the
                # top level is an error.
                astbuilder.error("f-string: single '}' is not allowed",
                                 atom_node)
            else:
                # A lone '{': an expression starts here.
                break
        out.append(char)
        pos += 1

    fstr.current_index = pos
    literal = out.build()
    if fstr.raw_mode or '\\' not in literal:
        return literal.decode('utf-8')
    space = astbuilder.space
    literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                              len(literal))
    return unicodehelper.decode_unicode_escape(space, literal)
def parsestr(space, encoding, s, unicode_literal=False):
    """Convert the source text of a string literal into a wrapped value.

    's' is the whole literal, prefix letters and quotes included.
    'encoding' describes the bytes of 's': None means plain ascii,
    "iso-8859-1" means the source is latin-1, and anything else means the
    source is utf-8 encoded.  A byte string result is encoded back into
    the original encoding; this is inefficient, but CPython does much the
    same.
    """
    # 'lo' is the index of the next unread character, 'hi' the index just
    # past the last character of the literal's contents.
    lo = 0
    mark = s[lo]
    raw = False

    # Optional prefix letters: b or u, then r.
    if mark == 'b' or mark == 'B':
        unicode_literal = False
        lo += 1
        mark = s[lo]
    elif mark == 'u' or mark == 'U':
        unicode_literal = True
        lo += 1
        mark = s[lo]
    if mark == 'r' or mark == 'R':
        raw = True
        lo += 1
        mark = s[lo]

    if not (mark == "'" or mark == '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    lo += 1
    hi = len(s) - 1
    if s[hi] != mark:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    if hi - lo >= 4 and s[lo] == mark and s[lo + 1] == mark:
        # Triple-quoted literal: drop two more quotes on each side.
        lo += 2
        if not (s[hi - 1] == mark and s[hi - 2] == mark):
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        hi -= 2

    if unicode_literal:
        # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # The unicode_escape decoders take latin-1 bytes, so the
            # contents can be used unchanged.
            assert 0 <= lo <= hi
            text = s[lo:hi]
        else:
            # The contents are utf-8 but the decoders take latin-1, so
            # every multibyte sequence is rewritten as \uXXXX escapes.
            pieces = []
            while lo < hi:
                if s[lo] == '\\':
                    pieces.append('\\')
                    lo += 1
                    if ord(s[lo]) & 0x80:
                        # A multibyte sequence follows and will come out
                        # as \uXXXX; spell the backslash just written as
                        # \u005c so the two cannot be confused.
                        pieces.append("u005c")
                if ord(s[lo]) & 0x80:
                    # XXX inefficient
                    utf16, lo = decode_utf8(space, s, lo, hi, "utf-16-be")
                    count = len(utf16)
                    assert count % 2 == 0
                    k = 0
                    while k < count:
                        pieces.append('\\u')
                        pieces.append(hexbyte(ord(utf16[k])))
                        pieces.append(hexbyte(ord(utf16[k + 1])))
                        k += 2
                else:
                    pieces.append(s[lo])
                    lo += 1
            text = ''.join(pieces)
        if raw:
            result = unicodehelper.decode_raw_unicode_escape(space, text)
        else:
            result = unicodehelper.decode_unicode_escape(space, text)
        return space.wrap(result)

    # Byte string from here on.
    need_encoding = not (encoding is None or encoding == "utf-8" or
                         encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= lo <= hi
    body = s[lo:hi]
    if raw or '\\' not in s[lo:]:
        # No escapes to process.
        if not need_encoding:
            return space.wrap(body)
        w_unicode = space.wrap(unicodehelper.decode_utf8(space, body))
        return unicodehelper.encode(space, w_unicode, encoding)
    recode = encoding if need_encoding else None
    return space.wrap(PyString_DecodeEscape(space, body, recode))
def parsestr(space, encoding, s, unicode_literal=False):
    """Convert the source text of a string literal into a wrapped value.

    's' is the whole literal, prefix letters and quotes included.
    'encoding' describes the bytes of 's': None means plain ascii,
    "iso-8859-1" means the source is latin-1, and anything else means the
    source is utf-8 encoded.  A byte string result is encoded back into
    the original encoding; this is inefficient, but CPython does much the
    same.
    """
    # 'first' is the index of the next unread character, 'last' the index
    # just past the last character of the literal's contents.
    first = 0
    delim = s[first]
    is_raw = False

    # Optional prefix letters: b or u, then r.
    if delim == 'b' or delim == 'B':
        unicode_literal = False
        first += 1
        delim = s[first]
    elif delim == 'u' or delim == 'U':
        unicode_literal = True
        first += 1
        delim = s[first]
    if delim == 'r' or delim == 'R':
        is_raw = True
        first += 1
        delim = s[first]

    if not (delim == "'" or delim == '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    first += 1
    last = len(s) - 1
    if s[last] != delim:
        raise_app_valueerror(space, 'Internal error: parser passed unmatched '
                                    'quotes in literal')
    if last - first >= 4 and s[first] == delim and s[first + 1] == delim:
        # Triple-quoted literal: drop two more quotes on each side.
        first += 2
        if not (s[last - 1] == delim and s[last - 2] == delim):
            raise_app_valueerror(space, 'Internal error: parser passed '
                                        'unmatched triple quotes in literal')
        last -= 2

    if unicode_literal:
        # XXX Py_UnicodeFlag is ignored for now
        if encoding is None or encoding == "iso-8859-1":
            # The unicode_escape decoders take latin-1 bytes, so the
            # contents can be used unchanged.
            assert 0 <= first <= last
            text = s[first:last]
        else:
            text = decode_unicode_utf8(space, s, first, last)
        if is_raw:
            result = unicodehelper.decode_raw_unicode_escape(space, text)
        else:
            result = unicodehelper.decode_unicode_escape(space, text)
        return space.wrap(result)

    # Byte string from here on.
    need_encoding = not (encoding is None or encoding == "utf-8" or
                         encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= first <= last
    body = s[first:last]
    if is_raw or '\\' not in s[first:]:
        # No escapes to process.
        if not need_encoding:
            return space.wrap(body)
        w_unicode = space.wrap(unicodehelper.decode_utf8(space, body))
        return unicodehelper.encode(space, w_unicode, encoding)
    recode = encoding if need_encoding else None
    return space.wrap(PyString_DecodeEscape(space, body, 'strict', recode))
def parsestr(space, encoding, s, unicode_literal=False):
    """Convert the source text of a string literal into a wrapped value.

    's' is the whole literal, prefix letters and quotes included.
    'encoding' describes the bytes of 's': None means plain ascii,
    "iso-8859-1" means the source is latin-1, and anything else means the
    source is utf-8 encoded.  A byte string result is encoded back into
    the original encoding; this is inefficient, but CPython does much the
    same.
    """
    # 'begin' is the index of the next unread character, 'stop' the index
    # just past the last character of the literal's contents.
    begin = 0
    opener = s[begin]
    raw = False

    # Optional prefix letters: b or u, then r.
    if opener == 'b' or opener == 'B':
        unicode_literal = False
        begin += 1
        opener = s[begin]
    elif opener == 'u' or opener == 'U':
        unicode_literal = True
        begin += 1
        opener = s[begin]
    if opener == 'r' or opener == 'R':
        raw = True
        begin += 1
        opener = s[begin]

    if not (opener == "'" or opener == '"'):
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    begin += 1
    stop = len(s) - 1
    if s[stop] != opener:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
                   'quotes in literal')
    if stop - begin >= 4 and s[begin] == opener and s[begin + 1] == opener:
        # Triple-quoted literal: drop two more quotes on each side.
        begin += 2
        if not (s[stop - 1] == opener and s[stop - 2] == opener):
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                       'unmatched triple quotes in literal')
        stop -= 2

    if unicode_literal:
        if encoding is None or encoding == "iso-8859-1":
            # The unicode_escape decoders take latin-1 bytes, so the
            # contents can be used unchanged.
            assert 0 <= begin <= stop
            text = s[begin:stop]
        else:
            unicodehelper.check_utf8_or_raise(space, s, begin, stop)
            text = decode_unicode_utf8(space, s, begin, stop)
        if raw:
            decoded = unicodehelper.decode_raw_unicode_escape(space, text)
        else:
            decoded = unicodehelper.decode_unicode_escape(space, text)
        utf8, length = decoded
        return space.newutf8(utf8, length)

    # Byte string from here on.
    need_encoding = not (encoding is None or encoding == "utf-8" or
                         encoding == "utf8" or encoding == "iso-8859-1")
    assert 0 <= begin <= stop
    body = s[begin:stop]
    if raw or '\\' not in s[begin:]:
        # No escapes to process.
        if not need_encoding:
            return space.newbytes(body)
        length = unicodehelper.check_utf8_or_raise(space, body)
        w_unicode = space.newutf8(body, length)
        return unicodehelper.encode(space, w_unicode, encoding)
    recode = encoding if need_encoding else None
    return space.newbytes(PyString_DecodeEscape(space, body, 'strict', recode))
def parsestr(space, encoding, s):
    """Parse a string or bytes literal and return a wrapped value.

    's' is the whole literal, prefix letters and quotes included.
    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    A literal without a 'b' prefix is a unicode literal and is returned
    through space.wrap(); a 'b' literal is returned through
    space.wrapbytes().  A bytes literal containing a byte outside the
    ASCII range raises an application-level SyntaxError.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False

    # string decoration handling
    if quote == "b" or quote == "B":
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == "u" or quote == "U":
        ps += 1
        quote = s[ps]
        saw_u = True

    # A 'u' prefix cannot be combined with a raw marker.  The parentheses
    # matter: without them 'and' binds tighter than 'or', so the saw_u
    # guard would cover 'r' but not 'R'.
    if not saw_u and (quote == "r" or quote == "R"):
        ps += 1
        quote = s[ps]
        rawmode = True

    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             "Internal error: parser passed unquoted literal")
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(space, "Internal error: parser passed unmatched "
                                    "quotes in literal")
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(space, "Internal error: parser passed "
                                        "unmatched triple quotes in literal")
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        if encoding is None:
            assert 0 <= ps <= q
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.wrap(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise OperationError(
                    space.w_SyntaxError,
                    space.wrap("bytes can only contain ASCII literal characters.")
                )

    if rawmode or "\\" not in substr:
        # Nothing to unescape: bytes are used as they are, raw unicode
        # literals only need utf-8 decoding.
        if not unicode_literal:
            return space.wrapbytes(substr)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.wrap(v)

    # Only non-raw bytes literals containing a backslash get here.
    v = PyString_DecodeEscape(space, substr, "strict", encoding)
    return space.wrapbytes(v)
def parsestr(space, encoding, s):
    """Parse a string or bytes literal and usually return a wrapped value.

    's' is the whole literal, prefix letters and quotes included.
    If we get an f-string, then instead return an unparsed but unquoted
    W_FString instance.

    If encoding=None, the source string is ascii only.
    In other cases, the source string is in utf-8 encoding.

    A bytes literal containing a byte outside the ASCII range raises an
    application-level SyntaxError.

    Yes, it's very inefficient.
    Yes, CPython has very similar code.
    """
    # we use ps as "pointer to s"
    # q is the virtual last char index of the string
    ps = 0
    quote = s[ps]
    rawmode = False
    unicode_literal = True
    saw_u = False
    saw_f = False

    # string decoration handling: first prefix letter
    if quote == 'b' or quote == 'B':
        ps += 1
        quote = s[ps]
        unicode_literal = False
    elif quote == 'u' or quote == 'U':
        ps += 1
        quote = s[ps]
        saw_u = True
    elif quote == 'r' or quote == 'R':
        ps += 1
        quote = s[ps]
        rawmode = True
    elif quote == 'f' or quote == 'F':
        ps += 1
        quote = s[ps]
        saw_f = True

    # optional second prefix letter; nothing may follow a 'u'
    if not saw_u:
        if quote == 'r' or quote == 'R':
            ps += 1
            quote = s[ps]
            rawmode = True
        elif quote == 'b' or quote == 'B':
            ps += 1
            quote = s[ps]
            unicode_literal = False
        elif quote == 'f' or quote == 'F':
            ps += 1
            quote = s[ps]
            saw_f = True

    if quote != "'" and quote != '"':
        raise_app_valueerror(space,
                             'Internal error: parser passed unquoted literal')
    ps += 1
    q = len(s) - 1
    if s[q] != quote:
        raise_app_valueerror(
            space, 'Internal error: parser passed unmatched '
                   'quotes in literal')
    if q - ps >= 4 and s[ps] == quote and s[ps + 1] == quote:
        # triple quotes
        ps += 2
        if s[q - 1] != quote or s[q - 2] != quote:
            raise_app_valueerror(
                space, 'Internal error: parser passed '
                       'unmatched triple quotes in literal')
        q -= 2

    if unicode_literal and not rawmode:  # XXX Py_UnicodeFlag is ignored for now
        assert 0 <= ps <= q
        if saw_f:
            # f-strings are handed back unparsed, without their quotes
            return W_FString(s[ps:q], rawmode)
        if encoding is None:
            substr = s[ps:q]
        else:
            substr = decode_unicode_utf8(space, s, ps, q)
        v = unicodehelper.decode_unicode_escape(space, substr)
        return space.newunicode(v)

    assert 0 <= ps <= q
    substr = s[ps:q]

    if not unicode_literal:
        # Disallow non-ascii characters (but not escapes).  ASCII ends at
        # 0x7f, so 0x80 itself must be rejected as well.
        for c in substr:
            if ord(c) >= 0x80:
                raise oefmt(
                    space.w_SyntaxError,
                    "bytes can only contain ASCII literal characters.")

    if rawmode or '\\' not in substr:
        # Nothing to unescape here.
        if not unicode_literal:
            return space.newbytes(substr)
        elif saw_f:
            return W_FString(substr, rawmode)
        else:
            v = unicodehelper.decode_utf8(space, substr)
            return space.newunicode(v)

    # Only non-raw bytes literals containing a backslash get here.
    v = PyString_DecodeEscape(space, substr, 'strict', encoding)
    return space.newbytes(v)
def fstring_find_literal(astbuilder, fstr, atom_node, rec):
    """Scan the next literal chunk of an f-string and return it wrapped.

    Scanning starts at 'fstr.current_index' inside 'fstr.unparsed' and
    'fstr.current_index' is updated to where scanning stopped.  Unlike
    CPython, doubled braces are collapsed right here.

    'rec' is the nesting level: doubled braces and the "single '}'" error
    are only considered when it is 0.  'atom_node' is only handed to
    'astbuilder.error' when an error is reported.  The result is built
    with 'space.newtext'.
    """
    space = astbuilder.space
    is_raw = fstr.raw_mode
    text = fstr.unparsed
    size = len(text)
    chunk_start = fstr.current_index
    assert chunk_start >= 0

    # The literal runs up to an un-doubled brace that is not part of a
    # unicode name escape such as "\N{EULER CONSTANT}", or up to the end
    # of the text.
    pos = chunk_start
    pieces = StringBuilder()
    while pos < size:
        char = text[pos]
        pos += 1
        if char == '\\' and not is_raw and pos < size:
            # Look at the character that follows the backslash.
            char = text[pos]
            pos += 1
            if char == 'N':
                if pos < size and text[pos] == '{':
                    # Step over the whole {...} part of the name escape.
                    while pos < size and text[pos] != '}':
                        pos += 1
                    if pos < size:
                        pos += 1
                    continue
                if pos < size:
                    pos += 1
                break
            if char == '{':
                warning = "invalid escape sequence '%s'" % char
                try:
                    space.warn(space.newtext(warning),
                               space.w_DeprecationWarning)
                except error.OperationError as e:
                    if not e.match(space, space.w_DeprecationWarning):
                        raise
                    # The warning itself was raised as an exception:
                    # report it through the ast builder instead.
                    astbuilder.error(warning, atom_node)
        if char != '{' and char != '}':
            continue
        # Doubled braces are only recognised at the top level, otherwise
        # f'{0:{3}}' would break on its two closing braces.
        if rec == 0 and pos < size and text[pos] == char:
            assert 0 <= pos <= size
            # Flush everything up to and including the first brace, then
            # restart the pending chunk after the second one.
            pieces.append(text[chunk_start:pos])
            pos += 1
            chunk_start = pos
        elif rec == 0 and char == '}':
            pos -= 1
            assert pos >= 0
            fstr.current_index = pos
            # A lone '{' opens an expression, but a lone '}' at the top
            # level is an error.
            astbuilder.error("f-string: single '}' is not allowed",
                             atom_node)
        else:
            # Either a '{' that starts an expression, or a '}' that ends
            # a nested format spec: stop on the brace itself.
            pos -= 1
            break

    assert 0 <= pos <= size
    assert pos == size or text[pos] == '{' or text[pos] == '}'
    pieces.append(text[chunk_start:pos])

    fstr.current_index = pos
    literal = pieces.build()
    lgt = codepoints_in_utf8(literal)
    if not is_raw and '\\' in literal:
        literal = parsestring.decode_unicode_utf8(space, literal, 0,
                                                  len(literal))
        literal, lgt, _ = unicodehelper.decode_unicode_escape(space, literal)
    return space.newtext(literal, lgt)