import ast
from typing import List, Match

import emoji
import tokenize_rt
from tokenize_rt import Token, parse_string_literal, rfind_string_parts

# Helpers such as ESCAPE_RE, ESCAPE_STARTS, NAMED_ESCAPE_NAME, is_ascii,
# is_codec, inty, parse_format, unparse_parsed_string, _remove_fmt,
# _fstring_parse_outer, and TokenSyntaxError are assumed to be defined
# elsewhere in this module.


def _fix_encode_to_binary(tokens: List[Token], i: int) -> None:
    parts = rfind_string_parts(tokens, i - 2)
    if not parts:
        return

    # .encode()
    if (
            i + 2 < len(tokens) and
            tokens[i + 1].src == '(' and
            tokens[i + 2].src == ')'
    ):
        victims = slice(i - 1, i + 3)
        latin1_ok = False
    # .encode('encoding')
    elif (
            i + 3 < len(tokens) and
            tokens[i + 1].src == '(' and
            tokens[i + 2].name == 'STRING' and
            tokens[i + 3].src == ')'
    ):
        victims = slice(i - 1, i + 4)
        prefix, rest = parse_string_literal(tokens[i + 2].src)
        if 'f' in prefix.lower():
            return
        encoding = ast.literal_eval(prefix + rest)
        if is_codec(encoding, 'ascii') or is_codec(encoding, 'utf-8'):
            latin1_ok = False
        elif is_codec(encoding, 'iso8859-1'):
            latin1_ok = True
        else:
            return
    else:
        return

    for part in parts:
        prefix, rest = parse_string_literal(tokens[part].src)
        escapes = set(ESCAPE_RE.findall(rest))
        if (
                not is_ascii(rest) or
                '\\u' in escapes or
                '\\U' in escapes or
                '\\N' in escapes or
                ('\\x' in escapes and not latin1_ok) or
                'f' in prefix.lower()
        ):
            return

    for part in parts:
        prefix, rest = parse_string_literal(tokens[part].src)
        prefix = 'b' + prefix.replace('u', '').replace('U', '')
        tokens[part] = tokens[part]._replace(src=prefix + rest)
    del tokens[victims]

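# A tiny driver sketch (hypothetical, not part of the upstream API) showing
# the rewrite above end-to-end; it leans on the helper names hedged at the
# top of this file:
def _demo_fix_encode_to_binary(src: str) -> str:
    tokens = tokenize_rt.src_to_tokens(src)
    # walk backwards so deleting `.encode(...)` tokens cannot shift
    # indices we have not visited yet
    for i in reversed(range(2, len(tokens))):
        if (
                tokens[i].name == 'NAME' and
                tokens[i].src == 'encode' and
                tokens[i - 1].src == '.'
        ):
            _fix_encode_to_binary(tokens, i)
    return tokenize_rt.tokens_to_src(tokens)


# _demo_fix_encode_to_binary("'foo'.encode()\n")         # -> "b'foo'\n"
# _demo_fix_encode_to_binary("'foo'.encode('utf-8')\n")  # -> "b'foo'\n"
# _demo_fix_encode_to_binary("'\N{SNOWMAN}'.encode()\n") # unchanged: non-ascii
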
def _make_fstring(tokens):
    new_tokens = []
    exprs = []

    for i, token in enumerate(tokens):
        if token.name == 'STRING' and _is_f(token):
            prefix, s = tokenize_rt.parse_string_literal(token.src)
            parts = []
            try:
                _fstring_parse_outer(s, 0, 0, parts, exprs)
            except SyntaxError as e:
                raise TokenSyntaxError(e, tokens[i - 1])
            if 'r' in prefix.lower():
                parts = [s.replace('\\', '\\\\') for s in parts]
            token = token._replace(src=''.join(parts))
        elif token.name == 'STRING':
            new_src = token.src.replace('{', '{{').replace('}', '}}')
            token = token._replace(src=new_src)
        new_tokens.append(token)

    exprs = ('({})'.format(expr) for expr in exprs)
    format_src = '.format({})'.format(', '.join(exprs))
    new_tokens.append(tokenize_rt.Token('FORMAT', src=format_src))
    return new_tokens

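# Sketch of the rewrite above (illustrative): the caller replaces a run of
# adjacent STRING tokens with the returned list, so
#
#     f'hello {name}!'
#
# becomes the STRING token 'hello {}!' followed by a synthetic FORMAT token
# whose src is ".format((name))", i.e. the pre-3.6-compatible source
# 'hello {}!'.format((name)). Plain strings in the same run only have their
# braces doubled so the appended .format() cannot pick them up.
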
def _fix_format_literal(tokens: List[Token], end: int) -> None:
    parts = rfind_string_parts(tokens, end)
    parsed_parts = []
    last_int = -1
    for i in parts:
        # f'foo {0}'.format(...) would get turned into a SyntaxError
        prefix, _ = parse_string_literal(tokens[i].src)
        if 'f' in prefix.lower():
            return

        try:
            parsed = parse_format(tokens[i].src)
        except ValueError:
            # the format literal was malformed, skip it
            return

        # The last segment will always be the end of the string and not a
        # format, slice avoids the `None` format key
        for _, fmtkey, spec, _ in parsed[:-1]:
            if (
                    fmtkey is not None and inty(fmtkey) and
                    int(fmtkey) == last_int + 1 and
                    spec is not None and '{' not in spec
            ):
                last_int += 1
            else:
                return

        parsed_parts.append(tuple(_remove_fmt(tup) for tup in parsed))

    for i, parsed in zip(parts, parsed_parts):
        tokens[i] = tokens[i]._replace(src=unparse_parsed_string(parsed))

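# Illustrative before/after for the fix above (`end` points at the last
# STRING token of the literal preceding `.format(`):
#
#     '{0} {1}'.format(a, b)  ->  '{} {}'.format(a, b)
#     '{0} {2}'.format(a, b)  ->  unchanged: keys are not consecutive
#     '{0} {0}'.format(a)     ->  unchanged: 0 repeats, so 1 is never seen
#     f'{0}'.format(a)        ->  unchanged: rewriting would break the f-string
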
def _remove_u_prefix(token: Token) -> Token:
    prefix, rest = parse_string_literal(token.src)
    if 'u' not in prefix.lower():
        return token
    else:
        new_prefix = prefix.replace('u', '').replace('U', '')
        return token._replace(src=new_prefix + rest)

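# Quick illustration (hypothetical calls; assumes tokenize_rt.Token accepts
# name/src keyword arguments):
#
#     _remove_u_prefix(Token(name='STRING', src="u'foo'")).src  # "'foo'"
#     _remove_u_prefix(Token(name='STRING', src="b'foo'")).src  # unchanged
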
def _make_gstring(tokens):
    new_tokens = []
    for i, token in enumerate(tokens):
        if token.name == 'STRING' and _is_g(token):
            prefix, s = tokenize_rt.parse_string_literal(token.src)
            # strip the (possibly triple) quotes; they are re-added around
            # the emojized body below
            for q in ('"' * 3, "'" * 3, '"', "'"):
                if s.startswith(q):
                    s = s[len(q):len(s) - len(q)]
                    break
            else:
                raise AssertionError('unreachable')
            parts = [q, s, q]
            if 'r' in prefix.lower():
                parts = [s.replace('\\', '\\\\') for s in parts]
            parts = [emoji.emojize(s) for s in parts]
            token = token._replace(src=''.join(parts))
        new_tokens.append(token)
    return new_tokens

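# Illustrative behaviour, assuming the emoji package's emojize() with its
# default language:
#
#     g'I am :thumbs_up:'  ->  'I am 👍'
#
# The invalid `g` (and any `r`) prefix is dropped so the rewritten literal
# compiles; a raw literal keeps its meaning because every backslash is
# doubled first.
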
def _fix_ur_literals(token: Token) -> Token:
    prefix, rest = parse_string_literal(token.src)
    if prefix.lower() != 'ur':
        return token
    else:
        def cb(match: Match[str]) -> str:
            escape = match.group()
            if escape[1].lower() == 'u':
                return escape
            else:
                return '\\' + match.group()

        rest = ESCAPE_RE.sub(cb, rest)
        prefix = prefix.replace('r', '').replace('R', '')
        return token._replace(src=prefix + rest)

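# Illustrative before/after: Python 3 removed `ur` literals, but Python 2's
# `ur` strings still honoured \u escapes, so those are preserved while every
# other escape has its backslash doubled:
#
#     ur'\d\u2603'  ->  u'\\d\u2603'
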
def _fix_escape_sequences(token: Token) -> Token:
    prefix, rest = parse_string_literal(token.src)
    actual_prefix = prefix.lower()

    if 'r' in actual_prefix or '\\' not in rest:
        return token

    is_bytestring = 'b' in actual_prefix

    def _is_valid_escape(match: Match[str]) -> bool:
        c = match.group()[1]
        return (
            c in ESCAPE_STARTS or
            (not is_bytestring and c in 'uU') or
            (
                not is_bytestring and
                c == 'N' and
                bool(NAMED_ESCAPE_NAME.match(rest, match.end()))
            )
        )

    has_valid_escapes = False
    has_invalid_escapes = False
    for match in ESCAPE_RE.finditer(rest):
        if _is_valid_escape(match):
            has_valid_escapes = True
        else:
            has_invalid_escapes = True

    def cb(match: Match[str]) -> str:
        matched = match.group()
        if _is_valid_escape(match):
            return matched
        else:
            return fr'\{matched}'

    if has_invalid_escapes and (has_valid_escapes or 'u' in actual_prefix):
        return token._replace(src=prefix + ESCAPE_RE.sub(cb, rest))
    elif has_invalid_escapes and not has_valid_escapes:
        return token._replace(src=prefix + 'r' + rest)
    else:
        return token

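# Illustrative before/after pairs for the fix above:
#
#     '\d'    ->  r'\d'      # only invalid escapes: add a raw prefix
#     '\n\d'  ->  '\n\\d'    # mixed: escape just the invalid ones
#     b'\u'   ->  br'\u'     # \u is not a valid escape in bytestrings
#     '\n'    ->  unchanged  # every escape is already valid
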
def _is_f(token):
    prefix, _ = tokenize_rt.parse_string_literal(token.src)
    return 'f' in prefix.lower()

def test_parse_string_literal(s, expected):
    assert parse_string_literal(s) == expected

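# Hypothetical parametrize cases for the test above (illustrative data, not
# the project's actual fixtures):
#
#     @pytest.mark.parametrize(
#         ('s', 'expected'),
#         (
#             ('""', ('', '""')),
#             ('u"foo"', ('u', '"foo"')),
#             ('rb"foo"', ('rb', '"foo"')),
#         ),
#     )
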
def _is_g(token):
    prefix, _ = tokenize_rt.parse_string_literal(token.src)
    return 'g' in prefix.lower()