def decode(b, errors='strict'):
    """Decode UTF-8 bytes and wrap every NUMBER token in ``blurse(...)``.

    :param b: bytes handed to ``utf_8.decode``.
    :param errors: error-handling mode forwarded to ``utf_8.decode``.
    :returns: ``(rewritten_source, length)`` where ``length`` is the second
        item returned by ``utf_8.decode``.
    """
    text, length = utf_8.decode(b, errors)

    rewritten = []
    for tok in tokenize_rt.src_to_tokens(text):
        if tok.name != 'NUMBER':
            rewritten.append(tok)
            continue
        # Tokenize the wrapped literal so the output stays a flat token list.
        wrapped = 'blurse({})'.format(tok.src)
        rewritten.extend(tokenize_rt.src_to_tokens(wrapped))

    return tokenize_rt.tokens_to_src(rewritten), length
def _upgrade(source: str) -> str:
    """Return ``source`` after applying the mutations found by the visitor.

    The source is parsed, walked with ``_FindAssignment``, and the token
    list is then edited in place by ``_mutate_found`` before being rendered
    back to text.
    """
    tree = _ast_parse(source)
    finder = _FindAssignment()
    finder.visit(tree)

    token_list = src_to_tokens(source)
    _mutate_found(token_list, finder)
    return tokens_to_src(token_list)
def _fix_six(contents_text):
    """Rewrite the usages recorded by ``FindSixUsage`` in ``contents_text``.

    Returns the input unchanged when it does not parse; otherwise returns
    the source rebuilt from the edited token list.
    """
    try:
        ast_obj = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    visitor = FindSixUsage()
    visitor.visit(ast_obj)

    tokens = src_to_tokens(contents_text)
    # Walk backwards so that slice replacements / deletions do not shift the
    # indexes of tokens that are still to be visited.
    for i, token in reversed_enumerate(tokens):
        if token.offset in visitor.simple_names:
            # A bare name: swap the single token for its replacement.
            node = visitor.simple_names[token.offset]
            tokens[i] = Token('CODE', SIX_SIMPLE_ATTRS[node.id])
        elif token.offset in visitor.simple_attrs:
            node = visitor.simple_attrs[token.offset]
            # Only rewrite the exact three-token shape `<name>.<attr>`.
            if tokens[i + 1].src == '.' and tokens[i + 2].src == node.attr:
                tokens[i:i + 3] = [Token('CODE', SIX_SIMPLE_ATTRS[node.attr])]
        elif token.offset in visitor.remove_decorators:
            if tokens[i - 1].src == '@':
                # Delete from the `@` through the NEWLINE ending that line.
                end = i + 1
                while tokens[end].name != 'NEWLINE':
                    end += 1
                del tokens[i - 1:end + 1]

    return tokens_to_src(tokens)
def _has_trailing_semicolon(src: str) -> Tuple[str, bool]:
    """
    Strip a trailing semicolon from a cell and report whether one was found.

    Parameters
    ----------
    src
        Notebook cell source.

    Returns
    -------
    Tuple[str, bool]
        The cell source with the trailing semicolon removed (``src`` itself
        when there is none), and whether a trailing semicolon was present.
    """
    tokens = tokenize_rt.src_to_tokens(src)
    for idx, token in tokenize_rt.reversed_enumerate(tokens):
        # Skip tokens that are empty or only spaces/newlines, and comments.
        if not token.src.strip(" \n") or token.name == "COMMENT":
            continue
        # The last significant token alone decides the outcome.
        if token.name == "OP" and token.src == ";":
            tokens[idx] = token._replace(src="")
            return tokenize_rt.tokens_to_src(tokens), True
        break
    return src, False
def test_src_to_tokens_octal_literal_normalization():
    """A legacy octal literal is kept verbatim as a single NUMBER token."""
    expected = [
        Token('NUMBER', '0755', line=1, utf8_byte_offset=0),
        Token('NEWLINE', '\n', line=1, utf8_byte_offset=4),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]
    assert src_to_tokens('0755\n') == expected
def decode(b, errors='strict'): import tokenize_rt # pip install future-fstrings[rewrite] u, length = utf_8.decode(b, errors) tokens = tokenize_rt.src_to_tokens(u) to_replace = [] start = end = seen_f = None for i, token in enumerate(tokens): if start is None: if token.name == 'STRING': start, end = i, i + 1 seen_f = _is_f(token) elif token.name == 'STRING': end = i + 1 seen_f |= _is_f(token) elif token.name not in tokenize_rt.NON_CODING_TOKENS: if seen_f: to_replace.append((start, end)) start = end = seen_f = None for start, end in reversed(to_replace): try: tokens[start:end] = _make_fstring(tokens[start:end]) except TokenSyntaxError as e: msg = str(e.e) line = u.splitlines()[e.token.line - 1] bts = line.encode('UTF-8')[:e.token.utf8_byte_offset] indent = len(bts.decode('UTF-8')) raise SyntaxError(msg + '\n\n' + line + '\n' + ' ' * indent + '^') return tokenize_rt.tokens_to_src(tokens), length
def _fix_src(contents_text: str, min_version: Tuple[int, ...]) -> str:
    """Run the registered callbacks and brace fixes over ``contents_text``.

    Returns the input unchanged when it does not parse; otherwise returns
    the source rebuilt from the edited token list.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    callbacks = visit(FUNCS, tree, min_version)

    tokens = src_to_tokens(contents_text)
    for idx, tok in _changing_list(tokens):
        # DEDENT is a zero length token
        if not tok.src:
            continue

        # though this is a defaultdict, by using `.get()` this function's
        # self time is almost 50% faster
        for callback in callbacks.get(tok.offset, ()):
            callback(idx, tokens)

        if tok.src not in START_BRACES:
            continue
        brace_fix = find_simple(idx, tokens)
        fix_brace(tokens, brace_fix, add_comma=False, remove_comma=False)

    return tokens_to_src(tokens)
def _fix_format_literals(contents_text):
    """Rewrite string literals that are immediately followed by ``.format``.

    A run of adjacent STRING tokens followed by ``.`` and ``format`` is
    rendered to source, passed to ``_rewrite_string_literal`` and replaced
    by a single STRING token holding the result.
    """
    tokens = src_to_tokens(contents_text)

    to_replace = []
    string_start = None
    string_end = None
    seen_dot = False

    for i, token in enumerate(tokens):
        if string_start is None and token.name == 'STRING':
            # First string of a new run.
            string_start = i
            string_end = i + 1
        elif string_start is not None and token.name == 'STRING':
            # Another string in the same run.
            string_end = i + 1
        elif string_start is not None and token.src == '.':
            seen_dot = True
        elif seen_dot and token.src == 'format':
            to_replace.append((string_start, string_end))
            string_start, string_end, seen_dot = None, None, False
        elif token.name not in NON_CODING_TOKENS:
            # Any other coding token breaks the `<strings>.format` pattern.
            string_start, string_end, seen_dot = None, None, False

    # Replace back-to-front so earlier (start, end) pairs stay valid.
    for start, end in reversed(to_replace):
        src = tokens_to_src(tokens[start:end])
        new_src = _rewrite_string_literal(src)
        tokens[start:end] = [Token('STRING', new_src)]

    return tokens_to_src(tokens)
def _fix_fstrings(contents_text):
    """Convert the ``.format(`` calls found by ``FindSimpleFormats``.

    Returns the input unchanged when it does not parse. Candidates that are
    bytestrings, are not directly followed by ``.format(``, or whose call
    ends on a different line than the string token are left untouched.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    finder = FindSimpleFormats()
    finder.visit(tree)

    tokens = src_to_tokens(contents_text)
    for idx, tok in reversed(tuple(enumerate(tokens))):
        node = finder.found.get(Offset(tok.line, tok.utf8_byte_offset))
        if node is None:
            continue

        if _is_bytestring(tok.src):  # pragma: no cover (py2-only)
            continue

        open_paren = idx + 3
        call_head = tokens_to_src(tokens[idx + 1:open_paren + 1])
        if call_head != '.format(':
            continue

        # we don't actually care about arg position, so we pass `node`
        call_end = _victims(tokens, open_paren, node, gen=False).ends[-1]
        # if it spans more than one line, bail
        if tokens[call_end].line != tok.line:
            continue

        tokens[idx] = tok._replace(src=_to_fstring(tok.src, node))
        del tokens[idx + 1:call_end + 1]

    return tokens_to_src(tokens)
def _fix_percent_format(contents_text):
    """Rewrite the percent-format expressions found by ``FindPercentFormats``.

    Returns the input unchanged when it does not parse or when nothing was
    found. Only right-hand sides that are tuple or dict literals are handed
    to the corresponding fixer.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    finder = FindPercentFormats()
    finder.visit(tree)
    if not finder.found:
        return contents_text

    tokens = src_to_tokens(contents_text)
    for idx, tok in reversed_enumerate(tokens):
        node = finder.found.get(tok.offset)
        if node is None:
            continue

        # no .format() equivalent for bytestrings in py3
        # note that this code is only necessary when running in python2
        if _is_bytestring(tokens[idx].src):  # pragma: no cover (py2-only)
            continue

        rhs = node.right
        if isinstance(rhs, ast.Tuple):
            _fix_percent_format_tuple(tokens, idx, node)
        elif isinstance(rhs, ast.Dict):
            _fix_percent_format_dict(tokens, idx, node)

    return tokens_to_src(tokens)
def _fix_py2_compatible(contents_text):
    """Apply the rewrites collected by ``Py2CompatibleVisitor``.

    Returns the input unchanged when it does not parse or when the visitor
    collected nothing. Each token offset is dispatched to at most one
    processor, checked in the order dicts, empty sets, sets, ``is`` literals.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    found = Py2CompatibleVisitor()
    found.visit(tree)

    collections_found = (
        found.dicts,
        found.sets,
        found.set_empty_literals,
        found.is_literal,
    )
    if not any(collections_found):
        return contents_text

    tokens = src_to_tokens(contents_text)
    for idx, tok in reversed_enumerate(tokens):
        offset = tok.offset
        if offset in found.dicts:
            _process_dict_comp(tokens, idx, found.dicts[offset])
        elif offset in found.set_empty_literals:
            _process_set_empty_literal(tokens, idx)
        elif offset in found.sets:
            _process_set_literal(tokens, idx, found.sets[offset])
        elif offset in found.is_literal:
            _process_is_literal(tokens, idx, found.is_literal[offset])

    return tokens_to_src(tokens)
def _fix_plugins(contents_text: str, settings: Settings) -> str:
    """Run every registered plugin callback against ``contents_text``.

    Returns the input unchanged when it does not parse, when no callbacks
    were registered, or when tokenization fails.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    callbacks = visit(FUNCS, tree, settings)
    if not callbacks:
        return contents_text

    try:
        tokens = src_to_tokens(contents_text)
    except tokenize.TokenError:  # pragma: no cover (bpo-2180)
        return contents_text

    _fixup_dedent_tokens(tokens)

    for idx, tok in reversed_enumerate(tokens):
        # tokens with an empty `src` are skipped entirely
        if tok.src:
            # though this is a defaultdict, by using `.get()` this function's
            # self time is almost 50% faster
            for callback in callbacks.get(tok.offset, ()):
                callback(idx, tokens)

    return tokens_to_src(tokens)
def remove_trailing_semicolon(src: str) -> Tuple[str, bool]:
    """Remove trailing semicolon from Jupyter notebook cell.

    For example,

        fig, ax = plt.subplots()
        ax.plot(x_data, y_data);  # plot data

    would become

        fig, ax = plt.subplots()
        ax.plot(x_data, y_data)  # plot data

    Mirrors the logic in `quiet` from `IPython.core.displayhook`, but uses
    ``tokenize_rt`` so that round-tripping works fine.

    Returns the (possibly rewritten) source and whether a trailing
    semicolon was removed.
    """
    from tokenize_rt import (
        src_to_tokens,
        tokens_to_src,
        reversed_enumerate,
    )

    tokens = src_to_tokens(src)
    for idx, token in reversed_enumerate(tokens):
        if token.name in TOKENS_TO_IGNORE:
            continue
        # Only the last non-ignored token is inspected.
        if token.name == "OP" and token.src == ";":
            del tokens[idx]
            return tokens_to_src(tokens), True
        break
    return src, False
def fix_file(
        filename: str,
        show_diff: bool = False,
        dry_run: bool = False,
) -> int:
    """Rewrite or remove the noqa comments of ``filename`` based on flake8.

    The file is linted with its comments stripped; each noqa comment is then
    rewritten or removed according to those results.

    :param filename: path of the file to process.
    :param show_diff: print a unified diff of the change.
    :param dry_run: print the diff but do not write the file.
    :returns: ``1`` when the file is not valid UTF-8 or when its contents
        would change (even with ``dry_run``); ``0`` otherwise.
    """
    with open(filename, 'rb') as f:
        contents_bytes = f.read()

    try:
        contents_text = contents_bytes.decode()
    except UnicodeDecodeError:
        print(f'{filename} is non-utf8 (not supported)')
        return 1

    tokens = tokenize_rt.src_to_tokens(contents_text)

    tokens_no_comments = _remove_comments(tokens)
    src_no_comments = tokenize_rt.tokens_to_src(tokens_no_comments)

    # Nothing was stripped, so there are no comments to fix.
    if src_no_comments == contents_text:
        return 0

    # A NamedTemporaryFile cannot be reopened by name on Windows while it is
    # still open, so create the file, close it, lint it, then remove it.
    fd, path = tempfile.mkstemp(
        dir=os.path.dirname(filename),
        prefix=os.path.basename(filename),
        suffix='.py',
    )
    try:
        with open(fd, 'wb') as f:
            f.write(src_no_comments.encode())
        flake8_results = _run_flake8(path)
    finally:
        os.remove(path)

    if any('E999' in v for v in flake8_results.values()):
        print(f'{filename}: syntax error (skipping)')
        return 0

    # Walk backwards so deletions do not shift indexes still to be visited.
    for i, token in tokenize_rt.reversed_enumerate(tokens):
        if token.name != 'COMMENT':
            continue

        if NOQA_RE.search(token.src):
            _rewrite_noqa_comment(tokens, i, flake8_results)
        elif NOQA_FILE_RE.match(token.src) and not flake8_results:
            if i == 0 or tokens[i - 1].name == 'NEWLINE':
                # The comment starts its line: drop it and the next token.
                del tokens[i:i + 2]
            else:
                _remove_comment(tokens, i)

    newsrc = tokenize_rt.tokens_to_src(tokens)
    if newsrc == contents_text:
        return 0

    if show_diff or dry_run:
        diff = difflib.unified_diff(
            contents_text.splitlines(keepends=True),
            newsrc.splitlines(keepends=True),
            fromfile=filename,
            tofile=filename,
        )
        print(''.join(diff), end='')
    if not dry_run:
        print(f'Rewriting {filename}')
        with open(filename, 'wb') as f:
            f.write(newsrc.encode())
    return 1
def _fix_tokens(contents_text: str, min_version: Version) -> str:
    """Apply the token-level fixers to ``contents_text``.

    Returns the input unchanged when tokenization fails; otherwise returns
    the rebuilt source with leading whitespace stripped.
    """
    # The `u` prefix is removed for a 3+ target or when the file imports
    # `unicode_literals`.
    remove_u = (min_version >= (3, ) or
                _imports_future(contents_text, 'unicode_literals'))

    try:
        tokens = src_to_tokens(contents_text)
    except tokenize.TokenError:
        return contents_text
    # Walk backwards so fixers that insert / delete tokens do not shift the
    # indexes of tokens that are still to be visited.
    for i, token in reversed_enumerate(tokens):
        if token.name == 'NUMBER':
            tokens[i] = token._replace(src=_fix_long(_fix_octal(token.src)))
        elif token.name == 'STRING':
            tokens[i] = _fix_ur_literals(tokens[i])
            if remove_u:
                tokens[i] = _remove_u_prefix(tokens[i])
            tokens[i] = _fix_escape_sequences(tokens[i])
        elif token.src == '(':
            _fix_extraneous_parens(tokens, i)
        elif token.src == 'format' and i > 0 and tokens[i - 1].src == '.':
            # i - 2 is the token just before the `.`
            _fix_format_literal(tokens, i - 2)
        elif token.src == 'encode' and i > 0 and tokens[i - 1].src == '.':
            _fix_encode_to_binary(tokens, i)
        elif (min_version >= (3, ) and
              token.utf8_byte_offset == 0 and
              token.line < 3 and
              token.name == 'COMMENT' and
              tokenize.cookie_re.match(token.src)):
            # An encoding-cookie comment in the first two lines: remove the
            # comment and the NL token that follows it.
            del tokens[i]
            assert tokens[i].name == 'NL', tokens[i].name
            del tokens[i]
        elif token.src == 'from' and token.utf8_byte_offset == 0:
            _fix_import_removals(tokens, i, min_version)
    return tokens_to_src(tokens).lstrip()
def test_src_to_tokens_string_prefix_normalization(prefix):
    """A prefixed string literal is kept verbatim as a single STRING token."""
    literal = f"{prefix}'foo'"
    expected = [
        Token('STRING', literal, line=1, utf8_byte_offset=0),
        Token('NEWLINE', '\n', line=1, utf8_byte_offset=5 + len(prefix)),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]
    assert src_to_tokens(f'{literal}\n') == expected
def test_src_to_tokens_long_literal_normalization(postfix):
    """A number with a long postfix is kept verbatim as one NUMBER token."""
    literal = f'123{postfix}'
    expected = [
        Token('NUMBER', literal, line=1, utf8_byte_offset=0),
        Token('NEWLINE', '\n', line=1, utf8_byte_offset=4),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]
    assert src_to_tokens(f'{literal}\n') == expected
def _fix_calls(contents_text: str) -> str:
    """Unwrap the bracketed argument of each call recorded by ``Visitor``.

    For every recorded call, the tokens between the call's ``(`` and the
    first nested opening bracket, and between that bracket's match and the
    call's ``)``, are deleted; whitespace tokens that follow an ``NL`` token
    inside the call lose their first four characters.

    Returns the input unchanged when it does not parse, when no calls were
    recorded, or when tokenization fails.
    """
    try:
        ast_obj = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    visitor = Visitor()
    visitor.visit(ast_obj)

    if not visitor.calls:
        return contents_text

    try:
        tokens = src_to_tokens(contents_text)
    except tokenize.TokenError:  # pragma: no cover (bpo-2180)
        return contents_text

    for i, token in reversed_enumerate(tokens):
        if token.offset in visitor.calls:
            # Each offset is handled once, even if several tokens share it.
            visitor.calls.discard(token.offset)

            # search forward for the opening brace
            while tokens[i].src != '(':
                i += 1

            call_start = i
            i += 1
            brace_depth = 1
            # start / end: the first-level bracket pair nested directly
            # inside the call's own parentheses.
            start = -1
            end = -1

            while brace_depth:
                if tokens[i].src in {'(', '{', '['}:
                    if brace_depth == 1:
                        start = i
                    brace_depth += 1
                elif tokens[i].src in {')', '}', ']'}:
                    brace_depth -= 1
                    if brace_depth == 1:
                        end = i
                i += 1

            assert start != -1
            assert end != -1
            # The loop above stops one past the call's closing `)`.
            call_end = i - 1

            # dedent everything inside the brackets
            for i in range(call_start, call_end):
                if (tokens[i - 1].name == 'NL' and
                        tokens[i].name == UNIMPORTANT_WS):
                    tokens[i] = tokens[i]._replace(src=tokens[i].src[4:])

            # Delete the later slice first so `call_start` / `start` stay
            # valid for the second deletion.
            del tokens[end + 1:call_end]
            del tokens[call_start + 1:start]

    return tokens_to_src(tokens)
def decode(b, errors="strict"):
    """Decode UTF-8 bytes and rewrite the f-string token runs found in them.

    NOTE(review): this block's nesting was reconstructed from source whose
    indentation had been lost -- confirm the placement of the ``continue``
    statements and of ``end = i`` against the upstream file.

    :param b: bytes handed to ``utf_8.decode``.
    :param errors: error-handling mode forwarded to ``utf_8.decode``.
    :returns: ``(rewritten_source, length)`` where ``length`` is the second
        item returned by ``utf_8.decode``.
    :raises SyntaxError: when ``_make_fstring`` raises ``TokenSyntaxError``;
        the message includes the offending line and a caret marker.
    """

    non_coding_tokens = frozenset(
        ("COMMENT", tokenize_rt.ESCAPED_NL, "NL", tokenize_rt.UNIMPORTANT_WS))

    u, length = utf_8.decode(b, errors)
    tokens = tokenize_rt.src_to_tokens(u)

    to_replace = []

    # `started < 0` means "not currently inside a run".
    started = -1
    end = -1

    # Iterate one step past the end (token is None there) so a run that
    # reaches the last token is still flushed.
    for i in range(0, 1 + len(tokens)):
        if i < len(tokens):
            token = tokens[i]
        else:
            token = None

        if token:
            if fstr(token):
                if started < 0:
                    started = i
                continue

        end = i

        if started >= 0:
            # Keep the run open while the next token continues it.
            if peek_is_fstr(tokens, i + 1):
                continue
            if peek_is_str(tokens, i + 1):
                continue

            if token is None:
                pass
            elif token.name in non_coding_tokens or peek_is_str(tokens, i):
                # multiline f-string + str
                continue

            to_replace.append((started, end))
            started = -1

    # Replace back-to-front so earlier (start, end) pairs stay valid.
    for start, end in reversed(to_replace):
        if end - start > 1:
            # move ending line away from format of multiline fstrings
            if tokens[end - 1].name in non_coding_tokens:
                end -= 1
        try:
            tokens[start:end] = _make_fstring(tokens[start:end])
        except TokenSyntaxError as e:
            msg = str(e.e)
            line = u.splitlines()[e.token.line - 1]
            # The token offset is in UTF-8 bytes; convert it to a character
            # count so the caret lines up under the right column.
            bts = line.encode("UTF-8")[:e.token.utf8_byte_offset]
            indent = len(bts.decode("UTF-8"))
            raise SyntaxError(msg + "\n\n" + line + "\n" + " " * indent + "^")

    return tokenize_rt.tokens_to_src(tokens), length
def fix_file(filename: str) -> int:
    """Rewrite or remove the noqa comments of ``filename`` based on flake8.

    The file is linted with its comments stripped; each noqa comment is then
    rewritten or removed according to those results.

    :param filename: path of the file to process.
    :returns: ``1`` when the file is not valid UTF-8 or was rewritten;
        ``0`` otherwise.
    """
    with open(filename, 'rb') as f:
        contents_bytes = f.read()

    try:
        contents_text = contents_bytes.decode()
    except UnicodeDecodeError:
        print(f'{filename} is non-utf8 (not supported)')
        return 1

    tokens = tokenize_rt.src_to_tokens(contents_text)

    tokens_no_comments = _remove_comments(tokens)
    src_no_comments = tokenize_rt.tokens_to_src(tokens_no_comments)

    # Nothing was stripped, so there are no comments to fix.
    if src_no_comments == contents_text:
        return 0

    # The temporary file is created next to the original and is closed
    # before flake8 runs on it; it is always removed afterwards.
    fd, path = tempfile.mkstemp(
        dir=os.path.dirname(filename),
        prefix=os.path.basename(filename),
        suffix='.py',
    )
    try:
        with open(fd, 'wb') as f:
            f.write(src_no_comments.encode())
        flake8_results = _run_flake8(path)
    finally:
        os.remove(path)

    if any('E999' in v for v in flake8_results.values()):
        print(f'{filename}: syntax error (skipping)')
        return 0

    # Walk backwards so deletions do not shift indexes still to be visited.
    for i, token in tokenize_rt.reversed_enumerate(tokens):
        if token.name != 'COMMENT':
            continue

        if NOQA_RE.search(token.src):
            _rewrite_noqa_comment(tokens, i, flake8_results)
        elif NOQA_FILE_RE.match(token.src) and not flake8_results:
            if i == 0 or tokens[i - 1].name == 'NEWLINE':
                # The comment starts its line: drop it and the next token.
                del tokens[i:i + 2]
            else:
                _remove_comment(tokens, i)

    newsrc = tokenize_rt.tokens_to_src(tokens)
    if newsrc != contents_text:
        print(f'Rewriting {filename}')
        with open(filename, 'wb') as f:
            f.write(newsrc.encode())
        return 1
    else:
        return 0
def test_src_to_tokens_simple():
    """A simple assignment yields code tokens interleaved with whitespace."""
    expected = [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=None, utf8_byte_offset=None),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(UNIMPORTANT_WS, ' ', line=None, utf8_byte_offset=None),
        Token('NUMBER', '5', line=1, utf8_byte_offset=4),
        Token('NEWLINE', '\n', line=1, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=2, utf8_byte_offset=0),
    ]
    assert src_to_tokens('x = 5\n') == expected
def _fix_octal_literals(contents_text):
    """Rewrite legacy octal literals such as ``0755`` to the ``0o755`` form.

    All-digit NUMBER tokens that start with ``0`` and are not made up solely
    of zeros are rewritten; every other token is left as it is.
    """
    def _as_new_octal(literal):
        is_legacy_octal = (
            literal.startswith('0') and
            literal.isdigit() and
            literal != len(literal) * '0'
        )
        if is_legacy_octal:  # pragma: no cover (py2 only)
            return '0o' + literal[1:]
        return literal

    tokens = src_to_tokens(contents_text)
    for idx, tok in enumerate(tokens):
        if tok.name != 'NUMBER':
            continue
        tokens[idx] = tok._replace(src=_as_new_octal(tok.src))
    return tokens_to_src(tokens)
def test_src_to_tokens_escaped_nl_no_left_ws():
    """An escaped newline directly after an operator gets its own token.

    The continuation line is indented by four spaces, which is what places
    ``5`` at byte offset 4 and the NEWLINE at byte offset 5 on line 2.
    """
    src = (
        'x =\\\n'
        '    5\n'
    )
    ret = src_to_tokens(src)
    assert ret == [
        Token('NAME', 'x', line=1, utf8_byte_offset=0),
        Token(UNIMPORTANT_WS, ' ', line=None, utf8_byte_offset=None),
        Token('OP', '=', line=1, utf8_byte_offset=2),
        Token(ESCAPED_NL, '\\\n', line=None, utf8_byte_offset=None),
        Token(UNIMPORTANT_WS, '    ', line=None, utf8_byte_offset=None),
        Token('NUMBER', '5', line=2, utf8_byte_offset=4),
        Token('NEWLINE', '\n', line=2, utf8_byte_offset=5),
        Token('ENDMARKER', '', line=3, utf8_byte_offset=0),
    ]
def _fix_unicode_literals(contents_text, py3_plus):
    """Drop the ``u`` / ``U`` prefix from every string literal.

    The source is returned unchanged unless ``py3_plus`` is true or the
    file imports ``unicode_literals``.
    """
    if not (py3_plus or _imports_unicode_literals(contents_text)):
        return contents_text

    tokens = src_to_tokens(contents_text)
    for idx, tok in enumerate(tokens):
        if tok.name == 'STRING':
            parsed = STRING_PREFIXES_RE.match(tok.src)
            prefix, rest = parsed.group(1), parsed.group(2)
            stripped_prefix = prefix.replace('u', '').replace('U', '')
            tokens[idx] = Token('STRING', stripped_prefix + rest)
    return tokens_to_src(tokens)
def test_reversed_enumerate():
    """``reversed_enumerate`` lazily yields ``(index, token)`` back to front."""
    pairs = reversed_enumerate(src_to_tokens('x = 5\n'))

    # The result is an iterator: the last token comes out first.
    first = next(pairs)
    assert first == (6, Token('ENDMARKER', '', line=2, utf8_byte_offset=0))

    expected_rest = [
        (5, Token(name='NEWLINE', src='\n', line=1, utf8_byte_offset=5)),
        (4, Token('NUMBER', '5', line=1, utf8_byte_offset=4)),
        (3, Token(UNIMPORTANT_WS, ' ')),
        (2, Token('OP', '=', line=1, utf8_byte_offset=2)),
        (1, Token(UNIMPORTANT_WS, ' ')),
        (0, Token('NAME', 'x', line=1, utf8_byte_offset=0)),
    ]
    assert list(pairs) == expected_rest
def _fix_dictcomps(contents_text):
    """Rewrite every node recorded by ``FindDictsVisitor``.

    Returns the input unchanged when it does not parse or when the visitor
    recorded nothing.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    finder = FindDictsVisitor()
    finder.visit(tree)
    if not finder.dicts:
        return contents_text

    tokens = src_to_tokens(contents_text)
    # Back to front, so edits never shift tokens still to be visited.
    for idx, tok in reversed_enumerate(tokens):
        if tok.offset not in finder.dicts:
            continue
        _process_dict_comp(tokens, idx, finder.dicts[tok.offset])
    return tokens_to_src(tokens)
def _fix_dictcomps(contents_text):
    """Rewrite every node recorded by ``FindDictsVisitor``.

    Tokens are matched to recorded nodes by their
    ``(line, utf8_byte_offset)`` pair. Returns the input unchanged when it
    does not parse or when the visitor recorded nothing.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    finder = FindDictsVisitor()
    finder.visit(tree)
    if not finder.dicts:
        return contents_text

    tokens = src_to_tokens(contents_text)
    # Iterate a snapshot back to front, so edits never shift the indexes of
    # tokens still to be visited.
    for idx, tok in reversed(tuple(enumerate(tokens))):
        position = (tok.line, tok.utf8_byte_offset)
        if position not in finder.dicts:
            continue
        _process_dict_comp(tokens, idx, finder.dicts[position])
    return tokens_to_src(tokens)
def _fix_sets(contents_text):
    """Rewrite the set nodes recorded by ``FindSetsVisitor``.

    Returns the input unchanged when it does not parse or when the visitor
    recorded neither sets nor empty-set literals. An offset recorded as an
    empty-set literal takes precedence over one recorded as a set.
    """
    try:
        tree = ast_parse(contents_text)
    except SyntaxError:
        return contents_text

    finder = FindSetsVisitor()
    finder.visit(tree)
    if not (finder.sets or finder.set_empty_literals):
        return contents_text

    tokens = src_to_tokens(contents_text)
    # Back to front, so edits never shift tokens still to be visited.
    for idx, tok in reversed_enumerate(tokens):
        offset = tok.offset
        if offset in finder.set_empty_literals:
            _process_set_empty_literal(tokens, idx)
        elif offset in finder.sets:
            _process_set_literal(tokens, idx, finder.sets[offset])
    return tokens_to_src(tokens)
def _fix_src(contents_text, py35_plus, py36_plus):
    """Apply brace fixes to the nodes recorded by ``FindNodes``.

    :param contents_text: source text to rewrite.
    :param py35_plus: allow adding a comma to calls that have star args.
    :param py36_plus: allow adding a comma to function definitions that have
        star args.
    :returns: the rewritten source, or the input unchanged when it does not
        parse.
    """
    try:
        ast_obj = ast_parse(contents_text)
    except SyntaxError:
        return contents_text
    visitor = FindNodes()
    visitor.visit(ast_obj)

    tokens = src_to_tokens(contents_text)
    for i, token in _changing_list(tokens):
        # DEDENT is a zero length token
        if not token.src:
            continue

        key = Offset(token.line, token.utf8_byte_offset)
        # Pairs of (add_comma, fix_data) collected for this token.
        fixes = []

        if key in visitor.calls:
            for call in visitor.calls[key]:
                # Only fix stararg calls if asked to
                add_comma = not call.star_args or py35_plus
                fixes.append((add_comma, _find_call(call, i, tokens)))
        elif key in visitor.funcs:
            func = visitor.funcs[key]
            add_comma = not func.star_args or py36_plus
            # functions can be treated as calls
            fixes.append((add_comma, _find_call(func, i, tokens)))
        elif key in visitor.literals:
            fixes.append((True, _find_simple(i, tokens)))
        # Handle parenthesized things, unhug of tuples, and comprehensions
        elif token.src in START_BRACES:
            fixes.append((False, _find_simple(i, tokens)))

        for add_comma, fix_data in fixes:
            if fix_data is not None:
                _fix_brace(fix_data, add_comma, tokens)

        # need to handle tuples afterwards as tuples report their
        # starting index as the first element, which may be one of the above
        # things.
        if key in visitor.tuples:
            fix_data = _find_tuple(i, tokens)
            if fix_data is not None:
                _fix_brace(fix_data, True, tokens)

    return tokens_to_src(tokens)
def _fix_escape_sequences(contents_text):
    """Fix invalid escape sequences in the string literals of the source.

    A string that has only invalid escapes (and no ``u`` in its prefix)
    gets an ``r`` added to its prefix; one that mixes valid and invalid
    escapes, or has ``u`` in its prefix, gets a backslash added before each
    invalid escape. Strings whose prefix already contains ``r`` and strings
    without a backslash are left alone.
    """
    # The NAME token directly preceding a STRING token, if any; its text is
    # treated as part of the string's prefix below.
    last_name = None
    tokens = src_to_tokens(contents_text)
    for i, token in enumerate(tokens):
        if token.name == 'NAME':
            last_name = token
            continue
        elif token.name != 'STRING':
            last_name = None
            continue

        match = STRING_PREFIXES_RE.match(token.src)
        prefix = match.group(1)
        rest = match.group(2)

        if last_name is not None:  # pragma: no cover (py2 bug)
            actual_prefix = (last_name.src + prefix).lower()
        else:  # pragma: no cover (py3 only)
            actual_prefix = prefix.lower()

        if 'r' in actual_prefix or '\\' not in rest:
            continue

        # Strings with `b` in their prefix use a separate set of escapes.
        if 'b' in actual_prefix:
            valid_escapes = ESCAPE_STARTS_BYTES
        else:
            valid_escapes = ESCAPE_STARTS

        # m[1] is the character following the backslash of each match.
        escape_sequences = {m[1] for m in ESCAPE_RE.findall(rest)}
        has_valid_escapes = escape_sequences & valid_escapes
        has_invalid_escapes = escape_sequences - valid_escapes

        def cb(match):
            """Keep valid escapes; prepend a backslash to invalid ones."""
            matched = match.group()
            if matched[1] in valid_escapes:
                return matched
            else:
                return r'\{}'.format(matched)

        if has_invalid_escapes and (has_valid_escapes or 'u' in actual_prefix):
            tokens[i] = token._replace(src=prefix + ESCAPE_RE.sub(cb, rest))
        elif has_invalid_escapes and not has_valid_escapes:
            tokens[i] = token._replace(src=prefix + 'r' + rest)

    return tokens_to_src(tokens)