def check(literal):
    io = StringIO(u(literal))
    tokens = tokenize.generate_tokens(io.readline)
    token_list = list(tokens)
    typ, result_literal, _, _ = token_list[0]
    assert typ == STRING
    assert result_literal == literal
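# Hypothetical usage of the check() helper above: any valid string literal
# should round-trip through the tokenizer as a single STRING token. The
# literals below are illustrative only.
check('"foo"')
check("r'raw\\n'")
check('"""docstring"""')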
def test_function_whitespace(self):
    # Test function definition whitespace identification
    fundef = dedent(u('''
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    '''))
    fundef_io = StringIO(fundef)
    tokens = tokenize.generate_tokens(fundef_io.readline)
    token_list = list(tokens)
    for _, value, _, prefix in token_list:
        if value == 'test_whitespace':
            assert prefix == ' '
        if value == '(':
            assert prefix == ''
        if value == '*':
            assert prefix == ''
        if value == '**':
            assert prefix == ' '
        if value == 'print':
            assert prefix == ' '
        if value == 'if':
            assert prefix == ' '
def test_function_whitespace(self):
    # Test function definition whitespace identification
    fundef = dedent(
        u(
            """
    def test_whitespace(*args, **kwargs):
        x = 1
        if x > 0:
            print(True)
    """
        )
    )
    fundef_io = StringIO(fundef)
    tokens = tokenize.generate_tokens(fundef_io.readline)
    token_list = list(tokens)
    for _, value, _, prefix in token_list:
        if value == "test_whitespace":
            assert prefix == " "
        if value == "(":
            assert prefix == ""
        if value == "*":
            assert prefix == ""
        if value == "**":
            assert prefix == " "
        if value == "print":
            assert prefix == " "
        if value == "if":
            assert prefix == " "
def test_simple_no_whitespace(self):
    # Test a simple one line string, no preceding whitespace
    simple_docstring = u('"""simple one line docstring"""')
    simple_docstring_io = StringIO(simple_docstring)
    tokens = tokenize.generate_tokens(simple_docstring_io.readline)
    token_list = list(tokens)
    _, value, _, prefix = token_list[0]
    assert prefix == ""
    assert value == '"""simple one line docstring"""'
def test_simple_no_whitespace(self):
    # Test a simple one line string, no preceding whitespace
    simple_docstring = u('"""simple one line docstring"""')
    simple_docstring_io = StringIO(simple_docstring)
    tokens = tokenize.generate_tokens(simple_docstring_io.readline)
    token_list = list(tokens)
    _, value, _, prefix = token_list[0]
    assert prefix == ''
    assert value == '"""simple one line docstring"""'
def test_simple_with_whitespace(self):
    # Test a simple one line string with preceding whitespace and newline
    simple_docstring = u(' """simple one line docstring""" \r\n')
    simple_docstring_io = StringIO(simple_docstring)
    tokens = tokenize.generate_tokens(simple_docstring_io.readline)
    token_list = list(tokens)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == ' '
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == ' '
    assert typ == NEWLINE
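# A small sketch of the prefix concept the whitespace tests above exercise:
# this tokenizer yields (type, value, start_pos, prefix) tuples, where prefix
# holds whatever was skipped before the token (whitespace, and typically
# comments). Assumes the same StringIO/u helpers used in the tests are in
# scope; the printed output is illustrative only.
demo_io = StringIO(u('if x:  # cond\n    pass\n'))
for typ, value, start_pos, prefix in tokenize.generate_tokens(demo_io.readline):
    print(repr(value), '<- prefix:', repr(prefix))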
def __init__(self, filename, stream=None):
    close_stream = None
    if stream is None:
        stream = open(filename)
        close_stream = stream.close
    self.filename = filename
    self.stream = stream
    self.generator = tokenize.generate_tokens(stream.readline)
    self.gettoken()  # Initialize lookahead
    self.dfas, self.startsymbol = self.parse()
    if close_stream is not None:
        close_stream()
    self.first = {}  # map from symbol name to set of tokens
    self.addfirstsets()
def test_simple_with_whitespace(self):
    # Test a simple one line string with preceding whitespace and newline
    simple_docstring = u(' """simple one line docstring""" \r\n')
    simple_docstring_io = StringIO(simple_docstring)
    tokens = tokenize.generate_tokens(simple_docstring_io.readline)
    token_list = list(tokens)
    assert token_list[0][0] == INDENT
    typ, value, start_pos, prefix = token_list[1]
    assert prefix == " "
    assert value == '"""simple one line docstring"""'
    assert typ == STRING
    typ, value, start_pos, prefix = token_list[2]
    assert prefix == " "
    assert typ == NEWLINE
def test_identifier_contains_unicode(self):
    fundef = dedent(u('''
    def 我あφ():
        pass
    '''))
    fundef_io = StringIO(fundef)
    tokens = tokenize.generate_tokens(fundef_io.readline)
    token_list = list(tokens)
    unicode_token = token_list[1]
    if is_py3:
        assert unicode_token[0] == NAME
    else:
        # Unicode tokens in Python 2 seem to be identified as operators.
        # They will be ignored in the parser, that's ok.
        assert unicode_token[0] == OP
def _diff_tokenize(self, lines, until_line, line_offset=0):
    is_first_token = True
    omitted_first_indent = False
    indents = []
    l = iter(lines)
    tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
    stack = self._active_parser.pgen_parser.stack
    for typ, string, start_pos, prefix in tokens:
        start_pos = start_pos[0] + line_offset, start_pos[1]
        if typ == INDENT:
            indents.append(start_pos[1])
            if is_first_token:
                omitted_first_indent = True
                # We want to get rid of indents that are only here because
                # we only parse part of the file. These indents would only
                # get parsed as error leafs, which doesn't make any sense.
                is_first_token = False
                continue

        is_first_token = False

        if typ == DEDENT:
            indents.pop()
            if omitted_first_indent and not indents:
                # We are done here, only thing that can come now is an
                # endmarker or another dedented code block.
                typ, string, start_pos, prefix = next(tokens)
                if '\n' in prefix:
                    # Keep the prefix only up to (and including) the last
                    # newline; drop the trailing partial line.
                    prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                else:
                    prefix = ''
                yield TokenInfo(ENDMARKER, '',
                                (start_pos[0] + line_offset, 0), prefix)
                break
        elif typ == NEWLINE and start_pos[0] >= until_line:
            yield TokenInfo(typ, string, start_pos, prefix)
            # Check if the parser is actually in a valid suite state.
            if suite_or_file_input_is_valid(self._grammar, stack):
                start_pos = start_pos[0] + 1, 0
                while len(indents) > int(omitted_first_indent):
                    indents.pop()
                    yield TokenInfo(DEDENT, '', start_pos, '')

                yield TokenInfo(ENDMARKER, '', start_pos, '')
                break
            else:
                continue

        yield TokenInfo(typ, string, start_pos, prefix)
def _get_backwards_tokenizer(self, start_pos):
    line_gen = self._backwards_line_generator(start_pos)
    token_gen = tokenize.generate_tokens(lambda: next(line_gen))
    for typ, tok_str, tok_start_pos, prefix in token_gen:
        line = self.get_line(self._line_temp)
        # Calculate the real start_pos of the token.
        if tok_start_pos[0] == 1:
            # We are in the first checked line
            column = start_pos[1] - tok_start_pos[1]
        else:
            column = len(line) - tok_start_pos[1]

        # Multi-line docstrings must be accounted for.
        first_line = (tok_str.splitlines() or [''])[0]
        column -= len(first_line)
        # Reverse the token again, so that it is in normal order again.
        yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
def _get_backwards_tokenizer(self, start_pos, line_gen=None):
    if line_gen is None:
        line_gen = self._backwards_line_generator(start_pos)
    token_gen = tokenize.generate_tokens(lambda: next(line_gen))
    for typ, tok_str, tok_start_pos, prefix in token_gen:
        line = self.get_line(self._line_temp)
        # Calculate the real start_pos of the token.
        if tok_start_pos[0] == 1:
            # We are in the first checked line
            column = start_pos[1] - tok_start_pos[1]
        else:
            column = len(line) - tok_start_pos[1]

        # Multi-line docstrings must be accounted for.
        first_line = common.splitlines(tok_str)[0]
        column -= len(first_line)
        # Reverse the token again, so that it is in normal order again.
        yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
def _calc_path_until_cursor(self, start_pos=None):
    """
    Something like a reverse tokenizer that tokenizes the reversed strings.
    """
    def fetch_line():
        if self._is_first:
            self._is_first = False
            self._line_length = self._column_temp
            line = first_line
        else:
            line = self.get_line(self._line_temp)
            self._line_length = len(line)
            line = '\n' + line

        # add lines with a backslash at the end
        while True:
            self._line_temp -= 1
            last_line = self.get_line(self._line_temp)
            if last_line and last_line[-1] == '\\':
                line = last_line[:-1] + ' ' + line
                self._line_length = len(last_line)
            else:
                break
        return line[::-1]

    self._is_first = True
    self._line_temp, self._column_temp = start_cursor = start_pos
    first_line = self.get_line(self._line_temp)[:self._column_temp]

    open_brackets = ['(', '[', '{']
    close_brackets = [')', ']', '}']

    gen = PushBackIterator(tokenize.generate_tokens(fetch_line))
    string = u('')
    level = 0
    force_point = False
    last_type = None
    is_first = True
    for tok in gen:
        tok_type = tok.type
        tok_str = tok.string
        end = tok.end_pos
        self._column_temp = self._line_length - end[1]
        if is_first:
            if tok.start_pos != (1, 0):  # whitespace is not a path
                return u(''), start_cursor
            is_first = False

        # print 'tok', token_type, tok_str, force_point
        if last_type == tok_type == tokenize.NAME:
            string += ' '

        if level > 0:
            if tok_str in close_brackets:
                level += 1
            if tok_str in open_brackets:
                level -= 1
        elif tok_str == '.':
            force_point = False
        elif force_point:
            # it is reversed, therefore a number is getting recognized
            # as a floating point number
            if tok_type == tokenize.NUMBER and tok_str[0] == '.':
                force_point = False
            else:
                break
        elif tok_str in close_brackets:
            level += 1
        elif tok_type in [tokenize.NAME, tokenize.STRING]:
            force_point = True
        elif tok_type == tokenize.NUMBER:
            pass
        else:
            if tok_str == '-':
                next_tok = next(gen)
                if next_tok.string == 'e':
                    gen.push_back(next_tok)
                else:
                    break
            else:
                break

        x = start_pos[0] - end[0] + 1
        l = self.get_line(x)
        l = first_line if x == start_pos[0] else l
        start_cursor = x, len(l) - end[1]
        string += tok_str
        last_type = tok_type

    # string can still contain spaces at the end
    return string[::-1].strip(), start_cursor
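# Standalone illustration of the "reverse tokenizer" idea described in the
# docstring above, written against the standard library tokenize module
# rather than the project's own tokenizer (an assumption made to keep the
# demo self-contained). Tokenizing the reversed line walks the source
# right-to-left; flipping each token string back yields readable values.
import io
import tokenize as std_tokenize

def reversed_path_tokens(line):
    # Tokenize the reversed line, then un-reverse each token string.
    readline = io.StringIO(line[::-1] + '\n').readline
    for tok in std_tokenize.generate_tokens(readline):
        if tok.type in (std_tokenize.NAME, std_tokenize.OP):
            yield tok.string[::-1]

print(list(reversed_path_tokens('foo.bar')))  # ['bar', '.', 'foo']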
def parse(code=None, path=None, grammar=None, error_recovery=True,
          start_symbol='file_input', cache=False, diff_cache=False):
    """
    If you want to parse a Python file you want to start here, most likely.

    If you need finer grained control over the parsed instance, there will be
    other ways to access it.

    :param code: A unicode string that contains Python code.
    :param path: The path to the file you want to open. Only needed for caching.
    :param grammar: A Python grammar file, created with load_grammar. If not
        provided, the grammar of the current Python version is used.
    :param error_recovery: If enabled, any code will be returned. If it is
        invalid, it will be returned as an error node. If disabled, you will
        get a ParseError when encountering syntax errors in your code.
    :param start_symbol: The grammar symbol that you want to parse. Only
        allowed to be used when error_recovery is disabled.
    :return: A syntax tree node. Typically the module.
    """
    if code is None and path is None:
        raise TypeError("Please provide either code or a path.")
    if grammar is None:
        grammar = load_grammar()

    if cache and not code and path is not None:
        # In this case we do actual caching. We just try to load it.
        module_node = load_module(grammar, path)
        if module_node is not None:
            return module_node

    if code is None:
        with open(path, 'rb') as f:
            code = source_to_unicode(f.read())

    if diff_cache and settings.fast_parser:
        try:
            module_cache_item = parser_cache[path]
        except KeyError:
            pass
        else:
            lines = splitlines(code, keepends=True)
            module_node = module_cache_item.node
            old_lines = module_cache_item.lines
            if old_lines == lines:
                save_module(grammar, path, module_node, lines, pickling=False)
                return module_node

            new_node = DiffParser(grammar, module_node).update(
                old_lines=old_lines,
                new_lines=lines
            )
            save_module(grammar, path, new_node, lines, pickling=cache)
            return new_node

    added_newline = not code.endswith('\n')
    lines = tokenize_lines = splitlines(code, keepends=True)
    if added_newline:
        code += '\n'
        tokenize_lines = list(tokenize_lines)
        tokenize_lines[-1] += '\n'
        tokenize_lines.append('')

    tokens = generate_tokens(tokenize_lines, use_exact_op_types=True)

    p = Parser(grammar, error_recovery=error_recovery, start_symbol=start_symbol)
    root_node = p.parse(tokens=tokens)
    if added_newline:
        _remove_last_newline(root_node)

    if cache or diff_cache:
        save_module(grammar, path, root_node, lines, pickling=cache)
    return root_node
def _get_token_list(string):
    io = StringIO(u(string))
    return list(tokenize.generate_tokens(io.readline))
def parse(code=None, path=None, grammar=None, error_recovery=True,
          start_symbol='file_input', cache=False, diff_cache=False):
    """
    If you want to parse a Python file you want to start here, most likely.

    If you need finer grained control over the parsed instance, there will be
    other ways to access it.

    :param code: A unicode string that contains Python code.
    :param path: The path to the file you want to open. Only needed for caching.
    :param grammar: A Python grammar file, created with load_grammar.
    :param error_recovery: If enabled, any code will be returned. If it is
        invalid, it will be returned as an error node. If disabled, you will
        get a ParseError when encountering syntax errors in your code.
    :param start_symbol: The grammar symbol that you want to parse. Only
        allowed to be used when error_recovery is disabled.
    :return: A syntax tree node. Typically the module.
    """
    if code is None and path is None:
        raise TypeError("Please provide either code or a path.")
    if grammar is None:
        grammar = load_grammar()

    if path is not None:
        path = os.path.expanduser(path)

    if cache and not code and path is not None:
        # In this case we do actual caching. We just try to load it.
        module_node = load_module(grammar, path)
        if module_node is not None:
            return module_node

    if code is None:
        with open(path, 'rb') as f:
            code = source_to_unicode(f.read())

    if diff_cache and settings.fast_parser:
        try:
            module_cache_item = parser_cache[path]
        except KeyError:
            pass
        else:
            lines = splitlines(code, keepends=True)
            module_node = module_cache_item.node
            old_lines = module_cache_item.lines
            if old_lines == lines:
                save_module(grammar, path, module_node, lines, pickling=False)
                return module_node

            new_node = DiffParser(grammar, module_node).update(
                old_lines=old_lines,
                new_lines=lines
            )
            save_module(grammar, path, new_node, lines, pickling=cache)
            return new_node

    added_newline = not code.endswith('\n')
    lines = tokenize_lines = splitlines(code, keepends=True)
    if added_newline:
        code += '\n'
        tokenize_lines = list(tokenize_lines)
        tokenize_lines[-1] += '\n'
        tokenize_lines.append('')

    tokens = generate_tokens(tokenize_lines, use_exact_op_types=True)

    p = Parser(grammar, error_recovery=error_recovery, start_symbol=start_symbol)
    root_node = p.parse(tokens=tokens)
    if added_newline:
        _remove_last_newline(root_node)

    if cache or diff_cache:
        save_module(grammar, path, root_node, lines, pickling=cache)
    return root_node
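# Minimal usage sketch for parse() above; assumes it is imported into the
# current scope, and the printed value is illustrative only. The file path in
# the second call is hypothetical.
tree = parse(code=u"def f():\n    return 1\n")
print(tree)  # the root node of the syntax tree, typically the module
tree_from_file = parse(path='example.py', cache=True)  # hypothetical path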