Example #1
 def check(literal):
     io = StringIO(u(literal))
     tokens = tokenize.generate_tokens(io.readline)
     token_list = list(tokens)
     typ, result_literal, _, _ = token_list[0]
     assert typ == STRING
     assert result_literal == literal
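check() above uses the project's own tokenizer, which yields (type, string, start_pos, prefix) 4-tuples, together with the u() compatibility helper. A stand-alone analogue with the standard library tokenizer (which yields 5-field TokenInfo tuples instead) could look like the sketch below; the helper name and the literals passed to it are illustrative only:

    import io
    import tokenize
    from token import STRING

    def check_literal(literal):
        # Tokenize the literal and make sure it comes back as a single STRING token.
        readline = io.StringIO(literal).readline
        first = next(tokenize.generate_tokens(readline))
        assert first.type == STRING
        assert first.string == literal

    check_literal('"""a one line docstring"""')
    check_literal("'spam'")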
Example #2
 def test_function_whitespace(self):
     # Test function definition whitespace identification
     fundef = dedent(
         u('''
     def test_whitespace(*args, **kwargs):
         x = 1
         if x > 0:
             print(True)
     '''))
     fundef_io = StringIO(fundef)
     tokens = tokenize.generate_tokens(fundef_io.readline)
     token_list = list(tokens)
     for _, value, _, prefix in token_list:
         if value == 'test_whitespace':
             assert prefix == ' '
         if value == '(':
             assert prefix == ''
         if value == '*':
             assert prefix == ''
         if value == '**':
             assert prefix == ' '
         if value == 'print':
             assert prefix == '        '
         if value == 'if':
             assert prefix == '    '
Example #3
 def test_function_whitespace(self):
     # Test function definition whitespace identification
     fundef = dedent(
         u(
             """
     def test_whitespace(*args, **kwargs):
         x = 1
         if x > 0:
             print(True)
     """
         )
     )
     fundef_io = StringIO(fundef)
     tokens = tokenize.generate_tokens(fundef_io.readline)
     token_list = list(tokens)
     for _, value, _, prefix in token_list:
         if value == "test_whitespace":
             assert prefix == " "
         if value == "(":
             assert prefix == ""
         if value == "*":
             assert prefix == ""
         if value == "**":
             assert prefix == " "
         if value == "print":
             assert prefix == "        "
         if value == "if":
             assert prefix == "    "
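The prefix checked in the test above (shown in two formatting variants) is the whitespace and comments preceding a token, which this tokenizer attaches to the token itself instead of emitting separate whitespace tokens. Assuming the parso package is installed (an assumption; the tests use a project-internal tokenizer), the same prefix concept is exposed on tree leaves through parso's public API:

    import parso

    module = parso.parse("def test_whitespace(*args, **kwargs):\n    pass\n")
    funcdef = module.children[0]
    name_leaf = funcdef.children[1]        # the Name leaf for 'test_whitespace'
    print(repr(name_leaf.prefix))          # ' '  -- the single space after 'def'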
Example #4
 def test_simple_no_whitespace(self):
     # Test a simple one line string, no preceding whitespace
     simple_docstring = u('"""simple one line docstring"""')
     simple_docstring_io = StringIO(simple_docstring)
     tokens = tokenize.generate_tokens(simple_docstring_io.readline)
     token_list = list(tokens)
     _, value, _, prefix = token_list[0]
     assert prefix == ""
     assert value == '"""simple one line docstring"""'
Example #5
 def test_simple_no_whitespace(self):
     # Test a simple one line string, no preceding whitespace
     simple_docstring = u('"""simple one line docstring"""')
     simple_docstring_io = StringIO(simple_docstring)
     tokens = tokenize.generate_tokens(simple_docstring_io.readline)
     token_list = list(tokens)
     _, value, _, prefix = token_list[0]
     assert prefix == ''
     assert value == '"""simple one line docstring"""'
Example #6
 def test_simple_with_whitespace(self):
     # Test a simple one line string with preceding whitespace and newline
     simple_docstring = u('  """simple one line docstring""" \r\n')
     simple_docstring_io = StringIO(simple_docstring)
     tokens = tokenize.generate_tokens(simple_docstring_io.readline)
     token_list = list(tokens)
     assert token_list[0][0] == INDENT
     typ, value, start_pos, prefix = token_list[1]
     assert prefix == '  '
     assert value == '"""simple one line docstring"""'
     assert typ == STRING
     typ, value, start_pos, prefix = token_list[2]
     assert prefix == ' '
     assert typ == NEWLINE
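For comparison, the standard library tokenizer reports the two leading spaces as the value of an INDENT token rather than as a prefix attached to the string token; a stdlib-only sketch of the same input:

    import io
    import tokenize
    from token import INDENT, STRING, NEWLINE

    toks = list(tokenize.generate_tokens(
        io.StringIO('  """simple one line docstring""" \r\n').readline))
    assert toks[0].type == INDENT and toks[0].string == '  '
    assert toks[1].type == STRING
    assert toks[2].type == NEWLINE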
Example #7
 def __init__(self, filename, stream=None):
     close_stream = None
     if stream is None:
         stream = open(filename)
         close_stream = stream.close
     self.filename = filename
     self.stream = stream
     self.generator = tokenize.generate_tokens(stream.readline)
     self.gettoken()  # Initialize lookahead
     self.dfas, self.startsymbol = self.parse()
     if close_stream is not None:
         close_stream()
     self.first = {}  # map from symbol name to set of tokens
     self.addfirstsets()
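The constructor above matches the pgen-style grammar generator: it tokenizes a grammar description file line by line, parses it into DFAs, and precomputes the FIRST sets. In the standard library's lib2to3.pgen2.pgen the enclosing class is ParserGenerator, wrapped by the module-level generate_grammar() helper. A hedged sketch (the grammar file path is illustrative, and lib2to3 is deprecated and absent from very recent Python versions):

    from lib2to3.pgen2 import pgen

    # Tokenizes and parses a pgen-format grammar file into parser tables.
    grammar = pgen.generate_grammar('Grammar.txt')
    print(sorted(grammar.symbol2number)[:5])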
Example #8
 def test_simple_with_whitespace(self):
     # Test a simple one line string with preceding whitespace and newline
     simple_docstring = u('  """simple one line docstring""" \r\n')
     simple_docstring_io = StringIO(simple_docstring)
     tokens = tokenize.generate_tokens(simple_docstring_io.readline)
     token_list = list(tokens)
     assert token_list[0][0] == INDENT
     typ, value, start_pos, prefix = token_list[1]
     assert prefix == "  "
     assert value == '"""simple one line docstring"""'
     assert typ == STRING
     typ, value, start_pos, prefix = token_list[2]
     assert prefix == " "
     assert typ == NEWLINE
Example #9
 def test_identifier_contains_unicode(self):
     fundef = dedent(u('''
     def 我あφ():
         pass
     '''))
     fundef_io = StringIO(fundef)
     tokens = tokenize.generate_tokens(fundef_io.readline)
     token_list = list(tokens)
     unicode_token = token_list[1]
     if is_py3:
         assert unicode_token[0] == NAME
     else:
         # Unicode tokens in Python 2 seem to be identified as operators.
         # They will be ignored in the parser, that's ok.
         assert unicode_token[0] == OP
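For reference, Python 3's standard library tokenizer also classifies such identifiers as NAME tokens (stand-alone sketch for illustration):

    import io
    import tokenize
    from token import NAME

    toks = list(tokenize.generate_tokens(io.StringIO('我あφ = 1\n').readline))
    assert toks[0].type == NAME and toks[0].string == '我あφ'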
Example #10
    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        l = iter(lines)
        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        # Strip everything after the last newline (lookbehind keeps the newline).
                        prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield TokenInfo(ENDMARKER, '',
                                    (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)
Example #11
 def _get_backwards_tokenizer(self, start_pos):
     line_gen = self._backwards_line_generator(start_pos)
     token_gen = tokenize.generate_tokens(lambda: next(line_gen))
     for typ, tok_str, tok_start_pos, prefix in token_gen:
         line = self.get_line(self._line_temp)
         # Calculate the real start_pos of the token.
         if tok_start_pos[0] == 1:
             # We are in the first checked line
             column = start_pos[1] - tok_start_pos[1]
         else:
             column = len(line) - tok_start_pos[1]
         # Multi-line docstrings must be accounted for.
         first_line = (tok_str.splitlines() or [''])[0]
         column -= len(first_line)
         # Reverse the token again, so that it is in normal order again.
         yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
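The generator above feeds reversed source lines to the tokenizer and then flips each token string and prefix back into normal order. The same idea in miniature, using the standard library tokenizer purely for illustration:

    import io
    import tokenize

    reversed_line = 'foo.bar'[::-1] + '\n'      # 'rab.oof\n'
    for tok in tokenize.generate_tokens(io.StringIO(reversed_line).readline):
        if tok.type == tokenize.NAME:
            print(tok.string[::-1])             # prints 'bar', then 'foo'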
Example #12
    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        l = iter(lines)
        tokens = generate_tokens(lambda: next(l, ''), use_exact_op_types=True)
        stack = self._active_parser.pgen_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            if typ == DEDENT:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield TokenInfo(ENDMARKER, '', (start_pos[0] + line_offset, 0), prefix)
                    break
            elif typ == NEWLINE and start_pos[0] >= until_line:
                yield TokenInfo(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._grammar, stack):
                    start_pos = start_pos[0] + 1, 0
                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield TokenInfo(DEDENT, '', start_pos, '')

                    yield TokenInfo(ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield TokenInfo(typ, string, start_pos, prefix)
Example #13
 def _get_backwards_tokenizer(self, start_pos, line_gen=None):
     if line_gen is None:
         line_gen = self._backwards_line_generator(start_pos)
     token_gen = tokenize.generate_tokens(lambda: next(line_gen))
     for typ, tok_str, tok_start_pos, prefix in token_gen:
         line = self.get_line(self._line_temp)
         # Calculate the real start_pos of the token.
         if tok_start_pos[0] == 1:
             # We are in the first checked line
             column = start_pos[1] - tok_start_pos[1]
         else:
             column = len(line) - tok_start_pos[1]
         # Multi-line docstrings must be accounted for.
         first_line = common.splitlines(tok_str)[0]
         column -= len(first_line)
         # Reverse the token again, so that it is in normal order again.
         yield typ, tok_str[::-1], (self._line_temp, column), prefix[::-1]
Example #14
 def test_function_whitespace(self):
     # Test function definition whitespace identification
     fundef = dedent(u('''
     def test_whitespace(*args, **kwargs):
         x = 1
         if x > 0:
             print(True)
     '''))
     fundef_io = StringIO(fundef)
     tokens = tokenize.generate_tokens(fundef_io.readline)
     token_list = list(tokens)
     for _, value, _, prefix in token_list:
         if value == 'test_whitespace':
             assert prefix == ' '
         if value == '(':
             assert prefix == ''
         if value == '*':
             assert prefix == ''
         if value == '**':
             assert prefix == ' '
         if value == 'print':
             assert prefix == '        '
         if value == 'if':
             assert prefix == '    '
Example #15
    def _calc_path_until_cursor(self, start_pos=None):
        """
        Something like a reverse tokenizer that tokenizes the reversed strings.
        """
        def fetch_line():
            if self._is_first:
                self._is_first = False
                self._line_length = self._column_temp
                line = first_line
            else:
                line = self.get_line(self._line_temp)
                self._line_length = len(line)
            line = '\n' + line

            # add lines with a backslash at the end
            while True:
                self._line_temp -= 1
                last_line = self.get_line(self._line_temp)
                if last_line and last_line[-1] == '\\':
                    line = last_line[:-1] + ' ' + line
                    self._line_length = len(last_line)
                else:
                    break
            return line[::-1]

        self._is_first = True
        self._line_temp, self._column_temp = start_cursor = start_pos
        first_line = self.get_line(self._line_temp)[:self._column_temp]

        open_brackets = ['(', '[', '{']
        close_brackets = [')', ']', '}']

        gen = PushBackIterator(tokenize.generate_tokens(fetch_line))
        string = u('')
        level = 0
        force_point = False
        last_type = None
        is_first = True
        for tok in gen:
            tok_type = tok.type
            tok_str = tok.string
            end = tok.end_pos
            self._column_temp = self._line_length - end[1]
            if is_first:
                if tok.start_pos != (1, 0):  # whitespace is not a path
                    return u(''), start_cursor
                is_first = False

            # print 'tok', token_type, tok_str, force_point
            if last_type == tok_type == tokenize.NAME:
                string += ' '

            if level > 0:
                if tok_str in close_brackets:
                    level += 1
                if tok_str in open_brackets:
                    level -= 1
            elif tok_str == '.':
                force_point = False
            elif force_point:
                # it is reversed, therefore a number is getting recognized
                # as a floating point number
                if tok_type == tokenize.NUMBER and tok_str[0] == '.':
                    force_point = False
                else:
                    break
            elif tok_str in close_brackets:
                level += 1
            elif tok_type in [tokenize.NAME, tokenize.STRING]:
                force_point = True
            elif tok_type == tokenize.NUMBER:
                pass
            else:
                if tok_str == '-':
                    next_tok = next(gen)
                    if next_tok.string == 'e':
                        gen.push_back(next_tok)
                    else:
                        break
                else:
                    break

            x = start_pos[0] - end[0] + 1
            l = self.get_line(x)
            l = first_line if x == start_pos[0] else l
            start_cursor = x, len(l) - end[1]
            string += tok_str
            last_type = tok_type

        # string can still contain spaces at the end
        return string[::-1].strip(), start_cursor
Example #16
def parse(code=None, path=None, grammar=None, error_recovery=True,
          start_symbol='file_input', cache=False, diff_cache=False):
    """
    If you want to parse a Python file you want to start here, most likely.

    If you need finer grained control over the parsed instance, there will be
    other ways to access it.

    :param code: A unicode string that contains Python code.
    :param path: The path to the file you want to open. Only needed for caching.
    :param grammar: A Python grammar file, created with load_grammar. You may
        omit it; in that case the grammar for the current Python version is used.
    :param error_recovery: If enabled, any code will be returned. If it is
        invalid, it will be returned as an error node. If disabled, you will
        get a ParseError when encountering syntax errors in your code.
    :param start_symbol: The grammar symbol that you want to parse. Only
        allowed to be used when error_recovery is disabled.

    :return: A syntax tree node. Typically the module.
    """
    if code is None and path is None:
        raise TypeError("Please provide either code or a path.")

    if grammar is None:
        grammar = load_grammar()

    if cache and not code and path is not None:
        # In this case we do actual caching. We just try to load it.
        module_node = load_module(grammar, path)
        if module_node is not None:
            return module_node

    if code is None:
        with open(path, 'rb') as f:
            code = source_to_unicode(f.read())

    if diff_cache and settings.fast_parser:
        try:
            module_cache_item = parser_cache[path]
        except KeyError:
            pass
        else:
            lines = splitlines(code, keepends=True)
            module_node = module_cache_item.node
            old_lines = module_cache_item.lines
            if old_lines == lines:
                save_module(grammar, path, module_node, lines, pickling=False)
                return module_node

            new_node = DiffParser(grammar, module_node).update(
                old_lines=old_lines,
                new_lines=lines
            )
            save_module(grammar, path, new_node, lines, pickling=cache)
            return new_node

    added_newline = not code.endswith('\n')
    lines = tokenize_lines = splitlines(code, keepends=True)
    if added_newline:
        code += '\n'
        tokenize_lines = list(tokenize_lines)
        tokenize_lines[-1] += '\n'
        tokenize_lines.append('')

    tokens = generate_tokens(tokenize_lines, use_exact_op_types=True)

    p = Parser(grammar, error_recovery=error_recovery, start_symbol=start_symbol)
    root_node = p.parse(tokens=tokens)
    if added_newline:
        _remove_last_newline(root_node)

    if cache or diff_cache:
        save_module(grammar, path, root_node, lines, pickling=cache)
    return root_node
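A minimal usage sketch for the parse() function above, assuming the module-level names it references (load_grammar, Parser, generate_tokens, and so on) are importable in the same module; the printed node type is illustrative:

    module = parse(code="def f(x):\n    return x + 1\n")
    print(module.children[0].type)    # e.g. 'funcdef'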
Example #17
def _get_token_list(string):
    io = StringIO(u(string))
    return list(tokenize.generate_tokens(io.readline))
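An illustrative call of the helper above, assuming the same StringIO/u/tokenize imports as the other snippets on this page; each item is a (type, value, start_pos, prefix) tuple as produced by this generate_tokens:

    for typ, value, start_pos, prefix in _get_token_list('x = 1\n'):
        print(typ, repr(value), start_pos, repr(prefix))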
Example #18
def parse(code=None, path=None, grammar=None, error_recovery=True,
          start_symbol='file_input', cache=False, diff_cache=False):
    """
    If you want to parse a Python file you want to start here, most likely.

    If you need finer grained control over the parsed instance, there will be
    other ways to access it.

    :param code: A unicode string that contains Python code.
    :param path: The path to the file you want to open. Only needed for caching.
    :param grammar: A Python grammar file, created with load_grammar.
    :param error_recovery: If enabled, any code will be returned. If it is
        invalid, it will be returned as an error node. If disabled, you will
        get a ParseError when encountering syntax errors in your code.
    :param start_symbol: The grammar symbol that you want to parse. Only
        allowed to be used when error_recovery is disabled.

    :return: A syntax tree node. Typically the module.
    """
    if code is None and path is None:
        raise TypeError("Please provide either code or a path.")

    if grammar is None:
        grammar = load_grammar()

    if path is not None:
        path = os.path.expanduser(path)

    if cache and not code and path is not None:
        # In this case we do actual caching. We just try to load it.
        module_node = load_module(grammar, path)
        if module_node is not None:
            return module_node

    if code is None:
        with open(path, 'rb') as f:
            code = source_to_unicode(f.read())

    if diff_cache and settings.fast_parser:
        try:
            module_cache_item = parser_cache[path]
        except KeyError:
            pass
        else:
            lines = splitlines(code, keepends=True)
            module_node = module_cache_item.node
            old_lines = module_cache_item.lines
            if old_lines == lines:
                save_module(grammar, path, module_node, lines, pickling=False)
                return module_node

            new_node = DiffParser(grammar, module_node).update(
                old_lines=old_lines,
                new_lines=lines
            )
            save_module(grammar, path, new_node, lines, pickling=cache)
            return new_node

    added_newline = not code.endswith('\n')
    lines = tokenize_lines = splitlines(code, keepends=True)
    if added_newline:
        code += '\n'
        tokenize_lines = list(tokenize_lines)
        tokenize_lines[-1] += '\n'
        tokenize_lines.append('')

    tokens = generate_tokens(tokenize_lines, use_exact_op_types=True)

    p = Parser(grammar, error_recovery=error_recovery, start_symbol=start_symbol)
    root_node = p.parse(tokens=tokens)
    if added_newline:
        _remove_last_newline(root_node)

    if cache or diff_cache:
        save_module(grammar, path, root_node, lines, pickling=cache)
    return root_node