def get_stack_at_position(grammar, code_lines, module_node, pos):
    """
    Returns the possible node names (e.g. import_from, xor_test or yield_stmt).
    """
    class EndMarkerReached(Exception):
        pass

    def tokenize_without_endmarker(code):
        # TODO This is for now not an official parso API that exists purely
        #      for Jedi.
        tokens = grammar._tokenize(code)
        for token_ in tokens:
            if token_.string == safeword:
                raise EndMarkerReached()
            elif token_.prefix.endswith(safeword):
                # This happens with comments.
                raise EndMarkerReached()
            else:
                yield token_

    # The code might be indented, just remove it.
    code = dedent(_get_code_for_stack(code_lines, module_node, pos))
    # We use a word to tell Jedi when we have reached the start of the
    # completion.
    # Use Z as a prefix because it's not part of a number suffix.
    safeword = 'ZZZ_USER_WANTS_TO_COMPLETE_HERE_WITH_JEDI'
    code = code + ' ' + safeword

    p = Parser(grammar._pgen_grammar, error_recovery=True)
    try:
        p.parse(tokens=tokenize_without_endmarker(code))
    except EndMarkerReached:
        return p.stack
    raise SystemError("This really shouldn't happen. There's a bug in Jedi.")
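
# The safeword above is a plain sentinel-token trick and is not tied to
# parso's internals. A minimal, runnable sketch of the same idea using the
# standard library's tokenize module (the safeword value and the sample
# snippet are invented for illustration):

import io
import tokenize

SAFEWORD = 'ZZZ_USER_WANTS_TO_COMPLETE_HERE'


def tokens_until_safeword(code):
    # Append the sentinel at the cursor position, then tokenize and stop as
    # soon as the sentinel shows up as a token.
    readline = io.StringIO(code + ' ' + SAFEWORD).readline
    collected = []
    for token in tokenize.generate_tokens(readline):
        if token.string == SAFEWORD:
            break
        collected.append(token)
    return collected


# Tokens seen before the (hypothetical) completion position:
for tok in tokens_until_safeword('import os\nos.pa'):
    print(tokenize.tok_name[tok.type], repr(tok.string))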
def _try_parse_part(self, until_line):
    """
    Sets up a normal parser that uses a specialized tokenizer to only parse
    until a certain position (or a bit longer if the statement hasn't
    ended).
    """
    self._parser_count += 1
    # TODO speed up, shouldn't copy the whole list all the time.
    # memoryview?
    parsed_until_line = self._nodes_stack.parsed_until_line
    lines_after = self._parser_lines_new[parsed_until_line:]
    tokens = self._diff_tokenize(lines_after, until_line,
                                 line_offset=parsed_until_line)
    self._active_parser = Parser(self._pgen_grammar, error_recovery=True)
    return self._active_parser.parse(tokens=tokens)
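
# The slicing and offset bookkeeping in _try_parse_part, as a standalone
# sketch; parsed_until_line and the line list are invented sample data:

# Lines of the new file, split parso-style (one entry per line, with a
# trailing empty string when the file ends in a newline).
new_lines = ['def f():\n', '    return 1\n', '\n',
             'def g():\n', '    return 2\n', '']

# Suppose everything up to and including line 3 is already parsed/copied.
parsed_until_line = 3

# The part parser only ever sees the tail of the file...
lines_after = new_lines[parsed_until_line:]
print(lines_after)  # ['def g():\n', '    return 2\n', '']

# ...and reports positions starting at (1, 0), so line_offset maps a
# relative line back to its absolute position in the file.
line_offset = parsed_until_line
relative_line = 1
print('absolute line:', relative_line + line_offset)  # 4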
class DiffParser(object):
    """
    An advanced form of parsing a file faster. Unfortunately comes with huge
    side effects. It changes the given module.
    """
    def __init__(self, pgen_grammar, tokenizer, module):
        self._pgen_grammar = pgen_grammar
        self._tokenizer = tokenizer
        self._module = module

    def _reset(self):
        self._copy_count = 0
        self._parser_count = 0

        self._nodes_stack = _NodesStack(self._module)

    def update(self, old_lines, new_lines):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.

        Returns the new module node.
        '''
        LOG.debug('diff parser start')
        # Reset the used names cache so they get regenerated.
        self._module._used_names = None

        self._parser_lines_new = new_lines

        self._reset()

        line_length = len(new_lines)
        sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
        opcodes = sm.get_opcodes()
        LOG.debug('diff parser calculated')
        LOG.debug('diff: line_lengths old: %s, new: %s'
                  % (len(old_lines), line_length))

        for operation, i1, i2, j1, j2 in opcodes:
            LOG.debug('diff code[%s] old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length and new_lines[-1] == '':
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # With this action all change will finally be applied and we have a
        # changed module.
        self._nodes_stack.close()

        last_pos = self._module.end_pos[0]
        if last_pos != line_length:
            current_lines = split_lines(self._module.get_code(), keepends=True)
            diff = difflib.unified_diff(current_lines, new_lines)
            raise Exception(
                "There's an issue (%s != %s) with the diff parser. "
                "Please report:\n%s"
                % (last_pos, line_length, ''.join(diff))
            )

        LOG.debug('diff parser end')
        return self._module

    def _enabled_debugging(self, old_lines, lines_new):
        if self._module.get_code() != ''.join(lines_new):
            LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines),
                        ''.join(lines_new))

    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        copied_nodes = [None]

        last_until_line = -1
        while until_line_new > self._nodes_stack.parsed_until_line:
            parsed_until_line_old = self._nodes_stack.parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._nodes_stack.parsed_until_line + 1)
            elif not copied_nodes:
                # We have copied as much as possible (but definitely not too
                # much). Therefore we just parse the rest.
                # We might not reach the end, because there's a statement
                # that is not finished.
                self._parse(until_line_new)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                copied_nodes = self._nodes_stack.copy_nodes(
                    p_children[index:],
                    until_line_old,
                    line_offset
                )
                # Match all the nodes that are in the wanted range.
                if copied_nodes:
                    self._copy_count += 1

                    from_ = copied_nodes[0].get_start_pos_of_prefix()[0] + line_offset
                    to = self._nodes_stack.parsed_until_line

                    LOG.debug('diff actually copy %s to %s', from_, to)
            # Since there are potential bugs that might loop here endlessly, we
            # just stop here.
            assert last_until_line != self._nodes_stack.parsed_until_line \
                or not copied_nodes, last_until_line
            last_until_line = self._nodes_stack.parsed_until_line

    def _get_old_line_stmt(self, old_line):
        leaf = self._module.get_leaf_for_position((old_line, 0),
                                                  include_prefixes=True)

        if _ends_with_newline(leaf):
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent
            return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _get_before_insertion_node(self):
        if self._nodes_stack.is_empty():
            return None

        line = self._nodes_stack.parsed_until_line + 1
        node = self._new_module.get_last_leaf()
        while True:
            parent = node.parent
            if parent.type in ('suite', 'file_input'):
                assert node.end_pos[0] <= line
                assert node.end_pos[1] == 0 or '\n' in self._prefix
                return node
            node = parent

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        last_until_line = 0
        while until_line > self._nodes_stack.parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = node.children

            self._nodes_stack.add_parsed_nodes(nodes)
            LOG.debug(
                'parse_part from %s to %s (to %s in part parser)',
                nodes[0].get_start_pos_of_prefix()[0],
                self._nodes_stack.parsed_until_line,
                node.end_pos[0] - 1
            )
            # Since the tokenizer sometimes has bugs, we cannot be sure that
            # this loop terminates. Therefore assert that there's always a
            # change.
            assert last_until_line != self._nodes_stack.parsed_until_line, last_until_line
            last_until_line = self._nodes_stack.parsed_until_line

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        parsed_until_line = self._nodes_stack.parsed_until_line
        lines_after = self._parser_lines_new[parsed_until_line:]
        tokens = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=parsed_until_line
        )
        self._active_parser = Parser(
            self._pgen_grammar,
            error_recovery=True
        )
        return self._active_parser.parse(tokens=tokens)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        tokens = self._tokenizer(lines, (1, 0))
        stack = self._active_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == PythonTokenTypes.INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            # In case of omitted_first_indent, it might not be dedented fully.
            # However this is a sign for us that a dedent happened.
            if typ == PythonTokenTypes.DEDENT \
                    or typ == PythonTokenTypes.ERROR_DEDENT \
                    and omitted_first_indent and len(indents) == 1:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix:
                        prefix = re.sub(r'(?<=\n)[^\n]+$', '', prefix)
                    else:
                        prefix = ''
                    yield PythonToken(
                        PythonTokenTypes.ENDMARKER, '',
                        (start_pos[0] + line_offset, 0), prefix
                    )
                    break
            elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line:
                yield PythonToken(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if suite_or_file_input_is_valid(self._pgen_grammar, stack):
                    start_pos = start_pos[0] + 1, 0

                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')

                    yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield PythonToken(typ, string, start_pos, prefix)
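
# The opcode loop in update() is plain difflib. A runnable sketch of how the
# line-based opcodes drive the copy-vs-parse decision (the file contents are
# invented):

import difflib

old_lines = ['import os\n', '\n', 'def f():\n', '    return 1\n', '']
new_lines = ['import os\n', '\n', 'def f():\n', '    return 2\n', '']

sm = difflib.SequenceMatcher(None, old_lines, new_lines)
for operation, i1, i2, j1, j2 in sm.get_opcodes():
    # 'equal' ranges can be copied from the old tree; 'replace' and 'insert'
    # ranges must be re-parsed; 'delete' needs no action at all.
    print(operation, 'old[%s:%s] new[%s:%s]' % (i1 + 1, i2, j1 + 1, j2))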
class DiffParser(object):
    """
    An advanced form of parsing a file faster. Unfortunately comes with huge
    side effects. It changes the given module.
    """
    def __init__(self, pgen_grammar, tokenizer, module):
        self._pgen_grammar = pgen_grammar
        self._tokenizer = tokenizer
        self._module = module

    def _reset(self):
        self._copy_count = 0
        self._parser_count = 0

        self._nodes_tree = _NodesTree(self._module)

    def update(self, old_lines, new_lines):
        '''
        The algorithm works as follows:

        Equal:
            - Assure that the start is a newline, otherwise parse until we get
              one.
            - Copy from parsed_until_line + 1 to max(i2 + 1)
            - Make sure that the indentation is correct (e.g. add DEDENT)
            - Add old and change positions
        Insert:
            - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
              much more.

        Returns the new module node.
        '''
        LOG.debug('diff parser start')
        # Reset the used names cache so they get regenerated.
        self._module._used_names = None

        self._parser_lines_new = new_lines

        self._reset()

        line_length = len(new_lines)
        sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
        opcodes = sm.get_opcodes()
        LOG.debug('line_lengths old: %s; new: %s'
                  % (len(old_lines), line_length))

        for operation, i1, i2, j1, j2 in opcodes:
            LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
                      operation, i1 + 1, i2, j1 + 1, j2)

            if j2 == line_length and new_lines[-1] == '':
                # The empty part after the last newline is not relevant.
                j2 -= 1

            if operation == 'equal':
                line_offset = j1 - i1
                self._copy_from_old_parser(line_offset, i2, j2)
            elif operation == 'replace':
                self._parse(until_line=j2)
            elif operation == 'insert':
                self._parse(until_line=j2)
            else:
                assert operation == 'delete'

        # With this action all change will finally be applied and we have a
        # changed module.
        self._nodes_tree.close()

        if DEBUG_DIFF_PARSER:
            # If there is reasonable suspicion that the diff parser is not
            # behaving well, this should be enabled.
            try:
                assert self._module.get_code() == ''.join(new_lines)
                _assert_valid_graph(self._module)
            except AssertionError:
                print(_get_debug_error_message(self._module, old_lines, new_lines))
                raise

        last_pos = self._module.end_pos[0]
        if last_pos != line_length:
            raise Exception(
                ('(%s != %s) ' % (last_pos, line_length))
                + _get_debug_error_message(self._module, old_lines, new_lines)
            )
        LOG.debug('diff parser end')
        return self._module

    def _enabled_debugging(self, old_lines, lines_new):
        if self._module.get_code() != ''.join(lines_new):
            LOG.warning('parser issue:\n%s\n%s', ''.join(old_lines),
                        ''.join(lines_new))

    def _copy_from_old_parser(self, line_offset, until_line_old, until_line_new):
        last_until_line = -1
        while until_line_new > self._nodes_tree.parsed_until_line:
            parsed_until_line_old = self._nodes_tree.parsed_until_line - line_offset
            line_stmt = self._get_old_line_stmt(parsed_until_line_old + 1)
            if line_stmt is None:
                # Parse 1 line at least. We don't need more, because we just
                # want to get into a state where the old parser has statements
                # again that can be copied (e.g. not lines within parentheses).
                self._parse(self._nodes_tree.parsed_until_line + 1)
            else:
                p_children = line_stmt.parent.children
                index = p_children.index(line_stmt)

                from_ = self._nodes_tree.parsed_until_line + 1
                copied_nodes = self._nodes_tree.copy_nodes(
                    p_children[index:],
                    until_line_old,
                    line_offset
                )
                # Match all the nodes that are in the wanted range.
                if copied_nodes:
                    self._copy_count += 1

                    to = self._nodes_tree.parsed_until_line

                    LOG.debug('copy old[%s:%s] new[%s:%s]',
                              copied_nodes[0].start_pos[0],
                              copied_nodes[-1].end_pos[0] - 1, from_, to)
                else:
                    # We have copied as much as possible (but definitely not
                    # too much). Therefore we just parse a bit more.
                    self._parse(self._nodes_tree.parsed_until_line + 1)
            # Since there are potential bugs that might loop here endlessly, we
            # just stop here.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _get_old_line_stmt(self, old_line):
        leaf = self._module.get_leaf_for_position((old_line, 0),
                                                  include_prefixes=True)

        if _ends_with_newline(leaf):
            leaf = leaf.get_next_leaf()
        if leaf.get_start_pos_of_prefix()[0] == old_line:
            node = leaf
            while node.parent.type not in ('file_input', 'suite'):
                node = node.parent

            # Make sure that if only the `else:` line of an if statement is
            # copied that not the whole thing is going to be copied.
            if node.start_pos[0] >= old_line:
                return node
        # Must be on the same line. Otherwise we need to parse that bit.
        return None

    def _parse(self, until_line):
        """
        Parses at least until the given line, but might just parse more until a
        valid state is reached.
        """
        last_until_line = 0
        while until_line > self._nodes_tree.parsed_until_line:
            node = self._try_parse_part(until_line)
            nodes = node.children

            self._nodes_tree.add_parsed_nodes(nodes)
            LOG.debug(
                'parse_part from %s to %s (to %s in part parser)',
                nodes[0].get_start_pos_of_prefix()[0],
                self._nodes_tree.parsed_until_line,
                node.end_pos[0] - 1
            )
            # Since the tokenizer sometimes has bugs, we cannot be sure that
            # this loop terminates. Therefore assert that there's always a
            # change.
            assert last_until_line != self._nodes_tree.parsed_until_line, last_until_line
            last_until_line = self._nodes_tree.parsed_until_line

    def _try_parse_part(self, until_line):
        """
        Sets up a normal parser that uses a specialized tokenizer to only parse
        until a certain position (or a bit longer if the statement hasn't
        ended).
        """
        self._parser_count += 1
        # TODO speed up, shouldn't copy the whole list all the time.
        # memoryview?
        parsed_until_line = self._nodes_tree.parsed_until_line
        lines_after = self._parser_lines_new[parsed_until_line:]
        tokens = self._diff_tokenize(
            lines_after,
            until_line,
            line_offset=parsed_until_line
        )
        self._active_parser = Parser(
            self._pgen_grammar,
            error_recovery=True
        )
        return self._active_parser.parse(tokens=tokens)

    def _diff_tokenize(self, lines, until_line, line_offset=0):
        is_first_token = True
        omitted_first_indent = False
        indents = []
        tokens = self._tokenizer(lines, (1, 0))
        stack = self._active_parser.stack
        for typ, string, start_pos, prefix in tokens:
            start_pos = start_pos[0] + line_offset, start_pos[1]
            if typ == PythonTokenTypes.INDENT:
                indents.append(start_pos[1])
                if is_first_token:
                    omitted_first_indent = True
                    # We want to get rid of indents that are only here because
                    # we only parse part of the file. These indents would only
                    # get parsed as error leafs, which doesn't make any sense.
                    is_first_token = False
                    continue
            is_first_token = False

            # In case of omitted_first_indent, it might not be dedented fully.
            # However this is a sign for us that a dedent happened.
            if typ == PythonTokenTypes.DEDENT \
                    or typ == PythonTokenTypes.ERROR_DEDENT \
                    and omitted_first_indent and len(indents) == 1:
                indents.pop()
                if omitted_first_indent and not indents:
                    # We are done here, only thing that can come now is an
                    # endmarker or another dedented code block.
                    typ, string, start_pos, prefix = next(tokens)
                    if '\n' in prefix or '\r' in prefix:
                        prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
                    else:
                        assert start_pos[1] >= len(prefix), repr(prefix)
                        if start_pos[1] - len(prefix) == 0:
                            prefix = ''
                    yield PythonToken(
                        PythonTokenTypes.ENDMARKER, '',
                        (start_pos[0] + line_offset, 0), prefix
                    )
                    break
            elif typ == PythonTokenTypes.NEWLINE and start_pos[0] >= until_line:
                yield PythonToken(typ, string, start_pos, prefix)
                # Check if the parser is actually in a valid suite state.
                if _suite_or_file_input_is_valid(self._pgen_grammar, stack):
                    start_pos = start_pos[0] + 1, 0

                    while len(indents) > int(omitted_first_indent):
                        indents.pop()
                        yield PythonToken(PythonTokenTypes.DEDENT, '', start_pos, '')

                    yield PythonToken(PythonTokenTypes.ENDMARKER, '', start_pos, '')
                    break
                else:
                    continue

            yield PythonToken(typ, string, start_pos, prefix)
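
# The re.sub call above trims a token prefix back to its last newline, so the
# synthesized endmarker keeps preceding comment lines but not the indentation
# of the current line. The same expression in isolation (the sample prefix is
# invented):

import re

# A prefix as the tokenizer reports it: whitespace and comments that sit in
# front of a token.
prefix = '# a comment\n    '
if '\n' in prefix or '\r' in prefix:
    # Drop everything after the last newline, keeping the comment line.
    prefix = re.sub(r'[^\n\r]+\Z', '', prefix)
print(repr(prefix))  # '# a comment\n'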
def update(self, old_lines, new_lines):
    '''
    The algorithm works as follows:

    Equal:
        - Assure that the start is a newline, otherwise parse until we get
          one.
        - Copy from parsed_until_line + 1 to max(i2 + 1)
        - Make sure that the indentation is correct (e.g. add DEDENT)
        - Add old and change positions
    Insert:
        - Parse from parsed_until_line + 1 to min(j2 + 1), hopefully not
          much more.

    Returns the new module node.
    '''
    LOG.debug('diff parser start')
    # Reset the used names cache so they get regenerated.
    self._module._used_names = None

    self._parser_lines_new = new_lines

    self._reset()

    line_length = len(new_lines)
    sm = difflib.SequenceMatcher(None, old_lines, self._parser_lines_new)
    opcodes = sm.get_opcodes()
    LOG.debug('line_lengths old: %s; new: %s' % (len(old_lines), line_length))

    for operation, i1, i2, j1, j2 in opcodes:
        LOG.debug('-> code[%s] old[%s:%s] new[%s:%s]',
                  operation, i1 + 1, i2, j1 + 1, j2)

        if j2 == line_length and new_lines[-1] == '':
            # The empty part after the last newline is not relevant.
            j2 -= 1

        if operation == 'equal':
            line_offset = j1 - i1
            self._copy_from_old_parser(line_offset, i1 + 1, i2, j2)
        elif operation == 'replace':
            self._parse(until_line=j2)
        elif operation == 'insert':
            self._parse(until_line=j2)
        else:
            assert operation == 'delete'

    # With this action all change will finally be applied and we have a
    # changed module.
    self._nodes_tree.close()

    if DEBUG_DIFF_PARSER:
        # If there is reasonable suspicion that the diff parser is not
        # behaving well, this should be enabled.
        try:
            code = ''.join(new_lines)
            assert self._module.get_code() == code
            _assert_valid_graph(self._module)
            without_diff_parser_module = Parser(
                self._pgen_grammar,
                error_recovery=True
            ).parse(self._tokenizer(new_lines))
            _assert_nodes_are_equal(self._module, without_diff_parser_module)
        except AssertionError:
            print(_get_debug_error_message(self._module, old_lines, new_lines))
            raise

    last_pos = self._module.end_pos[0]
    if last_pos != line_length:
        raise Exception(
            ('(%s != %s) ' % (last_pos, line_length))
            + _get_debug_error_message(self._module, old_lines, new_lines))

    LOG.debug('diff parser end')
    return self._module
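
# From the outside, the diff parser is normally reached through parso's
# caching layer rather than instantiated directly. A sketch of round-tripping
# an edit, assuming a recent parso release where Grammar.parse accepts the
# cache/diff_cache flags; the temporary path is invented, and whether the
# second call actually takes the diff path depends on parso's cache
# behaviour:

import os
import tempfile

import parso

old_code = 'def f():\n    return 1\n'
new_code = 'def f():\n    return 2\n'

# Use a real file path so the cache has a modification time to key on.
with tempfile.TemporaryDirectory() as tmp:
    path = os.path.join(tmp, 'example.py')
    with open(path, 'w') as f:
        f.write(old_code)

    grammar = parso.load_grammar()
    grammar.parse(old_code, path=path, cache=True, diff_cache=True)

    # Re-parse after an edit; with diff_cache the cached tree is updated in
    # place instead of being rebuilt from scratch.
    module = grammar.parse(new_code, path=path, cache=True, diff_cache=True)

    # The invariant the DEBUG_DIFF_PARSER block asserts: an exact round-trip
    # of the new source.
    assert module.get_code() == new_code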