def _parse_file(self, file: AnnotationManager) -> None:
    """
    Annotate source code with `RawTokenAnnotation`-s.

    Given the source text and the corresponding UAST, this function covers all code with
    `RawTokenAnnotation`-s.

    :param file: Source code annotated with `UASTAnnotation`.
    """
    # TODO(zurk): rename this function when the refactoring is finished.
    contents = file.sequence
    # build the line mapping
    lines = contents.splitlines(keepends=True)
    # Check if there is a newline at the end of the file. Yes, you can just check
    # lines[-1][-1] == "\n", but if someone decides to use the weird '\u2028' unicode
    # character for a new line, that condition gives the wrong result.
    eof_new_line = lines[-1].splitlines()[0] != lines[-1]
    if eof_new_line:
        # We add the last line as an empty one because it actually exists, but
        # .splitlines() does not return it.
        lines.append("")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line)
    line_offsets[-1] = pos + 1

    # walk the tree: collect nodes with assigned tokens
    node_tokens = []
    queue = [file.get(UASTAnnotation).uast]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            self.node_fixtures[node.internal_type](node)
        queue.extend(node.children)
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)

    # scan `node_tokens` and fill the gaps with imaginary nodes
    pos = 0
    parser = self.tokens.PARSER
    for node in node_tokens:
        if node.start_position.offset < pos:
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                offsets = []
                for suboff in (match.start(), match.end()):
                    offsets.append(pos + suboff)
                token = match.group()
                sumlen += len(token)
                file.add(RawTokenAnnotation(*offsets))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        uast_node_annot = list(
            VirtualNode.from_node(node, contents, self.token_unwrappers))
        file.add(*uast_node_annot)
        pos = node.end_position.offset
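
# A minimal, standalone sketch (not part of the class above) of why the
# `lines[-1].splitlines()[0] != lines[-1]` check is used instead of testing for a
# literal "\n": str.splitlines() recognizes every Unicode line boundary, including
# '\u2028' (LINE SEPARATOR), which a plain "\n" comparison would miss.
for demo_contents in ("a\nb\n", "a\nb", "a\nb\u2028"):
    demo_lines = demo_contents.splitlines(keepends=True)
    # If the last physical line still carries a line boundary, splitting it again
    # strips that boundary, so the result differs from the line itself.
    eof_new_line = demo_lines[-1].splitlines()[0] != demo_lines[-1]
    print(repr(demo_contents), "->", eof_new_line)
# prints: 'a\nb\n' -> True, 'a\nb' -> False, 'a\nb\u2028' -> True
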
def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
        Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
    """
    Parse a file into a sequence of `VirtualNode`-s and a mapping from node to parent.

    Given the source text and the corresponding UAST, this function compiles the list of
    `VirtualNode`-s and the parents mapping. That list of nodes reproduces the original
    source text bit-to-bit after `"".join(n.value for n in nodes)`. `parents` maps from
    `id(node)` to its parent `bblfsh.Node`.

    :param contents: Source file text.
    :param root: UAST root node.
    :param path: Path to the file, used for debugging.
    :return: List of `VirtualNode`-s and the parents mapping.
    """
    # build the line mapping
    lines = contents.splitlines(keepends=True)
    # Check if there is a newline at the end of the file. Yes, you can just check
    # lines[-1][-1] == "\n", but if someone decides to use the weird '\u2028' unicode
    # character for a new line, that condition gives the wrong result.
    eof_new_line = lines[-1].splitlines()[0] != lines[-1]
    if eof_new_line:
        # We add the last line as an empty one because it actually exists, but
        # .splitlines() does not return it.
        lines.append("")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line)
    line_offsets[-1] = pos + 1

    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            node = self.node_fixtures[node.internal_type](node)
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)

    # scan `node_tokens` and fill the gaps with imaginary nodes
    result = []
    pos = 0
    parser = self.tokens.PARSER
    searchsorted = numpy.searchsorted
    for node in node_tokens:
        if node.start_position.offset < pos:
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                positions = []
                for suboff in (match.start(), match.end()):
                    offset = pos + suboff
                    line = searchsorted(line_offsets, offset, side="right")
                    col = offset - line_offsets[line - 1] + 1
                    positions.append(Position(offset, line, col))
                token = match.group()
                sumlen += len(token)
                result.append(VirtualNode(token, *positions, path=path))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
        pos = node.end_position.offset
    return result, parents
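
# A minimal, standalone sketch (names `demo_*` are hypothetical, not the library's API)
# of the offset-to-position conversion used above: `line_offsets[i]` holds the absolute
# offset of the first character of line `i`, so numpy.searchsorted(..., side="right")
# yields the 1-based line containing a given offset, and the column is the distance
# from the start of that line plus one.
import numpy

demo_contents = "first\nsecond\n"
demo_lines = demo_contents.splitlines(keepends=True)
demo_offsets = numpy.zeros(len(demo_lines) + 1, dtype=numpy.int32)
pos = 0
for i, line in enumerate(demo_lines):
    demo_offsets[i] = pos
    pos += len(line)
demo_offsets[-1] = pos + 1  # sentinel entry so offsets on the last line are searchable

offset = demo_contents.index("second")                        # absolute offset 6
line = numpy.searchsorted(demo_offsets, offset, side="right")  # 1-based line: 2
col = offset - demo_offsets[line - 1] + 1                      # 1-based column: 1
print(line, col)  # 2 1
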