def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
        Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
    """
    Parse a file into a sequence of `VirtualNode`-s and a mapping from VirtualNode to parent.

    Given the source text and the corresponding UAST this function compiles the list of
    `VirtualNode`-s and the parents mapping. That list of nodes equals the original source
    text bit-to-bit after `"".join(n.value for n in nodes)`. `parents` maps from `id(node)`
    to its parent `bblfsh.Node`.

    :param contents: source file text
    :param root: UAST root node
    :param path: path to the file, used for debugging
    :return: list of `VirtualNode`-s and the parents.
    """
    # build the line mapping
    lines = contents.splitlines(keepends=True)
    # Check whether there is a newline at the end of the file. Yes, you could just check
    # lines[-1][-1] == "\n", but if someone decides to use the weird '\u2028' unicode
    # character for a new line, that condition gives the wrong result.
    eof_new_line = lines[-1].splitlines()[0] != lines[-1]
    if eof_new_line:
        # We add the last line as an empty one because it actually exists,
        # but .splitlines() does not return it.
        lines.append("")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line)
    line_offsets[-1] = pos + 1
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            node = self.node_fixtures[node.internal_type](node)
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position
                and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)
    # scan `node_tokens` and fill the gaps with imaginary nodes
    result = []
    pos = 0
    parser = self.tokens.PARSER
    searchsorted = numpy.searchsorted
    for node in node_tokens:
        if node.start_position.offset < pos:
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                positions = []
                for suboff in (match.start(), match.end()):
                    offset = pos + suboff
                    line = searchsorted(line_offsets, offset, side="right")
                    col = offset - line_offsets[line - 1] + 1
                    positions.append(Position(offset, line, col))
                token = match.group()
                sumlen += len(token)
                result.append(VirtualNode(token, *positions, path=path))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
        pos = node.end_position.offset
    return result, parents
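# Standalone sketch, not part of the original code: how the `line_offsets` table built in
# `_parse_file` turns a character offset into a 1-based (line, column) pair with
# `numpy.searchsorted`, exactly as the gap-filling loop above does. The helper name
# `offset_to_position` is hypothetical and exists only for illustration.
import numpy


def offset_to_position(offset: int, line_offsets: numpy.ndarray) -> tuple:
    # line_offsets[i] is the offset of the first character of the (i + 1)-th line,
    # so searchsorted(..., side="right") yields the 1-based line number.
    line = int(numpy.searchsorted(line_offsets, offset, side="right"))
    col = int(offset - line_offsets[line - 1] + 1)
    return line, col


# tiny demo: the contents deliberately do not end with a newline, so the extra empty line
# appended by `_parse_file` for newline-terminated files is not needed here
demo_contents = "first\nsecond"
demo_lines = demo_contents.splitlines(keepends=True)
demo_offsets = numpy.zeros(len(demo_lines) + 1, dtype=numpy.int32)
demo_pos = 0
for demo_i, demo_line in enumerate(demo_lines):
    demo_offsets[demo_i] = demo_pos
    demo_pos += len(demo_line)
demo_offsets[-1] = demo_pos + 1
# offset 6 points at the "s" of "second", i.e. line 2, column 1
assert offset_to_position(6, demo_offsets) == (2, 1)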
def _parse_file(self, file: AnnotationManager) -> None:
    """
    Annotate source code with `RawTokenAnnotation`-s.

    Given the source text and the corresponding UAST this function covers all code with
    `RawTokenAnnotation`-s.

    :param file: Source code annotated with `UASTAnnotation`.
    """
    # TODO(zurk): rename this function when the refactoring is finished.
    contents = file.sequence
    # build the line mapping
    lines = contents.splitlines(keepends=True)
    # Check whether there is a newline at the end of the file. Yes, you could just check
    # lines[-1][-1] == "\n", but if someone decides to use the weird '\u2028' unicode
    # character for a new line, that condition gives the wrong result.
    eof_new_line = lines[-1].splitlines()[0] != lines[-1]
    if eof_new_line:
        # We add the last line as an empty one because it actually exists,
        # but .splitlines() does not return it.
        lines.append("")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line)
    line_offsets[-1] = pos + 1
    # walk the tree: collect nodes with assigned tokens
    node_tokens = []
    queue = [file.get(UASTAnnotation).uast]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            self.node_fixtures[node.internal_type](node)
        queue.extend(node.children)
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position
                and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)
    # scan `node_tokens` and fill the gaps with imaginary nodes
    pos = 0
    parser = self.tokens.PARSER
    for node in node_tokens:
        if node.start_position.offset < pos:
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                offsets = []
                for suboff in (match.start(), match.end()):
                    offsets.append(pos + suboff)
                token = match.group()
                sumlen += len(token)
                file.add(RawTokenAnnotation(*offsets))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        uast_node_annot = list(
            VirtualNode.from_node(node, contents, self.token_unwrappers))
        file.add(*uast_node_annot)
        pos = node.end_position.offset
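# Standalone sketch, not part of the original code, of why the end-of-file newline check
# above compares `lines[-1].splitlines()[0]` with `lines[-1]` instead of testing
# `lines[-1][-1] == "\n"`: str.splitlines() also recognizes separators such as
# '\u2028' (LINE SEPARATOR), while the naive comparison does not.
for sample in ("last line\n", "last line\u2028", "last line"):
    sample_lines = sample.splitlines(keepends=True)
    splitlines_check = sample_lines[-1].splitlines()[0] != sample_lines[-1]
    naive_check = sample_lines[-1][-1] == "\n"
    print(repr(sample), splitlines_check, naive_check)
# prints True True, True False and False False respectively: only the splitlines-based
# check detects the '\u2028' line terminator.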
def analyze_uast(path: str, root: bblfsh.Node, roles: set, reserved: set):
    contents = Path(path).read_text()
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = contents.count("\n")
    node_tokens.append(sentinel)
    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and not ord(char) >= 128

    for node in node_tokens:
        token = node.token if node.token else \
            contents[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = contents[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    continue
                # for keyword in alpha.finditer(part):
                #     reserved.add(keyword.group())
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = contents[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning("skipped %s, token offset corruption \"%s\" vs. \"%s\"",
                        path, token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
def tokenize_code(self, path: str, contents: str = None, root: bblfsh.Node = None) -> \
        Tuple[List[VirtualNode], Dict[int, bblfsh.Node], bblfsh.Node]:
    """
    Parse a file into a sequence of `VirtualNode`-s and a mapping from VirtualNode to parent.

    Given the source text and the corresponding UAST this function compiles the list of
    `VirtualNode`-s and the parents mapping. That list of nodes equals the original source
    text bit-to-bit after `"".join(n.value for n in nodes)`. `parents` maps from `id(node)`
    to its parent `bblfsh.Node`.

    :param path: path to the file.
    :param contents: source file text; if not provided, the path will be used to read
                     the content.
    :param root: UAST root node. If None, the file will be parsed using the bblfsh client.
    :return: list of `VirtualNode`-s, the parents and the root.
    """
    if contents is None:
        contents = self.client._get_contents(contents=contents, filename=path)
    if root is None:
        root = self.client.parse(filename=path, contents=contents).uast
    # build the line mapping
    contents = contents.decode()
    lines = contents.split("\n")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line) + 1
    line_offsets[-1] = pos
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        if node.internal_type in self.node_fixtures:
            node = self.node_fixtures[node.internal_type](node)
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if (node.token or node.start_position and node.end_position
                and node.start_position != node.end_position
                and not node.children):
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)
    # scan `node_tokens` and fill the gaps with imaginary nodes
    result = []
    pos = 0
    parser = self.tokens.PARSER
    searchsorted = numpy.searchsorted
    for node in node_tokens:
        if node.start_position.offset < pos:
            continue
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                positions = []
                for suboff in (match.start(), match.end()):
                    offset = pos + suboff
                    line = searchsorted(line_offsets, offset, side="right")
                    col = offset - line_offsets[line - 1] + 1
                    positions.append(Position(offset, line, col))
                token = match.group()
                sumlen += len(token)
                result.append(VirtualNode(token, *positions, path=path))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
        pos = node.end_position.offset
    return result, parents, root
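# Simplified, self-contained sketch of the sentinel trick shared by the functions above,
# not the original implementation: appending a fake node whose start offset equals
# len(contents) makes the text after the last real token go through the same
# "fill the gap before the node" branch, so no trailing special case is needed.
# `token_spans` and the regex are made up for the demo; the real code uses UAST nodes
# and `self.tokens.PARSER`.
import re

demo_code = "x = 1  # tail comment\n"
token_spans = [(0, 1), (4, 5)]                  # offsets of the UAST-backed tokens "x" and "1"
sentinel_span = (len(demo_code), len(demo_code))
gap_parser = re.compile(r"\s+|\w+|[^\w\s]")     # crude stand-in for self.tokens.PARSER

pieces = []
scan_pos = 0
for start, end in token_spans + [sentinel_span]:
    if start > scan_pos:
        # the gap before this node is split into "imaginary" tokens
        pieces.extend(m.group() for m in gap_parser.finditer(demo_code[scan_pos:start]))
    if (start, end) == sentinel_span:
        break
    pieces.append(demo_code[start:end])
    scan_pos = end
# the concatenation reproduces the source bit-to-bit, as the docstrings above promise
assert "".join(pieces) == demo_code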
def analyze_uast(path: str, content: str, root: bblfsh.Node, internal_types: dict,
                 roles: dict, reserved: set):
    """
    Fill the internal types and roles dictionaries and the reserved set with statistics
    computed from a UAST.

    :param path: Path of the analyzed file.
    :param content: Content of the analyzed file.
    :param root: UAST of the analyzed file.
    :param internal_types: Dictionary containing the internal types statistics.
    :param roles: Dictionary containing the roles statistics.
    :param reserved: Set containing the reserved (or token) statistics.
    """
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        internal_types[node.internal_type] += 1
        for role in node.roles:
            roles[role] += 1
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(content)
    sentinel.start_position.line = content.count("\n")
    node_tokens.append(sentinel)
    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and not ord(char) >= 128

    for node in node_tokens:
        token = node.token if node.token else \
            content[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = content[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    log.debug("Skipping weird part in code: %s. Path: %s", diff, path)
                    continue
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = content[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning("skipped %s, token offset corruption \"%s\" vs. \"%s\"",
                        path, token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
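# Hedged usage sketch, not from the original code base: `analyze_uast` mutates the passed
# collections in place, so `internal_types` and `roles` must support `+= 1` on missing keys
# (e.g. collections.Counter), while `reserved` only needs `.add()` (a plain set). Obtaining
# `root` is assumed to go through a running babelfish server; "example.js" and the endpoint
# below are placeholders.
from collections import Counter
from pathlib import Path

import bblfsh

client = bblfsh.BblfshClient("0.0.0.0:9432")
sample_path = "example.js"
sample_content = Path(sample_path).read_text()
sample_root = client.parse(sample_path).uast
internal_types, roles, reserved = Counter(), Counter(), set()
analyze_uast(sample_path, sample_content, sample_root, internal_types, roles, reserved)
print(internal_types.most_common(5), roles.most_common(5), sorted(reserved))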
def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
        Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
    """
    Given the source text and the corresponding UAST this function compiles the list of
    `VirtualNode`-s and the parents mapping. That list of nodes equals the original source
    text bit-to-bit after `"".join(n.value for n in nodes)`. `parents` maps from `id(node)`
    to its parent `bblfsh.Node`.

    :param contents: source file text
    :param root: UAST root node
    :param path: path to the file, used for debugging
    :return: list of `VirtualNode`-s and the parents.
    """
    # build the line mapping
    lines = contents.split("\n")
    line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
    pos = 0
    for i, line in enumerate(lines):
        line_offsets[i] = pos
        pos += len(line) + 1
    line_offsets[-1] = pos
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = len(lines)
    node_tokens.append(sentinel)
    # scan `node_tokens` and fill the gaps with imaginary nodes
    result = []
    pos = 0
    parser = self.tokens.PARSER
    searchsorted = numpy.searchsorted
    for node in node_tokens:
        if node.start_position.offset > pos:
            sumlen = 0
            diff = contents[pos:node.start_position.offset]
            for match in parser.finditer(diff):
                positions = []
                for suboff in (match.start(), match.end()):
                    offset = pos + suboff
                    line = searchsorted(line_offsets, offset, side="right")
                    col = offset - line_offsets[line - 1] + 1
                    positions.append(Position(offset, line, col))
                token = match.group()
                sumlen += len(token)
                result.append(VirtualNode(token, *positions, path=path))
            assert sumlen == node.start_position.offset - pos, \
                "missed some imaginary tokens: \"%s\"" % diff
        if node is sentinel:
            break
        result.extend(VirtualNode.from_node(node, contents, path))
        pos = node.end_position.offset
    return result, parents
def SerializeToString(self):
    # Serialize an empty bblfsh.Node, delegating to the protobuf method of the same name.
    return bblfsh.Node().SerializeToString()