Example #1
    def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
            Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
        """
        Parse a file into a sequence of `VirtualNode`-s and a mapping from node to its parent.

        Given the source text and the corresponding UAST, this function compiles the list of
        `VirtualNode`-s and the parents mapping. The list of nodes reproduces the original
        source text bit-for-bit after `"".join(n.value for n in nodes)`. `parents` maps
        `id(node)` to its parent `bblfsh.Node`.

        :param contents: source file text
        :param root: UAST root node
        :param path: path to the file, used for debugging
        :return: list of `VirtualNode`-s and the parents.
        """
        # build the line mapping
        lines = contents.splitlines(keepends=True)
        # Check if there is a newline at the end of the file. Simply checking
        # lines[-1][-1] == "\n" is not enough: if someone decides to use the weird '\u2028'
        # Unicode character as a line separator, that condition gives the wrong result.
        eof_new_line = lines[-1].splitlines()[0] != lines[-1]
        if eof_new_line:
            # We add the last line as an empty one because it actually exists, but
            # .splitlines() does not return it.
            lines.append("")
        line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
        pos = 0
        for i, line in enumerate(lines):
            line_offsets[i] = pos
            pos += len(line)
        line_offsets[-1] = pos + 1

        # walk the tree: collect nodes with assigned tokens and build the parents map
        node_tokens = []
        parents = {}
        queue = [root]
        while queue:
            node = queue.pop()
            if node.internal_type in self.node_fixtures:
                node = self.node_fixtures[node.internal_type](node)
            for child in node.children:
                parents[id(child)] = node
            queue.extend(node.children)
            if (node.token or node.start_position and node.end_position
                    and node.start_position != node.end_position and not node.children):
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        result = []
        pos = 0
        parser = self.tokens.PARSER
        searchsorted = numpy.searchsorted
        for node in node_tokens:
            if node.start_position.offset < pos:
                continue
            if node.start_position.offset > pos:
                sumlen = 0
                diff = contents[pos:node.start_position.offset]
                for match in parser.finditer(diff):
                    positions = []
                    for suboff in (match.start(), match.end()):
                        offset = pos + suboff
                        line = searchsorted(line_offsets, offset, side="right")
                        col = offset - line_offsets[line - 1] + 1
                        positions.append(Position(offset, line, col))
                    token = match.group()
                    sumlen += len(token)
                    result.append(VirtualNode(token, *positions, path=path))
                assert sumlen == node.start_position.offset - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
            pos = node.end_position.offset
        return result, parents
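
The gap-filling pass above converts absolute offsets into line/column positions with `numpy.searchsorted` over the precomputed `line_offsets` array. A minimal, self-contained sketch of just that technique (sample text and names are illustrative, not from the original module):

import numpy

contents = "first\nsecond\nthird\n"
lines = contents.splitlines(keepends=True)
# line_offsets[i] is the offset of the first character of line i.
line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
pos = 0
for i, line in enumerate(lines):
    line_offsets[i] = pos
    pos += len(line)
line_offsets[-1] = pos + 1

offset = contents.index("second")
# side="right" returns the index of the first line starting *after* the offset,
# which is exactly the 1-based line number.
line = int(numpy.searchsorted(line_offsets, offset, side="right"))
col = offset - line_offsets[line - 1] + 1
print(line, col)  # -> 2 1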
Example #2
    def _parse_file(self, file: AnnotationManager) -> None:
        """
        Annotate source code with `RawTokenAnnotation`-s.

        Given the source text and the corresponding UAST, this function covers all the code
        with `RawTokenAnnotation`-s.

        :param file: Source code annotated with `UASTAnnotation`.
        """
        # TODO(zurk): rename this function when the refactoring is finished.
        contents = file.sequence
        # build the line mapping
        lines = contents.splitlines(keepends=True)
        # Check if there is a newline at the end of the file. Simply checking
        # lines[-1][-1] == "\n" is not enough: if someone decides to use the weird '\u2028'
        # Unicode character as a line separator, that condition gives the wrong result.
        eof_new_line = lines[-1].splitlines()[0] != lines[-1]
        if eof_new_line:
            # We add the last line as an empty one because it actually exists, but
            # .splitlines() does not return it.
            lines.append("")
        line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
        pos = 0
        for i, line in enumerate(lines):
            line_offsets[i] = pos
            pos += len(line)
        line_offsets[-1] = pos + 1

        # walk the tree: collect nodes with assigned tokens
        node_tokens = []
        queue = [file.get(UASTAnnotation).uast]
        while queue:
            node = queue.pop()
            if node.internal_type in self.node_fixtures:
                self.node_fixtures[node.internal_type](node)
            queue.extend(node.children)
            if (node.token or node.start_position and node.end_position
                    and node.start_position != node.end_position
                    and not node.children):
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        pos = 0
        parser = self.tokens.PARSER
        for node in node_tokens:
            if node.start_position.offset < pos:
                continue
            if node.start_position.offset > pos:
                sumlen = 0
                diff = contents[pos:node.start_position.offset]
                for match in parser.finditer(diff):
                    offsets = []
                    for suboff in (match.start(), match.end()):
                        offsets.append(pos + suboff)
                    token = match.group()
                    sumlen += len(token)
                    file.add(RawTokenAnnotation(*offsets))
                assert sumlen == node.start_position.offset - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            uast_node_annot = list(
                VirtualNode.from_node(node, contents, self.token_unwrappers))
            file.add(*uast_node_annot)
            pos = node.end_position.offset
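
Both examples detect the end-of-file newline with a double `splitlines` instead of `endswith("\n")`, so exotic separators such as '\u2028' are handled too. A standalone illustration of the check:

for text in ("last line\n", "last line", "last line\u2028"):
    lines = text.splitlines(keepends=True)
    # The last chunk shrinks under a second splitlines() only when it still
    # carries a trailing line separator of any kind.
    eof_new_line = lines[-1].splitlines()[0] != lines[-1]
    print(eof_new_line)  # True, False, True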
Example #3
def analyze_uast(path: str, root: bblfsh.Node, roles: set, reserved: set):
    contents = Path(path).read_text()

    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(contents)
    sentinel.start_position.line = contents.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            contents[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = contents[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    continue
                # for keyword in alpha.finditer(part):
                #    reserved.add(keyword.group())
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = contents[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning(
                "skipped %s, token offset corruption \"%s\" vs. \"%s\"", path,
                token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
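
The character filter in this example boils down to one predicate: keep printable ASCII characters that are neither whitespace nor alphanumeric. A minimal sketch of that filtering step over a gap string (the sample text is illustrative):

import re

ws = re.compile(r"\s+")
alpha = re.compile("[a-zA-Z]+")

def ccheck(char: str) -> bool:
    # Neither whitespace, nor alphanumeric, nor non-ASCII.
    return not char.isspace() and not char.isalnum() and ord(char) < 128

reserved = set()
gap = ") {\n  return;\n}"  # text between two token-carrying UAST nodes
for part in ws.split(gap):
    for nonalpha in alpha.split(part):
        reserved.update(char for char in nonalpha if ccheck(char))
print(sorted(reserved))  # -> [')', ';', '{', '}']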
Example #4
    def tokenize_code(self, path: str, contents: str = None, root: bblfsh.Node = None) -> \
            Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
        """
        Parse a file into a sequence of `VirtualNode`-s and a mapping from node to its parent.

        Given the source text and the corresponding UAST, this function compiles the list of
        `VirtualNode`-s and the parents mapping. The list of nodes reproduces the original
        source text bit-for-bit after `"".join(n.value for n in nodes)`. `parents` maps
        `id(node)` to its parent `bblfsh.Node`.

        :param path: path to the file.
        :param contents: source file text; if not provided, the path is used to read the contents.
        :param root: UAST root node; if None, the file is parsed with the bblfsh client.
        :return: list of `VirtualNode`-s, the parents and root.
        """
        if contents is None:
            contents = self.client._get_contents(contents=contents, filename=path)
        if root is None:
            root = self.client.parse(filename=path, contents=contents).uast
        # build the line mapping
        contents = contents.decode()
        lines = contents.split("\n")
        line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
        pos = 0
        for i, line in enumerate(lines):
            line_offsets[i] = pos
            pos += len(line) + 1
        line_offsets[-1] = pos

        # walk the tree: collect nodes with assigned tokens and build the parents map
        node_tokens = []
        parents = {}
        queue = [root]
        while queue:
            node = queue.pop()
            if node.internal_type in self.node_fixtures:
                node = self.node_fixtures[node.internal_type](node)
            for child in node.children:
                parents[id(child)] = node
            queue.extend(node.children)
            if (node.token or node.start_position and node.end_position
                    and node.start_position != node.end_position and not node.children):
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        result = []
        pos = 0
        parser = self.tokens.PARSER
        searchsorted = numpy.searchsorted
        for node in node_tokens:
            if node.start_position.offset < pos:
                continue
            if node.start_position.offset > pos:
                sumlen = 0
                diff = contents[pos:node.start_position.offset]
                for match in parser.finditer(diff):
                    positions = []
                    for suboff in (match.start(), match.end()):
                        offset = pos + suboff
                        line = searchsorted(line_offsets, offset, side="right")
                        col = offset - line_offsets[line - 1] + 1
                        positions.append(Position(offset, line, col))
                    token = match.group()
                    sumlen += len(token)
                    result.append(VirtualNode(token, *positions, path=path))
                assert sumlen == node.start_position.offset - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            result.extend(VirtualNode.from_node(node, contents, path, self.token_unwrappers))
            pos = node.end_position.offset
        return result, parents, root
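
Unlike `_parse_file`, this variant can fetch both the contents and the UAST itself, so a caller only needs a path and a running Babelfish server. A hedged usage sketch, assuming the classic bblfsh Python client API and a hypothetical `Extractor` class that defines `tokenize_code` and stores the client in `self.client`:

import bblfsh
from pathlib import Path

client = bblfsh.BblfshClient("0.0.0.0:9432")  # conventional bblfshd endpoint
extractor = Extractor(client=client)  # hypothetical constructor signature

nodes, parents, root = extractor.tokenize_code("example.py")
# Per the docstring, the virtual nodes reconstruct the source bit-for-bit.
assert "".join(n.value for n in nodes) == Path("example.py").read_text()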
Example #5
def analyze_uast(path: str, content: str, root: bblfsh.Node,
                 internal_types: dict, roles: dict, reserved: set):
    """
    Fill the internal types, roles, and reserved collections with statistics computed from a UAST.

    :param path: Path of the analyzed file.
    :param content: Content of the analyzed file.
    :param root: UAST of the analyzed file.
    :param internal_types: Dictionary containing the internal types statistics.
    :param roles: Dictionary containing the roles statistics.
    :param reserved: Set accumulating the reserved tokens (punctuation characters).
    """
    # walk the tree: collect nodes with assigned tokens and build the parents map
    node_tokens = []
    parents = {}
    queue = [root]
    while queue:
        node = queue.pop()
        internal_types[node.internal_type] += 1
        for role in node.roles:
            roles[role] += 1
        for child in node.children:
            parents[id(child)] = node
        queue.extend(node.children)
        if node.token or node.start_position and node.end_position and not node.children:
            node_tokens.append(node)
    node_tokens.sort(key=lambda n: n.start_position.offset)
    sentinel = bblfsh.Node()
    sentinel.start_position.offset = len(content)
    sentinel.start_position.line = content.count("\n")
    node_tokens.append(sentinel)

    # scan `node_tokens` and analyze the gaps and the token prefixes and suffixes
    pos = 0
    ws = re.compile(r"\s+")
    alpha = re.compile("[a-zA-Z]+")
    IDENTIFIER = bblfsh.role_id("IDENTIFIER")
    log = logging.getLogger("analyze_uast")

    def ccheck(char: str) -> bool:
        return not char.isspace() and not char.isalnum() and ord(char) < 128

    for node in node_tokens:
        token = node.token if node.token else \
            content[node.start_position.offset:node.end_position.offset]
        if node.start_position.offset > pos:
            diff = content[pos:node.start_position.offset]
            parts = ws.split(diff)
            for part in parts:
                if len(part) >= 8:
                    log.debug("Skipping weird part in code: %s. Path: %s",
                              diff, path)
                    continue
                for nonalpha in alpha.split(part):
                    for char in nonalpha:
                        if ccheck(char):
                            reserved.add(char)
        if node is sentinel:
            break
        pos = node.end_position.offset
        if IDENTIFIER not in node.roles:
            continue
        outer = content[node.start_position.offset:node.end_position.offset]
        if outer == token:
            continue
        pos = outer.find(token)
        if pos < 0:
            log.warning(
                "skipped %s, token offset corruption \"%s\" vs. \"%s\"", path,
                token, outer)
            break
        if pos > 0:
            for char in outer[:pos]:
                if ccheck(char):
                    reserved.add(char)
        if pos + len(token) < len(outer):
            for char in outer[pos + len(token):]:
                if ccheck(char):
                    reserved.add(char)
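
Because this function only ever increments `internal_types[...]` and `roles[...]` and adds to `reserved`, `collections.defaultdict(int)` counters and a plain set are the natural accumulators. A short driving sketch, assuming `content` and `root` were obtained beforehand (e.g. read from disk and parsed with a bblfsh client):

from collections import defaultdict

internal_types = defaultdict(int)  # internal type name -> occurrence count
roles = defaultdict(int)           # role id -> occurrence count
reserved = set()                   # reserved punctuation characters

# `content` and `root` are assumed to exist (file text and its UAST).
analyze_uast("example.py", content, root, internal_types, roles, reserved)
top_types = sorted(internal_types.items(), key=lambda kv: -kv[1])[:10]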
Example #6
    def _parse_file(self, contents: str, root: bblfsh.Node, path: str) -> \
            Tuple[List[VirtualNode], Dict[int, bblfsh.Node]]:
        """
        Given the source text and the corresponding UAST, this function compiles the list of
        `VirtualNode`-s and the parents mapping. The list of nodes reproduces the original
        source text bit-for-bit after `"".join(n.value for n in nodes)`. `parents` maps
        `id(node)` to its parent `bblfsh.Node`.

        :param contents: source file text
        :param root: UAST root node
        :param path: path to the file, used for debugging
        :return: list of `VirtualNode`-s and the parents.
        """
        # build the line mapping
        lines = contents.split("\n")
        line_offsets = numpy.zeros(len(lines) + 1, dtype=numpy.int32)
        pos = 0
        for i, line in enumerate(lines):
            line_offsets[i] = pos
            pos += len(line) + 1
        line_offsets[-1] = pos

        # walk the tree: collect nodes with assigned tokens and build the parents map
        node_tokens = []
        parents = {}
        queue = [root]
        while queue:
            node = queue.pop()
            for child in node.children:
                parents[id(child)] = node
            queue.extend(node.children)
            if node.token or node.start_position and node.end_position and not node.children:
                node_tokens.append(node)
        node_tokens.sort(key=lambda n: n.start_position.offset)
        sentinel = bblfsh.Node()
        sentinel.start_position.offset = len(contents)
        sentinel.start_position.line = len(lines)
        node_tokens.append(sentinel)

        # scan `node_tokens` and fill the gaps with imaginary nodes
        result = []
        pos = 0
        parser = self.tokens.PARSER
        searchsorted = numpy.searchsorted
        for node in node_tokens:
            if node.start_position.offset > pos:
                sumlen = 0
                diff = contents[pos:node.start_position.offset]
                for match in parser.finditer(diff):
                    positions = []
                    for suboff in (match.start(), match.end()):
                        offset = pos + suboff
                        line = searchsorted(line_offsets, offset, side="right")
                        col = offset - line_offsets[line - 1] + 1
                        positions.append(Position(offset, line, col))
                    token = match.group()
                    sumlen += len(token)
                    result.append(VirtualNode(token, *positions, path=path))
                assert sumlen == node.start_position.offset - pos, \
                    "missed some imaginary tokens: \"%s\"" % diff
            if node is sentinel:
                break
            result.extend(VirtualNode.from_node(node, contents, path))
            pos = node.end_position.offset
        return result, parents
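
The `parents` mapping built here is keyed by `id(child)` rather than by the node itself, since mutable protobuf messages are not hashable. Walking from any node up to the root is then a chain of dictionary lookups; a small sketch (no server needed):

import bblfsh

root = bblfsh.Node(internal_type="File")
child = root.children.add()  # protobuf repeated-field API
child.internal_type = "Identifier"
parents = {id(child): root}  # built exactly like the loop above

node, chain = child, []
while id(node) in parents:  # climb until the root, which has no parent entry
    node = parents[id(node)]
    chain.append(node.internal_type)
print(chain)  # -> ['File']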
Example #7
    def SerializeToString(self):
        return bblfsh.Node().SerializeToString()
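
`SerializeToString` is the standard protobuf serialization method, so the resulting bytes round-trip through `ParseFromString`. A quick sketch:

import bblfsh

data = bblfsh.Node().SerializeToString()  # bytes of an empty Node message
restored = bblfsh.Node()
restored.ParseFromString(data)  # standard protobuf deserialization
assert restored == bblfsh.Node()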