def prepare_files(filenames: Iterable[str], client: BblfshClient,
                  language: str) -> Iterable[File]:
    """
    Prepare the given files for analysis by extracting UASTs and creating the gRPC wrappers.

    :param filenames: List of paths to files to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: Iterator of File-s with content, uast, path and language set.
    """
    files = []
    for file in tqdm(filter_filepaths(list(filenames))):
        try:
            res = client.parse(file)
        except NonUTF8ContentException:
            # Skip files that can't be parsed because of UTF-8 decoding errors.
            continue
        if res.status == 0 and res.language.lower() == language.lower():
            uast = res.uast
            path = file
            with open(file) as f:
                content = f.read().encode("utf-8")
            files.append(File(content=content, uast=uast, path=path,
                              language=res.language.lower()))
    return files
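
# Minimal usage sketch of prepare_files, not part of the original module: the endpoint,
# glob pattern and language below are assumptions, and a bblfshd server must already be
# running at the given address.
from glob import glob

from bblfsh import BblfshClient

demo_client = BblfshClient("0.0.0.0:9432")  # assumed bblfshd endpoint
demo_files = prepare_files(glob("repo/**/*.js", recursive=True), demo_client, "javascript")
for demo_file in demo_files:
    # Each File carries the raw bytes, the parsed UAST, the path and the detected language.
    print(demo_file.path, len(demo_file.content), demo_file.language)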
def return_features() -> Response:
    """Featurize the given code."""
    body = request.get_json()
    code = body["code"]
    babelfish_address = body["babelfish_address"]
    language = body["language"]
    client = BblfshClient(babelfish_address)
    res = client.parse(filename="", contents=code.encode(), language=language)
    if res.status != 0:
        abort(500)
    model = FormatModel().load(str(Path(__file__).parent / "models" / "model.asdf"))
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    file = UnicodeFile(content=code, uast=res.uast, language="javascript", path="path")
    config = rules.origin_config["feature_extractor"]
    config["return_sibling_indices"] = True
    fe = FeatureExtractor(language=language, **config)
    res = fe.extract_features([file])
    if res is None:
        abort(500)
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res
    y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict(
        X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe)
    refuse_to_predict = y_pred < 0
    checker = UASTStabilityChecker(fe)
    _, _, _, _, safe_preds = checker.check(
        y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files=[file],
        stub=client._stub, vnode_parents=vnode_parents, node_parents=node_parents,
        rule_winners=rule_winners, grouped_quote_predictions=grouped_quote_predictions)
    break_uast = [False] * X.shape[0]
    for wrong_pred in set(range(X.shape[0])).difference(safe_preds):
        break_uast[wrong_pred] = True
    labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)}
    app.logger.info("returning features of shape %d, %d" % X.shape)
    app.logger.info("length of rules: %d", len(rules))
    return jsonify({
        "code": code,
        "features": _input_matrix_to_descriptions(X, fe),
        "ground_truths": y.tolist(),
        "predictions": y_pred.tolist(),
        "refuse_to_predict": refuse_to_predict.tolist(),
        "sibling_indices": sibling_indices,
        "rules": _rules_to_jsonable(rules, fe),
        "winners": rule_winners.tolist(),
        "break_uast": break_uast,
        "feature_names": fe.feature_names,
        "class_representations": fe.composite_class_representations,
        "class_printables": fe.composite_class_printables,
        "vnodes": list(map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices),
                           vnodes)),
        "config": _mapping_to_jsonable(rules.origin_config)})
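
# Hypothetical client-side call to the Flask view above; the route and port are
# assumptions -- only the JSON body keys ("code", "babelfish_address", "language")
# and the returned payload keys are dictated by the view itself.
import requests

demo_resp = requests.post(
    "http://localhost:5000/api/features",
    json={
        "code": "function f(a) { return a + 1; }\n",
        "babelfish_address": "0.0.0.0:9432",
        "language": "javascript",
    },
)
demo_resp.raise_for_status()
demo_payload = demo_resp.json()
# Predictions are indexed per labeled virtual node; feature_names describe the columns of X.
print(demo_payload["predictions"][:10], demo_payload["feature_names"][:5])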
def prepare_file(filename: str, client: BblfshClient, language: str) -> File:
    """
    Prepare the given file for analysis by extracting UAST and creating the gRPC wrapper.

    :param filename: Path to the file to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: File with content, uast and path set.
    """
    assert os.path.isfile(filename), "\"%s\" should be a file" % filename
    res = client.parse(filename, language)
    assert res.status == 0, "Parse returned status %s for file %s" % (res.status, filename)
    error_log = "Language for %s should be %s instead of %s"
    assert res.language.lower() == language.lower(), error_log % (
        filename, language, res.language)
    with open(filename) as f:
        content = f.read().encode("utf-8")
    return File(content=content, uast=res.uast, path=filename)
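
# Minimal sketch of how prepare_file differs from prepare_files: it asserts on any
# mismatch instead of silently skipping the file. The path and endpoint are assumptions.
from bblfsh import BblfshClient

single_client = BblfshClient("0.0.0.0:9432")  # assumed bblfshd endpoint
# Raises AssertionError if the path is not a file, parsing fails,
# or the detected language is not the requested one.
single_file = prepare_file("repo/src/index.js", single_client, "javascript")
print(single_file.path, len(single_file.content))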
class BblfshTests(unittest.TestCase): BBLFSH_SERVER_EXISTED = None fixtures_pyfile = "fixtures/test.py" fixtures_cfile = "fixtures/test.c" @classmethod def setUpClass(cls: t.Any) -> None: cls.BBLFSH_SERVER_EXISTED = ensure_bblfsh_is_running() @classmethod def tearDownClass(cls: t.Any) -> None: if not cls.BBLFSH_SERVER_EXISTED: client = docker.from_env(version="auto") client.containers.get("bblfshd").remove(force=True) client.api.close() def setUp(self) -> None: self.client = BblfshClient("localhost:9432") def _parse_fixture(self) -> ResultContext: ctx = self.client.parse(self.fixtures_pyfile) self._validate_ctx(ctx) return ctx def testVersion(self) -> None: version = self.client.version() self.assertTrue(hasattr(version, "version")) self.assertTrue(version.version) self.assertTrue(hasattr(version, "build")) self.assertTrue(version.build) def testNativeParse(self) -> None: ctx = self.client.parse(self.fixtures_pyfile, mode=Modes.NATIVE) self._validate_ctx(ctx) self.assertIsNotNone(ctx) it = ctx.filter("//*[@ast_type='NoopLine']") self.assertIsNotNone(it) self.assertIsInstance(it, NodeIterator) res = list(it) self.assertGreater(len(res), 1) for i in res: t = i.get_dict().get("ast_type") self.assertIsNotNone(t) self.assertEqual(t, "NoopLine") def testNonUTF8ParseError(self) -> None: self.assertRaises(NonUTF8ContentException, self.client.parse, "", "Python", b"a = '\x80abc'") def testUASTDefaultLanguage(self) -> None: ctx = self._parse_fixture() self.assertEqual(ctx.language, "python") def testUASTWithLanguage(self) -> None: ctx = self.client.parse(self.fixtures_pyfile, language="Python") self._validate_ctx(ctx) self.assertEqual(ctx.language, "python") def testUASTWithLanguageAlias(self) -> None: ctx = self.client.parse(self.fixtures_cfile) self._validate_ctx(ctx) self.assertEqual(ctx.language, "c") it = ctx.filter( "//uast:FunctionGroup/Nodes/uast:Alias/Name/uast:Identifier/Name") self.assertIsInstance(it, NodeIterator) self.assertEqual(next(it).get(), "main") self.assertEqual(next(it).get(), "fib") def testUASTFileContents(self) -> None: with open(self.fixtures_pyfile, "r") as fin: contents = fin.read() ctx = self.client.parse("file.py", contents=contents) self._validate_ctx(ctx) def assert_strnode(n: Node, expected: str) -> None: self.assertEqual(n.get(), expected) self.assertIsInstance(n.get_str(), str) self.assertEqual(n.get_str(), expected) it = ctx.filter("//uast:RuntimeImport/Path/uast:Identifier/Name") self.assertIsInstance(it, NodeIterator) assert_strnode(next(it), "os") assert_strnode(next(it), "resource") assert_strnode(next(it), "unittest") assert_strnode(next(it), "docker") assert_strnode(next(it), "bblfsh") assert_strnode(next(it), "bblfsh") self.assertRaises(StopIteration, next, it) def testBrokenFilter(self) -> None: ctx = self._parse_fixture() self.assertRaises(RuntimeError, ctx.filter, "dsdfkj32423#$@#$") def testFilterToken(self): ctx = self._parse_fixture() it = ctx.filter("//*[@token='else']/text()") first = next(it).get_str() self.assertEqual(first, "else") def testFilterRoles(self) -> None: ctx = self._parse_fixture() it = ctx.filter("//*[@role='Identifier']") self.assertIsInstance(it, NodeIterator) l = list(it) self.assertGreater(len(l), 0) it = ctx.filter("//*[@role='Friend']") self.assertIsInstance(it, NodeIterator) l = list(it) self.assertEqual(len(l), 0) def testFilterProperties(self) -> None: ctx = uast() obj = {"k1": "v1", "k2": "v2"} self.assertTrue(any(ctx.filter("/*[@k1='v1']", obj))) self.assertTrue(any(ctx.filter("/*[@k2='v2']", obj))) 
self.assertFalse(any(ctx.filter("/*[@k2='v1']", obj))) self.assertFalse(any(ctx.filter("/*[@k1='v2']", obj))) def testFilterStartOffset(self) -> None: ctx = self._parse_fixture() self.assertTrue( any( ctx.filter( "//uast:Positions/start/uast:Position[@offset=11749]"))) self.assertFalse( any( ctx.filter( "//uast:Positions/start/uast:Position[@offset=99999]"))) def testFilterStartLine(self) -> None: ctx = self._parse_fixture() self.assertTrue( any(ctx.filter("//uast:Positions/start/uast:Position[@line=295]"))) self.assertFalse( any(ctx.filter( "//uast:Positions/start/uast:Position[@line=99999]"))) def testFilterStartCol(self) -> None: ctx = self._parse_fixture() self.assertTrue( any(ctx.filter("//uast:Positions/start/uast:Position[@col=42]"))) self.assertFalse( any(ctx.filter( "//uast:Positions/start/uast:Position[@col=99999]"))) def testFilterEndOffset(self) -> None: ctx = self._parse_fixture() self.assertTrue( any(ctx.filter( "//uast:Positions/end/uast:Position[@offset=11757]"))) self.assertFalse( any(ctx.filter( "//uast:Positions/end/uast:Position[@offset=99999]"))) def testFilterEndLine(self) -> None: ctx = self._parse_fixture() self.assertTrue( any(ctx.filter("//uast:Positions/end/uast:Position[@line=321]"))) self.assertFalse( any(ctx.filter("//uast:Positions/end/uast:Position[@line=99999]"))) def testFilterEndCol(self) -> None: ctx = self._parse_fixture() self.assertTrue( any(ctx.filter("//uast:Positions/end/uast:Position[@col=49]"))) self.assertFalse( any(ctx.filter("//uast:Positions/end/uast:Position[@col=99999]"))) def testFilterBool(self) -> None: ctx = self._parse_fixture() self.assertTrue( ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=49])")) self.assertTrue( next( ctx.filter( "boolean(//uast:Positions/end/uast:Position[@col=49])")). get()) self.assertTrue( next( ctx.filter( "boolean(//uast:Positions/end/uast:Position[@col=49])")). get_bool()) self.assertFalse( next( ctx.filter( "boolean(//uast:Positions/end/uast:Position[@col=9999])")). get()) self.assertFalse( next( ctx.filter( "boolean(//uast:Positions/end/uast:Position[@col=9999])")). 
get_bool()) def testFilterNumber(self) -> None: ctx = self._parse_fixture() self.assertEqual( next( ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])" )).get(), 2) self.assertEqual( next( ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])" )).get_int(), 2) self.assertEqual( next( ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])" )).get_float(), 2.0) def testFilterString(self) -> None: ctx = self._parse_fixture() self.assertEqual( next(ctx.filter("name(//uast:Positions)")).get(), "uast:Positions") self.assertEqual( next(ctx.filter("name(//uast:Positions)")).get_str(), "uast:Positions") def testFilterBadQuery(self) -> None: ctx = uast() self.assertRaises(RuntimeError, ctx.filter, "//[@roleModule]", {}) def testFilterBadType(self) -> None: ctx = self._parse_fixture() res = next( ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])")) self.assertRaises(NodeTypedGetException, res.get_str) def testRoleIdName(self) -> None: self.assertEqual(role_id(role_name(1)), 1) self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER") @staticmethod def _itTestTree() -> dict: def set_position(node: dict, start_offset: int, start_line: int, start_col: int, end_offset: int, end_line: int, end_col: int) -> None: node["@pos"] = { "@type": "uast:Positions", "start": { "@type": "uast:Position", "offset": start_offset, "line": start_line, "col": start_col }, "end": { "@type": "uast:Position", "offset": end_offset, "line": end_line, "col": end_col } } root = {"@type": "root"} set_position(root, 0, 1, 1, 1, 1, 2) son1 = {"@type": "son1"} set_position(son1, 2, 2, 2, 3, 2, 3) son1_1 = {"@type": "son1_1"} set_position(son1_1, 10, 10, 1, 12, 2, 2) son1_2 = {"@type": "son1_2"} set_position(son1_2, 10, 10, 1, 12, 2, 2) son1["children"] = [son1_1, son1_2] son2 = {"@type": "son2"} set_position(son2, 100, 100, 1, 101, 100, 2) son2_1 = {"@type": "son2_1"} set_position(son2_1, 5, 5, 1, 6, 5, 2) son2_2 = {"@type": "son2_2"} set_position(son2_2, 15, 15, 1, 16, 15, 2) son2["children"] = [son2_1, son2_2] root["children"] = [son1, son2] return root @staticmethod def _get_nodetypes(iterator: NodeIterator) -> t.List[str]: return [ n["@type"] for n in filter(lambda x: isinstance(x, dict), iterator) ] def testIteratorPreOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.PRE_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual( expanded, ['root', 'son1', 'son1_1', 'son1_2', 'son2', 'son2_1', 'son2_2']) def testIteratorPostOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.POST_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual( expanded, ['son1_1', 'son1_2', 'son1', 'son2_1', 'son2_2', 'son2', 'root']) def testIteratorLevelOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.LEVEL_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual( expanded, ['root', 'son1', 'son2', 'son1_1', 'son1_2', 'son2_1', 'son2_2']) def testIteratorPositionOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.POSITION_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual( expanded, ['root', 'son1', 'son2_1', 'son1_1', 'son1_2', 'son2_2', 'son2']) def _validate_ctx(self, ctx: ResultContext) -> None: self.assertIsNotNone(ctx) self.assertIsInstance(ctx, ResultContext) self.assertIsInstance(ctx.uast, Node) def testFilterInsideIter(self) -> None: ctx = self._parse_fixture() c2 
= uast() for n in ctx.iterate(TreeOrder.PRE_ORDER): c2.filter("//uast:Positions", n) def testItersMixingIterations(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) next(it) next(it) next(it) next(it) it2 = it.iterate(TreeOrder.PRE_ORDER) next(it2) a = next(it).get() b = next(it2).get() self.assertEqual(a, b) def testManyFilters(self) -> None: ctx = self._parse_fixture() before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(10000): ctx.filter("//*[@role='Identifier']") after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testManyParses(self) -> None: before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(100): self.client.parse(self.fixtures_pyfile) after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testManyParsesAndFilters(self) -> None: before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(100): ctx = self.client.parse(self.fixtures_pyfile) ctx.filter("//*[@role='Identifier']") after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testSupportedLanguages(self) -> None: res = self.client.supported_languages() self.assertGreater(len(res), 0) for l in res: for key in ('language', 'version', 'status', 'features'): self.assertTrue(hasattr(l, key)) self.assertIsNotNone(getattr(l, key)) def testEncode(self) -> None: ctx = self._parse_fixture() self.assertEqual(ctx.ctx.encode(None, 0), ctx._response.uast) def testEncodeWithEmptyContext(self) -> None: ctx = ResultContext() obj = {"k1": "v1", "k2": "v2"} fmt = 1 # YAML data = ctx.ctx.encode(obj, fmt) self.assertDictEqual(obj, decode(data, format=fmt).load()) def testGetAll(self) -> None: ctx = self._parse_fixture() expected = ["os", "resource", "unittest", "docker", "bblfsh"] actual = [] for k in ctx.get_all()["body"]: if "@type" in k and k[ "@type"] == "uast:RuntimeImport" and "Path" in k: path = k["Path"] if "Name" in path: actual.append(k["Path"]["Name"]) self.assertListEqual(expected, actual) def testLoad(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) next(it) next(it) next(it) next(it) it2 = it.iterate(TreeOrder.PRE_ORDER) n = next(it2) node_ext = n.node_ext obj = node_ext.load() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) path = obj["Path"] self.assertEqual("uast:Identifier", path["@type"]) self.assertEqual("os", path["Name"])
class Parser:
    """Parse files into list of nodes."""

    _bblfsh_language: str
    _parser_reserved: Pattern
    _parser_space: Pattern
    _uast_fixers: Optional[Dict[str, Callable[[BblfshNode], None]]]
    _convert_to_utf8: bool
    _logger: Logger

    def __init_subclass__(
        cls,
        bblfsh_language: str,
        reserved: List[str],
        uast_fixers: Optional[Dict[str, Callable[[BblfshNode], None]]] = None,
        convert_to_utf8: bool = True,
    ) -> None:
        cls._bblfsh_language = bblfsh_language
        cls._parser_reserved = re_compile(
            "|".join(re_escape(i) for i in sorted(reserved, reverse=True))
        )
        cls._parser_space = re_compile(r"\s+")
        cls._uast_fixers = uast_fixers if uast_fixers else {}
        cls._convert_to_utf8 = convert_to_utf8
        cls._logger = getLogger(cls.__name__)

    def __init__(
        self,
        bblfshd_endpoint: str = environ.get("BBLFSHD_ENDPOINT", "0.0.0.0:9432"),
        split_formatting: bool = False,
    ) -> None:
        """Construct a parser."""
        for attr in [
            "_bblfsh_language",
            "_parser_reserved",
            "_parser_space",
            "_uast_fixers",
        ]:
            if not hasattr(self, attr):
                raise NotImplementedError(
                    f"The {self.__class__.__name__} is a base class and should not be "
                    "used directly."
                )
        self._bblfsh_client = BblfshClient(bblfshd_endpoint)
        self._split_formatting = split_formatting

    @property
    def split_formatting(self) -> bool:
        return self._split_formatting

    def parse(self, repository_path: Path, file_path: Path) -> Nodes:
        """
        Parse a file into a list of `Node`s.

        :param repository_path: Path of the folder that contains the file to parse.
        :param file_path: Path of the file to parse.
        :return: List of parsed `Node`s.
        """
        response = self._bblfsh_client.parse(
            str(repository_path / file_path), language=self._bblfsh_language
        )
        if response.status != 0:
            self._logger.warn(
                "Could not process file %s, errors: %s",
                file_path,
                "; ".join(response.errors),
            )
            raise ParsingException(
                f"Could not process file {file_path}, "
                f"errors: {'; '.join(response.errors)}"
            )
        file_content = (repository_path / file_path).read_text(
            encoding="utf-8", errors="replace"
        )
        bblfsh_node_converter = BblfshNodeConverter(
            file_content, convert_to_utf8=self._convert_to_utf8
        )
        root_node = bblfsh_node_converter.bblfsh_node_to_node(response.uast, None)
        to_visit = [(response.uast, root_node)]
        non_formatting_tokens = []
        while to_visit:
            current_bblfsh_node, current_node = to_visit.pop()
            if current_bblfsh_node.internal_type in self._uast_fixers:
                current_bblfsh_node = self._uast_fixers[
                    current_bblfsh_node.internal_type
                ](current_bblfsh_node)
                if current_bblfsh_node is None:
                    continue
            to_visit.extend(
                (
                    bblfsh_child,
                    bblfsh_node_converter.bblfsh_node_to_node(
                        bblfsh_child, current_node
                    ),
                )
                for bblfsh_child in current_bblfsh_node.children
            )
            if (
                current_node.token
                and not current_bblfsh_node.children
                and (current_node.start is not None and current_node.end is not None)
            ):
                non_formatting_tokens.append(current_node)
        sentinel = Node(
            token=None,
            internal_type="Sentinel",
            roles=[],
            parent=None,
            start=len(file_content),
            end=len(file_content),
        )
        non_formatting_tokens.append(sentinel)
        pos = 0
        tokens = []
        for node in sorted(non_formatting_tokens, key=lambda n: n.start):
            if node.start < pos:
                continue
            if node.start > pos:
                sumlen = 0
                diff = file_content[pos : node.start]
                additional_nodes = []
                for match in self._parser_reserved.finditer(diff):
                    token = match.group()
                    additional_nodes.append(
                        Node(
                            start=match.start() + pos,
                            end=match.end() + pos,
                            token=token,
                            parent=None,
                            internal_type=token.title(),
                            roles=[match.group().upper()],
                        )
                    )
                    sumlen += len(token)
                for match in self._parser_space.finditer(diff):
                    token = match.group()
                    assert token.isspace()
                    additional_nodes.append(
                        Node(
                            start=match.start() + pos,
                            end=match.end() + pos,
                            token=token,
                            parent=None,
                            internal_type=FORMATTING_INTERNAL_TYPE,
                            roles=[FORMATTING_ROLE],
                        )
                    )
                    sumlen += len(token)
                if sumlen != node.start - pos:
                    self._logger.warn(f"missed some imaginary tokens: {diff}")
                    raise ParsingException(f"missed some imaginary tokens: {diff}")
                tokens.extend(sorted(additional_nodes, key=lambda n: n.start))
            if node is sentinel:
                break
            tokens.append(node)
            pos = node.end
        tokens = self._augment_tokens(tokens)
        closest_left_node = None
        for i, token_node in enumerate(tokens):
            if token_node.parent is not None:
                closest_left_node = token_node
            else:
                found_parent = self._find_parent(i, tokens, closest_left_node)
                token_node.parent = (
                    found_parent if found_parent is not None else root_node
                )
        if self._split_formatting:
            tokens = self._perform_split_formatting(tokens)
        reconstructed_file_content = "".join(node.token for node in tokens)
        if file_content != reconstructed_file_content:
            diff = "".join(
                unified_diff(
                    file_content.splitlines(keepends=True),
                    reconstructed_file_content.splitlines(keepends=True),
                    fromfile="original",
                    tofile="reconstructed",
                )
            )
            self._logger.warn("reconstructed file is not equal to original:\n%s", diff)
        return Nodes.from_token_nodes(tokens)

    def _augment_tokens(self, tokens: List[Node]) -> List[Node]:
        augmented_tokens = []
        if not tokens or tokens[0].internal_type != FORMATTING_INTERNAL_TYPE:
            augmented_tokens.append(
                Node(
                    start=0,
                    end=0,
                    token="",
                    parent=None,
                    internal_type=FORMATTING_INTERNAL_TYPE,
                    roles=[FORMATTING_ROLE],
                )
            )
        if tokens:
            augmented_tokens.append(tokens[0])
        for previous_token, next_token in zip(
            islice(tokens, 0, None), islice(tokens, 1, None)
        ):
            assert previous_token.end == next_token.start
            if (
                previous_token.internal_type != FORMATTING_INTERNAL_TYPE
                and next_token.internal_type != FORMATTING_INTERNAL_TYPE
            ):
                augmented_tokens.append(
                    Node(
                        start=previous_token.end,
                        end=previous_token.end,
                        token="",
                        parent=None,
                        internal_type=FORMATTING_INTERNAL_TYPE,
                        roles=[FORMATTING_ROLE],
                    )
                )
            augmented_tokens.append(next_token)
        if tokens and tokens[-1].internal_type != FORMATTING_INTERNAL_TYPE:
            augmented_tokens.append(
                Node(
                    start=tokens[-1].end,
                    end=tokens[-1].end,
                    token="",
                    parent=None,
                    internal_type=FORMATTING_INTERNAL_TYPE,
                    roles=[FORMATTING_ROLE],
                )
            )
        return augmented_tokens

    @staticmethod
    def _find_parent(
        node_index: int, nodes: List[Node], closest_left_node: Optional[Node]
    ) -> Optional[Node]:
        """
        Compute a node's parent as the LCA of the closest left and right nodes.

        :param node_index: Index of the node for which to find a parent.
        :param nodes: Sequence of token `Node`-s.
        :param closest_left_node: Closest node on the left with a true parent.
        :return: The Node of the found parent or None if no parent was found.
        """
        if closest_left_node is None:
            return None
        left_ancestor_ids = set()
        current_left_ancestor = closest_left_node.parent
        while current_left_ancestor is not None:
            left_ancestor_ids.add(id(current_left_ancestor))
            current_left_ancestor = current_left_ancestor.parent
        for future_node in nodes[node_index + 1 :]:
            if future_node.parent is not None:
                break
        else:
            return None
        current_right_ancestor = future_node.parent
        while current_right_ancestor is not None:
            if id(current_right_ancestor) in left_ancestor_ids:
                return current_right_ancestor
            current_right_ancestor = current_right_ancestor.parent
        return None

    def _perform_split_formatting(self, nodes: List[Node]) -> List[Node]:
        """
        Split each formatting node into a list of one node per character.

        :param nodes: Sequence of token `Node`-s.
        :return: The new sequence, with split formatting nodes.
        """
        new_nodes = []
        for node in nodes:
            if node.internal_type == FORMATTING_INTERNAL_TYPE and node.token:
                for i, char in enumerate(node.token):
                    new_nodes.append(
                        Node(
                            token=char,
                            internal_type=node.internal_type,
                            roles=node.roles,
                            parent=node.parent,
                            start=node.start + i,
                            end=node.start + i + 1,
                        )
                    )
            else:
                new_nodes.append(node)
        return new_nodes

    def __del__(self) -> None:
        if self._bblfsh_client:
            self._bblfsh_client._channel.close()
            self._bblfsh_client._channel = self._bblfsh_client._stub = None
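
# Hypothetical concrete parser built on the Parser base class above, showing how the
# __init_subclass__ keyword arguments are meant to be supplied. The language, the
# reserved-token list and the fixer below are illustrative assumptions, not the real
# configuration of any existing subclass; a bblfshd server must be running for parse().
from pathlib import Path


def _drop_comment(node):
    """Example UAST fixer: returning None makes the parser skip the whole subtree."""
    return None


class JavaScriptParser(
    Parser,
    bblfsh_language="javascript",
    reserved=["(", ")", "{", "}", ";", ",", "=", "=>", "function", "return"],
    uast_fixers={"Comment": _drop_comment},
):
    pass


demo_parser = JavaScriptParser(split_formatting=False)
# Token nodes cover the whole file: UAST tokens plus reserved and formatting fillers.
demo_nodes = demo_parser.parse(Path("repo"), Path("src/index.js"))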
def parse_files(filepaths: Sequence[str], line_length_limit: int, overall_size_limit: int,
                client: BblfshClient, language: str, random_state: int = 7,
                progress_tracker: Callable = lambda x: x,
                log: Optional[logging.Logger] = None) -> Iterable[File]:
    """
    Parse files with Babelfish.

    If a file has lines longer than `line_length_limit`, it is skipped. If the summed size of \
    parsed files exceeds `overall_size_limit`, the rest of the files are skipped. File paths are \
    filtered with `filter_files_by_path()`. The order in which the files are parsed is random - \
    and hence different from `filepaths`.

    :param filepaths: File paths to filter.
    :param line_length_limit: Maximum line length to accept a file.
    :param overall_size_limit: Maximum cumulative files size in bytes. \
                               The files are discarded after reaching this limit.
    :param client: Babelfish client instance. The Babelfish server should be running.
    :param language: Language to consider. Will discard the other languages.
    :param random_state: Random generator state for shuffling the files.
    :param progress_tracker: Optional progress tracker to wrap the iteration over the input files.
    :param log: Logger to use to report the number of excluded files.
    :return: `File`-s with parsed UASTs that passed through the filters.
    """
    def load_file(path):
        with open(path, "rb") as f:
            return f.read()

    random.seed(random_state)
    filepaths_filtered = list(filter_files_by_path(filepaths))
    files_filtered_by_line_length = sorted(
        filter_files_by_line_length(filepaths_filtered, load_file, line_length_limit))
    files_filtered_by_line_length = random.sample(files_filtered_by_line_length,
                                                  k=len(files_filtered_by_line_length))
    size, n_parsed = 0, 0
    size_passed = []
    for filename in progress_tracker(files_filtered_by_line_length):
        try:
            res = client.parse(filename)
        except NonUTF8ContentException:
            # Skip files that can't be parsed because of UTF-8 decoding errors.
            continue
        if res.status == 0 and res.language.lower() == language.lower():
            n_parsed += 1
            with open(filename, "rb") as f:
                content = f.read()
            size += len(content)
            if size > overall_size_limit:
                break
            uast = res.uast
            path = filename
            size_passed.append(File(content=content, uast=uast, path=path,
                                    language=res.language.lower()))
    if log is not None:
        log.debug("excluded %d/%d files based on their path",
                  len(filepaths) - len(filepaths_filtered), len(filepaths))
        log.debug("excluded %d/%d %s files by max line length %d",
                  len(filepaths_filtered) - len(files_filtered_by_line_length),
                  len(filepaths_filtered), language, line_length_limit)
        log.debug("excluded %d/%d %s files due to parsing problems",
                  len(files_filtered_by_line_length) - n_parsed,
                  len(files_filtered_by_line_length), language)
        log.debug("excluded %d/%d %s files by max overall size %d",
                  n_parsed - len(size_passed), n_parsed, language, overall_size_limit)
    return size_passed
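
# Usage sketch for parse_files; the paths, limits and endpoint are assumptions.
# tqdm is a natural fit for progress_tracker since it just wraps an iterable.
import logging

from bblfsh import BblfshClient
from tqdm import tqdm

demo_log = logging.getLogger("parse_files_demo")
parsed = parse_files(filepaths=["a.js", "b.js", "lib/c.js"],
                     line_length_limit=500,
                     overall_size_limit=5 * 1024 * 1024,
                     client=BblfshClient("0.0.0.0:9432"),
                     language="javascript",
                     progress_tracker=tqdm,
                     log=demo_log)
print("parsed %d files" % len(parsed))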
class BblfshTests(unittest.TestCase):
    BBLFSH_SERVER_EXISTED = None

    @classmethod
    def setUpClass(cls):
        cls.BBLFSH_SERVER_EXISTED = ensure_bblfsh_is_running()

    @classmethod
    def tearDownClass(cls):
        if not cls.BBLFSH_SERVER_EXISTED:
            client = docker.from_env(version="auto")
            client.containers.get("bblfshd").remove(force=True)
            client.api.close()

    def setUp(self):
        self.client = BblfshClient("0.0.0.0:9432")

    def testVersion(self):
        version = self.client.version()
        self.assertTrue(hasattr(version, "version"))
        self.assertTrue(version.version)
        self.assertTrue(hasattr(version, "build"))
        self.assertTrue(version.build)

    def testNativeParse(self):
        reply = self.client.native_parse(__file__)
        assert reply.ast

    def testNonUTF8ParseError(self):
        self.assertRaises(NonUTF8ContentException, self.client.parse,
                          "", "Python", b"a = '\x80abc'")

    def testUASTDefaultLanguage(self):
        self._validate_resp(self.client.parse(__file__))

    def testUASTPython(self):
        self._validate_resp(self.client.parse(__file__, language="Python"))

    def testUASTFileContents(self):
        with open(__file__, "rb") as fin:
            contents = fin.read()
        resp = self.client.parse("file.py", contents=contents)
        self._validate_resp(resp)
        self._validate_filter(resp)

    def testBrokenFilter(self):
        self.assertRaises(RuntimeError, filter, 0, "foo")

    def testFilterInternalType(self):
        node = Node()
        node.internal_type = 'a'
        self.assertTrue(any(filter(node, "//a")))
        self.assertFalse(any(filter(node, "//b")))

    def testFilterToken(self):
        node = Node()
        node.token = 'a'
        self.assertTrue(any(filter(node, "//*[@token='a']")))
        self.assertFalse(any(filter(node, "//*[@token='b']")))

    def testFilterRoles(self):
        node = Node()
        node.roles.append(1)
        self.assertTrue(any(filter(node, "//*[@roleIdentifier]")))
        self.assertFalse(any(filter(node, "//*[@roleQualified]")))

    def testFilterProperties(self):
        node = Node()
        node.properties['k1'] = 'v2'
        node.properties['k2'] = 'v1'
        self.assertTrue(any(filter(node, "//*[@k2='v1']")))
        self.assertTrue(any(filter(node, "//*[@k1='v2']")))
        self.assertFalse(any(filter(node, "//*[@k1='v1']")))

    def testFilterStartOffset(self):
        node = Node()
        node.start_position.offset = 100
        self.assertTrue(any(filter(node, "//*[@startOffset=100]")))
        self.assertFalse(any(filter(node, "//*[@startOffset=10]")))

    def testFilterStartLine(self):
        node = Node()
        node.start_position.line = 10
        self.assertTrue(any(filter(node, "//*[@startLine=10]")))
        self.assertFalse(any(filter(node, "//*[@startLine=100]")))

    def testFilterStartCol(self):
        node = Node()
        node.start_position.col = 50
        self.assertTrue(any(filter(node, "//*[@startCol=50]")))
        self.assertFalse(any(filter(node, "//*[@startCol=5]")))

    def testFilterEndOffset(self):
        node = Node()
        node.end_position.offset = 100
        self.assertTrue(any(filter(node, "//*[@endOffset=100]")))
        self.assertFalse(any(filter(node, "//*[@endOffset=10]")))

    def testFilterEndLine(self):
        node = Node()
        node.end_position.line = 10
        self.assertTrue(any(filter(node, "//*[@endLine=10]")))
        self.assertFalse(any(filter(node, "//*[@endLine=100]")))

    def testFilterEndCol(self):
        node = Node()
        node.end_position.col = 50
        self.assertTrue(any(filter(node, "//*[@endCol=50]")))
        self.assertFalse(any(filter(node, "//*[@endCol=5]")))

    def testFilterBool(self):
        node = Node()
        self.assertTrue(filter_bool(node, "boolean(//*[@startOffset or @endOffset])"))
        self.assertFalse(filter_bool(node, "boolean(//*[@blah])"))

    def testFilterNumber(self):
        node = Node()
        node.children.extend([Node(), Node(), Node()])
        self.assertEqual(int(filter_number(node, "count(//*)")), 4)

    def testFilterString(self):
        node = Node()
        node.internal_type = "test"
        self.assertEqual(filter_string(node, "name(//*[1])"), "test")

    def testFilterBadQuery(self):
        node = Node()
        self.assertRaises(RuntimeError, filter, node, "//*roleModule")

    def testFilterBadType(self):
        node = Node()
        node.end_position.col = 50
        self.assertRaises(RuntimeError, filter, node,
                          "boolean(//*[@startPosition or @endPosition])")

    def testRoleIdName(self):
        self.assertEqual(role_id(role_name(1)), 1)
        self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER")

    def _itTestTree(self):
        root = Node()
        root.internal_type = 'root'
        root.start_position.offset = 0
        root.start_position.line = 0
        root.start_position.col = 1

        son1 = Node()
        son1.internal_type = 'son1'
        son1.start_position.offset = 1

        son1_1 = Node()
        son1_1.internal_type = 'son1_1'
        son1_1.start_position.offset = 10

        son1_2 = Node()
        son1_2.internal_type = 'son1_2'
        son1_2.start_position.offset = 10

        son1.children.extend([son1_1, son1_2])

        son2 = Node()
        son2.internal_type = 'son2'
        son2.start_position.offset = 100

        son2_1 = Node()
        son2_1.internal_type = 'son2_1'
        son2_1.start_position.offset = 5

        son2_2 = Node()
        son2_2.internal_type = 'son2_2'
        son2_2.start_position.offset = 15

        son2.children.extend([son2_1, son2_2])
        root.children.extend([son1, son2])
        return root

    def testIteratorPreOrder(self):
        root = self._itTestTree()
        it = iterator(root, TreeOrder.PRE_ORDER)
        self.assertIsNotNone(it)
        expanded = [node.internal_type for node in it]
        self.assertListEqual(expanded, ['root', 'son1', 'son1_1', 'son1_2',
                                        'son2', 'son2_1', 'son2_2'])

    def testIteratorPostOrder(self):
        root = self._itTestTree()
        it = iterator(root, TreeOrder.POST_ORDER)
        self.assertIsNotNone(it)
        expanded = [node.internal_type for node in it]
        self.assertListEqual(expanded, ['son1_1', 'son1_2', 'son1', 'son2_1',
                                        'son2_2', 'son2', 'root'])

    def testIteratorLevelOrder(self):
        root = self._itTestTree()
        it = iterator(root, TreeOrder.LEVEL_ORDER)
        self.assertIsNotNone(it)
        expanded = [node.internal_type for node in it]
        self.assertListEqual(expanded, ['root', 'son1', 'son2', 'son1_1',
                                        'son1_2', 'son2_1', 'son2_2'])

    def testIteratorPositionOrder(self):
        root = self._itTestTree()
        it = iterator(root, TreeOrder.POSITION_ORDER)
        self.assertIsNotNone(it)
        expanded = [node.internal_type for node in it]
        self.assertListEqual(expanded, ['root', 'son1', 'son2_1', 'son1_1',
                                        'son1_2', 'son2_2', 'son2'])

    def _validate_resp(self, resp):
        self.assertIsNotNone(resp)
        self.assertEqual(type(resp).DESCRIPTOR.full_name,
                         ParseResponse.DESCRIPTOR.full_name)
        self.assertEqual(len(resp.errors), 0)
        # self.assertIsInstance() does not work here - must be some metaclass magic:
        # self.assertIsInstance(resp.uast, Node)
        # The class name is sometimes fully qualified and sometimes just "Node",
        # so only check the suffix.
        self.assertTrue(resp.uast.__class__.__name__.endswith('Node'))

    def testFilterInsideIter(self):
        root = self.client.parse(__file__).uast
        it = iterator(root, TreeOrder.PRE_ORDER)
        self.assertIsNotNone(it)
        for n in it:
            filter(n, "//*[@roleIdentifier]")

    def testItersMixingIterations(self):
        root = self.client.parse(__file__).uast
        it = iterator(root, TreeOrder.PRE_ORDER)
        next(it)
        next(it)
        next(it)
        n = next(it)
        it2 = iterator(n, TreeOrder.PRE_ORDER)
        next(it2)
        assert next(it) == next(it2)

    def testManyFilters(self):
        root = self.client.parse(__file__).uast
        root.properties['k1'] = 'v2'
        root.properties['k2'] = 'v1'

        import resource
        before = resource.getrusage(resource.RUSAGE_SELF)
        for _ in range(100):
            filter(root, "//*[@roleIdentifier]")
        after = resource.getrusage(resource.RUSAGE_SELF)
        # Check that memory usage has not doubled after running the filter
        self.assertLess(after[2] / before[2], 2.0)

    def _validate_filter(self, resp):
        results = filter(resp.uast, "//Import[@roleImport and @roleDeclaration]//alias")
        self.assertEqual(next(results).token, "os")
        self.assertEqual(next(results).token, "unittest")
        self.assertEqual(next(results).token, "docker")
class BblfshTests(unittest.TestCase): BBLFSH_SERVER_EXISTED = None fixtures_pyfile = "fixtures/test.py" fixtures_cfile = "fixtures/test.c" @classmethod def setUpClass(cls: t.Any) -> None: cls.BBLFSH_SERVER_EXISTED = ensure_bblfsh_is_running() @classmethod def tearDownClass(cls: t.Any) -> None: if not cls.BBLFSH_SERVER_EXISTED: client = docker.from_env(version="auto") client.containers.get("bblfshd").remove(force=True) client.api.close() def setUp(self) -> None: self.client = BblfshClient("localhost:9432") def _parse_fixture(self) -> ResultContext: ctx = self.client.parse(self.fixtures_pyfile) self._validate_ctx(ctx) return ctx def testVersion(self) -> None: version = self.client.version() self.assertTrue(hasattr(version, "version")) self.assertTrue(version.version) self.assertTrue(hasattr(version, "build")) self.assertTrue(version.build) def testNativeParse(self) -> None: ctx = self.client.parse(self.fixtures_pyfile, mode=Modes.NATIVE) self._validate_ctx(ctx) self.assertIsNotNone(ctx) it = ctx.filter("//*[@ast_type='NoopLine']") self.assertIsNotNone(it) self.assertIsInstance(it, NodeIterator) res = list(it) self.assertGreater(len(res), 1) for i in res: t = i.get_dict().get("ast_type") self.assertIsNotNone(t) self.assertEqual(t, "NoopLine") def testNonUTF8ParseError(self) -> None: self.assertRaises(NonUTF8ContentException, self.client.parse, "", "Python", b"a = '\x80abc'") def testUASTDefaultLanguage(self) -> None: ctx = self._parse_fixture() self.assertEqual(ctx.language, "python") def testUASTWithLanguage(self) -> None: ctx = self.client.parse(self.fixtures_pyfile, language="Python") self._validate_ctx(ctx) self.assertEqual(ctx.language, "python") def testUASTWithLanguageAlias(self) -> None: ctx = self.client.parse(self.fixtures_cfile) self._validate_ctx(ctx) self.assertEqual(ctx.language, "c") it = ctx.filter("//uast:FunctionGroup/Nodes/uast:Alias/Name/uast:Identifier/Name") self.assertIsInstance(it, NodeIterator) self.assertEqual(next(it).get(), "main") self.assertEqual(next(it).get(), "fib") def testUASTFileContents(self) -> None: with open(self.fixtures_pyfile, "r") as fin: contents = fin.read() ctx = self.client.parse("file.py", contents=contents) self._validate_ctx(ctx) def assert_strnode(n: Node, expected: str) -> None: self.assertEqual(n.get(), expected) self.assertIsInstance(n.get_str(), str) self.assertEqual(n.get_str(), expected) it = ctx.filter("//uast:RuntimeImport/Path/uast:Identifier/Name") self.assertIsInstance(it, NodeIterator) assert_strnode(next(it), "os") assert_strnode(next(it), "resource") assert_strnode(next(it), "unittest") assert_strnode(next(it), "docker") assert_strnode(next(it), "bblfsh") assert_strnode(next(it), "bblfsh") self.assertRaises(StopIteration, next, it) def testBrokenFilter(self) -> None: ctx = self._parse_fixture() self.assertRaises(RuntimeError, ctx.filter, "dsdfkj32423#$@#$") def testFilterToken(self): ctx = self._parse_fixture() it = ctx.filter("//*[@token='else']/text()") first = next(it).get_str() self.assertEqual(first, "else") def testFilterRoles(self) -> None: ctx = self._parse_fixture() it = ctx.filter("//*[@role='Identifier']") self.assertIsInstance(it, NodeIterator) l = list(it) self.assertGreater(len(l), 0) it = ctx.filter("//*[@role='Friend']") self.assertIsInstance(it, NodeIterator) l = list(it) self.assertEqual(len(l), 0) def testFilterProperties(self) -> None: ctx = uast() obj = {"k1": "v1", "k2": "v2"} self.assertTrue(any(ctx.filter("/*[@k1='v1']", obj))) self.assertTrue(any(ctx.filter("/*[@k2='v2']", obj))) 
self.assertFalse(any(ctx.filter("/*[@k2='v1']", obj))) self.assertFalse(any(ctx.filter("/*[@k1='v2']", obj))) def testFilterStartOffset(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/start/uast:Position[@offset=11749]"))) self.assertFalse(any(ctx.filter("//uast:Positions/start/uast:Position[@offset=99999]"))) def testFilterStartLine(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/start/uast:Position[@line=295]"))) self.assertFalse(any(ctx.filter("//uast:Positions/start/uast:Position[@line=99999]"))) def testFilterStartCol(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/start/uast:Position[@col=42]"))) self.assertFalse(any(ctx.filter("//uast:Positions/start/uast:Position[@col=99999]"))) def testFilterEndOffset(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/end/uast:Position[@offset=11757]"))) self.assertFalse(any(ctx.filter("//uast:Positions/end/uast:Position[@offset=99999]"))) def testFilterEndLine(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/end/uast:Position[@line=321]"))) self.assertFalse(any(ctx.filter("//uast:Positions/end/uast:Position[@line=99999]"))) def testFilterEndCol(self) -> None: ctx = self._parse_fixture() self.assertTrue(any(ctx.filter("//uast:Positions/end/uast:Position[@col=49]"))) self.assertFalse(any(ctx.filter("//uast:Positions/end/uast:Position[@col=99999]"))) def testFilterBool(self) -> None: ctx = self._parse_fixture() self.assertTrue(ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=49])")) self.assertTrue(next(ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=49])")).get()) self.assertTrue(next(ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=49])")).get_bool()) self.assertFalse(next(ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=9999])")).get()) self.assertFalse(next(ctx.filter("boolean(//uast:Positions/end/uast:Position[@col=9999])")).get_bool()) def testFilterNumber(self) -> None: ctx = self._parse_fixture() self.assertEqual(next(ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])")).get(), 2) self.assertEqual(next(ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])")).get_int(), 2) self.assertEqual(next(ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])")).get_float(), 2.0) def testFilterString(self) -> None: ctx = self._parse_fixture() self.assertEqual(next(ctx.filter("name(//uast:Positions)")).get(), "uast:Positions") self.assertEqual(next(ctx.filter("name(//uast:Positions)")).get_str(), "uast:Positions") def testFilterBadQuery(self) -> None: ctx = uast() self.assertRaises(RuntimeError, ctx.filter, "//[@roleModule]", {}) def testFilterBadType(self) -> None: ctx = self._parse_fixture() res = next(ctx.filter("count(//uast:Positions/end/uast:Position[@col=49])")) self.assertRaises(NodeTypedGetException, res.get_str) def testRoleIdName(self) -> None: self.assertEqual(role_id(role_name(1)), 1) self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER") @staticmethod def _itTestTree() -> dict: def set_position(node: dict, start_offset: int, start_line: int, start_col: int, end_offset: int, end_line: int, end_col: int) -> None: node["@pos"] = { "@type": "uast:Positions", "start": { "@type": "uast:Position", "offset": start_offset, "line": start_line, "col": start_col }, "end": { "@type": "uast:Position", "offset": end_offset, "line": end_line, "col": end_col } } 
root = {"@type": "root"} set_position(root, 0,1,1, 1,1,2) son1 = {"@type": "son1"} set_position(son1, 2,2,2, 3,2,3) son1_1 = {"@type": "son1_1"} set_position(son1_1, 10,10,1, 12,2,2) son1_2 = {"@type": "son1_2"} set_position(son1_2, 10,10,1, 12,2,2) son1["children"] = [son1_1, son1_2] son2 = {"@type": "son2"} set_position(son2, 100,100,1, 101,100,2) son2_1 = {"@type": "son2_1"} set_position(son2_1, 5,5,1, 6,5,2) son2_2 = {"@type": "son2_2"} set_position(son2_2, 15,15,1, 16,15,2) son2["children"] = [son2_1, son2_2] root["children"] = [son1, son2] return root @staticmethod def _get_nodetypes(iterator: NodeIterator) -> t.List[str]: return [n["@type"] for n in filter(lambda x: isinstance(x, dict), iterator)] @staticmethod def _get_nodes(iterator: NodeIterator) -> t.List[dict]: return [n.get() for n in iterator] @staticmethod def _get_positions(iterator: NodeIterator): startPositions = [ n["@pos"]["start"] for n in filter(lambda x: isinstance(x, dict) and "@pos" in x.keys() and "start" in x["@pos"].keys(), iterator) ] return [ (int(n["offset"]), int(n["line"]), int(n["col"])) for n in startPositions ] def decrefAndGC(self, obj) -> None: del obj gc.collect() def testIteratorPreOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.PRE_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual(expanded, ['root', 'son1', 'son1_1', 'son1_2', 'son2', 'son2_1', 'son2_2']) def testIteratorPostOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.POST_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual(expanded, ['son1_1', 'son1_2', 'son1', 'son2_1', 'son2_2', 'son2', 'root']) def testIteratorLevelOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.LEVEL_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual(expanded, ['root', 'son1', 'son2', 'son1_1', 'son1_2', 'son2_1', 'son2_2']) def testIteratorPositionOrder(self) -> None: # Check first our homemade tree root = self._itTestTree() it = iterator(root, TreeOrder.POSITION_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) self.assertListEqual(expanded, ['root', 'son1', 'son2_1', 'son1_1', 'son1_2', 'son2_2', 'son2']) # Check that when using the positional order the positions we get are # in fact sorted by (offset, line, col) it = iterator(root, TreeOrder.POSITION_ORDER) positions = self._get_positions(it) self.assertListEqual(positions, [(0,1,1), (2,2,2), (5,5,1), (10,10,1), (10,10,1), (15,15,1), (100,100,1)]) def testAnyOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.ANY_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) # We only can test that the order gives us all the nodes self.assertEqual(set(expanded), {'root', 'son1', 'son2', 'son1_1', 'son1_2', 'son2_1', 'son2_2'}) def testChildrenOrder(self) -> None: root = self._itTestTree() it = iterator(root, TreeOrder.CHILDREN_ORDER) self.assertIsNotNone(it) expanded = self._get_nodetypes(it) # We only can test that the order gives us all the nodes self.assertEqual(expanded, ['son1', 'son2']) # Iterating from the root node should give the same result as # iterating from the tree, for every available node def testNodeIteratorEqualsCtxIterator(self) -> None: ctx = self._parse_fixture() root = ctx.root for order in TreeOrder: itCtx = ctx.iterate(order) itRoot = root.iterate(order) self.assertListEqual(self._get_nodes(itCtx), self._get_nodes(itRoot)) def _validate_ctx(self, ctx: 
ResultContext) -> None: self.assertIsNotNone(ctx) self.assertIsInstance(ctx, ResultContext) self.assertIsInstance(ctx.uast, Node) def testFilterInsideIter(self) -> None: ctx = self._parse_fixture() c2 = uast() for n in ctx.iterate(TreeOrder.PRE_ORDER): c2.filter("//uast:Positions", n) def testItersMixingIterations(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) next(it); next(it); next(it); next(it) it2 = it.iterate(TreeOrder.PRE_ORDER) next(it2) a = next(it).get() b = next(it2).get() self.assertEqual(a, b) def testManyFilters(self) -> None: ctx = self._parse_fixture() before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(10000): ctx.filter("//*[@role='Identifier']") after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testManyParses(self) -> None: before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(100): self.client.parse(self.fixtures_pyfile) after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testManyParsesAndFilters(self) -> None: before = resource.getrusage(resource.RUSAGE_SELF) for _ in range(100): ctx = self.client.parse(self.fixtures_pyfile) ctx.filter("//*[@role='Identifier']") after = resource.getrusage(resource.RUSAGE_SELF) # Check that memory usage has not doubled self.assertLess(after[2] / before[2], 2.0) def testSupportedLanguages(self) -> None: res = self.client.supported_languages() self.assertGreater(len(res), 0) for l in res: for key in ('language', 'version', 'status', 'features'): self.assertTrue(hasattr(l, key)) self.assertIsNotNone(getattr(l, key)) def testEncode(self) -> None: ctx = self._parse_fixture() # This test is here for backward compatibility purposes, # in case someone was relying on encoding contexts this way self.assertEqual(ctx.ctx.encode(None, 0), ctx._response.uast) self.assertEqual(ctx.encode(), ctx._response.uast) def testEncodeWithEmptyContext(self) -> None: ctx = ResultContext() obj = {"k1": "v1", "k2": "v2"} fmt = 1 # YAML # This test is here for backward compatibility purposes, # in case someone was relying on encoding contexts this way data = ctx.ctx.encode(obj, fmt) other_data = ctx.encode(obj, fmt) self.assertDictEqual(obj, decode(data, format = fmt).load()) self.assertDictEqual(obj, decode(other_data, format = fmt).load()) def testGetAll(self) -> None: ctx = self._parse_fixture() expected = ["os", "resource", "unittest", "docker", "bblfsh"] actual = [] for k in ctx.get_all()["body"]: if "@type" in k and k["@type"] == "uast:RuntimeImport" and "Path" in k: path = k["Path"] if "Name" in path: actual.append(k["Path"]["Name"]) self.assertListEqual(expected, actual) def testLoad(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) next(it); next(it); next(it); next(it) it2 = it.iterate(TreeOrder.PRE_ORDER) n = next(it2) node_ext = n.node_ext obj = node_ext.load() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) path = obj["Path"] self.assertEqual("uast:Identifier", path["@type"]) self.assertEqual("os", path["Name"]) # The following testOrphan{x} methods verifies that iterators and nodes work # correctly once the context they come from has been DECREFed. Loading an # (external) node and filtering it after the context / iterators have been # DECREFed are also checked. 
As an example, the following code should work # in Python: # # its = [] # for file in files: # ctx = client.parse(file) # it = ctx.filter("blablablah") # its.append(it) # # it = pick a it from its # node = next(it) # # Instead of testing with a while, we can just delete ctx before doing # something with the iterator def testOrphanFilter(self) -> None: ctx = self._parse_fixture() it = ctx.filter("//uast:RuntimeImport") self.decrefAndGC(ctx) # We should be able to retrieve values from the iterator # after the context has been DECREFed but the iterator # still exists obj = next(it).get() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) # Chaining calls has the same effect as splitting # the effect across different lines as above self.decrefAndGC(it) it = self._parse_fixture().filter("//uast:RuntimeImport") next(it) obj = next(it).get() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) def testOrphanIterator(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) self.decrefAndGC(ctx) # We should be able to retrieve values from the iterator # after the context has been DECREFed but the iterator # still exists obj = next(it).get() self.assertIsInstance(obj, dict) # Chaining calls has the same effect as splitting # the effect across different lines as above self.decrefAndGC(it) it = self._parse_fixture().iterate(TreeOrder.POST_ORDER) obj = next(it) self.assertIsInstance(obj, Node) def testLoadOrphanNode(self) -> None: ctx = self._parse_fixture() it = ctx.iterate(TreeOrder.PRE_ORDER) # The underlying ctx should not be deallocated even if ctx goes # out of scope because the iterator is still alive self.decrefAndGC(ctx) next(it); next(it); next(it); node = next(it) self.decrefAndGC(it) # Context should not have been deallocated yet because we # want to iterate from the node onwards it2 = node.iterate(TreeOrder.PRE_ORDER) node_ext = node.node_ext # node could be deallocated here also, if we by, any chance, # we happen to be storing only the external nodes self.decrefAndGC(node) obj = node_ext.load() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) def testFilterOrphanNode(self) -> None: ctx = self._parse_fixture() root = ctx.root self.decrefAndGC(ctx) # filter should work here over the tree even if we ctx has # been DECREFed by the interpreter (it has gone out of scope) it = root.filter("//uast:RuntimeImport") obj = next(it).get() typ = obj["@type"] self.assertEqual("uast:RuntimeImport", typ) def testPythonContextIterate(self) -> None: # C++ memory context ctxC = self._parse_fixture() # Python memory context pyDict = ctxC.root.get() ctxPy = bblfsh.context(pyDict) for treeOrder in TreeOrder: itC = ctxC.iterate(treeOrder) itPy = ctxPy.iterate(treeOrder) for nodeC, nodePy in zip(itC, itPy): self.assertEqual(nodeC.get(), nodePy) def testPythonContextFilter(self) -> None: # C++ memory context ctxC = self._parse_fixture() # Python memory context pyDict = ctxC.root.get() ctxPy = bblfsh.context(pyDict) itC = ctxC.filter("//*[@role='Identifier']") itPy = ctxPy.filter("//*[@role='Identifier']") for nodeC, nodePy in zip(itC, itPy): self.assertEqual(nodeC.get(), nodePy) def testBinaryEncodeDecodePythonContext(self) -> None: # Binary encoding should be invertible # C++ memory context ctxC = self._parse_fixture() # Python memory context pyDict = ctxC.root.get() ctxPy = bblfsh.context(pyDict) encoded = ctxPy.encode(fmt = 0) # Binary encoding decoded = decode(encoded, format = 0) self.assertEqual(pyDict, decoded.load()) def 
testInvalidDecodeBytes(self) -> None: with self.assertRaises(RuntimeError): decode(b'', format = 0) with self.assertRaises(RuntimeError): decode(b'abcdef', format = 0)
class BblfshTests(unittest.TestCase): BBLFSH_SERVER_EXISTED = None @classmethod def setUpClass(cls): cls.BBLFSH_SERVER_EXISTED = ensure_bblfsh_is_running() @classmethod def tearDownClass(cls): if not cls.BBLFSH_SERVER_EXISTED: client = docker.from_env(version="auto") client.containers.get("bblfshd").remove(force=True) client.api.close() def setUp(self): self.client = BblfshClient("0.0.0.0:9432") def testVersion(self): version = self.client.version() self.assertTrue(hasattr(version, "version")) self.assertTrue(version.version) self.assertTrue(hasattr(version, "build")) self.assertTrue(version.build) # def testNativeParse(self): # reply = self.client.native_parse(__file__) # assert(reply.ast) # def testNonUTF8ParseError(self): self.assertRaises(NonUTF8ContentException, self.client.parse, "", "Python", b"a = '\x80abc'") # def testUASTDefaultLanguage(self): self._validate_ctx(self.client.parse(__file__)) def testUASTPython(self): ctx = self.client.parse(__file__, language="Python") self._validate_ctx(ctx) self.assertEqual(ctx.language, "python") def testUASTFileContents(self): with open(__file__, "rb") as fin: contents = fin.read() ctx = self.client.parse("file.py", contents=contents) self._validate_ctx(ctx) self._validate_filter(ctx) # # def testBrokenFilter(self): # self.assertRaises(RuntimeError, filter, 0, "foo") # # def testFilterInternalType(self): # node = Node() # node.internal_type = 'a' # self.assertTrue(any(filter(node, "//a"))) # self.assertFalse(any(filter(node, "//b"))) # # def testFilterToken(self): # node = Node() # node.token = 'a' # self.assertTrue(any(filter(node, "//*[@token='a']"))) # self.assertFalse(any(filter(node, "//*[@token='b']"))) # # def testFilterRoles(self): # node = Node() # node.roles.append(1) # self.assertTrue(any(filter(node, "//*[@roleIdentifier]"))) # self.assertFalse(any(filter(node, "//*[@roleQualified]"))) # # def testFilterProperties(self): # node = Node() # node.properties['k1'] = 'v2' # node.properties['k2'] = 'v1' # self.assertTrue(any(filter(node, "//*[@k2='v1']"))) # self.assertTrue(any(filter(node, "//*[@k1='v2']"))) # self.assertFalse(any(filter(node, "//*[@k1='v1']"))) # # def testFilterStartOffset(self): # node = Node() # node.start_position.offset = 100 # self.assertTrue(any(filter(node, "//*[@startOffset=100]"))) # self.assertFalse(any(filter(node, "//*[@startOffset=10]"))) # # def testFilterStartLine(self): # node = Node() # node.start_position.line = 10 # self.assertTrue(any(filter(node, "//*[@startLine=10]"))) # self.assertFalse(any(filter(node, "//*[@startLine=100]"))) # # def testFilterStartCol(self): # node = Node() # node.start_position.col = 50 # self.assertTrue(any(filter(node, "//*[@startCol=50]"))) # self.assertFalse(any(filter(node, "//*[@startCol=5]"))) # # def testFilterEndOffset(self): # node = Node() # node.end_position.offset = 100 # self.assertTrue(any(filter(node, "//*[@endOffset=100]"))) # self.assertFalse(any(filter(node, "//*[@endOffset=10]"))) # # def testFilterEndLine(self): # node = Node() # node.end_position.line = 10 # self.assertTrue(any(filter(node, "//*[@endLine=10]"))) # self.assertFalse(any(filter(node, "//*[@endLine=100]"))) # # def testFilterEndCol(self): # node = Node() # node.end_position.col = 50 # self.assertTrue(any(filter(node, "//*[@endCol=50]"))) # self.assertFalse(any(filter(node, "//*[@endCol=5]"))) # # def testFilterBool(self): # node = Node() # self.assertTrue(filter_bool(node, "boolean(//*[@startOffset or @endOffset])")) # self.assertFalse(filter_bool(node, "boolean(//*[@blah])")) # # def 
testFilterNumber(self): # node = Node() # node.children.extend([Node(), Node(), Node()]) # self.assertEqual(int(filter_number(node, "count(//*)")), 4) # # def testFilterString(self): # node = Node() # node.internal_type = "test" # self.assertEqual(filter_string(node, "name(//*[1])"), "test") # # def testFilterBadQuery(self): # node = Node() # self.assertRaises(RuntimeError, filter, node, "//*roleModule") # # def testFilterBadType(self): # node = Node() # node.end_position.col = 50 # self.assertRaises(RuntimeError, filter, node, "boolean(//*[@startPosition or @endPosition])") # # def testRoleIdName(self): # self.assertEqual(role_id(role_name(1)), 1) # self.assertEqual(role_name(role_id("IDENTIFIER")), "IDENTIFIER") # # def _itTestTree(self): # root = Node() # root.internal_type = 'root' # root.start_position.offset = 0 # root.start_position.line = 0 # root.start_position.col = 1 # # son1 = Node() # son1.internal_type = 'son1' # son1.start_position.offset = 1 # # son1_1 = Node() # son1_1.internal_type = 'son1_1' # son1_1.start_position.offset = 10 # # son1_2 = Node() # son1_2.internal_type = 'son1_2' # son1_2.start_position.offset = 10 # # son1.children.extend([son1_1, son1_2]) # # son2 = Node() # son2.internal_type = 'son2' # son2.start_position.offset = 100 # # son2_1 = Node() # son2_1.internal_type = 'son2_1' # son2_1.start_position.offset = 5 # # son2_2 = Node() # son2_2.internal_type = 'son2_2' # son2_2.start_position.offset = 15 # # son2.children.extend([son2_1, son2_2]) # root.children.extend([son1, son2]) # # return root # # def testIteratorPreOrder(self): # root = self._itTestTree() # it = iterator(root, TreeOrder.PRE_ORDER) # self.assertIsNotNone(it) # expanded = [node.internal_type for node in it] # self.assertListEqual(expanded, ['root', 'son1', 'son1_1', 'son1_2', # 'son2', 'son2_1', 'son2_2']) # # def testIteratorPostOrder(self): # root = self._itTestTree() # it = iterator(root, TreeOrder.POST_ORDER) # self.assertIsNotNone(it) # expanded = [node.internal_type for node in it] # self.assertListEqual(expanded, ['son1_1', 'son1_2', 'son1', 'son2_1', # 'son2_2', 'son2', 'root']) # # def testIteratorLevelOrder(self): # root = self._itTestTree() # it = iterator(root, TreeOrder.LEVEL_ORDER) # self.assertIsNotNone(it) # expanded = [node.internal_type for node in it] # self.assertListEqual(expanded, ['root', 'son1', 'son2', 'son1_1', # 'son1_2', 'son2_1', 'son2_2']) # # def testIteratorPositionOrder(self): # root = self._itTestTree() # it = iterator(root, TreeOrder.POSITION_ORDER) # self.assertIsNotNone(it) # expanded = [node.internal_type for node in it] # self.assertListEqual(expanded, ['root', 'son1', 'son2_1', 'son1_1', # 'son1_2', 'son2_2', 'son2']) # def _validate_ctx(self, ctx): import bblfsh self.assertIsNotNone(ctx) self.assertIsInstance(ctx, bblfsh.result_context.ResultContext) self.assertIsInstance(ctx.uast, bytes) # def testFilterInsideIter(self): # root = self.client.parse(__file__).uast # it = iterator(root, TreeOrder.PRE_ORDER) # self.assertIsNotNone(it) # for n in it: # filter(n, "//*[@roleIdentifier]") # # def testItersMixingIterations(self): # root = self.client.parse(__file__).uast # it = iterator(root, TreeOrder.PRE_ORDER) # next(it); next(it); next(it) # n = next(it) # it2 = iterator(n, TreeOrder.PRE_ORDER) # next(it2) # assert(next(it) == next(it2)) # # def testManyFilters(self): # root = self.client.parse(__file__).uast # root.properties['k1'] = 'v2' # root.properties['k2'] = 'v1' # # before = resource.getrusage(resource.RUSAGE_SELF) # for _ in range(500): # 
filter(root, "//*[@roleIdentifier]") # # after = resource.getrusage(resource.RUSAGE_SELF) # # # Check that memory usage has not doubled after running the filter # self.assertLess(after[2] / before[2], 2.0) # # def testManyParses(self): # before = resource.getrusage(resource.RUSAGE_SELF) # for _ in range(100): # root = self.client.parse(__file__).uast # root.properties['k1'] = 'v2' # root.properties['k2'] = 'v1' # # after = resource.getrusage(resource.RUSAGE_SELF) # # # Check that memory usage has not doubled after running the parse+filter # self.assertLess(after[2] / before[2], 2.0) # # def testManyParsersAndFilters(self): # before = resource.getrusage(resource.RUSAGE_SELF) # for _ in range(100): # root = self.client.parse(__file__).uast # root.properties['k1'] = 'v2' # root.properties['k2'] = 'v1' # # filter(root, "//*[@roleIdentifier]") # # after = resource.getrusage(resource.RUSAGE_SELF) # # # Check that memory usage has not doubled after running the parse+filter # self.assertLess(after[2] / before[2], 2.0) # # def testSupportedLanguages(self): # res = self.client.supported_languages() # self.assertGreater(len(res), 0) # for l in res: # for key in ('language', 'version', 'status', 'features'): # print(key) # self.assertTrue(hasattr(l, key)) # self.assertIsNotNone(getattr(l, key)) def _validate_filter(self, ctx): def assert_strnode(n: Node, expected: str) -> None: self.assertEqual(n.get(), expected) self.assertIsInstance(n.get_str(), str) self.assertEqual(n.get_str(), expected) # print(ctx) it = ctx.filter( "//uast:RuntimeImport/Path/uast:Alias/Name/uast:Identifier/Name") self.assertIsInstance(it, NodeIterator) # wtf = next(it) # print(type(wtf)) # print(wtf) assert_strnode(next(it), "os") assert_strnode(next(it), "resource") assert_strnode(next(it), "unittest") assert_strnode(next(it), "docker") assert_strnode(next(it), "bblfsh") self.assertRaises(StopIteration, next(it))