def test_extract_features_all_lines(self):
    file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
    files = [file, file]
    self.check_X_y(*self.extractor.extract_features(
        files, [list(range(1, self.contents.count("\n") + 1))] * 2))
def prepare_files(filenames: Iterable[str], client: BblfshClient,
                  language: str) -> Iterable[File]:
    """
    Prepare the given files for analysis by extracting UASTs and creating the gRPC wrappers.

    :param filenames: List of paths to files to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: List of File-s with content, uast, path and language set.
    """
    files = []
    for file in tqdm(filter_filepaths(list(filenames))):
        try:
            res = client.parse(file)
        except NonUTF8ContentException:
            # skip files that can't be parsed because of UTF-8 decoding errors.
            continue
        if res.status == 0 and res.language.lower() == language.lower():
            uast = res.uast
            path = file
            with open(file) as f:
                content = f.read().encode("utf-8")
            files.append(File(content=content, uast=uast, path=path,
                              language=res.language.lower()))
    return files
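# Hedged usage sketch for prepare_files, not part of the original code: it
# assumes a Babelfish server listening on the default port and an existing
# "src/" tree of JavaScript files (both are illustrative).
from glob import glob

from bblfsh import BblfshClient


def example_prepare_files():
    client = BblfshClient("0.0.0.0:9432")
    files = prepare_files(glob("src/**/*.js", recursive=True), client,
                          "javascript")
    for file in files:
        print(file.path, file.language)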
def setUpClass(cls):
    config = FormatAnalyzer._load_train_config(merge_dicts(
        get_train_config(),
        {"javascript": {
            "feature_extractor": {
                "left_siblings_window": 1,
                "right_siblings_window": 1,
                "parents_depth": 1,
                "node_features": ["start_line", "reserved", "roles"],
            },
        }}))
    base = Path(__file__).parent
    with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    file = File(content=bytes(contents, "utf-8"), uast=uast)
    files = [file, file]
    cls.fe = FeatureExtractor(language="javascript",
                              **config["javascript"]["feature_extractor"])
    cls.fe.extract_features(files)
    cls.class_representations = cls.fe.composite_class_representations
    cls.n_classes = len(cls.fe.labels_to_class_sequences)
    cls.ordinal = cls.return_node_feature(FeatureId.start_line)
    cls.categorical = cls.return_node_feature(FeatureId.reserved)
    cls.bag = cls.return_node_feature(FeatureId.roles)
def run(self, ptr_from: ReferencePointer, data_service_head: DataService,
        data_service_base: Optional[DataService] = None) -> Iterable[FileFix]:
    """
    Run `generate_file_fixes` for all files in the `ptr_from` revision.

    :param ptr_from: Git repository state pointer to the base revision.
    :param data_service_head: Connection to the Lookout data retrieval service to get \
                              the new files.
    :param data_service_base: Connection to the Lookout data retrieval service to get \
                              the initial files. If it is None, we assume the empty contents.
    :return: Generator of fixes for each file.
    """
    files_head = list(request_files(
        data_service_head.get_data(), ptr_from, contents=True, uast=True,
        unicode=True))
    if data_service_base is not None:
        files_base = list(request_files(
            data_service_base.get_data(), ptr_from, contents=True, uast=True,
            unicode=True))
    else:
        files_base = [File(path=f.path) for f in files_head]
    return self.generate_file_fixes(
        data_service_head,
        [self.Changes(f1, f2) for f1, f2 in zip(files_base, files_head)])
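# Hedged sketch of driving this run method, not part of the original code:
# "analyzer" stands for an already-constructed analyzer instance and the two
# DataService connections are assumed to be provided by the Lookout SDK
# runtime; the repository URL, ref and commit below are illustrative.
def example_run(analyzer, data_service_head, data_service_base):
    ptr_from = ReferencePointer("https://github.com/user/repo",
                                "refs/heads/master", "<base commit hash>")
    for fix in analyzer.run(ptr_from, data_service_head, data_service_base):
        print(fix)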
def generate_local_test(mcs, case_name, uast, contents):
    fe_config = FormatAnalyzer._load_config(get_config())["train"]["javascript"]
    feature_extractor = FeatureExtractor(language="javascript",
                                         label_composites=label_composites,
                                         **fe_config["feature_extractor"])
    file = File(content=bytes(contents, "utf-8"), uast=uast)
    _, _, (vnodes_y, _, _, _) = feature_extractor.extract_features([file])
    offsets, y_pred, result = cases[case_name]

    def _test(self):
        y_cur = deepcopy(self.y)
        for offset, yi in zip(offsets, y_pred):
            i = None
            for i, vnode in enumerate(vnodes_y):  # noqa: B007
                if offset == vnode.start.offset:
                    break
            y_cur[i] = yi
        code_generator = CodeGenerator(self.feature_extractor)
        pred_vnodes = code_generator.apply_predicted_y(
            self.vnodes, self.vnodes_y, list(range(len(self.vnodes_y))),
            FakeRules(y_cur))
        generated_file = code_generator.generate(pred_vnodes)
        self.assertEqual(generated_file, result)

    return _test
def test_extract_features(self):
    file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
    files = [file, file]
    res = self.extractor.extract_features(files)
    self.assertIsNotNone(res, "Failed to parse files.")
    self.check_X_y(*res)
def test_filter_files_by_overall_size(self):
    files = {"one.py": File(content=b"hello"),
             "two.py": File(content=b"world" * 100)}

    def getter(key):
        return files[key].content

    filtered = list(filter_files_by_overall_size(files.keys(), getter, 1000000))
    self.assertEqual(len(filtered), 2)
    filtered = list(filter_files_by_overall_size(files.keys(), getter, 1))
    self.assertEqual(len(filtered), 0)
    filtered = list(filter_files_by_overall_size(files.keys(), getter, 5 * 100))
    self.assertEqual(len(filtered), 1)
def test_filter_files(self):
    files = [File(path="one", content=b"hello"),
             File(path="two", content=b"world" * 100)]
    files = {file.path: file for file in files}
    logged = False

    class Log:
        def debug(self, *args, **kwargs):
            nonlocal logged
            logged = True

    # Create the client before the try block so that the finally clause
    # never references an unbound name.
    bblfsh_client = BblfshClient("0.0.0.0:9432")
    try:
        filtered = filter_files(files=files, line_length_limit=80,
                                overall_size_limit=5 << 20, log=Log())
        self.assertEqual(len(filtered), 1)
        self.assertEqual(filtered[0].content, b"hello")
        self.assertTrue(logged)
    finally:
        bblfsh_client._channel.close()
def test_files_by_language(self): file_stats = {"js": 2, "ruby": 7, "Python": 5} files = [] for language, n_files in file_stats.items(): for i in range(n_files): files.append(File(language=language, uast=Node(children=[Node()]), path=language + str(i))) result = files_by_language(files) self.assertEqual([("python", 5), ("js", 2), ("ruby", 7)], [(k, len(v)) for k, v in result.items()]) return result
def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
    uast = client.parse(filename="", language="javascript",
                        contents=code.encode()).uast
    extractor = FeatureExtractor(language="javascript", **config)
    result = extractor.extract_features(
        [File(content=code.encode(), uast=uast, path="")])
    if result is None:
        self.fail("Could not parse test code.")
    _, _, (vnodes_y, _, _, _) = result
    return [vnode.y for vnode in vnodes_y]
def setUpClass(cls): base = Path(__file__).parent # str() is needed for Python 3.5 with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin: contents = fin.read() with lzma.open(str(base / "benchmark.uast.xz")) as fin: uast = bblfsh.Node.FromString(fin.read()) file = File(content=bytes(contents, "utf-8"), uast=uast) cls.files = [file] config = FormatAnalyzer._load_config(get_config())["train"] cls.extractor = FeatureExtractor( language="javascript", **config["javascript"]["feature_extractor"])
def setUpClass(cls): base = Path(__file__).parent # str() is needed for Python 3.5 with lzma.open(str(base / "benchmark.js.xz"), mode="rt") as fin: contents = fin.read() with lzma.open(str(base / "benchmark.uast.xz")) as fin: uast = bblfsh.Node.FromString(fin.read()) file = File(content=bytes(contents, "utf-8"), uast=uast) cls.files = [file] cls.extractor = FeatureExtractor("javascript", parents_depth=2, siblings_window=5)
def test_vnode_positions(self):
    test_js_code_filepath = Path(__file__).parent / "jquery.layout.js"
    with open(str(test_js_code_filepath), mode="rb") as f:
        code = f.read()
    uast = bblfsh.BblfshClient("0.0.0.0:9432").parse(
        filename="", language="javascript", contents=code).uast
    file = BytesToUnicodeConverter.convert_file(
        File(content=code, uast=uast, language="javascript", path="test.js"))
    annotated_data = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_data)
    # This should simply not raise.
    self.extractor._classify_vnodes(annotated_data)
def test_find_deleted_lines(self):
    text_base = """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit.
    Maecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
    Vivamus euismod lorem viverra semper dictum.
    Nam consectetur enim eget elementum mattis.
    Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
    Etiam vitae nisi at ante pretium lacinia et eu massa."""
    base_lines_number = text_base.count("\n") + 1
    # Delete the first line
    new_line_indices = find_deleted_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes("\n".join(text_base.split("\n")[1:]), "utf-8")))
    self.assertEqual(new_line_indices, [1])
    # Delete the first two lines
    new_line_indices = find_deleted_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes("\n".join(text_base.split("\n")[2:]), "utf-8")))
    self.assertEqual(new_line_indices, [1])
    # Delete the last line
    new_line_indices = find_deleted_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes("\n".join(text_base.split("\n")[:-1]), "utf-8")))
    self.assertEqual(new_line_indices, [base_lines_number - 1])
    # Delete the last two lines
    new_line_indices = find_deleted_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes("\n".join(text_base.split("\n")[:-2]), "utf-8")))
    self.assertEqual(new_line_indices, [base_lines_number - 2])
    # Delete a line in the middle
    middle = 3
    text_head = text_base.split("\n")
    text_head.pop(middle)
    text_head = "\n".join(text_head)
    new_line_indices = find_deleted_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes(text_head, "utf-8")))
    self.assertEqual(new_line_indices, [middle, middle + 1])
def test_extract_features_some_lines(self):
    file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
    files = [file]
    X1, y1, vn1 = self.extractor.extract_features(
        files, [list(range(1, self.contents.count("\n") // 2 + 1))] * 2)
    self.check_X_y(X1, y1, vn1)
    X2, y2, vn2 = self.extractor.extract_features(files)
    self.assertTrue((X1 == X2[:len(X1)]).all())
    self.assertTrue((y1 == y2[:len(y1)]).all())
    self.assertTrue(vn1 == vn2[:len(vn1)])
    self.assertLess(len(y1), len(y2))
def setUpClass(cls): logging.basicConfig(level=logging.INFO) logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG) base = Path(__file__).parent # str() is needed for Python 3.5 cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432") with lzma.open(str(base / "test_base_file.js.xz")) as fin: contents = fin.read() uast = cls.bblfsh_client.parse("test_base_file.js", contents=contents).uast cls.base_files = [ File(path="test_file.js", content=contents, uast=uast, language="Javascript") ] with lzma.open(str(base / "test_head_file.js.xz")) as fin: contents = b"var print_tipe = 0;\n" + fin.read() uast = cls.bblfsh_client.parse("test_head_file.js", contents=contents).uast cls.head_files = [ File(path="test_file.js", content=contents, uast=uast, language="Javascript") ] cls.ptr = ReferencePointer("someurl", "someref", "somecommit") cls.config = { "model": MODEL_PATH, "confidence_threshold": 0.0, "n_candidates": 3, "check_all_identifiers": True, "analyze": { "filepath": cls.base_files[0].path, "wrong_id": "print_tipe", "line": 0 } }
def setUpClass(cls): logging.basicConfig(level=logging.INFO) logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG) base = Path(__file__).parent # str() is needed for Python 3.5 client = bblfsh.BblfshClient("0.0.0.0:9432") with lzma.open(str(base / "test_base_file.py.xz")) as fin: uast = client.parse("test_base_file.py", contents=fin.read()).uast cls.base_files = [ File(path="test_base_file.py", content=fin.read(), uast=uast, language="Python") ] with lzma.open(str(base / "test_head_file.py.xz")) as fin: uast = client.parse("test_head_file.py", contents=fin.read()).uast cls.head_files = [ File(path="test_head_file.py", content=fin.read(), uast=uast, language="Python") ] cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
def test_find_modified_lines(self):
    text_base = """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit.
    Maecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
    Vivamus euismod lorem viverra semper dictum.
    Nam consectetur enim eget elementum mattis.
    Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
    Etiam vitae nisi at ante pretium lacinia et eu massa."""
    # inserted lines: 3 and 6 (counting from 1 with a new line at the start)
    # modified line: 4
    text_head = """
    Lorem ipsum dolor sit amet, consectetur adipiscing elit.
    Curabitur congue libero vitae quam venenatis, tristique commodo diam lacinia.
    Mecenas volutpat dui id ipsum cursus, sit amet accumsan nisl ornare.
    Vivamus euismod lorem viverra semper dictum.
    Praesent eu ipsum sit amet elit aliquam laoreet.
    Nam consectetur enim eget elementum mattis.
    Ut condimentum metus vehicula tellus tempus, vel ultricies lectus dapibus.
    Etiam vitae nisi at ante pretium lacinia et eu massa."""
    new_line_indices = find_new_lines(
        File(content=bytes(text_base, "utf-8")),
        File(content=bytes(text_head, "utf-8")))
    self.assertEqual(new_line_indices, [3, 4, 6])
def test_files_by_language(self): file_stats = {"js": 2, "Python": 5, "ruby": 7} files = [] for language, n_files in file_stats.items(): for i in range(n_files): files.append( File(language=language, uast=self.uast, path=str(i))) result = FormatAnalyzer._files_by_language(files) self.assertEqual({ "js": 2, "python": 5, "ruby": 7 }, {k: len(v) for k, v in result.items()}) return result
def test_extract_features_some_lines(self): file = File(content=bytes(self.contents, "utf-8"), uast=self.uast) files = [file] X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents) = self.extractor.extract_features( files, [list(range(1, self.contents.count("\n") // 2 + 1))] * 2) self.check_X_y(X1_csr, y1, (vn1_y, vn1, vn1_parents, n1_parents)) X2_csr, y2, (vn2_y, vn2, _, _) = self.extractor.extract_features(files) X1, X2 = X1_csr.toarray(), X2_csr.toarray() self.assertTrue((X1 == X2[:len(X1)]).all()) self.assertTrue((y1 == y2[:len(y1)]).all()) self.assertTrue(vn1_y == vn2_y[:len(vn1_y)]) self.assertLess(len(y1), len(y2))
def setUpClass(cls):
    cls.maxDiff = None
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark_small.js.xz"), mode="rt") as fin:
        contents = fin.read()
    with lzma.open(str(base / "benchmark_small.js.uast.xz")) as fin:
        uast = bblfsh.Node.FromString(fin.read())
    config = FormatAnalyzer._load_train_config(get_train_config())
    fe_config = config["javascript"]
    cls.feature_extractor = FeatureExtractor(
        language="javascript", label_composites=label_composites,
        **fe_config["feature_extractor"])
    cls.file = File(content=bytes(contents, "utf-8"), uast=uast)
    cls.X, cls.y, (cls.vnodes_y, cls.vnodes, cls.vnode_parents,
                   cls.node_parents) = \
        cls.feature_extractor.extract_features([cls.file])
def get_files_from_tar(tar_path: str) -> Dict[str, File]:
    # Each source file is stored together with a "<name>.uast" sibling that
    # holds its serialized UAST; collect both halves before building File-s.
    files = defaultdict(lambda: [None, None])
    with tarfile.open(tar_path) as tar:
        for member in tar:
            name = member.name
            if name == ".":
                continue
            file = tar.extractfile(member)
            is_uast = name.endswith(".uast")
            content = file.read()
            if is_uast:
                name = name[:-5]
                content = bblfsh.Node.FromString(content)
            # bool indexes the pair: slot 0 is the content, slot 1 is the UAST.
            files[name][is_uast] = content
    for key, (content, uast) in files.items():
        files[key] = File(path=key, content=content, uast=uast,
                          language="JavaScript")
    return files
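# Hedged sketch of the tar layout get_files_from_tar expects, not part of the
# original code: every "<name>" member is paired with a "<name>.uast" member
# holding a serialized bblfsh Node; the archive name below is illustrative.
def example_get_files_from_tar():
    files = get_files_from_tar("test_files.tar.xz")
    for path, file in files.items():
        print(path, file.language, len(file.content))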
def prepare_file(filename: str, client: BblfshClient, language: str) -> File:
    """
    Prepare the given file for analysis by extracting UAST and creating the gRPC wrapper.

    :param filename: Path to the file to analyze.
    :param client: Babelfish client. Babelfish server should be started accordingly.
    :param language: Language to consider. Will discard the other languages.
    :return: File with content, uast and path set.
    """
    assert os.path.isfile(filename), "\"%s\" should be a file" % filename
    res = client.parse(filename, language)
    assert res.status == 0, "Parse returned status %s for file %s" % (
        res.status, filename)
    error_log = "Language for %s should be %s instead of %s"
    assert res.language.lower() == language.lower(), error_log % (
        filename, language, res.language)
    with open(filename) as f:
        content = f.read().encode("utf-8")
    return File(content=content, uast=res.uast, path=filename)
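# Hedged usage sketch for prepare_file, not part of the original code: it
# assumes a running Babelfish server and an existing "index.js"; both the
# address and the path are illustrative.
def example_prepare_file():
    client = BblfshClient("0.0.0.0:9432")
    file = prepare_file("index.js", client, "javascript")
    print(file.path, len(file.content))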
def test_multiple_files(self):
    data = [
        ("var a = 0", {1: (CLS_NOOP,)}),
        ("var b = 123", {4: (CLS_NOOP,)}),
    ]
    files = []
    for i, (code, _) in enumerate(data):
        uast, errors = parse_uast(self.stub, code, filename="",
                                  language=self.language)
        if errors:
            self.fail("Could not parse the testing code.")
        files.append(File(content=code.encode(), uast=uast,
                          path="test_file_%d" % i))
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
        self.fe.extract_features(files)
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    for _, modif in data:
        for i in modif:
            y_pred[i] = self._to_label(modif[i])
    checker = UASTStabilityChecker(self.fe)
    new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, files, self.stub, vnode_parents,
        node_parents, rule_winners, grouped_quote_predictions={})
    self.assertEqual(list(safe_preds), [0, 2, 3, 4, 5, 6, 7, 8])
def run(self, ptr: ReferencePointer, data_service: DataService) -> Iterable[TypoFix]:
    """
    Run `generate_typos_fixes` for all lines and all files in the `ptr` revision.

    :param ptr: Git repository state pointer to the revision that should be analyzed.
    :param data_service: Connection to the Lookout data retrieval service to get the files.
    :return: Generator of fixes for each file.
    """
    for file in request_files(data_service.get_data(), ptr, contents=True,
                              uast=True, unicode=False):
        if file.path == self.config["filepath_to_analyze"]:
            break
    else:
        raise ValueError("No such file %s in %s" % (
            self.config["filepath_to_analyze"], ptr))
    typos_fixes = list(self.generate_typos_fixes([
        UnicodeChange(head=file,
                      base=File(path=file.path, language=file.language))]))
    if typos_fixes:
        return typos_fixes
    identifiers_number = len(self._get_identifiers(file.uast, []))
    if not identifiers_number:
        raise ValueError("No identifiers for file %s in %s" % (
            self.config["filepath_to_analyze"], ptr))
    return [TypoFix(content=file.content.decode("utf-8", "replace"),
                    path=file.path, line_number=0, identifier="",
                    candidates=[], identifiers_number=identifiers_number)]
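# Hedged sketch of invoking this run method, not part of the original code:
# "analyzer" is assumed to be an instance configured with
# "filepath_to_analyze", and the pointer fields below are illustrative.
def example_typo_run(analyzer, data_service):
    ptr = ReferencePointer("someurl", "someref", "somecommit")
    for typo_fix in analyzer.run(ptr, data_service):
        print(typo_fix.path, typo_fix.identifier, typo_fix.candidates)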
def edit_and_test(self, code: str, modifs: Mapping[int, Sequence[str]], *,
                  quote_indices: Optional[Tuple[int, ...]] = None,
                  bad_indices: Optional[FrozenSet[int]] = None) -> None:
    uast, errors = parse_uast(self.stub, code, filename="",
                              language=self.language)
    if errors:
        self.fail("Could not parse the testing code.")
    file = File(content=code.encode(), uast=uast, path="test_file")
    X, y, (vnodes_y, vnodes, vnode_parents, node_parents) = \
        self.fe.extract_features([file])
    y_pred = y.copy()
    rule_winners = numpy.zeros(y.shape)
    for index, classes in modifs.items():
        y_pred[index] = self._to_label(classes)
    checker = UASTStabilityChecker(self.fe)
    grouped_quote_predictions = self._grouped_predictions_mapping(
        vnodes, quote_indices)
    new_y, new_y_pred, new_vnodes_y, new_rule_winners, safe_preds = checker.check(
        y, y_pred, vnodes_y, vnodes, [file], self.stub, vnode_parents,
        node_parents, rule_winners,
        grouped_quote_predictions=grouped_quote_predictions)
    bad_preds = set(range(y.shape[0])) - set(safe_preds)
    bad = modifs.keys() if bad_indices is None else bad_indices
    self.assertEqual(bad_preds, bad)
    self.assertEqual(len(y) - len(bad), len(new_y))
    self.assertEqual(len(y_pred) - len(bad), len(new_y_pred))
    self.assertEqual(len(vnodes_y) - len(bad), len(new_vnodes_y))
    self.assertEqual(len(rule_winners) - len(bad), len(new_rule_winners))
def test_extended_roles(self):
    file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
    X, _, vns = self.extractor.extract_features([file])
    # last columns are only roles
    last_columns = self.extractor.parents_depth + self.extractor.siblings_window
    self.assertGreater(
        numpy.count_nonzero(X[:, -last_columns:] > len(ROLE_INDEX)), 0)
    col_role_left_sibling = (
        self.extractor.count_features(FeatureType.node) +
        self.extractor.count_features(FeatureType.left_siblings) - 1)

    def get_ext_role(role_index):
        return RESERVED[role_index - len(ROLE_INDEX)]

    for i, (x, vn) in enumerate(zip(X, vns)):
        start = vn.start.offset
        # Don't test the first two nodes, they might not have a left sibling
        if i < 2:
            continue
        role_index_left = x[col_role_left_sibling]
        if role_index_left >= len(ROLE_INDEX):
            role_left = get_ext_role(role_index_left)
            self.assertEqual(self.contents[start - len(role_left):start],
                             role_left)
def test_extract_features_exact_match(self):
    file = File(content=bytes(self.contents, "utf-8"), uast=self.uast)
    files = [file]
    X, y, (vnodes_y, vnodes, _, _) = self.extractor.extract_features(files)
    self.assertEqual("".join(vnode.value for vnode in vnodes), self.contents)
def return_features() -> Response: """Featurize the given code.""" body = request.get_json() code = body["code"] babelfish_address = body["babelfish_address"] language = body["language"] client = BblfshClient(babelfish_address) res = client.parse(filename="", contents=code.encode(), language=language) if res.status != 0: abort(500) model = FormatModel().load( str(Path(__file__).parent / "models" / "model.asdf")) if language not in model: raise NotFittedError() rules = model[language] file = File(content=code.encode(), uast=res.uast, language="javascript") config = rules.origin_config["feature_extractor"] config["return_sibling_indices"] = True fe = FeatureExtractor(language=language, **config) res = fe.extract_features([file]) if res is None: abort(500) X, y, (vnodes_y, vnodes, vnode_parents, node_parents, sibling_indices) = res y_pred, rule_winners, rules, grouped_quote_predictions = rules.predict( X=X, vnodes_y=vnodes_y, vnodes=vnodes, feature_extractor=fe) refuse_to_predict = y_pred < 0 _, _, _, _, safe_preds = filter_uast_breaking_preds( y=y, y_pred=y_pred, vnodes_y=vnodes_y, vnodes=vnodes, files={file.path: file}, feature_extractor=fe, stub=client._stub, vnode_parents=vnode_parents, node_parents=node_parents, rule_winners=rule_winners, grouped_quote_predictions=grouped_quote_predictions) break_uast = [False] * X.shape[0] for wrong_pred in set(range(X.shape[0])).difference(safe_preds): break_uast[wrong_pred] = True labeled_indices = {id(vnode): i for i, vnode in enumerate(vnodes_y)} app.logger.info("returning features of shape %d, %d" % X.shape) app.logger.info("length of rules: %d", len(rules)) return jsonify({ "code": code, "features": _input_matrix_to_descriptions(X, fe), "ground_truths": y.tolist(), "predictions": y_pred.tolist(), "refuse_to_predict": refuse_to_predict.tolist(), "sibling_indices": sibling_indices, "rules": _rules_to_jsonable(rules, fe), "winners": rule_winners.tolist(), "break_uast": break_uast, "feature_names": fe.feature_names, "class_representations": fe.composite_class_representations, "class_printables": fe.composite_class_printables, "vnodes": list( map(partial(_vnode_to_jsonable, labeled_indices=labeled_indices), vnodes)), "config": _mapping_to_jsonable(rules.origin_config) })
def create_files():
    files = [File(path="one", content=b"hello"),
             File(path="two", content=b"world" * 100)] * 1000
    files = random.sample(files, k=len(files))  # note: no need to set the seed
    return {file.path: file for file in files}