def test_positions(self):
    """Check that consecutive parsed nodes never go backwards in start line or start offset."""
    js_path = Path(__file__).parent / "browser-policy-content.js"
    with open(str(js_path), mode="rb") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    response = client.parse(filename="", language="javascript", contents=code)
    uast = response.uast

    # Convert both the raw bytes and the UAST with the same converter instance.
    converter = BytesToUnicodeConverter(code)
    code_uni = converter.convert_content()
    uast_uni = converter.convert_uast(uast)
    file = UnicodeFile(content=code_uni, uast=uast_uni, language="javascript",
                       path="test.js")
    annotated_data = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_data)
    nodes, _ = file_to_old_parse_file_format(annotated_data)

    # Compare every node with its immediate successor.
    neighbours = zip(nodes, nodes[1:])
    for index, (previous, current) in enumerate(neighbours):
        pair = (index, index + 1)
        self.assertLessEqual(
            previous.start.line, current.start.line,
            "Start line position decrease for %d, %d nodes" % pair)
        self.assertLessEqual(
            previous.start.offset, current.start.offset,
            "Start offset position decrease for %d, %d nodes" % pair)
def analyze_files(
        analyzer_type: Type[FormatAnalyzer],
        config: dict,
        model_path: str,
        language: str,
        bblfsh_addr: str,
        input_pattern: str,
        log: logging.Logger,
) -> List[Comment]:
    """
    Load a trained model, parse the files matching a glob pattern and analyze them.

    :param analyzer_type: Analyzer class to instantiate with the loaded model.
    :param config: Configuration passed to the analyzer constructor.
    :param model_path: Path the `FormatModel` is loaded from.
    :param language: Key of the rules to use inside the model.
    :param bblfsh_addr: Address of the bblfsh server.
    :param input_pattern: Glob pattern (expanded recursively) selecting the input files.
    :param log: Logger used for the summary messages.
    :return: Whatever the analyzer's `analyze()` call returns.
    :raise NotFittedError: If the model holds no rules for `language`.
    """
    class FakePointer:
        """Stand-in pointer object whose protobuf form is always None."""

        def to_pb(self):
            return None

    model = FormatModel().load(model_path)
    if language not in model:
        raise NotFittedError()
    rules = model[language]
    client = bblfsh.BblfshClient(bblfsh_addr)

    filepaths = glob.glob(input_pattern, recursive=True)
    files = parse_files(
        filepaths=filepaths,
        line_length_limit=rules.origin_config["line_length_limit"],
        overall_size_limit=rules.origin_config["overall_size_limit"],
        client=client,
        language=language,
        log=log)

    log.info("Model parameters: %s" % rules.origin_config)
    log.info("Rules stats: %s" % rules)
    log.info("Number of files: %s" % (len(files)))

    analyzer = analyzer_type(model, input_pattern, config)
    pointer = FakePointer()
    data_service = FakeDataService(client, files, [])
    return analyzer.analyze(pointer, None, data_service=data_service)
def setUpClass(cls):
    """Parse the compressed base and head JavaScript fixtures once for the whole class."""
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")

    def load_fixture(archive_name, file_name):
        # str() is needed for Python 3.5
        with lzma.open(str(base / archive_name)) as fin:
            contents = fin.read()
        uast = cls.bblfsh_client.parse(file_name, contents=contents).uast
        return [
            FakeFile(path=file_name, content=contents, uast=uast, language="Javascript"),
        ]

    cls.base_files = load_fixture("test_base_file.js.xz", "test_base_file.js")
    cls.head_files = load_fixture("test_head_file.js.xz", "test_head_file.js")
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
def analyze_code_file(path: str): nonlocal errors if errors: return try: try: client = clients.client except AttributeError: client = bblfsh.BblfshClient(args.bblfsh) clients.client = client response = client.parse(path) nonlocal language if not language: language = response.language elif language != response.language: log.warning("dropped %s - language mismatch %s != %s", path, language, response.language) return content = Path(path).read_text() analyze_uast(path, content, response.uast, internal_types, roles, reserved) except: # noqa: E722 log.exception("Parsing %s", path) errors = True finally: with progress_lock: progress.disable = False # this is needed, do not remove progress.update(1)
def __init__(self, language: str = "javascript", bblfsh_address: str = "0.0.0.0:9432"):
    """
    Construct a `CodeTokenizer`.

    :param language: Which language to extract features for.
    :param bblfsh_address: Address of bblfsh server. If None, no client is created \
                           and the UAST has to be provided by the caller.
    """
    self.language = language.lower()
    # Import everything related to the language. The `tokens` and `roles` modules are
    # mandatory: a missing one raises ImportError.
    package = "tokenizer.langs.%s" % language
    self.tokens = importlib.import_module(package + ".tokens")
    self.roles = importlib.import_module(package + ".roles")
    try:
        # The module path used to contain a double dot ("tokenizer..langs"), which made
        # this import fail for every language and silently disabled the unwrappers.
        self.token_unwrappers = importlib.import_module(
            package + ".token_unwrappers").TOKEN_UNWRAPPERS
    except ImportError:
        # It's normal for some languages not to have a token_unwrappers module.
        self.token_unwrappers = {}
    try:
        self.node_fixtures = importlib.import_module(
            package + ".uast_fixers").NODE_FIXTURES
    except ImportError:
        # It's normal for some languages not to have a uast_fixers module.
        self.node_fixtures = {}
    # Create instance of bblfsh client in case of bblfsh_address is not None.
    # If None - UAST has to be provided by client.
    if bblfsh_address is not None:
        self.client = bblfsh.BblfshClient(bblfsh_address)
def test_parse_file_exact_match(self):
    """Check that joining the values of all parsed nodes reproduces the source text."""
    archive_path = Path(__file__).parent / "for_parse_test.js.xz"
    with lzma.open(str(archive_path), mode="rt") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    response = client.parse(filename="", language="javascript", contents=code.encode())
    nodes, _parents = self.extractor._parse_file(code, response.uast, archive_path)
    rebuilt = "".join(node.value for node in nodes)
    self.assertEqual(rebuilt, code)
def setUpClass(cls):
    """Create the shared bblfsh client, fake data service and feature extractor config."""
    slogging_setup("DEBUG", False)
    cls.language = "javascript"
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
    cls.data_service = FakeDataService(cls.bblfsh_client, files=None, changes=None)
    cls.stub = cls.data_service.get_bblfsh()
    # Override only the label support cutoff; everything else comes from the loader.
    overrides = {
        "train": {"language_defaults": {"feature_extractor": {"cutoff_label_support": 0}}},
    }
    train_config = FormatAnalyzer._load_config(overrides)["train"]
    cls.config = train_config[cls.language]["feature_extractor"]
def setUp(self):
    """Parse the Python sample in annotated mode and build the random walk generator."""
    self.bblfsh = bblfsh.BblfshClient("localhost:9432")
    response = self.bblfsh.parse(models.SOURCE_PY, mode=bblfsh.Modes.ANNOTATED)
    self.uast = response.uast
    self.uast2walk = Uast2RandomWalks(
        p_explore_neighborhood=0.5,
        q_leave_neighborhood=0.5,
        n_walks=5,
        n_steps=19,
        node2index=FakeVocabulary(),
        seed=42,
    )
def test_extract_functions_from_uast(self):
    """Check that the "func" mode yields exactly three functions with the expected names."""
    uast = bblfsh.BblfshClient("localhost:9432").parse(MODER_FUNC).uast
    moder = Moder(mode="func")
    functions = list(moder.extract_functions_from_uast(uast))
    self.assertEqual(len(functions), 3)
    expected_names = ["func_a", "func_b", "func_c"]
    for function in functions:
        # The first element of each extracted item carries the name token.
        self.assertIn(function[0].token, expected_names)
def test_vnode_positions(self):
    """Smoke test: parsing and classifying the nodes of jquery.layout.js must not raise."""
    js_path = Path(__file__).parent / "jquery.layout.js"
    with open(str(js_path), mode="rb") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    uast = client.parse(filename="", language="javascript", contents=code).uast
    # Undecodable bytes are replaced instead of failing the test.
    text = code.decode("utf-8", "replace")
    nodes, _parents = list(self.extractor._parse_file(text, uast, js_path))
    # Just should not fail
    list(self.extractor._classify_vnodes(nodes, "filepath"))
def __init__(self, source_file, cfg=None):
    """
    Parse `source_file` with bblfsh and build the derived node structures.

    :param source_file: Path of the file sent to the bblfsh server.
    :param cfg: Currently unused; the client address is hard-coded below.
    """
    # NOTE: the client used to come from cfg.ast_client(); it is hard-coded for now.
    tree = bblfsh.BblfshClient('localhost:9432').parse(source_file).uast

    # Empty state that the processing steps below work with.
    self.id = 0
    self.nodes = []
    self.nmap = {}
    self.visited = []
    self.anytree = None

    self.__process__(tree, tree)
    self.__node_mapping__()
    self.anytree = self.__get_any_tree__()
def setUpClass(cls):
    """Load the benchmark UAST and the base/head file archives shared by all tests."""
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("FormatAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    # str() is needed for Python 3.5
    with lzma.open(str(base / "benchmark.uast.xz")) as fin:
        serialized_uast = fin.read()
    cls.uast = bblfsh.Node.FromString(serialized_uast)
    base_archive = str(base / "freecodecamp-base.tar.xz")
    head_archive = str(base / "freecodecamp-head.tar.xz")
    cls.base_files = cls.get_files_from_tar(base_archive)
    cls.head_files = cls.get_files_from_tar(head_archive)
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
    FeatureExtractor._log.level = logging.DEBUG
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")
def test_vnode_positions(self):
    """Smoke test: parsing and classifying the annotated jquery.layout.js must not raise."""
    js_path = Path(__file__).parent / "jquery.layout.js"
    with open(str(js_path), mode="rb") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    uast = client.parse(filename="", language="javascript", contents=code).uast
    raw_file = File(content=code, uast=uast, language="javascript", path="test.js")
    file = BytesToUnicodeConverter.convert_file(raw_file)
    annotated_data = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_data)
    # Just should not fail
    self.extractor._classify_vnodes(annotated_data)
def main() -> None:
    """Parse the file given on the command line and pretty-print the check results."""
    args = parse_arguments()

    # Imported here so that argument parsing does not require these modules.
    import bblfsh
    from pprint import pprint

    endpoint = args.ip + ":" + args.port
    parse_result = bblfsh.BblfshClient(endpoint).parse(args.file)
    if parse_result.status != 0:
        # Parse errors are reported, but the checks still run on the returned UAST.
        print(parse_result.errors)
    results = run_checks(args.checks, args.language, parse_result.uast)
    pprint(results)
def test_parse_file_comment_after_regexp(self):
    """Check that the raw tokens reproduce code with a comment before a regexp literal."""
    code = b"x = // comment\n/<regexp>/;"
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    uast = client.parse(filename="", language="javascript", contents=code).uast
    raw_file = File(uast=uast, content=code, language="javascript", path="")
    file = BytesToUnicodeConverter.convert_file(raw_file)
    annotated_file = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_file)
    # Concatenating the spans of all raw tokens must give back the decoded source.
    raw_tokens = annotated_file.iter_by_type(RawTokenAnnotation)
    rebuilt = "".join(annotated_file[token.span] for token in raw_tokens)
    self.assertEqual(rebuilt, code.decode())
def run_default_fixture(path: str, check_fnc: CheckFnc, conn_str: str = "0.0.0.0:9432",
                        silent: bool = False) -> Checks:
    """
    Run a check function against the fixture that matches a check module path.

    The language is the name of the directory containing the file that defines
    `check_fnc`. The fixture name is the base name of `path` with its last three
    characters replaced by ".java".

    :param path: Path whose base name selects the fixture.
    :param check_fnc: Check function called with the parsed UAST of the fixture.
    :param conn_str: Address of the bblfsh server.
    :param silent: If true, the result is not pretty-printed.
    :return: The value returned by `check_fnc`.
    """
    from pprint import pprint

    client = bblfsh.BblfshClient(conn_str)
    check_file = os.path.normpath(os.path.abspath(inspect.getfile(check_fnc)))
    language = check_file.split(os.sep)[-2]
    fixture_name = os.path.basename(path)[:-3] + ".java"
    fixture_path = os.path.join(THIS_PATH, "fixtures", language, fixture_name)
    uast = client.parse(fixture_path).uast
    res = check_fnc(uast)
    if not silent:
        pprint(res)
    return res
def main(data, lang, output):
    """
    Parse every file with the given extension and store its serialized UAST.

    Each UAST that has at least one child is written to
    "<output>/<file path>_uast.bin". Files whose UAST has no children are skipped.

    :param data: Location handed to `recursive_glob` to find the input files.
    :param lang: File extension (without the dot) of the files to process.
    :param output: Directory under which the serialized UASTs are written.
    """
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    for path in recursive_glob(data, '*.%s' % lang):
        print("Processing file: {}".format(path))
        uast = client.parse(path).uast
        if len(uast.children) == 0:
            continue
        out_file = "%s/%s_uast.bin" % (output, path)
        print("Writing file %s" % out_file)
        # exist_ok avoids the race between checking for the directory and creating it.
        os.makedirs(os.path.dirname(out_file), exist_ok=True)
        with open(out_file, 'wb') as fout:
            fout.write(uast.SerializeToString())
def test_empty_strings(self):
    """Check that an empty and a non-empty string literal give the same class sequences."""
    config = deepcopy(self.final_config["feature_extractor"])
    config["cutoff_label_support"] = 0
    client = bblfsh.BblfshClient("0.0.0.0:9432")

    def get_class_sequences_from_code(code: str) -> Sequence[Tuple[int, ...]]:
        """Parse `code`, extract its features and return the labels of the labeled nodes."""
        response = client.parse(filename="", language="javascript", contents=code.encode())
        extractor = FeatureExtractor(language="javascript", **config)
        source = UnicodeFile(content=code, uast=response.uast, path="",
                             language="javascript")
        result = extractor.extract_features([source])
        if result is None:
            self.fail("Could not parse test code.")
        _, _, (vnodes_y, _, _, _) = result
        return [vnode.y for vnode in vnodes_y]

    empty_literal = get_class_sequences_from_code("var a = '';")
    filled_literal = get_class_sequences_from_code("var a = 'a';")
    self.assertEqual(empty_literal, filled_literal)
def main() -> None:
    """
    Parse the file given on the command line and print the check results as JSON.

    If the parse status is non-zero, the parse errors are printed as a JSON array
    first; the checks still run on the returned UAST afterwards.
    """
    args = parse_arguments()

    # Imported here so that argument parsing does not require bblfsh.
    import bblfsh

    client = bblfsh.BblfshClient(args.ip + ":" + args.port)
    parse_result = client.parse(args.file)
    if parse_result.status != 0:
        # `errors` is presumably a protobuf repeated field, which json cannot
        # serialize directly; copy it into a plain list first.
        print(json.dumps(list(parse_result.errors)))
    results = run_checks(args.checks, args.language, parse_result.uast,
                         json_result=False)
    print(json.dumps(results))
def setUpClass(cls):
    """Extract the features of jquery.layout.js once and share them between the tests."""
    train_config = FormatAnalyzer._load_config(get_config())["train"]
    extractor_config = train_config["javascript"]["feature_extractor"]
    cls.extractor = FeatureExtractor(language="javascript", **extractor_config)

    js_path = Path(__file__).parent / "jquery.layout.js"
    with open(str(js_path), mode="rb") as fin:
        cls.code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    cls.uast = client.parse(filename="", language="javascript", contents=cls.code).uast

    source = FakeFile(path="test.py", content=cls.code, uast=cls.uast,
                      language="JavaScript")
    feature_extractor_output = cls.extractor.extract_features([source])
    # Only the labels and the two node lists are kept on the class.
    X, cls.y, (cls.vnodes_y, cls.vnodes, vnode_parents, node_parents) = \
        feature_extractor_output
def test_positions(self):
    """Check that consecutive parsed nodes never go backwards in start line or start offset."""
    js_path = Path(__file__).parent / "browser-policy-content.js"
    with open(str(js_path), mode="rt") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    response = client.parse(filename="", language="javascript", contents=code.encode())
    nodes, _parents = self.extractor._parse_file(code, response.uast, js_path)

    # Compare every node with its immediate successor.
    neighbours = zip(nodes, nodes[1:])
    for index, (previous, current) in enumerate(neighbours):
        pair = (index, index + 1)
        self.assertLessEqual(
            previous.start.line, current.start.line,
            "Start line position decrease for %d, %d nodes" % pair)
        self.assertLessEqual(
            previous.start.offset, current.start.offset,
            "Start offset position decrease for %d, %d nodes" % pair)
def setUp(self):
    """Collect the check functions and fixture paths of every language and open a client."""
    self.languages = get_languages()
    self.check_funcs: Dict[str, Dict[str, Any]] = {}
    self.fixtures: Dict[str, List[str]] = {}
    fixtures_dir = get_fixtures_dir()
    for lang in self.languages:
        # Map each check module path to that module's `check` function.
        checks = {}
        for module, path in _get_check_modules(lang):
            checks[path] = module.check
        self.check_funcs[lang] = checks

        # Every entry of the language's fixture directory is a fixture.
        fixture_names = os.listdir(os.path.join(fixtures_dir, lang))
        self.fixtures[lang] = [
            os.path.join(fixtures_dir, lang, name) for name in fixture_names
        ]
    self.client = bblfsh.BblfshClient("0.0.0.0:9432")
def test_parse_file_exact_match(self):
    """Check that both the old-format nodes and the raw tokens reproduce the source text."""
    archive_path = str(Path(__file__).parent / "for_parse_test.js.xz")
    with lzma.open(archive_path, mode="rb") as fin:
        code = fin.read()
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    uast = client.parse(filename="", language="javascript", contents=code).uast
    raw_file = File(uast=uast, content=code, language="javascript", path="")
    file = BytesToUnicodeConverter.convert_file(raw_file)
    annotated_file = AnnotationManager.from_file(file)
    self.extractor._parse_file(annotated_file)

    # First view: the values of the nodes in the old parse format.
    nodes, _ = file_to_old_parse_file_format(annotated_file)
    from_nodes = "".join(node.value for node in nodes)
    self.assertEqual(from_nodes, code.decode())

    # Second view: the spans of the raw token annotations.
    raw_tokens = annotated_file.iter_by_type(RawTokenAnnotation)
    from_tokens = "".join(annotated_file[token.span] for token in raw_tokens)
    self.assertEqual(from_tokens, code.decode())
def __init__(self, n_trials: int = 4, max_rep: int = 2, max_ins: int = 2,
             max_mutants: int = 10, bblfsh_address: str = "0.0.0.0:9432"):
    """
    Initialize mutator.

    :param n_trials: number of trials for mutation.
    :param max_rep: max number of repeats.
    :param max_ins: max number of insertions.
    :param max_mutants: max number of mutants to collect.
    :param bblfsh_address: address of the bblfsh server the client connects to.
    """
    # Mutation limits, stored as given.
    self.n_trials, self.max_rep = n_trials, max_rep
    self.max_ins, self.max_mutants = max_ins, max_mutants
    # The address is kept next to the client that was created from it.
    self.bblfsh_address = bblfsh_address
    self.client = bblfsh.BblfshClient(endpoint=bblfsh_address)
def setUpClass(cls):
    """Parse the base and head fixtures; the head gets a line with a misspelled identifier."""
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("IdTyposAnalyzer").setLevel(logging.DEBUG)
    base = Path(__file__).parent
    cls.bblfsh_client = bblfsh.BblfshClient("0.0.0.0:9432")

    def load_fixture(archive_name, parse_name, prefix=b""):
        # str() is needed for Python 3.5
        with lzma.open(str(base / archive_name)) as fin:
            contents = prefix + fin.read()
        uast = cls.bblfsh_client.parse(parse_name, contents=contents).uast
        # Both revisions are stored under the same path.
        return [
            File(path="test_file.js", content=contents, uast=uast, language="Javascript"),
        ]

    cls.base_files = load_fixture("test_base_file.js.xz", "test_base_file.js")
    # The head revision starts with an extra declaration of the identifier `print_tipe`.
    cls.head_files = load_fixture("test_head_file.js.xz", "test_head_file.js",
                                  prefix=b"var print_tipe = 0;\n")
    cls.ptr = ReferencePointer("someurl", "someref", "somecommit")
    analyze_config = {
        "filepath": cls.base_files[0].path,
        "wrong_id": "print_tipe",
        "line": 0,
    }
    cls.config = {
        "model": MODEL_PATH,
        "confidence_threshold": 0.0,
        "n_candidates": 3,
        "check_all_identifiers": True,
        "analyze": analyze_config,
    }
def main_per_repository(self, repo_name: str) -> None:
    """
    Train on one repository and save its statistics, unless they already exist.

    :param repo_name: Name of the repository; also the directory name of its stats.
    """
    stats_path = DEFAULT_STATS_DIR / repo_name / 'stats.db'
    stats_path.parent.mkdir(parents=True, exist_ok=True)
    if stats_path.exists():
        logger.info('Stats already exist for %s' % repo_name)
        return

    stats = Stats()
    client = bblfsh.BblfshClient(self._bblfshd)
    repo = get_repository(repo_name)
    filters = [
        VendorFilter(),
        LanguageFilter(['Go']),
        MaxSizeFilter(max_size=10 * 1024),
    ]
    trainer = GitRepositoryTrainer(repo=repo, repo_name=repo_name, client=client,
                                   stats=stats, filters=filters)
    trainer.train_all()

    logger.info('saving stats: %s' % stats_path)
    stats.save(filename=stats_path)
    logger.info('saved stats: %s' % stats_path)
def analyze_file(path: str): nonlocal errors if errors: return try: try: client = clients.client except AttributeError: client = bblfsh.BblfshClient(args.bblfsh) clients.client = client response = client.parse(path) nonlocal language if not language: language = response.language elif language != response.language: log.warning("dropped %s - language mismatch %s != %s", path, language, response.language) return analyze_uast(path, response.uast, roles, reserved) progress.update(1) except: # noqa: E722 log.exception("Parsing %s", path) errors = True
def main(data, output):
    """
    Parse every Python file found under `data` and process the non-empty UASTs.

    :param data: Location handed to `recursive_glob` to find the "*.py" files.
    :param output: Currently unused; the step that saved results is disabled.
    """
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    uasts = []
    for path in recursive_glob(data, '*.py'):
        print("Processing file: {}".format(path))
        uast = client.parse(path).uast
        # Keep only UASTs that have at least one child.
        if len(uast.children) == 0:
            continue
        uasts.append(uast)
    # NOTE: bblfsh.filter(uast, <XPath query>) can be used to select nodes of a UAST.
    # The counts are not used yet: printing statistics, clustering the nodes and
    # saving the roles to `output` are disabled for now.
    rules_count, nodes_count = process_uasts(uasts)
import os
import sys

import bblfsh
from bblfsh import filter as filter_uast

from java_sonar_rule_RSPEC_1214 import rule_chk as rule_chk
from java_sonar_rule_RSPEC_1764 import rule_chk as rule_chk_1764
from java_while_rule import rule_chk as rule_chk_while


if __name__ == "__main__":
    # Parse the file named by the first command line argument with the babelfish
    # client and print what the `while` rule finds in its UAST.
    source_path = sys.argv[1]
    client = bblfsh.BblfshClient("0.0.0.0:9432")
    response = client.parse(source_path)
    if response.status != 0:
        raise Exception('Some error happened: ' + str(response.errors))
    findings = rule_chk_while(response.uast)
    print(findings)
def __init__(self, path):
    """
    Store `path` and create a bblfsh client for `self.server_endpoint`.

    :param path: Value kept as `self.path`.
    """
    self.path = path
    endpoint = self.server_endpoint
    self.client = bblfsh.BblfshClient(endpoint)