def iter_binaries(db: ghcc.BinaryDB, binaries: Dict[str, BinaryInfo]) -> Iterator[BinaryInfo]:
    binary_entries = {entry["sha"]: entry for entry in db.collection.find()}  # getting stuff in batch is much faster
    skipped_count = 0
    migrated_count = 0
    for sha, info in binaries.items():
        entry = binary_entries.get(sha, None)
        if entry is not None:
            if "repo_owner" in entry:
                skipped_count += 1
            else:
                db.collection.update_one({"_id": entry["_id"]}, {"$set": {
                    "repo_owner": info["repo_owner"],
                    "repo_name": info["repo_name"],
                }})
                migrated_count += 1
            continue
        if migrated_count > 0:
            flutes.log(f"Migrated {migrated_count} binary entries", force_console=True)
            migrated_count = 0
        if skipped_count > 0:
            flutes.log(f"Skipped {skipped_count} binaries that have been processed", force_console=True)
            skipped_count = 0
        yield info
def progress_bar_fn(idx: int, bar) -> None:
    total = (idx + 1) * 2
    bar.new(desc=f"Bar {idx}", total=total)
    for i in range(total):
        bar.update(1, postfix={"i": i})
        if i % 5 == 1:
            flutes.log(f"test {i}")
    for i in bar.new(range(total)):
        bar.update(postfix={"i": i})
def main(args):
    random.seed(args.seed)
    if (not args.output_dir.exists()) or args.overwrite:
        args.output_dir.mkdir(exist_ok=True, parents=True)
    else:
        print(f"Directory {args.output_dir} already exists.")
        sys.exit(0)

    tr_indices, vl_indices, ts_indices = make_splits_indices(100, args.split_ratio)
    with (args.output_dir / "file_map.txt").open("w") as f:
        print("train", file=f)
        for i in tr_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)
        print("valid", file=f)
        for i in vl_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)
        print("test", file=f)
        for i in ts_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)

    # in/out file pairs
    files = ([(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"train_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(tr_indices)] + [(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"valid_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(vl_indices)] + [(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"test_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(ts_indices)])
    # files = sorted(
    #     list((args.dump_dir).glob("*.pkl")), key=lambda x: int(x.with_suffix("").name)
    # )
    total = {}
    with flutes.safe_pool(processes=args.njobs, state_class=Processor) as pool:
        for idx, _ in enumerate(pool.imap_unordered(Processor.process_pkl, files, chunksize=1)):
            flutes.log(f"Processed {idx + 1} files")
        states = pool.get_states()
        for state in states:
            total.update(state.results)
    print(f"Train: {sum(v for k, v in total.items() if k.startswith('train'))}")
    print(f"Valid: {sum(v for k, v in total.items() if k.startswith('valid'))}")
    print(f"Test: {sum(v for k, v in total.items() if k.startswith('test'))}")
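# `make_splits_indices` is defined elsewhere in the project. The sketch below is a plausible
# (hypothetical) implementation, inferred only from how it is called above: it partitions the
# indices 0..n-1 into train/valid/test lists according to `split_ratio`, e.g. (0.8, 0.1, 0.1).
def make_splits_indices_sketch(n, split_ratio):
    indices = list(range(n))
    random.shuffle(indices)  # assumes `random.seed(...)` was already called for reproducibility
    n_train = int(n * split_ratio[0])
    n_valid = int(n * split_ratio[1])
    return (indices[:n_train],
            indices[n_train:n_train + n_valid],
            indices[n_train + n_valid:])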
def main(args) -> None:
    files = [(args.output_dir, args.input_dir / f"{i}.pkl") for i in range(0, 10000, 100)]
    total_map = {}
    with flutes.work_in_progress("Parallel"):
        with flutes.safe_pool(processes=args.njobs, state_class=Worker) as pool_stateful:
            for idx, _ in enumerate(pool_stateful.imap_unordered(Worker.merge, files, chunksize=1)):
                flutes.log(f"Processed {idx + 1} files")
def test_ProgressBarManager() -> None:
    for verbose in [False, True]:
        for proc in [0, 2]:
            # Test multiprocessing in `proc = 2`
            # Test coverage in `proc = 0`
            manager = flutes.ProgressBarManager(verbose=verbose)
            with flutes.safe_pool(proc, closing=[manager]) as pool:
                fn = functools.partial(progress_bar_fn, bar=manager.proxy)
                pool.map(fn, range(10))
                fn = functools.partial(file_progress_bar_fn, bar=manager.proxy)
                pool.map(fn, range(4))
            flutes.log(f"This should still show up: verbose={verbose}, proc={proc}", force_console=True)
def test_log() -> None:
    with tempfile.NamedTemporaryFile("w") as f_tmp:
        flutes.set_log_file(f_tmp.name)
        flutes.set_log_file(f_tmp.name)
        flutes.set_logging_level("warning")
        flutes.log("info output", "info")
        flutes.log("warning output", "warning")
        flutes.log("error output", "error")
        flutes.log("success output", "success")
def iter_repos(db: ghcc.RepoDB, repo_list_path: str, max_count: Optional[int] = None) -> Iterator[RepoInfo]:
    db_entries = {
        (entry["repo_owner"], entry["repo_name"]): entry
        for entry in db.collection.find()  # getting stuff in batch is much faster
    }
    flutes.log(f"{len(db_entries)} entries loaded from DB")
    index = 0
    with open(repo_list_path, "r") as repo_file:
        for line in repo_file:
            if not line:
                continue
            url = line.strip().rstrip("/")
            if url.endswith(".git"):
                url = url[:-len(".git")]
            repo_owner, repo_name = url.split("/")[-2:]
            # db_result = db.get(repo_owner, repo_name)
            db_result = db_entries.get((repo_owner, repo_name), None)
            yield RepoInfo(index, repo_owner, repo_name, db_result)
            index += 1
            if max_count is not None and index >= max_count:
                break
def main(args) -> None:
    Proc = LegacyFilter if args.legacy else Filter
    if args.target == "paper":
        processing = Proc.filter_ids_complete  # gathers pid, in/out cite pids
    elif args.target == "citation":
        processing = Proc.filter_ids_text  # gathers pids

    if args.valid_citations is not None and args.valid_citations.exists():
        with args.valid_citations.open("rb") as f:
            d = pickle.load(f)
            dict_valid_citations = {k: True for _, pids in d.items() for k in pids}
            del d
    else:
        dict_valid_citations = {}

    files = [
        (f, dict_valid_citations, args.min_cite, args.max_cite, args.seed)
        for f in list(args.input_dir.glob("*"))
    ]
    with flutes.work_in_progress("Parallel"):
        total_results = defaultdict(list)
        with flutes.safe_pool(processes=args.njobs, state_class=Proc) as pool_stateful:
            for idx, _ in enumerate(pool_stateful.imap_unordered(processing, files, chunksize=10)):
                flutes.log(f"Processed {idx + 1} files")
            with flutes.work_in_progress("Get states"):
                states = pool_stateful.get_states()
                for state in states:
                    total_results.update(state.results)

    with args.output_file.open("wb") as f:
        # Dict[batch_num, List[obj]]
        pickle.dump(total_results, f)
def load_bg_info_legacy(target_pid, obj, content="cite_context"):
    if obj is None:
        flutes.log(f"{target_pid} not found.")
        return None
    if content == "abstract":
        if obj["metadata"]["abstract"] is not None:
            return obj["metadata"]["abstract"]
        elif len(obj["grobid_parse"]["abstract"]) > 0:
            return obj["grobid_parse"]["abstract"][0]["text"]
        return obj["metadata"]["abstract"]
    elif content == "cite_context":
        parse_data = obj["grobid_parse"]
        if parse_data is None:
            flutes.log("parse not found.")
            return None
        try:
            # find the right BIBREFX
            bibdict = {
                v["links"]: k
                for k, v in parse_data["bib_entries"].items()
                if v["links"] is not None
            }
            bibref = bibdict[target_pid]
        except KeyError:  # data bug
            flutes.log("links not found.")
            return None

        match = []
        for block in parse_data["body_text"]:
            sec = block["section"]
            spans = [span for span in block["cite_spans"] if span["ref_id"] == bibref]
            if len(spans) == 0:
                continue
            # look for the right sent idx where the span belongs to.
            sents = sent_tokenize(block["text"].replace("al.", "al@"))
            # cumulative sum + stripped (hopefully just one) spaces for each sent.
            sent_start_pos = np.cumsum([len(s) for s in sents]) + np.arange(len(sents), dtype=int)
            for sp in spans:
                # TODO: try to pick up section information here once merged!
                sent_idx = bisect.bisect_left(sent_start_pos, sp["start"])
                if sent_idx == len(sents):  # de-tokenize error
                    sent_idx = len(sents) - 1
                match.append((sec, sents[sent_idx].replace("al@", "al.")))
        return match
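# Illustration (not part of the original module): the citation-locating logic above maps a
# character offset to a sentence by comparing it against cumulative sentence end positions and
# bisecting. A self-contained sketch of the same technique with made-up text:
def _locate_citing_sentence_example() -> None:
    text = "First sentence. Second sentence mentions [1]. Third sentence."
    sents = ["First sentence.", "Second sentence mentions [1].", "Third sentence."]  # what sent_tokenize would return
    # End offset of each sentence in `text`, assuming exactly one stripped space between sentences.
    sent_end_pos = np.cumsum([len(s) for s in sents]) + np.arange(len(sents), dtype=int)
    char_offset = text.index("[1]")           # start offset of the citation span
    sent_idx = bisect.bisect_left(sent_end_pos, char_offset)
    sent_idx = min(sent_idx, len(sents) - 1)  # guard against off-by-one from de-tokenization
    print(sents[sent_idx])                    # -> "Second sentence mentions [1]."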
def compare_logs(info_old: Dict[str, Dict[str, int]],
                 info_new: Dict[str, Dict[str, int]]) -> Dict[str, DiffDict]:
    for repo_name in info_new:
        if repo_name not in info_old:
            flutes.log(f"{repo_name} missing in OLD log", "error")
    repo_diff: Dict[str, DiffDict] = defaultdict(dict)
    for repo_name in info_old:
        if repo_name not in info_new:
            flutes.log(f"{repo_name} missing in NEW log", "error")
            continue
        old_repo_info = info_old[repo_name]
        new_repo_info = info_new[repo_name]
        difference = []
        for tag in TAGS:
            old_val = old_repo_info[tag]
            new_val = new_repo_info[tag]
            if old_val != new_val:
                difference.append(f"{tag} {old_val}->{new_val}")
                repo_diff[repo_name][tag] = (old_val, new_val)
        if len(difference) > 0:
            flutes.log(f"{repo_name}: {', '.join(difference)}")
    return repo_diff
def main() -> None:
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.log(f"Running with {args.n_procs} worker processes", "warning")

    # Check for/create output directories
    make_directory(args.output_dir)

    # Use RAM-backed memory for tmp if available
    if os.path.exists('/dev/shm'):
        tempfile.tempdir = '/dev/shm'

    flutes.set_log_file(args.log_file)
    write_pseudo_registry()

    # Obtain a list of all binaries
    binaries = get_binary_mapping(args.binary_mapping_cache_file)
    flutes.log(f"{len(binaries)} binaries to process.")
    file_count = 0
    db = ghcc.BinaryDB()

    with flutes.safe_pool(args.n_procs, closing=[db]) as pool:
        decompile_fn: Callable[[BinaryInfo], DecompilationResult] = functools.partial(
            decompile, output_dir=args.output_dir, binary_dir=args.binaries_dir, timeout=args.timeout)
        for result in pool.imap_unordered(decompile_fn, iter_binaries(db, binaries)):
            file_count += 1
            if result is not None:
                db.add_binary(result.info["repo_owner"], result.info["repo_name"],
                              result.hash, result.status is DecompilationStatus.Success)
            if file_count % 100 == 0:
                flutes.log(f"Processed {file_count} binaries", force_console=True)
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    sys.setrecursionlimit(10000)
    args = Arguments()
    if args.pdb:
        flutes.register_ipython_excepthook()
    if args.n_procs == 0:
        globals()['match_functions'] = match_functions.__wrapped__

    if not args.verbose:
        flutes.set_logging_level("quiet", console=True, file=False)
    flutes.set_log_file(args.log_file)
    flutes.log("Running with arguments:\n" + args.to_string(), force_console=True)

    if os.path.exists(args.temp_dir):
        flutes.log(f"Removing contents of temporary folder '{args.temp_dir}'...", "warning", force_console=True)
        ghcc.utils.run_docker_command(["rm", "-rf", "/usr/src/*"], user=0,
                                      directory_mapping={args.temp_dir: "/usr/src"})

    db = ghcc.MatchFuncDB()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    manager = flutes.ProgressBarManager(
        verbose=args.show_progress,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]")
    with flutes.safe_pool(args.n_procs, closing=[db, manager]) as pool:
        iterator, stats = iter_repos(
            db, args.max_repos, skip_to=args.skip_to,
            cache_path=args.repo_binary_info_cache_path,
            force_reprocess=args.force_reprocess)
        match_fn: Callable[[RepoInfo], Result] = functools.partial(
            match_functions,
            archive_folder=args.archive_dir, temp_folder=args.temp_dir,
            decompile_folder=args.decompile_dir,
            use_fake_libc_headers=args.use_fake_libc_headers,
            preprocess_timeout=args.preprocess_timeout,
            progress_bar=manager.proxy)

        repo_count = stats.repo_count
        func_count = stats.func_count
        func_without_ast_count = stats.func_without_ast_count
        for result in pool.imap_unordered(match_fn, iterator):
            if result is None:
                # Exception occurred.
                if args.exit_on_exception:
                    flutes.log("Exception occurred, exiting because 'exit_on_exception' is True", "warning")
                    break
                continue

            # Write the matched functions to disk.
            result: Result  # type: ignore
            repo_dir = output_dir / result.repo_owner / result.repo_name
            repo_dir.mkdir(parents=True, exist_ok=True)
            with (repo_dir / "matched_funcs.jsonl").open("w") as f:
                for matched_func in result.matched_functions:
                    f.write(json.dumps(matched_func._asdict(), separators=(',', ':')) + "\n")
            for sha, code in result.preprocessed_original_code.items():
                with (repo_dir / f"{sha}.c").open("w") as f:
                    pos = code.rfind(ghcc.parse.FAKE_LIBC_END_LINE)
                    if pos != -1:
                        code = code[(pos + len(ghcc.parse.FAKE_LIBC_END_LINE)):]
                    f.write(code)
            if args.write_db:
                db.add_repo(result.repo_owner, result.repo_name,
                            files_found=result.files_found, funcs_found=result.functions_found,
                            funcs_matched=len(result.matched_functions),
                            funcs_matched_without_ast=result.funcs_without_asts)

            repo_count += 1
            func_count += len(result.matched_functions)
            func_without_ast_count += result.funcs_without_asts
            if repo_count % 100 == 0:
                flutes.log(f"Processed {repo_count} repositories, {func_count} functions matched "
                           f"({func_without_ast_count} w/o AST)", force_console=True)
def main():
    # flutes.register_ipython_excepthook()
    sys.setrecursionlimit(50000)
    args = Arguments()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    flutes.log("Dataset creation start")

    db = ghcc.MatchFuncDB()
    original_code_set: Set[str] = set()
    n_duplicate = 0
    n_examples = 0
    manager = mp.Manager()
    example_queue: 'mp.Queue[QueueElem]' = manager.Queue(args.queue_size)
    with flutes.safe_pool(args.n_procs, closing=[db]) as pool:
        repos = [RepoInfo(entry['repo_owner'], entry['repo_name'])
                 for entry in db.collection.find() if entry['funcs_matched'] > 0]
        if args.max_repos is not None:
            repos = repos[:args.max_repos]
        process_fn: Callable[[RepoInfo], None] = functools.partial(
            process, data_dir=args.input_dir, queue=example_queue)
        pool.map_async(process_fn, repos, error_callback=flutes.log_exception)
        end_signals = 0
        progress = tqdm.tqdm(total=len(repos))

        file_cnt = 0
        text_data = []

        def save_file():
            nonlocal file_cnt, text_data
            # Save text & AST separately
            with (output_dir / f"data_{file_cnt:03d}.pkl").open("wb") as f:
                pickle.dump(text_data, f, protocol=PICKLE_PROTOCOL)
            progress.write(f"Saved part {file_cnt:03d}")
            text_data = []
            file_cnt += 1

        while end_signals < len(repos):
            elem = example_queue.get()
            if elem == END_SIGNATURE:
                progress.update(1)
                end_signals += 1
                continue

            ex = pickle.loads(elem)
            original_code = ex[1]
            if original_code not in original_code_set:
                original_code_set.add(original_code)
                text_data.append(ex)  # (decompiled, orig, var_names, repo, sha)
                n_examples += 1
            else:
                n_duplicate += 1
            if (n_examples + n_duplicate) % 100 == 0:
                progress.set_postfix({"duplicate": n_duplicate, "examples": n_examples}, refresh=False)
                progress.refresh()
            if len(text_data) >= args.block_size:
                save_file()

        if len(text_data) > 0:
            save_file()
def match_functions(repo_info: RepoInfo,
                    archive_folder: str,
                    temp_folder: str,
                    decompile_folder: str,
                    use_fake_libc_headers: bool = True,
                    preprocess_timeout: Optional[int] = None,
                    *,
                    progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None) -> Result:
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long as the compiler exits after running the preprocessor, and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    total_files = sum(len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) / f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files, desc=process_name + f" [{repo_full_name}]")
    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract archive
        flutes.run_command(["tar", "xzf", str(archive_path)], cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone repo
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner, repo_info.repo_name,
                         clone_folder=str(repo_dir), folder_name="src")
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(f"Failed to clone {repo_full_name}: error type {ret.error_type}", "error")
            # Return a dummy result so this repo is ignored in the future.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0, 0, 0)

    # Write makefile info to pickle
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = "-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir), str(repo_src_path),
        compile_timeout=preprocess_timeout, gcc_override_flags=gcc_flags,
        use_makefile_info_pkl=True, directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code, filename=os.path.join(repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(f"{repo_full_name}: Parser error when processing file "
                           f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            original_tokens = ghcc.parse.convert_to_tokens(code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [line for line in f if line]  # don't decode, as we only need the function name
            decompiled_funcs: Dict[str, str] = {}  # (func_name) -> decompiled_code
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {}  # (func_name) -> (var_id) -> (decomp_name, orig_name)
            for line_num, j in enumerate(decompiled_json):
                # Find function name from JSON line without parsing.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                               f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't support Unicode.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(f"{repo_full_name}: Could not find valid identifier prefix for "
                               f"{func_name} in {code_path} ({sha})", "error")
                    continue
                variables: Dict[str, Tuple[str, str]] = {}  # (var_id) -> (decompiled_name, original_name)
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        assert variables[var_id] == (decompiled_name, original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in decompiled code to make it parsable:
                # - Replace `@@VAR` with special identifiers (literally any identifier that doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub("", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hex-Rays would chomp off one leading underscore from function names in their
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`). Here we
                    # heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] + r"(?![a-zA-Z0-9_])",
                        func_name, decompiled_code)
                    # Note that this doesn't fix references to the function in other functions. But really, why would
                    # someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (f"{repo_full_name}: GCC return value nonzero for decompiled code of "
                       f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(f"{repo_full_name}: Could not parse decompiled code for "
                           f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True

                # We don't have ASTs for decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        decompiled_func_tokens = lexer.lex("\n".join(code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path, binary_hash=sha, func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json, decompiled_ast_json=None)
                        matched_functions.append(matched_func)
            else:
                # We've successfully parsed decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there are other Hex-Rays-renamed functions that we didn't fix; just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path, binary_hash=sha, func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens, decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json, decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Cleanup the folders; if errors occurred, keep the preprocessed code.
    status = ("success" if not has_error and len(matched_functions) > 0 else
              ("warning" if not has_error or len(matched_functions) > 0 else "error"))
    shutil.rmtree(repo_dir)

    end_time = time.time()
    funcs_without_asts = sum(matched_func.decompiled_ast_json is None for matched_func in matched_functions)
    flutes.log(f"[{end_time - start_time:6.2f}s] "
               f"{repo_full_name}: "
               f"Files found: {files_found}/{total_files}, "
               f"functions matched: {len(matched_functions)}/{functions_found} "
               f"({funcs_without_asts} w/o ASTs)",
               status, force_console=True)
    return Result(repo_owner=repo_info.repo_owner, repo_name=repo_info.repo_name,
                  matched_functions=matched_functions,
                  preprocessed_original_code=preprocessed_original_code,
                  files_found=files_found, functions_found=functions_found,
                  funcs_without_asts=funcs_without_asts)
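# Illustration (not part of the original module): the "Directions" comment in `match_functions`
# describes re-running compilation with "-E" so only the preprocessor runs, together with
# pycparser's fake libc headers. A minimal standalone sketch of such an invocation without
# Docker; the paths passed in are placeholders.
def _preprocess_only_sketch(source_file: str, fake_libc_include: str) -> str:
    import subprocess
    # Run only the preprocessor; the fake headers keep the output parsable by pycparser.
    result = subprocess.run(
        ["gcc", "-E", "-nostdlib", f"-I{fake_libc_include}", source_file],
        capture_output=True, text=True, check=True, timeout=600)
    return result.stdout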
def clone_and_compile(repo_info: RepoInfo,
                      clone_folder: str,
                      binary_folder: str,
                      archive_folder: str,
                      recursive_clone: bool = True,
                      clone_timeout: Optional[float] = None,
                      compile_timeout: Optional[float] = None,
                      force_reclone: bool = False,
                      force_recompile: bool = False,
                      docker_batch_compile: bool = True,
                      max_archive_size: Optional[int] = None,
                      compression_type: str = "gzip",
                      record_libraries: bool = False,
                      record_metainfo: bool = False,
                      gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder will be
        ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``. This strange notation is
        used in order to have a flat directory hierarchy, so we're not left with a bunch of empty folders for
        repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder will
        be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file will
        be ``archive_folder/repo_owner/repo_name.tar.xz``, e.g., ``archive_folder/torvalds/linux.tar.xz``.

    :param recursive_clone: If ``True``, uses ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or `None` (default) for unlimited time.
    :param compile_timeout: Timeout for compilation, or `None` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when there
        are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given value
        (in bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.

    :return: An entry to insert into the DB, or `None` if no operations are required.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed
    if (repo_entry is not None and
            (repo_entry["clone_successful"] and not force_reclone) and
            (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
        try:
            flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path], timeout=clone_timeout, cwd=clone_folder)
            flutes.log(f"{repo_full_name} extracted from archive", "success")
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            flutes.log(f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'", "error")
            shutil.rmtree(repo_path)
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)
    elif (repo_entry is None or  # not processed
          force_reclone or
          (repo_entry["clone_successful"] and  # not compiled
           (not repo_entry["compiled"] or force_recompile) and
           not os.path.exists(repo_path))):
        clone_result = ghcc.clone(
            repo_info.repo_owner, repo_info.repo_name, clone_folder=clone_folder,
            folder_name=repo_folder_name, timeout=clone_timeout,
            skip_if_exists=False, recursive=recursive_clone)
        clone_success = clone_result.success
        if not clone_result.success:
            if clone_result.error_type is CloneErrorType.FolderExists:
                flutes.log(f"{repo_full_name} skipped because folder exists", "warning")
            elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent:
                flutes.log(f"Failed to clone {repo_full_name} because repository is private or nonexistent",
                           "warning")
            else:
                if clone_result.error_type is CloneErrorType.Unknown:
                    msg = f"Failed to clone {repo_full_name} with unknown error"
                else:  # CloneErrorType.Timeout
                    msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}"
                if clone_result.captured_output is not None:
                    msg += f". Captured output: '{clone_result.captured_output!r}'"
                flutes.log(msg, "error")
                if clone_result.error_type is CloneErrorType.Unknown:
                    return PipelineResult(repo_info)  # return dummy info

            return PipelineResult(repo_info, clone_success=clone_success)

        elif clone_result.error_type is CloneErrorType.SubmodulesFailed:
            msg = f"Submodules in {repo_full_name} ignored due to error"
            if clone_result.captured_output is not None:
                msg += f". Captured output: '{clone_result.captured_output!r}'"
            flutes.log(msg, "warning")

        repo_size = flutes.get_folder_size(repo_path)
        flutes.log(f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, "
                   f"{flutes.readable_size(repo_size)})", "success")
    else:
        if not repo_entry["clone_successful"]:
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)

    makefiles = None
    libraries = None
    meta_info: Optional[PipelineMetaInfo] = None
    if not repo_entry or not repo_entry["compiled"] or force_recompile:
        # # SPECIAL CHECK: Do not attempt to compile OS kernels!
        # kernel_name = None
        # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"):
        #     kernel_name = "Linux"
        # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"):
        #     kernel_name = "FreeBSD"
        # if kernel_name is not None:
        #     shutil.rmtree(repo_path)
        #     ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. "
        #              f"Repository deleted", "warning")
        #     return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 2: Finding Makefiles.
        makefile_dirs = ghcc.find_makefiles(repo_path)
        if len(makefile_dirs) == 0:
            # Repo has no Makefiles, delete.
            shutil.rmtree(repo_path)
            flutes.log(f"No Makefiles found in {repo_full_name}, repository deleted", "warning")
            return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 3: Compile each Makefile.
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        if not os.path.exists(repo_binary_dir):
            os.makedirs(repo_binary_dir)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir, repo_path, compile_timeout, record_libraries, gcc_override_flags,
                user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler, repo_info=repo_info))
        else:
            makefiles = list(ghcc.compile_and_move(
                repo_binary_dir, repo_path, makefile_dirs, compile_timeout,
                record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) " \
              f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries"
        flutes.log(msg, "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles": len(makefile_dirs),
                "has_gitmodules": os.path.exists(os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake": sum(
                    ghcc.contains_files(directory, ["configure.ac", "configure.in"])
                    for directory in makefile_dirs)
            })

    # Stage 4: Clean and zip repo.
    if max_archive_size is not None and repo_size > max_archive_size:
        shutil.rmtree(repo_path)
        flutes.log(f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
                   f"exceeds limits", "info")
    else:
        # Repository is already cleaned in the compile stage.
        os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
        compress_success = False
        try:
            flutes.run_command(["tar", f"c{tar_type_flag}f", archive_path, repo_folder_name],
                               timeout=clone_timeout, cwd=clone_folder)
            compress_success = True
        except subprocess.TimeoutExpired:
            flutes.log(f"Compression timeout for {repo_full_name}, giving up", "error")
        except subprocess.CalledProcessError as e:
            flutes.log(f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'", "error")
        shutil.rmtree(repo_path)
        if compress_success:
            flutes.log(f"Compressed {repo_full_name}, folder removed", "info")
        elif os.path.exists(archive_path):
            os.remove(archive_path)

    return PipelineResult(repo_info, clone_success=clone_success, repo_size=repo_size,
                          makefiles=makefiles, libraries=libraries, meta_info=meta_info)
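# Illustration (not part of the original module): a minimal, hypothetical call to
# `clone_and_compile`, using a `RepoInfo` constructed the same way `iter_repos` does
# (index, owner, name, db_result). The folder names and timeouts below are placeholders,
# and actually running this requires the ghcc Docker image.
def _example_single_repo_run() -> None:
    example_repo = RepoInfo(0, "torvalds", "linux", None)  # db_result=None: not yet in the DB
    result = clone_and_compile(
        example_repo,
        clone_folder="repos", binary_folder="binaries", archive_folder="archives",
        clone_timeout=600, compile_timeout=900, compression_type="gzip")
    flutes.log(f"Compiled {len(result.makefiles or [])} Makefile(s) for torvalds/linux")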
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    args = Arguments()
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.set_log_file(args.log_file)
    flutes.set_logging_level(args.logging_level, console=True, file=False)
    flutes.log("Running with arguments:\n" + args.to_string(), force_console=True)

    if os.path.exists(args.clone_folder):
        flutes.log(f"Removing contents of clone folder '{args.clone_folder}'...", "warning", force_console=True)
        ghcc.utils.run_docker_command(["rm", "-rf", "/usr/src/*"], user=0,
                                      directory_mapping={args.clone_folder: "/usr/src"})

    flutes.log("Crawling starts...", "warning", force_console=True)
    db = ghcc.RepoDB()
    libraries: Set[str] = set()
    if args.record_libraries is not None and os.path.exists(args.record_libraries):
        with open(args.record_libraries, "r") as f:
            libraries = set(f.read().split())

    def flush_libraries():
        if args.record_libraries is not None:
            with open(args.record_libraries, "w") as f:
                f.write("\n".join(libraries))

    with flutes.safe_pool(args.n_procs, closing=[db, flush_libraries]) as pool:
        iterator = iter_repos(db, args.repo_list_file, args.max_repos)
        pipeline_fn: Callable[[RepoInfo], Optional[PipelineResult]] = functools.partial(
            clone_and_compile,
            clone_folder=args.clone_folder, binary_folder=args.binary_folder, archive_folder=args.archive_folder,
            recursive_clone=args.recursive_clone,
            clone_timeout=args.clone_timeout, compile_timeout=args.compile_timeout,
            force_reclone=args.force_reclone, force_recompile=args.force_recompile,
            docker_batch_compile=args.docker_batch_compile,
            max_archive_size=args.max_archive_size, compression_type=args.compression_type,
            record_libraries=(args.record_libraries is not None), record_metainfo=args.record_metainfo,
            gcc_override_flags=args.gcc_override_flags)
        repo_count = 0
        meta_info = MetaInfo()
        for result in pool.imap_unordered(pipeline_fn, iterator):
            repo_count += 1
            if repo_count % 100 == 0:
                flutes.log(f"Processed {repo_count} repositories", force_console=True)
            if result is None:
                continue
            repo_owner, repo_name = result.repo_info.repo_owner, result.repo_info.repo_name
            if args.write_db:
                if result.clone_success is not None or result.repo_info.db_result is None:
                    # There's probably an inconsistency somewhere if we didn't clone while `db_result` is None.
                    # To prevent more errors, just add it to the DB.
                    repo_size = result.repo_size or -1  # a value of zero is probably also wrong
                    clone_success = result.clone_success if result.clone_success is not None else True
                    db.add_repo(repo_owner, repo_name, clone_success, repo_size=repo_size)
                    flutes.log(f"Added {repo_owner}/{repo_name} to DB")
                if result.makefiles is not None:
                    update_result = db.update_makefile(repo_owner, repo_name, result.makefiles,
                                                       ignore_length_mismatch=True)
                    if not update_result:
                        flutes.log(f"Makefiles of {repo_owner}/{repo_name} not saved to DB due to Unicode encoding "
                                   f"errors", "error")
            if result.libraries is not None:
                libraries.update(result.libraries)
                if repo_count % 10 == 0:  # flush every 10 repos
                    flush_libraries()
            if args.record_metainfo:
                meta_info.add_repo(result)
                if repo_count % 100 == 0:
                    flutes.log(repr(meta_info), force_console=True)

    flutes.log(repr(meta_info), force_console=True)
import argparse
import os
import subprocess

import flutes

import ghcc

parser = argparse.ArgumentParser()
parser.add_argument("folder", type=str)  # the folder to clean up
parser.add_argument("-y", action="store_true", default=False)  # yes
args = parser.parse_args()

try:
    parent = os.path.abspath(os.path.join(args.folder, ".."))
    folder = os.path.split(os.path.abspath(args.folder))[1]
    yes = args.y
    if not yes:
        confirm = input(f"This will delete {parent} / {folder}. Confirm? [y/N] ")
        yes = confirm.lower() in ["y", "yes"]
    if yes:
        ghcc.utils.run_docker_command(["rm", "-rf", f"/usr/src/{folder}"], user=0,
                                      directory_mapping={parent: "/usr/src"})
except subprocess.CalledProcessError as e:
    flutes.log(f"Command failed with retcode {e.returncode}", "error")
    output = e.output.decode("utf-8")
    if len(output) > 200:
        output = output[:200] + "... (omitted)"
    flutes.log("Captured output: " + output)
def decompile(binary_info: BinaryInfo, output_dir: str, binary_dir: str,
              timeout: Optional[int] = None) -> DecompilationResult:
    binary_path = binary_info["path"]
    original_path = binary_info["path_in_repo"]
    binary_hash = os.path.split(binary_path)[1]

    def create_result(status: DecompilationStatus,
                      time: Optional[datetime.timedelta] = None) -> DecompilationResult:
        return DecompilationResult(binary_info, binary_hash, status, time)

    output_path = os.path.join(output_dir, f"{binary_hash}.jsonl")
    if os.path.exists(output_path):
        # Binary already decompiled, but for some reason it wasn't written to the DB.
        return create_result(DecompilationStatus.Success)

    start = datetime.datetime.now()
    env: EnvDict = os.environ.copy()
    env['IDALOG'] = '/dev/stdout'
    env['PREFIX'] = binary_hash
    file_path = os.path.join(binary_dir, binary_path)

    # Create a temporary directory, since the decompiler makes a lot of additional
    # files that we can't clean up from here.
    with tempfile.TemporaryDirectory() as tempdir:
        # Put the output JSONL file here as well to prevent partially-generated files.
        env['OUTPUT_DIR'] = os.path.abspath(tempdir)
        with tempfile.NamedTemporaryFile(dir=tempdir) as collected_vars:
            # First collect variables.
            env['COLLECTED_VARS'] = collected_vars.name
            with tempfile.NamedTemporaryFile(dir=tempdir) as orig:
                flutes.run_command(['cp', file_path, orig.name])
                # Timeout after 30 seconds for the first run.
                try:
                    run_decompiler(orig.name, COLLECT, env=env, timeout=timeout)
                except subprocess.TimeoutExpired:
                    flutes.log(f"[TIMED OUT] {original_path} ({binary_path})", "warning")
                    return create_result(DecompilationStatus.TimedOut)
                try:
                    assert pickle.load(collected_vars)  # non-empty
                except:
                    flutes.log(f"[NO VARS] {original_path} ({binary_path})", "warning")
                    return create_result(DecompilationStatus.NoVariables)

            # Make a new stripped copy and pass it the collected vars.
            with tempfile.NamedTemporaryFile(dir=tempdir) as stripped:
                flutes.run_command(['cp', file_path, stripped.name])
                flutes.run_command(['strip', '--strip-debug', stripped.name])
                # Dump the trees.
                # No timeout here, we know it'll run in a reasonable amount of
                # time and don't want mismatched files.
                run_decompiler(stripped.name, DUMP_TREES, env=env)

        jsonl_path = os.path.join(tempdir, f"{binary_hash}.jsonl")
        flutes.run_command(['cp', jsonl_path, output_path])

    end = datetime.datetime.now()
    duration = end - start
    flutes.log(f"[OK {duration.total_seconds():5.2f}s] {original_path} ({binary_path})", "success")
    return create_result(DecompilationStatus.Success, duration)
def main(args) -> None:
    pids_with_text = set()
    if args.valid_pids and args.valid_pids.exists():
        pids_with_text = set()
        with args.valid_pids.open("rb") as f:
            data = pickle.load(f)
        buf = set()
        if args.mode == "citation":
            for k, d in tqdm(enumerate(data.values()), ncols=88, ascii=True):
                buf = buf.union(set([pid for i in d for pid in i[1] + i[2]]))
                if k % 500 == 0:
                    pids_with_text = pids_with_text.union(buf)
                    buf = set()
        elif args.mode == "paper":
            for k, d in tqdm(data.items(), ncols=88, ascii=True):
                buf = buf.union(set([i[0] for i in d]))
                if k % 500 == 0:
                    pids_with_text = pids_with_text.union(buf)
                    buf = set()
        # remaining ones
        pids_with_text = pids_with_text.union(buf)
    flutes.log(f"# of valid pids to consider: {len(pids_with_text)}")

    if args.legacy:
        # glob takes more time than this?
        files = ((args.input_dir / f"{i}.jsonl.gz", pids_with_text) for i in range(10000))
        Proc = LegacyFilter
    else:
        files = ((args.input_dir / f"pdf_parses_{i}.jsonl.gz", pids_with_text) for i in range(100))
        Proc = Filter

    with flutes.work_in_progress("Parallel"):
        total_map = {}
        with flutes.safe_pool(processes=args.njobs, state_class=Proc) as pool_stateful:
            for idx, _ in enumerate(pool_stateful.imap_unordered(Proc.make_map, files, chunksize=10)):
                flutes.log(f"Processed {idx + 1} files")
            with flutes.work_in_progress("Get states"):
                states = pool_stateful.get_states()
                for state in states:
                    # TODO: Incorporate incite number
                    total_map.update(state.results)
    flutes.log(f"Total map size: {len(total_map)}")

    with args.output.open("w") as f:
        for k, v in total_map.items():
            print(k, v[0], v[1], sep="\t", file=f)
    flutes.log(f"Dumped to {args.output}")