Example #1
    def test_clone(self) -> None:
        # Clone an existing repo.
        result = ghcc.clone("huzecong",
                            "memes",
                            clone_folder=self.tempdir.name)
        self.assertTrue(result.success, msg=result.captured_output)
        self.assertTrue(os.path.exists(
            os.path.join(self.tempdir.name, "huzecong", "memes",
                         "Get Memes.scpt")),
                        msg=result.captured_output)

        # Non-existent repo.
        result = ghcc.clone("huzecong",
                            "non-existent-repo",
                            clone_folder=self.tempdir.name)
        self.assertFalse(result.success, msg=result.captured_output)
        self.assertEqual(ghcc.CloneErrorType.PrivateOrNonexistent,
                         result.error_type,
                         msg=result.captured_output)

        # Timeout
        result = ghcc.clone("torvalds",
                            "linux",
                            clone_folder=self.tempdir.name,
                            timeout=1)
        self.assertFalse(result.success, msg=result.captured_output)
        self.assertEqual(ghcc.CloneErrorType.Timeout,
                         result.error_type,
                         msg=result.captured_output)
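
The test above exercises the three outcomes surfaced by ghcc.clone: success, CloneErrorType.PrivateOrNonexistent, and CloneErrorType.Timeout. A minimal sketch of how non-test code might branch on these outcomes is given below; the clone_with_retry helper and its retry policy are illustrative assumptions, not part of ghcc.

import ghcc


def clone_with_retry(owner: str, name: str, folder: str,
                     timeout: int = 600, retries: int = 2):
    # Hypothetical helper (not part of ghcc): retry only when the clone timed
    # out; a private or nonexistent repository will never succeed on retry.
    result = None
    for _ in range(retries + 1):
        result = ghcc.clone(owner, name, clone_folder=folder, timeout=timeout)
        if result.success or result.error_type is not ghcc.CloneErrorType.Timeout:
            break
    return result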
Example #2
def main():
    flutes.register_ipython_excepthook()
    random.seed(ghcc.__MAGIC__)
    np.random.seed(ghcc.__MAGIC__)

    repo_info = analyze_logs(args.log_file)
    changed = changed_repos(repo_info)

    # Sample 100 failed repositories.
    repos_with_fail = [
        repo for repo, info in repo_info.items()
        if info["n_partial"][-1] < info["n_total"][-1]
    ]
    samples = np.random.choice(len(repos_with_fail), 100, replace=False)
    _repo_samples = [repos_with_fail[x] for x in samples]

    # Remove repositories with more than 50 Makefiles.
    repo_samples = []
    for repo in _repo_samples:
        _, val = repo_info[repo]["n_total"][-1]
        if val <= 50:
            repo_samples.append(repo)
        else:
            print(f"{repo} contains {val} Makefiles, skipping")

    # Clone the repositories.
    for repo in tqdm(repo_samples, desc="Cloning repos"):
        owner, name = repo.split("/")
        ghcc.clone(owner, name, "test_compile")

    # Write repository information into a CSV file.
    # Each line is a separate Makefile.
    db = ghcc.RepoDB()
    with open("repo_samples.csv", "w") as f:
        writer = csv.writer(f)
        writer.writerow(["Repo", "Makefile", "Status", "Failed Reason?"])

        for repo in tqdm(repo_samples, desc="Writing CSV"):
            makefiles = ghcc.find_makefiles(os.path.join("test_compile", repo))
            owner, name = repo.split("/")
            entry = db.get(owner, name)
            success_makefiles = set()
            for makefile_info in entry['makefiles']:
                directory = makefile_info["directory"]
                directory = "/".join([owner, name] + directory.split("/")[4:])
                success_makefiles.add(directory)
            for makefile in makefiles:
                directory = "/".join(makefile.split("/")[1:])
                status = "" if directory in success_makefiles else "Failed"
                writer.writerow([repo, directory, status])
                print(repo, directory, status)
Example #3
    def setUp(self) -> None:
        self.tempdir = tempfile.TemporaryDirectory()
        self.repo_owner = "pjreddie"
        self.repo_name = "uwimg"

        # Clone an existing repo.
        result = ghcc.clone(self.repo_owner, self.repo_name, clone_folder=self.tempdir.name, skip_if_exists=False)
        assert result.success is True, result.captured_output

        self.directory = os.path.join(self.tempdir.name, self.repo_owner, self.repo_name)
        self.target_elfs = [
            "libuwimg.so",
            "obj/args.o",
            "obj/classifier.o",
            "obj/data.o",
            "obj/filter_image.o",
            "obj/flow_image.o",
            "obj/harris_image.o",
            "obj/image_opencv.o",
            "obj/list.o",
            "obj/load_image.o",
            "obj/main.o",
            "obj/matrix.o",
            "obj/panorama_image.o",
            "obj/process_image.o",
            "obj/resize_image.o",
            "obj/test.o",
            "uwimg",
        ]
Example #4
    def test_serialization(self) -> None:
        # Clone the `pycparser` repo.
        result = ghcc.clone("eliben",
                            "pycparser",
                            clone_folder=self.tempdir.name)
        assert result.success

        def _test(code: str):
            ast = self.parser.parse(code)
            json_dict = ghcc.parse.ast_to_dict(ast)
            deserialized_ast = ghcc.parse.dict_to_ast(json_dict)
            self._test_ast_equivalent(ast, deserialized_ast)

        for file in (Path(self.tempdir.name) / "eliben" / "pycparser" /
                     "examples" / "c_files").iterdir():
            preprocessed_code = ghcc.parse.preprocess_file(str(file))
            _test(preprocessed_code)

        for code, _ in EXAMPLE_CODE:
            preprocessed_code = ghcc.parse.preprocess(code)
            _test(preprocessed_code)
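
Since ghcc.parse.ast_to_dict returns a plain dictionary (which the round-trip test above relies on), the serialized AST can presumably be persisted with the standard json module. A minimal sketch under that assumption; dump_ast and load_ast are illustrative names, and parser is assumed to be a pycparser CParser configured as in these tests.

import json

import ghcc


def dump_ast(parser, code: str, path: str) -> None:
    # Parse preprocessed C code and write the dict form of its AST to disk.
    ast = parser.parse(code)
    with open(path, "w") as f:
        json.dump(ghcc.parse.ast_to_dict(ast), f)


def load_ast(path: str):
    # Rebuild the AST object from the on-disk dict representation.
    with open(path) as f:
        return ghcc.parse.dict_to_ast(json.load(f))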
Example #5
def match_functions(
        repo_info: RepoInfo,
        archive_folder: str,
        temp_folder: str,
        decompile_folder: str,
        use_fake_libc_headers: bool = True,
        preprocess_timeout: Optional[int] = None,
        *,
        progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None
) -> Result:
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long, as the compiler exits after running the preprocessor, and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    #    (A standalone sketch of this preprocess-and-parse step follows after this example.)
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    total_files = sum(
        len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) /
                    f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files,
                         desc=process_name + f" [{repo_full_name}]")

    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract archive
        flutes.run_command(["tar", f"xzf", str(archive_path)],
                           cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone repo
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner,
                         repo_info.repo_name,
                         clone_folder=str(repo_dir),
                         folder_name="src")
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(
                f"Failed to clone {repo_full_name}: error type {ret.error_type}",
                "error")
            # Return a dummy result so this repo is ignored in the future.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0,
                          0, 0)

    # Write makefile info to pickle
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = f"-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir),
        str(repo_src_path),
        compile_timeout=preprocess_timeout,
        gcc_override_flags=gcc_flags,
        use_makefile_info_pkl=True,
        directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler,
                                           repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code,
                                                     filename=os.path.join(
                                                         repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(
                    f"{repo_full_name}: Parser error when processing file "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            original_tokens = ghcc.parse.convert_to_tokens(
                code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [
                    line for line in f if line
                ]  # don't decode, as we only need the function name
            decompiled_funcs: Dict[str,
                                   str] = {}  # (func_name) -> decompiled_code
            # (func_name) -> (var_id) -> (decomp_name, orig_name)
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {}

            for line_num, j in enumerate(decompiled_json):
                # Find function name from JSON line without parsing.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(
                        f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                        f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't support Unicode.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(
                        random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(
                        f"{repo_full_name}: Could not find valid identifier prefix for "
                        f"{func_name} in {code_path} ({sha})", "error")
                    continue
                variables: Dict[str, Tuple[str, str]] = {
                }  # (var_id) -> (decompiled_name, original_name)
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        assert variables[var_id] == (decompiled_name,
                                                     original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in decompiled code to make it parsable:
                # - Replace `@@VAR` with special identifiers (literally any identifier that doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(
                    rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub(
                    "", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hexrays would chomp off one leading underscore from function names in their
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`). Here we
                    # heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] +
                        r"(?![a-zA-Z0-9_])", func_name, decompiled_code)
                    # Note that this doesn't fix references of the function in other functions. But really, why would
                    # someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (
                    f"{repo_full_name}: GCC return value nonzero for decompiled code of "
                    f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(
                    code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(
                    code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(
                    f"{repo_full_name}: Could not parse decompiled code for "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True

                # We don't have ASTs for decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(
                    lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        decompiled_func_tokens = lexer.lex("\n".join(
                            code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(
                            original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path,
                            binary_hash=sha,
                            func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens,
                            decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json,
                            decompiled_ast_json=None)
                        matched_functions.append(matched_func)

            else:
                # We've successfully parsed decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there's other Hexrays-renamed functions that we didn't fix, just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(
                        original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(
                        decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path,
                        binary_hash=sha,
                        func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens,
                        decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json,
                        decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Cleanup the folders; if errors occurred, keep the preprocessed code.
    status = ("success" if not has_error and len(matched_functions) > 0 else (
        "warning" if not has_error or len(matched_functions) > 0 else "error"))
    shutil.rmtree(repo_dir)

    end_time = time.time()
    funcs_without_asts = sum(matched_func.decompiled_ast_json is None
                             for matched_func in matched_functions)
    flutes.log(
        f"[{end_time - start_time:6.2f}s] "
        f"{repo_full_name}: "
        f"Files found: {files_found}/{total_files}, "
        f"functions matched: {len(matched_functions)}/{functions_found} "
        f"({funcs_without_asts} w/o ASTs)",
        status,
        force_console=True)
    return Result(repo_owner=repo_info.repo_owner,
                  repo_name=repo_info.repo_name,
                  matched_functions=matched_functions,
                  preprocessed_original_code=preprocessed_original_code,
                  files_found=files_found,
                  functions_found=functions_found,
                  funcs_without_asts=funcs_without_asts)
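
The "Directions" comment at the top of match_functions describes the core preprocessing trick: invoke the compiler with "-E" (optionally adding "-nostdlib" and pycparser's fake libc headers) so that only the preprocessor runs, then parse the resulting text with pycparser. A standalone sketch of that step, outside the Docker/Makefile machinery, follows; the use of plain subprocess and the FAKE_LIBC_INCLUDE placeholder path are illustrative assumptions.

import subprocess

from pycparser import CParser

FAKE_LIBC_INCLUDE = "/path/to/pycparser/utils/fake_libc_include"  # assumed location


def preprocess_and_parse(source_path: str):
    # Run only the preprocessor; no compilation or linking is attempted.
    completed = subprocess.run(
        ["gcc", "-E", "-nostdlib", f"-I{FAKE_LIBC_INCLUDE}", source_path],
        capture_output=True, text=True, check=True)
    # The preprocessed output is self-contained C code that pycparser can parse.
    return CParser().parse(completed.stdout, filename=source_path)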
Example #6
def clone_and_compile(
        repo_info: RepoInfo,
        clone_folder: str,
        binary_folder: str,
        archive_folder: str,
        recursive_clone: bool = True,
        clone_timeout: Optional[float] = None,
        compile_timeout: Optional[float] = None,
        force_reclone: bool = False,
        force_recompile: bool = False,
        docker_batch_compile: bool = True,
        max_archive_size: Optional[int] = None,
        compression_type: str = "gzip",
        record_libraries: bool = False,
        record_metainfo: bool = False,
        gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder will be
        ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``.
        This strange notation is used in order to have a flat directory hierarchy, so we're not left with a bunch of
        empty folders for repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder will
        be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file will
        be ``archive_folder/repo_owner/repo_name.tar.gz`` (or ``.tar.xz``, depending on ``compression_type``), e.g.,
        ``archive_folder/torvalds/linux.tar.gz``.

    :param recursive_clone: If ``True``, uses ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or `None` (default) for unlimited time.
    :param compile_timeout: Timeout for compilation, or `None` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when there
        are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given
        value (in bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.

    :return: An entry to insert into the DB; if no further operations are required, a ``PipelineResult`` containing
        only the repository information is returned.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(
        os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed
    if (repo_entry is not None
            and (repo_entry["clone_successful"] and not force_reclone)
            and (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
        try:
            flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path],
                               timeout=clone_timeout,
                               cwd=clone_folder)
            flutes.log(f"{repo_full_name} extracted from archive", "success")
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            flutes.log(
                f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'",
                "error")
            shutil.rmtree(repo_path)
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)
    elif (repo_entry is None or  # not processed
          force_reclone or (repo_entry["clone_successful"] and  # not compiled
                            (not repo_entry["compiled"] or force_recompile) and
                            not os.path.exists(repo_path))):
        clone_result = ghcc.clone(repo_info.repo_owner,
                                  repo_info.repo_name,
                                  clone_folder=clone_folder,
                                  folder_name=repo_folder_name,
                                  timeout=clone_timeout,
                                  skip_if_exists=False,
                                  recursive=recursive_clone)
        clone_success = clone_result.success
        if not clone_result.success:
            if clone_result.error_type is CloneErrorType.FolderExists:
                flutes.log(f"{repo_full_name} skipped because folder exists",
                           "warning")
            elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent:
                flutes.log(
                    f"Failed to clone {repo_full_name} because repository is private or nonexistent",
                    "warning")
            else:
                if clone_result.error_type is CloneErrorType.Unknown:
                    msg = f"Failed to clone {repo_full_name} with unknown error"
                else:  # CloneErrorType.Timeout
                    msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}"
                if clone_result.captured_output is not None:
                    msg += f". Captured output: '{clone_result.captured_output!r}'"
                flutes.log(msg, "error")

                if clone_result.error_type is CloneErrorType.Unknown:
                    return PipelineResult(repo_info)  # return dummy info

            return PipelineResult(repo_info, clone_success=clone_success)

        elif clone_result.error_type is CloneErrorType.SubmodulesFailed:
            msg = f"Submodules in {repo_full_name} ignored due to error"
            if clone_result.captured_output is not None:
                msg += f". Captured output: '{clone_result.captured_output!r}'"
            flutes.log(msg, "warning")

        repo_size = flutes.get_folder_size(repo_path)
        flutes.log(
            f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, "
            f"{flutes.readable_size(repo_size)})", "success")
    else:
        if not repo_entry["clone_successful"]:
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)

    makefiles = None
    libraries = None
    meta_info: Optional[PipelineMetaInfo] = None
    if not repo_entry or not repo_entry["compiled"] or force_recompile:
        # # SPECIAL CHECK: Do not attempt to compile OS kernels!
        # kernel_name = None
        # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"):
        #     kernel_name = "Linux"
        # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"):
        #     kernel_name = "FreeBSD"
        # if kernel_name is not None:
        #     shutil.rmtree(repo_path)
        #     ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. "
        #              f"Repository deleted", "warning")
        #     return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 2: Finding Makefiles.
        makefile_dirs = ghcc.find_makefiles(repo_path)
        if len(makefile_dirs) == 0:
            # Repo has no Makefiles, delete.
            shutil.rmtree(repo_path)
            flutes.log(
                f"No Makefiles found in {repo_full_name}, repository deleted",
                "warning")
            return PipelineResult(repo_info,
                                  clone_success=clone_success,
                                  makefiles=[])

        # Stage 3: Compile each Makefile.
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        if not os.path.exists(repo_binary_dir):
            os.makedirs(repo_binary_dir)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir,
                repo_path,
                compile_timeout,
                record_libraries,
                gcc_override_flags,
                user_id=(repo_info.idx % 10000) +
                30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler,
                                                   repo_info=repo_info))
        else:
            makefiles = list(
                ghcc.compile_and_move(repo_binary_dir, repo_path,
                                      makefile_dirs, compile_timeout,
                                      record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) " \
              f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries"
        flutes.log(
            msg,
            "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles":
                len(makefile_dirs),
                "has_gitmodules":
                os.path.exists(os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake":
                sum(
                    ghcc.contains_files(directory,
                                        ["configure.ac", "configure.in"])
                    for directory in makefile_dirs)
            })

        # Stage 4: Clean and zip repo.
        if max_archive_size is not None and repo_size > max_archive_size:
            shutil.rmtree(repo_path)
            flutes.log(
                f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
                f"exceeds limits", "info")
        else:
            # Repository is already cleaned in the compile stage.
            os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
            compress_success = False
            try:
                flutes.run_command([
                    "tar", f"c{tar_type_flag}f", archive_path, repo_folder_name
                ],
                                   timeout=clone_timeout,
                                   cwd=clone_folder)
                compress_success = True
            except subprocess.TimeoutExpired:
                flutes.log(
                    f"Compression timeout for {repo_full_name}, giving up",
                    "error")
            except subprocess.CalledProcessError as e:
                flutes.log(
                    f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'",
                    "error")
            shutil.rmtree(repo_path)
            if compress_success:
                flutes.log(f"Compressed {repo_full_name}, folder removed",
                           "info")
            elif os.path.exists(archive_path):
                os.remove(archive_path)

    return PipelineResult(repo_info,
                          clone_success=clone_success,
                          repo_size=repo_size,
                          makefiles=makefiles,
                          libraries=libraries,
                          meta_info=meta_info)
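
Based on the docstring above, the sketch below shows how clone_and_compile might be driven for a single repository. The RepoInfo construction and the folder names are assumptions for illustration; the real pipeline presumably builds RepoInfo entries from the repository database.

# Illustrative only: RepoInfo is assumed to accept the fields that
# clone_and_compile reads (idx, repo_owner, repo_name, db_result).
repo = RepoInfo(idx=0, repo_owner="torvalds", repo_name="linux", db_result=None)
result = clone_and_compile(
    repo,
    clone_folder="repos",
    binary_folder="binaries",
    archive_folder="archives",
    clone_timeout=600,
    compile_timeout=900,
    compression_type="gzip",
    record_libraries=True)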