Example #1
def iter_binaries(db: ghcc.BinaryDB,
                  binaries: Dict[str, BinaryInfo]) -> Iterator[BinaryInfo]:
    # Fetching all entries in one batch is much faster than per-key queries.
    binary_entries = {entry["sha"]: entry
                      for entry in db.collection.find()}
    skipped_count = 0
    migrated_count = 0
    for sha, info in binaries.items():
        entry = binary_entries.get(sha, None)
        if entry is not None:
            if "repo_owner" in entry:
                skipped_count += 1
            else:
                db.collection.update_one({"_id": entry["_id"]}, {
                    "$set": {
                        "repo_owner": info["repo_owner"],
                        "repo_name": info["repo_name"],
                    }
                })
                migrated_count += 1
            continue
        if migrated_count > 0:
            flutes.log(f"Migrated {migrated_count} binary entries",
                       force_console=True)
            migrated_count = 0
        if skipped_count > 0:
            flutes.log(
                f"Skipped {skipped_count} binaries that have been processed",
                force_console=True)
            skipped_count = 0
        yield info
Example #2
def progress_bar_fn(idx: int, bar) -> None:
    total = (idx + 1) * 2
    bar.new(desc=f"Bar {idx}", total=total)
    for i in range(total):
        bar.update(1, postfix={"i": i})
        if i % 5 == 1:
            flutes.log(f"test {i}")
    for i in bar.new(range(total)):
        bar.update(postfix={"i": i})
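A minimal driver for the function above, sketched under the assumption that `progress_bar_fn` is importable from the same module; it mirrors the `ProgressBarManager` usage in Example #5 (`run_progress_bars_locally` is a hypothetical name).

import functools

import flutes


def run_progress_bars_locally() -> None:
    manager = flutes.ProgressBarManager()
    # `0` runs the jobs in-process, which is handy for debugging.
    with flutes.safe_pool(0, closing=[manager]) as pool:
        fn = functools.partial(progress_bar_fn, bar=manager.proxy)
        pool.map(fn, range(4))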
Example #3
def main(args):

    random.seed(args.seed)

    if (not args.output_dir.exists()) or args.overwrite:
        args.output_dir.mkdir(exist_ok=True, parents=True)
    else:
        print(f"Directory {args.output_dir} already exists.")
        sys.exit(0)

    tr_indices, vl_indices, ts_indices = make_splits_indices(
        100, args.split_ratio)
    with (args.output_dir / "file_map.txt").open("w") as f:
        print("train", file=f)
        for i in tr_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)
        print("valid", file=f)
        for i in vl_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)
        print("test")
        for i in ts_indices:
            print(f"{i} ", file=f, end="")
        print(file=f)

    # in/out file pairs
    files = ([(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"train_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(tr_indices)] + [(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"valid_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(vl_indices)] + [(
        (args.dump_dir / f"{i}.pkl"),
        (args.output_dir / f"test_{new_idx:02}.jsonl"),
    ) for new_idx, i in enumerate(ts_indices)])

    # files = sorted(
    #     list((args.dump_dir).glob("*.pkl")), key=lambda x: int(x.with_suffix("").name)
    # )

    total = {}
    with flutes.safe_pool(processes=args.njobs, state_class=Processor) as pool:
        for idx, _ in enumerate(
                pool.imap_unordered(Processor.process_pkl, files,
                                    chunksize=1)):
            flutes.log(f"Processed {(idx + 1)} files")

        states = pool.get_states()
        for state in states:
            total.update(state.results)
    print(
        f"Train: {sum(v for k, v in total.items() if k.startswith('train'))}")
    print(
        f"Valid: {sum(v for k, v in total.items() if k.startswith('valid'))}")
    print(f"Test:  {sum(v for k, v in total.items() if k.startswith('test'))}")
Example #4
def main(args) -> None:

    files = [(args.output_dir, args.input_dir / f"{i}.pkl")
             for i in range(0, 10000, 100)]

    total_map = {}
    with flutes.work_in_progress("Parallel"):
        with flutes.safe_pool(processes=args.njobs,
                              state_class=Worker) as pool_stateful:
            for idx, _ in enumerate(
                    pool_stateful.imap_unordered(Worker.merge,
                                                 files,
                                                 chunksize=1)):
                flutes.log(f"Processed {(idx + 1)} files")
Example #5
def test_ProgressBarManager() -> None:
    for verbose in [False, True]:
        for proc in [0, 2]:
            # `proc = 2` exercises multiprocessing; `proc = 0` runs everything
            # in-process so coverage can be measured.
            manager = flutes.ProgressBarManager(verbose=verbose)
            with flutes.safe_pool(proc, closing=[manager]) as pool:
                fn = functools.partial(progress_bar_fn, bar=manager.proxy)
                pool.map(fn, range(10))
                fn = functools.partial(file_progress_bar_fn, bar=manager.proxy)
                pool.map(fn, range(4))
            flutes.log(
                f"This should still show up: verbose={verbose}, proc={proc}",
                force_console=True)
Example #6
def test_log() -> None:
    with tempfile.NamedTemporaryFile("w") as f_tmp:
        flutes.set_log_file(f_tmp.name)
        flutes.set_log_file(f_tmp.name)
        flutes.set_logging_level("warning")
        flutes.log("info output", "info")
        flutes.log("warning output", "warning")
        flutes.log("error output", "error")
        flutes.log("success output", "success")
Example #7
def iter_repos(db: ghcc.RepoDB,
               repo_list_path: str,
               max_count: Optional[int] = None) -> Iterator[RepoInfo]:
    # Fetching all entries in one batch is much faster than per-key queries.
    db_entries = {
        (entry["repo_owner"], entry["repo_name"]): entry
        for entry in db.collection.find()
    }
    flutes.log(f"{len(db_entries)} entries loaded from DB")
    index = 0
    with open(repo_list_path, "r") as repo_file:
        for line in repo_file:
            if not line:
                continue
            url = line.strip().rstrip("/")
            if url.endswith(".git"):
                url = url[:-len(".git")]
            repo_owner, repo_name = url.split("/")[-2:]
            # db_result = db.get(repo_owner, repo_name)
            db_result = db_entries.get((repo_owner, repo_name), None)
            yield RepoInfo(index, repo_owner, repo_name, db_result)
            index += 1
            if max_count is not None and index >= max_count:
                break
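The URL-to-(owner, name) parsing above, isolated as a standard-library sketch with a usage check (`parse_repo_url` is a hypothetical helper name).

from typing import Tuple


def parse_repo_url(line: str) -> Tuple[str, str]:
    url = line.strip().rstrip("/")
    if url.endswith(".git"):
        url = url[:-len(".git")]
    repo_owner, repo_name = url.split("/")[-2:]
    return repo_owner, repo_name


assert parse_repo_url("https://github.com/torvalds/linux.git\n") == \
    ("torvalds", "linux")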
Example #8
def main(args) -> None:
    Proc = LegacyFilter if args.legacy else Filter

    if args.target == "paper":
        processing = Proc.filter_ids_complete  # gathers pid, in/out cite pids
    elif args.target == "citation":
        processing = Proc.filter_ids_text  # gathers pids

    if args.valid_citations is not None and args.valid_citations.exists():
        with args.valid_citations.open("rb") as f:
            d = pickle.load(f)
        dict_valid_citations = {k: True for _, pids in d.items() for k in pids}
        del d
    else:
        dict_valid_citations = {}

    files = [
        (f, dict_valid_citations, args.min_cite, args.max_cite, args.seed)
        for f in list(args.input_dir.glob("*"))
    ]

    with flutes.work_in_progress("Parallel"):
        total_results = defaultdict(list)
        with flutes.safe_pool(processes=args.njobs, state_class=Proc) as pool_stateful:
            for idx, _ in enumerate(
                pool_stateful.imap_unordered(processing, files, chunksize=10)
            ):
                flutes.log(f"Processed {(idx + 1)} files")
            with flutes.work_in_progress("Get states"):
                states = pool_stateful.get_states()
            for state in states:
                total_results.update(state.results)

    with args.output_file.open("wb") as f:
        # Dict[batchnum, List[obj]]
        pickle.dump(total_results, f)
Example #9
def load_bg_info_legacy(target_pid, obj, content="cite_context"):

    if obj is None:
        flutes.log(f"{target_pid} not found.")
        return None

    if content == "abstract":
        if obj["metadata"]["abstract"] is not None:
            return obj["metadata"]["abstract"]
        elif len(obj["grobid_parse"]["abstract"]) > 0:
            return obj["grobid_parse"]["abstract"][0]["text"]
        return obj["metadata"]["abstract"]

    elif content == "cite_context":
        parse_data = obj["grobid_parse"]
        if parse_data is None:
            flutes.log(f"parse not found.")
            return None

        try:
            # find the right BIBREFX
            bibdict = {
                v["links"]: k
                for k, v in parse_data["bib_entries"].items()
                if v["links"] is not None
            }
            bibref = bibdict[target_pid]

        except KeyError:
            # data bug
            flutes.log(f"links not found.")
            return None

        match = []
        for block in parse_data["body_text"]:
            sec = block["section"]
            spans = [
                span for span in block["cite_spans"]
                if span["ref_id"] == bibref
            ]
            if len(spans) == 0:
                continue

            # look for the right sent idx where the span belongs to.
            sents = sent_tokenize(block["text"].replace("al.", "al@"))
            # cumulative sum + stripped (hopefully just one) spaces for each sent.
            sent_start_pos = np.cumsum([len(s) for s in sents]) + np.arange(
                len(sents), dtype=np.int64)
            for sp in spans:
                # TODO: try to pick up section information here once merged!
                sent_idx = bisect.bisect_left(sent_start_pos, sp["start"])
                if sent_idx == len(sents):  # de-tokenize error
                    sent_idx = len(sents) - 1
                match.append((sec, sents[sent_idx].replace("al@", "al.")))

        return match
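The character-offset-to-sentence lookup above, restated as a small standard-library sketch (`sentence_index` is a hypothetical name); it mirrors the cumulative-sum-plus-stripped-spaces computation and the clamp for de-tokenization overshoot.

import bisect
from typing import Sequence


def sentence_index(sents: Sequence[str], start: int) -> int:
    # End position of each sentence in the original block, assuming one
    # stripped space between consecutive sentences.
    end_pos = [sum(len(s) for s in sents[:i + 1]) + i
               for i in range(len(sents))]
    idx = bisect.bisect_left(end_pos, start)
    return min(idx, len(sents) - 1)  # clamp if the offset overshoots


sents = ["First sentence.", "Second one."]
assert sentence_index(sents, 0) == 0
assert sentence_index(sents, 18) == 1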
Example #10
def compare_logs(info_old: Dict[str, Dict[str, int]], info_new: Dict[str, Dict[str, int]]) -> Dict[str, DiffDict]:
    for repo_name in info_new:
        if repo_name not in info_old:
            flutes.log(f"{repo_name} missing in OLD log", "error")
    repo_diff: Dict[str, DiffDict] = defaultdict(dict)
    for repo_name in info_old:
        if repo_name not in info_new:
            flutes.log(f"{repo_name} missing in NEW", "error")
            continue
        old_repo_info = info_old[repo_name]
        new_repo_info = info_new[repo_name]
        difference = []
        for tag in TAGS:
            old_val = old_repo_info[tag]
            new_val = new_repo_info[tag]
            if old_val != new_val:
                difference.append(f"{tag} {old_val}->{new_val}")
                repo_diff[repo_name][tag] = (old_val, new_val)
        if len(difference) > 0:
            flutes.log(f"{repo_name}: {', '.join(difference)}")
    return repo_diff
Example #11
def main() -> None:
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.log(f"Running with {args.n_procs} worker processes", "warning")

    # Check for/create output directories
    make_directory(args.output_dir)

    # Use RAM-backed memory for tmp if available
    if os.path.exists('/dev/shm'):
        tempfile.tempdir = '/dev/shm'

    flutes.set_log_file(args.log_file)
    write_pseudo_registry()

    # Obtain a list of all binaries
    binaries = get_binary_mapping(args.binary_mapping_cache_file)

    flutes.log(f"{len(binaries)} binaries to process.")
    file_count = 0
    db = ghcc.BinaryDB()

    with flutes.safe_pool(args.n_procs, closing=[db]) as pool:
        decompile_fn: Callable[[BinaryInfo],
                               DecompilationResult] = functools.partial(
                                   decompile,
                                   output_dir=args.output_dir,
                                   binary_dir=args.binaries_dir,
                                   timeout=args.timeout)
        for result in pool.imap_unordered(decompile_fn,
                                          iter_binaries(db, binaries)):
            file_count += 1
            if result is not None:
                db.add_binary(result.info["repo_owner"],
                              result.info["repo_name"], result.hash,
                              result.status is DecompilationStatus.Success)
            if file_count % 100 == 0:
                flutes.log(f"Processed {file_count} binaries",
                           force_console=True)
Example #12
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    sys.setrecursionlimit(10000)
    args = Arguments()
    if args.pdb:
        flutes.register_ipython_excepthook()
        if args.n_procs == 0:
            globals()['match_functions'] = match_functions.__wrapped__

    if not args.verbose:
        flutes.set_logging_level("quiet", console=True, file=False)
    flutes.set_log_file(args.log_file)
    flutes.log("Running with arguments:\n" + args.to_string(),
               force_console=True)

    if os.path.exists(args.temp_dir):
        flutes.log(
            f"Removing contents of temporary folder '{args.temp_dir}'...",
            "warning",
            force_console=True)
        ghcc.utils.run_docker_command(
            ["rm", "-rf", "/usr/src/*"],
            user=0,
            directory_mapping={args.temp_dir: "/usr/src"})

    db = ghcc.MatchFuncDB()
    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    manager = flutes.ProgressBarManager(
        verbose=args.show_progress,
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]")
    with flutes.safe_pool(args.n_procs, closing=[db, manager]) as pool:
        iterator, stats = iter_repos(
            db,
            args.max_repos,
            skip_to=args.skip_to,
            cache_path=args.repo_binary_info_cache_path,
            force_reprocess=args.force_reprocess)
        match_fn: Callable[[RepoInfo], Result] = functools.partial(
            match_functions,
            archive_folder=args.archive_dir,
            temp_folder=args.temp_dir,
            decompile_folder=args.decompile_dir,
            use_fake_libc_headers=args.use_fake_libc_headers,
            preprocess_timeout=args.preprocess_timeout,
            progress_bar=manager.proxy)

        repo_count = stats.repo_count
        func_count = stats.func_count
        func_without_ast_count = stats.func_without_ast_count
        for result in pool.imap_unordered(match_fn, iterator):
            if result is None:
                # Exception occurred.
                if args.exit_on_exception:
                    flutes.log(
                        f"Exception occurred, exiting because 'exit_on_exception' is True",
                        "warning")
                    break
                continue

            # Write the matched functions to disk.
            result: Result  # type: ignore
            repo_dir = output_dir / result.repo_owner / result.repo_name
            repo_dir.mkdir(parents=True, exist_ok=True)
            with (repo_dir / "matched_funcs.jsonl").open("w") as f:
                for matched_func in result.matched_functions:
                    f.write(
                        json.dumps(matched_func._asdict(),
                                   separators=(',', ':')) + "\n")
            for sha, code in result.preprocessed_original_code.items():
                with (repo_dir / f"{sha}.c").open("w") as f:
                    pos = code.rfind(ghcc.parse.FAKE_LIBC_END_LINE)
                    if pos != -1:
                        code = code[(pos +
                                     len(ghcc.parse.FAKE_LIBC_END_LINE)):]
                    f.write(code)

            if args.write_db:
                db.add_repo(
                    result.repo_owner,
                    result.repo_name,
                    files_found=result.files_found,
                    funcs_found=result.functions_found,
                    funcs_matched=len(result.matched_functions),
                    funcs_matched_without_ast=result.funcs_without_asts)

            repo_count += 1
            func_count += len(result.matched_functions)
            func_without_ast_count += result.funcs_without_asts
            if repo_count % 100 == 0:
                flutes.log(
                    f"Processed {repo_count} repositories, {func_count} functions matched "
                    f"({func_without_ast_count} w/o AST)",
                    force_console=True)
Example #13
def main():
    # flutes.register_ipython_excepthook()

    sys.setrecursionlimit(50000)
    args = Arguments()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    flutes.log("Dataset creation start")

    db = ghcc.MatchFuncDB()
    original_code_set: Set[str] = set()
    n_duplicate = 0
    n_examples = 0
    manager = mp.Manager()
    example_queue: 'mp.Queue[QueueElem]' = manager.Queue(args.queue_size)
    with flutes.safe_pool(args.n_procs, closing=[db]) as pool:
        repos = [
            RepoInfo(entry['repo_owner'], entry['repo_name'])
            for entry in db.collection.find() if entry['funcs_matched'] > 0
        ]
        if args.max_repos is not None:
            repos = repos[:args.max_repos]
        process_fn: Callable[[RepoInfo],
                             None] = functools.partial(process,
                                                       data_dir=args.input_dir,
                                                       queue=example_queue)
        pool.map_async(process_fn, repos, error_callback=flutes.log_exception)
        end_signals = 0
        progress = tqdm.tqdm(total=len(repos))
        file_cnt = 0
        text_data = []

        def save_file():
            nonlocal file_cnt, text_data
            # Save text & AST separately
            with (output_dir / f"data_{file_cnt:03d}.pkl").open("wb") as f:
                pickle.dump(text_data, f, protocol=PICKLE_PROTOCOL)
            progress.write(f"Saved part {file_cnt:03d}")
            text_data = []
            file_cnt += 1

        while end_signals < len(repos):
            elem = example_queue.get()
            if elem == END_SIGNATURE:
                progress.update(1)
                end_signals += 1
                continue

            ex = pickle.loads(elem)
            original_code = ex[1]
            if original_code not in original_code_set:
                original_code_set.add(original_code)
                # (decompiled, orig, var_names, repo, sha)
                text_data.append(ex)
                n_examples += 1
            else:
                n_duplicate += 1
            if (n_examples + n_duplicate) % 100 == 0:
                progress.set_postfix(
                    {
                        "duplicate": n_duplicate,
                        "examples": n_examples
                    },
                    refresh=False)
                progress.refresh()
            if len(text_data) >= args.block_size:
                save_file()

        if len(text_data) > 0:
            save_file()
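The queue-and-sentinel pattern above in miniature: each worker pushes pickled items followed by an end marker, and the main process drains the queue until it has seen one marker per worker (all names here are illustrative, not the project's).

import multiprocessing as mp
import pickle
from typing import Any, List, Tuple

import flutes

END = b"__end__"


def worker(args: Tuple[int, Any]) -> None:
    idx, queue = args
    queue.put(pickle.dumps((idx, f"item-{idx}")))
    queue.put(END)


def drain(n_workers: int = 4) -> List[Any]:
    manager = mp.Manager()
    queue = manager.Queue()  # manager queues can be passed to pool workers
    items: List[Any] = []
    with flutes.safe_pool(n_workers) as pool:
        pool.map_async(worker, [(i, queue) for i in range(n_workers)],
                       error_callback=flutes.log_exception)
        end_signals = 0
        while end_signals < n_workers:
            elem = queue.get()
            if elem == END:
                end_signals += 1
                continue
            items.append(pickle.loads(elem))
    return items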
Example #14
def match_functions(
        repo_info: RepoInfo,
        archive_folder: str,
        temp_folder: str,
        decompile_folder: str,
        use_fake_libc_headers: bool = True,
        preprocess_timeout: Optional[int] = None,
        *,
        progress_bar: Optional[flutes.ProgressBarManager.Proxy] = None
) -> Result:
    # Directions:
    # 1. Clone or extract from archive.
    # 2. For each Makefile, rerun the compilation process with the flag "-E", so only the preprocessor is run.
    #    This probably won't take long as the compiler exits after running the preprocessor, and linking would fail.
    #    Also, consider using "-nostdlib -Ipath/to/fake_libc_include" as suggested by `pycparser`.
    # 3. The .o files are now preprocessed C code. Parse them using `pycparser` to obtain a list of functions.

    start_time = time.time()
    total_files = sum(
        len(makefile) for makefile in repo_info.makefiles.values())
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    archive_path = (Path(archive_folder) /
                    f"{repo_full_name}.tar.gz").absolute()
    repo_dir = (Path(temp_folder) / repo_folder_name).absolute()
    repo_src_path = repo_dir / "src"
    repo_binary_dir = repo_dir / "bin"
    repo_binary_dir.mkdir(parents=True, exist_ok=True)
    has_error = False

    if progress_bar is not None:
        worker_id = flutes.get_worker_id()
        process_name = f"Worker {worker_id}" if worker_id is not None else "Main Process"
        progress_bar.new(total=total_files,
                         desc=process_name + f" [{repo_full_name}]")

    flutes.log(f"Begin processing {repo_full_name} ({total_files} files)")

    if os.path.exists(archive_path):
        # Extract archive
        flutes.run_command(["tar", f"xzf", str(archive_path)],
                           cwd=str(repo_dir))
        (repo_dir / repo_folder_name).rename(repo_src_path)
    else:
        # Clone repo
        if repo_src_path.exists():
            shutil.rmtree(repo_src_path)
        ret = ghcc.clone(repo_info.repo_owner,
                         repo_info.repo_name,
                         clone_folder=str(repo_dir),
                         folder_name="src")
        if ret.error_type not in [None, ghcc.CloneErrorType.SubmodulesFailed]:
            flutes.log(
                f"Failed to clone {repo_full_name}: error type {ret.error_type}",
                "error")
            # Return a dummy result so this repo is ignored in the future.
            return Result(repo_info.repo_owner, repo_info.repo_name, [], {}, 0,
                          0, 0)

    # Write makefile info to pickle
    with (repo_binary_dir / "makefiles.pkl").open("wb") as f_pkl:
        pickle.dump(repo_info.makefiles, f_pkl)

    gcc_flags = "-E"
    directory_mapping = None
    if use_fake_libc_headers:
        gcc_flags = f"-E -nostdlib -I/usr/src/libc"
        directory_mapping = {ghcc.parse.FAKE_LIBC_PATH: "/usr/src/libc"}

    if progress_bar is not None:
        progress_bar.update(postfix={"status": "preprocessing"})
    makefiles = ghcc.docker_batch_compile(
        str(repo_binary_dir),
        str(repo_src_path),
        compile_timeout=preprocess_timeout,
        gcc_override_flags=gcc_flags,
        use_makefile_info_pkl=True,
        directory_mapping=directory_mapping,
        user_id=(repo_info.idx % 10000) + 30000,  # user IDs 30000 ~ 39999
        exception_log_fn=functools.partial(exception_handler,
                                           repo_info=repo_info))

    parser = CParser(lexer=ghcc.parse.CachedCLexer)
    lexer = ghcc.parse.LexerWrapper()
    decompile_path = Path(decompile_folder)
    extractor = ghcc.parse.FunctionExtractor()
    matched_functions: List[MatchedFunction] = []
    preprocessed_original_code: Dict[str, str] = {}
    files_found = 0
    functions_found = 0
    for makefile in makefiles:
        mkfile_dir = Path(makefile['directory'])
        for path, sha in zip(makefile["binaries"], makefile["sha256"]):
            # Load and parse preprocessed original code.
            code_path = str(mkfile_dir / path)
            json_path = decompile_path / (sha + ".jsonl")
            preprocessed_code_path = repo_binary_dir / sha
            if progress_bar is not None:
                progress_bar.update(1, postfix={"file": code_path})
            if not json_path.exists() or not preprocessed_code_path.exists():
                continue
            try:
                with preprocessed_code_path.open("r") as f:
                    code = f.read()
                code = LINE_CONTROL_REGEX.sub("", code)
            except UnicodeDecodeError:
                continue  # probably a real binary file
            preprocessed_original_code[sha] = code
            try:
                original_ast: ASTNode = parser.parse(code,
                                                     filename=os.path.join(
                                                         repo_full_name, path))
            except (pycparser.c_parser.ParseError, AssertionError) as e:
                # For some reason `pycparser` uses `assert`s in places where there should have been a check.
                flutes.log(
                    f"{repo_full_name}: Parser error when processing file "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True
                continue  # ignore parsing errors
            original_tokens = ghcc.parse.convert_to_tokens(
                code, parser.clex.cached_tokens)
            files_found += 1
            function_asts = extractor.find_functions(original_ast)
            functions_found += len(function_asts)

            # Collect decompiled functions with matching original code.
            with json_path.open("r") as f:
                decompiled_json = [
                    line for line in f if line
                ]  # don't decode, as we only need the function name
            decompiled_funcs: Dict[str,
                                   str] = {}  # (func_name) -> decompiled_code
            # (func_name) -> (var_id) -> (decomp_name, orig_name)
            decompiled_var_names: Dict[str, Dict[str, Tuple[str, str]]] = {}

            for line_num, j in enumerate(decompiled_json):
                # Find function name from JSON line without parsing.
                match = JSON_FUNC_NAME_REGEX.search(j)
                assert match is not None
                func_name = match.group(1)
                if func_name not in function_asts:
                    continue

                try:
                    decompiled_data = json.loads(j)
                except json.JSONDecodeError as e:
                    flutes.log(
                        f"{repo_full_name}: Decode error when reading JSON file at {json_path}: "
                        f"{str(e)}", "error")
                    continue
                decompiled_code = decompiled_data["raw_code"]
                # Store the variable names used in the function.
                # We use a random string as the identifier prefix. Sadly, C89 (and `pycparser`) doesn't support Unicode.
                for length in range(3, 10 + 1):
                    var_identifier_prefix = "v" + "".join(
                        random.choices(string.ascii_lowercase, k=length))
                    if var_identifier_prefix not in decompiled_code:
                        break
                else:
                    # No way this is happening, right?
                    flutes.log(
                        f"{repo_full_name}: Could not find valid identifier prefix for "
                        f"{func_name} in {code_path} ({sha})", "error")
                    continue
                variables: Dict[str, Tuple[str, str]] = {
                }  # (var_id) -> (decompiled_name, original_name)
                for match in DECOMPILED_VAR_REGEX.finditer(decompiled_code):
                    var_id, decompiled_name, original_name = match.groups()
                    var_id = f"{var_identifier_prefix}_{var_id}"
                    if var_id in variables:
                        assert variables[var_id] == (decompiled_name,
                                                     original_name)
                    else:
                        variables[var_id] = (decompiled_name, original_name)
                decompiled_var_names[func_name] = variables
                # Remove irregularities in decompiled code to make it parsable:
                # - Replace `@@VAR` with special identifiers (literally any identifier that doesn't clash).
                # - Remove the register allocation indication in `var@<rdi>`.
                decompiled_code = DECOMPILED_VAR_REGEX.sub(
                    rf"{var_identifier_prefix}_\1", decompiled_code)
                decompiled_code = DECOMPILED_REG_ALLOC_REGEX.sub(
                    "", decompiled_code)
                if func_name.startswith("_"):
                    # For some reason, Hexrays would chomp off one leading underscore from function names in their
                    # generated code, which might lead to corrupt code (`_01inverse` -> `01inverse`). Here we
                    # heuristically try to find and replace the changed function name.
                    decompiled_code = re.sub(  # replace all identifiers with matching name
                        r"(?<![a-zA-Z0-9_])" + func_name[1:] +
                        r"(?![a-zA-Z0-9_])", func_name, decompiled_code)
                    # Note that this doesn't fix references of the function in other functions. But really, why would
                    # someone name their function `_01inverse`?
                decompiled_funcs[func_name] = decompiled_code

            # Generate code replacing original functions with decompiled functions.
            replacer = ghcc.parse.FunctionReplacer(decompiled_funcs)
            replaced_code = replacer.visit(original_ast)

            # Obtain AST for decompiled code by parsing it again.
            code_to_preprocess = DECOMPILED_CODE_HEADER + "\n" + replaced_code
            try:
                code_to_parse = ghcc.parse.preprocess(code_to_preprocess)
            except ghcc.parse.PreprocessError as e:
                msg = (
                    f"{repo_full_name}: GCC return value nonzero for decompiled code of "
                    f"{code_path} ({sha})")
                if len(e.args) > 0:
                    msg += ":\n" + str(e)
                flutes.log(msg, "error")
                has_error = True
                continue

            try:
                decompiled_ast, code_to_parse = ghcc.parse.parse_decompiled_code(
                    code_to_parse, lexer, parser)
                decompiled_tokens = ghcc.parse.convert_to_tokens(
                    code_to_parse, parser.clex.cached_tokens)
            except (ValueError, pycparser.c_parser.ParseError) as e:
                flutes.log(
                    f"{repo_full_name}: Could not parse decompiled code for "
                    f"{code_path} ({sha}): {str(e)}", "error")
                has_error = True

                # We don't have ASTs for decompiled functions, but we can still dump the code.
                # Use the dummy typedefs to extract functions.
                code_lines = code_to_parse.split("\n")
                func_begin_end: Dict[str, List[Optional[int]]] = defaultdict(
                    lambda: [None, None])
                for idx, line in enumerate(code_lines):
                    name, is_begin = replacer.extract_func_name(line)
                    if name is not None:
                        func_begin_end[name][0 if is_begin else 1] = idx
                for func_name, (begin, end) in func_begin_end.items():
                    if begin is not None and end is not None and func_name in function_asts:
                        decompiled_func_tokens = lexer.lex("\n".join(
                            code_lines[(begin + 1):end]))
                        original_func_ast = function_asts[func_name]
                        original_ast_json, original_func_tokens = serialize(
                            original_func_ast, original_tokens)
                        matched_func = MatchedFunction(
                            file_path=code_path,
                            binary_hash=sha,
                            func_name=func_name,
                            variable_names=decompiled_var_names[func_name],
                            original_tokens=original_func_tokens,
                            decompiled_tokens=decompiled_func_tokens,
                            original_ast_json=original_ast_json,
                            decompiled_ast_json=None)
                        matched_functions.append(matched_func)

            else:
                # We've successfully parsed decompiled code.
                decompiled_func_asts = extractor.find_functions(decompiled_ast)
                for func_name in decompiled_funcs.keys():
                    original_func_ast = function_asts[func_name]
                    if func_name not in decompiled_func_asts:
                        # Maybe there's other Hexrays-renamed functions that we didn't fix, just ignore them.
                        continue
                    decompiled_func_ast = decompiled_func_asts[func_name]
                    original_ast_json, original_func_tokens = serialize(
                        original_func_ast, original_tokens)
                    decompiled_ast_json, decompiled_func_tokens = serialize(
                        decompiled_func_ast, decompiled_tokens)
                    matched_func = MatchedFunction(
                        file_path=code_path,
                        binary_hash=sha,
                        func_name=func_name,
                        variable_names=decompiled_var_names[func_name],
                        original_tokens=original_func_tokens,
                        decompiled_tokens=decompiled_func_tokens,
                        original_ast_json=original_ast_json,
                        decompiled_ast_json=decompiled_ast_json)
                    matched_functions.append(matched_func)

    # Cleanup the folders; if errors occurred, keep the preprocessed code.
    status = ("success" if not has_error and len(matched_functions) > 0 else (
        "warning" if not has_error or len(matched_functions) > 0 else "error"))
    shutil.rmtree(repo_dir)

    end_time = time.time()
    funcs_without_asts = sum(matched_func.decompiled_ast_json is None
                             for matched_func in matched_functions)
    flutes.log(
        f"[{end_time - start_time:6.2f}s] "
        f"{repo_full_name}: "
        f"Files found: {files_found}/{total_files}, "
        f"functions matched: {len(matched_functions)}/{functions_found} "
        f"({funcs_without_asts} w/o ASTs)",
        status,
        force_console=True)
    return Result(repo_owner=repo_info.repo_owner,
                  repo_name=repo_info.repo_name,
                  matched_functions=matched_functions,
                  preprocessed_original_code=preprocessed_original_code,
                  files_found=files_found,
                  functions_found=functions_found,
                  funcs_without_asts=funcs_without_asts)
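The identifier-prefix search used above, isolated as a standalone sketch (`find_unused_prefix` is a hypothetical name); it keeps trying longer random prefixes and uses `for`/`else` to detect the practically impossible case where every candidate collides.

import random
import string
from typing import Optional


def find_unused_prefix(code: str, max_length: int = 10) -> Optional[str]:
    for length in range(3, max_length + 1):
        prefix = "v" + "".join(
            random.choices(string.ascii_lowercase, k=length))
        if prefix not in code:
            break
    else:
        return None  # every candidate collided
    return prefix


prefix = find_unused_prefix("int main(void) { return 0; }")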
Example #15
def clone_and_compile(
        repo_info: RepoInfo,
        clone_folder: str,
        binary_folder: str,
        archive_folder: str,
        recursive_clone: bool = True,
        clone_timeout: Optional[float] = None,
        compile_timeout: Optional[float] = None,
        force_reclone: bool = False,
        force_recompile: bool = False,
        docker_batch_compile: bool = True,
        max_archive_size: Optional[int] = None,
        compression_type: str = "gzip",
        record_libraries: bool = False,
        record_metainfo: bool = False,
        gcc_override_flags: Optional[str] = None) -> PipelineResult:
    r"""Perform the entire pipeline.

    :param repo_info: Information about the repository.
    :param clone_folder: Path to the folder where the repository will be stored. The actual destination folder will be
        ``clone_folder/repo_owner_____repo_name``, e.g., ``clone_folder/torvalds_____linux``.
        This strange notation is used in order to have a flat directory hierarchy, so we're not left with a bunch of
        empty folders for repository owners.
    :param binary_folder: Path to the folder where compiled binaries will be stored. The actual destination folder will
        be ``binary_folder/repo_owner/repo_name``, e.g., ``binary_folder/torvalds/linux``.
    :param archive_folder: Path to the folder where archived repositories will be stored. The actual archive file will
        be ``archive_folder/repo_owner/repo_name.tar.xz``, e.g., ``archive_folder/torvalds/linux.tar.xz``.

    :param recursive_clone: If ``True``, uses ``--recursive`` when cloning.
    :param clone_timeout: Timeout for cloning, or `None` (default) for unlimited time.
    :param compile_timeout: Timeout for compilation, or `None` (default) for unlimited time.
    :param force_reclone: If ``True``, always clone a fresh copy for compilation. If ``False``, only clone when there
        are no matching archives.
    :param force_recompile: If ``True``, the repository is compiled regardless of the value in DB.
    :param docker_batch_compile: If ``True``, compile all Makefiles within a repository in a single Docker container.
    :param max_archive_size: If specified, only archive repositories whose size is not larger than the given
        value (in bytes).
    :param compression_type: The file type of the archive to produce. Valid values are ``"gzip"`` (faster) and
        ``"xz"`` (smaller).
    :param record_libraries: If ``True``, record the libraries used in compilation.
    :param record_metainfo: If ``True``, record meta-info values.
    :param gcc_override_flags: If not ``None``, these flags will be appended to each invocation of GCC.

    :return: A ``PipelineResult`` entry to insert into the DB; a dummy result is returned when no operations are required.
    """
    repo_full_name = f"{repo_info.repo_owner}/{repo_info.repo_name}"
    repo_folder_name = f"{repo_info.repo_owner}_____{repo_info.repo_name}"
    repo_path = os.path.join(clone_folder, repo_folder_name)
    if compression_type == "xz":
        archive_extension = ".tar.xz"
        tar_type_flag = "J"
    elif compression_type == "gzip":
        archive_extension = ".tar.gz"
        tar_type_flag = "z"
    else:
        raise ValueError(f"Invalid compression type '{compression_type}'")
    archive_path = os.path.abspath(
        os.path.join(archive_folder, f"{repo_full_name}{archive_extension}"))

    repo_entry = repo_info.db_result
    clone_success = None

    # Skip repos that are fully processed
    if (repo_entry is not None
            and (repo_entry["clone_successful"] and not force_reclone)
            and (repo_entry["compiled"] and not force_recompile)):
        return PipelineResult(repo_info)

    # Stage 1: Cloning from GitHub.
    if not force_reclone and os.path.exists(archive_path):
        # Extract the archive instead of cloning.
        try:
            flutes.run_command(["tar", f"x{tar_type_flag}f", archive_path],
                               timeout=clone_timeout,
                               cwd=clone_folder)
            flutes.log(f"{repo_full_name} extracted from archive", "success")
        except (subprocess.TimeoutExpired, subprocess.CalledProcessError) as e:
            flutes.log(
                f"Unknown error when extracting {repo_full_name}. Captured output: '{e.output}'",
                "error")
            shutil.rmtree(repo_path)
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)
    elif (repo_entry is None or  # not processed
          force_reclone or (repo_entry["clone_successful"] and  # not compiled
                            (not repo_entry["compiled"] or force_recompile) and
                            not os.path.exists(repo_path))):
        clone_result = ghcc.clone(repo_info.repo_owner,
                                  repo_info.repo_name,
                                  clone_folder=clone_folder,
                                  folder_name=repo_folder_name,
                                  timeout=clone_timeout,
                                  skip_if_exists=False,
                                  recursive=recursive_clone)
        clone_success = clone_result.success
        if not clone_result.success:
            if clone_result.error_type is CloneErrorType.FolderExists:
                flutes.log(f"{repo_full_name} skipped because folder exists",
                           "warning")
            elif clone_result.error_type is CloneErrorType.PrivateOrNonexistent:
                flutes.log(
                    f"Failed to clone {repo_full_name} because repository is private or nonexistent",
                    "warning")
            else:
                if clone_result.error_type is CloneErrorType.Unknown:
                    msg = f"Failed to clone {repo_full_name} with unknown error"
                else:  # CloneErrorType.Timeout
                    msg = f"Time expired ({clone_timeout}s) when attempting to clone {repo_full_name}"
                if clone_result.captured_output is not None:
                    msg += f". Captured output: '{clone_result.captured_output!r}'"
                flutes.log(msg, "error")

                if clone_result.error_type is CloneErrorType.Unknown:
                    return PipelineResult(repo_info)  # return dummy info

            return PipelineResult(repo_info, clone_success=clone_success)

        elif clone_result.error_type is CloneErrorType.SubmodulesFailed:
            msg = f"Submodules in {repo_full_name} ignored due to error"
            if clone_result.captured_output is not None:
                msg += f". Captured output: '{clone_result.captured_output!r}'"
            flutes.log(msg, "warning")

        repo_size = flutes.get_folder_size(repo_path)
        flutes.log(
            f"{repo_full_name} successfully cloned ({clone_result.time:.2f}s, "
            f"{flutes.readable_size(repo_size)})", "success")
    else:
        if not repo_entry["clone_successful"]:
            return PipelineResult(repo_info)  # return dummy info
        repo_size = flutes.get_folder_size(repo_path)

    makefiles = None
    libraries = None
    meta_info: Optional[PipelineMetaInfo] = None
    if not repo_entry or not repo_entry["compiled"] or force_recompile:
        # # SPECIAL CHECK: Do not attempt to compile OS kernels!
        # kernel_name = None
        # if contains_in_file(os.path.join(repo_path, "README"), "Linux kernel release"):
        #     kernel_name = "Linux"
        # elif contains_in_file(os.path.join(repo_path, "README"), "FreeBSD source directory"):
        #     kernel_name = "FreeBSD"
        # if kernel_name is not None:
        #     shutil.rmtree(repo_path)
        #     ghcc.log(f"Found {kernel_name} kernel in {repo_full_name}, will not attempt to compile. "
        #              f"Repository deleted", "warning")
        #     return PipelineResult(repo_info, clone_success=clone_success, makefiles=[])

        # Stage 2: Finding Makefiles.
        makefile_dirs = ghcc.find_makefiles(repo_path)
        if len(makefile_dirs) == 0:
            # Repo has no Makefiles, delete.
            shutil.rmtree(repo_path)
            flutes.log(
                f"No Makefiles found in {repo_full_name}, repository deleted",
                "warning")
            return PipelineResult(repo_info,
                                  clone_success=clone_success,
                                  makefiles=[])

        # Stage 3: Compile each Makefile.
        repo_binary_dir = os.path.join(binary_folder, repo_full_name)
        if not os.path.exists(repo_binary_dir):
            os.makedirs(repo_binary_dir)
        flutes.log(f"Starting compilation for {repo_full_name}...")

        if docker_batch_compile:
            makefiles = ghcc.docker_batch_compile(
                repo_binary_dir,
                repo_path,
                compile_timeout,
                record_libraries,
                gcc_override_flags,
                user_id=(repo_info.idx % 10000) +
                30000,  # user IDs 30000 ~ 39999
                exception_log_fn=functools.partial(exception_handler,
                                                   repo_info=repo_info))
        else:
            makefiles = list(
                ghcc.compile_and_move(repo_binary_dir, repo_path,
                                      makefile_dirs, compile_timeout,
                                      record_libraries, gcc_override_flags))
        num_succeeded = sum(makefile["success"] for makefile in makefiles)
        if record_libraries:
            library_log_path = os.path.join(repo_binary_dir, "libraries.txt")
            if os.path.exists(library_log_path):
                with open(library_log_path) as f:
                    libraries = list(set(f.read().split()))
            else:
                libraries = []
        num_binaries = sum(len(makefile["binaries"]) for makefile in makefiles)

        msg = f"{num_succeeded} ({len(makefiles)}) out of {len(makefile_dirs)} Makefile(s) " \
              f"in {repo_full_name} compiled (partially), yielding {num_binaries} binaries"
        flutes.log(
            msg,
            "success" if num_succeeded == len(makefile_dirs) else "warning")

        if record_metainfo:
            meta_info = PipelineMetaInfo({
                "num_makefiles": len(makefile_dirs),
                "has_gitmodules": os.path.exists(
                    os.path.join(repo_path, ".gitmodules")),
                "makefiles_using_automake": sum(
                    ghcc.contains_files(directory,
                                        ["configure.ac", "configure.in"])
                    for directory in makefile_dirs),
            })

        # Stage 4: Clean and zip repo.
        if max_archive_size is not None and repo_size > max_archive_size:
            shutil.rmtree(repo_path)
            flutes.log(
                f"Removed {repo_full_name} because repository size ({flutes.readable_size(repo_size)}) "
                f"exceeds limits", "info")
        else:
            # Repository is already cleaned in the compile stage.
            os.makedirs(os.path.split(archive_path)[0], exist_ok=True)
            compress_success = False
            try:
                flutes.run_command([
                    "tar", f"c{tar_type_flag}f", archive_path, repo_folder_name
                ],
                                   timeout=clone_timeout,
                                   cwd=clone_folder)
                compress_success = True
            except subprocess.TimeoutExpired:
                flutes.log(
                    f"Compression timeout for {repo_full_name}, giving up",
                    "error")
            except subprocess.CalledProcessError as e:
                flutes.log(
                    f"Unknown error when compressing {repo_full_name}. Captured output: '{e.output}'",
                    "error")
            shutil.rmtree(repo_path)
            if compress_success:
                flutes.log(f"Compressed {repo_full_name}, folder removed",
                           "info")
            elif os.path.exists(archive_path):
                os.remove(archive_path)

    return PipelineResult(repo_info,
                          clone_success=clone_success,
                          repo_size=repo_size,
                          makefiles=makefiles,
                          libraries=libraries,
                          meta_info=meta_info)
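The compression-type dispatch above, condensed into a tiny self-contained helper (`archive_settings` is a hypothetical name) that returns the archive extension and the tar type flag.

from typing import Tuple


def archive_settings(compression_type: str) -> Tuple[str, str]:
    if compression_type == "xz":
        return ".tar.xz", "J"  # smaller archives
    if compression_type == "gzip":
        return ".tar.gz", "z"  # faster compression
    raise ValueError(f"Invalid compression type '{compression_type}'")


assert archive_settings("gzip") == (".tar.gz", "z")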
Example #16
def main() -> None:
    if not ghcc.utils.verify_docker_image(verbose=True):
        exit(1)

    args = Arguments()
    if args.n_procs == 0:
        # Only do this in the single-threaded case.
        flutes.register_ipython_excepthook()
    flutes.set_log_file(args.log_file)
    flutes.set_logging_level(args.logging_level, console=True, file=False)
    flutes.log("Running with arguments:\n" + args.to_string(),
               force_console=True)

    if os.path.exists(args.clone_folder):
        flutes.log(
            f"Removing contents of clone folder '{args.clone_folder}'...",
            "warning",
            force_console=True)
        ghcc.utils.run_docker_command(
            ["rm", "-rf", "/usr/src/*"],
            user=0,
            directory_mapping={args.clone_folder: "/usr/src"})

    flutes.log("Crawling starts...", "warning", force_console=True)
    db = ghcc.RepoDB()
    libraries: Set[str] = set()
    if args.record_libraries is not None and os.path.exists(
            args.record_libraries):
        with open(args.record_libraries, "r") as f:
            libraries = set(f.read().split())

    def flush_libraries():
        if args.record_libraries is not None:
            with open(args.record_libraries, "w") as f:
                f.write("\n".join(libraries))

    with flutes.safe_pool(args.n_procs, closing=[db, flush_libraries]) as pool:
        iterator = iter_repos(db, args.repo_list_file, args.max_repos)
        pipeline_fn: Callable[
            [RepoInfo], Optional[PipelineResult]] = functools.partial(
                clone_and_compile,
                clone_folder=args.clone_folder,
                binary_folder=args.binary_folder,
                archive_folder=args.archive_folder,
                recursive_clone=args.recursive_clone,
                clone_timeout=args.clone_timeout,
                compile_timeout=args.compile_timeout,
                force_reclone=args.force_reclone,
                force_recompile=args.force_recompile,
                docker_batch_compile=args.docker_batch_compile,
                max_archive_size=args.max_archive_size,
                compression_type=args.compression_type,
                record_libraries=(args.record_libraries is not None),
                record_metainfo=args.record_metainfo,
                gcc_override_flags=args.gcc_override_flags)
        repo_count = 0
        meta_info = MetaInfo()
        for result in pool.imap_unordered(pipeline_fn, iterator):
            repo_count += 1
            if repo_count % 100 == 0:
                flutes.log(f"Processed {repo_count} repositories",
                           force_console=True)
            if result is None:
                continue
            repo_owner, repo_name = result.repo_info.repo_owner, result.repo_info.repo_name
            if args.write_db:
                if result.clone_success is not None or result.repo_info.db_result is None:
                    # There's probably an inconsistency somewhere if we didn't clone while `db_result` is None.
                    # To prevent more errors, just add it to the DB.
                    repo_size = result.repo_size or -1  # a value of zero is probably also wrong
                    clone_success = result.clone_success if result.clone_success is not None else True
                    db.add_repo(repo_owner,
                                repo_name,
                                clone_success,
                                repo_size=repo_size)
                    flutes.log(f"Added {repo_owner}/{repo_name} to DB")
                if result.makefiles is not None:
                    update_result = db.update_makefile(
                        repo_owner,
                        repo_name,
                        result.makefiles,
                        ignore_length_mismatch=True)
                    if not update_result:
                        flutes.log(
                            f"Makefiles of {repo_owner}/{repo_name} not saved to DB due to Unicode encoding "
                            f"errors", "error")
            if result.libraries is not None:
                libraries.update(result.libraries)
                if repo_count % 10 == 0:  # flush every 10 repos
                    flush_libraries()

            if args.record_metainfo:
                meta_info.add_repo(result)
                if repo_count % 100 == 0:
                    flutes.log(repr(meta_info), force_console=True)

        flutes.log(repr(meta_info), force_console=True)
Example #17
import argparse
import os
import subprocess

import flutes

import ghcc

parser = argparse.ArgumentParser()
parser.add_argument("folder", type=str)  # the folder to clean up
parser.add_argument("-y", action="store_true", default=False)  # yes
args = parser.parse_args()

try:
    parent = os.path.abspath(os.path.join(args.folder, ".."))
    folder = os.path.split(os.path.abspath(args.folder))[1]
    yes = args.y
    if not yes:
        confirm = input(
            f"This will delete {parent} / {folder}. Confirm? [y/N] ")
        yes = confirm.lower() in ["y", "yes"]
    if yes:
        ghcc.utils.run_docker_command(["rm", "-rf", f"/usr/src/{folder}"],
                                      user=0,
                                      directory_mapping={parent: "/usr/src"})
except subprocess.CalledProcessError as e:
    flutes.log(f"Command failed with retcode {e.returncode}", "error")
    output = e.output.decode("utf-8")
    if len(output) > 200:
        output = output[:200] + "... (omitted)"
    flutes.log("Captured output: " + output)
Example #18
def decompile(binary_info: BinaryInfo,
              output_dir: str,
              binary_dir: str,
              timeout: Optional[int] = None) -> DecompilationResult:
    binary_path = binary_info["path"]
    original_path = binary_info["path_in_repo"]
    binary_hash = os.path.split(binary_path)[1]

    def create_result(
            status: DecompilationStatus,
            time: Optional[datetime.timedelta] = None) -> DecompilationResult:
        return DecompilationResult(binary_info, binary_hash, status, time)

    output_path = os.path.join(output_dir, f"{binary_hash}.jsonl")
    if os.path.exists(output_path):
        # Binary already decompiled, but for some reason it wasn't written to the DB.
        return create_result(DecompilationStatus.Success)

    start = datetime.datetime.now()
    env: EnvDict = os.environ.copy()
    env['IDALOG'] = '/dev/stdout'
    env['PREFIX'] = binary_hash
    file_path = os.path.join(binary_dir, binary_path)

    # Create a temporary directory, since the decompiler makes a lot of additional
    # files that we can't clean up from here.
    with tempfile.TemporaryDirectory() as tempdir:
        # Put the output JSONL file here as well to prevent partially-generated files.
        env['OUTPUT_DIR'] = os.path.abspath(tempdir)
        with tempfile.NamedTemporaryFile(dir=tempdir) as collected_vars:
            # First collect variables.
            env['COLLECTED_VARS'] = collected_vars.name
            with tempfile.NamedTemporaryFile(dir=tempdir) as orig:
                flutes.run_command(['cp', file_path, orig.name])
                # Time out the first (variable-collection) run if it takes too long.
                try:
                    run_decompiler(orig.name,
                                   COLLECT,
                                   env=env,
                                   timeout=timeout)
                except subprocess.TimeoutExpired:
                    flutes.log(f"[TIMED OUT] {original_path} ({binary_path})",
                               "warning")
                    return create_result(DecompilationStatus.TimedOut)
                try:
                    assert pickle.load(collected_vars)  # non-empty
                except:
                    flutes.log(f"[NO VARS] {original_path} ({binary_path})",
                               "warning")
                    return create_result(DecompilationStatus.NoVariables)
            # Make a new stripped copy and pass it the collected vars.
            with tempfile.NamedTemporaryFile(dir=tempdir) as stripped:
                flutes.run_command(['cp', file_path, stripped.name])
                flutes.run_command(['strip', '--strip-debug', stripped.name])
                # Dump the trees.
                # No timeout here, we know it'll run in a reasonable amount of
                # time and don't want mismatched files.
                run_decompiler(stripped.name, DUMP_TREES, env=env)
        jsonl_path = os.path.join(tempdir, f"{binary_hash}.jsonl")
        flutes.run_command(['cp', jsonl_path, output_path])
    end = datetime.datetime.now()
    duration = end - start
    flutes.log(
        f"[OK {duration.total_seconds():5.2f}s] {original_path} ({binary_path})",
        "success")
    return create_result(DecompilationStatus.Success, duration)
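`run_decompiler` is not shown above; here is a plausible sketch of its shape, assuming it launches an IDA batch run with the given script, environment, and timeout (the `idat64` binary name and flag layout are assumptions, not the project's actual code).

import subprocess
from typing import Dict, Optional


def run_decompiler(file_name: str, script: str,
                   env: Optional[Dict[str, str]] = None,
                   timeout: Optional[int] = None) -> bytes:
    # -B runs IDA in batch mode; -S passes the decompilation script to run.
    idacall = ["idat64", "-B", f"-S{script}", file_name]
    return subprocess.check_output(idacall, env=env, timeout=timeout)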
Example #19
def main(args) -> None:

    pids_with_text = set()
    if args.valid_pids and args.valid_pids.exists():
        with args.valid_pids.open("rb") as f:
            data = pickle.load(f)
            buf = set()
            if args.mode == "citation":
                for k, d in tqdm(enumerate(data.values()),
                                 ncols=88,
                                 ascii=True):
                    buf = buf.union(
                        set([pid for i in d for pid in i[1] + i[2]]))
                    if k % 500 == 0:
                        pids_with_text = pids_with_text.union(buf)
                        buf = set()
            elif args.mode == "paper":
                for k, d in tqdm(data.items(), ncols=88, ascii=True):
                    buf = buf.union(set([i[0] for i in d]))
                    if k % 500 == 0:
                        pids_with_text = pids_with_text.union(buf)
                        buf = set()

            # remaining one
            pids_with_text = pids_with_text.union(buf)

    flutes.log(f"# of valid pids to consider: {len(pids_with_text)}")

    if args.legacy:
        # glob takes more time than this?
        files = ((args.input_dir / f"{i}.jsonl.gz", pids_with_text)
                 for i in range(10000))
        Proc = LegacyFilter
    else:
        files = ((args.input_dir / f"pdf_parses_{i}.jsonl.gz", pids_with_text)
                 for i in range(100))
        Proc = Filter

    with flutes.work_in_progress("Parallel"):
        total_map = {}
        with flutes.safe_pool(processes=args.njobs,
                              state_class=Proc) as pool_stateful:
            for idx, _ in enumerate(
                    pool_stateful.imap_unordered(Proc.make_map,
                                                 files,
                                                 chunksize=10)):
                flutes.log(f"Processed {(idx + 1)} files")

            with flutes.work_in_progress("Get states"):
                states = pool_stateful.get_states()
            for state in states:
                # TODO: Incorporate incite number
                total_map.update(state.results)

        flutes.log(f"Total map size: {len(total_map)}")

    with args.output.open("w") as f:
        for k, v in total_map.items():
            print(k, v[0], v[1], sep="\t", file=f)

        flutes.log(f"Dumped to {args.output}")