def load_func_features(input_list, options, features):
    grouped_bins, packages = group_binaries(input_list)
    func_features_list = do_multiprocess(
        load_func_features_helper,
        grouped_bins.values(),
        chunk_size=1,
        threshold=1,
        initializer=_init_load,
        initargs=(options, features),
    )
    funcs = {}
    for func_features in func_features_list:
        funcs.update(func_features)
    return funcs

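# do_multiprocess comes from tiknib.utils and is not shown in this excerpt.
# The call sites in these snippets suggest a thin wrapper around
# multiprocessing.Pool that accepts chunk_size, a threshold, a pool size, and
# per-worker initializer/initargs. The sketch below is a guess under those
# assumptions, not tiknib's actual implementation; the threshold semantics in
# particular are assumed.
import multiprocessing

def do_multiprocess_sketch(func, args, chunk_size=1, threshold=1,
                           pool_size=None, initializer=None, initargs=()):
    args = list(args)
    if len(args) <= threshold:
        # Small inputs: skip process startup cost and run in-process.
        if initializer is not None:
            initializer(*initargs)
        return [func(arg) for arg in args]
    with multiprocessing.Pool(pool_size or multiprocessing.cpu_count(),
                              initializer=initializer,
                              initargs=initargs) as pool:
        # chunksize=1 load-balances best when per-task times vary widely,
        # which is why the callers here pin chunk_size to 1.
        return pool.map(func, args, chunksize=chunk_size)
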
def run(self, input_path):
    elfs = self.get_elf_files(input_path)
    logger.info("[+] start extracting {0} files ...".format(len(elfs)))
    t0 = time.time()
    if self.debug:
        # We only fetch the first ELF for debugging.
        elfs = [elfs[0]]
    # IDA's per-binary processing time varies significantly, so a chunk
    # size of 1 gives better load balancing.
    res = do_multiprocess(self.run_helper, elfs, chunk_size=1, threshold=1)
    logger.info("done in: (%0.3fs)" % (time.time() - t0))
    return res

def calc_metric(funcs, funcs_strs, dst_options):
    # Select features by finding a local optimum via hill climbing.
    metric_results = do_multiprocess(
        calc_metric_helper,
        funcs.keys(),
        chunk_size=1,
        threshold=1,
        initializer=_init_calc,
        initargs=(funcs, funcs_strs, dst_options),
    )
    func_keys, tp_results, tn_results, target_opts = zip(*metric_results)
    # Merge the per-function results into single numpy arrays.
    tp_results = np.vstack([x for x in tp_results if len(x)])
    tn_results = np.vstack([x for x in tn_results if len(x)])
    assert len(tp_results) == len(tn_results)
    return func_keys, tp_results, tn_results, target_opts

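# For reference, the merge step above relies on each helper returning 2-D
# arrays of per-feature results, which zip(*...) transposes into columns
# before stacking. A small self-contained illustration of the same pattern
# (all values below are made up):
import numpy as np

example_results = [
    ("func_a", np.ones((2, 3)), np.zeros((2, 3)), "O2"),
    ("func_b", np.ones((1, 3)), np.zeros((1, 3)), "O3"),
]
keys, tps, tns, opts_example = zip(*example_results)
tp = np.vstack([x for x in tps if len(x)])  # shape (3, 3): rows stacked
tn = np.vstack([x for x in tns if len(x)])  # shape (3, 3)
assert len(tp) == len(tn)
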
def calc_metric(funcs, options, target_key, option_idx, feature_indices):
    # Select features by finding a local optimum via hill climbing.
    metric_results = do_multiprocess(
        calc_metric_helper,
        funcs.keys(),
        chunk_size=1,
        threshold=1,
        initializer=_init_calc,
        initargs=(funcs, options, target_key, option_idx, feature_indices),
    )
    func_keys, results_arch, results = zip(*metric_results)
    scores_arch = {}
    scores = {}
    for idx, func_key in enumerate(func_keys):
        scores_arch[func_key] = results_arch[idx]
        scores[func_key] = results[idx]
    return scores_arch, scores

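# The initializer/initargs pair is how the large, read-only funcs dictionary
# reaches each worker process once, instead of being pickled for every task.
# A minimal, self-contained sketch of that pattern (the names below are
# illustrative, not tiknib's helpers):
import multiprocessing

g_funcs = None  # set once per worker by the initializer

def _init_calc_sketch(funcs):
    global g_funcs
    g_funcs = funcs

def calc_helper_sketch(func_key):
    # Workers read the shared input from the global set by the initializer.
    return func_key, len(g_funcs[func_key])

if __name__ == "__main__":
    funcs = {"a": [1, 2], "b": [3]}
    with multiprocessing.Pool(2, initializer=_init_calc_sketch,
                              initargs=(funcs,)) as pool:
        print(pool.map(calc_helper_sketch, list(funcs.keys())))
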
def load_func_features(input_list, options, features, str_features):
    grouped_bins, packages = group_binaries(input_list, options)
    func_features_list = do_multiprocess(
        load_func_features_helper,
        grouped_bins.values(),
        chunk_size=1,
        threshold=1,
        initializer=_init_load,
        initargs=(options, features, str_features),
    )
    funcs = {}
    funcs_strs = {}
    duplicate_cnt = 0
    for func_features, func_types, dup_cnt in func_features_list:
        funcs.update(func_features)
        funcs_strs.update(func_types)
        duplicate_cnt += dup_cnt
    num_funcs = sum(len(x) for x in funcs.values())
    logger.info("%d functions loaded.", num_funcs)
    logger.info("%d compiler-generated duplicates.", duplicate_cnt)
    return funcs, funcs_strs

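# group_binaries and parse_fname are not shown in this excerpt. Judging from
# the five-way unpacking used throughout these scripts (package, compiler,
# arch, opti, bin_name), the file path encodes the compile options, and
# grouping collects the same program built under different options. A rough
# sketch under that assumption; the actual path layout is a guess:
import os
from collections import defaultdict

def parse_fname_sketch(bin_path):
    # Assumed layout: .../package/compiler/arch/opti/bin_name
    package, compiler, arch, opti, bin_name = bin_path.split(os.sep)[-5:]
    return package, compiler, arch, opti, bin_name

def group_binaries_sketch(bin_paths):
    # Group the same binary across compile options for cross-option matching.
    groups = defaultdict(list)
    for bin_path in bin_paths:
        package, _, _, _, bin_name = parse_fname_sketch(bin_path)
        groups[(package, bin_name)].append(bin_path)
    return groups
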
def get_rank(func_keys, scores, options, target_key, interested_keys,
             target_option):
    src_options = [op for op in options if op != target_option]
    metric_results = do_multiprocess(
        get_rank_helper,
        src_options,
        chunk_size=1,
        threshold=1,
        initializer=_init_rank,
        initargs=(func_keys, scores, options, target_key, interested_keys),
    )
    src_option, total_rank, total_funcs, total_other_ranks = zip(
        *metric_results)
    ranks = {}
    func_counts = {}
    other_ranks = {}
    for idx, option in enumerate(src_option):
        if option is None:
            continue
        ranks[option] = total_rank[idx]
        func_counts[option] = total_funcs[idx]
        other_ranks[option] = total_other_ranks[idx]
    return [ranks, func_counts, other_ranks]

    default=1,
    help="number of binaries to handle in each process",
)
op.add_option(
    "--pool_size",
    type="int",
    action="store",
    dest="pool_size",
    default=multiprocessing.cpu_count(),
    help="number of processes",
)
op.add_option("--debug", action="store_true", dest="debug")
(opts, args) = op.parse_args()
assert opts.input_list and os.path.isfile(opts.input_list)

# Add features to the functions in each binary.
with open(opts.input_list, "r") as f:
    bins = f.read().splitlines()
if opts.debug:
    bins = [bins[0]]

t0 = time.time()
logger.info("Processing %d binaries ...", len(bins))
do_multiprocess(
    extract_features,
    bins,
    chunk_size=opts.chunk_size,
    pool_size=opts.pool_size,
    threshold=opts.threshold,
)
logger.info("done. (%0.3fs)", (time.time() - t0))

for ctags_fname in glob.glob(
        os.path.join(opts.ctags_dir, "[!include]*.tags")):
    # NOTE: "[!include]" is a glob character class (first character not in
    # {i, n, c, l, u, d, e}), not a literal exclusion of "include*.tags".
    update_type_map(type_map, ctags_fname)
logger.info("done ... %0.3fs", time.time() - t0)
store_cache(type_map, fname="ctags_cache", cache_dir=".tiknib_cache")

# Add abstracted type data to the functions in each binary.
with open(opts.input_list, "r") as f:
    bins = f.read().splitlines()

t0 = time.time()
logger.info("Processing %d binaries ...", len(bins))
bins = list(map(lambda x: (type_map, x), bins))
do_multiprocess(extract_func_types, bins,
                chunk_size=opts.chunk_size,
                threshold=opts.threshold)
logger.info("done. (%0.3fs)", (time.time() - t0))

# The code below is not used for now.
# t0 = time.time()
# func_cnt = 0
# for i in range(0, len(bins), opts.chunk_size):
#     logger.info("Processing %d/%d binaries ...", i, len(bins))
#     args = do_multiprocess(load_func_data,
#                            bins[i:i+opts.chunk_size],
#                            chunk_size=1)
#     # Do not share the large type_map dictionary across processes;
#     # process it sequentially instead.
#     args = make_functype_abstract(type_map, args)
#     args = list(filter(lambda x: x and x[1], args))

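# update_type_map is likewise not shown. Assuming the *.tags files are in
# the default ctags format (tab-separated: name, file, ex-command, extension
# fields), a minimal parser might look like the sketch below; which fields
# the real implementation keeps is an assumption:
def update_type_map_sketch(type_map, ctags_fname):
    with open(ctags_fname, "r", errors="ignore") as f:
        for line in f:
            if line.startswith("!_TAG_"):
                continue  # skip the ctags metadata header
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 2:
                continue
            name, src_file = fields[0], fields[1]
            # Map each tag name to the source files that define it.
            type_map.setdefault(name, set()).add(src_file)
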
    bins = f.read().splitlines()

pack_bins = {}
for bin_path in bins:
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    if package not in pack_bins:
        pack_bins[package] = []
    pack_bins[package].append(bin_path)

result = {}
logger.info("Processing %d binaries ...", len(bins))
t0 = time.time()
for package, bin_list in pack_bins.items():
    logger.info("Processing %d binaries in %s ...", len(bin_list), package)
    numbers = do_multiprocess(
        filter_funcs, bin_list,
        chunk_size=opts.chunk_size,
        threshold=opts.threshold
    )
    numbers.sort()

    # Build an oracle to pick functions uniquely.
    oracle = {}
    done = {}
    for data in numbers:
        pack_name, bin_path, num_funcs, names, sources = data
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        if pack_name not in oracle:
            oracle[pack_name] = {}
            done[pack_name] = set()
        if bin_name not in oracle[pack_name]:
            oracle[pack_name][bin_name] = {}
        # sources = (source file, source line)

# bins = list(filter(lambda x: "_find" in x, bins))

# Modify this function to filter out specific options.
def filter_bins(bin_path):
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    if compiler not in ["clang-7.0", "gcc-8.2.0"]:
        return False
    return True

bins = list(filter(filter_bins, bins))

result = {}
logger.info("Processing %d binaries ...", len(bins))
t0 = time.time()
numbers = do_multiprocess(count_funcs, bins,
                          chunk_size=opts.chunk_size,
                          threshold=opts.threshold)
logger.info("done. (%0.3fs)", (time.time() - t0))

filtered_num_funcs = {}
filtered_num_bbs = {}
for data in numbers:
    bin_path, num_funcs, num_bbs = data
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    # Skip 64-bit and big-endian binaries.
    if arch.endswith("_64"):
        continue
    if "eb" in arch:
        continue
    # compiler = compiler.split("-")[0]
    key = (opti, arch, compiler)
    if key not in filtered_num_funcs:
dest="chunk_size", default=1, help="number of binaries to process in each process", ) op.add_option("--force", action="store_true", dest="force") (opts, args) = op.parse_args() assert opts.input_list with open(opts.input_list, "r") as f: bins = f.read().splitlines() t0 = time.time() logger.info("Processing %d binaries ...", len(bins)) failed_bins = do_multiprocess(extract_func_lineno, bins, chunk_size=opts.chunk_size, threshold=opts.threshold) logger.info("done. (%0.3fs)", (time.time() - t0)) failed_bins = list(filter(lambda x: x is not None, failed_bins)) if failed_bins: print("{} bins failed.".format(len(failed_bins))) with open("failed_bins.txt", "w") as f: for b in failed_bins: f.write(b + "\n") from tiknib.idascript import IDAScript idascript = IDAScript( idapath=IDA_PATH, idc=IDA_FETCH_FUNCDATA,