def load_func_features_helper(bin_paths):
    # returns {function_key: {option_idx: np.array(feature_values)}}
    global g_options, g_features
    func_features = {}
    num_features = len(g_features)
    optionidx_map = get_optionidx_map(g_options)
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path)
        for func_data in func_data_list:
            # Use only .text functions for testing
            if func_data["seg_name"] != ".text":
                continue
            if func_data["name"].startswith("sub_"):
                continue
            func_key = (package, bin_name, func_data["name"])
            option_key = (opti, arch, compiler, others)
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]
            if func_key not in func_features:
                func_features[func_key] = {}
            if option_key not in func_features[func_key]:
                func_features[func_key][option_idx] = np.zeros(
                    num_features, dtype=np.float64)
            for feature_idx, feature in enumerate(g_features):
                if feature not in func_data["feature"]:
                    continue
                val = func_data["feature"][feature]
                func_features[func_key][option_idx][feature_idx] = val
    return func_features

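# The helper above relies on get_optionidx_map() translating each
# (opti, arch, compiler, others) tuple in g_options into a stable integer
# index. The real helper lives elsewhere in this repo; the following is a
# minimal sketch of the assumed behavior, not the actual implementation.
def get_optionidx_map_sketch(options):
    # Sort first so the index assigned to each option tuple is
    # deterministic across runs and processes.
    return {option: idx for idx, option in enumerate(sorted(options))}
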
def load_func_features_helper(bin_paths):
    # TODO: handle suffix correctly.
    # returns {function_key: {option_idx: np.array(feature_values)}}
    global g_options, g_features, g_str_features
    func_features = {}
    func_str_features = {}
    num_features = len(g_features) + len(g_str_features)
    optionidx_map = get_optionidx_map(g_options)
    # Counts compiler-generated duplicates (.isra, .part, .cold).
    duplicate_cnt = 0
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path, suffix="filtered2")
        for func_data in func_data_list:
            # Use only .text functions for testing.
            # These are already filtered in filter_functions.py.
            if func_data["seg_name"] != ".text":
                continue
            if func_data["name"].startswith("sub_"):
                continue
            # func_key = (package, bin_name, func_data["name"])
            func_key = (package, bin_name, func_data["src_file"],
                        func_data["src_line"])
            option_key = (opti, arch, compiler, others)
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]
            if func_key not in func_features:
                func_features[func_key] = {}
                func_str_features[func_key] = {}
            # By checking option_key instead of option_idx below, we filter
            # duplicate functions and keep only the last one.
            # TODO: move this filtering to filter_functions.py
            if option_key not in func_features[func_key]:
                func_features[func_key][option_idx] = np.zeros(
                    num_features, dtype=np.float64)
                func_str_features[func_key][option_idx] = []
            else:
                duplicate_cnt += 1
            for feature_idx, feature in enumerate(g_features):
                if feature not in func_data["feature"]:
                    continue
                val = func_data["feature"][feature]
                func_features[func_key][option_idx][feature_idx] = val
            for feature_idx, str_feature in enumerate(g_str_features):
                if str_feature not in func_data:
                    continue
                val = func_data[str_feature]
                if "type" in str_feature:
                    if not isinstance(val, list):
                        val = [val]
                    val = normalize_type(val)
                val = list(enumerate(val))
                func_str_features[func_key][option_idx].append(val)
    return func_features, func_str_features, duplicate_cnt

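# A hypothetical consumer of load_func_features_helper() above: compare a
# function's numeric feature vectors across two compile options via an
# element-wise relative difference. The bin_paths value is illustrative,
# and this assumes g_options/g_features/g_str_features are initialized;
# a sketch only, not this repo's evaluation code.
import numpy as np

bin_paths = ["/path/to/bin_a", "/path/to/bin_b"]  # illustrative only
func_features, func_str_features, dup_cnt = load_func_features_helper(bin_paths)
for func_key, per_option in func_features.items():
    option_idxs = sorted(per_option)
    if len(option_idxs) < 2:
        continue
    a = per_option[option_idxs[0]]
    b = per_option[option_idxs[1]]
    # The epsilon avoids division by zero when a feature is 0 in both.
    rel_diff = np.abs(a - b) / (np.abs(a) + np.abs(b) + 1e-9)
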
def filter_funcs(bin_path):
    global g_oracle
    bin_path, func_data_list = load_func_data(bin_path)
    func_data_list = sorted(func_data_list, key=lambda x: x['name'])
    num_orig_funcs = len(func_data_list)
    pack_name = func_data_list[0]['package']

    # Filter functions by segment name (keep only functions in the code
    # segment).
    funcs = list(filter(lambda x: x['seg_name'] == '.text', func_data_list))
    num_code_funcs = len(funcs)
    funcs = list(filter(lambda x: 'src_path' in x and x['src_path'], funcs))
    num_src_funcs = len(funcs)

    # To identify functions inserted by compilers:
    # for func in funcs:
    #     if func['package'] not in func['src_file']:
    #         print(func['name'], func['src_file'], func['src_line'])

    # Filter functions by package name (remove functions inserted by
    # compilers).
    funcs = list(filter(lambda x: pack_name in x['src_path'], funcs))
    num_pack_funcs = len(funcs)
    if num_pack_funcs == 0:
        print("No functions: ", pack_name, bin_path, num_orig_funcs)
    funcs = list(filter(lambda x: not x['name'].startswith('sub_'), funcs))
    num_sub_funcs = len(funcs)
    names = set(map(lambda x: x['name'], funcs))
    sources = set(map(lambda x: (x['src_file'], x['src_line']), funcs))
    if g_oracle:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        funcs = list(filter(
            lambda x: x['src_file'] in g_oracle[pack_name][bin_name]
            and x['src_line'] in g_oracle[pack_name][bin_name][x['src_file']],
            funcs))

    # TODO: handle suffix correctly.
    store_func_data(bin_path, funcs, suffix="filtered")
    num_oracle_funcs = len(funcs)
    num_readelf_funcs = 0
    # if g_oracle:
    #     cmd = "readelf -s {} | grep FUNC | grep -v UND | wc -l".format(bin_path)
    #     cmd = "objdump --syms -j .text {} | grep \"F .text\" | ".format(bin_path)
    #     cmd += "cut -d \" \" -f 1 | sort | uniq | wc -l"
    #     num_readelf_funcs = int(system(cmd))
    num_funcs = (num_orig_funcs, num_code_funcs, num_src_funcs,
                 num_pack_funcs, num_sub_funcs, num_oracle_funcs,
                 num_readelf_funcs)
    return pack_name, bin_path, num_funcs, names, sources

def group_binaries(input_list):
    with open(input_list, "r") as f:
        bin_paths = f.read().splitlines()
    bins = {}
    packages = set()
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        key = (package, bin_name)
        if key not in bins:
            bins[key] = []
        bins[key].append(bin_path)
        packages.add(package)
    logger.info(
        "%d packages, %d unique binaries, total %d binaries",
        len(packages), len(bins), len(bin_paths),
    )
    return bins, packages

def group_binaries(input_list, options):
    with open(input_list, "r") as f:
        bin_paths = f.read().splitlines()
    bins = {}
    packages = set()
    check_options = set(options)
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        option_key = (opti, arch, compiler, others)
        # Filter unnecessary binaries to speed up testing.
        if option_key not in check_options:
            continue
        key = (package, bin_name)
        if key not in bins:
            bins[key] = []
        bins[key].append(bin_path)
        packages.add(package)
    logger.info("%d packages, %d unique binaries, total %d binaries",
                len(packages), len(bins), sum(map(len, bins.values())))
    return bins, packages

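# A minimal usage sketch for group_binaries() above. The option tuples
# follow the (opti, arch, compiler, others) order of option_key; the
# concrete values and the input-list file name are assumptions, not part
# of this repo's configuration.
target_options = [
    ("O0", "x86_64", "gcc-8.2.0", "normal"),
    ("O2", "arm_32", "clang-7.0", "normal"),
]
bins, packages = group_binaries("input_list.txt", target_options)
for (package, bin_name), bin_list in sorted(bins.items()):
    logger.info("%s/%s: %d binaries", package, bin_name, len(bin_list))
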
import idaapi
import idautils
import idc
import ida_nalt
from hashlib import sha1
from idc import get_func_name, get_segm_name, get_frame_size

# Helpers such as parse_fname, get_arch, demangle, get_type, get_strings,
# get_consts, get_call_graph, and get_bb_graph are defined elsewhere in
# this repo.


def main():
    # Get IDA default information
    bin_path = ida_nalt.get_input_file_path()
    with open(bin_path, "rb") as f:
        bin_hash = sha1(f.read()).hexdigest()
    img_base = idaapi.get_imagebase()
    info = idaapi.get_inf_structure()
    if info.is_64bit():
        bits = 64
    elif info.is_32bit():
        bits = 32
    else:
        bits = 16
    endian = "little"
    if info.is_be():
        endian = "big"
    arch = "_".join([info.procName, str(bits), endian])
    arch = get_arch(arch)

    # Parse option information from the file name
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    if "_noinline" in bin_path:
        other_option = "noinline"
    elif "_pie" in bin_path:
        other_option = "pie"
    elif "_lto" in bin_path:
        other_option = "lto"
    else:
        other_option = "normal"

    # Prepare default information for processing
    caller_map, callee_map = get_call_graph()
    edge_map, bb_callee_map = get_bb_graph(caller_map, callee_map)

    # Now extract function information
    func_data = []
    for idx, addr in enumerate(list(idautils.Functions())):
        function = idaapi.get_func(addr)
        if (not function or function.start_ea == idaapi.BADADDR
                or function.end_ea == idaapi.BADADDR):
            continue

        # IDA's default function information
        func_name = get_func_name(addr).strip()
        demangled_name, demangled_full_name = demangle(func_name)
        graph = idaapi.FlowChart(function, flags=idaapi.FC_PREDS)
        data = idc.get_bytes(addr, function.size()) or b""
        data_hash = sha1(data).hexdigest()
        stack_size = get_frame_size(addr)

        # Get imported callees. Note that the segment name is used because
        # idaapi.get_import_module_name() sometimes returns bad results.
        imported_callees = []
        if func_name in callee_map:
            imported_callees = list(
                filter(lambda x: get_segm_name(x[1]) != get_segm_name(addr),
                       callee_map[func_name]))

        # Get type information from IDA
        func_type, ret_type, args = get_type(addr)

        # Prepare basic block information for feature extraction
        func_strings = []
        func_consts = []
        bb_data = []
        for bb in graph:
            if bb.start_ea == idaapi.BADADDR or bb.end_ea == idaapi.BADADDR:
                continue
            bb_size = bb.end_ea - bb.start_ea
            block_data = idc.get_bytes(bb.start_ea, bb_size) or b""
            block_data_hash = sha1(block_data).hexdigest()
            bb_strings = get_strings(bb.start_ea, bb.end_ea)
            bb_consts = get_consts(bb.start_ea, bb.end_ea)
            bb_callees = list(
                filter(lambda x: x[0] == bb.id, bb_callee_map[func_name]))
            bb_data.append({
                "size": bb_size,
                "block_id": bb.id,
                "startEA": bb.start_ea,
                "endEA": bb.end_ea,
                "type": bb.type,
                "is_ret": idaapi.is_ret_block(bb.type),
                "hash": block_data_hash,
                "callees": bb_callees,
                "strings": bb_strings,
                "consts": bb_consts,
            })
            func_strings.extend(bb_strings)
            func_consts.extend(bb_consts)

        func_data.append({
            "ida_idx": idx,
            "seg_name": get_segm_name(addr),
            "name": func_name,
            "demangled_name": demangled_name,
            "demangled_full_name": demangled_full_name,
            "hash": data_hash,
            "size": function.size(),
            "startEA": function.start_ea,
            "endEA": function.end_ea,
            "cfg_size": graph.size,
            "img_base": img_base,
            "bin_path": bin_path,
            "bin_hash": bin_hash,
            "bin_offset": addr - img_base,
            "stack_size": stack_size,
            "package": package,
            "compiler": compiler,
            "arch": arch,
            "opti": opti,
            "others": other_option,
            "bin_name": bin_name,
            "func_type": func_type,
            "ret_type": ret_type,
            "args": args,
            "callers": caller_map[func_name],
            "callees": callee_map[func_name],
            "imported_callees": imported_callees,
            "cfg": edge_map[func_name],
            "strings": func_strings,
            "consts": func_consts,
            "bb_data": bb_data,
        })
    return func_data

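# A sketch of how the returned func_data might be persisted when this
# script runs inside IDA. The pickle format mirrors the convention implied
# by load_func_data()/store_func_data() elsewhere in this repo, but the
# ".pickle" suffix and this entry point are assumptions.
if __name__ == "__main__":
    import pickle

    func_data = main()
    out_path = ida_nalt.get_input_file_path() + ".pickle"  # assumed suffix
    with open(out_path, "wb") as f:
        pickle.dump(func_data, f)
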
action="store", dest="chunk_size", default=1, help="number of binaries to process in each process", ) op.add_option("--force", action="store_true", dest="force") (opts, args) = op.parse_args() assert opts.input_list with open(opts.input_list, "r") as f: bins = f.read().splitlines() pack_bins = {} for bin_path in bins: package, compiler, arch, opti, bin_name = parse_fname(bin_path) if package not in pack_bins: pack_bins[package] = [] pack_bins[package].append(bin_path) result = {} logger.info("Processing %d binaries ...", len(bins)) t0 = time.time() for package, bin_list in pack_bins.items(): logger.info("Processing %d binaries in %s ...", len(bin_list), package) numbers = do_multiprocess( filter_funcs, bin_list, chunk_size=opts.chunk_size, threshold=opts.threshold ) numbers.sort()
def filter_bins(bin_path):
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    if compiler not in ["clang-7.0", "gcc-8.2.0"]:
        return False
    return True
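
# Hypothetical use of filter_bins() as a predicate to prune an input list
# down to the two compilers above before heavier processing; the file name
# is illustrative only.
with open("input_list.txt", "r") as f:
    kept = [p for p in f.read().splitlines() if filter_bins(p)]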