예제 #1
0
def load_func_features_helper(bin_paths):
    """Load numeric features for every named .text function in *bin_paths*.

    Returns {function_key: {option_idx: np.array(feature_values)}} where
    function_key is (package, bin_name, func_name) and option_idx is the
    index of the binary's (opti, arch, compiler, others) tuple in g_options.
    """
    global g_options, g_features
    func_features = {}
    num_features = len(g_features)
    optionidx_map = get_optionidx_map(g_options)
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path)
        for func_data in func_data_list:
            # Use only .text functions for testing
            if func_data["seg_name"] != ".text":
                continue
            # Skip unnamed (IDA auto-generated "sub_...") functions.
            if func_data["name"].startswith("sub_"):
                continue
            func_key = (package, bin_name, func_data["name"])
            option_key = (opti, arch, compiler, others)
            # Skip binaries built with options we are not interested in.
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]
            if func_key not in func_features:
                func_features[func_key] = {}
            # BUG FIX: the inner dict is keyed by option_idx (an int), so
            # membership must be tested with option_idx. The original tested
            # option_key (a tuple that is never stored as a key), which was
            # always True and re-allocated the vector on every duplicate.
            if option_idx not in func_features[func_key]:
                func_features[func_key][option_idx] = np.zeros(
                    num_features, dtype=np.float64)
            for feature_idx, feature in enumerate(g_features):
                # Features absent from this function stay 0.0.
                if feature not in func_data["feature"]:
                    continue
                val = func_data["feature"][feature]
                func_features[func_key][option_idx][feature_idx] = val

    return func_features
예제 #2
0
def load_func_features_helper(bin_paths):
    # TODO: handle suffix correctly.
    """Load numeric and string features for functions in *bin_paths*.

    Returns (func_features, func_str_features, duplicate_cnt):
      - func_features: {function_key: {option_idx: np.array(values)}}
      - func_str_features: {function_key: {option_idx: list}}
      - duplicate_cnt: number of functions sharing the same source location
        within one binary (compiler-generated .isra/.part/.cold variants).

    function_key is (package, bin_name, src_file, src_line) so that
    compiler-generated variants of one source function collapse onto a
    single key; only the last variant's features are kept.
    """
    global g_options, g_features, g_str_features
    func_features = {}
    func_str_features = {}
    num_features = len(g_features) + len(g_str_features)
    optionidx_map = get_optionidx_map(g_options)
    # This counts compiler-generated duplicates (.isra, .part, .cold)
    duplicate_cnt = 0
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        _, func_data_list = load_func_data(bin_path, suffix="filtered2")
        for func_data in func_data_list:
            # Use only .text functions for testing
            # These are already filtered in filter_functions.py
            if func_data["seg_name"] != ".text":
                continue
            if func_data["name"].startswith("sub_"):
                continue
            func_key = (package, bin_name, func_data["src_file"],
                        func_data["src_line"])
            option_key = (opti, arch, compiler, others)
            if option_key not in optionidx_map:
                continue
            option_idx = optionidx_map[option_key]
            if func_key not in func_features:
                func_features[func_key] = {}
                func_str_features[func_key] = {}
            # Duplicate functions (same func_key under the same option) are
            # counted here, and the feature containers are reset
            # unconditionally so only the last duplicate survives.
            # TODO: move this filtering to filter_functions.py
            # BUG FIX: the original tested `option_key not in ...` against a
            # dict keyed by option_idx; that was always True, so its `else`
            # branch never ran and duplicate_cnt stayed 0.
            if option_idx in func_features[func_key]:
                duplicate_cnt += 1
            func_features[func_key][option_idx] = np.zeros(
                num_features, dtype=np.float64)
            func_str_features[func_key][option_idx] = []

            for feature_idx, feature in enumerate(g_features):
                # Features absent from this function stay 0.0.
                if feature not in func_data["feature"]:
                    continue
                val = func_data["feature"][feature]
                func_features[func_key][option_idx][feature_idx] = val

            for feature_idx, str_feature in enumerate(g_str_features):
                if str_feature not in func_data:
                    continue
                val = func_data[str_feature]
                # Type features are normalized and indexed by argument
                # position before being stored.
                if "type" in str_feature:
                    if not isinstance(val, list):
                        val = [val]
                    val = normalize_type(val)
                    val = list(enumerate(val))
                func_str_features[func_key][option_idx].append(val)

    return func_features, func_str_features, duplicate_cnt
예제 #3
0
def filter_funcs(bin_path):
    """Filter the functions of one binary step by step and report counts.

    Returns (package_name, bin_path, per-step count tuple, the set of
    surviving function names, and the set of (src_file, src_line) pairs).
    When a ground-truth oracle is loaded, the oracle-filtered function list
    is stored back with the "filtered" suffix.
    """
    global g_oracle
    bin_path, func_data_list = load_func_data(bin_path)
    func_data_list = sorted(func_data_list, key=lambda x: x['name'])
    num_orig_funcs = len(func_data_list)
    pack_name = func_data_list[0]['package']

    # Keep only functions in the code (.text) segment.
    funcs = [f for f in func_data_list if f['seg_name'] == '.text']
    num_code_funcs = len(funcs)

    # Keep only functions that carry source-path debug information.
    funcs = [f for f in funcs if 'src_path' in f and f['src_path']]
    num_src_funcs = len(funcs)

    # Drop compiler-inserted functions: their source path lies outside
    # the package's own source tree.
    funcs = [f for f in funcs if pack_name in f['src_path']]
    num_pack_funcs = len(funcs)

    if num_pack_funcs == 0:
        print("No functions: ", pack_name, bin_path, num_orig_funcs)

    # Drop unnamed (IDA auto-generated "sub_...") functions.
    funcs = [f for f in funcs if not f['name'].startswith('sub_')]
    num_sub_funcs = len(funcs)

    names = {f['name'] for f in funcs}
    sources = {(f['src_file'], f['src_line']) for f in funcs}

    if g_oracle:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        oracle = g_oracle[pack_name][bin_name]
        # Keep only functions whose source location the oracle knows about.
        funcs = [
            f for f in funcs
            if f['src_file'] in oracle
            and f['src_line'] in oracle[f['src_file']]
        ]
        # TODO: handle suffix correctly.
        store_func_data(bin_path, funcs, suffix="filtered")
    num_oracle_funcs = len(funcs)
    # Kept for compatibility with consumers of the count tuple; a readelf
    # based cross-check is currently disabled.
    num_readelf_funcs = 0
    num_funcs = (num_orig_funcs, num_code_funcs, num_src_funcs, num_pack_funcs,
                 num_sub_funcs, num_oracle_funcs, num_readelf_funcs)
    return pack_name, bin_path, num_funcs, names, sources
예제 #4
0
def group_binaries(input_list):
    """Group the binary paths listed in *input_list* by (package, bin_name).

    Returns (bins, packages): bins maps (package, bin_name) to the list of
    matching binary paths; packages is the set of package names seen.
    """
    with open(input_list, "r") as f:
        bin_paths = f.read().splitlines()
    bins = {}
    packages = set()
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        bins.setdefault((package, bin_name), []).append(bin_path)
        packages.add(package)
    logger.info(
        "%d packages, %d unique binaries, total %d binaries",
        len(packages),
        len(bins),
        len(bin_paths),
    )
    return bins, packages
예제 #5
0
def group_binaries(input_list, options):
    """Group binaries by (package, bin_name), keeping only *options*.

    Binaries whose (opti, arch, compiler, others) tuple is not in *options*
    are skipped, which speeds up testing. Returns (bins, packages) like the
    unfiltered variant; the logged total counts only the selected binaries.
    """
    with open(input_list, "r") as f:
        bin_paths = f.read().splitlines()
    bins = {}
    packages = set()
    check_options = set(options)
    for bin_path in bin_paths:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        others = parse_other_options(bin_path)
        # Filter unnecessary binaries to speed up testing.
        if (opti, arch, compiler, others) not in check_options:
            continue
        bins.setdefault((package, bin_name), []).append(bin_path)
        packages.add(package)
    num_selected = sum(map(len, bins.values()))
    logger.info("%d packages, %d unique binaries, total %d binaries",
                len(packages), len(bins), num_selected)
    return bins, packages
예제 #6
0
def main():
    """Extract per-function metadata and features from the binary currently
    open in IDA, returning a list of per-function dicts.

    Runs inside IDA (uses idaapi/idautils/idc); one dict per .text function
    with CFG, basic-block, string, constant, and type information.
    """
    # Get IDA default information
    bin_path = ida_nalt.get_input_file_path()
    with open(bin_path, "rb") as f:
        bin_hash = sha1(f.read()).hexdigest()
    img_base = idaapi.get_imagebase()
    info = idaapi.get_inf_structure()
    if info.is_64bit():
        bits = 64
    elif info.is_32bit():
        bits = 32
    else:
        bits = 16

    endian = "little"
    if info.is_be():
        endian = "big"
    arch = "_".join([info.procName, str(bits), endian])
    arch = get_arch(arch)

    # Parse option information encoded in the file name / path.
    package, compiler, arch, opti, bin_name = parse_fname(bin_path)
    if "_noinline" in bin_path:
        other_option = "noinline"
    elif "_pie" in bin_path:
        other_option = "pie"
    elif "_lto" in bin_path:
        other_option = "lto"
    else:
        other_option = "normal"

    # Prepare default information for processing
    caller_map, callee_map = get_call_graph()
    edge_map, bb_callee_map = get_bb_graph(caller_map, callee_map)

    # Now extract function information
    func_data = []
    for idx, addr in enumerate(list(idautils.Functions())):
        function = idaapi.get_func(addr)
        if (not function or function.start_ea == idaapi.BADADDR
                or function.end_ea == idaapi.BADADDR):
            continue

        # IDA's default function information
        func_name = get_func_name(addr).strip()
        demangled_name, demangled_full_name = demangle(func_name)
        graph = idaapi.FlowChart(function, flags=idaapi.FC_PREDS)
        # BUG FIX: fall back to b"" (not ""): sha1() requires bytes on
        # Python 3, matching the basic-block fallback below.
        data = idc.get_bytes(addr, function.size()) or b""
        data_hash = sha1(data).hexdigest()
        stack_size = get_frame_size(addr)

        # Get imported callees. Note that the segment name is used because
        # idaapi.get_import_module_name() sometimes returns bad results ...
        imported_callees = []
        if func_name in callee_map:
            imported_callees = list(
                filter(lambda x: get_segm_name(x[1]) != get_segm_name(addr),
                       callee_map[func_name]))

        # Get type information from IDA
        func_type, ret_type, args = get_type(addr)

        # Prepare basic block information for feature extraction
        func_strings = []
        func_consts = []
        bb_data = []
        for bb in graph:
            if bb.start_ea == idaapi.BADADDR or bb.end_ea == idaapi.BADADDR:
                continue

            bb_size = bb.end_ea - bb.start_ea
            block_data = idc.get_bytes(bb.start_ea, bb_size) or b""
            block_data_hash = sha1(block_data).hexdigest()
            bb_strings = get_strings(bb.start_ea, bb.end_ea)
            bb_consts = get_consts(bb.start_ea, bb.end_ea)
            bb_callees = list(
                filter(lambda x: x[0] == bb.id, bb_callee_map[func_name]))
            bb_data.append({
                "size": bb_size,
                "block_id": bb.id,
                "startEA": bb.start_ea,
                "endEA": bb.end_ea,
                "type": bb.type,
                "is_ret": idaapi.is_ret_block(bb.type),
                "hash": block_data_hash,
                "callees": bb_callees,
                "strings": bb_strings,
                "consts": bb_consts,
            })
            # BUG FIX: accumulate per-block strings/consts INSIDE the loop.
            # The original extended after the loop, recording only the last
            # block's data (and reusing stale values from the previous
            # function when a function had no valid blocks).
            func_strings.extend(bb_strings)
            func_consts.extend(bb_consts)
        func_data.append({
            "ida_idx": idx,
            "seg_name": get_segm_name(addr),
            "name": func_name,
            "demangled_name": demangled_name,
            "demangled_full_name": demangled_full_name,
            "hash": data_hash,
            "size": function.size(),
            "startEA": function.start_ea,
            "endEA": function.end_ea,
            "cfg_size": graph.size,
            "img_base": img_base,
            "bin_path": bin_path,
            "bin_hash": bin_hash,
            "bin_offset": addr - img_base,
            "stack_size": stack_size,
            "package": package,
            "compiler": compiler,
            "arch": arch,
            "opti": opti,
            "others": other_option,
            "bin_name": bin_name,
            "func_type": func_type,
            "ret_type": ret_type,
            "args": args,
            "callers": caller_map[func_name],
            "callees": callee_map[func_name],
            "imported_callees": imported_callees,
            "cfg": edge_map[func_name],
            "strings": func_strings,
            "consts": func_consts,
            "bb_data": bb_data,
        })
    return func_data
예제 #7
0
        action="store",
        dest="chunk_size",
        default=1,
        help="number of binaries to process in each process",
    )
    op.add_option("--force", action="store_true", dest="force")
    (opts, args) = op.parse_args()

    assert opts.input_list

    with open(opts.input_list, "r") as f:
        bins = f.read().splitlines()

    pack_bins = {}
    for bin_path in bins:
        package, compiler, arch, opti, bin_name = parse_fname(bin_path)
        if package not in pack_bins:
            pack_bins[package] = []
        pack_bins[package].append(bin_path)

    result = {}
    logger.info("Processing %d binaries ...", len(bins))
    t0 = time.time()
    for package, bin_list in pack_bins.items():
        logger.info("Processing %d binaries in %s ...", len(bin_list), package)
        numbers = do_multiprocess(
            filter_funcs, bin_list, chunk_size=opts.chunk_size,
            threshold=opts.threshold
        )
        numbers.sort()
예제 #8
0
 def filter_bins(bin_path):
     """Return True iff *bin_path* was built with a compiler under test."""
     package, compiler, arch, opti, bin_name = parse_fname(bin_path)
     return compiler in ["clang-7.0", "gcc-8.2.0"]