Пример #1
0
def extract_file_function_names(vw, file_path):
    """
    extract the names of statically-linked library functions.
    """
    for va in sorted(vw.getFunctions()):
        if viv_utils.flirt.is_library_function(vw, va):
            name = viv_utils.get_function_name(vw, va)
            yield FunctionName(name), va
Пример #2
0
def extract_file_function_names(vw, **kwargs):
    """
    extract the names of statically-linked library functions.
    """
    for va in sorted(vw.getFunctions()):
        if viv_utils.flirt.is_library_function(vw, va):
            name = viv_utils.get_function_name(vw, va)
            yield FunctionName(name), va
            if name.startswith("_"):
                # some linkers may prefix linked routines with a `_` to avoid name collisions.
                # extract features for both the mangled and un-mangled representations.
                # e.g. `_fwrite` -> `fwrite`
                # see: https://stackoverflow.com/a/2628384/87207
                yield FunctionName(name[1:]), va
Пример #3
0
def find_decoding_function_features(
        vw,
        functions,
        disable_progress=False) -> Tuple[Dict[int, Dict], Dict[int, str]]:
    decoding_candidate_functions: DefaultDict[
        int, Dict] = collections.defaultdict(dict)

    library_functions: Dict[int, str] = dict()

    pbar = tqdm.tqdm
    if disable_progress:
        logger.info("identifying decoding function features...")
        # do not use tqdm to avoid unnecessary side effects when caller intends
        # to disable progress completely
        pbar = lambda s, *args, **kwargs: s

    functions = sorted(functions)
    n_funcs = len(functions)

    pb = pbar(functions,
              desc="finding decoding function features",
              unit=" functions",
              postfix="skipped 0 library functions")
    with logging_redirect_tqdm(), redirecting_print_to_tqdm():
        for f in pb:
            function_address = int(f)

            if is_thunk_function(vw, function_address):
                continue

            if viv_utils.flirt.is_library_function(vw, function_address):
                # TODO handle j_j_j__free_base (lib function wrappers), e.g. 0x140035AF0 in d2ca76...
                # TODO ignore function called to by library functions
                function_name = viv_utils.get_function_name(
                    vw, function_address)
                logger.debug("skipping library function 0x%x (%s)",
                             function_address, function_name)
                library_functions[function_address] = function_name
                n_libs = len(library_functions)
                percentage = 100 * (n_libs / n_funcs)
                if isinstance(pb, tqdm.tqdm):
                    pb.set_postfix_str("skipped %d library functions (%d%%)" %
                                       (n_libs, percentage))
                continue

            f = viv_utils.Function(vw, function_address)

            function_data = {"meta": get_function_meta(f), "features": list()}

            # meta data features
            function_data["features"].append(
                BlockCount(function_data["meta"].get("block_count")))
            function_data["features"].append(
                InstructionCount(
                    function_data["meta"].get("instruction_count")))
            function_data["features"].append(
                Arguments(function_data["meta"].get("api",
                                                    []).get("arguments")))

            for feature in extract_function_features(f):
                function_data["features"].append(feature)

            for bb in f.basic_blocks:
                for feature in extract_basic_block_features(f, bb):
                    function_data["features"].append(feature)

                for insn in bb.instructions:
                    for feature in extract_insn_features(f, bb, insn):
                        function_data["features"].append(feature)

            for feature in abstract_features(function_data["features"]):
                function_data["features"].append(feature)

            function_data["score"] = get_function_score_weighted(
                function_data["features"])

            logger.debug("analyzed function 0x%x - total score: %f",
                         function_address, function_data["score"])
            for feat in function_data["features"]:
                logger.trace("  %s", feat)

            decoding_candidate_functions[function_address] = function_data

        return decoding_candidate_functions, library_functions
Пример #4
0
def match_function_flirt_signatures(matcher: flirt.FlirtMatcher,
                                    vw: vivisect.VivWorkspace,
                                    va: int,
                                    cache=None):
    """
    match the given FLIRT signatures against the function at the given address.
    upon success, update the workspace with match metadata, setting the
    function as a library function and assigning its name.

    if multiple different signatures match the function, don't do anything.

    args:
      match (flirt.FlirtMatcher): the compiled FLIRT signature matcher.
      vw (vivisect.workspace): the analyzed program's workspace.
      va (int): the virtual address of a function to match.
      cache (Optional[Dict[int, Union[str, None]]]): internal cache of matches VA -> name or None on "no match".
       no need to provide as external caller.

    returns:
      Optional[str]: the recognized function name, or `None`.
    """
    if cache is None:
        # we cache both successful and failed lookups.
        #
        # (callers of this function don't need to initialize the cache.
        #  we'll provide one during recursive calls when we need it.)
        #
        # while we can use funcmeta to retrieve existing successful matches,
        # we don't persist failed matches,
        # because another FLIRT matcher might come along with better knowledge.
        #
        # however, when we match reference names, especially chained together,
        # then we need to cache the negative result, or we do a ton of extra work.
        # "accidentally quadratic" or worse.
        # see https://github.com/fireeye/capa/issues/448
        cache = {}

    function_meta = vw.funcmeta.get(va)
    if not function_meta:
        # not a function, we're not going to consider this.
        return None

    if va in cache:
        return cache[va]

    if is_library_function(vw, va):
        # already matched here.
        # this might be the case if recursive matching visited this address.
        name = viv_utils.get_function_name(vw, va)
        cache[va] = name
        return name

    # 0x200 comes from:
    #  0x20 bytes for default byte signature size in flirt
    #  0x100 bytes for max checksum data size
    #  some wiggle room for tail bytes
    size = function_meta.get("Size", 0x200)
    # viv returns truncated data at the end of sections,
    # no need for any special logic here.
    buf = vw.readMemory(va, size)

    matches = []
    for match in matcher.match(buf):
        # collect all the name tuples (name, type, offset) with type==reference.
        # ignores other name types like "public" and "local".
        references = list(filter(lambda n: n[1] == "reference", match.names))

        if not references:
            # there are no references that we need to check, so this is a complete match.
            # common case.
            matches.append(match)

        else:
            # flirt uses reference names to assert that
            # the function contains a reference to another function with a given name.
            #
            # we need to loop through these references,
            # potentially recursively FLIRT match,
            # and check the name matches (or doesn't).

            # at the end of the following loop,
            # if this flag is still true,
            # then all the references have been validated.
            does_match_references = True

            for (ref_name, _, ref_offset) in references:
                ref_va = va + ref_offset

                # the reference offset may be inside an instruction,
                # so we use getLocation to select the containing instruction address.
                loc_va = vw.getLocation(ref_va)[vivisect.const.L_VA]

                # an instruction may have multiple xrefs from
                # so we loop through all code references,
                # searching for that name.
                #
                # if the name is found, then this flag will be set.
                does_match_the_reference = False
                for xref in vw.getXrefsFrom(loc_va):
                    # FLIRT signatures only match code,
                    # so we're only going to resolve references that point to code.
                    if xref[vivisect.const.
                            XR_RTYPE] != vivisect.const.REF_CODE:
                        continue

                    target = xref[vivisect.const.XR_TO]
                    found_name = match_function_flirt_signatures(
                        matcher, vw, target, cache)

                    if found_name == ref_name:
                        does_match_the_reference = True
                        break

                if not does_match_the_reference:
                    does_match_references = False
                    break

            if does_match_references:
                # only if all references pass do we count it.
                matches.append(match)

    if matches:
        # we may have multiple signatures that match the same function, like `strcpy`.
        # these could be copies from multiple libraries.
        # so we don't mind if there are multiple matches, as long as names are the same.
        #
        # but if there are multiple candidate names, that's a problem.
        # our signatures are not precise enough.
        # we could maybe mark the function as "is a library function", but not assign name.
        # though, if we have signature FPs among library functions, it could easily FP with user code too.
        # so safest thing to do is not make any claim about the function.
        names = list(set(map(get_match_name, matches)))
        if len(names) == 1:
            name = names[0]
            add_function_flirt_match(vw, va, name)
            cache[va] = name
            logger.debug("found library function: 0x%x: %s", va, name)
            return name
        else:
            cache[va] = None
            logger.warning("conflicting names: 0x%x: %s", va, names)
            return None

    else:
        cache[va] = None
        return None
Пример #5
0
 def get_function_name(self, va):
     return viv_utils.get_function_name(self.vw, va)
Пример #6
0
Файл: insn.py Проект: H1d3r/capa
def extract_insn_api_features(f, bb, insn):
    """parse API features from the given instruction."""

    # example:
    #
    #    call dword [0x00473038]
    if insn.mnem not in ("call", "jmp"):
        return

    if insn.mnem == "jmp":
        if f.vw.getFunctionMeta(f.va, "Thunk"):
            return

    # traditional call via IAT
    if isinstance(insn.opers[0], envi.archs.i386.disasm.i386ImmMemOper):
        oper = insn.opers[0]
        target = oper.getOperAddr(insn)

        imports = get_imports(f.vw)
        if target in imports:
            dll, symbol = imports[target]
            for name in capa.features.extractors.helpers.generate_symbols(
                    dll, symbol):
                yield API(name), insn.va

    # call via thunk on x86,
    # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985
    #
    # this is also how calls to internal functions may be decoded on x32 and x64.
    # see Lab21-01.exe_:0x140001178
    #
    # follow chained thunks, e.g. in 82bf6347acf15e5d883715dc289d8a2b at 0x14005E0FF in
    # 0x140059342 (viv) / 0x14005E0C0 (IDA)
    # 14005E0FF call    j_ElfClearEventLogFileW (14005AAF8)
    #   14005AAF8 jmp     ElfClearEventLogFileW (14005E196)
    #     14005E196 jmp     cs:__imp_ElfClearEventLogFileW

    elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386PcRelOper):
        imports = get_imports(f.vw)
        target = capa.features.extractors.viv.helpers.get_coderef_from(
            f.vw, insn.va)
        if not target:
            return

        if viv_utils.flirt.is_library_function(f.vw, target):
            name = viv_utils.get_function_name(f.vw, target)
            yield API(name), insn.va
            return

        for _ in range(THUNK_CHAIN_DEPTH_DELTA):
            if target in imports:
                dll, symbol = imports[target]
                for name in capa.features.extractors.helpers.generate_symbols(
                        dll, symbol):
                    yield API(name), insn.va

            # if jump leads to an ENDBRANCH instruction, skip it
            if f.vw.getByteDef(target)[1].startswith(b"\xf3\x0f\x1e"):
                target += 4

            target = capa.features.extractors.viv.helpers.get_coderef_from(
                f.vw, target)
            if not target:
                return

    # call via import on x64
    # see Lab21-01.exe_:0x14000118C
    elif isinstance(insn.opers[0], envi.archs.amd64.disasm.Amd64RipRelOper):
        op = insn.opers[0]
        target = op.getOperAddr(insn)

        imports = get_imports(f.vw)
        if target in imports:
            dll, symbol = imports[target]
            for name in capa.features.extractors.helpers.generate_symbols(
                    dll, symbol):
                yield API(name), insn.va

    elif isinstance(insn.opers[0], envi.archs.i386.disasm.i386RegOper):
        try:
            (_, target) = resolve_indirect_call(f.vw, insn.va, insn=insn)
        except NotFoundError:
            # not able to resolve the indirect call, sorry
            return

        if target is None:
            # not able to resolve the indirect call, sorry
            return

        imports = get_imports(f.vw)
        if target in imports:
            dll, symbol = imports[target]
            for name in capa.features.extractors.helpers.generate_symbols(
                    dll, symbol):
                yield API(name), insn.va