def extract_file_function_names(vw, file_path):
    """
    extract the names of statically-linked library functions.

    walks the workspace's functions in ascending address order and emits a
    feature for each one that `viv_utils.flirt` reports as a library function.

    args:
      vw: the analyzed workspace; must provide `getFunctions()`.
      file_path: not used by this extractor.

    yields:
      Tuple[FunctionName, int]: the name feature and the function's address.
    """
    # lazy filter: each address is checked right before it is consumed below.
    library_vas = (
        function_va
        for function_va in sorted(vw.getFunctions())
        if viv_utils.flirt.is_library_function(vw, function_va)
    )

    for function_va in library_vas:
        yield FunctionName(viv_utils.get_function_name(vw, function_va)), function_va
def extract_file_function_names(vw, **kwargs):
    """
    extract the names of statically-linked library functions.

    args:
      vw: the analyzed workspace; must provide `getFunctions()`.
      **kwargs: accepted and ignored.

    yields:
      Tuple[FunctionName, int]: a name feature and the function's address.
        a name starting with `_` is reported twice:
        once as-is and once with that first underscore removed.
    """
    for function_va in sorted(vw.getFunctions()):
        if not viv_utils.flirt.is_library_function(vw, function_va):
            continue

        symbol = viv_utils.get_function_name(vw, function_va)
        yield FunctionName(symbol), function_va

        if symbol.startswith("_"):
            # some linkers may prefix linked routines with a `_` to avoid name collisions.
            # extract features for both the mangled and un-mangled representations.
            # e.g. `_fwrite` -> `fwrite`
            # see: https://stackoverflow.com/a/2628384/87207
            yield FunctionName(symbol[1:]), function_va
def find_decoding_function_features(
    vw, functions, disable_progress=False
) -> Tuple[Dict[int, Dict], Dict[int, str]]:
    """
    collect and score features for each candidate function, skipping thunks and library functions.

    args:
      vw: the analyzed workspace.
      functions: iterable of function addresses (each is converted with `int()`).
      disable_progress: when true, iterate without a tqdm progress bar.

    returns:
      Tuple[Dict[int, Dict], Dict[int, str]]:
        - function address -> {"meta": ..., "features": [...], "score": ...}
          for every analyzed (non-thunk, non-library) function.
        - function address -> function name for every skipped library function.
    """
    decoding_candidate_functions: DefaultDict[int, Dict] = collections.defaultdict(dict)
    library_functions: Dict[int, str] = dict()

    pbar = tqdm.tqdm
    if disable_progress:
        logger.info("identifying decoding function features...")
        # do not use tqdm to avoid unnecessary side effects when caller intends
        # to disable progress completely
        pbar = lambda s, *args, **kwargs: s

    functions = sorted(functions)
    n_funcs = len(functions)

    pb = pbar(
        functions,
        desc="finding decoding function features",
        unit=" functions",
        postfix="skipped 0 library functions",
    )
    with logging_redirect_tqdm(), redirecting_print_to_tqdm():
        for candidate in pb:
            function_address = int(candidate)

            if is_thunk_function(vw, function_address):
                continue

            if viv_utils.flirt.is_library_function(vw, function_address):
                # TODO handle j_j_j__free_base (lib function wrappers), e.g. 0x140035AF0 in d2ca76...
                # TODO ignore function called to by library functions
                function_name = viv_utils.get_function_name(vw, function_address)
                logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
                library_functions[function_address] = function_name
                n_libs = len(library_functions)
                # n_funcs cannot be zero here: we are inside the loop over `functions`.
                percentage = 100 * (n_libs / n_funcs)
                # the pass-through used when progress is disabled has no postfix to update.
                if isinstance(pb, tqdm.tqdm):
                    pb.set_postfix_str("skipped %d library functions (%d%%)" % (n_libs, percentage))
                continue

            f = viv_utils.Function(vw, function_address)

            meta = get_function_meta(f)
            features = list()
            function_data = {"meta": meta, "features": features}

            # meta data features
            features.append(BlockCount(meta.get("block_count")))
            features.append(InstructionCount(meta.get("instruction_count")))
            # "api" may be missing from the meta data: fall back to an empty dict.
            # (the fallback must itself support `.get`, which a list does not.)
            api_meta = meta.get("api") or {}
            features.append(Arguments(api_meta.get("arguments")))

            features.extend(extract_function_features(f))

            for bb in f.basic_blocks:
                features.extend(extract_basic_block_features(f, bb))

                for insn in bb.instructions:
                    features.extend(extract_insn_features(f, bb, insn))

            # abstractions are derived from the features collected so far and stored alongside them.
            features.extend(abstract_features(features))

            function_data["score"] = get_function_score_weighted(features)

            logger.debug("analyzed function 0x%x - total score: %f", function_address, function_data["score"])
            for feat in features:
                logger.trace(" %s", feat)

            decoding_candidate_functions[function_address] = function_data

    return decoding_candidate_functions, library_functions
def match_function_flirt_signatures(matcher: flirt.FlirtMatcher, vw: vivisect.VivWorkspace, va: int, cache=None):
    """
    match the given FLIRT signatures against the function at the given address.
    upon success, update the workspace with match metadata,
    setting the function as a library function and assigning its name.
    if multiple different signatures match the function, don't do anything.

    args:
      matcher (flirt.FlirtMatcher): the compiled FLIRT signature matcher.
      vw (vivisect.workspace): the analyzed program's workspace.
      va (int): the virtual address of a function to match.
      cache (Optional[Dict[int, Union[str, None]]]): internal cache of matches VA -> name or None on "no match".
       no need to provide as external caller.

    returns:
      Optional[str]: the recognized function name, or `None`.
    """
    if cache is None:
        # we cache both successful and failed lookups.
        #
        # (callers of this function don't need to initialize the cache.
        #  we'll provide one during recursive calls when we need it.)
        #
        # while we can use funcmeta to retrieve existing successful matches,
        # we don't persist failed matches,
        # because another FLIRT matcher might come along with better knowledge.
        #
        # however, when we match reference names, especially chained together,
        # then we need to cache the negative result, or we do a ton of extra work.
        # "accidentally quadratic" or worse.
        # see https://github.com/fireeye/capa/issues/448
        cache = {}

    function_meta = vw.funcmeta.get(va)
    if not function_meta:
        # not a function, we're not going to consider this.
        return None

    if va in cache:
        return cache[va]

    if is_library_function(vw, va):
        # already matched here.
        # this might be the case if recursive matching visited this address.
        name = viv_utils.get_function_name(vw, va)
        cache[va] = name
        return name

    def references_function_named(ref_name, ref_offset):
        # flirt uses reference names to assert that
        # the function contains a reference to another function with a given name.
        #
        # return True if code at `va + ref_offset` references a function
        # that is (potentially recursively) FLIRT matched as `ref_name`.
        ref_va = va + ref_offset

        # the reference offset may be inside an instruction,
        # so we use getLocation to select the containing instruction address.
        location = vw.getLocation(ref_va)
        if not location:
            # the workspace has no location covering this address,
            # so there is no instruction whose xrefs we could inspect:
            # the reference cannot be validated.
            # (indexing the missing location would raise instead.)
            return False
        loc_va = location[vivisect.const.L_VA]

        # an instruction may have multiple xrefs from
        # so we loop through all code references,
        # searching for that name.
        # `any` stops at the first hit, so no further targets are matched after that.
        return any(
            match_function_flirt_signatures(matcher, vw, xref[vivisect.const.XR_TO], cache) == ref_name
            for xref in vw.getXrefsFrom(loc_va)
            # FLIRT signatures only match code,
            # so we're only going to resolve references that point to code.
            if xref[vivisect.const.XR_RTYPE] == vivisect.const.REF_CODE
        )

    # 0x200 comes from:
    #  0x20 bytes for default byte signature size in flirt
    #  0x100 bytes for max checksum data size
    #  some wiggle room for tail bytes
    size = function_meta.get("Size", 0x200)
    # viv returns truncated data at the end of sections,
    # no need for any special logic here.
    buf = vw.readMemory(va, size)

    matches = []
    for match in matcher.match(buf):
        # collect all the name tuples (name, type, offset) with type==reference.
        # ignores other name types like "public" and "local".
        references = [name for name in match.names if name[1] == "reference"]

        # only if all references pass do we count it.
        # with no references to check (the common case) this is a complete match.
        # `all` stops at the first reference that fails to validate.
        if all(references_function_named(ref_name, ref_offset) for (ref_name, _, ref_offset) in references):
            matches.append(match)

    if not matches:
        cache[va] = None
        return None

    # we may have multiple signatures that match the same function, like `strcpy`.
    # these could be copies from multiple libraries.
    # so we don't mind if there are multiple matches, as long as names are the same.
    #
    # but if there are multiple candidate names, that's a problem.
    # our signatures are not precise enough.
    # we could maybe mark the function as "is a library function", but not assign name.
    # though, if we have signature FPs among library functions, it could easily FP with user code too.
    # so safest thing to do is not make any claim about the function.
    names = list(set(map(get_match_name, matches)))
    if len(names) != 1:
        cache[va] = None
        logger.warning("conflicting names: 0x%x: %s", va, names)
        return None

    name = names[0]
    add_function_flirt_match(vw, va, name)
    cache[va] = name
    logger.debug("found library function: 0x%x: %s", va, name)
    return name
def get_function_name(self, va):
    """
    return the name that viv_utils reports for the function at `va`,
    looked up in this object's workspace (`self.vw`).
    """
    workspace = self.vw
    return viv_utils.get_function_name(workspace, va)
def extract_insn_api_features(f, bb, insn):
    """
    parse API features from the given instruction.

    only `call` and `jmp` instructions are considered,
    and a `jmp` is ignored when the containing function is marked as a thunk.
    the kind of the first operand decides how the target is resolved.

    args:
      f: the function containing the instruction; provides `vw` and `va`.
      bb: the basic block containing the instruction (not used here).
      insn: the instruction to inspect.

    yields:
      Tuple[API, int]: an API feature and the instruction's address.
    """

    def import_api_features(imports, address):
        # emit one API feature per generated symbol name when `address` is a known import.
        if address in imports:
            dll, symbol = imports[address]
            for name in capa.features.extractors.helpers.generate_symbols(dll, symbol):
                yield API(name), insn.va

    # example:
    #
    #    call dword [0x00473038]
    if insn.mnem not in ("call", "jmp"):
        return

    if insn.mnem == "jmp" and f.vw.getFunctionMeta(f.va, "Thunk"):
        return

    first_operand = insn.opers[0]

    # traditional call via IAT
    if isinstance(first_operand, envi.archs.i386.disasm.i386ImmMemOper):
        target = first_operand.getOperAddr(insn)
        yield from import_api_features(get_imports(f.vw), target)

    # call via thunk on x86,
    # see 9324d1a8ae37a36ae560c37448c9705a at 0x407985
    #
    # this is also how calls to internal functions may be decoded on x32 and x64.
    # see Lab21-01.exe_:0x140001178
    #
    # follow chained thunks, e.g. in 82bf6347acf15e5d883715dc289d8a2b at 0x14005E0FF in
    # 0x140059342 (viv) / 0x14005E0C0 (IDA)
    # 14005E0FF call    j_ElfClearEventLogFileW (14005AAF8)
    #   14005AAF8 jmp     ElfClearEventLogFileW (14005E196)
    #     14005E196 jmp     cs:__imp_ElfClearEventLogFileW
    elif isinstance(first_operand, envi.archs.i386.disasm.i386PcRelOper):
        imports = get_imports(f.vw)
        target = capa.features.extractors.viv.helpers.get_coderef_from(f.vw, insn.va)
        if not target:
            return

        if viv_utils.flirt.is_library_function(f.vw, target):
            yield API(viv_utils.get_function_name(f.vw, target)), insn.va
            return

        for _ in range(THUNK_CHAIN_DEPTH_DELTA):
            yield from import_api_features(imports, target)

            # if jump leads to an ENDBRANCH instruction, skip it
            # NOTE(review): this tests the start of whatever buffer getByteDef returns at index 1;
            # confirm that buffer begins at `target` rather than at the start of the containing memory map.
            if f.vw.getByteDef(target)[1].startswith(b"\xf3\x0f\x1e"):
                target += 4

            target = capa.features.extractors.viv.helpers.get_coderef_from(f.vw, target)
            if not target:
                return

    # call via import on x64
    # see Lab21-01.exe_:0x14000118C
    elif isinstance(first_operand, envi.archs.amd64.disasm.Amd64RipRelOper):
        target = first_operand.getOperAddr(insn)
        yield from import_api_features(get_imports(f.vw), target)

    elif isinstance(first_operand, envi.archs.i386.disasm.i386RegOper):
        try:
            _, target = resolve_indirect_call(f.vw, insn.va, insn=insn)
        except NotFoundError:
            # not able to resolve the indirect call, sorry
            return

        if target is None:
            # not able to resolve the indirect call, sorry
            return

        yield from import_api_features(get_imports(f.vw), target)