def __init__(self): self.name = "CryptoIdentifier" print ("[*] loading CryptoIdentifier") self.time = time self.re = re self.GraphHelper = GraphHelper self.CryptoSignatureHit = CryptoSignatureHit self.AritlogBasicBlock = AritlogBasicBlock self.Segment = Segment self.pm = PatternManager() self.low_rating_threshold = 0.4 self.high_rating_threshold = 1.0 self.low_instruction_threshold = 8 self.high_instruction_threshold = 100 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_instruction_threshold = 100 self.low_call_threshold = 0 self.high_call_threshold = 1 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_call_threshold = 10 # if at least this fraction of a signature's length' has been identified # consecutively, the location is marked as a signature hit. self.match_filter_factor = 0.5 self.aritlog_blocks = [] self.signature_hits = [] self.ida_proxy = IdaProxy() return
def __init__(self, idascope_config): print ("[|] loading DocumentationHelper") self.ida_proxy = IdaProxy() # default colors are grey / light red / red self.default_neutral_color = 0xCCCCCC self.default_base_color = 0xB3B3FF self.default_highlight_color = 0x3333FF self.color_state = "unknown" self.idascope_config = idascope_config self._loadConfig(self.idascope_config.semantics_file) return
def __init__(self, idascope_config): print ("[|] loading SemanticIdentifier") self.os = os self.re = re self.time = time self.ida_proxy = IdaProxy() self.FunctionContext = FunctionContext self.FunctionContextFilter = FunctionContextFilter self.CallContext = CallContext self.ParameterContext = ParameterContext # fields self.semantics = {} self.active_semantics = {} self.renaming_seperator = "_" self.semantic_groups = [] self.semantic_definitions = [] self.real_api_names = {} self.last_scan_result = {} self.idascope_config = idascope_config self._getRealApiNames() self._loadSemantics(self.idascope_config) return
def __init__(self, idascope_config): print("[|] Loading WinApiProvider") self.os = os self.string = string self.ida_proxy = IdaProxy() self.downloader = Downloader() self.downloader.downloadFinished.connect(self.onDownloadFinished) self.idascope_config = idascope_config self.winapi_data = {} if self.idascope_config.winapi_load_keyword_database: self._loadKeywords() self.online_msdn_enabled = self.idascope_config.winapi_online_enabled self.last_delivered_filepath = self.idascope_config.winapi_rootdir self.backward_history = [] self.forward_history = [] self.is_appending_to_history = True self.download_receivers = []
class CryptoIdentifier(): """ This class contains the logic to perform Crypto identification. Two techniques are currently supported: 1. A heuristic approach that identifies functions and basic blocks based on the ratio of arithmetic/logic instructions to all instructions 2. A signature-based approach, using the signatures defined in PatternManager """ def __init__(self): self.name = "CryptoIdentifier" print("[*] loading CryptoIdentifier") self.time = time self.re = re self.GraphHelper = GraphHelper self.CryptoSignatureHit = CryptoSignatureHit self.AritlogBasicBlock = AritlogBasicBlock self.Segment = Segment self.pm = PatternManager() self.low_rating_threshold = 0.4 self.high_rating_threshold = 1.0 self.low_instruction_threshold = 8 self.high_instruction_threshold = 100 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_instruction_threshold = 100 self.low_call_threshold = 0 self.high_call_threshold = 1 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_call_threshold = 10 # if at least this fraction of a signature's length' has been identified # consecutively, the location is marked as a signature hit. self.match_filter_factor = 0.5 self.aritlog_blocks = [] self.signature_hits = [] self.ida_proxy = IdaProxy() return def scan(self): """ Scan the whole IDB with all available techniques. """ self.scanAritlog() self.scanCryptoPatterns() ################################################################################ # Aritlog scanning ################################################################################ def scanAritlog(self): """ scan with the arithmetic/logic heuristic @return: a list of AritLogBasicBlock data objects that fulfill the parameters as specified """ print("[*] CryptoIdentifier: Starting aritlog heuristic analysis.") self.aritlog_blocks = [] time_before = self.time.time() for function_ea in self.ida_proxy.Functions(): function_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(function_ea)) calls_in_function = 0 function_blocks = [] function_dgraph = {} blocks_in_loops = set() for current_block in function_chart: block = self.AritlogBasicBlock(current_block.startEA, current_block.endEA) for instruction in self.ida_proxy.Heads( block.start_ea, block.end_ea): if self.ida_proxy.isCode( self.ida_proxy.GetFlags(instruction)): mnemonic = self.ida_proxy.GetMnem(instruction) has_identical_operands = self.ida_proxy.GetOperandValue(instruction, 0) == \ self.ida_proxy.GetOperandValue(instruction, 1) block.updateInstructionCount(mnemonic, has_identical_operands) if mnemonic == "call": calls_in_function += 1 function_blocks.append(block) # prepare graph for Tarjan's algorithm succeeding_blocks = [ succ.startEA for succ in current_block.succs() ] function_dgraph[current_block.startEA] = succeeding_blocks # add trivial loops if current_block.startEA in succeeding_blocks: block.is_contained_in_trivial_loop = True blocks_in_loops.update([current_block.startEA]) # perform Tarjan's algorithm to identify strongly connected components (= loops) in the function graph graph_helper = self.GraphHelper() strongly_connected = graph_helper.calculateStronglyConnectedComponents( function_dgraph) non_trivial_loops = [ component for component in strongly_connected if len(component) > 1 ] for component in non_trivial_loops: for block in component: blocks_in_loops.update([block]) for block in function_blocks: if block.start_ea in blocks_in_loops: block.is_contained_in_loop = True block.num_calls_in_function = calls_in_function self.aritlog_blocks.extend(function_blocks) print("[*] Heuristics analysis took %3.2f seconds." % (self.time.time() - time_before)) return self.getAritlogBlocks( self.low_rating_threshold, self.high_rating_threshold, self.low_instruction_threshold, self.high_instruction_threshold, self.low_call_threshold, self.high_call_threshold, False, False, False) def _updateThresholds(self, min_rating, max_rating, min_instr, max_instr, min_call, max_call): """ update all six threshold bounds @param min_rating: the minimum arit/log ratio a basic block must have @type min_rating: float @param max_rating: the maximum arit/log ratio a basic block can have @type max_rating: float @param min_instr: the minimum number of instructions a basic block must have @type min_instr: int @param max_instr: the minimum number of instructions a basic block can have @type max_instr: int @param min_call: the minimum number of calls a basic block must have @type min_call: int @param max_call: the minimum number of calls a basic block can have @type max_call: int """ self.low_rating_threshold = max(0.0, min_rating) self.high_rating_threshold = min(1.0, max_rating) self.low_instruction_threshold = max(0, min_instr) if max_instr >= self.max_instruction_threshold: # we cap the value here and safely assume there is no block with more than 1000000 instructions self.high_instruction_threshold = 1000000 else: self.high_instruction_threshold = max_instr self.low_call_threshold = max(0, min_call) if max_call >= self.max_call_threshold: # we cap the value here and safely assume there is no block with more than 1000000 instructions self.high_call_threshold = 1000000 else: self.high_call_threshold = max_call def getAritlogBlocks(self, min_rating, max_rating, min_instr, max_instr, min_api, max_api, is_nonzero, \ is_looped, is_trivially_looped): """ get all blocks that are within the limits specified by the heuristic parameters. parameters are the same as in function "_updateThresholds" except param is_nonzero: defines whether zeroing instructions (like xor eax, eax) shall be counted or not. type is_nonzero: boolean param is_looped: defines whether only basic blocks in loops shall be selected type is_looped: boolean @return: a list of AritlogBasicBlock data objects, according to the parameters """ self._updateThresholds(min_rating, max_rating, min_instr, max_instr, min_api, max_api) return [ block for block in self.aritlog_blocks if (self.high_rating_threshold >= block.getAritlogRating( is_nonzero) >= self.low_rating_threshold) and ( self.high_instruction_threshold >= block.num_instructions >= self.low_instruction_threshold) and ( self.high_call_threshold >= block.num_calls_in_function >= self.low_call_threshold) and (not is_looped or block.is_contained_in_loop) and ( not is_trivially_looped or block.is_contained_in_trivial_loop) ] def getUnfilteredBlockCount(self): """ returns the number of basic blocks that have been analyzed. @return: (int) number of basic blocks """ return len(self.aritlog_blocks) ################################################################################ # Signature scanning ################################################################################ def getSegmentData(self): """ returns the raw bytes of the segments as stored by IDA @return: a list of Segment data objects. """ segments = [] for segment_ea in self.ida_proxy.Segments(): try: segment = self.Segment() segment.start_ea = segment_ea segment.end_ea = self.ida_proxy.SegEnd(segment_ea) segment.name = self.ida_proxy.SegName(segment_ea) buf = "" for ea in helpers.Misc.lrange( segment_ea, self.ida_proxy.SegEnd(segment_ea)): buf += chr(self.ida_proxy.get_byte(ea)) segment.data = buf segments.append(segment) except: print( "[!] Tried to access invalid segment data. An error has occurred while address conversion" ) return segments def scanCryptoPatterns(self, pattern_size=32): crypt_results = [] print("[*] CryptoIdentifier: Starting crypto signature scanning.") time_before_matching = self.time.time() segments = self.getSegmentData() keywords = self.pm.getTokenizedSignatures(pattern_size) for keyword in keywords.keys(): for segment in segments: crypt_results.extend([ self.CryptoSignatureHit(segment.start_ea + match.start(), keywords[keyword], keyword) for match in self.re.finditer(self.re.escape(keyword), segment.data) ]) variable_matches = self.scanVariablePatterns() crypt_results.extend(variable_matches) print("[*] Full matching took %3.2f seconds and resulted in %d hits." % (self.time.time() - time_before_matching, len(crypt_results))) self.signature_hits = crypt_results return crypt_results def scanVariablePatterns(self): # the scanning code is roughly based on kyprizel's signature scan, see credtis above for more information crypt_results = [] variable_signatures = self.pm.getVariableSignatures() for var_sig in variable_signatures.keys(): current_seg = self.ida_proxy.FirstSeg() seg_end = self.ida_proxy.SegEnd(current_seg) while current_seg != self.ida_proxy.BAD_ADDR: signature_hit = self.ida_proxy.find_binary( current_seg, seg_end, variable_signatures[var_sig], 16, 1) if signature_hit != self.ida_proxy.BAD_ADDR: crypt_results.append( self.CryptoSignatureHit(signature_hit, [var_sig], variable_signatures[var_sig])) current_seg = signature_hit + variable_signatures[ var_sig].count(" ") + 1 else: current_seg = self.ida_proxy.NextSeg(seg_end) if not current_seg == self.ida_proxy.BAD_ADDR: seg_end = self.ida_proxy.SegEnd(current_seg) return crypt_results def getSignatureLength(self, signature_name): """ returns the length for a signature, identified by its name @param signature_name: name for a signature, e.g. "ADLER 32" @type signature_name: str @return: (int) length of the signature. """ for item in self.pm.signatures.items(): if item[1] == signature_name: return len(item[0]) return 0 def getSignatureHits(self): """ Get all signature hits that have a length of at least match_filter_factor percent of the signature they triggered. Hits are grouped by signature names. @return: a dictionary with key/value entries of the following form: ("signature name", [CryptoSignatureHit]) """ sorted_hits = sorted(self.signature_hits) unified_hits = [] previous_signature_names = [] for hit in sorted_hits: hit_intersection = [ element for element in hit.signature_names if element in previous_signature_names ] if len(hit_intersection) == 0: previous_signature_names = hit.signature_names unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \ hit.matched_signature)) else: previous_signature_names = hit_intersection previous_hit = unified_hits[-1] if hit.start_address == previous_hit.start_address + len( previous_hit.matched_signature): previous_hit.matched_signature += hit.matched_signature previous_hit.signature_names = hit_intersection else: unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \ hit.matched_signature)) filtered_hits = [] for hit in unified_hits: if len(hit.matched_signature) >= max([ self.match_filter_factor * self.getSignatureLength(name) for name in hit.signature_names ]): hit.code_refs_to = self.getXrefsToAddress(hit.start_address) filtered_hits.append(hit) grouped_hits = {} for hit in filtered_hits: for name in hit.signature_names: if name not in grouped_hits: grouped_hits[name] = [hit] else: grouped_hits[name].append(hit) return grouped_hits def getXrefsToAddress(self, address): """ get all references to a certain address. These are no xrefs in IDA sense but references to the crypto signatures. If the signature points to an instruction, e.g. if a constant is moved to a register, the return is flagged as "True", meaning it is an in-code reference. @param address: an arbitrary address @type address: int @return: a list of tuples (int, boolean) """ xrefs = [] head_to_address = self.ida_proxy.PrevHead(address, address - 14) if head_to_address != 0xFFFFFFFF: flags = self.ida_proxy.GetFlags(head_to_address) if self.ida_proxy.isCode(flags): xrefs.append((head_to_address, True)) for x in self.ida_proxy.XrefsTo(address): flags = self.ida_proxy.GetFlags(x.frm) if self.ida_proxy.isCode(flags): xrefs.append((x.frm, False)) return xrefs
class SemanticIdentifier(): """ A module to analyze and explore an IDB for semantics. For a set of API names, references to these are identified and used for creating context and allowing tagging of them. """ def __init__(self, idascope_config): print ("[|] loading SemanticIdentifier") self.os = os self.re = re self.time = time self.ida_proxy = IdaProxy() self.FunctionContext = FunctionContext self.FunctionContextFilter = FunctionContextFilter self.CallContext = CallContext self.ParameterContext = ParameterContext # fields self.semantics = {} self.active_semantics = {} self.renaming_seperator = "_" self.semantic_groups = [] self.semantic_definitions = [] self.real_api_names = {} self.last_scan_result = {} self.idascope_config = idascope_config self._getRealApiNames() self._loadSemantics(self.idascope_config) return def _cbEnumImports(self, addr, name, ordinal): if name: self.real_api_names[name] = self.ida_proxy.Name(addr) return True def _getRealApiNames(self): num_imports = self.ida_proxy.get_import_module_qty() for i in xrange(0, num_imports): self.ida_proxy.enum_import_names(i, self._cbEnumImports) def lookupRealApiName(self, api_name): if api_name in self.real_api_names: return self.real_api_names[api_name] else: return api_name def lookupDisplayApiName(self, real_api_name): """ returns the key by given value of self.real_api_names (basically inverted dictionary) """ name = real_api_name for display_name in self.real_api_names: if real_api_name == self.real_api_names[display_name] \ and display_name in self.real_api_names[display_name]: name = display_name return name def _loadSemantics(self, config): """ Loads a semantic configuration file and collects all definitions from it. @param config_filename: filename of a semantic configuration file @type config_filename: str """ for filename in [fn for fn in self.os.listdir(config.semantics_folder) if fn.endswith(".json")]: loaded_file = self._loadSemanticsFile(config.semantics_folder + self.os.sep + filename) self.semantics[loaded_file["name"]] = loaded_file if config.inspection_default_semantics in self.semantics: self._setSemantics(config.inspection_default_semantics) elif len(self.semantics) > 0: self._setSemantics(sorted(self.semantics.keys())[0]) else: self._setSemantics("") return def _loadSemanticsFile(self, semantics_filename): """ Loads a semantic configuration file and collects all definitions from it. @param config_filename: filename of a semantic configuration file @type config_filename: str """ semantics_file = open(semantics_filename, "r") semantics = semantics_file.read() return json.loads(semantics, object_hook=JsonHelper.decode_dict) def _setSemantics(self, semantics_entry): semantics_content = {} if semantics_entry in self.semantics: semantics_content = self.semantics[semantics_entry] self.renaming_seperator = semantics_content["renaming_seperator"] self.semantic_groups = semantics_content["semantic_groups"] self.semantic_definitions = semantics_content["semantic_definitions"] self.active_semantics = semantics_content else: self.renaming_seperator = "_" self.semantic_groups = [] self.semantic_definitions = [] self.active_semantics = {"name": "none"} self.scanByReferences() def getSemanticsNames(self): return sorted(self.semantics.keys()) def getActiveSemanticsName(self): return self.active_semantics["name"] def calculateNumberOfBasicBlocksForFunctionAddress(self, function_address): """ Calculates the number of basic blocks for a given function by walking its FlowChart. @param function_address: function address to calculate the block count for @type function_address: int """ number_of_blocks = 0 try: func_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(function_address)) for block in func_chart: number_of_blocks += 1 except: pass return number_of_blocks def getNumberOfBasicBlocksForFunctionAddress(self, address): """ returns the number of basic blocks for the function containing the queried address, based on the value stored in the last scan result. If the number of basic blocks for this function has never been calculated, zero is returned. @param function_address: function address to get the block count for @type function_address: int @return: (int) The number of blocks in th e function """ number_of_blocks = 0 function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): number_of_blocks = self.last_scan_result[function_address].number_of_basic_blocks return number_of_blocks def scan(self): """ Scan the whole IDB with all available techniques. """ self.scanByReferences() self.scanDeep() def scanByReferences(self): """ Scan by references to API names, based on the definitions loaded from the config file. This is highly efficient because we only touch places in the IDB that actually have references to our API names of interest. """ print (" [/] SemanticIdentifier: Starting (fast) scan by references of function semantics.") time_before = self.time.time() self.last_scan_result = {} for semantic_tag in self.semantic_definitions: for api_name in semantic_tag["api_names"]: real_api_name = self.lookupRealApiName(api_name) api_address = self.ida_proxy.LocByName(real_api_name) for ref in self._getAllRefsTo(api_address): function_ctx = self._getFunctionContext(ref) function_ctx.has_tags = True call_ctx = self.CallContext() call_ctx.called_function_name = api_name call_ctx.real_called_function_name = real_api_name call_ctx.address_of_call = ref call_ctx.called_address = api_address call_ctx.tag = semantic_tag["tag"] call_ctx.group = semantic_tag["group"] call_ctx.parameter_contexts = self._resolveApiCall(call_ctx) function_ctx.call_contexts.append(call_ctx) print (" [\\] Analysis took %3.2f seconds." % (self.time.time() - time_before)) def _getAllRefsTo(self, addr): code_ref_addrs = [ref for ref in self.ida_proxy.CodeRefsTo(addr, 0)] data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsTo(addr)] return iter(set(code_ref_addrs).union(set(data_ref_addrs))) def _getNumRefsTo(self, addr): return sum([1 for ref in self._getAllRefsTo(addr)]) def _getAllRefsFrom(self, addr, code_only=False): code_ref_addrs = [ref for ref in self.ida_proxy.CodeRefsFrom(addr, 0)] data_ref_addrs = [] if code_only: # only consider data references that lead to a call near/far (likely imports) data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsFrom(addr) if \ self.ida_proxy.GetFlags(ref) & (self.ida_proxy.FL_CN | self.ida_proxy.FL_CF)] else: data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsFrom(addr)] return iter(set(code_ref_addrs).union(set(data_ref_addrs))) def _getFunctionContext(self, addr): """ Create or return an existing FunctionContext for the given address in the current scan result. @param func_addr: address to create a FunctionContext for @type func_addr: int @return: (FunctionContext) A reference to the corresponding function context """ function_ctx = None function_address = self.ida_proxy.LocByName(self.ida_proxy.GetFunctionName(addr)) if function_address not in self.last_scan_result.keys(): function_ctx = self.FunctionContext() function_ctx.function_address = function_address function_ctx.function_name = self.ida_proxy.GetFunctionName(function_address) function_ctx.has_dummy_name = (self.ida_proxy.GetFlags(function_address) & \ self.ida_proxy.FF_LABL) > 0 self.last_scan_result[function_ctx.function_address] = function_ctx else: function_ctx = self.last_scan_result[function_address] return function_ctx def scanDeep(self): """ Perform a full enumeration of all instructions, gathering information like number of instructions, number of basic blocks, references to and from functions etc. """ print (" [/] SemanticIdentifier: Starting deep scan of function semantics.") time_before = self.time.time() for function_ea in self.ida_proxy.Functions(): function_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(function_ea)) num_blocks = 0 num_instructions = 0 xrefs_from = [] calls_from = [] function_ctx = self._getFunctionContext(function_ea) for block in function_chart: num_blocks += 1 for instruction in self.ida_proxy.Heads(block.startEA, block.endEA): num_instructions += 1 if self.ida_proxy.isCode(self.ida_proxy.GetFlags(instruction)): for ref in self._getAllRefsFrom(instruction): if self.ida_proxy.GetMnem(instruction) == "call": calls_from.append(ref) xrefs_from.append(ref) function_ctx.calls_from.update(calls_from) function_ctx.number_of_xrefs_to = self._getNumRefsTo(function_ea) function_ctx.xrefs_from.update(xrefs_from) function_ctx.number_of_xrefs_from = len(xrefs_from) function_ctx.number_of_basic_blocks = num_blocks function_ctx.number_of_instructions = num_instructions print (" [\\] Analysis took %3.2f seconds." % (self.time.time() - time_before)) def getFunctionAddressForAddress(self, address): """ Get a function address containing the queried address. @param address: address to check the function address for @type address: int @return: (int) The start address of the function containing this address """ return self.ida_proxy.LocByName(self.ida_proxy.GetFunctionName(address)) def calculateNumberOfFunctions(self): """ Calculate the number of functions in all segments. @return: (int) the number of functions found. """ number_of_functions = 0 for seg_ea in self.ida_proxy.Segments(): for function_ea in self.ida_proxy.Functions(self.ida_proxy.SegStart(seg_ea), self.ida_proxy.SegEnd(seg_ea)): number_of_functions += 1 return number_of_functions def calculateNumberOfTaggedFunctions(self): """ Calculate the number of functions in all segments that have been tagged. @return: (int) the number of functions found. """ return len(self.getFunctionAddresses(self.createFunctionContextFilter())) def getFunctionAddresses(self, context_filter): """ Get all function address that have been covered by the last scanning. @param dummy_only: only return functions with dummy names @type dummy_only: bool @param tag_only: only return tag functions @type tag_only: bool @return: (list of int) The addresses of covered functions. """ all_addresses = self.last_scan_result.keys() filtered_addresses = [] if context_filter.display_all: filtered_addresses = all_addresses elif context_filter.display_tags: for address in all_addresses: enabled_tags = [tag[0] for tag in context_filter.enabled_tags] if len(set(self.last_scan_result[address].getTags()) & set(enabled_tags)) > 0: filtered_addresses.append(address) elif context_filter.display_groups: for address in all_addresses: enabled_groups = [group[0] for group in context_filter.enabled_groups] if len(set(self.last_scan_result[address].getGroups()) & set(enabled_groups)) > 0: filtered_addresses.append(address) # filter additionals if context_filter.isDisplayTagOnly(): filtered_addresses = [addr for addr in filtered_addresses if self.last_scan_result[addr].has_tags] if context_filter.isDisplayDummyOnly(): filtered_addresses = [addr for addr in filtered_addresses if self.last_scan_result[addr].has_dummy_name] return filtered_addresses def getTags(self): """ Get all the tags that have been covered by the last scanning. @return (list of str) The tags found. """ tags = [] for function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[function_address].call_contexts: if call_ctx.tag not in tags: tags.append(call_ctx.tag) return tags def getGroups(self): """ Get all the groups that have been covered by tags in the last scanning. @return (list of str) The groups found. """ tag_to_group_mapping = self._createTagToGroupMapping() groups = [] for function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[function_address].call_contexts: if tag_to_group_mapping[call_ctx.tag] not in groups: groups.append(tag_to_group_mapping[call_ctx.tag]) return groups def _createTagToGroupMapping(self): mapping = {} for definition in self.semantic_definitions: mapping[definition["tag"]] = definition["group"] return mapping def getTagsForFunctionAddress(self, address): """ Get all tags found for the function containing the queried address. @param address: address in the target function @type address: int @return: (list of str) The tags for the function containing the queried address """ tags = [] function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[function_address].call_contexts: if call_ctx.tag not in tags: tags.append(call_ctx.tag) return tags def getFieldCountForFunctionAddress(self, query, address): """ Get the number of occurrences for a certain field for the function containing the queried address. @param query: a tuple (type, name), where type is additional, tag, or group and name the field being queried. @type query: tuple @param address: address in the target function @type address: int @return: (int) The number of occurrences for this tag in the function """ function_address = self.getFunctionAddressForAddress(address) return self.last_scan_result[function_address].getCountForField(query) def getTaggedApisForFunctionAddress(self, address): """ Get all call contexts for the function containing the queried address. @param address: address in the target function @type address: int @return: (list of CallContext data objects) The call contexts identified by the scanning of this function """ function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): all_call_ctx = self.last_scan_result[function_address].call_contexts return [call_ctx for call_ctx in all_call_ctx if call_ctx.tag != ""] def getAddressTagPairsOrderedByFunction(self): """ Get all call contexts for all functions @return: a dictionary with key/value entries of the following form: (function_address, dict((call_address, tag))) """ functions_and_tags = {} for function in self.getIdentifiedFunctionAddresses(): call_contexts = self.getTaggedApisForFunctionAddress(function) if function not in functions_and_tags.keys(): functions_and_tags[function] = {} for call_ctx in call_contexts: functions_and_tags[function][call_ctx.address_of_call] = call_ctx.tag return functions_and_tags def getFunctionsToRename(self): """ Get all functions that can be renamed according to the last scan result. Only functions with the standard IDA name I{sub_[0-9A-F]+} will be considered for renaming. @return: a list of dictionaries, each consisting of three tuples: ("old_function_name", str), \ ("new_function_name", str), ("function_address", int) """ functions_to_rename = [] for function_address_to_tag in self.last_scan_result.keys(): new_function_name = self.last_scan_result[function_address_to_tag].function_name # has the function still a dummy name? if self.ida_proxy.GetFlags(function_address_to_tag) & self.ida_proxy.FF_LABL > 0: tags_for_function = self.getTagsForFunctionAddress(function_address_to_tag) for tag in sorted(tags_for_function, reverse=True): if tag not in new_function_name: new_function_name = tag + self.renaming_seperator + new_function_name functions_to_rename.append({"old_function_name": \ self.last_scan_result[function_address_to_tag].function_name, "new_function_name": \ new_function_name, "function_address": function_address_to_tag}) return functions_to_rename def renameFunctions(self): """ Perform the renaming of functions according to the last scan result. """ for function in self.getFunctionsToRename(): if function["old_function_name"] == self.ida_proxy.GetFunctionName(function["function_address"]): self.ida_proxy.MakeNameEx(function["function_address"], function["new_function_name"], \ self.ida_proxy.SN_NOWARN) def renamePotentialWrapperFunctions(self): """ contributed by Branko Spasojevic. """ num_wrappers_renamed = 0 for seg_ea in self.ida_proxy.Segments(): for func_ea in self.ida_proxy.Functions(self.ida_proxy.SegStart(seg_ea), self.ida_proxy.SegEnd(seg_ea)): if (self.ida_proxy.GetFlags(func_ea) & 0x8000) != 0: nr_calls, w_name = self._checkWrapperHeuristics(func_ea) if nr_calls == 1 and len(w_name) > 0: rval = False name_suffix = 0 while rval == False: if name_suffix > 40: print("[!] Potentially more than 50 wrappers for function %s, " \ "please report this IDB ;)" % w_name) break demangled_name = self.ida_proxy.Demangle(w_name, self.ida_proxy.GetLongPrm(self.ida_proxy.INF_SHORT_DN)) if demangled_name != None and demangled_name != w_name: f_name = w_name + '_w' + str(name_suffix) elif name_suffix > 0: f_name = w_name + '_w' + str(name_suffix) else: f_name = w_name + '_w0' name_suffix += 1 rval = self.ida_proxy.MakeNameEx(func_ea, f_name, \ self.ida_proxy.SN_NOCHECK | self.ida_proxy.SN_NOWARN) if rval == True: print("[+] Identified and renamed potential wrapper @ [%08x] to [%s]" % \ (func_ea, f_name)) num_wrappers_renamed += 1 print("[+] Renamed %d functions with their potentially wrapped name." % num_wrappers_renamed) def _checkWrapperHeuristics(self, func_ea): """ Helps renamePotentialWrapperFunctions() to decide whether the function analyzed is a wrapper or not. """ nr_calls = 0 w_name = "" # Heuristic: wrappers are likely short func_end = self.ida_proxy.GetFunctionAttr(func_ea, self.ida_proxy.FUNCATTR_END) if (func_end - func_ea) > 0 and (func_end - func_ea) < 0x40: return (0, "") # Heuristic: wrappers shall only have a single reference, ideally to a library function. for i_ea in self.ida_proxy.FuncItems(func_ea): # long jumps don't occur in wrappers considered by this code. if self.ida_proxy.GetMnem(i_ea) == 'jmp' \ and (func_ea > self.ida_proxy.GetOperandValue(i_ea,0) \ or func_end < self.ida_proxy.GetOperandValue(i_ea,0)): nr_calls += 2 # checks if call is not memory reference if self.ida_proxy.GetMnem(i_ea) == 'call': nr_calls += 1 if self.ida_proxy.GetOpType(i_ea,0) != 2 \ and self.ida_proxy.GetOpType(i_ea,0) != 6 \ and self.ida_proxy.GetOpType(i_ea,0) != 7: nr_calls += 2 if nr_calls > 1: break call_dst = list(self.ida_proxy.CodeRefsFrom(i_ea, 0)) if len(call_dst) == 0: continue call_dst = call_dst[0] if (self.ida_proxy.GetFunctionFlags(call_dst) & self.ida_proxy.FUNC_LIB) != 0 or \ (self.ida_proxy.GetFlags(func_ea) & self.ida_proxy.FF_LABL) == 0: w_name = self.ida_proxy.Name(call_dst) return (nr_calls, w_name) def getParametersForCallAddress(self, call_address): """ Get the parameters for the given address of a function call. @param call_address: address of the target call to inspect @type call_address: int @return: a list of ParameterContext data objects. """ target_function_address = self.ida_proxy.LocByName(self.ida_proxy.GetFunctionName(call_address)) all_tagged_apis_in_function = self.getTaggedApisForFunctionAddress(target_function_address) for api in all_tagged_apis_in_function: if api.address_of_call == call_address: return self._resolveApiCall(api) return [] def _resolveApiCall(self, call_context): """ Resolve the parameters for an API calls based on a call context for this API call. @param call_context: the call context to get the parameter information for @type call_context: a CallContext data object @return: a list of ParameterContext data objects. """ resolved_api_parameters = [] api_signature = self._getApiSignature(call_context.real_called_function_name) push_addresses = self._getPushAddressesBeforeTargetAddress(call_context.address_of_call) resolved_api_parameters = self._matchPushAddressesToSignature(push_addresses, api_signature) return resolved_api_parameters def _matchPushAddressesToSignature(self, push_addresses, api_signature): """ Combine the results of I{_getPushAddressesBeforeTargetAddress} and I{_getApiSignature} in order to produce a list of ParameterContext data objects. @param push_addresses: the identified push addresses before a function call that shall be matched to a function signature @type push_addresses: a list of int @param api_signature: information about a function definition with parameter names, types, and so on. @type api_signature: a dictionary with the layout as returned by I{_getApiSignature} @return: a list of ParameterContext data objects. """ matched_parameters = [] # TODO: # upgrade this feature with data flow analysis to resolve parameters with higher precision api_num_params = len(api_signature["parameters"]) push_addresses = push_addresses[-api_num_params:] # TODO: # There might be the case where we identify less pushed parameters than required by the function # signature. Thus we calculate a "parameter discrepancy" that we use to adjust our enumeration index # so that the last n parameters get matched correctly. This is a temporary fix and might be solved later on. parameter_discrepancy = len(push_addresses) - api_num_params for index, param in enumerate(api_signature["parameters"], start=parameter_discrepancy): param_ctx = self.ParameterContext() param_ctx.parameter_type = param["type"] param_ctx.parameter_name = param["name"] if (parameter_discrepancy != 0) and (index < 0): param_ctx.valid = False else: param_ctx.push_address = push_addresses[index] param_ctx.ida_operand_type = self.ida_proxy.GetOpType(push_addresses[index], 0) param_ctx.ida_operand_value = self.ida_proxy.GetOperandValue(push_addresses[index], 0) param_ctx.value = param_ctx.ida_operand_value matched_parameters.append(param_ctx) return matched_parameters def _getApiSignature(self, api_name): """ Get the signature for a function by using IDA's I{GetType()}. The string is then parsed with a Regex and returned as a dictionary. @param api_name: name of the API / function to get type information for @type api_name: str @return: a dictionary with key/value entries of the following form: ("return_type", str), ("parameters", [dict(("type", str), ("name", str))]) """ api_signature = {"api_name": api_name, "parameters": []} api_location = self.ida_proxy.LocByName(api_name) type_def = self.ida_proxy.GetType(api_location) function_signature_regex = r"(?P<return_type>[\w\s\*]+)\((?P<parameters>[,\.\*\w\s]*)\)" result = self.re.match(function_signature_regex, type_def) if result is not None: api_signature["return_type"] = result.group("return_type") if len(result.group("parameters")) > 0: for parameter in result.group("parameters").split(","): type_and_name = {} type_and_name["type"] = parameter[:parameter.rfind(" ")].strip() type_and_name["name"] = parameter[parameter.rfind(" "):].strip() api_signature["parameters"].append(type_and_name) else: print ("[-] SemanticIdentifier._getApiSignature: No API/function signature for \"%s\" @ 0x%x available. " \ + "(non-critical)") % (api_name, api_location) # TODO: # here should be a check for the calling convention # currently, list list is simply reversed to match the order parameters are pushed to the stack api_signature["parameters"].reverse() return api_signature def _getPushAddressesBeforeTargetAddress(self, address): """ Get the addresses of all push instructions in the basic block preceding the given address. @param address: address to get the push addresses for. @type address: int @return: a list of int """ push_addresses = [] function_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(address)) for block in function_chart: if block.startEA <= address < block.endEA: for instruction_addr in self.ida_proxy.Heads(block.startEA, block.endEA): if self.ida_proxy.GetMnem(instruction_addr) == "push": push_addresses.append(instruction_addr) if instruction_addr >= address: break return push_addresses def createFunctionGraph(self, func_address): graph = {"root": func_address, "nodes": {}} unexplored = set() if func_address in self.last_scan_result.keys(): graph["nodes"][func_address] = self.last_scan_result[func_address].calls_from unexplored = set(self.last_scan_result[func_address].calls_from) while len(unexplored) > 0: current_function = unexplored.pop() if current_function in graph["nodes"].keys() or current_function not in self.last_scan_result.keys(): continue else: graph["nodes"][current_function] = self.last_scan_result[current_function].calls_from new_functions = \ set(self.last_scan_result[current_function].calls_from).difference(set(graph["nodes"].keys())) unexplored.update(new_functions) return graph def createFunctionContextFilter(self): """ Create a function filter, containing only those tags/groups that have been identified within the last scan. """ context_filter = self.FunctionContextFilter() context_filter.tags = sorted([(tag, tag, tag) for tag in self.getTags()]) context_filter.enabled_tags = context_filter.tags context_filter.groups = sorted([(group, group, group) for group in self.getGroups()]) context_filter.enabled_groups = context_filter.groups return context_filter def getLastScanResult(self): """ Get the last scan result as retrieved by I{scanByReferences}. @return: a dictionary with key/value entries of the following form: (function_address, FunctionContext) """ return self.last_scan_result def printLastScanResult(self): """ nicely print the last scan result (mostly used for debugging) """ for function_address in self.last_scan_result.keys(): print ("0x%x - %s -> ") % (function_address, self.ida_proxy.GetFunctionName(function_address)) \ + ", ".join(self.getTagsForFunctionAddress(function_address)) for call_ctx in self.last_scan_result[function_address].call_contexts: print (" 0x%x - %s (%s)") % (call_ctx.address_of_call, call_ctx.called_function_name, call_ctx.tag)
class CryptoIdentifier(): """ This class contains the logic to perform Crypto identification. Two techniques are currently supported: 1. A heuristic approach that identifies functions and basic blocks based on the ratio of arithmetic/logic instructions to all instructions 2. A signature-based approach, using the signatures defined in PatternManager """ def __init__(self): self.name = "CryptoIdentifier" print ("[*] loading CryptoIdentifier") self.time = time self.re = re self.GraphHelper = GraphHelper self.CryptoSignatureHit = CryptoSignatureHit self.AritlogBasicBlock = AritlogBasicBlock self.Segment = Segment self.pm = PatternManager() self.low_rating_threshold = 0.4 self.high_rating_threshold = 1.0 self.low_instruction_threshold = 8 self.high_instruction_threshold = 100 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_instruction_threshold = 100 self.low_call_threshold = 0 self.high_call_threshold = 1 # if the threshold is set to this value, it is automatically expanded to infinite. self.max_call_threshold = 10 # if at least this fraction of a signature's length' has been identified # consecutively, the location is marked as a signature hit. self.match_filter_factor = 0.5 self.aritlog_blocks = [] self.signature_hits = [] self.ida_proxy = IdaProxy() return def scan(self): """ Scan the whole IDB with all available techniques. """ self.scanAritlog() self.scanCryptoPatterns() ################################################################################ # Aritlog scanning ################################################################################ def scanAritlog(self): """ scan with the arithmetic/logic heuristic @return: a list of AritLogBasicBlock data objects that fulfill the parameters as specified """ print ("[*] CryptoIdentifier: Starting aritlog heuristic analysis.") self.aritlog_blocks = [] time_before = self.time.time() for function_ea in self.ida_proxy.Functions(): function_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(function_ea)) calls_in_function = 0 function_blocks = [] function_dgraph = {} blocks_in_loops = set() for current_block in function_chart: block = self.AritlogBasicBlock(current_block.startEA, current_block.endEA) for instruction in self.ida_proxy.Heads(block.start_ea, block.end_ea): if self.ida_proxy.isCode(self.ida_proxy.GetFlags(instruction)): mnemonic = self.ida_proxy.GetMnem(instruction) has_identical_operands = self.ida_proxy.GetOperandValue(instruction, 0) == \ self.ida_proxy.GetOperandValue(instruction, 1) block.updateInstructionCount(mnemonic, has_identical_operands) if mnemonic == "call": calls_in_function += 1 function_blocks.append(block) # prepare graph for Tarjan's algorithm succeeding_blocks = [succ.startEA for succ in current_block.succs()] function_dgraph[current_block.startEA] = succeeding_blocks # add trivial loops if current_block.startEA in succeeding_blocks: block.is_contained_in_trivial_loop = True blocks_in_loops.update([current_block.startEA]) # perform Tarjan's algorithm to identify strongly connected components (= loops) in the function graph graph_helper = self.GraphHelper() strongly_connected = graph_helper.calculateStronglyConnectedComponents(function_dgraph) non_trivial_loops = [component for component in strongly_connected if len(component) > 1] for component in non_trivial_loops: for block in component: blocks_in_loops.update([block]) for block in function_blocks: if block.start_ea in blocks_in_loops: block.is_contained_in_loop = True block.num_calls_in_function = calls_in_function self.aritlog_blocks.extend(function_blocks) print ("[*] Heuristics analysis took %3.2f seconds." % (self.time.time() - time_before)) return self.getAritlogBlocks(self.low_rating_threshold, self.high_rating_threshold, self.low_instruction_threshold, self.high_instruction_threshold, self.low_call_threshold, self.high_call_threshold, False, False, False) def _updateThresholds(self, min_rating, max_rating, min_instr, max_instr, min_call, max_call): """ update all six threshold bounds @param min_rating: the minimum arit/log ratio a basic block must have @type min_rating: float @param max_rating: the maximum arit/log ratio a basic block can have @type max_rating: float @param min_instr: the minimum number of instructions a basic block must have @type min_instr: int @param max_instr: the minimum number of instructions a basic block can have @type max_instr: int @param min_call: the minimum number of calls a basic block must have @type min_call: int @param max_call: the minimum number of calls a basic block can have @type max_call: int """ self.low_rating_threshold = max(0.0, min_rating) self.high_rating_threshold = min(1.0, max_rating) self.low_instruction_threshold = max(0, min_instr) if max_instr >= self.max_instruction_threshold: # we cap the value here and safely assume there is no block with more than 1000000 instructions self.high_instruction_threshold = 1000000 else: self.high_instruction_threshold = max_instr self.low_call_threshold = max(0, min_call) if max_call >= self.max_call_threshold: # we cap the value here and safely assume there is no block with more than 1000000 instructions self.high_call_threshold = 1000000 else: self.high_call_threshold = max_call def getAritlogBlocks(self, min_rating, max_rating, min_instr, max_instr, min_api, max_api, is_nonzero, \ is_looped, is_trivially_looped): """ get all blocks that are within the limits specified by the heuristic parameters. parameters are the same as in function "_updateThresholds" except param is_nonzero: defines whether zeroing instructions (like xor eax, eax) shall be counted or not. type is_nonzero: boolean param is_looped: defines whether only basic blocks in loops shall be selected type is_looped: boolean @return: a list of AritlogBasicBlock data objects, according to the parameters """ self._updateThresholds(min_rating, max_rating, min_instr, max_instr, min_api, max_api) return [block for block in self.aritlog_blocks if (self.high_rating_threshold >= block.getAritlogRating(is_nonzero) >= self.low_rating_threshold) and (self.high_instruction_threshold >= block.num_instructions >= self.low_instruction_threshold) and (self.high_call_threshold >= block.num_calls_in_function >= self.low_call_threshold) and (not is_looped or block.is_contained_in_loop) and (not is_trivially_looped or block.is_contained_in_trivial_loop)] def getUnfilteredBlockCount(self): """ returns the number of basic blocks that have been analyzed. @return: (int) number of basic blocks """ return len(self.aritlog_blocks) ################################################################################ # Signature scanning ################################################################################ def getSegmentData(self): """ returns the raw bytes of the segments as stored by IDA @return: a list of Segment data objects. """ segments = [] for segment_ea in self.ida_proxy.Segments(): try: segment = self.Segment() segment.start_ea = segment_ea segment.end_ea = self.ida_proxy.SegEnd(segment_ea) segment.name = self.ida_proxy.SegName(segment_ea) buf = "" for ea in helpers.Misc.lrange(segment_ea, self.ida_proxy.SegEnd(segment_ea)): buf += chr(self.ida_proxy.get_byte(ea)) segment.data = buf segments.append(segment) except: print ("[!] Tried to access invalid segment data. An error has occurred while address conversion") return segments def scanCryptoPatterns(self, pattern_size=32): crypt_results = [] print ("[*] CryptoIdentifier: Starting crypto signature scanning.") time_before_matching = self.time.time() segments = self.getSegmentData() keywords = self.pm.getTokenizedSignatures(pattern_size) for keyword in keywords.keys(): for segment in segments: crypt_results.extend([self.CryptoSignatureHit(segment.start_ea + match.start(), keywords[keyword], keyword) for match in self.re.finditer(self.re.escape(keyword), segment.data)]) variable_matches = self.scanVariablePatterns() crypt_results.extend(variable_matches) print ("[*] Full matching took %3.2f seconds and resulted in %d hits." % (self.time.time() - time_before_matching, len(crypt_results))) self.signature_hits = crypt_results return crypt_results def scanVariablePatterns(self): # the scanning code is roughly based on kyprizel's signature scan, see credtis above for more information crypt_results = [] variable_signatures = self.pm.getVariableSignatures() for var_sig in variable_signatures.keys(): current_seg = self.ida_proxy.FirstSeg() seg_end = self.ida_proxy.SegEnd(current_seg) while current_seg != self.ida_proxy.BAD_ADDR: signature_hit = self.ida_proxy.find_binary(current_seg, seg_end, variable_signatures[var_sig], 16, 1) if signature_hit != self.ida_proxy.BAD_ADDR: crypt_results.append(self.CryptoSignatureHit(signature_hit, [var_sig], variable_signatures[var_sig])) current_seg = signature_hit + variable_signatures[var_sig].count(" ") + 1 else: current_seg = self.ida_proxy.NextSeg(seg_end) if not current_seg == self.ida_proxy.BAD_ADDR: seg_end = self.ida_proxy.SegEnd(current_seg) return crypt_results def getSignatureLength(self, signature_name): """ returns the length for a signature, identified by its name @param signature_name: name for a signature, e.g. "ADLER 32" @type signature_name: str @return: (int) length of the signature. """ for item in self.pm.signatures.items(): if item[1] == signature_name: return len(item[0]) return 0 def getSignatureHits(self): """ Get all signature hits that have a length of at least match_filter_factor percent of the signature they triggered. Hits are grouped by signature names. @return: a dictionary with key/value entries of the following form: ("signature name", [CryptoSignatureHit]) """ sorted_hits = sorted(self.signature_hits) unified_hits = [] previous_signature_names = [] for hit in sorted_hits: hit_intersection = [element for element in hit.signature_names if element in previous_signature_names] if len(hit_intersection) == 0: previous_signature_names = hit.signature_names unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \ hit.matched_signature)) else: previous_signature_names = hit_intersection previous_hit = unified_hits[-1] if hit.start_address == previous_hit.start_address + len(previous_hit.matched_signature): previous_hit.matched_signature += hit.matched_signature previous_hit.signature_names = hit_intersection else: unified_hits.append(self.CryptoSignatureHit(hit.start_address, hit.signature_names, \ hit.matched_signature)) filtered_hits = [] for hit in unified_hits: if len(hit.matched_signature) >= max([self.match_filter_factor * self.getSignatureLength(name) for name in hit.signature_names]): hit.code_refs_to = self.getXrefsToAddress(hit.start_address) filtered_hits.append(hit) grouped_hits = {} for hit in filtered_hits: for name in hit.signature_names: if name not in grouped_hits: grouped_hits[name] = [hit] else: grouped_hits[name].append(hit) return grouped_hits def getXrefsToAddress(self, address): """ get all references to a certain address. These are no xrefs in IDA sense but references to the crypto signatures. If the signature points to an instruction, e.g. if a constant is moved to a register, the return is flagged as "True", meaning it is an in-code reference. @param address: an arbitrary address @type address: int @return: a list of tuples (int, boolean) """ xrefs = [] head_to_address = self.ida_proxy.PrevHead(address, address - 14) if head_to_address != 0xFFFFFFFF: flags = self.ida_proxy.GetFlags(head_to_address) if self.ida_proxy.isCode(flags): xrefs.append((head_to_address, True)) for x in self.ida_proxy.XrefsTo(address): flags = self.ida_proxy.GetFlags(x.frm) if self.ida_proxy.isCode(flags): xrefs.append((x.frm, False)) return xrefs
class DocumentationHelper(): """ This class handles instruction coloring. """ # data layout of color maps layout_color_map = {"tag": {"base_color": 0x112233, "highlight_color": 0x445566}} def __init__(self, idascope_config): print ("[|] loading DocumentationHelper") self.ida_proxy = IdaProxy() # default colors are grey / light red / red self.default_neutral_color = 0xCCCCCC self.default_base_color = 0xB3B3FF self.default_highlight_color = 0x3333FF self.color_state = "unknown" self.idascope_config = idascope_config self._loadConfig(self.idascope_config.semantics_file) return def _loadConfig(self, config_filename): """ Loads a semantic configuration file and generates a color map from the contained information. @param config_filename: filename of a semantic configuration file @type config_filename: str """ config_file = open(config_filename, "r") config = config_file.read() parsed_config = json.loads(config, object_hook=JsonHelper.decode_dict) self.default_neutral_color = int(parsed_config["default_neutral_color"], 16) self.default_base_color = int(parsed_config["default_base_color"], 16) self.default_highlight_color = int(parsed_config["default_highlight_color"], 16) self.color_map = self._generateColorMapFromDefinitions(parsed_config) return def _generateColorMapFromDefinitions(self, config): """ Internal function to generate a color map from a semantic definitions config file. @param definitions: the defintions part of a semantic definitions config file. @type definitions: dict @return: a dictionary of a color map, see I{layout_color_map} for a reference """ color_map = {} for definition in config["semantic_definitions"]: # convert text representation of color codes to numbers group_colors = self._getColorsForGroup(definition["group"], config) color_map[definition["tag"]] = {"base_color": int(group_colors[0], 16), \ "highlight_color": int(group_colors[1], 16)} return color_map def _getColorsForGroup(self, target_group, config): for group in config["semantic_groups"]: if group["tag"] == target_group: return (group["base_color"], group["highlight_color"]) print "[-] Failed to get colors for group \"%s\" - you might want to check your semantics file." % target_group return (self.default_base_color, self.default_highlight_color) def uncolorAll(self): """ Uncolors all instructions of all segments by changing their color to white. """ for seg_ea in self.ida_proxy.Segments(): for function_address in self.ida_proxy.Functions(self.ida_proxy.SegStart(seg_ea), \ self.ida_proxy.SegEnd(seg_ea)): for block in self.ida_proxy.FlowChart(self.ida_proxy.get_func(function_address)): for head in self.ida_proxy.Heads(block.startEA, block.endEA): self.colorInstruction(head, 0xFFFFFF, refresh=False) self.ida_proxy.refresh_idaview_anyway() def colorInstruction(self, address, color, refresh=True): """ Colors the instruction at an address with the given color code. @param address: address of the instruction to color @type address: int @param color: color-code to set for the instruction @type color: int (0xBBGGRR) @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance. @type refresh: boolean """ self.ida_proxy.SetColor(address, self.ida_proxy.CIC_ITEM, color) if refresh: self.ida_proxy.refresh_idaview_anyway() def colorBasicBlock(self, address, color, refresh=True): """ Colors the basic block containing a target address with the given color code. @param address: address an instruction in the basic block to color @type address: int @param color: color-code to set for the instruction @type color: int (0xBBGGRR) @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance. @type refresh: boolean """ function_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(address)) for block in function_chart: if block.startEA <= address < block.endEA: for head in self.ida_proxy.Heads(block.startEA, block.endEA): self.colorInstruction(head, color, refresh) def getNextColorScheme(self): """ get the next color scheme in the three-cycle "individual/mono/uncolored", where individual is semantic coloring @return: next state """ if self.color_state == "individual": return "mono" elif self.color_state == "mono": return "uncolored" elif self.color_state == "uncolored": return "individual" else: return "individual" def selectHighlightColor(self, tag): """ automatically chooses the highlight color for a tag based on the current color scheme @return: (int) a color code """ if self.getNextColorScheme() == "uncolored": return 0xFFFFFF elif self.getNextColorScheme() == "mono": return self.default_highlight_color else: return self.color_map[tag]["highlight_color"] def selectBaseColor(self, tagged_addresses_in_block): """ automatically chooses the base color for a block based on the current color scheme @param tagged_addresses_in_block: all tagged addresses in a basic block for which the color shall be chosen @type tagged_addresses_in_block: a list of tuples (int, str) containing pairs of instruction addresses and tags @return: (int) a color code """ if self.getNextColorScheme() == "uncolored": return 0xFFFFFF elif self.getNextColorScheme() == "mono": return self.default_base_color else: tags_in_block = [item[1] for item in tagged_addresses_in_block] colors_in_block = set([self.color_map[tags_in_block[index]]["base_color"] \ for index in xrange(len(tags_in_block))]) if len(colors_in_block) == 1: return colors_in_block.pop() else: return self.default_neutral_color def colorize(self, scan_result): """ perform coloring on the IDB, based on a scan performed by SemanticIdentifier @param scan_result: result of a scan as performed by SemanticIdentifier @type scan_result: a dictionary with key/value entries of the following form: (address, [FunctionContext]) """ for function_address in scan_result.keys(): tagged_addresses_in_function = scan_result[function_address].getAllTaggedAddresses() function_chart = self.ida_proxy.FlowChart(self.ida_proxy.get_func(function_address)) for basic_block in function_chart: tagged_addresses_in_block = [(addr, tagged_addresses_in_function[addr]) for addr in \ tagged_addresses_in_function.keys() if addr in xrange(basic_block.startEA, basic_block.endEA)] if len(tagged_addresses_in_block) > 0: base_color = self.selectBaseColor(tagged_addresses_in_block) self.colorBasicBlock(basic_block.startEA, base_color, refresh=False) for tagged_address in tagged_addresses_in_block: highlight_color = self.selectHighlightColor(tagged_address[1]) self.colorInstruction(tagged_address[0], highlight_color, refresh=False) self.color_state = self.getNextColorScheme() self.ida_proxy.refresh_idaview_anyway() def getNextNonFuncInstruction(self, addr): next_instruction = addr while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.ida_proxy.find_not_func(next_instruction, self.ida_proxy.SEARCH_DOWN) flags = self.ida_proxy.GetFlags(next_instruction) if self.ida_proxy.isCode(flags): return next_instruction return self.ida_proxy.BAD_ADDR def convertNonFunctionCode(self): self.convertAnyProloguesToFunctions() # do a second run to define the rest next_instruction = self.ida_proxy.minEA() while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.getNextNonFuncInstruction(next_instruction) print("[+] Fixed undefined code to function @ [%08x]" % \ (next_instruction)) self.ida_proxy.MakeFunction(next_instruction) return def convertAnyProloguesToFunctions(self): self.convertDataWithPrologueToCode() self.convertNonFunctionCodeWithPrologues() def convertNonFunctionCodeWithPrologues(self): next_instruction = self.ida_proxy.minEA() while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.getNextNonFuncInstruction(next_instruction) if self.ida_proxy.GetMnem(next_instruction).startswith("push") and \ self.ida_proxy.GetOpType(next_instruction, 0) == 1 and \ self.ida_proxy.GetOperandValue(next_instruction, 0) == 5: instruction_after_push = self.getNextNonFuncInstruction(next_instruction) if self.ida_proxy.GetMnem(instruction_after_push).startswith("mov") and \ self.ida_proxy.GetOpType(instruction_after_push, 0) == 1 and \ self.ida_proxy.GetOperandValue(instruction_after_push, 0) == 5 and \ self.ida_proxy.GetOpType(instruction_after_push, 1) == 1 and \ self.ida_proxy.GetOperandValue(instruction_after_push, 1) == 4: print("[+] Fixed undefined code with function prologue (push ebp; mov ebp, esp) to function " \ + "@ [%08x]" % (next_instruction)) self.ida_proxy.MakeFunction(next_instruction) def convertDataWithPrologueToCode(self): current_seg = self.ida_proxy.FirstSeg() seg_end = self.ida_proxy.SegEnd(current_seg) while current_seg != self.ida_proxy.BAD_ADDR: signature_hit = self.ida_proxy.find_binary(current_seg, seg_end, "55 8B EC", 16, 1) if signature_hit != self.ida_proxy.BAD_ADDR: flags = self.ida_proxy.GetFlags(signature_hit) if not self.ida_proxy.isCode(flags): self.ida_proxy.MakeFunction(signature_hit) print("[+] Fixed undefined data with potential function prologue (push ebp; mov ebp, esp) to function " \ + "@ [%08x]" % (signature_hit)) current_seg = signature_hit + 3 + 1 else: current_seg = self.ida_proxy.NextSeg(seg_end) if not current_seg == self.ida_proxy.BAD_ADDR: seg_end = self.ida_proxy.SegEnd(current_seg)
class DocumentationHelper(): """ This class handles instruction coloring. """ # data layout of color maps layout_color_map = { "tag": { "base_color": 0x112233, "highlight_color": 0x445566 } } def __init__(self, idascope_config): print("[|] loading DocumentationHelper") self.ida_proxy = IdaProxy() # default colors are grey / light red / red self.default_neutral_color = 0xCCCCCC self.default_base_color = 0xB3B3FF self.default_highlight_color = 0x3333FF self.color_state = "unknown" self.idascope_config = idascope_config self._loadConfig(self.idascope_config.semantics_file) return def _loadConfig(self, config_filename): """ Loads a semantic configuration file and generates a color map from the contained information. @param config_filename: filename of a semantic configuration file @type config_filename: str """ config_file = open(config_filename, "r") config = config_file.read() parsed_config = json.loads(config, object_hook=JsonHelper.decode_dict) self.default_neutral_color = int( parsed_config["default_neutral_color"], 16) self.default_base_color = int(parsed_config["default_base_color"], 16) self.default_highlight_color = int( parsed_config["default_highlight_color"], 16) self.color_map = self._generateColorMapFromDefinitions(parsed_config) return def _generateColorMapFromDefinitions(self, config): """ Internal function to generate a color map from a semantic definitions config file. @param definitions: the defintions part of a semantic definitions config file. @type definitions: dict @return: a dictionary of a color map, see I{layout_color_map} for a reference """ color_map = {} for definition in config["semantic_definitions"]: # convert text representation of color codes to numbers group_colors = self._getColorsForGroup(definition["group"], config) color_map[definition["tag"]] = {"base_color": int(group_colors[0], 16), \ "highlight_color": int(group_colors[1], 16)} return color_map def _getColorsForGroup(self, target_group, config): for group in config["semantic_groups"]: if group["tag"] == target_group: return (group["base_color"], group["highlight_color"]) print "[-] Failed to get colors for group \"%s\" - you might want to check your semantics file." % target_group return (self.default_base_color, self.default_highlight_color) def uncolorAll(self): """ Uncolors all instructions of all segments by changing their color to white. """ for seg_ea in self.ida_proxy.Segments(): for function_address in self.ida_proxy.Functions(self.ida_proxy.SegStart(seg_ea), \ self.ida_proxy.SegEnd(seg_ea)): for block in self.ida_proxy.FlowChart( self.ida_proxy.get_func(function_address)): for head in self.ida_proxy.Heads(block.startEA, block.endEA): self.colorInstruction(head, 0xFFFFFF, refresh=False) self.ida_proxy.refresh_idaview_anyway() def colorInstruction(self, address, color, refresh=True): """ Colors the instruction at an address with the given color code. @param address: address of the instruction to color @type address: int @param color: color-code to set for the instruction @type color: int (0xBBGGRR) @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance. @type refresh: boolean """ self.ida_proxy.SetColor(address, self.ida_proxy.CIC_ITEM, color) if refresh: self.ida_proxy.refresh_idaview_anyway() def colorBasicBlock(self, address, color, refresh=True): """ Colors the basic block containing a target address with the given color code. @param address: address an instruction in the basic block to color @type address: int @param color: color-code to set for the instruction @type color: int (0xBBGGRR) @param refresh: refresh IDA view to ensure the color shows directly, can be omitted for performance. @type refresh: boolean """ function_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(address)) for block in function_chart: if block.startEA <= address < block.endEA: for head in self.ida_proxy.Heads(block.startEA, block.endEA): self.colorInstruction(head, color, refresh) def getNextColorScheme(self): """ get the next color scheme in the three-cycle "individual/mono/uncolored", where individual is semantic coloring @return: next state """ if self.color_state == "individual": return "mono" elif self.color_state == "mono": return "uncolored" elif self.color_state == "uncolored": return "individual" else: return "individual" def selectHighlightColor(self, tag): """ automatically chooses the highlight color for a tag based on the current color scheme @return: (int) a color code """ if self.getNextColorScheme() == "uncolored": return 0xFFFFFF elif self.getNextColorScheme() == "mono": return self.default_highlight_color else: return self.color_map[tag]["highlight_color"] def selectBaseColor(self, tagged_addresses_in_block): """ automatically chooses the base color for a block based on the current color scheme @param tagged_addresses_in_block: all tagged addresses in a basic block for which the color shall be chosen @type tagged_addresses_in_block: a list of tuples (int, str) containing pairs of instruction addresses and tags @return: (int) a color code """ if self.getNextColorScheme() == "uncolored": return 0xFFFFFF elif self.getNextColorScheme() == "mono": return self.default_base_color else: tags_in_block = [item[1] for item in tagged_addresses_in_block] colors_in_block = set([self.color_map[tags_in_block[index]]["base_color"] \ for index in xrange(len(tags_in_block))]) if len(colors_in_block) == 1: return colors_in_block.pop() else: return self.default_neutral_color def colorize(self, scan_result): """ perform coloring on the IDB, based on a scan performed by SemanticIdentifier @param scan_result: result of a scan as performed by SemanticIdentifier @type scan_result: a dictionary with key/value entries of the following form: (address, [FunctionContext]) """ for function_address in scan_result.keys(): tagged_addresses_in_function = scan_result[ function_address].getAllTaggedAddresses() function_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(function_address)) for basic_block in function_chart: tagged_addresses_in_block = [(addr, tagged_addresses_in_function[addr]) for addr in \ tagged_addresses_in_function.keys() if addr in xrange(basic_block.startEA, basic_block.endEA)] if len(tagged_addresses_in_block) > 0: base_color = self.selectBaseColor( tagged_addresses_in_block) self.colorBasicBlock(basic_block.startEA, base_color, refresh=False) for tagged_address in tagged_addresses_in_block: highlight_color = self.selectHighlightColor( tagged_address[1]) self.colorInstruction(tagged_address[0], highlight_color, refresh=False) self.color_state = self.getNextColorScheme() self.ida_proxy.refresh_idaview_anyway() def getNextNonFuncInstruction(self, addr): next_instruction = addr while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.ida_proxy.find_not_func( next_instruction, self.ida_proxy.SEARCH_DOWN) flags = self.ida_proxy.GetFlags(next_instruction) if self.ida_proxy.isCode(flags): return next_instruction return self.ida_proxy.BAD_ADDR def convertNonFunctionCode(self): self.convertAnyProloguesToFunctions() # do a second run to define the rest next_instruction = self.ida_proxy.minEA() while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.getNextNonFuncInstruction(next_instruction) print("[+] Fixed undefined code to function @ [%08x]" % \ (next_instruction)) self.ida_proxy.MakeFunction(next_instruction) return def convertAnyProloguesToFunctions(self): self.convertDataWithPrologueToCode() self.convertNonFunctionCodeWithPrologues() def convertNonFunctionCodeWithPrologues(self): next_instruction = self.ida_proxy.minEA() while next_instruction != self.ida_proxy.BAD_ADDR: next_instruction = self.getNextNonFuncInstruction(next_instruction) if self.ida_proxy.GetMnem(next_instruction).startswith("push") and \ self.ida_proxy.GetOpType(next_instruction, 0) == 1 and \ self.ida_proxy.GetOperandValue(next_instruction, 0) == 5: instruction_after_push = self.getNextNonFuncInstruction( next_instruction) if self.ida_proxy.GetMnem(instruction_after_push).startswith("mov") and \ self.ida_proxy.GetOpType(instruction_after_push, 0) == 1 and \ self.ida_proxy.GetOperandValue(instruction_after_push, 0) == 5 and \ self.ida_proxy.GetOpType(instruction_after_push, 1) == 1 and \ self.ida_proxy.GetOperandValue(instruction_after_push, 1) == 4: print("[+] Fixed undefined code with function prologue (push ebp; mov ebp, esp) to function " \ + "@ [%08x]" % (next_instruction)) self.ida_proxy.MakeFunction(next_instruction) def convertDataWithPrologueToCode(self): current_seg = self.ida_proxy.FirstSeg() seg_end = self.ida_proxy.SegEnd(current_seg) while current_seg != self.ida_proxy.BAD_ADDR: signature_hit = self.ida_proxy.find_binary(current_seg, seg_end, "55 8B EC", 16, 1) if signature_hit != self.ida_proxy.BAD_ADDR: flags = self.ida_proxy.GetFlags(signature_hit) if not self.ida_proxy.isCode(flags): self.ida_proxy.MakeFunction(signature_hit) print("[+] Fixed undefined data with potential function prologue (push ebp; mov ebp, esp) to function " \ + "@ [%08x]" % (signature_hit)) current_seg = signature_hit + 3 + 1 else: current_seg = self.ida_proxy.NextSeg(seg_end) if not current_seg == self.ida_proxy.BAD_ADDR: seg_end = self.ida_proxy.SegEnd(current_seg)
class SemanticIdentifier(): """ A module to analyze and explore an IDB for semantics. For a set of API names, references to these are identified and used for creating context and allowing tagging of them. """ def __init__(self, idascope_config): print("[|] loading SemanticIdentifier") self.os = os self.re = re self.time = time self.ida_proxy = IdaProxy() self.FunctionContext = FunctionContext self.FunctionContextFilter = FunctionContextFilter self.CallContext = CallContext self.ParameterContext = ParameterContext # fields self.semantics = {} self.active_semantics = {} self.renaming_seperator = "_" self.semantic_groups = [] self.semantic_definitions = [] self.real_api_names = {} self.last_scan_result = {} self.idascope_config = idascope_config self._getRealApiNames() self._loadSemantics(self.idascope_config) return def _cbEnumImports(self, addr, name, ordinal): if name: self.real_api_names[name] = self.ida_proxy.Name(addr) return True def _getRealApiNames(self): num_imports = self.ida_proxy.get_import_module_qty() for i in xrange(0, num_imports): self.ida_proxy.enum_import_names(i, self._cbEnumImports) def lookupRealApiName(self, api_name): if api_name in self.real_api_names: return self.real_api_names[api_name] else: return api_name def lookupDisplayApiName(self, real_api_name): """ returns the key by given value of self.real_api_names (basically inverted dictionary) """ name = real_api_name for display_name in self.real_api_names: if real_api_name == self.real_api_names[display_name] \ and display_name in self.real_api_names[display_name]: name = display_name return name def _loadSemantics(self, config): """ Loads a semantic configuration file and collects all definitions from it. @param config_filename: filename of a semantic configuration file @type config_filename: str """ for filename in [ fn for fn in self.os.listdir(config.semantics_folder) if fn.endswith(".json") ]: loaded_file = self._loadSemanticsFile(config.semantics_folder + self.os.sep + filename) self.semantics[loaded_file["name"]] = loaded_file if config.inspection_default_semantics in self.semantics: self._setSemantics(config.inspection_default_semantics) elif len(self.semantics) > 0: self._setSemantics(sorted(self.semantics.keys())[0]) else: self._setSemantics("") return def _loadSemanticsFile(self, semantics_filename): """ Loads a semantic configuration file and collects all definitions from it. @param config_filename: filename of a semantic configuration file @type config_filename: str """ semantics_file = open(semantics_filename, "r") semantics = semantics_file.read() return json.loads(semantics, object_hook=JsonHelper.decode_dict) def _setSemantics(self, semantics_entry): semantics_content = {} if semantics_entry in self.semantics: semantics_content = self.semantics[semantics_entry] self.renaming_seperator = semantics_content["renaming_seperator"] self.semantic_groups = semantics_content["semantic_groups"] self.semantic_definitions = semantics_content[ "semantic_definitions"] self.active_semantics = semantics_content else: self.renaming_seperator = "_" self.semantic_groups = [] self.semantic_definitions = [] self.active_semantics = {"name": "none"} self.scanByReferences() def getSemanticsNames(self): return sorted(self.semantics.keys()) def getActiveSemanticsName(self): return self.active_semantics["name"] def calculateNumberOfBasicBlocksForFunctionAddress(self, function_address): """ Calculates the number of basic blocks for a given function by walking its FlowChart. @param function_address: function address to calculate the block count for @type function_address: int """ number_of_blocks = 0 try: func_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(function_address)) for block in func_chart: number_of_blocks += 1 except: pass return number_of_blocks def getNumberOfBasicBlocksForFunctionAddress(self, address): """ returns the number of basic blocks for the function containing the queried address, based on the value stored in the last scan result. If the number of basic blocks for this function has never been calculated, zero is returned. @param function_address: function address to get the block count for @type function_address: int @return: (int) The number of blocks in th e function """ number_of_blocks = 0 function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): number_of_blocks = self.last_scan_result[ function_address].number_of_basic_blocks return number_of_blocks def scan(self): """ Scan the whole IDB with all available techniques. """ self.scanByReferences() self.scanDeep() def scanByReferences(self): """ Scan by references to API names, based on the definitions loaded from the config file. This is highly efficient because we only touch places in the IDB that actually have references to our API names of interest. """ print( " [/] SemanticIdentifier: Starting (fast) scan by references of function semantics." ) time_before = self.time.time() self.last_scan_result = {} for semantic_tag in self.semantic_definitions: for api_name in semantic_tag["api_names"]: real_api_name = self.lookupRealApiName(api_name) api_address = self.ida_proxy.LocByName(real_api_name) for ref in self._getAllRefsTo(api_address): function_ctx = self._getFunctionContext(ref) function_ctx.has_tags = True call_ctx = self.CallContext() call_ctx.called_function_name = api_name call_ctx.real_called_function_name = real_api_name call_ctx.address_of_call = ref call_ctx.called_address = api_address call_ctx.tag = semantic_tag["tag"] call_ctx.group = semantic_tag["group"] call_ctx.parameter_contexts = self._resolveApiCall( call_ctx) function_ctx.call_contexts.append(call_ctx) print(" [\\] Analysis took %3.2f seconds." % (self.time.time() - time_before)) def _getAllRefsTo(self, addr): code_ref_addrs = [ref for ref in self.ida_proxy.CodeRefsTo(addr, 0)] data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsTo(addr)] return iter(set(code_ref_addrs).union(set(data_ref_addrs))) def _getNumRefsTo(self, addr): return sum([1 for ref in self._getAllRefsTo(addr)]) def _getAllRefsFrom(self, addr, code_only=False): code_ref_addrs = [ref for ref in self.ida_proxy.CodeRefsFrom(addr, 0)] data_ref_addrs = [] if code_only: # only consider data references that lead to a call near/far (likely imports) data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsFrom(addr) if \ self.ida_proxy.GetFlags(ref) & (self.ida_proxy.FL_CN | self.ida_proxy.FL_CF)] else: data_ref_addrs = [ref for ref in self.ida_proxy.DataRefsFrom(addr)] return iter(set(code_ref_addrs).union(set(data_ref_addrs))) def _getFunctionContext(self, addr): """ Create or return an existing FunctionContext for the given address in the current scan result. @param func_addr: address to create a FunctionContext for @type func_addr: int @return: (FunctionContext) A reference to the corresponding function context """ function_ctx = None function_address = self.ida_proxy.LocByName( self.ida_proxy.GetFunctionName(addr)) if function_address not in self.last_scan_result.keys(): function_ctx = self.FunctionContext() function_ctx.function_address = function_address function_ctx.function_name = self.ida_proxy.GetFunctionName( function_address) function_ctx.has_dummy_name = (self.ida_proxy.GetFlags(function_address) & \ self.ida_proxy.FF_LABL) > 0 self.last_scan_result[function_ctx.function_address] = function_ctx else: function_ctx = self.last_scan_result[function_address] return function_ctx def scanDeep(self): """ Perform a full enumeration of all instructions, gathering information like number of instructions, number of basic blocks, references to and from functions etc. """ print( " [/] SemanticIdentifier: Starting deep scan of function semantics." ) time_before = self.time.time() for function_ea in self.ida_proxy.Functions(): function_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(function_ea)) num_blocks = 0 num_instructions = 0 xrefs_from = [] calls_from = [] function_ctx = self._getFunctionContext(function_ea) for block in function_chart: num_blocks += 1 for instruction in self.ida_proxy.Heads( block.startEA, block.endEA): num_instructions += 1 if self.ida_proxy.isCode( self.ida_proxy.GetFlags(instruction)): for ref in self._getAllRefsFrom(instruction): if self.ida_proxy.GetMnem(instruction) == "call": calls_from.append(ref) xrefs_from.append(ref) function_ctx.calls_from.update(calls_from) function_ctx.number_of_xrefs_to = self._getNumRefsTo(function_ea) function_ctx.xrefs_from.update(xrefs_from) function_ctx.number_of_xrefs_from = len(xrefs_from) function_ctx.number_of_basic_blocks = num_blocks function_ctx.number_of_instructions = num_instructions print(" [\\] Analysis took %3.2f seconds." % (self.time.time() - time_before)) def getFunctionAddressForAddress(self, address): """ Get a function address containing the queried address. @param address: address to check the function address for @type address: int @return: (int) The start address of the function containing this address """ return self.ida_proxy.LocByName( self.ida_proxy.GetFunctionName(address)) def calculateNumberOfFunctions(self): """ Calculate the number of functions in all segments. @return: (int) the number of functions found. """ number_of_functions = 0 for seg_ea in self.ida_proxy.Segments(): for function_ea in self.ida_proxy.Functions( self.ida_proxy.SegStart(seg_ea), self.ida_proxy.SegEnd(seg_ea)): number_of_functions += 1 return number_of_functions def calculateNumberOfTaggedFunctions(self): """ Calculate the number of functions in all segments that have been tagged. @return: (int) the number of functions found. """ return len( self.getFunctionAddresses(self.createFunctionContextFilter())) def getFunctionAddresses(self, context_filter): """ Get all function address that have been covered by the last scanning. @param dummy_only: only return functions with dummy names @type dummy_only: bool @param tag_only: only return tag functions @type tag_only: bool @return: (list of int) The addresses of covered functions. """ all_addresses = self.last_scan_result.keys() filtered_addresses = [] if context_filter.display_all: filtered_addresses = all_addresses elif context_filter.display_tags: for address in all_addresses: enabled_tags = [tag[0] for tag in context_filter.enabled_tags] if len( set(self.last_scan_result[address].getTags()) & set(enabled_tags)) > 0: filtered_addresses.append(address) elif context_filter.display_groups: for address in all_addresses: enabled_groups = [ group[0] for group in context_filter.enabled_groups ] if len( set(self.last_scan_result[address].getGroups()) & set(enabled_groups)) > 0: filtered_addresses.append(address) # filter additionals if context_filter.isDisplayTagOnly(): filtered_addresses = [ addr for addr in filtered_addresses if self.last_scan_result[addr].has_tags ] if context_filter.isDisplayDummyOnly(): filtered_addresses = [ addr for addr in filtered_addresses if self.last_scan_result[addr].has_dummy_name ] return filtered_addresses def getTags(self): """ Get all the tags that have been covered by the last scanning. @return (list of str) The tags found. """ tags = [] for function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[ function_address].call_contexts: if call_ctx.tag not in tags: tags.append(call_ctx.tag) return tags def getGroups(self): """ Get all the groups that have been covered by tags in the last scanning. @return (list of str) The groups found. """ tag_to_group_mapping = self._createTagToGroupMapping() groups = [] for function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[ function_address].call_contexts: if tag_to_group_mapping[call_ctx.tag] not in groups: groups.append(tag_to_group_mapping[call_ctx.tag]) return groups def _createTagToGroupMapping(self): mapping = {} for definition in self.semantic_definitions: mapping[definition["tag"]] = definition["group"] return mapping def getTagsForFunctionAddress(self, address): """ Get all tags found for the function containing the queried address. @param address: address in the target function @type address: int @return: (list of str) The tags for the function containing the queried address """ tags = [] function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): for call_ctx in self.last_scan_result[ function_address].call_contexts: if call_ctx.tag not in tags: tags.append(call_ctx.tag) return tags def getFieldCountForFunctionAddress(self, query, address): """ Get the number of occurrences for a certain field for the function containing the queried address. @param query: a tuple (type, name), where type is additional, tag, or group and name the field being queried. @type query: tuple @param address: address in the target function @type address: int @return: (int) The number of occurrences for this tag in the function """ function_address = self.getFunctionAddressForAddress(address) return self.last_scan_result[function_address].getCountForField(query) def getTaggedApisForFunctionAddress(self, address): """ Get all call contexts for the function containing the queried address. @param address: address in the target function @type address: int @return: (list of CallContext data objects) The call contexts identified by the scanning of this function """ function_address = self.getFunctionAddressForAddress(address) if function_address in self.last_scan_result.keys(): all_call_ctx = self.last_scan_result[ function_address].call_contexts return [ call_ctx for call_ctx in all_call_ctx if call_ctx.tag != "" ] def getAddressTagPairsOrderedByFunction(self): """ Get all call contexts for all functions @return: a dictionary with key/value entries of the following form: (function_address, dict((call_address, tag))) """ functions_and_tags = {} for function in self.getIdentifiedFunctionAddresses(): call_contexts = self.getTaggedApisForFunctionAddress(function) if function not in functions_and_tags.keys(): functions_and_tags[function] = {} for call_ctx in call_contexts: functions_and_tags[function][ call_ctx.address_of_call] = call_ctx.tag return functions_and_tags def getFunctionsToRename(self): """ Get all functions that can be renamed according to the last scan result. Only functions with the standard IDA name I{sub_[0-9A-F]+} will be considered for renaming. @return: a list of dictionaries, each consisting of three tuples: ("old_function_name", str), \ ("new_function_name", str), ("function_address", int) """ functions_to_rename = [] for function_address_to_tag in self.last_scan_result.keys(): new_function_name = self.last_scan_result[ function_address_to_tag].function_name # has the function still a dummy name? if self.ida_proxy.GetFlags( function_address_to_tag) & self.ida_proxy.FF_LABL > 0: tags_for_function = self.getTagsForFunctionAddress( function_address_to_tag) for tag in sorted(tags_for_function, reverse=True): if tag not in new_function_name: new_function_name = tag + self.renaming_seperator + new_function_name functions_to_rename.append({"old_function_name": \ self.last_scan_result[function_address_to_tag].function_name, "new_function_name": \ new_function_name, "function_address": function_address_to_tag}) return functions_to_rename def renameFunctions(self): """ Perform the renaming of functions according to the last scan result. """ for function in self.getFunctionsToRename(): if function["old_function_name"] == self.ida_proxy.GetFunctionName( function["function_address"]): self.ida_proxy.MakeNameEx(function["function_address"], function["new_function_name"], \ self.ida_proxy.SN_NOWARN) def renamePotentialWrapperFunctions(self): """ contributed by Branko Spasojevic. """ num_wrappers_renamed = 0 for seg_ea in self.ida_proxy.Segments(): for func_ea in self.ida_proxy.Functions( self.ida_proxy.SegStart(seg_ea), self.ida_proxy.SegEnd(seg_ea)): if (self.ida_proxy.GetFlags(func_ea) & 0x8000) != 0: nr_calls, w_name = self._checkWrapperHeuristics(func_ea) if nr_calls == 1 and len(w_name) > 0: rval = False name_suffix = 0 while rval == False: if name_suffix > 40: print("[!] Potentially more than 50 wrappers for function %s, " \ "please report this IDB ;)" % w_name) break demangled_name = self.ida_proxy.Demangle( w_name, self.ida_proxy.GetLongPrm( self.ida_proxy.INF_SHORT_DN)) if demangled_name != None and demangled_name != w_name: f_name = w_name + '_w' + str(name_suffix) elif name_suffix > 0: f_name = w_name + '_w' + str(name_suffix) else: f_name = w_name + '_w0' name_suffix += 1 rval = self.ida_proxy.MakeNameEx(func_ea, f_name, \ self.ida_proxy.SN_NOCHECK | self.ida_proxy.SN_NOWARN) if rval == True: print("[+] Identified and renamed potential wrapper @ [%08x] to [%s]" % \ (func_ea, f_name)) num_wrappers_renamed += 1 print("[+] Renamed %d functions with their potentially wrapped name." % num_wrappers_renamed) def _checkWrapperHeuristics(self, func_ea): """ Helps renamePotentialWrapperFunctions() to decide whether the function analyzed is a wrapper or not. """ nr_calls = 0 w_name = "" # Heuristic: wrappers are likely short func_end = self.ida_proxy.GetFunctionAttr(func_ea, self.ida_proxy.FUNCATTR_END) if (func_end - func_ea) > 0 and (func_end - func_ea) < 0x40: return (0, "") # Heuristic: wrappers shall only have a single reference, ideally to a library function. for i_ea in self.ida_proxy.FuncItems(func_ea): # long jumps don't occur in wrappers considered by this code. if self.ida_proxy.GetMnem(i_ea) == 'jmp' \ and (func_ea > self.ida_proxy.GetOperandValue(i_ea,0) \ or func_end < self.ida_proxy.GetOperandValue(i_ea,0)): nr_calls += 2 # checks if call is not memory reference if self.ida_proxy.GetMnem(i_ea) == 'call': nr_calls += 1 if self.ida_proxy.GetOpType(i_ea,0) != 2 \ and self.ida_proxy.GetOpType(i_ea,0) != 6 \ and self.ida_proxy.GetOpType(i_ea,0) != 7: nr_calls += 2 if nr_calls > 1: break call_dst = list(self.ida_proxy.CodeRefsFrom(i_ea, 0)) if len(call_dst) == 0: continue call_dst = call_dst[0] if (self.ida_proxy.GetFunctionFlags(call_dst) & self.ida_proxy.FUNC_LIB) != 0 or \ (self.ida_proxy.GetFlags(func_ea) & self.ida_proxy.FF_LABL) == 0: w_name = self.ida_proxy.Name(call_dst) return (nr_calls, w_name) def getParametersForCallAddress(self, call_address): """ Get the parameters for the given address of a function call. @param call_address: address of the target call to inspect @type call_address: int @return: a list of ParameterContext data objects. """ target_function_address = self.ida_proxy.LocByName( self.ida_proxy.GetFunctionName(call_address)) all_tagged_apis_in_function = self.getTaggedApisForFunctionAddress( target_function_address) for api in all_tagged_apis_in_function: if api.address_of_call == call_address: return self._resolveApiCall(api) return [] def _resolveApiCall(self, call_context): """ Resolve the parameters for an API calls based on a call context for this API call. @param call_context: the call context to get the parameter information for @type call_context: a CallContext data object @return: a list of ParameterContext data objects. """ resolved_api_parameters = [] api_signature = self._getApiSignature( call_context.real_called_function_name) push_addresses = self._getPushAddressesBeforeTargetAddress( call_context.address_of_call) resolved_api_parameters = self._matchPushAddressesToSignature( push_addresses, api_signature) return resolved_api_parameters def _matchPushAddressesToSignature(self, push_addresses, api_signature): """ Combine the results of I{_getPushAddressesBeforeTargetAddress} and I{_getApiSignature} in order to produce a list of ParameterContext data objects. @param push_addresses: the identified push addresses before a function call that shall be matched to a function signature @type push_addresses: a list of int @param api_signature: information about a function definition with parameter names, types, and so on. @type api_signature: a dictionary with the layout as returned by I{_getApiSignature} @return: a list of ParameterContext data objects. """ matched_parameters = [] # TODO: # upgrade this feature with data flow analysis to resolve parameters with higher precision api_num_params = len(api_signature["parameters"]) push_addresses = push_addresses[-api_num_params:] # TODO: # There might be the case where we identify less pushed parameters than required by the function # signature. Thus we calculate a "parameter discrepancy" that we use to adjust our enumeration index # so that the last n parameters get matched correctly. This is a temporary fix and might be solved later on. parameter_discrepancy = len(push_addresses) - api_num_params for index, param in enumerate(api_signature["parameters"], start=parameter_discrepancy): param_ctx = self.ParameterContext() param_ctx.parameter_type = param["type"] param_ctx.parameter_name = param["name"] if (parameter_discrepancy != 0) and (index < 0): param_ctx.valid = False else: param_ctx.push_address = push_addresses[index] param_ctx.ida_operand_type = self.ida_proxy.GetOpType( push_addresses[index], 0) param_ctx.ida_operand_value = self.ida_proxy.GetOperandValue( push_addresses[index], 0) param_ctx.value = param_ctx.ida_operand_value matched_parameters.append(param_ctx) return matched_parameters def _getApiSignature(self, api_name): """ Get the signature for a function by using IDA's I{GetType()}. The string is then parsed with a Regex and returned as a dictionary. @param api_name: name of the API / function to get type information for @type api_name: str @return: a dictionary with key/value entries of the following form: ("return_type", str), ("parameters", [dict(("type", str), ("name", str))]) """ api_signature = {"api_name": api_name, "parameters": []} api_location = self.ida_proxy.LocByName(api_name) type_def = self.ida_proxy.GetType(api_location) function_signature_regex = r"(?P<return_type>[\w\s\*]+)\((?P<parameters>[,\.\*\w\s]*)\)" result = self.re.match(function_signature_regex, type_def) if result is not None: api_signature["return_type"] = result.group("return_type") if len(result.group("parameters")) > 0: for parameter in result.group("parameters").split(","): type_and_name = {} type_and_name["type"] = parameter[:parameter. rfind(" ")].strip() type_and_name["name"] = parameter[parameter. rfind(" "):].strip() api_signature["parameters"].append(type_and_name) else: print ("[-] SemanticIdentifier._getApiSignature: No API/function signature for \"%s\" @ 0x%x available. " \ + "(non-critical)") % (api_name, api_location) # TODO: # here should be a check for the calling convention # currently, list list is simply reversed to match the order parameters are pushed to the stack api_signature["parameters"].reverse() return api_signature def _getPushAddressesBeforeTargetAddress(self, address): """ Get the addresses of all push instructions in the basic block preceding the given address. @param address: address to get the push addresses for. @type address: int @return: a list of int """ push_addresses = [] function_chart = self.ida_proxy.FlowChart( self.ida_proxy.get_func(address)) for block in function_chart: if block.startEA <= address < block.endEA: for instruction_addr in self.ida_proxy.Heads( block.startEA, block.endEA): if self.ida_proxy.GetMnem(instruction_addr) == "push": push_addresses.append(instruction_addr) if instruction_addr >= address: break return push_addresses def createFunctionGraph(self, func_address): graph = {"root": func_address, "nodes": {}} unexplored = set() if func_address in self.last_scan_result.keys(): graph["nodes"][func_address] = self.last_scan_result[ func_address].calls_from unexplored = set(self.last_scan_result[func_address].calls_from) while len(unexplored) > 0: current_function = unexplored.pop() if current_function in graph["nodes"].keys( ) or current_function not in self.last_scan_result.keys(): continue else: graph["nodes"][current_function] = self.last_scan_result[ current_function].calls_from new_functions = \ set(self.last_scan_result[current_function].calls_from).difference(set(graph["nodes"].keys())) unexplored.update(new_functions) return graph def createFunctionContextFilter(self): """ Create a function filter, containing only those tags/groups that have been identified within the last scan. """ context_filter = self.FunctionContextFilter() context_filter.tags = sorted([(tag, tag, tag) for tag in self.getTags()]) context_filter.enabled_tags = context_filter.tags context_filter.groups = sorted([(group, group, group) for group in self.getGroups()]) context_filter.enabled_groups = context_filter.groups return context_filter def getLastScanResult(self): """ Get the last scan result as retrieved by I{scanByReferences}. @return: a dictionary with key/value entries of the following form: (function_address, FunctionContext) """ return self.last_scan_result def printLastScanResult(self): """ nicely print the last scan result (mostly used for debugging) """ for function_address in self.last_scan_result.keys(): print ("0x%x - %s -> ") % (function_address, self.ida_proxy.GetFunctionName(function_address)) \ + ", ".join(self.getTagsForFunctionAddress(function_address)) for call_ctx in self.last_scan_result[ function_address].call_contexts: print(" 0x%x - %s (%s)") % (call_ctx.address_of_call, call_ctx.called_function_name, call_ctx.tag)
class YaraScanner(): """ A module to analyze and explore an IDB for semantics. For a set of API names, references to these are identified and used for creating context and allowing tagging of them. """ def __init__(self, idascope_config): print("[|] loading YaraScanner") self.os = os self.re = re self.time = time self.yara = yara self.YaraRule = YaraRule self.ida_proxy = IdaProxy() self.yrl = YaraRuleLoader() # fields self.idascope_config = idascope_config self.num_files_loaded = 0 self._compiled_rules = [] self._yara_rules = [] self._results = [] self.segment_offsets = [] def getResults(self): return self._results def load_rules(self): if not self.yara: return self.num_files_loaded = 0 self._compiled_rules = [] self._yara_rules = [] for yara_path in self.idascope_config.yara_sig_folders: self._load_recursive(yara_path) def _load_recursive(self, yara_path): if os.path.isfile(yara_path): self._load_file(yara_path) elif os.path.isdir(yara_path): for dirpath, dirnames, filenames in os.walk(yara_path): for filename in filenames: filepath = dirpath + os.sep + filename self._load_file(filepath) def _load_file(self, filepath): try: rules_from_file = self.yrl.loadRulesFromFile(filepath) for rule in rules_from_file: rule.checkRule() self._yara_rules.extend(rules_from_file) rules = yara.compile(filepath) self._compiled_rules.append(rules) print "loading rules from file: %s (%d)" % (filepath, len(rules_from_file)) if rules: self.num_files_loaded += 1 except: print "[!] Could not load yara rules from file: %s" % filepath def scan(self): if not self.yara: print "[!] yara-python not available, please install it from (http://plusvic.github.io/yara/)" return memory, offsets = self._get_memory() self.segment_offsets = offsets self._results = [] matches = [] print "[!] Performing YARA scan..." for rule in self._compiled_rules: matches.append( rule.match(data=memory, callback=self._result_callback)) if len(matches) == 0: print " [-] no matches. :(" def _get_memory(self): result = "" segment_starts = [ea for ea in self.ida_proxy.Segments()] offsets = [] start_len = 0 for start in segment_starts: end = self.ida_proxy.SegEnd(start) for ea in Misc.lrange(start, end): result += chr(self.ida_proxy.Byte(ea)) offsets.append((start, start_len, len(result))) start_len = len(result) return result, offsets def _result_callback(self, data): adjusted_offsets = [] for string in data["strings"]: adjusted_offsets.append( (self._translateMemOffsetToVirtualAddress(string[0]), string[1], string[2])) data["strings"] = adjusted_offsets if data["matches"]: print " [+] YARA Match for signature: %s" % data["rule"] result_rule = None for rule in self._yara_rules: if rule.rule_name == data["rule"]: result_rule = rule if not result_rule: result_rule = self.YaraRule() result_rule.match_data = data self._results.append(result_rule) self.yara.CALLBACK_CONTINUE def _translateMemOffsetToVirtualAddress(self, offset): va_offset = 0 for seg in self.segment_offsets: if seg[1] < offset < seg[2]: va_offset = seg[0] + (offset - seg[1]) return va_offset