def get_flowgraph_from(address):
    """
    Generates a flowgraph object that can be fed into FunctionSimSearch from a
    given address in IDA.

    address: an address inside the function to convert. A falsy value (None
      or 0) falls back to the current cursor position, here().

    Returns the populated functionsimsearch.FlowgraphWithInstructions. Every
    IDA basic block is split after each "call" instruction (via
    split_instruction_list), the pieces are chained with edges, and the last
    piece is connected to the IDA successors of the block.
    """
    # Honour the requested address; only fall back to the cursor when the
    # caller did not supply one.
    if not address:
        address = here()
    ida_flowgraph = idaapi.FlowChart(idaapi.get_func(address))
    flowgraph = functionsimsearch.FlowgraphWithInstructions()

    for block in ida_flowgraph:
        # Add all the ida-flowgraph-basic blocks. We do this up-front so we
        # can more easily add edges later, and adding a node twice does not
        # hurt.
        flowgraph.add_node(block.start_ea)

    for block in ida_flowgraph:
        # One (address, mnemonic, (operand0, operand1)) tuple per instruction.
        instructions = [
            (ea, GetMnem(ea), (print_operand(ea, 0), print_operand(ea, 1)))
            for ea in Heads(block.start_ea, block.end_ea)]

        if not instructions:
            # Without instructions there is no first-instruction address to
            # name a sub-block after, so the indexing below would have
            # nothing to work with. Connect the block's own node (added
            # up-front) to its successors instead.
            for successor_block in block.succs():
                flowgraph.add_edge(block.start_ea, successor_block.start_ea)
            continue

        small_blocks = split_instruction_list(instructions, "call")

        for small_block in small_blocks:
            # A sub-block is identified by the address of its first
            # instruction.
            small_block_address = small_block[0][0]
            flowgraph.add_node(small_block_address)
            # Drop the address; keep (mnemonic, operands) per instruction.
            small_block_instructions = tuple(
                instruction[1:] for instruction in small_block)
            flowgraph.add_instructions(
                small_block_address, small_block_instructions)

        # Chain consecutive sub-blocks of the same IDA block.
        for current_block, next_block in zip(small_blocks, small_blocks[1:]):
            flowgraph.add_edge(current_block[0][0], next_block[0][0])

        # Outgoing edges of the IDA block leave from its last sub-block.
        for successor_block in block.succs():
            flowgraph.add_edge(
                small_blocks[-1][0][0], successor_block.start_ea)

    return flowgraph
def test_json(self):
    """
    Test JSON decoding of graphs.

    Loads a flowgraph from a JSON string, hashes it with a default SimHasher
    and compares the first element of the resulting hash with a known value.
    """
    jsonstring = """{"edges":[{"destination":1518838580,"source":1518838565},{"destination":1518838572,"source":1518838565},{"destination":1518838578,"source":1518838572},{"destination":1518838574,"source":1518838572},{"destination":1518838580,"source":1518838574},{"destination":1518838578,"source":1518838574},{"destination":1518838580,"source":1518838578}],"name":"CFG","nodes":[{"address":1518838565,"instructions":[{"mnemonic":"xor","operands":["EAX","EAX"]},{"mnemonic":"cmp","operands":["[ECX + 4]","EAX"]},{"mnemonic":"jnle","operands":["5a87a334"]}]},{"address":1518838572,"instructions":[{"mnemonic":"jl","operands":["5a87a332"]}]},{"address":1518838574,"instructions":[{"mnemonic":"cmp","operands":["[ECX]","EAX"]},{"mnemonic":"jnb","operands":["5a87a334"]}]},{"address":1518838578,"instructions":[{"mnemonic":"mov","operands":["AL","1"]}]},{"address":1518838580,"instructions":[{"mnemonic":"ret near","operands":["[ESP]"]}]}]}"""
    fg = functionsimsearch.FlowgraphWithInstructions()
    fg.from_json(jsonstring)

    hasher = functionsimsearch.SimHasher()
    function_hash = hasher.calculate_hash(fg)

    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(function_hash[0], 0xa7ef296fa5dea3ee)
def test_hasher_with_weights(self):
    """
    Tests whether the loading of a weights file works.

    Hashes a flowgraph decoded from JSON with a SimHasher constructed from
    "../testdata/weights.txt" and compares the first element of the resulting
    hash with a known value.
    """
    jsonstring = """{"edges":[{"destination":1518838580,"source":1518838565},{"destination":1518838572,"source":1518838565},{"destination":1518838578,"source":1518838572},{"destination":1518838574,"source":1518838572},{"destination":1518838580,"source":1518838574},{"destination":1518838578,"source":1518838574},{"destination":1518838580,"source":1518838578}],"name":"CFG","nodes":[{"address":1518838565,"instructions":[{"mnemonic":"xor","operands":["EAX","EAX"]},{"mnemonic":"cmp","operands":["[ECX + 4]","EAX"]},{"mnemonic":"jnle","operands":["5a87a334"]}]},{"address":1518838572,"instructions":[{"mnemonic":"jl","operands":["5a87a332"]}]},{"address":1518838574,"instructions":[{"mnemonic":"cmp","operands":["[ECX]","EAX"]},{"mnemonic":"jnb","operands":["5a87a334"]}]},{"address":1518838578,"instructions":[{"mnemonic":"mov","operands":["AL","1"]}]},{"address":1518838580,"instructions":[{"mnemonic":"ret near","operands":["[ESP]"]}]}]}"""
    fg = functionsimsearch.FlowgraphWithInstructions()
    fg.from_json(jsonstring)

    # The path is relative, so the result depends on the working directory
    # the test is started from.
    hasher = functionsimsearch.SimHasher("../testdata/weights.txt")
    function_hash = hasher.calculate_hash(fg)

    # assertEqual reports both values on failure, unlike assertTrue(a == b).
    self.assertEqual(function_hash[0], 0xa6ef292a658e83ee)
def extract_flowgraph_hash(self, function, minimum_size=5):
    """
    Builds a FunctionSimSearch flowgraph for a Binary Ninja function and
    returns its hash.

    function: iterable of basic blocks; each block is iterated for its
      instructions and queried for start and outgoing_edges.
    minimum_size: flowgraphs with fewer branching nodes than this are not
      hashed.

    Returns the result of SimHasher.calculate_hash for the flowgraph, or
    (None, None) when the flowgraph is below minimum_size.
    """
    node_list = []   # (start address, [mnemonic, ...]) per sub-block
    edge_list = []   # (source address, destination address)

    # Retrieve CFG data
    for basic_block in function:
        mnemonics = []
        # Running sum of instruction[1] over the block; presumably the
        # instruction lengths, since it is used as an address offset below.
        consumed = 0
        node_start = basic_block.start

        for instruction in basic_block:
            mnemonic = instruction[0][0].text
            mnemonics.append(mnemonic)
            consumed += instruction[1]

            if mnemonic != 'call':
                continue

            # Split on call with assumption that we only care about x86/64
            # for now: close the current sub-block and start the next one
            # right behind the call.
            next_start = basic_block.start + consumed
            node_list.append((node_start, mnemonics))
            mnemonics = []
            edge_list.append((node_start, next_start))
            node_start = next_start

        # Outgoing edges of the block leave from the current sub-block.
        for outgoing in basic_block.outgoing_edges:
            edge_list.append((node_start, outgoing.target.start))

        if not mnemonics:
            # Nothing was collected for the current sub-block, so no node is
            # emitted for it; the most recently recorded edge is discarded.
            edge_list.pop()
        else:
            node_list.append((node_start, mnemonics))

    # Generate flowgraph
    flowgraph = fss.FlowgraphWithInstructions()

    for node_address, node_mnemonics in node_list:
        flowgraph.add_node(node_address)
        # Format conversion: every mnemonic is paired with an empty operand
        # tuple.
        converted = tuple([(text, ()) for text in node_mnemonics])
        flowgraph.add_instructions(node_address, converted)

    for source, destination in edge_list:
        flowgraph.add_edge(source, destination)

    if flowgraph.number_of_branching_nodes() < minimum_size:
        return (None, None)

    return fss.SimHasher().calculate_hash(flowgraph)
def test_construction(self):
    """
    Builds a small flowgraph by hand, hashes it, adds one extra node and
    checks that the hash of the changed graph stays close to the original.
    """
    flowgraph = functionsimsearch.FlowgraphWithInstructions()

    # Create an example CFG. Each node is (address, instructions), each
    # instruction is (mnemonic, operands) with the operands left empty.
    nodedata = [
        (0x5A5F6179, (("mov", ()), ("shr", ()), ("xor", ()), ("jz", ()))),
        (0x5A5F6187, (("mov", ()), ("and", ()), ("cmp", ()), ("jz", ()))),
        (0x5A5F6195, (("mov", ()), ("jmp", ()))),
        (0x5A5F6184, (("mov", ()), ("ret", ()))),
        (0x5A5F6182, (("xor", ()),))
    ]
    # NOTE(review): the last edge targets 0x5A5F5195, which is not one of the
    # node addresses above (0x5A5F6195 is). This looks like a typo; left
    # unchanged because fixing it alters the graph being hashed. Confirm.
    edgedata = [
        (0x5A5F6179, 0x5A5F6187),
        (0x5A5F6179, 0x5A5F6182),
        (0x5A5F6182, 0x5A5F6184),
        (0x5A5F6187, 0x5A5F6184),
        (0x5A5F6187, 0x5A5F5195)]

    for n in nodedata:
        flowgraph.add_node(n[0])
    for e in edgedata:
        flowgraph.add_edge(e[0], e[1])
    for n in nodedata:
        flowgraph.add_instructions(n[0], n[1])

    # Now calculate the similarity hash of the graph.
    hasher = functionsimsearch.SimHasher()
    function_hash = hasher.calculate_hash(flowgraph)

    # Make a minor change to the graph: adding an extra node (the 5-node
    # graph becomes a 6-node graph).
    flowgraph.add_node(0)
    flowgraph.add_edge(0, nodedata[0][0])

    # Hash the changed version.
    function_hash2 = hasher.calculate_hash(flowgraph)

    # Calculate the distance between the two hashes - simply hamming distance
    # between bit vectors, summed over both hash elements.
    distance = (popcount(function_hash[0] ^ function_hash2[0]) +
                popcount(function_hash[1] ^ function_hash2[1]))

    # The distance is scaled by 128 (presumably the total number of hash
    # bits). assertGreater reports the computed similarity on failure.
    similarity = 1.0 - (distance / 128.0)
    self.assertGreater(similarity, 0.7)
def get_flowgraph_from(address, ignore_instructions=False):
    """
    Generates a flowgraph object that can be fed into FunctionSimSearch from a
    given address in IDA.

    address: an address inside the function to convert. A falsy value (None
      or 0) falls back to the current cursor position, here().
    ignore_instructions: when True, only nodes and edges are added and no
      instructions are attached to the nodes.

    Returns the populated functionsimsearch.FlowgraphWithInstructions. Every
    IDA basic block is split via split_instruction_list using
    call_instruction_string, which is not defined in this function and is
    expected to be available at module level.
    """
    if not address:
        address = here()
    ida_flowgraph = idaapi.FlowChart(idaapi.get_func(address))
    flowgraph = functionsimsearch.FlowgraphWithInstructions()

    for block in ida_flowgraph:
        # Add all the ida-flowgraph-basic blocks. We do this up-front so we
        # can more easily add edges later, and adding a node twice does not
        # hurt.
        flowgraph.add_node(block.start_ea)

    for block in ida_flowgraph:
        # There seems to be no good way to get operands without IDA
        # substituting local variable names etc., so this is a very ugly hack
        # to deal with that.
        # TODO(thomasdullien): IDA 7.2 will provide a way to perform
        # print_operand without replacement (by providing empty type
        # arguments?), replace the hack here with a "proper" solution.
        instructions = [
            (ea, GetMnem(ea),
             (print_operand(ea, 0).replace("+var_", "-0x"),
              print_operand(ea, 1).replace("+var_", "-0x")))
            for ea in Heads(block.start_ea, block.end_ea)]

        if not instructions:
            # Without instructions there is no first-instruction address to
            # name a sub-block after, so the indexing below would have
            # nothing to work with. Connect the block's own node (added
            # up-front) to its successors instead.
            for successor_block in block.succs():
                flowgraph.add_edge(block.start_ea, successor_block.start_ea)
            continue

        small_blocks = split_instruction_list(
            instructions, call_instruction_string)

        for small_block in small_blocks:
            # A sub-block is identified by the address of its first
            # instruction.
            small_block_address = small_block[0][0]
            flowgraph.add_node(small_block_address)
            # Drop the address; keep (mnemonic, operands) per instruction.
            small_block_instructions = tuple(
                instruction[1:] for instruction in small_block)
            if not ignore_instructions:
                flowgraph.add_instructions(
                    small_block_address, small_block_instructions)

        # Chain consecutive sub-blocks of the same IDA block.
        for current_block, next_block in zip(small_blocks, small_blocks[1:]):
            flowgraph.add_edge(current_block[0][0], next_block[0][0])

        # Outgoing edges of the IDA block leave from its last sub-block.
        for successor_block in block.succs():
            flowgraph.add_edge(
                small_blocks[-1][0][0], successor_block.start_ea)

    return flowgraph