def test_json(self):
    """Check that a CFG decoded from its JSON serialization hashes to the known value."""
    # A small 5-node x86 control-flow graph, serialized as JSON.
    cfg_json = """{"edges":[{"destination":1518838580,"source":1518838565},{"destination":1518838572,"source":1518838565},{"destination":1518838578,"source":1518838572},{"destination":1518838574,"source":1518838572},{"destination":1518838580,"source":1518838574},{"destination":1518838578,"source":1518838574},{"destination":1518838580,"source":1518838578}],"name":"CFG","nodes":[{"address":1518838565,"instructions":[{"mnemonic":"xor","operands":["EAX","EAX"]},{"mnemonic":"cmp","operands":["[ECX + 4]","EAX"]},{"mnemonic":"jnle","operands":["5a87a334"]}]},{"address":1518838572,"instructions":[{"mnemonic":"jl","operands":["5a87a332"]}]},{"address":1518838574,"instructions":[{"mnemonic":"cmp","operands":["[ECX]","EAX"]},{"mnemonic":"jnb","operands":["5a87a334"]}]},{"address":1518838578,"instructions":[{"mnemonic":"mov","operands":["AL","1"]}]},{"address":1518838580,"instructions":[{"mnemonic":"ret near","operands":["[ESP]"]}]}]}"""
    # Golden value for the first hash word when hashing with default weights.
    expected_hash = 0xa7ef296fa5dea3ee
    graph = functionsimsearch.FlowgraphWithInstructions()
    graph.from_json(cfg_json)
    computed = functionsimsearch.SimHasher().calculate_hash(graph)
    self.assertTrue(computed[0] == expected_hash)
def test_hasher_with_weights(self):
    """Check that a SimHasher constructed from a weights file produces the known hash."""
    # Same 5-node x86 CFG used by the JSON-decoding test.
    cfg_json = """{"edges":[{"destination":1518838580,"source":1518838565},{"destination":1518838572,"source":1518838565},{"destination":1518838578,"source":1518838572},{"destination":1518838574,"source":1518838572},{"destination":1518838580,"source":1518838574},{"destination":1518838578,"source":1518838574},{"destination":1518838580,"source":1518838578}],"name":"CFG","nodes":[{"address":1518838565,"instructions":[{"mnemonic":"xor","operands":["EAX","EAX"]},{"mnemonic":"cmp","operands":["[ECX + 4]","EAX"]},{"mnemonic":"jnle","operands":["5a87a334"]}]},{"address":1518838572,"instructions":[{"mnemonic":"jl","operands":["5a87a332"]}]},{"address":1518838574,"instructions":[{"mnemonic":"cmp","operands":["[ECX]","EAX"]},{"mnemonic":"jnb","operands":["5a87a334"]}]},{"address":1518838578,"instructions":[{"mnemonic":"mov","operands":["AL","1"]}]},{"address":1518838580,"instructions":[{"mnemonic":"ret near","operands":["[ESP]"]}]}]}"""
    # Golden value differs from the unweighted test because the weights file
    # changes how graph features contribute to the SimHash.
    expected_hash = 0xa6ef292a658e83ee
    graph = functionsimsearch.FlowgraphWithInstructions()
    graph.from_json(cfg_json)
    weighted_hasher = functionsimsearch.SimHasher("../testdata/weights.txt")
    computed = weighted_hasher.calculate_hash(graph)
    self.assertTrue(computed[0] == expected_hash)
def extract_flowgraph_hash(self, function, minimum_size=5):
    """ Generates a flowgraph object that can be fed into FunctionSimSearch
    from a given address in Binary Ninja and returns set of hashes.

    Args:
        function: a Binary Ninja function object; iterating it yields basic
            blocks, and iterating a block yields (tokens, length) pairs.
        minimum_size: minimum number of branching nodes the extracted graph
            must have; smaller graphs are rejected.

    Returns:
        The tuple produced by SimHasher.calculate_hash(), or (None, None)
        when the graph has fewer than minimum_size branching nodes.
    """
    nodes = []   # list of (start_address, [mnemonic, ...]) per synthetic node
    graph = []   # list of (source_address, destination_address) edges
    # Retrieve CFG data
    for block in function:
        local_node = []
        shift = 0                 # byte offset of the next instruction within the block
        position = block.start    # start address of the node currently being built
        for instruction in block:
            # instruction[0][0].text is the mnemonic token; instruction[1] is
            # the instruction length in bytes.
            local_node.append(instruction[0][0].text)
            shift += instruction[1]
            if instruction[0][0].text == 'call':
                # Split on call with assumption that we only care about x86/64 for now:
                # close the current node at the call and fall through to a new
                # node starting right after it.
                nodes.append((position, local_node))
                local_node = []
                graph.append((position, block.start + shift))
                position = block.start + shift
        for edge in block.outgoing_edges:
            graph.append((position, edge.target.start))
        if local_node:
            nodes.append((position, local_node))
        else:
            # Block ended exactly on a 'call': the fall-through node is empty,
            # so drop the most recently added edge.
            # NOTE(review): this pops only ONE edge, but if the block has
            # multiple outgoing edges they were all appended above with the
            # now-nonexistent `position` as source, and the call-split edge
            # added inside the loop also remains — verify this is intended
            # for blocks ending in a call with >1 outgoing edge.
            graph.pop(-1)
    # Generate flowgraph
    flowgraph = fss.FlowgraphWithInstructions()
    for node in nodes:
        flowgraph.add_node(node[0])
        # Format conversion: FunctionSimSearch expects a tuple of
        # (mnemonic, operands) pairs; operands are deliberately left empty.
        flowgraph.add_instructions(node[0], tuple([((i), ()) for i in node[1]]))
    for edge in graph:
        flowgraph.add_edge(edge[0], edge[1])
    if flowgraph.number_of_branching_nodes() < minimum_size:
        return (None, None)
    hasher = fss.SimHasher()
    return hasher.calculate_hash(flowgraph)
def test_construction(self):
    """Build a CFG programmatically, hash it, perturb it slightly, and check
    that the two SimHashes are still similar (hamming distance small).

    Fix: the last edge destination read 0x5A5F5195, which matches no node in
    nodedata — all nodes are 0x5A5F61xx, so this was a single-digit typo for
    the 0x5A5F6195 node declared above.
    """
    flowgraph = functionsimsearch.FlowgraphWithInstructions()
    # Create an example CFG: (address, ((mnemonic, operands), ...)) per node.
    nodedata = [
        (0x5A5F6179, (("mov", ()), ("shr", ()), ("xor", ()), ("jz", ()))),
        (0x5A5F6187, (("mov", ()), ("and", ()), ("cmp", ()), ("jz", ()))),
        (0x5A5F6195, (("mov", ()), ("jmp", ()))),
        (0x5A5F6184, (("mov", ()), ("ret", ()))),
        (0x5A5F6182, (("xor", ()),))
    ]
    edgedata = [
        (0x5A5F6179, 0x5A5F6187),
        (0x5A5F6179, 0x5A5F6182),
        (0x5A5F6182, 0x5A5F6184),
        (0x5A5F6187, 0x5A5F6184),
        (0x5A5F6187, 0x5A5F6195)]  # was 0x5A5F5195: dangling edge typo
    for n in nodedata:
        flowgraph.add_node(n[0])
    for e in edgedata:
        flowgraph.add_edge(e[0], e[1])
    for n in nodedata:
        flowgraph.add_instructions(n[0], n[1])
    # Now calculate the similarity hash of the graph.
    hasher = functionsimsearch.SimHasher()
    function_hash = hasher.calculate_hash(flowgraph)
    # Make a minor change to the graph (adding an extra node (5 node graph becomes
    # a 6-node graph).
    flowgraph.add_node(0)
    flowgraph.add_edge(0, nodedata[0][0])
    # Hash the changed version.
    function_hash2 = hasher.calculate_hash(flowgraph)
    # Calculate the distance between the two hashes - simply hamming distance between
    # bit vectors:
    distance = popcount(function_hash[0] ^ function_hash2[0]) + \
        popcount(function_hash[1] ^ function_hash2[1])
    # A one-node perturbation must not move the 128-bit hash by more than ~30%.
    self.assertTrue((1.0 - (distance / 128.0) > 0.7))
del hotkey_context_L del hotkey_context_H del hotkey_context_A del hotkey_context_M else: print("FunctionSimSearch: Hotkeys registered.") create_index = True if os.path.isfile(index_file): create_index = False if os.path.isfile(metadata_file): print("Parsing meta_data from file %s" % metadata_file) meta_data = parse_function_meta_data(metadata_file) print("Parsed meta_data.") for i in meta_data.keys()[0:10]: print("%lx:%lx" % i) else: meta_data = {} print("Calling functionsimsearch.SimHashSearchIndex(\"%s\", %s, 50)" % (index_file, create_index)) try: search_index = functionsimsearch.SimHashSearchIndex( index_file, create_index, 50) if os.path.isfile(weights_file): print("Calling functionsimsearch.SimHasher(\"%s\")" % weights_file) sim_hasher = functionsimsearch.SimHasher(weights_file) else: sim_hasher = functionsimsearch.SimHasher() except: print("Failure to create/open the search index!")
del hotkey_context_L else: print("FunctionSimSearch: Failed to unregister hotkey L.") search_index sim_hasher del search_index del sim_hasher except: hotkey_context_S = idaapi.add_hotkey("Shift-S", save_function) hotkey_context_L = idaapi.add_hotkey("Shift-L", load_function) if hotkey_context_S is None or hotkey_context_L is None: print("FunctionSimSearch: Failed to register hotkeys.") del hotkey_context_S del hotkey_context_L else: print("FunctionSimSearch: Hotkeys registered.") create_index = True if os.path.isfile('/tmp/example.simhash'): create_index = False if os.path.isfile('/tmp/example.simhash.meta'): print("Parsing meta_data") meta_data = parse_function_meta_data('/tmp/example.simhash.meta') print("Parsed meta_data") for i in meta_data.keys()[0:10]: print("%lx:%lx" % i) else: meta_data = {} search_index = functionsimsearch.SimHashSearchIndex( "/tmp/example.simhash", create_index, 28) sim_hasher = functionsimsearch.SimHasher()
del hotkey_context_L else: print("FunctionSimSearch: Failed to unregister hotkey L.") search_index sim_hasher del search_index del sim_hasher except: hotkey_context_S = idaapi.add_hotkey("Shift-S", save_function) hotkey_context_L = idaapi.add_hotkey("Shift-L", load_function) if hotkey_context_S is None or hotkey_context_L is None: print("FunctionSimSearch: Failed to register hotkeys.") del hotkey_context_S del hotkey_context_L else: print("FunctionSimSearch: Hotkeys registered.") create_index = True if os.path.isfile('/tmp/example.simhash'): create_index = False if os.path.isfile('/tmp/example.simhash.meta'): print("Parsing meta_data") meta_data = parse_function_meta_data('/tmp/example.simhash.meta') print("Parsed meta_data") for i in meta_data.keys()[0:10]: print("%lx:%lx" % i) else: meta_data = {} search_index = functionsimsearch.SimHashSearchIndex( "/tmp/example.simhash", create_index, 28) sim_hasher = functionsimsearch.SimHasher("/tmp/example.simhash.weights")