def _build_bindiff_lcs_signature(self, job):
    log.info("Building a Bindiff/LCS signature for job %d", job.id)
    # if len(job.samples) != 2:
    #     return {"message": "Can only handle two samples at the moment"}
    temporary_paths = []
    try:
        # Export each sample once: a BinExport database for BinDiff and a
        # pickle database containing the disassembly metadata.
        ida_pickle_dbs = list(self.idb_executor.map(
            lambda sample: self.bindiff.bindiff_pickle_export(
                sample.path,
                sample.info[FileTypeInspector.NAME]["bits"] == 64),
            job.samples))
        binexport_dbs = [binexport_db for binexport_db, _ in ida_pickle_dbs]
        pickle_dbs = [pickle_db for _, pickle_db in ida_pickle_dbs]
        temporary_paths += binexport_dbs
        temporary_paths += pickle_dbs

        # TODO: Find a clever way of generating fewer bindiff comparisons
        binexport_pairs = list(itertools.combinations(binexport_dbs, 2))
        pickle_pairs = list(itertools.combinations(pickle_dbs, 2))
        log.debug("Comparing %d pairs of binaries with each other",
                  len(pickle_pairs))
        bindiff_dbs = list(self.bindiff_executor.map(
            lambda x: self.bindiff.bindiff_compare(*x), binexport_pairs))
        temporary_paths += bindiff_dbs

        log.debug("Building graph of similar functions")
        # Nodes are (sample_db, function_address) tuples; edge weights
        # encode the quality of the BinDiff match.
        graph = Graph()
        for bindiff_db_path, (sample1_pickle_db_path,
                              sample2_pickle_db_path) in zip(bindiff_dbs,
                                                             pickle_pairs):
            sample1_db = Database.load(sample1_pickle_db_path)
            sample2_db = Database.load(sample2_pickle_db_path)
            bindiff_db = BinDiffDb(bindiff_db_path)
            assert bindiff_db.get_binary(1).get_exefilename() == sample1_db.filename
            assert bindiff_db.get_binary(2).get_exefilename() == sample2_db.filename
            for similar_func in bindiff_db.get_similar_functions(
                    min_similarity=0.6, min_confidence=0.5,
                    min_instructions=50, min_bbs=3, min_edges=4, limit=10):
                # TODO: The weight might need to be tuned
                weight = similar_func["similarity"] * similar_func["confidence"]
                graph.add_edge((sample1_db, int(similar_func["address1"])),
                               (sample2_db, int(similar_func["address2"])),
                               weight=weight)

        # What we want here is to find subgraphs in the graph which have a
        # high accumulated average weight
        log.debug("Finding connected subgraphs in the graph")
        subgraphs = []
        processed_nodes = set()
        for node in graph.nodes():
            if node in processed_nodes:
                continue
            # Track which binaries are already represented in the subgraph
            binaries = set((node[0],))
            nodes_to_explore = {node}
            subgraph = Graph()
            while nodes_to_explore:
                cur_node = nodes_to_explore.pop()
                for edge in graph.edges(cur_node):
                    assert edge[0] == cur_node
                    # We don't want the same binary twice in our subgraph
                    if edge[1][0] in binaries:
                        continue
                    # We don't want cycles in our subgraph
                    if edge[1] in subgraph.nodes():
                        continue
                    # This should happen only in obscure cases where we
                    # stopped at a node because of the same binary
                    # occurring twice (with different functions)
                    if edge[1] in processed_nodes:
                        continue
                    subgraph.add_edge(*edge)
                    binaries.add(edge[1][0])
                    nodes_to_explore.add(edge[1])
                processed_nodes.add(cur_node)
            subgraphs.append(subgraph)

        if not subgraphs:
            log.info("No connected subgraphs found in the function similarity graph")
            return {"signatures": [],
                    "message": "Cannot find common functions within the binaries"}

        log.debug("Determining maximal subgraph among %d subgraphs",
                  len(subgraphs))
        max_score = float("-infinity")
        max_subgraph = None
        for subgraph in subgraphs:
            # Just use the sum of similarities as a measure. We want bigger
            # subgraphs to have a higher score, so this is a good way to
            # ensure it.
            score = sum(graph.get_edge_data(*x)["weight"]
                        for x in subgraph.edges())  # / len(subgraph.edges())
            if score > max_score:
                max_score = score
                max_subgraph = subgraph
        if max_subgraph is None:
            log.info("No maximal subgraph found")
            return {"signatures": [], "message": "No maximal subgraph found"}
        log.debug("Found maximal subgraph with %d nodes, score %f",
                  len(max_subgraph.nodes()), int(max_score * 100) / 100.0)

        # Right now, we'll just generate a signature for the subgraph with
        # the biggest score. In the future, we could try to find a set of
        # subgraphs which cover the cluster best.
        log.debug("Getting binary code for maximal subgraph")
        # Get the code for each function
        functions_code = []
        for sample_db, function_ep in max_subgraph.nodes():
            function = sample_db.get_function(function_ep)
            log.debug("Function %s:%d has %d chunks", sample_db.sha256,
                      function_ep, len(list(function.chunks)))
            # TODO: This is wrong. You cannot simply append this: if there
            # are gaps between the chunks, a '*' operator needs to be
            # inserted in the final sig
            functions_code.append("".join(chunk.bytes
                                          for chunk in function.chunks))
        log.debug("Longest code sequence is %s bytes",
                  max(len(x) for x in functions_code))

        log.debug("Finding common subsequence in binary code")
        common_seq = hamming_klcs(functions_code)
        # Shorten the sequence to stay below the ClamAV ndb signature length
        # limit (~980 bytes; we trim to at most 950) by deleting random
        # characters. Deleting characters from a common subsequence keeps it
        # a common subsequence of all inputs.
        while len(common_seq) > 950:
            kill_character = random.randint(0, len(common_seq) - 1)
            common_seq = common_seq[:kill_character] + common_seq[kill_character + 1:]
        ndb_signature = ndb_from_common_sequence(functions_code, common_seq)
        log.debug("Found ndb signature: '%s'", ndb_signature)
        if ndb_signature:
            # TODO: Make nice name
            name = get_VT_name([sample.sha256 for sample in job.samples])
            signature = {"type": "ndb",
                         "signature": "{}:1:*:{}".format(name, ndb_signature)}
            num_triggering_samples = get_num_triggering_samples(
                signature, [sample.path for sample in job.samples])
            log.debug("Signature triggered by %d samples (%.02f %%)",
                      num_triggering_samples,
                      int(10000.0 * num_triggering_samples / len(job.samples)) / 100.0)
            return {"signatures": [{
                        "signature": signature,
                        "metrics": {
                            "coverage": 1.0 * num_triggering_samples / len(job.samples),
                            "num_triggering_samples": num_triggering_samples}}],
                    "message": "Found signature"}
        else:
            return {"signatures": [],
                    "message": "Did not find a common sequence between code"}
    finally:
        # Clean up all temporary files and directories produced by the
        # exports and comparisons.
        for path in temporary_paths:
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.unlink(path)
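A minimal, self-contained sketch of the subgraph-scoring step above, assuming the `Graph` in use is `networkx.Graph` (the `add_edge`/`get_edge_data`/`nodes`/`edges` calls match that API); plain connected components stand in for the constrained walk, and the binary names and weights are made up for illustration:

import networkx as nx

demo = nx.Graph()
# Two disjoint clusters of "similar function" edges, nodes as (binary, address)
demo.add_edge(("bin_a", 0x1000), ("bin_b", 0x2000), weight=0.9)
demo.add_edge(("bin_b", 0x2000), ("bin_c", 0x3000), weight=0.8)
demo.add_edge(("bin_a", 0x1100), ("bin_b", 0x2100), weight=0.4)

# Score each connected component by the sum of its edge weights, so larger
# clusters of strong matches win over small ones.
best = max((demo.subgraph(c) for c in nx.connected_components(demo)),
           key=lambda sg: sum(d["weight"] for _, _, d in sg.edges(data=True)))
print(sorted(best.nodes()))  # the 0.9 + 0.8 cluster wins with score 1.7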
if include_regex:
    for entry in matches:
        print("match: {0}".format(entry))
    for entry in misses:
        print("miss: {0}".format(entry))
    print("Matches: {0}, Misses: {1}\n".format(len(matches), len(misses)))
    if len(matches) == 0:
        print("not going to generate sig, no files matched the regex")
        sys.exit(-1)
    elif len(matches) == 1:
        print("not going to generate sig, only one file matched the regex {0}".format(matches[0]))
        sys.exit(-1)

print("finding klcs")
common = hamming_klcs(target_strings)
print("steaming dem clams")
ndb = ndb_from_common_sequence(target_strings, common)

if not options.lwsom:
    # Drop signature segments consisting only of whitespace bytes
    # (09, 0a, 0b, 0c, 0d, 20 in hex), since they match too generically.
    disass = ndb.split('*')
    reass = []
    for entry in disass:
        if re.match("^(?:0[9dabc]|20)+$", entry) is None:
            reass.append(entry)
    ndb = "*".join(reass)


def replacenulls(nullmatch):
    # Keep the first and last byte of a null-byte run and record the length
    # of the elided middle in a placeholder that is later rewritten into
    # ClamAV skip syntax.
    nullseqlen = len(nullmatch) - 4
    return "%sclamnullseqrep{%s}%s" % (nullmatch[0:2], nullseqlen, nullmatch[-2:])


def addmatch(match):
def test_clamav_ndb_from_common_sequence():
    a = "aaxxbbyycc"
    b = "aaggbbhhcc"
    common = "aabbcc"
    ndb = ndb_from_common_sequence([a, b], common)
    assert ndb == "6161*6262*6363"
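For context, `_build_bindiff_lcs_signature` above wraps such a fragment into a complete ndb line via `"{}:1:*:{}".format(name, ndb_signature)`. A hypothetical example with a made-up signature name:

# Hypothetical illustration of the "{}:1:*:{}" wrapping used above;
# the signature name is invented for the example.
name = "Win.Trojan.Example"
fragment = "6161*6262*6363"
line = "{}:1:*:{}".format(name, fragment)
assert line == "Win.Trojan.Example:1:*:6161*6262*6363"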
def _build_bindiff_lcs_signature(self, job):
    log.info("Building a Bindiff/LCS signature for job %d", job.id)
    temporary_paths = []
    try:
        ida_pickle_dbs = list(self.idb_executor.map(
            lambda sample: self.ida.bindiff_pickle_export(
                sample.path,
                sample.info[FileTypeInspector.NAME]["bits"] == 64),
            job.samples))
        log.info("ida_pickle_dbs: %s", ida_pickle_dbs)
        binexport_dbs = [binexport_db for binexport_db, _ in ida_pickle_dbs]
        log.info("binexport_dbs: %s", binexport_dbs)
        pickle_dbs = [pickle_db for _, pickle_db in ida_pickle_dbs]
        temporary_paths += binexport_dbs
        temporary_paths += pickle_dbs

        # TODO: Find a clever way of generating fewer bindiff comparisons
        binexport_pairs = list(itertools.combinations(binexport_dbs, 2))
        pickle_pairs = list(itertools.combinations(pickle_dbs, 2))
        log.debug("Comparing %d pairs of binaries with each other",
                  len(pickle_pairs))
        bindiff_dbs = list(self.bindiff_executor.map(
            lambda x: self.bindiff.compare(*x), binexport_pairs))
        temporary_paths += bindiff_dbs

        log.debug("Building graph of similar functions")
        graph = Graph()

        # PE check: the API-based filters and weights below only apply when
        # every sample in the job is a PE file.
        all_pe = all(x.info[FileTypeInspector.NAME]["type"] == FileTypeInspector.TYPE_PE
                     for x in job.samples)
        if all_pe:
            log.debug("Loading JSON file containing APIs")
            with open("api_db.json") as handle:
                json_apis = json.load(handle)

        for bindiff_db_path, (sample1_pickle_db_path,
                              sample2_pickle_db_path) in zip(bindiff_dbs,
                                                             pickle_pairs):
            sample1_db = Database.load(sample1_pickle_db_path)
            sample2_db = Database.load(sample2_pickle_db_path)
            bindiff_db = BinDiffDb(bindiff_db_path)
            assert bindiff_db.get_binary(1).get_exefilename() == sample1_db.filename
            assert bindiff_db.get_binary(2).get_exefilename() == sample2_db.filename

            # Useful for debugging
            for f in sample1_db.functions:
                log.info("%s: %s", f.name, f.data["is_library_function"])

            # Default thresholds
            for similar_func in bindiff_db.get_similar_functions(
                    min_similarity=0.6, min_confidence=0.5,
                    min_instructions=50, min_bbs=3, min_edges=4, limit=10):
                cur_f1 = sample1_db.get_function(int(similar_func["address1"]))
                cur_f2 = sample2_db.get_function(int(similar_func["address2"]))

                # Filtering known functions
                if cur_f1.is_library_function or cur_f2.is_library_function:
                    log.info("Skipping function - Reason: library function")
                    continue

                # Filtering by name
                if not cur_f1.name.startswith("sub_") or not cur_f2.name.startswith("sub_"):
                    log.info("Skipping function - Reason: name")
                    # Debug
                    log.info("%s - %s", cur_f1.name, cur_f2.name)
                    continue

                # Filtering by APIs list
                if all_pe and cur_f1.apis != cur_f2.apis:
                    log.info("Skipping function - Reason: different apis")
                    # Debug
                    log.info(cur_f1.apis)
                    log.info(cur_f2.apis)
                    continue

                # Filtering by length
                if similar_func["basicblocks"] < 10:
                    log.info("Skipping function - Reason: length")
                    continue

                if all_pe:
                    # Collecting stats on the API categories used by the function
                    msvcrt = 0
                    mutex = 0
                    antidbg = 0
                    apialert = 0
                    for a in cur_f1.apis:
                        if a in json_apis["msvcrt"]:
                            msvcrt += 1
                        elif a in json_apis["mutex"]:
                            mutex += 1
                        elif a in json_apis["antidbg"]:
                            antidbg += 1
                        elif a in json_apis["apialert"]:
                            apialert += 1
                    # Printing stats for debugging
                    log.info(cur_f1.apis)
                    log.info("MSVCRT: %d", msvcrt)
                    log.info("MUTEX: %d", mutex)
                    log.info("ANTIDBG: %d", antidbg)
                    log.info("APIALERT: %d", apialert)

                    # TODO: The weight might need to be tuned - Proposal 1:
                    # boost functions that use suspicious APIs, penalize
                    # runtime library usage
                    base_weight = similar_func["similarity"] * similar_func["confidence"]
                    weight = base_weight + antidbg + apialert + mutex - msvcrt
                else:
                    weight = similar_func["similarity"] * similar_func["confidence"]

                # Debug print
                log.info("DEBUG WEIGHT: %s (%s)", weight, similar_func)

                graph.add_edge((sample1_db, int(similar_func["address1"])),
                               (sample2_db, int(similar_func["address2"])),
                               weight=weight)

        # What we want here is to find subgraphs in the graph which have a
        # high accumulated average weight
        log.debug("Finding connected subgraphs in the graph")
        subgraphs = []
        processed_nodes = set()
        for node in graph.nodes():
            if node in processed_nodes:
                continue
            # Track which binaries are already represented in the subgraph
            binaries = set((node[0],))
            nodes_to_explore = {node}
            subgraph = Graph()
            while nodes_to_explore:
                cur_node = nodes_to_explore.pop()
                for edge in graph.edges(cur_node):
                    assert edge[0] == cur_node
                    # We don't want the same binary twice in our subgraph
                    if edge[1][0] in binaries:
                        continue
                    # We don't want cycles in our subgraph
                    if edge[1] in subgraph.nodes():
                        continue
                    # This should happen only in obscure cases where we
                    # stopped at a node because of the same binary
                    # occurring twice (with different functions)
                    if edge[1] in processed_nodes:
                        continue
                    subgraph.add_edge(*edge)
                    binaries.add(edge[1][0])
                    nodes_to_explore.add(edge[1])
                processed_nodes.add(cur_node)
            subgraphs.append(subgraph)

        if not subgraphs:
            log.info("No connected subgraphs found in the function similarity graph")
            return {"signatures": [],
                    "message": "Cannot find common functions within the binaries"}

        log.debug("Determining maximal subgraph among %d subgraphs",
                  len(subgraphs))
        max_score = float("-infinity")
        max_subgraph = None
        for subgraph in subgraphs:
            # Just use the sum of similarities as a measure. We want bigger
            # subgraphs to have a higher score, so this is a good way to
            # ensure it.
            score = sum(graph.get_edge_data(*x)["weight"]
                        for x in subgraph.edges())  # / len(subgraph.edges())
            if score > max_score:
                max_score = score
                max_subgraph = subgraph
        if max_subgraph is None:
            log.info("No maximal subgraph found")
            return {"signatures": [], "message": "No maximal subgraph found"}
        log.debug("Found maximal subgraph with %d nodes, score %f",
                  len(max_subgraph.nodes()), int(max_score * 100) / 100.0)

        # Right now, we'll just generate a signature for the subgraph with
        # the biggest score. In the future, we could try to find a set of
        # subgraphs which cover the cluster best.
        log.debug("Getting binary code for maximal subgraph")
        # Get the code for each function
        functions_code = []
        for sample_db, function_ep in max_subgraph.nodes():
            function = sample_db.get_function(function_ep)
            log.debug("Function %s:%d has %d chunks", sample_db.sha256,
                      function_ep, len(list(function.chunks)))
            # TODO: This is wrong. You cannot simply append this: if there
            # are gaps between the chunks, a '*' operator needs to be
            # inserted in the final sig
            if not self.whitelist.find_raw(sample_db, entry_points=[function_ep]):
                functions_code.append("".join(chunk.bytes
                                              for chunk in function.chunks))
        log.debug("Longest code sequence is %s bytes",
                  max(len(x) for x in functions_code))

        log.debug("Finding common subsequence in binary code")
        common_seq = hamming_klcs(functions_code)
        # Shorten the sequence to stay below the ClamAV ndb signature length
        # limit (~980 bytes; we trim to at most 950) by deleting random
        # characters. Deleting characters from a common subsequence keeps it
        # a common subsequence of all inputs.
        while len(common_seq) > 950:
            kill_character = random.randint(0, len(common_seq) - 1)
            common_seq = common_seq[:kill_character] + common_seq[kill_character + 1:]
        ndb_signature = ndb_from_common_sequence(functions_code, common_seq)
        log.debug("Found ndb signature: '%s'", ndb_signature)
        if ndb_signature:
            # TODO: Make nice name
            name = get_VT_name([sample.sha256 for sample in job.samples])
            signature = {"type": "ndb",
                         "signature": "{}:1:*:{}".format(name, ndb_signature)}
            num_triggering_samples = get_num_triggering_samples(
                signature, [sample.path for sample in job.samples])
            log.debug("Signature triggered by %d samples (%.02f %%)",
                      num_triggering_samples,
                      int(10000.0 * num_triggering_samples / len(job.samples)) / 100.0)
            return {"signatures": [{
                        "signature": signature,
                        "metrics": {
                            "coverage": 1.0 * num_triggering_samples / len(job.samples),
                            "num_triggering_samples": num_triggering_samples}}],
                    "message": "Found signature"}
        else:
            return {"signatures": [],
                    "message": "Did not find a common sequence between code"}
    finally:
        # Clean up all temporary files and directories produced by the
        # exports and comparisons.
        for path in temporary_paths:
            if os.path.isdir(path):
                shutil.rmtree(path)
            else:
                os.unlink(path)
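The random trimming loop in both versions relies on a simple invariant: deleting characters from a common subsequence leaves a shorter string that is still a common subsequence of every input. A small standalone check of that property; `is_subsequence` is a hypothetical helper written for this sketch, not part of the codebase:

import random

def is_subsequence(needle, haystack):
    # True if needle's characters occur in haystack in order (gaps allowed)
    it = iter(haystack)
    return all(ch in it for ch in needle)

strings = ["aaxxbbyycc", "aaggbbhhcc"]
common = "aabbcc"  # a common subsequence of both strings (cf. the test above)
while len(common) > 4:  # stand-in for the 950-byte limit used above
    kill = random.randint(0, len(common) - 1)
    common = common[:kill] + common[kill + 1:]
assert all(is_subsequence(common, s) for s in strings)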