Example #1
    def _build_bindiff_lcs_signature(self, job):
        log.info("Building a Bindiff/LCS signature for job %d", job.id)
        # if len(job.samples) != 2:
        #     return {"message": "Can only handle two samples at the moment"}
        temporary_paths = []
        try:
            ida_pickle_dbs = list(
                self.idb_executor.map(
                    lambda sample: self.bindiff.bindiff_pickle_export(
                        sample.path,
                        sample.info[FileTypeInspector.NAME]["bits"] == 64),
                    job.samples))
            binexport_dbs = [
                binexport_db for binexport_db, _ in ida_pickle_dbs
            ]
            pickle_dbs = [pickle_db for _, pickle_db in ida_pickle_dbs]

            temporary_paths += binexport_dbs
            temporary_paths += pickle_dbs

            # TODO: Find a clever way of generating few bindiff comparisons
            binexport_pairs = list(itertools.combinations(binexport_dbs, 2))
            pickle_pairs = list(itertools.combinations(pickle_dbs, 2))
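            # itertools.combinations yields all n*(n-1)/2 unordered pairs, so
            # the number of BinDiff runs grows quadratically with the number
            # of samples.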
            log.debug("Comparing %d pairs of binaries with each other",
                      len(pickle_pairs))
            bindiff_dbs = list(
                self.bindiff_executor.map(
                    lambda x: self.bindiff.bindiff_compare(*x),
                    binexport_pairs))
            temporary_paths += bindiff_dbs

            log.debug("Building graph of similar functions")
            graph = Graph()

            for bindiff_db_path, (sample1_pickle_db_path,
                                  sample2_pickle_db_path) in zip(
                                      bindiff_dbs, pickle_pairs):
                sample1_db = Database.load(sample1_pickle_db_path)
                sample2_db = Database.load(sample2_pickle_db_path)
                bindiff_db = BinDiffDb(bindiff_db_path)

                assert (bindiff_db.get_binary(1).get_exefilename() ==
                        sample1_db.filename)
                assert (bindiff_db.get_binary(2).get_exefilename() ==
                        sample2_db.filename)

                for similar_func in bindiff_db.get_similar_functions(
                        min_similarity=0.6,
                        min_confidence=0.5,
                        min_instructions=50,
                        min_bbs=3,
                        min_edges=4,
                        limit=10):
                    # TODO: The weight might need to be tuned
                    weight = (similar_func["similarity"] *
                              similar_func["confidence"])
                    graph.add_edge((sample1_db, int(similar_func["address1"])),
                                   (sample2_db, int(similar_func["address2"])),
                                   weight=weight)

            # What we want here is to find subgraphs of the graph with a high accumulated weight
            log.debug("Finding connected subgraphs in the graph")
            subgraphs = []
            processed_nodes = set()

            for node in graph.nodes():
                if node in processed_nodes:
                    continue

                # Binaries already represented in this subgraph
                binaries = {node[0]}
                nodes_to_explore = {node}
                subgraph = Graph()
                while nodes_to_explore:
                    cur_node = nodes_to_explore.pop()

                    for edge in graph.edges(cur_node):
                        assert (edge[0] == cur_node)

                        # We don't want the same binary twice in our subgraph
                        if edge[1][0] in binaries:
                            continue

                        # We don't want cycles in our subgraph
                        if edge[1] in subgraph.nodes():
                            continue

                        # This should happen only in obscure cases where we
                        # stopped at a node because of the same binary
                        # occurring twice (with different functions)
                        if edge[1] in processed_nodes:
                            continue

                        subgraph.add_edge(*edge)
                        binaries.add(edge[1][0])
                        nodes_to_explore.add(edge[1])

                    processed_nodes.add(cur_node)

                subgraphs.append(subgraph)

            if not subgraphs:
                log.info(
                    "No connected subgraphs found in the function similarity graph"
                )
                return {
                    "signatures": [],
                    "message":
                    "Cannot find common functions within the binaries"
                }

            log.debug("Determining maximal subgraph among %d subgraphs",
                      len(subgraphs))
            max_score = float("-infinity")
            max_subgraph = None
            for subgraph in subgraphs:
                # Use the sum of edge weights as the score: bigger subgraphs
                # naturally score higher, which is what we want. Divide by
                # len(subgraph.edges()) here to score by the average instead.
                score = sum(
                    graph.get_edge_data(*x)["weight"]
                    for x in subgraph.edges())
                if score > max_score:
                    max_score = score
                    max_subgraph = subgraph

            if max_subgraph is None:
                log.info("No maximal subgraph found")
                return {
                    "signatures": [],
                    "message": "No maximal subgraph found"
                }

            log.debug("Found maximal subgraph with %d nodes, score %f",
                      len(max_subgraph.nodes()),
                      int(max_score * 100) / 100.0)

            # Right now, we'll just generate a signature for the subgraph with
            # the biggest score. In the future, we could try to find a set of
            # subgraphs which cover the cluster best.

            log.debug("Getting binary code for maximal subgraph")
            # Get the code for each function
            functions_code = []
            for sample_db, function_ep in max_subgraph.nodes():
                function = sample_db.get_function(function_ep)
                log.debug("Function %s:%d has %d chunks", sample_db.sha256,
                          function_ep, len(list(function.chunks)))
                # TODO: This is wrong: you cannot simply concatenate the
                # chunks. If there are gaps between them, a '*' operator needs
                # to be inserted in the final signature.
                functions_code.append("".join(chunk.bytes
                                              for chunk in function.chunks))
            log.debug("Longest code sequence is %s bytes",
                      max(len(x) for x in functions_code))

            log.debug("Finding common subsequence in binary code")
            common_seq = hamming_klcs(functions_code)
            # TODO: Shorten the sequence to the maximum acceptable length of
            # 980 bytes (the ClamAV ndb signature length limit); for now we
            # trim to 950 by deleting random characters.
            while len(common_seq) > 950:
                kill_character = random.randint(0, len(common_seq) - 1)
                common_seq = common_seq[:kill_character] + common_seq[
                    kill_character + 1:]
            ndb_signature = ndb_from_common_sequence(functions_code,
                                                     common_seq)

            log.debug("Found ndb signature: '%s'", ndb_signature)

            if ndb_signature:
                # TODO: Make nice name
                name = get_VT_name([sample.sha256 for sample in job.samples])
                signature = {
                    "type": "ndb",
                    "signature": "{}:1:*:{}".format(name, ndb_signature)
                }
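                # ClamAV ndb format: Name:TargetType:Offset:HexSignature,
                # where TargetType 1 means PE and '*' means any offset.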
                num_triggering_samples = get_num_triggering_samples(
                    signature, [sample.path for sample in job.samples])
                log.debug("Signature triggered by %d samples (%.2f %%)",
                          num_triggering_samples,
                          100.0 * num_triggering_samples / len(job.samples))
                return {
                    "signatures": [{
                        "signature": signature,
                        "metrics": {
                            "coverage":
                            1.0 * num_triggering_samples / len(job.samples),
                            "num_triggering_samples":
                            num_triggering_samples
                        }
                    }],
                    "message":
                    "Found signature"
                }
            else:
                return {
                    "signatures": [],
                    "message": "Did not find a common sequence between code"
                }
        finally:
            for path in temporary_paths:
                if os.path.isdir(path):
                    shutil.rmtree(path)
                else:
                    os.unlink(path)
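
The subgraph walk above is easier to follow in isolation. Below is a minimal, self-contained sketch of the same idea, assuming Graph is networkx.Graph (consistent with the edges() and get_edge_data() calls in the listing); the toy node labels and weights are invented for illustration and are not part of BASS.

from networkx import Graph

def connected_subgraphs(graph):
    """Collect connected subgraphs, allowing each binary at most once per subgraph."""
    subgraphs = []
    processed = set()
    for node in graph.nodes():
        if node in processed:
            continue
        binaries = {node[0]}  # binaries already present in this subgraph
        to_explore = {node}
        subgraph = Graph()
        while to_explore:
            cur = to_explore.pop()
            for _, neighbor in graph.edges(cur):
                if (neighbor[0] in binaries or neighbor in subgraph.nodes()
                        or neighbor in processed):
                    continue
                subgraph.add_edge(cur, neighbor)
                binaries.add(neighbor[0])
                to_explore.add(neighbor)
            processed.add(cur)
        subgraphs.append(subgraph)
    return subgraphs

# Toy graph: nodes are (binary, function_address) pairs.
g = Graph()
g.add_edge(("bin_a", 0x1000), ("bin_b", 0x2000), weight=0.9)
g.add_edge(("bin_b", 0x2000), ("bin_c", 0x3000), weight=0.7)
g.add_edge(("bin_a", 0x4000), ("bin_b", 0x5000), weight=0.4)

best = max(connected_subgraphs(g),
           key=lambda s: sum(g.get_edge_data(*e)["weight"] for e in s.edges()))
print(sorted(best.nodes()))  # the highest-scoring component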
Example #2
if include_regex:
    for entry in matches:
        print("match: {0}".format(entry))
    for entry in misses:
        print("miss: {0}".format(entry))
    print("Matches: {0}, Misses: {1}\n".format(len(matches), len(misses)))
    if len(matches) == 0:
        print("not going to generate sig: no files matched the regex")
        sys.exit(-1)
    elif len(matches) == 1:
        print("not going to generate sig: only one file matched the regex {0}".format(matches[0]))
        sys.exit(-1)
print("finding klcs")
common = hamming_klcs(target_strings)
print("steaming dem clams")
ndb = ndb_from_common_sequence(target_strings, common)

if not options.lwsom:
    # Drop '*'-separated fragments that encode only whitespace bytes
    # (00/09/0a/0b/0c/0d/20); such fragments would match far too generically.
    disass = ndb.split('*')
    reass = []
    for entry in disass:
        if re.match("^(?:0[09abcd]|20)+$", entry) is None:
            reass.append(entry)
    ndb = "*".join(reass)


def replacenulls(nullmatch):
    # Keep the first and last hex byte pair of a matched null run and replace
    # the middle with a placeholder, presumably rewritten later into ClamAV's
    # {n} wildcard syntax; note that nullseqlen counts hex characters, not bytes.
    nullseqlen = len(nullmatch) - 4
    return "%sclamnullseqrep{%s}%s" % (nullmatch[0:2], nullseqlen, nullmatch[-2:])

def addmatch(match):
Example #3
def test_clamav_ndb_from_common_sequence():
    a = "aaxxbbyycc"
    b = "aaggbbhhcc"
    common = "aabbcc"
    ndb = ndb_from_common_sequence([a, b], common)
    assert(ndb == "6161*6262*6363")
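
For intuition, here is a naive reference sketch of the behavior this test exercises; it is not BASS's actual implementation. It greedily matches the common subsequence left to right in every input and starts a new '*'-separated segment whenever the matched characters are not adjacent in all inputs; each segment is then hex-encoded.

import binascii

def naive_ndb_from_common_sequence(strings, common):
    # Index of the previously matched character in each input string.
    prev = [-1] * len(strings)
    segments = [""]
    for ch in common:
        # Greedy leftmost match of ch after the previous match in each input.
        positions = [s.index(ch, p + 1) for s, p in zip(strings, prev)]
        # If ch is not adjacent to the previous match in every input, start a
        # new '*'-separated segment.
        if prev[0] >= 0 and any(p != q + 1 for p, q in zip(positions, prev)):
            segments.append("")
        segments[-1] += ch
        prev = positions
    return "*".join(binascii.hexlify(seg.encode()).decode() for seg in segments)

assert naive_ndb_from_common_sequence(["aaxxbbyycc", "aaggbbhhcc"],
                                      "aabbcc") == "6161*6262*6363"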
Example #4
File: core.py Project: 5l1v3r1/BASS-1
    def _build_bindiff_lcs_signature(self, job):
        log.info("Building a Bindiff/LCS signature for job %d", job.id)
        temporary_paths = []
        try:
            ida_pickle_dbs = list(self.idb_executor.map(lambda sample: self.ida.bindiff_pickle_export(sample.path, sample.info[FileTypeInspector.NAME]["bits"] == 64), job.samples))
            log.info("ida_pickle_dbs: %s", str(ida_pickle_dbs))
            binexport_dbs = [binexport_db for binexport_db, _ in ida_pickle_dbs]
            log.info("binexport_dbs: %s", str(binexport_dbs))
            pickle_dbs = [pickle_db for _, pickle_db in ida_pickle_dbs]
            
            temporary_paths += binexport_dbs
            temporary_paths += pickle_dbs

            # TODO: Find a clever way of generating few bindiff comparisons
            binexport_pairs = list(itertools.combinations(binexport_dbs, 2))
            pickle_pairs = list(itertools.combinations(pickle_dbs, 2))
            log.debug("Comparing %d pairs of binaries with each other", len(pickle_pairs))
            bindiff_dbs = list(self.bindiff_executor.map(lambda x: self.bindiff.compare(*x), binexport_pairs))
            temporary_paths += bindiff_dbs

            log.debug("Building graph of similar functions")
            graph = Graph()

            # PE check: the API-based filtering below applies only when every
            # sample in the job is a PE file.
            all_pe = all(x.info[FileTypeInspector.NAME]["type"] == FileTypeInspector.TYPE_PE for x in job.samples)
            if all_pe:
                # Loading the JSON file containing the API categories
                log.debug("Loading JSON file containing APIs")
                with open("api_db.json") as h:
                    json_apis = json.load(h)

            for bindiff_db_path, (sample1_pickle_db_path, sample2_pickle_db_path) in zip(bindiff_dbs, pickle_pairs):
                sample1_db = Database.load(sample1_pickle_db_path)
                sample2_db = Database.load(sample2_pickle_db_path)
                bindiff_db = BinDiffDb(bindiff_db_path)

                assert(bindiff_db.get_binary(1).get_exefilename() == sample1_db.filename)
                assert(bindiff_db.get_binary(2).get_exefilename() == sample2_db.filename)

                # Useful for debugging
                for f in sample1_db.functions:
                    log.info("%s: %s" % (f.name, f.data["is_library_function"]))

                # Default similarity thresholds
                for similar_func in bindiff_db.get_similar_functions(min_similarity=0.6,
                                                                     min_confidence=0.5,
                                                                     min_instructions=50,
                                                                     min_bbs=3,
                                                                     min_edges=4,
                                                                     limit=10):

                    # Filtering known functions
                    cur_f1 = sample1_db.get_function(int(similar_func["address1"]))
                    cur_f2 = sample2_db.get_function(int(similar_func["address2"]))
                    if cur_f1.is_library_function or cur_f2.is_library_function:
                        log.info("Skipping function - Reason: library function")
                        continue
        
                    # Filtering by name
                    if not cur_f1.name.startswith("sub_") or not cur_f2.name.startswith("sub_"):
                        log.info("Skipping function - Reason: name")
                        # Debug
                        log.info("%s - %s" % (cur_f1.name, cur_f2.name))
                        continue
                   
                    # Filtering by APIs list
                    if all_pe:
                        if cur_f1.apis != cur_f2.apis:
                            log.info("Skipping function - Reason: different apis")
                            # Debug
                            log.info(cur_f1.apis)
                            log.info(cur_f2.apis)
                            continue
                    
                    # Filtering by size: require at least 10 basic blocks
                    if similar_func["basicblocks"] < 10:
                        log.info("Skipping function - Reason: length")
                        continue

                    # Collecting stats
                    if all_pe:
                        msvcrt = 0
                        mutex = 0
                        antidbg = 0
                        apialert = 0
                        for a in cur_f1.apis:
                            if a in json_apis["msvcrt"]:
                                msvcrt += 1
                            elif a in json_apis["mutex"]:
                                mutex += 1
                            elif a in json_apis["antidbg"]:
                                antidbg += 1
                            elif a in json_apis["apialert"]:
                                apialert += 1

                        # Printing stats for debugging
                        log.info(cur_f1.apis)
                        log.info("MSVCRT: %d, MUTEX: %d, ANTIDBG: %d, APIALERT: %d",
                                 msvcrt, mutex, antidbg, apialert)

                    # TODO: The weight might need to be tuned - Proposal 1
                    base_weight = similar_func["similarity"] * similar_func["confidence"]
                    if all_pe:
                        weight = base_weight + antidbg + apialert + mutex - msvcrt
                    else:
                        weight = base_weight

                    # Debug print
                    log.info("DEBUG WEIGHT: %s", weight)
                    log.info(similar_func)

                    graph.add_edge((sample1_db, int(similar_func["address1"])),
                                   (sample2_db, int(similar_func["address2"])),
                                   weight=weight)

            # What we want here is to find subgraphs of the graph with a high accumulated weight
            log.debug("Finding connected subgraphs in the graph")
            subgraphs = []
            processed_nodes = set()

            for node in graph.nodes():
                if node in processed_nodes:
                    continue

                # Binaries already represented in this subgraph
                binaries = {node[0]}
                nodes_to_explore = {node}
                subgraph = Graph()
                while nodes_to_explore:
                    cur_node = nodes_to_explore.pop()

                    for edge in graph.edges(cur_node):
                        assert(edge[0] == cur_node)

                        # We don't want the same binary twice in our subgraph
                        if edge[1][0] in binaries:
                            continue

                        # We don't want cycles in our subgraph
                        if edge[1] in subgraph.nodes():
                            continue

                        # This should happen only in obscure cases where we
                        # stopped at a node because of the same binary
                        # occurring twice (with different functions)
                        if edge[1] in processed_nodes:
                            continue

                        subgraph.add_edge(*edge)
                        binaries.add(edge[1][0])
                        nodes_to_explore.add(edge[1])

                    processed_nodes.add(cur_node)

                subgraphs.append(subgraph)

            if not subgraphs:
                log.info("No connected subgraphs in the function similarity graph found")
                return {"signatures": [], "message": "Cannot find common functions within the binaries"}

            
            log.debug("Determining maximal subgraph among %d subgraphs", len(subgraphs))
            max_score = float("-infinity")
            max_subgraph = None
            for subgraph in subgraphs:
                # Use the sum of edge weights as the score: bigger subgraphs
                # naturally score higher, which is what we want. Divide by
                # len(subgraph.edges()) here to score by the average instead.
                score = sum(graph.get_edge_data(*x)["weight"] for x in subgraph.edges())
                if score > max_score:
                    max_score = score
                    max_subgraph = subgraph
            
            if max_subgraph is None:
                log.info("No maximal subgraph found")
                return {"signatures": [], "message": "No maximal subgraph found"}

            log.debug("Found maximal subgraph with %d nodes, score %f", len(max_subgraph.nodes()), int(max_score * 100) / 100.0)

            # Right now, we'll just generate a signature for the subgraph with
            # the biggest score. In the future, we could try to find a set of
            # subgraphs which cover the cluster best.

            log.debug("Getting binary code for maximal subgraph")
            # Get the code for each function
            functions_code = []
            for sample_db, function_ep in max_subgraph.nodes():
                function = sample_db.get_function(function_ep)
                log.debug("Function %s:%d has %d chunks", sample_db.sha256, function_ep, len(list(function.chunks)))
                # TODO: This is wrong: you cannot simply concatenate the
                # chunks. If there are gaps between them, a '*' operator needs
                # to be inserted in the final signature.
                if not self.whitelist.find_raw(sample_db, entry_points=[function_ep]):
                    functions_code.append("".join(chunk.bytes for chunk in function.chunks))
            log.debug("Longest code sequence is %d bytes", max(len(x) for x in functions_code))

            log.debug("Finding common subsequence in binary code")
            common_seq = hamming_klcs(functions_code)
            # TODO: Shorten the sequence to the maximum acceptable length of
            # 980 bytes (the ClamAV ndb signature length limit); for now we
            # trim to 950 by deleting random characters.
            while len(common_seq) > 950:
                kill_character = random.randint(0, len(common_seq) - 1)
                common_seq = common_seq[:kill_character] + common_seq[kill_character + 1:]
            ndb_signature = ndb_from_common_sequence(functions_code, common_seq)

            log.debug("Found ndb signature: '%s'", ndb_signature)

            if ndb_signature:
                # TODO: Make nice name
                name = get_VT_name([sample.sha256 for sample in job.samples])
                signature = {"type": "ndb", "signature":  "{}:1:*:{}".format(name, ndb_signature)}
                num_triggering_samples = get_num_triggering_samples(signature, [sample.path for sample in job.samples])
                log.debug("Signature triggered by %d samples (%.02f %%)",
                    num_triggering_samples, 
                    int(10000.0 * num_triggering_samples / len(job.samples)) / 100.0)
                return {"signatures": 
                            [
                                {"signature": signature,
                                 "metrics":
                                    {"coverage": 1.0 * num_triggering_samples / len(job.samples),
                                     "num_triggering_samples": num_triggering_samples}
                                }
                            ],
                         "message": "Found signature"
                        }
            else:
                return {"signatures": [], "message": "Did not find a common sequence between code"}
        finally:
            for path in temporary_paths:
                if os.path.isdir(path):
                    shutil.rmtree(path)
                else:
                    os.unlink(path)
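
A side note on the shortening loop both versions share: deleting one random character per iteration is quadratic in the overage. The one-shot sketch below (keeping a random sorted subset of positions) is an assumed, behavior-equivalent alternative for str input; either way, a subsequence of a common subsequence is still common to all inputs, so the trimmed sequence remains a valid argument for ndb_from_common_sequence.

import random

def shorten_common_seq(seq, max_len=950):
    """Randomly drop elements until seq fits the cap, preserving order."""
    if len(seq) <= max_len:
        return seq
    keep = sorted(random.sample(range(len(seq)), max_len))
    return "".join(seq[i] for i in keep)

print(len(shorten_common_seq("x" * 2000)))  # -> 950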