Example #1
import argparse
import os

from apiscout.ApiScout import ApiScout

# get_base_addr(), get_all_db_files(), and get_winapi1024_path() are helpers
# defined elsewhere in the originating script.

def main():
    parser = argparse.ArgumentParser(description='Demo: Use apiscout with a prepared api database (created using DatabaseBuilder.py) to crawl a dump for imports and render the results.')
    parser.add_argument('-f', '--filter', type=int, default=0, help='Filter out APIs that do not have a neighbour within N bytes.')
    parser.add_argument('-i', '--ignore_aslr', action='store_true', help='Do not apply the per-module ASLR offset potentially contained in an API DB file.')
    parser.add_argument('-c', '--collection_file', type=str, default='', help='Optionally match the output against a WinApi1024 vector collection file.')
    parser.add_argument('-b', '--base_addr', type=str, default='', help='Set base address to given value (int or 0x-hex format).')
    parser.add_argument('-t', '--import_table_only', action='store_true', help='Do not crawl for API references but only parse the import table instead - assumes an unmapped PE file as input.')
    parser.add_argument('binary_path', type=str, default='', help='Path to the memory dump to crawl.')
    parser.add_argument('db_path', type=str, nargs='*', help='Path to the DB(s). If no argument is given, use all files found in "./dbs"')

    args = parser.parse_args()
    if args.binary_path:
        binary = b""
        if os.path.isfile(args.binary_path):
            with open(args.binary_path, "rb") as f_binary:
                binary = f_binary.read()
        scout = ApiScout()
        base_addr = get_base_addr(args)
        print("Using base adress 0x{:x} to infer reference counts.".format(base_addr))
        scout.setBaseAddress(base_addr)
        # override potential ASLR offsets that are stored in the API DB files.
        scout.ignoreAslrOffsets(args.ignore_aslr)
        # load DB file
        db_paths = []
        if args.db_path:
            db_paths = args.db_path
        elif not args.import_table_only:
            db_paths = get_all_db_files()
        for db_path in db_paths:
            scout.loadDbFile(db_path)
        # load WinApi1024 vector
        scout.loadWinApi1024(get_winapi1024_path())
        # scout the binary
        results = {}
        if args.import_table_only:
            print("Parsing Import Table for\n  {}.".format(args.binary_path))
            results = scout.evaluateImportTable(binary, is_unmapped=True)
        else:
            print("Using \n  {}\nto analyze\n  {}.".format("\n  ".join(db_paths), args.binary_path))
            num_apis_loaded = scout.getNumApisLoaded()
            filter_info = " - neighbour filter: 0x%x" % args.filter if args.filter else ""
            print("Buffer size is {} bytes, {} APIs loaded{}.\n".format(len(binary), num_apis_loaded, filter_info))
            results = scout.crawl(binary)
        filtered_results = scout.filter(results, 0, 0, args.filter)
        print(scout.render(filtered_results))
        print(scout.renderVectorResults(filtered_results))
        if args.collection_file:
            print(scout.renderResultsVsCollection(filtered_results, args.collection_file))
    else:
        parser.print_help()
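
For reference, a minimal programmatic sketch of the same flow main() drives from the command line; it assumes apiscout is installed, and the DB path below is a hypothetical placeholder.

from apiscout.ApiScout import ApiScout

def quick_scan(dump_path, db_path="dbs/win7_sp1.json"):
    # Read the memory dump and crawl it for API references.
    with open(dump_path, "rb") as f_dump:
        binary = f_dump.read()
    scout = ApiScout()
    scout.setBaseAddress(0)    # no base address known in this sketch
    scout.loadDbFile(db_path)  # API DB created with DatabaseBuilder.py
    results = scout.crawl(binary)
    return scout.render(scout.filter(results, 0, 0, 0))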
Example #2
import hashlib
import json
import os
import re

import magic
import pefile
import peutils
import pyimpfuzzy
from py2neo import Graph

from apiscout.ApiScout import ApiScout
from pylouvain import PyLouvain  # module path for PyLouvain may differ locally


class Api():
    """Stores and analyze malwares info in neo4j"""
    def __init__(self,
                 host,
                 port,
                 user,
                 password,
                 threshold=40,
                 secure=False,
                 filepath=None,
                 filename=None,
                 folder_path=None):
        """Connects to neo4j database, loads options and set connectors.
        @raise CuckooReportError: if unable to connect.
        """
        self.threshold = int(threshold)
        self.graph = Graph(host=host,
                           user=user,
                           password=password,
                           secure=secure,
                           port=port)
        self.filepath = filepath
        self.filename = filename
        self.folder_path = folder_path
        self.scout = ApiScout()
        self.scout.setBaseAddress(0)
        self.scout.loadWinApi1024(
            os.path.join(os.path.abspath(os.path.dirname(__file__)),
                         "data", "winapi1024v1.txt"))

        self.magictest = magic.Magic(uncompress=True)
        CWD = os.path.abspath(os.path.dirname(__file__))
        USERDB = os.path.join(CWD, os.path.normpath("data/UserDB.TXT"))
        with open(USERDB, 'rt') as f:
            sig_data = f.read()
            self.signatures = peutils.SignatureDatabase(data=sig_data)

        if self.folder_path:
            self.files = self.get_files(folder_path)

    def check_file(self, f):
        # Only process plain PE32 files; skip self-extracting archives.
        file_type = magic.from_file(f)
        if file_type.find('PE32') == -1:
            return False
        if file_type.find('self-extracting') != -1:
            return False
        try:
            pe = pefile.PE(f)
            # Skip files whose entry point matches a known packer signature.
            matches = self.signatures.match_all(pe, ep_only=True)
            if matches:
                return False
            return True
        except Exception:
            return False

    def get_files(self, folder_path):
        files_end = []

        files = []
        for root, dirnames, filenames in os.walk(folder_path):
            for filename in filenames:
                files.append(os.path.join(root, filename))

        for filepath in files:
            if not self.check_file(filepath):
                continue
            # The family label lives in a JSON file named after its containing
            # directory, either one or two levels above the sample.
            parts = filepath.split("/")
            json_path = "/".join(parts[:-2]) + "/" + parts[-3] + ".json"
            if not os.path.exists(json_path):
                json_path = "/".join(parts[:-1]) + "/" + parts[-2] + ".json"
            if not os.path.exists(json_path):
                continue
            with open(json_path, 'r') as f:
                file_family = json.load(f).get('common_name').replace(" ", "_")
            files_end.append((filepath, file_family))

        print(len(files_end), "files to load")
        return files_end
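
    # Hedged illustration (names are placeholders) of the layout get_files()
    # expects: a JSON file named after its containing directory, holding a
    # "common_name" key, sits next to the samples or one level above them:
    #
    #   corpus/
    #       emotet/
    #           emotet.json        # {"common_name": "Emotet"}
    #           unpacked/
    #               a1b2c3.bin     # family file found one level above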

    def get_digest(self, file):
        """ return hash, impuzzy and scout """
        md5 = hashlib.md5()
        sha1 = hashlib.sha1()
        sha256 = hashlib.sha256()

        try:
            impfuzzy = pyimpfuzzy.get_impfuzzy(file)
        except Exception:
            impfuzzy = ""

        binary = b""
        if os.path.isfile(file):
            with open(file, "rb") as f_binary:
                binary = f_binary.read()
        try:
            scout_ev = self.scout.evaluateImportTable(binary, is_unmapped=True)
            scout_result = self.scout.getWinApi1024Vectors(scout_ev).get(
                'import_table', {}).get('vector', None)
            scout_confidence = self.scout._apivector.getVectorConfidence(
                scout_result)
        except Exception:
            # Keep track of samples whose import table could not be vectorized.
            with open('fail_list.txt', 'a') as f:
                f.write(file + "\n")
            scout_result = None
            scout_confidence = None

        with open(file, "rb") as f:
            while True:
                buf = f.read(2047)
                if not buf:
                    break
                md5.update(buf)
                sha1.update(buf)
                sha256.update(buf)

        return (scout_result, impfuzzy, md5.hexdigest(), sha1.hexdigest(),
                sha256.hexdigest(), scout_confidence)
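
    # Hedged usage sketch (sample path is a placeholder): get_digest() returns
    # six values in this order:
    #
    #   vector, impfuzzy, md5, sha1, sha256, confidence = api.get_digest(
    #       "/samples/a1b2c3.bin")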

    def impfuzzy_comp(self, list_old, list_new):
        # Matches a well-formed ssdeep hash: blocksize:hash:hash.
        ssdeep = re.compile(r"^[0-9]{1,5}:[0-9a-zA-Z/+]+:[0-9a-zA-Z/+]+$",
                            re.DOTALL)
        complist = []
        list_len = len(list_new)
        i = 0
        # Compare every new item against the remaining new items.
        for item_new in list_new:
            i += 1
            if re.search(ssdeep, item_new[2]) and len(item_new[2]) < 150:
                for j in range(i, list_len):
                    if re.search(ssdeep,
                                 list_new[j][2]) and len(list_new[j][2]) < 150:
                        complist.append([
                            item_new[0], list_new[j][0],
                            pyimpfuzzy.hash_compare(item_new[2],
                                                    list_new[j][2])
                        ])

        # Compare every new item against the items already in the database.
        if list_old:
            for item_new in list_new:
                if re.search(ssdeep, item_new[2]) and len(item_new[2]) < 150:
                    for item in list_old:
                        if re.search(ssdeep, item[2]) and len(item[2]) < 150:
                            complist.append([
                                item_new[0], item[0],
                                pyimpfuzzy.hash_compare(item_new[2], item[2])
                            ])

        return complist
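
    # The core call impfuzzy_comp() builds on: pyimpfuzzy.hash_compare scores
    # two impfuzzy (ssdeep-over-import-table) hashes on a 0-100 scale, e.g.:
    #
    #   score = pyimpfuzzy.hash_compare(hash_a, hash_b)  # placeholder hashes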

    def scout_comp(self, list_old, list_new):
        complist = []
        list_len = len(list_new)
        i = 0
        # Pairwise similarity among the new items (upper triangle only).
        for item_new in list_new:
            i += 1
            for j in range(i, list_len):
                complist.append([
                    item_new[0], list_new[j][0],
                    int(
                        self.scout.matchVectors(item_new[3], list_new[j][3]) *
                        100)
                ])
        # Similarity of each new item against the existing database entries.
        for item_new in list_new:
            for item in list_old:
                complist.append([
                    item_new[0], item[0],
                    int(self.scout.matchVectors(item_new[3], item[3]) * 100)
                ])
        return complist
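
    # Likewise, the similarity primitive scout_comp() wraps: matchVectors()
    # appears to return a float in [0, 1], scaled here to an integer
    # percentage (vector strings below are placeholders):
    #
    #   score = int(self.scout.matchVectors(vector_a, vector_b) * 100)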

    def process(self):

        hashlist = []
        hashlist_new = []
        nodes = []
        edges = []
        relationships = []

        # Recover all malware nodes currently stored in the graph.
        database = self.graph.run(
            "MATCH (m:Malware) RETURN m.id, m.name, m.impfuzzy, m.scout_result, m.scout_confidence, m.md5, m.sha1, m.sha256, m.tag"
        ).data()
        if database:
            for d in database:
                hashlist.append([
                    d["m.id"], d["m.name"], d["m.impfuzzy"],
                    d["m.scout_result"], d["m.scout_confidence"], d["m.md5"],
                    d["m.sha1"], d["m.sha256"], d["m.tag"]
                ])

        nodes_count = len(database)
        i = nodes_count

        relation_data = self.graph.run(
            "MATCH (m1:Malware)-[s:same]-(m2:Malware) RETURN m1.id, m2.id, s.value"
        ).data()
        if relation_data:
            for r in relation_data:
                relationships.append([r["m1.id"], r["m2.id"], r["s.value"]])
        for x in range(nodes_count):
            nodes.append(x)

        # Batch mode: run the check for each collected file.
        if self.folder_path:
            for item in self.files:
                scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(
                    item[0])
                # Skip samples without a usable vector ('A171' encodes an
                # all-zero WinApi1024 vector).
                if scout_result in ("", 'A171', None):
                    continue

                query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256
                objs = self.graph.run(query).data()
                if not objs and sha256 not in [x[5] for x in hashlist_new]:
                    nodes.append(i)
                    hashlist_new.append([
                        i, item[0].split("/")[-1], impfuzzy, scout_result,
                        scout_confidence, md5, sha1, sha256, item[1]
                    ])
                    i += 1
                else:
                    continue
        else:
            # Single-file mode: we are in the reporting module, and the file
            # needs a valid apiscout vector to be processed.
            if self.check_file(self.filepath):
                scout_result, impfuzzy, md5, sha1, sha256, scout_confidence = self.get_digest(
                    self.filepath)
                if scout_result in ("", 'A171', None):
                    return {}
            else:
                return {}

            query = "MATCH (m:Malware) WHERE m.sha256=\"%s\" RETURN m" % sha256

            objs = self.graph.run(query).data()
            if not objs:
                nodes.append(nodes_count)
                hashlist_new.append([
                    nodes_count, self.filename, impfuzzy, scout_result,
                    scout_confidence, md5, sha1, sha256, None
                ])
            else:
                return self.search_hash(sha256)

        # Calculate apiscout correlation
        result_list = self.scout_comp(hashlist, hashlist_new)

        # Re-cluster only when new nodes were added; edges below the threshold
        # are added with weight 0 rather than dropped.
        partition = []
        if len(database) != len(nodes):
            for edge in result_list + relationships:
                if edge[2] > self.threshold:
                    edges.append([[edge[0], edge[1]], edge[2]])
                else:
                    edges.append([[edge[0], edge[1]], 0])
            pyl = PyLouvain(nodes, edges)
            partition, modularity = pyl.apply_method()

        # Create nodes; statement_c is a Cypher template defined elsewhere in
        # the originating module.
        tx = self.graph.begin()

        for entry in hashlist_new + hashlist:
            i = 0
            for a in partition:
                i += 1
                if entry[0] in a:
                    tx.append(
                        statement_c, {
                            "id": entry[0],
                            "name": entry[1],
                            "impfuzzy": entry[2],
                            "scout_result": entry[3],
                            "scout_confidence": entry[4],
                            "md5": entry[5],
                            "sha1": entry[6],
                            "sha256": entry[7],
                            "tag": entry[8],
                            "cluster": i
                        })

        # Create relationships; statement_r is likewise defined elsewhere.
        for result in result_list:
            if result[2] > self.threshold:
                tx.append(statement_r, {
                    "id1": result[0],
                    "id2": result[1],
                    "value_scout": result[2]
                })

        tx.process()
        tx.commit()

        # In the single-file case, return the report for the new sample.
        if self.filename:
            return self.search_hash(sha256)
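
    # For context, a standalone sketch of the Louvain step used above (toy
    # graph; edge format is [[node_a, node_b], weight] as passed by process):
    #
    #   nodes = [0, 1, 2, 3]
    #   edges = [[[0, 1], 80], [[2, 3], 75], [[1, 2], 0]]
    #   pyl = PyLouvain(nodes, edges)
    #   partition, modularity = pyl.apply_method()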

    def process_file(self, filepath, filename):
        self.filepath = filepath
        self.filename = filename
        return self.process()

    def search_hash(self, data):

        return_dict = {}

        # identify hash type
        HASHES = (
            ("md5", "^[a-fA-F0-9]{32}$"),
            ("sha1", "^[a-fA-F0-9]{40}$"),
            ("sha256", "^[a-fA-F0-9]{64}$"),
        )
        res = None
        for hash_type, pattern in HASHES:
            if re.match(pattern, data):
                res = hash_type
                break
        # No hash type matched; nothing to search for.
        if res is None:
            return {}

        family_query = "MATCH (m1:Malware) WHERE m1.%s=\"%s\" MATCH (m1:Malware)-[s:same]-(m2:Malware) MATCH (m2:Malware) WHERE m2.cluster = m1.cluster RETURN distinct m2.tag as tag, max(s.value) as max order by max(s.value) desc" % (
            res, data)
        file_query = "MATCH (m1:Malware) WHERE m1.%s=\"%s\" MATCH (m1:Malware)-[s:same]-(m2:Malware) MATCH (m2:Malware) WHERE m2.cluster = m1.cluster RETURN m2.tag as tag, m2.sha256 as sha256, max(s.value) as max order by max(s.value) desc LIMIT 10" % (
            res, data)
        cluster_count_query = "MATCH (m1:Malware) where m1.%s=\"%s\" MATCH (m2:Malware)-[p:same]->(m3:Malware) where m2.cluster = m1.cluster and m3.cluster = m1.cluster RETURN count(p.value) as total" % (
            res, data)
        cluster_query = "MATCH (m1:Malware) where m1.%s=\"%s\" MATCH (m2:Malware)-[p:same]->(m3:Malware) where m2.cluster = m1.cluster and m3.cluster = m1.cluster RETURN m2.sha256, m2.tag, p.value, m3.sha256, m3.cluster, m3.tag" % (
            res, data)
        item_query = "MATCH (m:Malware) WHERE m.%s=\"%s\" RETURN m" % (res,
                                                                       data)
        item_data = self.graph.run(item_query).data()
        cluster_count_list = self.graph.run(cluster_count_query).data()

        cluster_count = cluster_count_list[0]['total'] if len(
            cluster_count_list) > 0 else None
        return_dict['info'] = item_data[0]['m'] if len(item_data) > 0 else None

        family_objs = self.graph.run(family_query).data()
        if family_objs:
            return_dict['families'] = family_objs
            return_dict['files'] = self.graph.run(file_query).data()
            if cluster_count and cluster_count < 100:
                return_dict['cluster'] = self.graph.run(cluster_query).data()
            else:
                return_dict['cluster_count'] = cluster_count
        return return_dict
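
A hedged end-to-end sketch of driving the class above; the connection parameters and file paths are placeholders and assume a reachable neo4j instance.

api = Api(host="localhost", port=7687, user="neo4j", password="secret",
          threshold=40)
report = api.process_file("/samples/a1b2c3.bin", "a1b2c3.bin")
print(report.get('families'))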