Example #1
    def get_unique_names(self, source):
        """usage: unique_names source_name"""

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()
        
        src_info = core.get_source_info(self.cnx, source)
        
        if src_info is None:
            print "Error: Source {} not found".format(source)
            return
            
        query = """
            SELECT fm.file_name, up.full_path 
            FROM file_metadata fm 
            LEFT JOIN FileNameFilter_unique_name fnfun
                ON fnfun.file_name = fm.file_name
                AND fnfun.unique_path_id = fm.unique_path_id
            LEFT JOIN unique_path up 
                ON up.id = fm.unique_path_id
            WHERE not isnull(fm.unique_file_id) 
                AND fnfun.count = 1
                AND fm.source_id = {}
        """.format(src_info.source_id)

        cursor.execute(query)
        data = cursor.fetchall()
                
        cursor.close()
        return data
Example #2
    def do_run_survey(self, line):
        '''
        run_survey <source_name>
            
        runs the survey function for the given source
        
        [source_name] - name of the source to process
        '''

        args = shlex.split(line)

        if len(args) < 1:
            print "Error: Incorrect # of arguments"
            return

        src_obj = core.get_source_info(self.cnx, args[0])

        if src_obj is None:
            print "Error: Unable to find source {}".format(args[0])
            return
        else:
            rpt = Report(self.cnx, src_obj)
            if len(args) > 1:
                rpt.run(args[1:])
            else:
                rpt.run(None)
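
The command handlers in this example follow Python's cmd.Cmd convention: the whole argument string arrives as a single line and is tokenized with shlex.split. Below is a minimal, self-contained sketch of that pattern; SurveyShell and its placeholder print are illustrative stand-ins, since the real Report and core objects are not part of this listing.

    import cmd
    import shlex

    class SurveyShell(cmd.Cmd):
        """Hypothetical shell illustrating the do_<command> pattern used above."""
        prompt = "(survey) "

        def do_run_survey(self, line):
            """run_survey <source_name> [filter ...]"""
            args = shlex.split(line)
            if len(args) < 1:
                print("Error: Incorrect # of arguments")
                return
            # Stand-in for Report(self.cnx, src_obj).run(...) in the example above.
            print("Would survey {} with filters {}".format(args[0], args[1:]))

        def do_quit(self, line):
            """quit the shell"""
            return True

    if __name__ == "__main__":
        SurveyShell().cmdloop()
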
Example #3
    def update(self, source):
        print "[+] FileName Filter running on {} ".format(source)

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()
        
        src_info = core.get_source_info(self.cnx, source)
        
        if src_info is None:
            print "Error: Source {} not found".format(source)
            return
            
        now = time.time()

        # self.cnx.autocommit(False)
        query = """
            INSERT INTO FileNameFilter_unique_name 
                (file_name, unique_path_id) 
                (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = {})
            ON DUPLICATE KEY UPDATE count = count + 1;
        """.format(src_info.source_id)
        cursor.execute(query)
        # self.cnx.autocommit(True)
        
        later = time.time()
        
        #print "Updated counts in {} secs\nUpdating Scores".format(later - now)

        cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name")
        (max_count,) = cursor.fetchone()
        
        now = time.time()
        query = """
            INSERT INTO FileNameFilter_scores
                (id, score) 
                (
                    SELECT
                        fm.unique_file_id, MIN(fnfun.count / {})
                    FROM FileNameFilter_unique_name fnfun
                    LEFT JOIN file_metadata fm 
                        ON fnfun.file_name = fm.file_name
                        AND fnfun.unique_path_id = fm.unique_path_id
                    WHERE not isnull(fm.unique_file_id)
                    GROUP BY fm.unique_file_id
                )
            ON DUPLICATE KEY UPDATE score = score
            """.format(max_count)
        cursor.execute(query)
        self.cnx.commit()
        later = time.time()
        #print "Scores updated in {} secs".format(later - now)
        cursor.close()
Example #4
    def discover_detect_anomalies(self, source, out=None):
        """
        Conducts an anomaly search on a given source

        :param source: source
        :param out: output file (optional)
        """
        
        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)
        
        if src_info is None:
            print "*** Error: Source not found"
            return

        #anomaly type:  low prevalence files in normally high prevalence directories
        print "...Anomaly Detection: Unique files in common areas"
         
        query = """
            SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, 
            unique_path.full_path, file_metadata.file_name
            FROM global_file_prevalence 
            LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
            LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id
            where file_metadata.source_id = {}
            HAVING difference > 0
            ORDER BY difference desc limit 0, 100
        """.format(src_info.source_id)


        cursor.execute(query)

        if out is None:
            results = cursor.fetchall()
            if results is None or len(results) == 0:
                print "No anomalies found"
            else:
                print "Showing top {} results".format(len(results))
                for x in results:
                    print x
            return results

        print "Writing results to {}".format(out)

        with open(out, "w") as f:
            v=0
            for x in cursor.fetchall():
                f.write("{}: {}    {}/{}\n".format(v, x[0], x[1], x[2]))
                v+=1
             
        cursor.close()
Example #5
    def discover_histogram_by_source(self, source_name, output=None):
        """
        Displays a histogram of the distribution of executable file extensions
        for a single source as it relates to all occurrences of that extension
        across all systems

        :param source_name: The name of the source
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running \"Histogram by Source\"...'

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Source {} does not exist".format(source_name)
            return

        num_systems = core.get_num_systems(self.cnx, src_info.os_id)
        bins = range(1, num_systems+2)

        # !! TODO
        query = """
            SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence
            LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}")
            GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        """.format(source_name)

        cursor.execute(query)

        data = cursor.fetchall()

        if data is None or len(data) == 0:
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int( float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111, title="File Executable Prevalence of {} with {}% > 1".format(src_info.source_name, perc))
        ax.hist(ranges, weights=counts, bins = bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        plt.xticks(bins)

        if output is None:
            plt.show()
        else:
            plt.savefig(output)
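
The plotting half of this function is independent of the database. A small sketch with made-up counts shows how ax.hist builds the prevalence histogram: each (prevalence, file count) pair becomes one weighted entry, so bar heights equal file counts. The numbers below are placeholders, not real query results.

    import matplotlib.pyplot as plt

    # Placeholders standing in for (COUNT(file_metadata.id), global_file_prevalence.count) rows
    counts = (5000, 1200, 300, 80)   # how many files have each prevalence
    ranges = (1, 2, 3, 4)            # on how many systems those files appear
    num_systems = 4
    bins = range(1, num_systems + 2)

    perc = int(float(sum(counts[1:])) / sum(counts) * 100)

    fig = plt.figure()
    ax = fig.add_subplot(111, title="File Prevalence with {}% > 1".format(perc))
    # weights= makes each bar's height equal to the file count for that prevalence bucket
    ax.hist(ranges, weights=counts, bins=list(bins))
    ax.set_xlabel("Num of Systems")
    ax.set_ylabel("File Occurrences")
    plt.xticks(list(bins))
    plt.show()
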
Example #6
    def discover_detect_anomalies(self, source, out):
        """
        Conducts an anomaly search on a given source

        :param source: source
        :param out: output file
        """

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "*** Error: Source not found"
            return

        # !! TODO
        #anomaly type:  low prevalence files in normally high prevalence directories
        print "Anomaly Detection: Unique files in common areas"
        print "running..."

        # !! TODO
        query = """
            SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, 
            unique_path.full_path, file_metadata.file_name
            FROM global_file_prevalence
            LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
            LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id
            where file_metadata.source_id = {}
            HAVING difference > 0
            ORDER BY difference desc limit 0, 100
        """.format(src_info.source_id)

        cursor.execute(query)

        if out is None:
            return cursor.fetchall()


        with open(out, "w") as f:
            v=0
            for x in cursor.fetchall():
                f.write("{}: {}    {}{}\n".format(v, x[0], x[1], x[2]))
                v+=1

        cursor.close()
Example #7
    def do_run_report_survey(self, source=None):
        '''[*] run_survey (optional) <source_name>
        |- runs the survey function for the given source
        |- if no source is provided, run_survey processes all sources
        |- [source_name] - optional name of the source to process'''
        rpt = Report(self.cnx)
        if source is None:
            sources = core.get_all_sources(self.cnx)
            for s in sources:
                print "Running report survey for: " + s.source_name
                rpt.run_filter_survey(s.source_name)
                rpt.generate_report(s)
        else:
            src = core.get_source_info(self.cnx, source)
            if src is None:
                print "Source " + source + " does not exist"
                return
            print "Running report survey for: " + src.source_name
            rpt.run_filter_survey(src.source_name)
            rpt.generate_report(src)
Example #8
    def get_unique_names(self, source):
        """usage: unique_names source_name"""

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "Error: Source {} not found".format(source)
            return

        #query = """
        #    SELECT fm.file_name, up.full_path
        #    FROM file_metadata fm
        #    LEFT JOIN FileNameFilter_unique_name fnfun
        #        ON fnfun.file_name = fm.file_name
        #        AND fnfun.unique_path_id = fm.unique_path_id
        #    LEFT JOIN unique_path up
        #        ON up.id = fm.unique_path_id
        #    WHERE not isnull(fm.unique_file_id)
        #        AND fnfun.count = 1
        #        AND fm.source_id = {}
        #""".format(src_info.source_id)
        cursor.execute("""
                       SELECT fm.file_name, up.full_path
                       FROM file_metadata fm
                       LEFT JOIN FileNameFilter_unique_name fnfun
                       ON fnfun.file_name = fm.file_name
                       AND fnfun.unique_path_id = fm.unique_path_id
                       LEFT JOIN unique_path up
                       ON up.id = fm.unique_path_id
                       WHERE not isnull(fm.unique_file_id)
                       AND fnfun.count = 1
                       AND fm.source_id = %s
                       """, (src_info.source_id,))
        data = cursor.fetchall()

        cursor.close()
        return data
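
This version replaces the .format-built SQL of Example #1 with a parameterized query, letting the driver handle quoting. The same idea in isolation, using only the standard-library sqlite3 module so the sketch runs without a MySQL server (sqlite3 spells the placeholder ? where MySQL Connector/Python uses %s):

    import sqlite3

    cnx = sqlite3.connect(":memory:")
    cursor = cnx.cursor()
    cursor.execute("CREATE TABLE file_metadata (file_name TEXT, source_id INTEGER)")
    cursor.executemany("INSERT INTO file_metadata VALUES (?, ?)",
                       [("hosts", 1), ("cmd.exe", 1), ("bash", 2)])

    source_id = 1
    # The value travels separately from the SQL text instead of being pasted in
    # with .format(), so escaping is the driver's job.
    cursor.execute("SELECT file_name FROM file_metadata WHERE source_id = ?",
                   (source_id,))
    print(cursor.fetchall())
    cursor.close()
    cnx.close()
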
Example #9
    def update(self, source):
        print "[+] FileName Filter running on {} ".format(source)

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "Error: Source {} not found".format(source)
            return

        now = time.time()

        # self.cnx.autocommit(False)
        #query = """
        #    INSERT INTO FileNameFilter_unique_name
        #        (file_name, unique_path_id)
        #        (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = {})
        #    ON DUPLICATE KEY UPDATE count = count + 1;
        #""".format(src_info.source_id)
        cursor.execute("""
                       INSERT INTO FileNameFilter_unique_name
                       (file_name, unique_path_id)
                       (SELECT file_name, unique_path_id
                        FROM file_metadata
                        WHERE file_name != "/" and source_id = %s)
                       ON DUPLICATE KEY UPDATE count = count + 1;
                       """, (src_info.source_id,))
        # self.cnx.autocommit(True)

        later = time.time()

        #print "Updated counts in {} secs\nUpdating Scores".format(later - now)

        cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name")
        (max_count,) = cursor.fetchone()

        now = time.time()
        #query = """
        #    INSERT INTO FileNameFilter_scores
        #        (id, score)
        #        (
        #            SELECT
        #                fm.unique_file_id, MIN(fnfun.count / {})
        #            FROM FileNameFilter_unique_name fnfun
        #            LEFT JOIN file_metadata fm
        #                ON fnfun.file_name = fm.file_name
        #                AND fnfun.unique_path_id = fm.unique_path_id
        #            WHERE not isnull(fm.unique_file_id)
        #            GROUP BY fm.unique_file_id
        #        )
        #    ON DUPLICATE KEY UPDATE score = score
        #    """.format(max_count)
        cursor.execute("""
                       INSERT INTO FileNameFilter_scores
                       (id, score)
                       (SELECT fm.unique_file_id, MIN(fnfun.count / %s)
                        FROM FileNameFilter_unique_name fnfun
                        LEFT JOIN file_metadata fm
                        ON fnfun.file_name = fm.file_name
                        AND fnfun.unique_path_id = fm.unique_path_id
                        WHERE not isnull(fm.unique_file_id)
                        GROUP BY fm.unique_file_id)
                       ON DUPLICATE KEY UPDATE score = score
                       """, (max_count,))
        self.cnx.commit()
        later = time.time()
        #print "Scores updated in {} secs".format(later - now)
        cursor.close()
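
The first INSERT above works as a counter: (file_name, unique_path_id) is the key, the first insert creates the row, and every duplicate bumps count instead of failing. A self-contained sqlite3 sketch of the same upsert-as-counter idea (SQLite 3.24+ spells it ON CONFLICT ... DO UPDATE rather than MySQL's ON DUPLICATE KEY UPDATE):

    import sqlite3

    cnx = sqlite3.connect(":memory:")
    cur = cnx.cursor()
    cur.execute("""CREATE TABLE unique_name (
                       file_name TEXT,
                       unique_path_id INTEGER,
                       count INTEGER DEFAULT 1,
                       PRIMARY KEY (file_name, unique_path_id))""")

    observations = [("hosts", 10), ("hosts", 10), ("cmd.exe", 11)]
    for file_name, path_id in observations:
        # First sighting inserts the row with count = 1, repeats increment it.
        cur.execute("""INSERT INTO unique_name (file_name, unique_path_id)
                       VALUES (?, ?)
                       ON CONFLICT(file_name, unique_path_id)
                       DO UPDATE SET count = count + 1""",
                    (file_name, path_id))

    # Expect ('hosts', 10) with count 2 and ('cmd.exe', 11) with count 1.
    print(cur.execute("SELECT * FROM unique_name").fetchall())
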
Example #10
    def discover_evaluate_dir(self, dir_name, source, num_clusters=DEFAULT_NUM_CLUSTERS):
        """
        Discovery function that applies kmeans clustering to a specified directory, displays
        the resulting scatter plot with the clusters, and then prints an ordered list of
        the files by their distance from their respective centroid. Currently,
        this function uses the two static features "modification date" and "inode number", but
        future versions will allow for dynamic feature inputs.

        :param dir_name: directory name to be analyzed (Required)
        :param source: source name to be analyzed (Required)
        :param num_clusters: number of clusters to use for kmeans (Default: 3)
        """

        num_features = 2
        num_clusters = int(num_clusters)
        cursor = self.cnx.cursor()
        
        if(dir_name.endswith('/')):
            dir_name = dir_name[:-1]

        print "...Running discovery function on source {} at directory {}".format(source, dir_name)
        
        src_info = core.get_source_info(self.cnx, source)
        if src_info is None:
            print "Error: Source {} does not exist".format(source)
            return

        

        #grab all files for a particular directory from a specific source
        hash_val = sha1(dir_name).hexdigest()
        
        query = """
            SELECT file_name, file_metadata_id, filesystem_id, last_modified
            FROM joined_file_metadata
            WHERE source_id ='{}' AND path_hash = '{}' AND file_name !='/'
            """.format(src_info.source_id, hash_val)

        cursor.execute(query)

        #bring all results into memory
        sql_results = cursor.fetchall()
      
        if(len(sql_results) == 0):
            return

        #zero out the array that will contain the inodes
        filesystem_id_arr = np.zeros((len(sql_results), num_features))

        i = 0
        for _, _,inode, mod_date in sql_results:
            seconds = calendar.timegm(mod_date.utctimetuple())
            filesystem_id_arr[i] = (inode, seconds)
            i += 1
        whitened = whiten(filesystem_id_arr) 
        #get the centroids
        codebook,_ = kmeans(whitened, num_clusters)
        code, dist = vq(whitened, codebook)
        d = defaultdict(int)

        #quick way to get count of cluster sizes        
        for c in code:
            d[c] += 1
      
        #sorts the codes and sql_results together as pairs
        combined = zip(dist, code, sql_results)

        #sort results by distances from centroid
        sorted_results =  sorted(combined, key=lambda tup: tup[0])

        for dist_val, c, r in sorted_results:
            print "Dist: {} Cluster: {}  Data: {}".format(dist_val,c,r)

        visual.visualize_scatter(d, code, whitened, codebook, num_clusters, "inode number", "modification datetime", dir_name)
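
The clustering itself is a three-call scipy pipeline: whiten scales each feature to unit variance, kmeans fits the centroids, and vq assigns every point to its nearest centroid and returns the distance used for ranking. A standalone sketch with made-up (inode, mtime-in-seconds) pairs standing in for the SQL rows:

    import numpy as np
    from collections import defaultdict
    from scipy.cluster.vq import whiten, kmeans, vq

    # Placeholder (inode, last_modified as epoch seconds) rows.
    features = np.array([
        [1001, 1500000000], [1002, 1500000060], [1003, 1500000120],
        [9001, 1300000000], [9002, 1300000100],
        [5000, 1700000000],
    ], dtype=float)

    num_clusters = 3
    whitened = whiten(features)                   # unit variance per feature
    codebook, _ = kmeans(whitened, num_clusters)  # centroids
    code, dist = vq(whitened, codebook)           # cluster id and distance per point

    cluster_sizes = defaultdict(int)
    for c in code:
        cluster_sizes[c] += 1

    # Sort by distance from the assigned centroid, as the example above does.
    for d, c, row in sorted(zip(dist, code, features.tolist())):
        print("Dist: {:.3f} Cluster: {}  Data: {}".format(d, c, row))
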
Example #11
    def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS):
        """
        Evaluates and scores a given source with a specified number of clusters for kmeans. Currently
        this function uses two fixed features as inputs (modification time and inode number); however,
        future versions will allow for dynamic feature inputs

        :param source_name: media source name
        :param num_clusters: number of clusters to input into kmeans (Default: 3)
        """

        cursor = self.cnx.cursor()
        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Error: Source {} not found".format(source_name)
            return

        #returns all files sorted by directory for the given source
        query = """
            SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash 
            FROM joined_file_metadata 
            where source_id = {} order by parent_id asc
            """.format(src_info.source_id)
       
        cursor.execute(query)
       
        files = list()

        print "...Beginning clustering analysis"
        pool = Pool(processes=4)              # start 4 worker processes
        manager = Manager()
        rows = manager.Queue()
        is_first = True
        
        parent_id_prev = None
        #should iterate by dir of a given source at this point
        for(file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash_val) in cursor:
            
            if is_first is True:
                is_first = False
                parent_id_prev = parent_id

            #if parent_id is diff than previous, we are in new directory, so pack it up for analysis
            if parent_id_prev != parent_id:
                
                parent_id_prev = parent_id
                
                if len(files) > 0:
                    pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])
                    files = list()
            
            #make sure to omit directories from the clustering analysis
            if file_name != '/' and hash_val != "":
                files.append((file_metadata_id, last_modified, full_path,file_name, filesystem_id, parent_id))
       
        if len(files) > 0:
            pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2])

        pool.close()
        pool.join() 
         
        input_rows = []
        count = 0
        while not rows.empty():
            curr = rows.get()
            input_rows.append(curr)
            count +=1
            if count % 50000 == 0:
                print "...sending {} results to server".format(len(input_rows))
                cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
                input_rows = []
                count=0
        print "...sending {} results to server".format(len(input_rows))
        
        cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows)
        self.cnx.commit()
        #need to drop the lu_scores and recalculate
        cursor.execute("drop table if exists lu_scores")
        
        query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` (
                `id` bigint(20) unsigned NOT NULL,
                `score` double DEFAULT NULL,
                KEY `fk_unique_file0_id` (`id`),
                CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`) 
                REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
                        ) ENGINE=InnoDB""")

        cursor.execute(query) 

        print "...updating scores on the server"
        query = """
            INSERT INTO lu_scores
            (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM 
            locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id) 
            WHERE file_metadata.unique_file_id is not null
            GROUP BY file_metadata.unique_file_id)
            """
                               
        cursor.execute(query)
        self.cnx.commit()
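
The fan-out here is plain multiprocessing: a Pool of four workers, one apply_async call per directory batch, and a Manager().Queue() that the workers fill and the parent drains after join(). A reduced sketch of that pattern, with a trivial stand-in for do_eval (the real scorer is not shown in this listing):

    from multiprocessing import Pool, Manager

    def score_batch(rows, full_path, files):
        """Stand-in for do_eval: give every file in the batch a dummy score."""
        for file_metadata_id in files:
            rows.put((file_metadata_id, 0.5))

    if __name__ == "__main__":
        manager = Manager()
        rows = manager.Queue()
        pool = Pool(processes=4)          # start 4 worker processes

        # One async task per directory, mirroring the per-parent_id batches above.
        batches = {"/bin": [1, 2, 3], "/etc": [4, 5]}
        for full_path, files in batches.items():
            pool.apply_async(score_batch, [rows, full_path, files])

        pool.close()
        pool.join()

        input_rows = []
        while not rows.empty():
            input_rows.append(rows.get())
        print("collected {} (file_metadata_id, score) rows".format(len(input_rows)))
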
Example #12
    def discover_evaluate_dir(self,
                              dir_name,
                              source,
                              num_clusters=DEFAULT_NUM_CLUSTERS):
        """
        Discovery function that applies kmeans clustering to a specified directory, displays
        the resulting scatter plot with the clusters, and then prints an ordered list of
        the files by their distance from their respective centroid. Currently,
        this function uses the two static features "modification date" and "inode number", but
        future versions will allow for dynamic feature inputs.

        :param dir_name: directory name to be analyzed (Required)
        :param source: source name to be analyzed (Required)
        :param num_clusters: number of clusters to use for kmeans (Default: 3)
        """

        num_features = 2
        num_clusters = int(num_clusters)
        cursor = self.cnx.cursor()

        if (dir_name.endswith('/')):
            dir_name = dir_name[:-1]

        print "...Running discovery function on source {} at directory {}".format(
            source, dir_name)

        src_info = core.get_source_info(self.cnx, source)
        if src_info is None:
            print "Error: Source {} does not exist".format(source)
            return

        #grab all files for a particular directory from a specific source
        hash_val = sha1(dir_name).hexdigest()

        #query = """
        #    SELECT file_name, file_metadata_id, filesystem_id, last_modified
        #    FROM joined_file_metadata
        #    WHERE source_id ='{}' AND path_hash = '{}' AND file_name !='/'
        #    """.format(src_info.source_id, hash_val)

        cursor.execute(
            """
                       SELECT file_name, file_metadata_id, filesystem_id, last_modified
                       FROM joined_file_metadata
                       WHERE source_id = %s AND path_hash = %s AND file_name !='/'
                       """, (
                src_info.source_id,
                hash_val,
            ))

        #bring all results into memory
        sql_results = cursor.fetchall()

        if (len(sql_results) == 0):
            return

        print "...Found {} files in specified directory".format(
            len(sql_results))
        print "...Will form into {} clusters".format(num_clusters)
        if num_clusters > len(sql_results):
            print "Number of clusters ({}) exceeds number of files ({})".format(
                num_clusters, len(sql_results))
            num_clusters = len(sql_results)
            print "Number of clusters is now: {}".format(num_clusters)

        #zero out the array that will contain the inodes
        filesystem_id_arr = np.zeros((len(sql_results), num_features))

        i = 0
        for _, _, inode, mod_date in sql_results:
            seconds = calendar.timegm(mod_date.utctimetuple())
            filesystem_id_arr[i] = (inode, seconds)
            i += 1
        whitened = whiten(filesystem_id_arr)
        #get the centroids
        codebook, _ = kmeans(whitened, num_clusters)
        code, dist = vq(whitened, codebook)
        d = defaultdict(int)

        #quick way to get count of cluster sizes
        for c in code:
            d[c] += 1

        #sorts the codes and sql_results together as pairs
        combined = zip(dist, code, sql_results)

        #sort results by distances from centroid
        sorted_results = sorted(combined, key=lambda tup: tup[0])

        for dist_val, c, r in sorted_results:
            print "Dist: {} Cluster: {}  Data: {}".format(dist_val, c, r)

        if codebook is None or len(codebook) == 0:
            print "Data is not suitable for visualization"
            return

        visual.visualize_scatter(d, code, whitened, codebook, num_clusters,
                                 "inode number", "modification datetime",
                                 dir_name)
Example #13
    def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS):
        """
        Evaluates and scores a given source with a specified number of clusters for kmeans. Currently
        this function uses two fixed features as inputs (modification time and inode number); however,
        future versions will allow for dynamic feature inputs

        :param source_name: media source name
        :param num_clusters: number of clusters to input into kmeans (Default: 3)
        """

        cursor = self.cnx.cursor()
        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Error: Source {} not found".format(source_name)
            return

        #returns all files sorted by directory for the given source
        #query = """
        #    SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
        #    FROM joined_file_metadata
        #    where source_id = {} order by parent_id asc
        #    """.format(src_info.source_id)

        cursor.execute(
            """
                       SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash
                       FROM joined_file_metadata
                       where source_id = %s order by parent_id asc
                       """, (src_info.source_id, ))

        files = list()

        print "...Beginning clustering analysis"
        pool = Pool(processes=4)  # start 4 worker processes
        manager = Manager()
        rows = manager.Queue()
        is_first = True

        parent_id_prev = None
        #should iterate by dir of a given source at this point
        for (file_metadata_id, last_modified, full_path, file_name,
             filesystem_id, parent_id, hash_val) in cursor:

            if is_first is True:
                is_first = False
                parent_id_prev = parent_id

            #if parent_id is diff than previous, we are in new directory, so pack it up for analysis
            if parent_id_prev != parent_id:

                parent_id_prev = parent_id

                if len(files) > 0:
                    pool.apply_async(do_eval,
                                     [rows, full_path, files, num_clusters, 2])
                    files = list()

            #make sure to omit directories from the clustering analysis
            if file_name != '/' and hash_val != "":
                files.append((file_metadata_id, last_modified, full_path,
                              file_name, filesystem_id, parent_id))

        if len(files) > 0:
            pool.apply_async(do_eval,
                             [rows, full_path, files, num_clusters, 2])

        pool.close()
        pool.join()

        input_rows = []
        count = 0
        while not rows.empty():
            curr = rows.get()
            input_rows.append(curr)
            count += 1
            if count % 50000 == 0:
                print "...sending {} results to server".format(len(input_rows))
                cursor.executemany(
                    """REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""",
                    input_rows)
                input_rows = []
                count = 0
        print "...sending {} results to server".format(len(input_rows))

        cursor.executemany(
            """REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""",
            input_rows)
        self.cnx.commit()
        #need to drop the lu_scores and recalculate
        cursor.execute("drop table if exists lu_scores")

        query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` (
                `id` bigint(20) unsigned NOT NULL,
                `score` double DEFAULT NULL,
                KEY `fk_unique_file0_id` (`id`),
                CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`)
                REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION
                        ) ENGINE=InnoDB""")

        cursor.execute(query)

        print "...updating scores on the server"
        query = """
            INSERT INTO lu_scores
            (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM
            locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id)
            WHERE file_metadata.unique_file_id is not null
            GROUP BY file_metadata.unique_file_id)
            """

        cursor.execute(query)
        self.cnx.commit()
Example #14
    def discover_detect_anomalies(self, source, out=None):
        """
        Conducts an anomaly search on a given source

        :param source: source
        :param out: output file (optional)
        """

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "*** Error: Source not found"
            return

        #anomaly type:  low prevalence files in normally high prevalence directories
        print "...Anomaly Detection: Unique files in common areas"

        #query = """
        #    SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference,
        #    unique_path.full_path, file_metadata.file_name
        #    FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
        #    LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id
        #    where file_metadata.source_id = {}
        #    HAVING difference > 0
        #    ORDER BY difference desc limit 0, 100
        #""".format(src_info.source_id)

        cursor.execute(
            """
                       SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference,
                       unique_path.full_path, file_metadata.file_name
                       FROM global_file_prevalence
                       LEFT JOIN file_metadata
                       ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
                       LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
                       LEFT JOIN unique_path
                       ON file_metadata.unique_path_id = unique_path.id
                       where file_metadata.source_id = %s
                       HAVING difference > 0
                       ORDER BY difference desc limit 0, 100
                       """, (src_info.source_id, ))

        if out is None:
            results = cursor.fetchall()
            if results is None or len(results) == 0:
                print "No anomalies found"
            else:
                print "Showing top {} results".format(len(results))
                for x in results:
                    print x
            return results

        print "Writing results to {}".format(out)

        with open(out, "w") as f:
            v = 0
            for x in cursor.fetchall():
                f.write("{}: {}    {}/{}\n".format(v, x[0], x[1], x[2]))
                v += 1

        cursor.close()
Example #15
    def update(self, source):
        """
        Updates the scores of the ep_scores table with the new data from the inputted source

        :param source: identifier for the source to be updated
        """

        print "[+] Executable Prevalence Filter running on {} ".format(source)

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "Error: Source {} not found".format(source)
            return

        #initial insert
        # !! TODO
        query = """
            INSERT INTO  ep_scores(id, score)
            SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average)
            FROM global_file_prevalence JOIN file_metadata
            ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id
            where file_metadata.source_id = {}
            ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average)
        """.format(src_info.source_id)

        cursor.execute(query)
        self.cnx.commit()

        #adjustment for low outliers in high-prevalence directories... This could probably be done better
        #by taking the std dev of each dir, but this will have to work for now.
        # !! TODO
        query = """
            UPDATE  global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
            LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
            LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
            LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id
            SET fp_scores.score = fp_scores.score * .5
            where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2
            and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
        """.format(src_info.source_id)

        cursor.execute(query)
        self.cnx.commit()

        #adjustments for frequently occurring directories with low prevalence scores... hopefully this will exclude the caches
        # !! TODO
        query = """
            UPDATE file_metadata
            LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
            LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
            LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id
            SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
            where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5
        """.format(src_info.source_id)

        cursor.execute(query)
        self.cnx.commit()
        cursor.close()
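
Both UPDATEs adjust existing scores rather than overwrite them: the first halves the score of a single-occurrence file sitting in an otherwise high-prevalence directory, and the second moves files in common but low-scored directories a quarter of the way toward 1.0. A quick check of the arithmetic for a starting score of 0.4:

    score = 0.4
    print(score * .5)                   # outlier penalty:   0.2
    print((1 - score) * .25 + score)    # cache-style nudge: 0.55
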
Example #16
    def run_survey(self, source_name):

        print "...running survey for {}".format(self.name)

        resources = "resources"
        img_by_src = "hist_by_src.png"
        img_by_os = "hist_by_os.png"
        survey_file = "survey.html"
        survey_dir = "survey_{}_{}".format(self.name, source_name)


        resource_dir = os.path.join(survey_dir, resources) 
        html_file = os.path.join(survey_dir, survey_file)

        try:
            shutil.rmtree(survey_dir)
        except OSError:
            pass

        os.mkdir(survey_dir)
        os.mkdir(resource_dir)

        src_info = core.get_source_info(self.cnx, source_name)

        self.discover_histogram_by_source(source_name, os.path.join(resource_dir, img_by_src))
        self.discover_histogram_by_os(src_info.os_name, os.path.join(resource_dir, img_by_os))
        anomalies = self.discover_detect_anomalies(source_name, None)
        results = self.show_results("bottom", 100, source_name, None)


        with open(html_file, 'w') as f:
            f.write("""
            <html>
            <link href="../../../resources/css/style.css" rel="stylesheet" type="text/css">
            <h2 class="redwood-title">Filter Prevalence Snapshot</h2>
            <body>
                <h3 class="redwood-header">Histogram for {}</h3>
                <img src="{}">
                <h3 class="redwood-header">Histogram for Operating System - {}</h3>
                <img src="{}">
            """.format( source_name, 
                        os.path.join(resources, img_by_src),
                        src_info.os_name,
                        os.path.join(resources, img_by_os)
                        ))

            f.write("<h3 class=\"redwood-header\">The lowest 100 reputations for this filter</h3>")
            f.write("<table border=\"1\" id=\"redwood-table\">")
            f.write("<thead><tr><th class=\"rounded-head-left\">Score</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>")
            i = 0
            lr = len(results)
            for r in results:
                if i == lr - 1:
                    f.write("</tbody><tfoot>")
                    f.write("<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            f.write("</table>") 

            f.write("<h3 class=\"redwood-header\">The top 100 anomalous files</h3>")
            f.write("<table border=\"1\" id=\"redwood-table\">")
            f.write("<thead><tr><th class=\"rounded-head-left\">Anomaly Value</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>")
            i = 0
            lr = len(anomalies)
            for r in anomalies:
                if i == lr - 1:
                    f.write("</tbody><tfoot>")
                    f.write("<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>".format(r[0], r[1], r[2]))
                else:
                    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
                i += 1
            #for r in anomalies:
            #    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            f.write("</table>")
        return survey_dir
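
The report writing above repeats the same thead/tbody/tfoot block for each table. A hypothetical helper that captures that layout (write_table is not part of the original code, just a distilled illustration):

    def write_table(f, header, rows):
        """Render rows using the bordered table layout from the survey page."""
        f.write("<table border=\"1\" id=\"redwood-table\">")
        f.write("<thead><tr>"
                + "".join("<th>{}</th>".format(h) for h in header)
                + "</tr></thead><tbody>")
        for i, row in enumerate(rows):
            if i == len(rows) - 1:
                f.write("</tbody><tfoot>")   # last row goes in the footer
            f.write("<tr>" + "".join("<td>{}</td>".format(c) for c in row) + "</tr>")
        f.write("</tfoot></table>" if rows else "</tbody></table>")

    if __name__ == "__main__":
        import sys
        write_table(sys.stdout,
                    ("Score", "Parent Path", "Filename"),
                    [(0.1, "/tmp", "a.out"), (0.25, "/usr/bin", "python")])
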
Example #17
    def discover_histogram_by_source(self, source_name, output=None):
        """
        Displays a histogram of the file distribution of a single source as it relates
        to all occurrences of that file across all systems

        :param source_name: The name of the source
        :param output: (optional) output filename in PNG format
        """

        print '[+] Running \"Histogram by Source\"...'
        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source_name)

        if src_info is None:
            print "Source {} does not exist".format(source_name)
            return

        num_systems = core.get_num_systems(self.cnx, src_info.os_id)
        bins = range(1, num_systems + 2)

        #query = """
        #    SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence
        #    LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}")
        #    GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC;
        #""".format(source_name)

        cursor.execute(
            """
                       SELECT COUNT(file_metadata.id), global_file_prevalence.count
                       FROM global_file_prevalence
                       LEFT JOIN file_metadata
                       ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
                       WHERE file_metadata.source_id =
                       (SELECT media_source.id
                        FROM media_source
                        WHERE media_source.name = %s)
                       GROUP BY global_file_prevalence.count
                       ORDER BY global_file_prevalence.count ASC;
                       """, (source_name, ))

        data = cursor.fetchall()

        if data is None or len(data) == 0:
            return

        counts, ranges = zip(*data)

        fig = plt.figure()
        perc = int(float(sum(counts[1:])) / sum(counts) * 100)
        ax = fig.add_subplot(111,
                             title="File Prevalence of {} with {}% > 1".format(
                                 src_info.source_name, perc))
        ax.hist(ranges, weights=counts, bins=bins)
        ax.set_xlabel("Num of Systems")
        ax.set_ylabel("File Occurrences")

        if output is None:
            plt.show()
        else:
            print "Saving histogram to {}".format(output)
            plt.savefig(output)
Example #18
    def run_survey(self, source_name):

        print "...running survey for {}".format(self.name)

        resources = "resources"
        img_by_src = "hist_by_src.png"
        img_by_os = "hist_by_os.png"
        survey_file = "survey.html"
        survey_dir = "survey_{}_{}".format(self.name, source_name)

        resource_dir = os.path.join(survey_dir, resources)
        html_file = os.path.join(survey_dir, survey_file)

        try:
            shutil.rmtree(survey_dir)
        except OSError:
            pass

        os.mkdir(survey_dir)
        os.mkdir(resource_dir)

        src_info = core.get_source_info(self.cnx, source_name)

        self.discover_histogram_by_source(
            source_name, os.path.join(resource_dir, img_by_src))
        self.discover_histogram_by_os(src_info.os_name,
                                      os.path.join(resource_dir, img_by_os))
        anomalies = self.discover_detect_anomalies(source_name, None)
        results = self.show_results("bottom", 100, source_name, None)

        with open(html_file, 'w') as f:
            f.write("""
            <html>
            <link href="../../../resources/css/style.css" rel="stylesheet" type="text/css">
            <h2 class="redwood-title">Filter Prevalence Snapshot</h2>
            <body>
                <h3 class="redwood-header">Histogram for {}</h3>
                <img src="{}">
                <h3 class="redwood-header">Histogram for Operating System - {}</h3>
                <img src="{}">
            """.format(source_name, os.path.join(resources, img_by_src),
                       src_info.os_name, os.path.join(resources, img_by_os)))

            f.write(
                "<h3 class=\"redwood-header\">The lowest 100 reputations for this filter</h3>"
            )
            f.write("<table border=\"1\" id=\"redwood-table\">")
            f.write(
                "<thead><tr><th class=\"rounded-head-left\">Score</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>"
            )
            i = 0
            lr = len(results)
            for r in results:
                if i == lr - 1:
                    f.write("</tbody><tfoot>")
                    f.write(
                        "<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>"
                        .format(r[0], r[1], r[2]))
                else:
                    f.write(
                        "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
                            r[0], r[1], r[2]))
                i += 1
            f.write("</table>")

            f.write(
                "<h3 class=\"redwood-header\">The top 100 anomalous files</h3>"
            )
            f.write("<table border=\"1\" id=\"redwood-table\">")
            f.write(
                "<thead><tr><th class=\"rounded-head-left\">Anomaly Value</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>"
            )
            i = 0
            lr = len(anomalies)
            for r in anomalies:
                if i == lr - 1:
                    f.write("</tbody><tfoot>")
                    f.write(
                        "<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>"
                        .format(r[0], r[1], r[2]))
                else:
                    f.write(
                        "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
                            r[0], r[1], r[2]))
                i += 1
            #for r in anomalies:
            #    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            f.write("</table>")
        return survey_dir
Example #19
    def update(self, source):
        """
        Updates the scores of the fp_scores table with the new data from the inputted source

        :param source: identifier for the source to be updated
        """

        print "[+] Prevalence Filter running on {} ".format(source)

        #creates the basic tables if they do not exist
        self.build()

        cursor = self.cnx.cursor()

        src_info = core.get_source_info(self.cnx, source)

        if src_info is None:
            print "Error: Source {} not found".format(source)
            return

        #initial insert
        #query = """
        #    INSERT INTO  fp_scores(id, score)
        #    SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average)
        #    FROM global_file_prevalence JOIN file_metadata
        #    ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id
        #    where file_metadata.source_id = {}
        #    ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average)
        #""".format(src_info.source_id)

        cursor.execute(
            """
                       INSERT INTO  fp_scores(id, score)
                       SELECT global_file_prevalence.unique_file_id,
                       IF(num_systems < 3, .5, average)
                       FROM global_file_prevalence JOIN file_metadata
                       ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id
                       where file_metadata.source_id = %s
                       ON DUPLICATE KEY UPDATE score =
                       IF(num_systems < 3, .5, average)
                       """, (src_info.source_id, ))
        self.cnx.commit()

        #adjustment for low outliers in high-prevalence directories... This could probably be done better
        #by taking the std dev of each dir, but this will have to work for now.
        #query = """
        #    UPDATE  global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
        #    LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
        #    LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
        #    LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id
        #    SET fp_scores.score = fp_scores.score * .5
        #    where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2
        #    and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
        #""".format(src_info.source_id)

        cursor.execute(
            """
                       UPDATE  global_file_prevalence left join file_metadata
                       ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id
                       LEFT JOIN global_dir_prevalence
                       on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
                       LEFT JOIN global_dir_combined_prevalence
                       on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id
                       LEFT JOIN fp_scores
                       ON fp_scores.id = global_file_prevalence.unique_file_id
                       SET fp_scores.score = fp_scores.score * .5
                       where file_metadata.source_id = %s
                       AND global_file_prevalence.count = 1
                       and global_file_prevalence.num_systems > 2
                       and global_dir_combined_prevalence.average - global_file_prevalence.average > .6
                       """, (src_info.source_id, ))
        self.cnx.commit()

        #adjustments for frequently occurring directories with low prevalence scores... hopefully this will exclude the caches
        #query = """
        #    UPDATE file_metadata
        #    LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
        #    LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
        #    LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id
        #    SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
        #    where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5
        #""".format(src_info.source_id)

        cursor.execute(
            """
                       UPDATE file_metadata
                       LEFT JOIN global_dir_prevalence
                       ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id
                       LEFT JOIN global_dir_combined_prevalence
                       ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id
                       LEFT JOIN fp_scores
                       ON file_metadata.unique_file_id = fp_scores.id
                       SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score
                       where file_metadata.source_id = %s
                       AND global_dir_prevalence.average > .8
                       AND global_dir_combined_prevalence.average < .5
                       """, (src_info.source_id, ))
        self.cnx.commit()
        cursor.close()