def get_unique_names(self, source): """usage: unique_names source_name""" #creates the basic tables if they do not exist self.build() cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} not found".format(source) return query = """ SELECT fm.file_name, up.full_path FROM file_metadata fm LEFT JOIN FileNameFilter_unique_name fnfun ON fnfun.file_name = fm.file_name AND fnfun.unique_path_id = fm.unique_path_id LEFT JOIN unique_path up ON up.id = fm.unique_path_id WHERE not isnull(fm.unique_file_id) AND fnfun.count = 1 AND fm.source_id = {} """.format(src_info.source_id) cursor.execute(query) data = cursor.fetchall() cursor.close() return data
def do_run_survey(self, line): ''' run_survey <source_name> runs the survey function for the given source [source_name] - option name of source to process ''' args = shlex.split(line) if len(args) < 1: print "Error: Incorrect # of arguments" return src_obj = core.get_source_info(self.cnx, args[0]) if src_obj is None: print "Error: Unable to find source {}".format(args[0]) return else: rpt = Report(self.cnx, src_obj) if len(args) > 1: rpt.run(args[1:]) else: rpt.run(None)
def update(self, source): print "[+] FileName Filter running on {} ".format(source) #creates the basic tables if they do not exist self.build() cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} not found".format(source) return now = time.time() # self.cnx.autocommit(False) query = """ INSERT INTO FileNameFilter_unique_name (file_name, unique_path_id) (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = {}) ON DUPLICATE KEY UPDATE count = count + 1; """.format(src_info.source_id) cursor.execute(query) # self.cnx.autocommit(True) later = time.time() #print "Updated counts in {} secs\nUpdating Scores".format(later - now) cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name") (max_count,) = cursor.fetchone() now = time.time() query = """ INSERT INTO FileNameFilter_scores (id, score) ( SELECT fm.unique_file_id, MIN(fnfun.count / {}) FROM FileNameFilter_unique_name fnfun LEFT JOIN file_metadata fm ON fnfun.file_name = fm.file_name AND fnfun.unique_path_id = fm.unique_path_id WHERE not isnull(fm.unique_file_id) GROUP BY fm.unique_file_id ) ON DUPLICATE KEY UPDATE score = score """.format(max_count) cursor.execute(query) self.cnx.commit() later = time.time() #print "Scores updated in {} secs".format(later - now) cursor.close()
def discover_detect_anomalies(self, source, out=None): """ Conducts an anomaly search on a given source :param source: source :param out: output file (optional) """ cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "*** Error: Source not found" return #anomaly type: low prevalence files in normally high prevalence directories print "...Anomaly Detection: Unique files in common areas" query = """ SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, unique_path.full_path, file_metadata.file_name FROM global_file_prevalence LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id where file_metadata.source_id = {} HAVING difference > 0 ORDER BY difference desc limit 0, 100 """.format(src_info.source_id) cursor.execute(query) if out is None: results = cursor.fetchall() if results is None or len(results) == 0: print "No anomalies found" else: print "Showing top {} results".format(len(results)) for x in results: print x return results print "Writing results to {}".format(out) with open(out, "w") as f: v=0 for x in cursor.fetchall(): f.write("{}: {} {}/{}\n".format(v, x[0], x[1], x[2])) v+=1 cursor.close()
def discover_histogram_by_source(self, source_name, output=None): """ Displays a histogram of the distribution of file extensions that are executable of a single source as it relates to all occurrences of that extension across all systems :param source_name: The name of the source """ print '[+] Running \"Histogram by Source\"...' cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source_name) if src_info is None: print "Source {} does not exist".format(source_name) return num_systems = core.get_num_systems(self.cnx, src_info.os_id) bins = range(1, num_systems+2) # !! TODO query = """ SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}") GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC; """.format(source_name) cursor.execute(query) data = cursor.fetchall() if data == None: return counts, ranges = zip(*data) fig = plt.figure() perc = int( float(sum(counts[1:])) / sum(counts) * 100) ax = fig.add_subplot(111, title="File Executable Prevalence of {} with {}% > 1".format(src_info.source_name, perc)) ax.hist(ranges, weights=counts, bins = bins) ax.set_xlabel("Num of Systems") ax.set_ylabel("File Occurrences") plt.xticks(bins) if output is None: plt.show() else: plt.savefig(output)
def discover_detect_anomalies(self, source, out): """ Conducts an anomaly search on a given source :param source: source """ cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "*** Error: Source not found" return # !! TODO #anomaly type: low prevalence files in normally high prevalence directories print "Anomaly Detection: Unique files in common areas" print "running..." # !! TODO query = """ SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, unique_path.full_path, file_metadata.file_name FROM global_file_prevalence LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id where file_metadata.source_id = {} HAVING difference > 0 ORDER BY difference desc limit 0, 100 """.format(src_info.source_id) cursor.execute(query) if out is None: return cursor.fetchall() with open(out, "w") as f: v=0 for x in cursor.fetchall(): f.write("{}: {} {}{}\n".format(v, x[0], x[1], x[2])) v+=1 cursor.close()
def do_run_report_survey(self, source = None): '''[*] run_survey (optional)<source_name>\n\t|- runs the survey function for the given source\n\t |- if no source is provided run_survey processes all sources\n\t|-[source_name] - option name of source to process''' rpt = Report(self.cnx) if source == None: sources = core.get_all_sources(self.cnx) for s in sources: print "Running report survey for: " + s.source_name rpt.run_filter_survey(s.source_name) rpt.generate_report(s) else: src = core.get_source_info(self.cnx, source) if src == None: print "Source " + source + " does not exist" return print "Running report survey for: " + src.source_name rpt.run_filter_survey(src.source_name) rpt.generate_report(src)
def get_unique_names(self, source): """usage: unique_names source_name""" #creates the basic tables if they do not exist self.build() cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} not found".format(source) return #query = """ # SELECT fm.file_name, up.full_path # FROM file_metadata fm # LEFT JOIN FileNameFilter_unique_name fnfun # ON fnfun.file_name = fm.file_name # AND fnfun.unique_path_id = fm.unique_path_id # LEFT JOIN unique_path up # ON up.id = fm.unique_path_id # WHERE not isnull(fm.unique_file_id) # AND fnfun.count = 1 # AND fm.source_id = {} #""".format(src_info.source_id) cursor.execute(""" SELECT fm.file_name, up.full_path FROM file_metadata fm LEFT JOIN FileNameFilter_unique_name fnfun ON fnfun.file_name = fm.file_name AND fnfun.unique_path_id = fm.unique_path_id LEFT JOIN unique_path up ON up.id = fm.unique_path_id WHERE not isnull(fm.unique_file_id) AND fnfun.count = 1 AND fm.source_id = %s """, (src_info.source_id,)) data = cursor.fetchall() cursor.close() return data
def update(self, source):
    """Update FileNameFilter counts and scores for the given source.

    First accumulates per-(file_name, unique_path_id) occurrence counts
    into FileNameFilter_unique_name, then writes normalized scores
    (count / max_count) into FileNameFilter_scores.

    :param source: media source name to process
    """
    print "[+] FileName Filter running on {} ".format(source)
    #creates the basic tables if they do not exist
    self.build()
    cursor = self.cnx.cursor()
    src_info = core.get_source_info(self.cnx, source)
    if src_info is None:
        print "Error: Source {} not found".format(source)
        return
    now = time.time()
    # self.cnx.autocommit(False)
    # count every (file_name, unique_path_id) occurrence for this source;
    # the root entry "/" is excluded since it is not a real file name
    cursor.execute("""
        INSERT INTO FileNameFilter_unique_name
        (file_name, unique_path_id)
        (SELECT file_name, unique_path_id FROM file_metadata WHERE file_name != "/" and source_id = %s)
        ON DUPLICATE KEY UPDATE count = count + 1;
    """, (src_info.source_id,))
    # self.cnx.autocommit(True)
    later = time.time()
    #print "Updated counts in {} secs\nUpdating Scores".format(later - now)
    # normalize scores against the highest count seen across all sources
    cursor.execute("SELECT MAX(count) FROM FileNameFilter_unique_name")
    (max_count,) = cursor.fetchone()
    now = time.time()
    # score = count / max_count; existing scores are left untouched
    # (ON DUPLICATE KEY UPDATE score = score is a deliberate no-op)
    cursor.execute("""
        INSERT INTO FileNameFilter_scores
        (id, score)
        (SELECT
            fm.unique_file_id, MIN(fnfun.count / %s)
         FROM FileNameFilter_unique_name fnfun
         LEFT JOIN file_metadata fm
             ON fnfun.file_name = fm.file_name
             AND fnfun.unique_path_id = fm.unique_path_id
         WHERE not isnull(fm.unique_file_id)
         GROUP BY fm.unique_file_id)
        ON DUPLICATE KEY UPDATE score = score
    """, (max_count,))
    self.cnx.commit()
    later = time.time()
    #print "Scores updated in {} secs".format(later - now)
    cursor.close()
def discover_evaluate_dir(self, dir_name, source, num_clusters=DEFAULT_NUM_CLUSTERS): """ Discovery function that applies kmeans clustering to a specified directory, displays the resulting scatter plot with the clusters, and then prints out an ordered list of the file by the distance from their respective centroid. Currently, this function uses two static features of "modification date" and "inode number" but future versions will allow for dynamic features inputs. :param dir_name: directory name to be analyzed (Required) :source: source name to be analzyed (Required) :num_clusters: specified number of clusters to use for kmeans (Default: 3) """ num_features = 2 num_clusters = int(num_clusters) cursor = self.cnx.cursor() if(dir_name.endswith('/')): dir_name = dir_name[:-1] print "...Running discovery function on source {} at directory {}".format(source, dir_name) src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} does not exist".format(source) return #grab all files for a particular directory from a specific source hash_val = sha1(dir_name).hexdigest() query = """ SELECT file_name, file_metadata_id, filesystem_id, last_modified FROM joined_file_metadata WHERE source_id ='{}' AND path_hash = '{}' AND file_name !='/' """.format(src_info.source_id, hash_val) cursor.execute(query) #bring all results into memory sql_results = cursor.fetchall() if(len(sql_results) == 0): return #zero out the array that will contain the inodes filesystem_id_arr = np.zeros((len(sql_results), num_features)) i = 0 for _, _,inode, mod_date in sql_results: seconds = calendar.timegm(mod_date.utctimetuple()) filesystem_id_arr[i] = (inode, seconds) i += 1 whitened = whiten(filesystem_id_arr) #get the centroids codebook,_ = kmeans(whitened, num_clusters) code, dist = vq(whitened, codebook) d = defaultdict(int) #quick way to get count of cluster sizes for c in code: d[c] += 1 #sorts the codes and sql_results together as pairs combined = zip(dist, code, 
sql_results) #sort results by distances from centroid sorted_results = sorted(combined, key=lambda tup: tup[0]) for dist_val, c, r in sorted_results: print "Dist: {} Cluster: {} Data: {}".format(dist_val,c,r) visual.visualize_scatter(d, code, whitened, codebook, 3, "inode number", "modification datetime", dir_name)
def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS): """ Evaluates and scores a given source with a specified number of clusters for kmeans. Currently this function uses two set features as inputs (modification time and inode number), however futures versions will allow for dynamic feature inputs :param source_name: media source name :param num_clusters: number of clusters to input into kmeans (Default: 3) """ cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source_name) #returns all files sorted by directory for the given source query = """ SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash FROM joined_file_metadata where source_id = {} order by parent_id asc """.format(src_info.source_id) cursor.execute(query) files = list() print "...Beginning clustering analysis" pool = Pool(processes=4) # start 4 worker processes manager = Manager() rows = manager.Queue() is_first = True parent_id_prev = None #should iterate by dir of a given source at this point for(file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash_val) in cursor: if is_first is True: is_first = False parent_id_prev = parent_id #if parent_id is diff than previous, we are in new directory, so pack it up for analysis if parent_id_prev != parent_id: parent_id_prev = parent_id if len(files) > 0: pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2]) files = list() #make sure to omit directories from the clustering analy if file_name != '/' and hash_val != "": files.append((file_metadata_id, last_modified, full_path,file_name, filesystem_id, parent_id)) if len(files) > 0: pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2]) pool.close() pool.join() input_rows = [] count = 0 while rows.empty() is False: curr = rows.get() input_rows.append(curr) count +=1 if count % 50000 is 0: print "...sending {} results to server".format(len(input_rows)) 
cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows) input_rows = [] count=0 print "...sending {} results to server".format(len(input_rows)) cursor.executemany("""REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows) self.cnx.commit() #need to drop the lu_scores and recalculate cursor.execute("drop table if exists lu_scores") query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` ( `id` bigint(20) unsigned NOT NULL, `score` double DEFAULT NULL, KEY `fk_unique_file0_id` (`id`), CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`) REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION ) ENGINE=InnoDB""") cursor.execute(query) print "...updating scores on the server" query = """ INSERT INTO lu_scores (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id) WHERE file_metadata.unique_file_id is not null GROUP BY file_metadata.unique_file_id) """ cursor.execute(query) self.cnx.commit()
def discover_evaluate_dir(self, dir_name, source, num_clusters=DEFAULT_NUM_CLUSTERS):
    """
    Discovery function that applies kmeans clustering to a specified
    directory, displays the resulting scatter plot with the clusters, and
    then prints out an ordered list of the files by the distance from their
    respective centroid. Currently, this function uses two static features
    of "modification date" and "inode number" but future versions will
    allow for dynamic feature inputs.

    :param dir_name: directory name to be analyzed (Required)
    :param source: source name to be analyzed (Required)
    :param num_clusters: specified number of clusters to use for kmeans (Default: 3)
    """
    num_features = 2
    num_clusters = int(num_clusters)
    cursor = self.cnx.cursor()
    # normalize away a trailing slash so the path hash matches
    if (dir_name.endswith('/')):
        dir_name = dir_name[:-1]
    print "...Running discovery function on source {} at directory {}".format(
        source, dir_name)
    src_info = core.get_source_info(self.cnx, source)
    if src_info is None:
        print "Error: Source {} does not exist".format(source)
        return
    #grab all files for a particular directory from a specific source
    # directories are keyed by the sha1 of their full path
    hash_val = sha1(dir_name).hexdigest()
    cursor.execute(
        """
        SELECT file_name, file_metadata_id, filesystem_id, last_modified
        FROM joined_file_metadata
        WHERE source_id = %s AND path_hash = %s AND file_name !='/'
        """, (
            src_info.source_id,
            hash_val,
        ))
    #bring all results into memory
    sql_results = cursor.fetchall()
    if (len(sql_results) == 0):
        return
    print "...Found {} files in specified directory".format(
        len(sql_results))
    print "...Will form into {} clusters".format(num_clusters)
    # kmeans cannot form more clusters than there are observations; clamp
    if num_clusters > len(sql_results):
        print "Number of clusters ({}) exceeds number of files ({})".format(
            num_clusters, len(sql_results))
        num_clusters = len(sql_results)
        print "Number of clusters is now: {}".format(num_clusters)
    #zero out the array that will contain the inodes
    filesystem_id_arr = np.zeros((len(sql_results), num_features))
    i = 0
    # feature vector per file: (inode number, mtime as epoch seconds)
    for _, _, inode, mod_date in sql_results:
        seconds = calendar.timegm(mod_date.utctimetuple())
        filesystem_id_arr[i] = (inode, seconds)
        i += 1
    whitened = whiten(filesystem_id_arr)
    #get the centroids
    codebook, _ = kmeans(whitened, num_clusters)
    code, dist = vq(whitened, codebook)
    d = defaultdict(int)
    #quick way to get count of cluster sizes
    for c in code:
        d[c] += 1
    #sorts the codes and sql_results together as pairs
    combined = zip(dist, code, sql_results)
    #sort results by distances from centroid
    sorted_results = sorted(combined, key=lambda tup: tup[0])
    for dist_val, c, r in sorted_results:
        print "Dist: {} Cluster: {} Data: {}".format(dist_val, c, r)
    # kmeans can return an empty codebook on degenerate data
    if codebook is None or len(codebook) == 0:
        print "Data is not suitable for visualization"
        return
    visual.visualize_scatter(d, code, whitened, codebook, num_clusters,
                             "inode number", "modification datetime",
                             dir_name)
def evaluate_source(self, source_name, num_clusters=DEFAULT_NUM_CLUSTERS): """ Evaluates and scores a given source with a specified number of clusters for kmeans. Currently this function uses two set features as inputs (modification time and inode number), however futures versions will allow for dynamic feature inputs :param source_name: media source name :param num_clusters: number of clusters to input into kmeans (Default: 3) """ cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source_name) #returns all files sorted by directory for the given source #query = """ # SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash # FROM joined_file_metadata # where source_id = {} order by parent_id asc # """.format(src_info.source_id) cursor.execute( """ SELECT file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash FROM joined_file_metadata where source_id = %s order by parent_id asc """, (src_info.source_id, )) files = list() print "...Beginning clustering analysis" pool = Pool(processes=4) # start 4 worker processes manager = Manager() rows = manager.Queue() is_first = True parent_id_prev = None #should iterate by dir of a given source at this point for (file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id, hash_val) in cursor: if is_first is True: is_first = False parent_id_prev = parent_id #if parent_id is diff than previous, we are in new directory, so pack it up for analysis if parent_id_prev != parent_id: parent_id_prev = parent_id if len(files) > 0: pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2]) files = list() #make sure to omit directories from the clustering analy if file_name != '/' and hash_val != "": files.append((file_metadata_id, last_modified, full_path, file_name, filesystem_id, parent_id)) if len(files) > 0: pool.apply_async(do_eval, [rows, full_path, files, num_clusters, 2]) pool.close() pool.join() input_rows = [] count 
= 0 while rows.empty() is False: curr = rows.get() input_rows.append(curr) count += 1 if count % 50000 is 0: print "...sending {} results to server".format(len(input_rows)) cursor.executemany( """REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows) input_rows = [] count = 0 print "...sending {} results to server".format(len(input_rows)) cursor.executemany( """REPLACE INTO locality_uniqueness(file_metadata_id, score) values(%s, %s)""", input_rows) self.cnx.commit() #need to drop the lu_scores and recalculate cursor.execute("drop table if exists lu_scores") query = ("""CREATE TABLE IF NOT EXISTS `lu_scores` ( `id` bigint(20) unsigned NOT NULL, `score` double DEFAULT NULL, KEY `fk_unique_file0_id` (`id`), CONSTRAINT `fk_unique_file0_id` FOREIGN KEY (`id`) REFERENCES `unique_file` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION ) ENGINE=InnoDB""") cursor.execute(query) print "...updating scores on the server" query = """ INSERT INTO lu_scores (SELECT file_metadata.unique_file_id, avg(locality_uniqueness.score) FROM locality_uniqueness LEFT JOIN file_metadata on (locality_uniqueness.file_metadata_id = file_metadata.id) WHERE file_metadata.unique_file_id is not null GROUP BY file_metadata.unique_file_id) """ cursor.execute(query) self.cnx.commit()
def discover_detect_anomalies(self, source, out=None): """ Conducts an anomaly search on a given source :param source: source :param out: output file (optional) """ cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "*** Error: Source not found" return #anomaly type: low prevalence files in normally high prevalence directories print "...Anomaly Detection: Unique files in common areas" #query = """ # SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, # unique_path.full_path, file_metadata.file_name # FROM global_file_prevalence # LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id # LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id # LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id # where file_metadata.source_id = {} # HAVING difference > 0 # ORDER BY difference desc limit 0, 100 #""".format(src_info.source_id) cursor.execute( """ SELECT (global_dir_combined_prevalence.average - global_file_prevalence.average) as difference, unique_path.full_path, file_metadata.file_name FROM global_file_prevalence LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id LEFT JOIN global_dir_combined_prevalence ON file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id LEFT JOIN unique_path ON file_metadata.unique_path_id = unique_path.id where file_metadata.source_id = %s HAVING difference > 0 ORDER BY difference desc limit 0, 100 """, (src_info.source_id, )) if out is None: results = cursor.fetchall() if results is None or len(results) == 0: print "No anomalies found" else: print "Showing top {} results".format(len(results)) for x in results: print x return results print "Writing results to {}".format(out) with open(out, "w") as f: v = 0 for x in cursor.fetchall(): f.write("{}: {} 
{}/{}\n".format(v, x[0], x[1], x[2])) v += 1 cursor.close()
def update(self, source): """ Updates the scores of the ep_scores table with the new data from the inputted source :param source: identifier for the source to be updated """ print "[+] Executable Prevalence Filter running on {} ".format(source) #creates the basic tables if they do not exist self.build() cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} not found".format(source) return #initial insert # !! TODO query = """ INSERT INTO ep_scores(id, score) SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average) FROM global_file_prevalence JOIN file_metadata ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id where file_metadata.source_id = {} ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average) """.format(src_info.source_id) cursor.execute(query) self.cnx.commit() #adjustment for low outliers in high prevalent directories... This could probably better be done with taking the std dev of each #dir, but his will have to work for now. # !! TODO query = """ UPDATE global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id SET fp_scores.score = fp_scores.score * .5 where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2 and global_dir_combined_prevalence.average - global_file_prevalence.average > .6 """.format(src_info.source_id) cursor.execute(query) self.cnx.commit() #adjustments for low prevalent scored directories which occur often... hopefully this will exclude the caches # !! 
TODO query = """ UPDATE file_metadata LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5 """.format(src_info.source_id) cursor.execute(query) self.cnx.commit() cursor.close()
def run_survey(self, source_name):
    """Generate an HTML survey snapshot for the given source.

    Builds survey_<filter>_<source>/ with a resources subdirectory,
    renders the per-source and per-OS histograms into it, then writes
    survey.html containing the lowest-100 reputations and the top-100
    anomalous files.

    :param source_name: media source to survey
    :return: path of the survey directory that was written
    """
    print "...running survey for {}".format(self.name)
    resources = "resources"
    img_by_src = "hist_by_src.png"
    img_by_os = "hist_by_os.png"
    survey_file = "survey.html"
    survey_dir = "survey_{}_{}".format(self.name, source_name)
    resource_dir = os.path.join(survey_dir, resources)
    html_file = os.path.join(survey_dir, survey_file)
    # start from a clean directory; the bare except deliberately ignores
    # "directory does not exist" on the first run
    try:
        shutil.rmtree(survey_dir)
    except:
        pass
    os.mkdir(survey_dir)
    os.mkdir(resource_dir)
    src_info = core.get_source_info(self.cnx, source_name)
    # render both histograms straight into the resources directory
    self.discover_histogram_by_source(source_name,
                                      os.path.join(resource_dir, img_by_src))
    self.discover_histogram_by_os(src_info.os_name,
                                  os.path.join(resource_dir, img_by_os))
    anomalies = self.discover_detect_anomalies(source_name, None)
    results = self.show_results("bottom", 100, source_name, None)
    with open(html_file, 'w') as f:
        # page header plus the two histogram images
        f.write("""
        <html>
        <link href="../../../resources/css/style.css" rel="stylesheet" type="text/css">
        <h2 class="redwood-title">Filter Prevalence Snapshot</h2>
        <body>
        <h3 class="redwood-header">Histogram for {}</h3>
        <img src="{}">
        <h3 class="redwood-header">Histogram for Operating System - {}</h3>
        <img src="{}">
        """.format(
            source_name,
            os.path.join(resources, img_by_src),
            src_info.os_name,
            os.path.join(resources, img_by_os)
        ))
        f.write("<h3 class=\"redwood-header\">The lowest 100 reputations for this filter</h3>")
        f.write("<table border=\"1\" id=\"redwood-table\">")
        f.write("<thead><tr><th class=\"rounded-head-left\">Score</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>")
        i = 0
        lr = len(results)
        # the last row gets the table-footer styling
        for r in results:
            if i == lr - 1:
                f.write("</tbody><tfoot>")
                f.write("<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>".format(r[0], r[1], r[2]))
            else:
                f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            i += 1
        f.write("</table>")
        f.write("<h3 class=\"redwood-header\">The top 100 anomalous files</h3>")
        f.write("<table border=\"1\" id=\"redwood-table\">")
        f.write("<thead><tr><th class=\"rounded-head-left\">Anomaly Value</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>")
        i = 0
        lr = len(anomalies)
        # same footer treatment for the anomalies table
        for r in anomalies:
            if i == lr - 1:
                f.write("</tbody><tfoot>")
                f.write("<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>".format(r[0], r[1], r[2]))
            else:
                f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
            i += 1
        #for r in anomalies:
        #    f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2]))
        f.write("</table>")
    return survey_dir
def discover_histogram_by_source(self, source_name, output=None): """ Displays a histogram of the file distribution of a single source as it relates to all occurrences of that file across all systems :param source_name: The name of the source :param output: (optional) output filename in PNG format """ print '[+] Running \"Histogram by Source\"...' cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source_name) if src_info is None: print "Source {} does not exist".format(source_name) return num_systems = core.get_num_systems(self.cnx, src_info.os_id) bins = range(1, num_systems + 2) #query = """ # SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence # LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id # WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = "{}") # GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC; #""".format(source_name) cursor.execute( """ SELECT COUNT(file_metadata.id), global_file_prevalence.count FROM global_file_prevalence LEFT JOIN file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id WHERE file_metadata.source_id = (SELECT media_source.id FROM media_source WHERE media_source.name = %s) GROUP BY global_file_prevalence.count ORDER BY global_file_prevalence.count ASC; """, (source_name, )) data = cursor.fetchall() if data == None or len(data) is 0: return counts, ranges = zip(*data) fig = plt.figure() perc = int(float(sum(counts[1:])) / sum(counts) * 100) ax = fig.add_subplot(111, title="File Prevalence of {} with {}% > 1".format( src_info.source_name, perc)) ax.hist(ranges, weights=counts, bins=bins) ax.set_xlabel("Num of Systems") ax.set_ylabel("File Occurrences") if output is None: plt.show() else: print "Saving histogram to {}".format(output) plt.savefig(output)
def run_survey(self, source_name): print "...running survey for {}".format(self.name) resources = "resources" img_by_src = "hist_by_src.png" img_by_os = "hist_by_os.png" survey_file = "survey.html" survey_dir = "survey_{}_{}".format(self.name, source_name) resource_dir = os.path.join(survey_dir, resources) html_file = os.path.join(survey_dir, survey_file) try: shutil.rmtree(survey_dir) except: pass os.mkdir(survey_dir) os.mkdir(resource_dir) src_info = core.get_source_info(self.cnx, source_name) self.discover_histogram_by_source( source_name, os.path.join(resource_dir, img_by_src)) self.discover_histogram_by_os(src_info.os_name, os.path.join(resource_dir, img_by_os)) anomalies = self.discover_detect_anomalies(source_name, None) results = self.show_results("bottom", 100, source_name, None) with open(html_file, 'w') as f: f.write(""" <html> <link href="../../../resources/css/style.css" rel="stylesheet" type="text/css"> <h2 class="redwood-title">Filter Prevalence Snapshot</h2> <body> <h3 class="redwood-header">Histogram for {}</h3> <img src="{}"> <h3 class="redwood-header">Histogram for Operating System - {}</h3> <img src="{}"> """.format(source_name, os.path.join(resources, img_by_src), src_info.os_name, os.path.join(resources, img_by_os))) f.write( "<h3 class=\"redwood-header\">The lowest 100 reputations for this filter</h3>" ) f.write("<table border=\"1\" id=\"redwood-table\">") f.write( "<thead><tr><th class=\"rounded-head-left\">Score</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>" ) i = 0 lr = len(results) for r in results: if i == lr - 1: f.write("</tbody><tfoot>") f.write( "<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>" .format(r[0], r[1], r[2])) else: f.write( "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format( r[0], r[1], r[2])) i += 1 f.write("</table>") f.write( "<h3 class=\"redwood-header\">The top 100 anomalous files</h3>" ) 
f.write("<table border=\"1\" id=\"redwood-table\">") f.write( "<thead><tr><th class=\"rounded-head-left\">Anomaly Value</th><th>Parent Path</th><th class=\"rounded-head-right\">Filename</th></tr></thead><tbody>" ) i = 0 lr = len(anomalies) for r in anomalies: if i == lr - 1: f.write("</tbody><tfoot>") f.write( "<tr><td class=\"rounded-foot-left-light\">{}</td><td>{}</td><td class=\"rounded-foot-right-light\">{}</td></tr></tfoot>" .format(r[0], r[1], r[2])) else: f.write( "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format( r[0], r[1], r[2])) i += 1 #for r in anomalies: # f.write("<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(r[0], r[1], r[2])) f.write("</table>") return survey_dir
def update(self, source): """ Updates the scores of the fp_scores table with the new data from the inputted source :param source: identifier for the source to be updated """ print "[+] Prevalence Filter running on {} ".format(source) #creates the basic tables if they do not exist self.build() cursor = self.cnx.cursor() src_info = core.get_source_info(self.cnx, source) if src_info is None: print "Error: Source {} not found".format(source) return #initial insert #query = """ # INSERT INTO fp_scores(id, score) # SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average) # FROM global_file_prevalence JOIN file_metadata # ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id # where file_metadata.source_id = {} # ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average) #""".format(src_info.source_id) cursor.execute( """ INSERT INTO fp_scores(id, score) SELECT global_file_prevalence.unique_file_id, IF(num_systems < 3, .5, average) FROM global_file_prevalence JOIN file_metadata ON file_metadata.unique_file_id = global_file_prevalence.unique_file_id where file_metadata.source_id = %s ON DUPLICATE KEY UPDATE score = IF(num_systems < 3, .5, average) """, (src_info.source_id, )) self.cnx.commit() #adjustment for low outliers in high prevalent directories... This could probably better be done with taking the std dev of each #dir, but his will have to work for now. 
#query = """ # UPDATE global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id # LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id # LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id # LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id # SET fp_scores.score = fp_scores.score * .5 # where file_metadata.source_id = {} AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2 # and global_dir_combined_prevalence.average - global_file_prevalence.average > .6 #""".format(src_info.source_id) cursor.execute( """ UPDATE global_file_prevalence left join file_metadata ON global_file_prevalence.unique_file_id = file_metadata.unique_file_id LEFT JOIN global_dir_prevalence on file_metadata.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN global_dir_combined_prevalence on file_metadata.unique_path_id = global_dir_combined_prevalence.unique_path_id LEFT JOIN fp_scores ON fp_scores.id = global_file_prevalence.unique_file_id SET fp_scores.score = fp_scores.score * .5 where file_metadata.source_id = %s AND global_file_prevalence.count = 1 and global_file_prevalence.num_systems > 2 and global_dir_combined_prevalence.average - global_file_prevalence.average > .6 """, (src_info.source_id, )) self.cnx.commit() #adjustments for low prevalent scored directories which occur often... 
hopefully this will exclude the caches #query = """ # UPDATE file_metadata # LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id # LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id # LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id # SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score # where file_metadata.source_id = {} AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5 #""".format(src_info.source_id) cursor.execute( """ UPDATE file_metadata LEFT JOIN global_dir_prevalence ON file_metadata.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN global_dir_combined_prevalence ON global_dir_combined_prevalence.unique_path_id = global_dir_prevalence.unique_path_id LEFT JOIN fp_scores ON file_metadata.unique_file_id = fp_scores.id SET fp_scores.score = (1 - fp_scores.score) * .25 + fp_scores.score where file_metadata.source_id = %s AND global_dir_prevalence.average > .8 AND global_dir_combined_prevalence.average < .5 """, (src_info.source_id, )) self.cnx.commit() cursor.close()