def runIndexedSearch(dbfilenameFullPath, search_space, options):
    """Run a literal search using the DB index on the search_space field.

    Writes hits to options.outputFile (plus a companion .mmd markdown file
    in the same directory) and returns a (num_hits, num_hits_suppressed,
    results) tuple mirroring the regular search path.
    """
    # todo: Handle duplicate hit supression
    logger.info("Performing indexed search")
    DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
    DB.appInitDB()
    DB.appConnectDB()
    searchTerm = options.searchLiteral[0]
    numHits = 0
    # Run actual indexed query. search_space is a trusted column name;
    # escape embedded single quotes in the user-supplied term so it cannot
    # break out of the SQL string literal.
    data = DB.Query("SELECT RowID FROM Entries_FilePaths WHERE %s == '%s';" %
                    (search_space, searchTerm.replace("'", "''")))
    if data:
        # Fixed: build the markdown twin from the output file's base name
        # only. The old code joined the *full* splitext() result (which
        # still contains the directory) back onto its own dirname,
        # duplicating the directory component for relative paths.
        markdown_name = ntpath.splitext(ntpath.basename(options.outputFile))[0] + ".mmd"
        markdown_path = os.path.join(ntpath.dirname(options.outputFile), markdown_name)
        with open(options.outputFile, "w") as text_file:
            with open(markdown_path, "w") as markdown_file:
                for row in data:
                    record = retrieveSearchData(row[0], DB, search_space)
                    saveSearchData(record, None, None, text_file, markdown_file)
                    numHits += 1
        return (numHits, 0, [])
    else:
        return (0, 0, [])
def test_Stack_Generic01(self):
    # Cross-check the "stack" module against both search paths: for a
    # sample of stacked FileNames, the stack count must equal the hit
    # count from a regular literal search and from an indexed fsearch,
    # and the sum of all stack counts must equal the total entry count.
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Run
        (db_filenameFullPath, db_version, db_count, num_instances,
         num_entries) = main([self.testset1, "status"])
        ret = main([self.testset1, "stack", "FileName"])
        # Sample a handful of stacked (count, file_name) rows; ret[0] is
        # presumably a header row — TODO confirm against the stack module.
        for item_count, item_file_name in [(int(i[1][0]), i[1][1])
                                           for i in ret[1:]][1:10]:
            print "Checking: " + item_file_name
            # Regular literal search ('\' prefix anchors the file name)
            (num_hits, num_hits_suppressed, results) = main(
                [self.testset1, "search", "-F", '\\' + item_file_name])
            self.assertEquals(num_hits, item_count,
                              "test_Stack_Generic01 failed!")
            # Indexed search ('=' prefix requests an exact match)
            (num_hits2, num_hits_suppressed2, results2) = main([
                self.testset1, "fsearch", "FileName", "-F",
                "=" + item_file_name
            ])
            self.assertEquals(num_hits2, item_count,
                              "test_Stack_Generic01 failed!")
        # Check total entry count from stacking on FileName = total
        # entries.
        count = sum([int(i[1][0]) for i in ret[1:]])
        self.assertEquals(count, num_entries, "test_Stack_Generic01 failed!")
def test_Filehitcount1(self): with appDB.DBClass(self.testset1, settings.__version__) as DB: DB.appInitDB() conn = DB.appConnectDB() entry_fields = settings.EntriesFields(EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='test123.exe') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields(EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='test1234.exe') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields(EntryType=settings.__APPCOMPAT__, FilePath='C:\\test123.exe', FileName='nohit.exe') add_entry(DB, "TestHost01", entry_fields) # Get temp db name for the test temp_file = tempfile.NamedTemporaryFile(suffix='.db', prefix='testCase', dir=tempfile.gettempdir()) temp_file.close() with open(temp_file.name, 'w') as fh: fh.write('test123.exe') try: ret = main([self.testset1, "filehitcount", temp_file.name]) except Exception as e: print traceback.format_exc() self.fail(e.message + "\n" + traceback.format_exc()) # Remove temp file os.remove(temp_file.name) num_hits = len(ret) self.assertEquals(num_hits, 2, sys._getframe().f_code.co_name) self.assertEquals(ret[1][1][1][0], 'test123.exe', "test_Tstomp1 failed!") self.assertEquals(int(ret[1][1][1][1]), 1, "test_Tstomp1 failed!")
def test_AppCompat_LiteralSearchNoHits(self):
    # Searching for a random, never-inserted file name must produce zero
    # hits, and the output file must contain no matching lines.
    random_name = ''.join(random.choice(string.ascii_uppercase) for _ in range(20))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Temp file to capture the search output
        with tempfile.NamedTemporaryFile(suffix='.txt',
                                         prefix='test_AppCompat_LiteralSearch',
                                         dir=tempfile.gettempdir()) as temp_file:
            search_args = ["-o", temp_file.name, self.testset1,
                           "search", "-F", random_name]
            num_hits, num_hits_suppressed, results = main(search_args)
            # No entry should match the random term
            self.assertTrue(
                num_hits == 0,
                sys._getframe().f_code.co_name + " num_hits: %d" % num_hits)
            # Output line count must agree with (hits - suppressed)
            self.assertEquals(
                num_hits - num_hits_suppressed,
                self.count_lines_regex(temp_file.name, random_name),
                sys._getframe().f_code.co_name +
                " Output regex count doesn't match num_hits!")
def test_Stack(self):
    # Stack on FileName (filtered to c:\Windows) and confirm the random
    # file name we inserted ten times is reported with a count of 10.
    target_name = ''.join(random.choice(string.ascii_uppercase) for _ in range(15))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Ten distinct entries (unique Size) sharing the same file name
        for size in xrange(0, 10):
            add_entry(DB, "TestHost01", settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\Windows',
                FileName=target_name,
                Size=size,
                ExecFlag='True'))
        # Stack on FileName
        ret = main([self.testset1, "stack", "FileName",
                    "FilePath = 'c:\Windows'"])
        # Locate our file name in the stack output and check its count
        matching_counts = [row[1][0] for row in ret if target_name in row[1]]
        self.assertEquals(int(matching_counts[0]), 10, "test_Stack failed!")
def test_TStack(self):
    # Temporal stack between 1000-01-01 and 1000-01-02 must report all
    # ten planted entries as in-window hits and none as out-of-window.
    target_name = 'randomfilename.rnd'
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Ten entries whose LastModified timestamps fall inside the window
        for idx in xrange(0, 10):
            add_entry(DB, "TestHost01", settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\Windows',
                FileName=target_name,
                Size=idx,
                LastModified='1000-01-01 00:00:0' + str(idx)))
        # Run
        ret = main([self.testset1, "tstack", '1000-01-01', '1000-01-02'])
        # File name, in-window count, out-of-window count
        self.assertEquals(ret[1][1][0], target_name, "test_TStack failed!")
        self.assertEquals(int(ret[1][1][1]), 10, "test_TStack failed!")
        self.assertEquals(int(ret[1][1][2]), 0, "test_TStack failed!")
def test_AmCache_LiteralSearch(self):
    # Insert 10 AmCache entries for calc.exe, then verify a literal
    # search for "calc.exe" finds at least those 10 and that the output
    # file contains exactly num_hits matching lines.
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        for i in xrange(0, 10):
            # Distinct Size per entry so each row is unique
            entry_fields = settings.EntriesFields(
                EntryType=settings.__AMCACHE__,
                FilePath='C:\Temp',
                FileName='calc.exe',
                Size=i,
                ExecFlag='True')
            add_entry(DB, "TestHost01", entry_fields)
        # Get temp file name for the DB
        with tempfile.NamedTemporaryFile(
                suffix='.txt', prefix='Output',
                dir=tempfile.gettempdir()) as temp_file:
            # Search
            (num_hits, num_hits_suppressed, results) = main([
                "-o", temp_file.name, self.testset1, "search", "-F",
                "calc.exe"
            ])
            # Check we got at least as many as we added into the DB
            # (testset1 may already contain calc.exe entries, hence >=)
            self.assertTrue(
                num_hits >= 10,
                sys._getframe().f_code.co_name + " num_hits: %d" % num_hits)
            # Check output has the expected result
            self.assertEquals(
                num_hits,
                self.count_lines_regex(temp_file.name, "calc\.exe"),
                sys._getframe().f_code.co_name +
                " Output regex count doesn't match num_hits!")
def test_MPEngine_ConsumerSimple(self):
    # Smoke test for MPEngineProdCons with a DB-backed consumer: create a
    # fresh temp DB, queue a few tasks, attach and then detach a consumer,
    # and fail on any exception.
    try:
        # Get temp db name for the test
        tempdb = tempfile.NamedTemporaryFile(suffix='.db',
                                             prefix='testCase',
                                             dir=tempfile.gettempdir())
        tempdb.close()
        dbfilenameFullPath = tempdb.name
        with appDB.DBClass(dbfilenameFullPath, settings.__version__) as DB:
            DB.appInitDB()
            print "Starting test"
            # 6 workers; WkrTestProd produces, WkrTestConsDB writes to the DB
            mpe = MPEngineProdCons(6, WkrTestProd, WkrTestConsDB)
            # Add tasks
            task_list = [i for i in xrange(1, 5)]
            mpe.addTaskList(task_list)
            mpe.addConsumer([dbfilenameFullPath])
            # Give the consumer a moment to spin up before removing it
            time.sleep(1)
            mpe.removeConsumer()
            del mpe
            print "Test ended"
    except Exception:
        traceback.print_exc(file=sys.stdout)
        self.fail("Exception triggered")
    # Pass
    self.assertEquals(1, 1, "test_MPEngine")
def test_AppCompat_IndexedSearch2(self):
    # Verify that an indexed search (fsearch on FileName) and a regular
    # literal search return the same hits for the same random file name -
    # both in counts and in the actual files written.
    rndFileName = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(20))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        for i in xrange(0, 20):
            # Distinct Size per entry so each row is unique
            entry_fields = settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\Temp',
                FileName=rndFileName,
                Size=i,
                ExecFlag='True')
            add_entry(DB, "TestHost01", entry_fields)
        # Get temp file name for the DB
        with tempfile.NamedTemporaryFile(
                suffix='.txt',
                prefix='test_AppCompat_IndexedSearch',
                dir=tempfile.gettempdir()) as temp_file_indexed:
            with tempfile.NamedTemporaryFile(
                    suffix='.txt',
                    prefix='test_AppCompat_NormalSearch',
                    dir=tempfile.gettempdir()) as temp_file_normal:
                # Indexed Search
                (num_hits, num_hits_suppressed, results) = main([
                    "-o", temp_file_indexed.name, self.testset1, "fsearch",
                    "FileName", "-F", rndFileName
                ])
                # Standard Search ('\' prefix anchors the file name)
                (num_hits2, num_hits_suppressed2, results2) = main([
                    "-o", temp_file_normal.name, self.testset1, "search",
                    "-F", "\\" + rndFileName
                ])
                # Check we got the same number of hits
                self.assertTrue(
                    num_hits == num_hits2,
                    sys._getframe().f_code.co_name +
                    " num_hits: %d" % num_hits)
                # Check output has the expected results
                self.assertEquals(
                    num_hits - num_hits_suppressed,
                    self.count_lines_regex(temp_file_indexed.name,
                                           rndFileName),
                    sys._getframe().f_code.co_name +
                    " Output regex count doesn't match num_hits!")
                # Check output has the expected results
                self.assertEquals(
                    num_hits2 - num_hits_suppressed2,
                    self.count_lines_regex(temp_file_normal.name,
                                           rndFileName),
                    sys._getframe().f_code.co_name +
                    " Output regex count doesn't match num_hits!")
                # Check standard and indexed search produced the same results
                self.assertTrue(
                    self.compare_output_files(temp_file_normal.name,
                                              temp_file_indexed.name),
                    "Results differ!")
def dumpCSV(self, dbfilenameFullPath, dumpfilenameFullPath):
    # Dump every row of the Csv_Dump table to a comma-separated text file.
    database = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
    database.appInitDB()
    conn = database.appConnectDB()
    result_rows = database.Query("SELECT * FROM Csv_Dump")
    with open(dumpfilenameFullPath, "w") as out_file:
        for result_row in result_rows:
            # Stringify each column and emit one CSV line per row
            out_file.write("%s\n" % ','.join([str(col) for col in result_row]))
            out_file.flush()
def test_AppCompat_LiteralSearch_Suppressed(self):
    # Verify hit suppression: 10 unique entries plus 10 identical ones
    # must report 20 raw hits with 9 suppressed (the 10 identical rows
    # collapse to a single output line).
    rndFileName = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(15))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Add 10 entries (unique Size each)
        for i in xrange(0, 10):
            entry_fields = settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\Temp',
                FileName=rndFileName,
                Size=i,
                ExecFlag='True')
            add_entry(DB, "TestHost01", entry_fields)
        # Add 10 entries which will be deduped to 1 on search
        for i in xrange(0, 10):
            entry_fields = settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\Temp',
                FileName=rndFileName,
                Size=1000,
                ExecFlag='True')
            add_entry(DB, "TestHost01", entry_fields)
        # Get temp file name for the DB
        with tempfile.NamedTemporaryFile(
                suffix='.txt', prefix='Output',
                dir=tempfile.gettempdir()) as temp_file:
            # Search
            (num_hits, num_hits_suppressed, results) = main([
                "-o", temp_file.name, self.testset1, "search", "-F",
                rndFileName
            ])
            # Check we got as many hits as we expect
            self.assertTrue(
                num_hits == 10 + 10,
                sys._getframe().f_code.co_name + " num_hits: %d - %s" %
                (num_hits, self.testset1))
            # Check supression worked as expected
            self.assertTrue(
                num_hits_suppressed == 9,
                sys._getframe().f_code.co_name + " num_hits: %d" % num_hits)
            # Check output has the expected result
            self.assertEquals(
                num_hits - num_hits_suppressed,
                self.count_lines_regex(temp_file.name, rndFileName),
                sys._getframe().f_code.co_name +
                " Output regex count doesn't match num_hits!")
def test_Leven2(self):
    # The leven module should flag 'svchosts.exe' - one edit away from a
    # well-known executable name - in its results.
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Plant the suspicious look-alike name
        lookalike = settings.EntriesFields(
            EntryType=settings.__APPCOMPAT__,
            FilePath='C:\Windows\System32',
            FileName='svchosts.exe')
        add_entry(DB, "TestHost01", lookalike)
        # Run
        ret = main([self.testset1, "leven"])
        # Our planted name must appear in the results
        self.assertEquals('svchosts.exe' in ret[1][1][1], True,
                          "test_Leven2 failed!")
def test_Leven(self):
    # Querying leven with a name one edit away from a planted random file
    # name must return the planted name as the match.
    planted_name = ''.join(random.choice(string.ascii_uppercase) for _ in range(15))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Plant the target file name
        add_entry(DB, "TestHost01", settings.EntriesFields(
            EntryType=settings.__APPCOMPAT__,
            FilePath='C:\Windows\System32',
            FileName=planted_name))
        # Query with one extra leading character (edit distance 1)
        ret = main([self.testset1, "leven", 'a' + planted_name])
        # The result field carries the matched name in quotes
        self.assertEquals(ret[1][1][1], "'" + planted_name + "'",
                          "test_Leven failed!")
def test_MPEngine_DatabaseLocked(self):
    # Stress the producer/consumer engine against a sqlite-backed consumer
    # while forcing worker restarts mid-run, to exercise recovery from DB
    # contention. All tasks must still be accounted for at the end.
    try:
        logger.info("Starting test_MPEngine_end2end_BalanceSimulation")
        # Get temp db name for the test
        tempdb = tempfile.NamedTemporaryFile(suffix='.db',
                                             prefix='testCase',
                                             dir=tempfile.gettempdir())
        tempdb.close()
        dbfilenameFullPath = tempdb.name
        with appDB.DBClass(dbfilenameFullPath, settings.__version__) as DB:
            DB.appInitDB()
            num_tasks = 50
            mpe = MPEngineProdCons(4, WkrTestProdFast, WkrTestConsDB)
            # Add tasks
            task_list = [i for i in xrange(1, num_tasks + 1)]
            mpe.addTaskList(task_list)
            mpe.addConsumer([dbfilenameFullPath])
            mpe.addProducer()
            loop_test_num = num_tasks
            while mpe.working():
                (num_prod, num_cons, task1, task2, task3) = mpe.getProgress()
                print("Prod: %d / Cons: %d | %s -> %s -> %s" % mpe.getProgress())
                time.sleep(1)
                # Once tasks are mid-flight, restart workers to simulate a
                # rebalance while the DB is in use.
                if task3 >= 20 and task3 <= 30:
                    logger.info("Simulating rebalance (task3: %d task1/2: %d"
                                % (task3, task1 / 2))
                    mpe.restartConsumers()
                    mpe.restartProducers()
                loop_test_num -= 1
            results = mpe.grabResults()
            # Every task must have produced a result despite the restarts
            self.assertEquals(len(results), num_tasks, "test_MPEngine_end2end")
            self.assertEquals(results[-1], num_tasks, "test_MPEngine_end2end")
            del mpe
            print "Test ended"
    except Exception:
        traceback.print_exc(file=sys.stdout)
        self.fail("Exception triggered")
    # Pass
    self.assertEquals(1, 1, "test_MPEngine_end2end")
def run(self):
    # Consumer worker entry point: opens a per-process sqlite connection,
    # runs the normal worker loop via the parent class, then closes the
    # connection.
    # Note: __init__ runs on multiprocessing's main thread and as such we can't use that to init a sqlite connection
    assert(len(self.extra_arg_list) == 1)
    # Single extra arg is the DB path this consumer writes to
    self.dbfilenameFullPath = self.extra_arg_list[0]
    self.DB = None
    self.conn = None
    # Init DB access to DB
    self.DB = appDB.DBClass(self.dbfilenameFullPath, True, settings.__version__)
    # self.DB.appInitDB()
    self.conn = self.DB.appConnectDB()
    # Call super run to continue with the natural worker flow
    super(appLoadCons, self).run()
    # Close DB connection
    self.logger.debug("%s - closing down DB" % self.proc_name)
    self.conn.close()
    del self.DB
def run(self):
    # Test consumer entry point: mirrors appLoadCons.run - open a
    # per-process DB connection, run the worker loop, then tear down.
    self.logger.info("WorkerTestConsumerDB: Run")
    # Single extra arg is the DB path this consumer writes to
    self.dbfilenameFullPath = self.extra_arg_list[0]
    self.DB = None
    self.conn = None
    # Init DB access to DB
    self.DB = appDB.DBClass(self.dbfilenameFullPath, True, settings.__version__)
    self.conn = self.DB.appConnectDB()
    self.logger.info("WorkerTestConsumerDB: appConnectDB done")
    # Call super run to continue with the natural worker flow
    super(WkrTestConsDB, self).run()
    # Close DB connection
    self.logger.info("%s - closing down DB" % self.proc_name)
    # # Simulate a very log pending queue of data that needs to be dumped to the DB before we can exit:
    # self.write_to_DB(10, 20)
    # self.conn.close()
    self.logger.info("%s - deleting DB object" % self.proc_name)
    del self.DB
def test_AppCompat_IndexedSearchFilePath(self):
    # Indexed search (fsearch) on the FilePath field: insert 20 entries
    # under a random directory and verify the search returns exactly 20
    # hits and the output file matches.
    rndFileName = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(20))
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        for i in xrange(0, 20):
            # Distinct Size per entry so each row is unique
            entry_fields = settings.EntriesFields(
                EntryType=settings.__APPCOMPAT__,
                FilePath='C:\\' + rndFileName,
                FileName="calc.exe",
                Size=i,
                ExecFlag='True')
            add_entry(DB, "TestHost01", entry_fields)
        # Get temp file name for the DB
        with tempfile.NamedTemporaryFile(
                suffix='.txt',
                prefix='test_AppCompat_IndexedSearch',
                dir=tempfile.gettempdir()) as temp_file:
            # Search
            (num_hits, num_hits_suppressed, results) = main([
                "-o", temp_file.name, self.testset1, "fsearch", "FilePath",
                "-F", "C:\\" + rndFileName
            ])
            # Check we got at least as many as we added into the DB
            self.assertTrue(
                num_hits == 20,
                sys._getframe().f_code.co_name + " num_hits: %d" % num_hits)
            # Check output has the expected result
            self.assertEquals(
                num_hits - num_hits_suppressed,
                self.count_lines_regex(temp_file.name, rndFileName),
                sys._getframe().f_code.co_name +
                " Output regex count doesn't match num_hits!")
def test_StatusAppCompat(self):
    # Verify the "status" module output against the database itself:
    # host count, reported DB path, and a plausible per-host entry count.
    with appDB.DBClass(self.testset1, settings.__version__) as DB:
        DB.appInitDB()
        conn = DB.appConnectDB()
        # Get host list
        (db_filenameFullPath2, db_version2, db_count2, num_instances2,
         num_entries2) = main([self.testset1, "status"])
        db_count_query = DB.CountHosts()
        # Check status count == db count
        # (Fixed: messages previously referenced "test_StatusAmCache")
        self.assertEquals(db_count2, db_count_query,
                          "test_StatusAppCompat failed!")
        # Check reported path == known path
        self.assertEquals(db_filenameFullPath2, self.testset1,
                          "test_StatusAppCompat failed!")
        # Check entries count is within expected parameters (the fake DB
        # builder adds 400-800 entries per host).
        self.assertTrue(
            num_entries2 > 400 * self.fake_bd_num_hosts and
            num_entries2 < 800 * self.fake_bd_num_hosts,
            "test_StatusAppCompat failed!")
def build_fake_DB(hosts=10, seed=random.randint(0, 10000), database_file=None):
    """Create (or extend) a database populated with fake appcompat entries.

    hosts: number of fake hosts to add.
    seed: RNG seed for reproducible data. NOTE(review): this default is
        evaluated once at import time, so repeated calls without an explicit
        seed reuse the same value - confirm whether that is intended.
    database_file: target DB path; a temp file is created when None, and an
        existing DB is extended rather than recreated.
    Returns the database file path.
    """
    hostnames_set = set()
    filePaths_dict = defaultdict(int)
    filePaths_dict_ID = 0
    # Number of pre-existing FilePaths rows to skip when bulk-inserting
    filePaths_dict_ID_skip = 0
    random.seed(seed)
    fake.seed(seed)
    fake_ES.seed(seed)
    if database_file == None:
        # Get temp db name for the test
        tempdb = tempfile.NamedTemporaryFile(suffix='.db',
                                             prefix='testCase',
                                             dir=tempfile.gettempdir())
        tempdb.close()
        database_file = tempdb.name
    if os.path.isfile(database_file):
        logger.warning("Adding hosts to existing database")
        with appDB.DBClass(database_file, "False", settings.__version__) as DB:
            conn = DB.appConnectDB()
            # Load existing hosts
            data = DB.Query("SELECT HostName FROM Hosts")
            for hostName in data:
                hostnames_set.add(hostName[0])
            # Load existing paths
            data = DB.Query("SELECT FilePathID, FilePath FROM FilePaths")
            for filePathID, FilePath in data:
                filePaths_dict[FilePath] = (filePathID)
                filePaths_dict_ID += 1
            filePaths_dict_ID_skip = filePaths_dict_ID
    else:
        # Fresh DB: create schema and the indexes the loaders rely on
        with appDB.DBClass(database_file, "True", settings.__version__) as DB:
            DB.appInitDB()
            DB.appSetIndex()
            conn = DB.appConnectDB()
            DB.appRequireIndexesDB(
                "index_EntriesHostName",
                "CREATE INDEX index_EntriesHostName on Hosts(HostName)")
            DB.appRequireIndexesDB(
                "index_FilePathsFilePath",
                "CREATE INDEX index_FilePathsFilePath on FilePaths(FilePath)")
    with appDB.DBClass(database_file, "False", settings.__version__) as DB:
        conn = DB.appConnectDB()
        # Start creating hosts and data:
        rowList = []
        insertList = []
        # 29 Entries columns minus the 3 filled literally (NULL, 0, 0)
        numFields = 29 - 3
        valuesQuery = "(NULL," + "?," * numFields + "0, 0)"
        progressCurrent = 0
        progressTotal = hosts
        # NOTE(review): the inner entries loop below reuses loop variable
        # 'i', shadowing this host counter - harmless here but fragile.
        for i in xrange(0, hosts):
            progressCurrent += 1
            update_progress(float(progressCurrent) / float(progressTotal))
            HostName = ""
            # Generate a unique fake host name
            while True:
                HostName = strip_accents(
                    (fake_ES.color_name() + fake_ES.country()).replace(
                        ' ', ''))
                HostName = strip_non_ascii(HostName)
                HostName += "_" + str(random.randint(000, 999))
                if HostName not in hostnames_set:
                    hostnames_set.add(HostName)
                    break
            print "Creating appcompat/amcache data for host: %s" % HostName
            Instances = ['dummy']
            InstancesCounter = 1
            Recon = 0
            ReconScoring = 0
            DB.ExecuteMany("INSERT INTO Hosts VALUES (NULL,?,?,?,?,?)",
                           [(HostName, str(repr(Instances)), InstancesCounter,
                             Recon, ReconScoring)])
            HostID = DB.Query(
                "SELECT HostID FROM Hosts WHERE HostName = '%s'" %
                HostName)[0][0]
            # Sampled 2K hosts, this should statistically provide a somewhat realistic amount of entries (for AppCompat)
            for i in xrange(1, random.randint(400, 800)):
                # EntryType = random.choice([settings.__APPCOMPAT__,settings.__AMCACHE__])
                EntryType = settings.__APPCOMPAT__
                RowNumber = 0
                LastModified = str(fake.date_time_between('-1y')) + "." + str(
                    random.randint(1, 9999))
                LastUpdate = str(fake.date_time_between('-4y')) + "." + str(
                    random.randint(1, 9999))
                filePathID = 0
                # todo: FilePath retains final backslash on root paths (c:\, d:\ ...) remove.
                FilePath, FileName = ntpath.split(fake.path())
                FilePath = FilePath.lower()
                FileName = FileName.lower()
                Size = random.randint(1, 100000)
                if EntryType == settings.__APPCOMPAT__:
                    ExecFlag = random.choice(['True', 'False'])
                else:
                    ExecFlag = 'True'
                if EntryType == settings.__AMCACHE__:
                    # AmCache-only metadata fields
                    SHA1 = fake.sha1()
                    FileDescription = random.choice(
                        ['', '', '', '', '', '', '', '', '', '', fake.text()])
                    FirstRun = str(fake.date_time_between('-1y')) + "." + str(
                        random.randint(1, 9999))
                    Created = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    Modified1 = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    Modified2 = str(fake.date_time_between('-5y')) + "." + str(
                        random.randint(1, 9999))
                    LinkerTS = str(fake.date_time_between('-10y'))
                    Company = fake.company()
                    PE_sizeofimage = random.randint(1, 10000)
                    # Redo re-assignment of date we do on load for AmCache
                    LastUpdate = FirstRun
                    LastModified = Modified2
                else:
                    SHA1 = ''
                    FileDescription = ''
                    FirstRun = ''
                    Created = ''
                    Modified1 = ''
                    Modified2 = ''
                    LinkerTS = ''
                    Company = ''
                    PE_sizeofimage = ''
                Product = 0
                Version_number = 0
                Version = 0
                Language = 0
                Header_hash = 0
                PE_checksum = 0
                SwitchBackContext = 0
                InstanceID = 0
                # # Add FilePath if not there yet
                # DB.Execute("INSERT OR IGNORE INTO FilePaths VALUES (NULL, '%s')" % FilePath)
                # # Get FilePathID
                # FilePathID = DB.QueryInt("SELECT FilePathID FROM FilePaths WHERE FilePath = '%s'" % FilePath)
                # FilePaths are deduplicated in memory and bulk-inserted at
                # the end; IDs are assigned sequentially here.
                if FilePath not in filePaths_dict:
                    filePaths_dict[FilePath] = (filePaths_dict_ID)
                    filePathID = filePaths_dict_ID
                    filePaths_dict_ID += 1
                else:
                    filePathID = filePaths_dict[FilePath]
                insertList.append(
                    (HostID, EntryType, RowNumber, LastModified, LastUpdate,
                     filePathID, FileName, Size, ExecFlag, SHA1,
                     FileDescription, FirstRun, Created, Modified1, Modified2,
                     LinkerTS, Product, Company, PE_sizeofimage,
                     Version_number, Version, Language, Header_hash,
                     PE_checksum, SwitchBackContext, InstanceID))
                # Dump every now and then:
                if len(insertList) > 1000000:
                    logger.info("Dumping data to DB")
                    DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery,
                                   insertList)
                    insertList = []
        # Insert last bucket
        logger.info("Dumping last bucket to DB")
        DB.ExecuteMany("INSERT INTO Entries VALUES " + valuesQuery, insertList)
        # Insert new FilePaths (sorted by ID, skipping pre-existing rows)
        list_FilePath_ID = [(v, k) for k, v in filePaths_dict.items()]
        list_FilePath_ID.sort(key=lambda tup: tup[0])
        DB.ExecuteMany("INSERT INTO FilePaths VALUES (?,?)",
                       list_FilePath_ID[filePaths_dict_ID_skip:])
    return database_file
def test_TcorrTest_prog1(self): with appDB.DBClass(self.testset1, settings.__version__) as DB: DB.appInitDB() conn = DB.appConnectDB() # TestHost01 entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='AAA.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='BBB.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='CCC.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='DDD.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='EEE.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='FFF.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='GGG.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost01", entry_fields) # TestHost02 entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='AAA.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='BBB.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='CCC.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( 
EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='DDD.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='EEE.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='FFF.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='GGG.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost02", entry_fields) try: directCorrelationData = main( [self.testset1, "tcorr", "DDD.exe", "-w 1"]) except Exception as e: print traceback.format_exc() self.fail(e.message + "\n" + traceback.format_exc()) # Check Names self.assertEquals(directCorrelationData[1][3], "CCC.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[0][3], "EEE.exe", "test_TcorrTest_prog1 - Name failed!") # Check Before self.assertEquals(directCorrelationData[1][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[0][6], 2, "test_TcorrTest_prog1 - Name failed!") # Check After self.assertEquals(directCorrelationData[1][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[0][7], 0, "test_TcorrTest_prog1 - Name failed!") # Check InvBond self.assertEquals(directCorrelationData[1][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[0][9], "True", "test_TcorrTest_prog1 - Name failed!") # Check Total_Count self.assertEquals(directCorrelationData[1][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[0][10], 2, "test_TcorrTest_prog1 - Name failed!") try: directCorrelationData = main( [self.testset1, "tcorr", "DDD.exe", "-w 2"]) except Exception as e: print traceback.format_exc() 
self.fail(e.message + "\n" + traceback.format_exc()) # Check Names self.assertEquals(directCorrelationData[0][3], "CCC.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][3], "EEE.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][3], "BBB.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][3], "FFF.exe", "test_TcorrTest_prog1 - Name failed!") # Check Before self.assertEquals(directCorrelationData[0][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][6], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][6], 2, "test_TcorrTest_prog1 - Name failed!") # Check After self.assertEquals(directCorrelationData[0][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][7], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][7], 0, "test_TcorrTest_prog1 - Name failed!") # Check InvBond self.assertEquals(directCorrelationData[0][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][9], "True", "test_TcorrTest_prog1 - Name failed!") # Check Total_Count self.assertEquals(directCorrelationData[0][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][10], 2, "test_TcorrTest_prog1 - Name failed!") # Check Weight self.assertTrue( 
directCorrelationData[0][8] > directCorrelationData[2][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] > directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[2][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] == directCorrelationData[1][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[2][8] == directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") try: directCorrelationData = main( [self.testset1, "tcorr", "DDD.exe", "-w 3"]) except Exception as e: print traceback.format_exc() self.fail(e.message + "\n" + traceback.format_exc()) # Check Names self.assertEquals(directCorrelationData[0][3], "CCC.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][3], "EEE.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][3], "BBB.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][3], "FFF.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[4][3], "AAA.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[5][3], "GGG.exe", "test_TcorrTest_prog1 - Name failed!") # Check Before self.assertEquals(directCorrelationData[0][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][6], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][6], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[4][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[5][6], 2, "test_TcorrTest_prog1 - Name 
failed!") # Check After self.assertEquals(directCorrelationData[0][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][7], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][7], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[4][7], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[5][7], 0, "test_TcorrTest_prog1 - Name failed!") # Check InvBond self.assertEquals(directCorrelationData[0][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[4][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[5][9], "True", "test_TcorrTest_prog1 - Name failed!") # Check Total_Count self.assertEquals(directCorrelationData[0][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[2][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[3][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[4][10], 2, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[5][10], 2, "test_TcorrTest_prog1 - Name failed!") # Check Weight self.assertTrue( directCorrelationData[0][8] > directCorrelationData[2][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] > directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] > 
directCorrelationData[4][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] > directCorrelationData[5][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[2][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[4][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[1][8] > directCorrelationData[5][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[0][8] == directCorrelationData[1][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[2][8] == directCorrelationData[3][8], "test_TcorrTest_prog1 - Name failed!") self.assertTrue( directCorrelationData[4][8] == directCorrelationData[5][8], "test_TcorrTest_prog1 - Name failed!") # TestHost03 entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='AAA.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='BBB.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='CCC.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='DDD.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='EEE.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', 
FileName='FFF.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) entry_fields = settings.EntriesFields( EntryType=settings.__APPCOMPAT__, FilePath='C:\Temp', FileName='GGG.exe', Size=1, ExecFlag='True') add_entry(DB, "TestHost03", entry_fields) try: directCorrelationData = main( [self.testset1, "tcorr", "DDD.exe", "-w 1"]) except Exception as e: print traceback.format_exc() self.fail(e.message + "\n" + traceback.format_exc()) # Check Names self.assertEquals(directCorrelationData[0][3], "CCC.exe", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][3], "EEE.exe", "test_TcorrTest_prog1 - Name failed!") # Check Before self.assertEquals(directCorrelationData[0][6], 0, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][6], 3, "test_TcorrTest_prog1 - Name failed!") # Check After self.assertEquals(directCorrelationData[0][7], 3, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][7], 0, "test_TcorrTest_prog1 - Name failed!") # Check InvBond self.assertEquals(directCorrelationData[0][9], "True", "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][9], "True", "test_TcorrTest_prog1 - Name failed!") # Check Total_Count self.assertEquals(directCorrelationData[0][10], 3, "test_TcorrTest_prog1 - Name failed!") self.assertEquals(directCorrelationData[1][10], 3, "test_TcorrTest_prog1 - Name failed!")
def test_Dump(self):
    # Regression test: dump every host in testset10 with appDumpHost() and
    # re-serialize each appcompat CSV line, normalizing the sentinel
    # '0001-01-01 00:00:00' timestamps to 'N/A'. The test passes as long as no
    # exception is raised during the dump/parse round-trip.
    try:
        # Init DB if required
        with appDB.DBClass(self.testset10, settings.__version__) as DB:
            DB.appInitDB()
            conn = DB.appConnectDB()
            # Get host list, highest ReconScoring first
            data = DB.Query(
                "SELECT HostID, HostName, Recon, ReconScoring FROM Hosts ORDER BY ReconScoring DESC"
            )
            # Dump all hosts
            for row in data:
                hostname = row[1]
                # Get temp dump filename (closed immediately; only the name is used)
                temp = tempfile.NamedTemporaryFile(
                    suffix='.txt', prefix='testCase', dir=tempfile.gettempdir())
                dump_filename = temp.name
                temp.close()
                # Dump host
                dump = appDumpHost(DB, hostname, None)
                # Groups: (1) Last Modified, (2) Last Update, (3) dir path,
                # (4) file name, (5) file size, (6) exec flag
                appCompatREGEX = re.compile(
                    r'"((?:\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})|N\/A)","((?:\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})|N\/A)","(.*)\\([^\\]*)","(N\/A|\d*)","(N\/A|True|False)"'
                )
                with open(dump_filename, "wb") as file:
                    for item in dump:
                        if item == 'Last Modified,Last Update,Path,File Size,Exec Flag':
                            # Header row is written through unchanged
                            file.write("%s\r\n" % item)
                        else:
                            m = appCompatREGEX.match(item)
                            # NOTE(review): lines that do not match the regex are
                            # silently dropped from the re-written dump — confirm
                            # that is intended.
                            if m:
                                if m.group(1) == '0001-01-01 00:00:00':
                                    # Zero-date sentinel means "no timestamp"
                                    LastModified = 'N/A'
                                else:
                                    # strptime/strftime round-trip validates the format
                                    LastModified = datetime.datetime.strptime(
                                        unicode(m.group(1)),
                                        '%Y-%m-%d %H:%M:%S').strftime(
                                            '%Y-%m-%d %H:%M:%S')
                                if m.group(2) == '0001-01-01 00:00:00':
                                    LastUpdate = 'N/A'
                                else:
                                    LastUpdate = datetime.datetime.strptime(
                                        unicode(m.group(2)),
                                        '%Y-%m-%d %H:%M:%S').strftime(
                                            '%Y-%m-%d %H:%M:%S')
                                file.write("%s,%s,%s\\%s,%s,%s\r\n" %
                                           (LastModified, LastUpdate,
                                            unicode(m.group(3)),
                                            unicode(m.group(4)),
                                            unicode(m.group(5)),
                                            unicode(m.group(6))))
                # Remove dumped host
                os.remove(dump_filename)
    except Exception:
        traceback.print_exc(file=sys.stdout)
        self.fail("Exception triggered")
def appSearchMP(dbfilenameFullPath, searchType, search_space, options):
    """Search Entries_FilePaths, in parallel where no index can be used.

    Dispatches to runIndexedSearch() when a plain LITERAL search can use an
    existing DB index on options.field_name; otherwise spins up a
    Producer/Consumer multiprocessing pipeline that scans the table in
    row-ranged chunks and dumps hits to options.outputFile (+ .mmd file).

    Args:
        dbfilenameFullPath: path to the SQLite database file.
        searchType: 'LITERAL', 'REGEX', 'COMBINED' or 'KNOWNBAD'.
        search_space: column expression being searched.
        options: parsed CLI options (outputFile, maxCores, search terms, ...).

    Returns:
        (num_hits, num_hits_suppressed, results) tuple.
    """
    (outputFile, maxCores) = (options.outputFile, options.maxCores)
    known_bad_data = None
    # Start timer
    t0 = time.time()

    DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
    conn = DB.appConnectDB()

    # If possible use the available indexes
    if hasattr(options, 'field_name') and searchType == 'LITERAL' \
            and options.searchLiteral[0][0] not in ['=', '>', '<'] \
            and DB.appIndexExistsDB(options.field_name):
        # NOTE(review): these namedtuple classes are (ab)used as mutable
        # holders: '.value' is assigned as a class attribute so that the
        # shared tail below can read num_hits.value uniformly with the
        # multiprocessing.Value objects used in the else-branch. Kept as-is.
        num_hits = namedtuple('hits', 'value')
        num_hits_suppressed = namedtuple('hits', 'value')
        (num_hits.value, num_hits_suppressed.value, results) = \
            runIndexedSearch(dbfilenameFullPath, search_space, options)
    else:
        # Get total number of entries to search
        entriesCount = DB.CountEntries()
        logger.debug("Total entries in search space: %d" % entriesCount)

        # Pre-load known_bad if required
        if searchType == 'KNOWNBAD':
            known_bad_data = LoadRegexBulkSearch(options.knownbad_file)

        # Establish communication queues
        tasks = multiprocessing.JoinableQueue()
        resultsProducers = multiprocessing.Queue()
        resultsConsumers = multiprocessing.Queue()
        hitHistogram_queue = multiprocessing.Queue()

        # Start producers/consumers
        num_consumers = 1
        num_producers = max(1, maxCores - 1)

        # Prep lock for progress update Producers
        progProducers = multiprocessing.Value('i', 0)
        # Prep lock for progress update Consumers
        progConsumers = multiprocessing.Value('i', 0)
        # Prep Consumers return values
        num_hits = multiprocessing.Value('i', 0)
        num_hits_suppressed = multiprocessing.Value('i', 0)

        logger.debug(
            'Using %d cores for searching / %d cores for dumping results' %
            (num_producers, num_consumers))

        # Queue tasks for Producers
        # Limit rowsPerJob to constrain memory use and ensure reasonable progress updates.
        # BUGFIX: clamp to a minimum of 1 row per job; with fewer than 8
        # entries the integer division yielded rowsPerJob == 0 and range()
        # below raised "ValueError: range() arg 3 must not be zero".
        rowsPerJob = max(1, min((entriesCount / 8), 5000))
        logger.debug("RowsPerJob: %d" % rowsPerJob)
        num_tasks = 0
        for startingRowID in range(0, entriesCount - rowsPerJob, rowsPerJob):
            # entriesPerJob is inclusive, hence rowsPerJob - 1
            tasks.put(Task(startingRowID, rowsPerJob - 1))
            logger.debug("Creating search job %d: [%d - %d]" %
                         (num_tasks, startingRowID,
                          startingRowID + rowsPerJob - 1))
            num_tasks += 1
        logger.debug("Creating search job %d: [%d - %d]" %
                     (num_tasks, num_tasks * (rowsPerJob),
                      ((num_tasks * rowsPerJob) +
                       (entriesCount - (num_tasks * (rowsPerJob) - 1)))))
        # Special consideration for the last one:
        # NOTE(review): the last chunk's row-count arithmetic looks
        # off-by-one-prone but matches the original; confirm against
        # Producer.run's inclusive RowID range.
        tasks.put(
            Task(num_tasks * (rowsPerJob),
                 (entriesCount - ((num_tasks * rowsPerJob) - 1))))
        logger.debug("Number of tasks: %d" % num_tasks)

        # Add a poison pill for each producer
        for i in xrange(num_producers):
            tasks.put(None)

        # Start producer threads
        producers = [Producer(tasks, resultsProducers, dbfilenameFullPath, progProducers, num_consumers, \
            searchType, search_space, options, num_hits, known_bad_data) for i in xrange(num_producers)]
        for producer in producers:
            producer.daemon = True  # Remove for debugging
            producer.start()

        # Start consumer threads
        consumers = [Consumer(resultsProducers, resultsConsumers, progConsumers, num_producers, outputFile, \
            dbfilenameFullPath, searchType, search_space, options, num_hits, \
            num_hits_suppressed, hitHistogram_queue, known_bad_data) for i in xrange(num_consumers)]
        for consumer in consumers:
            consumer.daemon = True  # Remove for debugging
            consumer.start()

        # Producer progress loop
        while (num_tasks > progProducers.value and progProducers.value >= 0):
            logger.debug("Producer num_tasks: %d - v.value: %d" %
                         (num_tasks, progProducers.value))
            update_progress(
                min(1, float(progProducers.value) / float(num_tasks)),
                "Searching [%d]" %
                (num_hits.value - num_hits_suppressed.value))
            time.sleep(0.5)
        update_progress(
            1, "Searching [%d]" % (num_hits.value - num_hits_suppressed.value))

        # Wait for consumers dumping results to finish too
        while (num_hits.value > progConsumers.value
               and progConsumers.value >= 0):
            logger.debug("Consuming hit: %d / %d" %
                         (progConsumers.value, num_hits.value))
            update_progress(
                min(1, float(progConsumers.value) / float(num_hits.value)),
                "Dumping results to disk [%d]" % progConsumers.value)
            time.sleep(0.5)

        # Make sure we dumped as many hits as we found
        assert (num_hits.value == progConsumers.value)
        update_progress(1,
                        "Dumping results to disk [%d]" % progConsumers.value)

        # Track Consumers deaths
        logger.debug("Waiting for consumer reverse-poison pills")
        while num_consumers > 0:
            tmp = resultsConsumers.get()
            # Check for reverse-poison pill
            if tmp is None:
                num_consumers -= 1
                logger.debug("Consumer finished!")
        logger.debug("All consumers accounted for")

        # Wait for consumer threads to finish
        logger.debug("Waiting for consumer threads to finish")
        for consumer in consumers:
            consumer.join()
        logger.debug("Consumer threads finished")

        # Print hit histogram:
        results = []
        results.append(('cyan', ("Hit histogram:", "", "")))
        while not hitHistogram_queue.empty():
            (name, regex, regex_hits) = hitHistogram_queue.get()
            results.append(('white', (name, regex, regex_hits)))
        if len(results) > 1:
            outputcolum(results)

    # Stop timer
    t1 = time.time()
    logger.info("Search hits: %d" % num_hits.value)
    logger.info("Suppresed duplicate hits: %d" % num_hits_suppressed.value)
    logger.info("Search time: %s" % (str(timedelta(seconds=(t1 - t0)))))

    if num_hits.value:
        logger.info("Head:")
        # Dump head of output file:
        num_lines = file_size(options.outputFile)
        from itertools import islice
        with open(options.outputFile) as myfile:
            head = list(islice(myfile, 5))
        for line in head:
            logger.info(line.strip('\n\r'))
        logger.info("(%d lines suppressed)" % max(0, (num_lines - 5)))

    return (num_hits.value, num_hits_suppressed.value, results)
def run(self):
    # Consumer process: drains hit RowIDs produced by the Producers, de-dups
    # them, and dumps each surviving record to the text and markdown output
    # files. For KNOWNBAD searches it also re-matches each hit against the
    # individual signatures to build a per-regex hit histogram and to
    # highlight the match in the markdown output.
    proc_name = self.name  # NOTE(review): unused; log lines use self.proc_name
    exitFlag = False
    hit_dict = {}  # regex -> [hit_count, name, regex], feeds the histogram
    logger.debug("%s - Starting consumer process" % (self.proc_name))
    # Init DB if required
    self.DB = appDB.DBClass(self.dbfilenameFullPath, True,
                            settings.__version__)
    self.conn = self.DB.appConnectDB()
    # Load known_bad if required
    if self.searchType == 'KNOWNBAD':
        (searchTermRegex, searchTermRegexFilters,
         known_bad_search_terms) = self.known_bad_data
        for x in known_bad_search_terms:
            hit_dict[x.regex] = [0, x.name, x.regex]
    # Open output files:
    tmp_counter = 0
    with open(self.outputFile, "w") as text_file:
        with open(
                os.path.join(ntpath.dirname(self.outputFile),
                             ntpath.splitext(self.outputFile)[0] + ".mmd"),
                "w") as markdown_file:
            # While there are results to be processed we grab them and process them
            # todo: [High] We're holding all hits in memory now, stage file dumping activity?
            rowID_list = []
            while not exitFlag:
                # Grab next result from queue
                rowID = self.task_queue.get()
                # Check for poison pill from Producers
                if rowID is None:
                    self.num_producers -= 1
                    logger.debug(
                        "%s - Found one poison pill %d Producers left" %
                        (self.proc_name, self.num_producers))
                    # Check if all Producers have finished
                    if self.num_producers == 0:
                        # Reverse poison pill tells appSearchMP we are done
                        self.result_queue.put(None)
                        logger.debug("%s - Exiting process" %
                                     (self.proc_name))
                        exitFlag = True
                    continue
                else:
                    tmp_counter += 1
                    # logger.debug("%s - consuming hit #%d: %d" % (self.proc_name, tmp_counter, rowID))
                    rowID_list.append(rowID)
            # Finished grabbing rowID, now we dump them all:
            dumped_set = set()
            for rowID in rowID_list:
                # Grab entry data we want to save to the output file:
                record = retrieveSearchData(rowID, self.DB, self.search_space)
                # De-dup results: MD5 over a fixed subset of record fields
                entryMD5 = hashlib.md5(''.join([
                    str(e) for e in [
                        record[0], record[1], record[2], record[3],
                        record[4], record[5], record[9]
                    ]
                ])).hexdigest()
                if entryMD5 in dumped_set:
                    # print("Suppressing row %d" % entry[6])
                    with self.num_hits_suppressed.get_lock():
                        self.num_hits_suppressed.value += 1
                else:
                    dumped_set.add(entryMD5)
                    # Re-filter against known bad individually to build histogram and highlight
                    regex_hit_name = None
                    search_space = None
                    if self.searchType == 'KNOWNBAD':
                        # Search for known_bad one by one and filter if required
                        for x in list(known_bad_search_terms):
                            if re.compile(x.regex, re.IGNORECASE).search(
                                    str(record.Search_Space)) is not None:
                                if x.filter is not None:
                                    if re.compile(
                                            x.filter, re.IGNORECASE).search(
                                                str(record.Search_Space)
                                            ) is not None:
                                        # Filter matched: remember the name but
                                        # skip highlighting / counting this sig
                                        regex_hit_name = x.name
                                        continue
                                # 'u200b' is a zero width unicode character I have to use to avoid messy markdown highlighting:
                                # NOTE(review): the 3rd positional argument of
                                # re.sub() is 'count', so re.IGNORECASE (== 2)
                                # here caps substitutions at 2 instead of
                                # setting a flag — likely unintended.
                                search_space = re.compile(
                                    '(.*)(' + x.regex + ')(.*)', re.I).sub(
                                        r'\1' + u'\u200b' + r'**' + u'\u200b'
                                        + r'\2' + u'\u200b' + '**' +
                                        u'\u200b' + r'\3',
                                        record.Search_Space, re.IGNORECASE)
                                # Add hit to know_bad hit counter:
                                regex_hit_name = x.name
                                hit_dict[x.regex][0] += 1
                                # We only report the match with the first regex from our set
                                break
                        # Program flow should never really make it here :)
                        # assert(False, "We're in trouble")
                    else:
                        search_space = record.Search_Space
                    # search_space will be None if Producer hit but Consumer did not:
                    if search_space is None:
                        if regex_hit_name:
                            logger.error(
                                "Producer/Consumer hit mismatch (consumer filtered) ! (report bug please) sig: %s - %s"
                                % (regex_hit_name, record.Search_Space))
                        else:
                            logger.error(
                                "Producer/Consumer hit mismatch! (report bug please) - %s"
                                % record.Search_Space)
                        pass
                    # We dump the data to the output file/s
                    saveSearchData(record, self.searchType, regex_hit_name,
                                   text_file, markdown_file)
                    # Update progress counter
                    with self.val.get_lock():
                        self.val.value += 1
    # Dump hit histogram
    time.sleep(0.5)
    for x in sorted(hit_dict.values(),
                    key=operator.itemgetter(0),
                    reverse=True):
        if x[0] > 0:
            self.hitHistogram_queue.put((x[1], x[2], x[0]))
def run(self):
    # Producer process: pulls (startingRowID, entriesPerJob) tasks off the
    # queue, queries the corresponding inclusive RowID range of
    # Entries_FilePaths, applies the search (LITERAL / REGEX / COMBINED /
    # KNOWNBAD) and forwards matching RowIDs to the Consumers via addHit().
    DB = appDB.DBClass(self.dbfilenameFullPath, True, settings.__version__)
    DB.appInitDB()
    conn = DB.appConnectDB()
    filter_skipped = 0
    # While there are tasks to be ran we grab and run them
    while True:
        # Start timer
        t0 = time.time()
        taskRows = []  # NOTE(review): never populated; only used in a len() debug log below
        # Grab next job from job queue
        next_task = self.task_queue.get()
        if next_task is None:
            # Poison pill means shutdown
            self.task_queue.task_done()
            # Pass poison pills
            for _ in xrange(self.num_consumers):
                self.result_queue.put(None)
                logger.debug("%s - Adding poison pill for consumer" %
                             (self.proc_name))
            logger.debug("%s - Exiting process" % (self.proc_name))
            # We're skipping way to much stuff improve filter skipper counter to detect what regexes have to be tightened
            logger.debug("filter_skipped: %d" % filter_skipped)
            return
        # Grab job data
        (startingRowID, entriesPerJob) = next_task()
        with closing(conn.cursor()) as c:
            # Start timer
            t0 = time.time()
            logger.debug("%s - Starting query [%d / %d]. SearchSpace: %s" %
                         (self.proc_name, startingRowID, entriesPerJob,
                          self.search_space))
            # NOTE(review): SQL below is built by string interpolation
            # (search_space and the literal term are not parameterized) —
            # SQL injection risk if these ever carry untrusted input.
            if self.searchType == 'REGEX' or self.searchType == 'KNOWNBAD':
                # Regex matching happens in Python below; fetch the whole range
                results = c.execute(
                    "SELECT RowID, " + self.search_space +
                    " AS SearchSpace FROM Entries_FilePaths \
                    WHERE RowID >= %d AND RowID <= %d" %
                    (startingRowID, startingRowID + entriesPerJob))
            elif self.searchType == 'LITERAL' or self.searchType == 'COMBINED':
                if self.search_modifier_Literal in [">", "<"]:
                    # Comparison search: let SQLite do the filtering
                    results = c.execute(
                        "SELECT RowID, " + self.search_space +
                        " AS SearchSpace FROM Entries_FilePaths \
                        WHERE RowID >= %d AND RowID <= %d \
                        AND SearchSpace %s '%s'" %
                        (startingRowID, startingRowID + entriesPerJob,
                         self.search_modifier_Literal,
                         self.searchTermLiteral))
                else:
                    # Substring/pattern search via LIKE
                    results = c.execute(
                        "SELECT RowID, " + self.search_space +
                        " AS SearchSpace FROM Entries_FilePaths \
                        WHERE RowID >= %d AND RowID <= %d \
                        AND SearchSpace LIKE '%s'" %
                        (startingRowID, startingRowID + entriesPerJob,
                         self.searchTermLiteral))
            else:
                logger.error("Unknown searchType %s" % (self.searchType))
            t1 = time.time()
            logger.debug("%s - Execute time: %s seconds" %
                         (self.proc_name, "{0:.4f}".format(t1 - t0)))
            rows = c.fetchall()
            t2 = time.time()
            logger.debug("%s - Fetchall time: %s seconds (%s / %s)" %
                         (self.proc_name, "{0:.4f}".format(t2 - t1),
                          startingRowID, entriesPerJob))
            # Process row per row:
            for row in rows:
                if row[1] is not None:
                    if self.searchType == 'LITERAL':
                        # SQLite already filtered; every row is a hit
                        self.addHit(int(row[0]))
                    elif self.searchType == 'REGEX' or self.searchType == 'COMBINED':
                        if re_fn(self.searchTermRegex, str(row[1])):
                            self.addHit(int(row[0]))
                    elif self.searchType == 'KNOWNBAD':
                        # Search for known bads with no filters:
                        if self.searchTermRegex != "()":
                            if re_fn(self.searchTermRegex, str(row[1])):
                                self.addHit(int(row[0]))
                        # Search for known bads which have a filter associated:
                        for x in list(self.known_bad_with_filter):
                            assert (x.filter is not None)
                            if re.compile(x.regex, re.IGNORECASE).search(
                                    str(row[1])) is not None:
                                if re.compile(x.filter, re.IGNORECASE).search(
                                        str(row[1])) is None:
                                    self.addHit(int(row[0]))
                                    # One hit is enough for us
                                    break
                                # fixme:
                                else:
                                    filter_skipped += 1
                    else:
                        logger.error("Unknown searchType %s" %
                                     (self.searchType))
            t3 = time.time()
            logger.debug("%s - REGEX filtering time: %s seconds (%s / %s)" %
                         (self.proc_name, "{0:.4f}".format(t3 - t2),
                          startingRowID, entriesPerJob))
            if (t3 - t2) > 30:
                # NOTE(review): only logs; no actual throttling is performed
                logger.warning(
                    "Warning: Producer queues clogged, throttling down.")
            logger.debug("%s Task results: %d execution time: %s seconds" %
                         (self.proc_name, len(taskRows),
                          "{0:.4f}".format(t3 - t0)))
            # Update progress counter
            with self.val.get_lock():
                self.val.value += 1
            self.task_queue.task_done()
    # NOTE(review): unreachable — the loop above only exits via return
    logger.warning("%s - Abnormal exit" % (self.proc_name))
def appLoadMP(pathToLoad, dbfilenameFullPath, maxCores, governorOffFlag):
    # Ingest appcompat/amcache data from pathToLoad (folder, archive or
    # Redline audit tree) into the database using an MPEngine
    # producer/consumer pipeline; logs progress and load-speed statistics.
    global _tasksPerJob  # module-level chunk size; shrunk below for small jobs
    files_to_process = []
    conn = None
    # Start timer
    t0 = datetime.now()
    logger.debug("Starting appLoadMP")
    # Calculate aggregate file_filter for all ingest types supported:
    file_filter = '|'.join([v.getFileNameFilter() for k,v in ingest_plugins.iteritems()])
    # Add zip extension
    file_filter += "|.*\.zip"
    # Check if we're loading Redline data
    if os.path.isdir(pathToLoad) and os.path.basename(pathToLoad).lower() == 'RedlineAudits'.lower():
        files_to_process = searchRedLineAudits(pathToLoad)
    else:
        # Search for all files to be processed
        if os.path.isdir(pathToLoad):
            files_to_process = searchFolders(pathToLoad, file_filter)
        else:
            files_to_process = processArchives(pathToLoad, file_filter)
    if files_to_process:
        # Init DB if required
        DB = appDB.DBClass(dbfilenameFullPath, True, settings.__version__)
        conn = DB.appConnectDB()
        # Extract hostnames, grab existing host IDs from DB and calculate instance ID for new IDs to be ingested:
        instancesToProcess = []
        instancesToProcess += GetIDForHosts(files_to_process, DB)
        countInstancesToProcess = len(instancesToProcess)
        logger.info("Found %d new instances" % (countInstancesToProcess))

        # Setup producers/consumers initial counts
        num_consumers = 1
        num_producers = 1

        # Setup MPEngine
        mpe = MPEngineProdCons(maxCores, appLoadProd, appLoadCons, governorOffFlag)

        # Reduce _tasksPerJob for small jobs
        if countInstancesToProcess < _tasksPerJob:
            _tasksPerJob = 1

        # Create task list
        task_list = []
        instancesPerJob = _tasksPerJob
        num_tasks = 0
        for chunk in chunks(instancesToProcess, instancesPerJob):
            # todo: We no longer need pathToLoad as tasks include the fullpath now
            task_list.append(Task(pathToLoad, chunk))
            num_tasks += 1

        if num_tasks > 0:
            # Check if we have to drop indexes to speedup insertions
            # todo: Research ratio of existing hosts to new hosts were this makes sense
            if countInstancesToProcess > 1000 or DB.CountHosts() < 1000:
                DB.appDropIndexesDB()

            # Queue tasks for Producers
            mpe.addTaskList(task_list)

            # Start procs
            mpe.startProducers(num_producers)
            mpe.startConsumers(num_consumers, [dbfilenameFullPath])
            # mpe.addProducer()

            # Control loop: poll progress every second and estimate
            # time-to-host (ETH), ETA and total time (ETT)
            while mpe.working():
                time.sleep(1.0)
                (num_producers,num_consumers,num_tasks,progress_producers,progress_consumers) = mpe.getProgress()
                elapsed_time = datetime.now() - t0
                mean_loadtime_per_host = (elapsed_time) / max(1, _tasksPerJob * progress_consumers)
                pending_hosts = ((num_tasks * _tasksPerJob) - (_tasksPerJob * progress_consumers))
                etr = (mean_loadtime_per_host * pending_hosts)
                eta = t0 + elapsed_time + etr
                ett = (eta - t0)
                if settings.logger_getDebugMode():
                    # Estimates are shown as N/A until >100 hosts are loaded
                    status_extra_data = " Prod: %s Cons: %s (%d -> %d -> %d: %d) [RAM: %d%% / Obj: %d / ETH: %s / ETA: %s / ETT: %s]" % \
                        (num_producers, num_consumers, num_tasks,
                         progress_producers, progress_consumers,
                         progress_producers - progress_consumers,
                         psutil_phymem_usage(), len(gc.get_objects()),
                         mean_loadtime_per_host if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(eta.time()).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A",
                         str(ett).split(".")[0] if progress_consumers * _tasksPerJob > 100 else "N/A")
                else:
                    status_extra_data = ""
                # logger.info("Parsing files%s" % status_extra_data)
                logger.info(update_progress(min(1,float(progress_consumers) / float(num_tasks)), "Parsing files%s" % status_extra_data, True))
                mpe.rebalance()

            del mpe

        # Stop timer
        elapsed_time = datetime.now() - t0
        mean_loadtime_per_host = (elapsed_time) / max(1, countInstancesToProcess)
        logger.info("Load speed: %s seconds / file" % (mean_loadtime_per_host))
        logger.info("Load time: %s" % (str(elapsed_time).split(".")[0]))
    else:
        logger.info("Found no files to process!")