def comparing(bin_bytes, sample, n, rounds, files_expected, detection_threshold, scanner):
    '''
        Run ARMED and then AIMED on the same sample to contrast random vs.
        evolutionary search for adversarial examples. One row describing both
        runs is written to db/compare.csv.

        Input:
            bin_bytes, sample, n, files_expected, scanner: forwarded to both armed() and aimed()
            rounds, detection_threshold: forwarded to armed() only
    '''

    overall_start = time()

    # ARMED run -- only the second value returned by armed() is recorded
    armed_start = time()
    _, armed_corrupt = armed(bin_bytes, sample, n, rounds, files_expected,
                             detection_threshold, scanner)
    armed_elapsed = f.time_me(armed_start)

    # AIMED run with a fixed population size
    size_population = 4
    aimed_start = time()
    aimed_evasions, aimed_corrupt = aimed(bin_bytes, sample, size_population, n,
                                          files_expected, scanner)
    aimed_elapsed = f.time_me(aimed_start)

    # Column order of db/compare.csv; the values below are listed in the same order
    columns = [
        'Sample', 'Perturbations',
        'Module 1', 'Time M1', 'Files M1', 'Corr M1',
        'Module 2', 'Time M2', 'Files M2', 'Corr M2',
        'Total Time',
    ]
    values = (
        sample, n,
        # 'Files M1' records the requested number of files, not a measured count
        'ARMED', armed_elapsed, files_expected, armed_corrupt,
        'AIMED', aimed_elapsed, aimed_evasions, aimed_corrupt,
        strftime('%H:%M:%S', gmtime(time() - overall_start)),
    )
    row = dict(zip(columns, values))
    f.write_dict_CSV('db/compare.csv', row, columns)

    # Update short version CSV with time averages to use as input in LaTeX
    f.comparing_AXMED()
def main(option, scanner):
    '''
        Validate the command line and run the module selected by `option`.

        Input:
            option: 'ARMED', 'ARMED2', 'AIMED' or 'COMPARE'; any other value
                    runs no module (the sample is still read)
            scanner: passed through unchanged to the selected module
    '''

    # Argument parsing & displaying __doc__.
    # The parsed namespace is not consumed here: the values actually used come
    # from i.handling_input(sys.argv) below. parse_args() is still called so
    # that argparse enforces the required flags and serves -h/--help.
    # Each option has its own dest so parsed values no longer overwrite each other.
    parser = ArgumentParser(description=__doc__)
    parser.add_argument("-s", dest="sample", required=False,
                        help="malware sample as input", metavar="sample")
    parser.add_argument("-p", dest="perturbations", required=True,
                        help="number of perturbations to inject", metavar="perturbations")
    parser.add_argument("-r", dest="rounds", required=False,
                        help="number of rounds to run", metavar="rounds")
    parser.add_argument("-m", dest="files_expected", required=True,
                        help="number of manipulated files expected", metavar="mutations exp.")
    parser.add_argument("-t", dest="detection_threshold", required=False,
                        help="run until detections are below threshold", metavar="detection thresh.")
    parser.parse_args()

    # Processing input from terminal
    sample, n, rounds, files_expected, detection_threshold = i.handling_input(sys.argv)

    # Convert malware sample into binaries
    bin_bytes = f.readfile(sample)

    # ARMED: Fixed length of sequence -- Using remote/local sandbox (HT/Cuckoo) + remote (VT)/local detection
    if option == 'ARMED':
        start_ARMED = time()
        i.armed(bin_bytes, sample, n, rounds, files_expected, detection_threshold, scanner)
        f.time_me(start_ARMED)

    # ARMED II: Incremental Iterations of perturbations' sequence -- Using local sandbox + local detection
    elif option == 'ARMED2':
        start_ARMED2 = time()
        i.armed2(bin_bytes, sample, n, rounds, files_expected, scanner)
        f.time_me(start_ARMED2)

    # AIMED: Fixed length & optimized order of perturbations -- GP with local sandbox + detection
    elif option == 'AIMED':
        size_population = 4  # & n = length_sequence (number of perturbations)
        start_AIMED = time()
        i.aimed(bin_bytes, sample, size_population, n, files_expected, scanner)
        f.time_me(start_AIMED)

    # COMPARE: Examine intelligent evolutionary algorithm against random (AIMED vs ARMED)
    elif option == 'COMPARE':
        start_COMPARE = time()
        i.comparing(bin_bytes, sample, n, rounds, files_expected, detection_threshold, scanner)
        f.time_me(start_COMPARE)
def malware_detection(mutation, snapshot):
    '''
        Detecting malware samples using local scanners.
        Use malware classifiers from industry or academia:
        Gradient Boosting [Anderson et al. 2018]: Trained with 100k
        malicious and benign samples and achieves ROC-AUC = 0.993
        Threshold of 0.9 corresponds to 1% FPR at 90% TPR

        A functionality (beta)-test has been added that overcomes the
        processing time of Cuckoo by 1/3 reducing from 45 to 15 s.

        Input:
            mutation: file name of the mutation (appended to mod_path for the
                      classifier and to the guest path for the functionality test)
            snapshot: 'GradientBoosting' or 'Functionality'; any other value
                      prints the supported engine and exits the program

        Returns:
            GradientBoosting: True when the score is above 0.9, False otherwise
            Functionality: False when the guest command fails with exit
                           status 1 (corrupt), True when it fails with any
                           other status, False when it exits with status 0
    '''

    start = time()
    detect = False
    vm = "Windows7-Detection"
    path_m = 'e:\\mod\\'

    if snapshot not in ('GradientBoosting', 'Functionality'):
        print('Engines supported: GradientBoosting')
        sys.exit()

    # Pre-trained Gradient Boosting Model
    if snapshot == 'GradientBoosting':
        av_model = f.load_av('data/gradient_boosting.pkl')
        bin_bytes = f.readfile(mod_path + mutation)
        score = f.get_score_local(bin_bytes, av_model)
        if score > 0.9:  # As per paper
            print('\nMalware detected.\n')
            return True
        print('\nSample not detected.\n')
        return False

    # From here on snapshot == 'Functionality'.

    # Start & restore the VM (headless = invisible); a VM in any other state is left untouched
    state = check_output(['VBoxManage', 'showvminfo', vm]).decode('UTF-8')
    if "powered off" in state or "saved" in state:
        call(['VBoxManage', 'snapshot', vm, 'restore', 'Windows7-' + snapshot + '-Ready'])
        call(['VBoxManage', 'startvm', vm, '--type', 'headless'])
    elif "paused" in state:
        call(['VBoxManage', 'controlvm', vm, 'resume', '--type', 'headless'])

    # Beta-test to check functionality (Reduces time of Cuckoo by 1/3 but needs further testing)
    try:
        check_output([
            'timeout', '10', 'VBoxManage', 'guestcontrol', vm,
            '--username', 'user', '--password', 'sandbox',
            'run', '--exe', path_m + mutation
        ])
    except CalledProcessError as err:
        # Decide on the exit status itself rather than on the wording of the
        # exception message. Only CalledProcessError is handled: other errors
        # (e.g. a missing executable) propagate instead of being reported as
        # a working mutation.
        if err.returncode == 1:
            print('\nMutation corrupt!\n')
            return False
        print('\nMutation WORKING!\n')
        return True

    # The guest command exited with status 0: the beta test assigns no verdict
    # to this case, so the VM is powered off and the default is returned.
    # NOTE(review): confirm which verdict a clean exit within the timeout should get.

    # Pause the VM – Use pause only if power-off is on main()
    #call(['VBoxManage', 'controlvm', vm, 'pause', '--type', 'headless'])

    # Power off the VM
    call(['VBoxManage', 'controlvm', vm, 'poweroff'])

    # Show total time in hh:mm:ss
    f.time_me(start)

    return detect
def malware_detection_VT(sample_report, CSV):
    '''
        Detecting malware samples using VirusTotal (remote)

        Input:
            sample_report: VT report of the original sample used as benchmark
                           ('positives', 'total' and 'permalink' are read)
            CSV: dict describing the mutation; 'Mod_File_Hash' and
                 'Perturbations' are read, and on success 'Manipulated_File',
                 'MF_Detections', 'Full_Detections_Report' and 'Date_Reported'
                 are set before the row is written to db/database.csv

        Returns:
            The number of engines detecting the mutation once its report is
            available, or None if no report was obtained within the poll limit.
    '''

    limit = 20
    start = time()

    # Comparing detections of both samples
    print('\n# Malware Detection Stage #')
    print('\nOriginal sample:')
    print('Detected by {} out of {} engines \n'.format(
        sample_report['positives'], sample_report['total']))
    print(sample_report['permalink'])
    print('\nStatus:')

    # Use loops and sleep to keep requests lows and avoid API banned by VT (Limit: 100)
    for _ in range(limit):
        try:
            # Getting report of sample submitted via VT API - Rescan: False
            report = f.get_report_VT(CSV['Mod_File_Hash'], False)
        except (requests.ConnectionError, requests.Timeout, requests.ConnectTimeout) as e:
            print('Connection issues or API requests threshold reached: {}'.
                  format(e))
            # A failed request counts as an attempt and waits like any other
            # poll, so a dead connection cannot spin here without bound.
            sleep(60)
            continue

        # Check the status of sample & report
        if report['response_code'] == -2:
            print('The sample is queued for analysis. Next update in 60 s')
            sleep(60)
            continue

        if report['response_code'] != 1:  # 'response_code' == 0:
            print("Sample is not present in VirusTotal's dataset")
            sleep(60)
            continue

        print('\nResults: New sample found')
        print('\nDetected by {} out of {} engines \n'.format(
            report['positives'], report['total']))

        # Print only engines detecting new sample
        av_detect = {
            key: val
            for key, val in report['scans'].items() if val['detected'] == 1
        }
        print(list(av_detect.keys()))

        # Provide link to sample detections report
        print('\n{}'.format(report['permalink']))

        # Calculate evasion rate based on original sample detections and print summary
        print('\n## Summary ##')
        if sample_report['positives'] and sample_report['total'] and report['total']:
            print('\nEvasion rate: {:.2f}% of previous engines'.format(
                (1 - (report['positives'] / report['total']) /
                 (sample_report['positives'] / sample_report['total'])) * 100))
        else:
            # The ratio is undefined when a count is zero; skip it instead of
            # aborting before the sample is saved.
            print('\nEvasion rate: n/a (no detections to compare against)')

        # Show detection time in hh:mm:ss
        f.time_me(start)

        # Copy successful sample into evasion path.
        # The timestamp fields are concatenated without zero padding.
        now = datetime.now()
        name_file = str(now.year) + str(now.month) + str(now.day) + \
            str(now.hour) + str(now.minute) + str(now.second)
        evasive_file = evasion_path + CSV['Perturbations'] + 'm_' + name_file + '.exe'
        copyfile(mod_path + CSV['Perturbations'] + '_m.exe', evasive_file)

        # Update database with sample's info
        CSV['Manipulated_File'] = evasive_file
        CSV['MF_Detections'] = str(report['positives']) + '/' + str(report['total'])
        CSV['Full_Detections_Report'] = str(report['permalink'])
        CSV['Date_Reported'] = str(report['scan_date'])
        f.write_dict_CSV('db/database.csv', CSV, fields)

        return report['positives']
def malware_analysis_HA(mod_sample, json_send_HA, CSV):
    '''
        Analyze malware using remote service Hybrid Analysis

        Input:
            mod_sample: modified sample, sent to VirusTotal if it appears to work
            json_send_HA: response of the submission to Hybrid Analysis
                          ('sha256' and 'job_id' are read), or 429
            CSV: dict describing the mutation; 'Perturbations' is read, and
                 'Manipulated_File' and 'Full_Analysis_Report' are set when the
                 sample is stored as failed

        Returns:
            (functionality, url_sample): functionality is True when the report
            lists domains or compromised hosts; url_sample is the report URL.
            The program exits if the poll limit is reached.
    '''

    start = time()
    functionality = False

    # Wait a few minutes if server did not accept further submissions
    # NOTE(review): nothing in this loop changes json_send_HA, so a 429 value
    # never leaves it -- the submission probably has to be repeated here.
    while json_send_HA == 429:
        print('Submission quota limit has been exceeded. Retry in 5 minutes.')
        sleep(301)

    # Retrieve report from Hybrid Analysis sandbox: report URL + Hash + Job ID
    url_sample = 'https://www.reverse.it/sample/' + json_send_HA[
        'sha256'] + '/' + json_send_HA['job_id']
    print('\nFull report: {}\n\nStatus:'.format(url_sample))

    # Use loops and sleep to keep requests low and avoid API banned by HA (Limit: 5/m)
    limit = 30
    polls = 0
    while True:
        try:
            # Query the URL once per poll; server could return 403
            status_code = f.url_ok(url_sample)
            if status_code == 200 or status_code == 403:
                report_HA = f.get_summary_HA(json_send_HA['sha256'])
                state = report_HA['state']
                if state == 'ERROR':
                    print('The sandbox environment returned {}.'.format(
                        report_HA['error_type']))
                    break
                elif state == 'IN_QUEUE':
                    print('Waiting in queue to be analyzed. Next update in 60 s')
                elif state == 'IN_PROGRESS':
                    print('Analysis in progress..')
                elif state == 'SUCCESS':
                    print('Analysis finished.')
                    break
                sleep(60)
            else:
                print('Website not reachable. Next update in 30 s')
                sleep(30)
        except (requests.ConnectionError, requests.Timeout,
                requests.ConnectTimeout) as e:
            print('Connection issues or API requests reached:\n{}'.format(e))
            # Pause before retrying; the failed attempt is counted below so a
            # dead connection cannot loop without bound.
            sleep(30)

        polls += 1
        if polls >= limit:
            print(
                'ARMED exited because the limit of {} minutes has been reached.\n'
                .format(limit))
            quit()

    # Check the likelihood that malware runs based on report
    if report_HA['domains'] or report_HA['compromised_hosts']:
        functionality = True
        print('\nResults: WORKING')
        print('Malware connects to domains or contacts hosts.')

        # Show analysis time in hh:mm:ss
        f.time_me(start)

        # Send to VT to check detections
        print('Sent to VirusTotal!')
        f.send_VT(mod_sample)

    else:
        if report_HA['state'] != 'ERROR':
            print('\nResults: Most likely not working')
            print('Check if manipulated sample runs before scanning.')
            print('Malware does not connect to domains or contacts hosts.')

        # NOTE(review): the failed-sample bookkeeping below also runs for the
        # ERROR state -- confirm that this is intended.

        # Copy sample into failed path & tag with F
        now = datetime.now()
        name_file = str(now.year) + str(now.month) + str(now.day) + str(
            now.hour) + str(now.minute)
        failed_file = fail_path + CSV['Perturbations'] + 'F_' + name_file + '.exe'
        copyfile(mod_path + CSV['Perturbations'] + '_m.exe', failed_file)

        # Update database with basic sample's info
        CSV['Manipulated_File'] = failed_file
        CSV['Full_Analysis_Report'] = url_sample
        f.write_dict_CSV('db/fail_database.csv', CSV, fields)

        # Show analysis time in hh:mm:ss
        f.time_me(start)

    return functionality, url_sample
def malware_analysis(mod_sample, json_send, useVT, CSV):
    '''
        Analyze malware with sandbox Cuckoo

        Input:
            mod_sample: Compiled version of modified malware mutation
            json_send: JSON status after sending mutation to local sandbox for analysis
            useVT: Boolean value indicating whether VirusTotal is used or detection will be performed locally
            CSV: Data structure with information to save on DB

        Returns:
            (functionality, url_sample): functionality is True only when the
            Cuckoo log lacks the package error and the analysis lasted at
            least 15 (units as reported by the sandbox); url_sample is the
            local report URL.
    '''

    started_at = time()
    functionality = False
    task_id = json_send['task_id']

    # Show report from analysis sandbox: report URL + Job ID
    url_sample = 'http://localhost:8000/analysis/' + str(task_id) + '/summary'
    print('\nFull analysis report: {}\n\nStatus:'.format(url_sample))

    # Each intermediate status is announced once, the first time it is seen
    progress_messages = {
        'completed': 'Analysis finished. Generating report..',
        'pending': 'Waiting in queue to be analyzed..',
        'running': 'Analysis in progress..',
    }
    announced = set()

    # Using sleep in loop to space requests to sandbox may improve results
    while True:
        try:
            view = f.get_summary_local_sandbox(task_id, 'view')
            status = view['task']['status']
            if status == 'reported':
                print('Report finished.')
                break
            if status in progress_messages and status not in announced:
                print(progress_messages[status])
                announced.add(status)
            sleep(0.2)
        except (requests.ConnectionError, requests.Timeout,
                requests.ConnectTimeout) as e:
            print('Connection issues or API not available:\n{}'.format(e))

    # Check the likelihood that malware runs based on report
    err = 'CuckooPackageError: Unable to execute the initial process, analysis aborted.\n'
    summary = f.get_summary_local_sandbox(task_id, 'report')
    cuckoo_log = summary['debug']['cuckoo']
    duration = summary['info']['duration']
    is_corrupt = err in cuckoo_log

    if is_corrupt:
        print('\nResults: Mutation is corrupt')

        # Copy sample into failed path & tag with letter F
        now = datetime.now()
        name_file = ''.join(
            str(part)
            for part in (now.year, now.month, now.day, now.hour, now.minute))
        failed_file = fail_path + CSV['Perturbations'] + 'F_' + name_file + '.exe'
        copyfile(mod_path + CSV['Perturbations'] + '_m.exe', failed_file)

        # Update database with basic sample's info
        CSV['Manipulated_File'] = failed_file
        CSV['Full_Analysis_Report'] = url_sample
        CSV['Date_Reported'] = str(datetime.now())
        f.write_dict_CSV('db/corrupted.csv', CSV, fields)

        # Show analysis time in hh:mm:ss
        f.time_me(started_at)

    elif duration >= 15:
        functionality = True
        print('\nResults: WORKING')

        # Show analysis time in hh:mm:ss
        f.time_me(started_at)

        # Send to VT for detections (activate if local detection is not used)
        if useVT:
            print('Sending to VirusTotal!')
            f.send_VT(mod_sample)

    elif duration < 15:
        print(
            '\nResults: It could not be determined (score = {} – duration = {})'
            .format(summary['info']['score'], duration))

        # Show analysis time in hh:mm:ss
        f.time_me(started_at)

    return functionality, url_sample