def asm_extraction(dataset_type):
    """Extract every ASM feature family for one dataset split and save it.

    Every ``*.asm.gz`` file in ``<dataset_type>_asm/`` (taken in sorted
    order) is run through the meta-data, symbol, register, opcode, section,
    data-define and API extractors.  The results are written to:

    * one CSV per feature family under ``SAVED_PATH_CSV + dataset_type``
      (that directory must already exist; it is not created here);
    * one gzip-compressed CSV, ``ASM_CSV_PATH + dataset_type +
      '_asm_csv.gz'``, holding the file name followed by all feature
      families side by side;
    * the per-file extraction times of each family, handed to
      ``io.save_txt`` together with ``ASM_TIME_PATH``.

    A file whose extraction raises is reported on stdout/stderr and left out
    of the combined CSV.  Feature rows already written to the per-feature
    CSVs for that file before the failure are kept, as before.

    Args:
        dataset_type: Name of the split; used as the prefix of the input
            directory and of every output path.
    """
    read_mode, write_mode = ('r', 'w') if six.PY2 else ('rt', 'wt')

    defined_apis = io.read_all_lines(APIS_PATH)[0].split(',')

    directory_name = dataset_type + '_asm/'
    files = numpy.sort(os.listdir(directory_name))
    asm_files = [name for name in files if name.endswith('.asm.gz')]
    asm_csv = ASM_CSV_PATH + dataset_type + '_asm_csv.gz'

    # One entry per feature family, in the column order of the combined CSV:
    #   (per-feature CSV stem, timing label prefix, header builder, extractor)
    # Every extractor is called as extractor(path, text_handle, raw_handle).
    features = [
        ('asm_meta_data', 'asm_meta_data_', header_asm_meta_data,
         lambda path, text, raw: asm_meta_data(path, raw)),
        ('asm_symbols', 'asm_sym_', header_asm_sym,
         lambda path, text, raw: asm_symbols(text)),
        ('asm_registers', 'asm_registers_', header_asm_registers,
         lambda path, text, raw: asm_registers(text)),
        ('asm_opcodes', 'asm_opcodes_', header_asm_opcodes,
         lambda path, text, raw: asm_opcodes(text)),
        # asm_sections also returns the section names; only the values are kept.
        ('asm_sections', 'asm_sections_', header_asm_sections,
         lambda path, text, raw: asm_sections(text)[0]),
        ('asm_data_define', 'asm_datadefine_', header_asm_data_define,
         lambda path, text, raw: asm_data_define(text)),
        ('asm_apis', 'asm_apis_', header_asm_apis,
         lambda path, text, raw: asm_APIs(text, defined_apis)),
    ]
    timings = [[] for _ in features]

    feature_files = []
    try:
        for stem, _label, _header, _extract in features:
            feature_files.append(
                open(SAVED_PATH_CSV + dataset_type + '/' + stem + '.csv',
                     write_mode))
        feature_writers = [writer(handle) for handle in feature_files]

        with gzip.open(asm_csv, write_mode) as combined:
            fw = writer(combined)

            # Header construction: each builder is called once and reused for
            # both the per-feature CSV and the combined CSV.
            colnames = ['filename']
            for feature, feature_writer in zip(features, feature_writers):
                names = feature[2]()
                feature_writer.writerow(names)
                colnames += names
            fw.writerow(colnames)

            rows = []
            for t, fname in enumerate(asm_files):
                path = directory_name + fname
                text = gzip.open(path, read_mode,
                                 encoding='utf-8', errors='ignore')
                raw = None
                try:
                    # The meta-data extractor reads the undecoded stream.
                    raw = gzip.open(path, 'r')
                    whole = []
                    for index, feature in enumerate(features):
                        text.seek(0)
                        start_time = time.time()
                        values = feature[3](path, text, raw)
                        timings[index].append(time.time() - start_time)
                        feature_writers[index].writerow(values)
                        whole.extend(values)
                except Exception as err:
                    print(err)
                    traceback.print_exc()
                    print("Error", fname)
                else:
                    # Only fully extracted files get a combined row, so a
                    # failure can never reuse the previous file's values.
                    rows.append([fname[:fname.find('.asm.gz')]] + whole)
                finally:
                    text.close()
                    if raw is not None:
                        raw.close()

                # Writing rows after every 100 files processed
                if (t + 1) % 100 == 0:
                    print(t + 1, 'asm files loaded from ', dataset_type)
                    fw.writerows(rows)
                    rows = []

            # Flush the rows left over from the last, partial batch.
            if rows:
                fw.writerows(rows)
    finally:
        for handle in feature_files:
            handle.close()

    # Save the time: alternating label / comma-separated timings per family.
    time_report = []
    for feature, samples in zip(features, timings):
        time_report.append(feature[1] + dataset_type)
        time_report.append(', '.join(str(x) for x in samples))
    io.save_txt(tuple(time_report), ASM_TIME_PATH)
from handle_io import io


def asm_APIs(asm_code, apis):
    """Count how often each API name occurs in the given assembly rows.

    Args:
        asm_code: Iterable of rows; each row must support ``count`` with an
            element of ``apis`` (e.g. text lines).
        apis: Sequence of API names to look for.

    Returns:
        A list as long as ``apis`` whose i-th entry is the total number of
        non-overlapping occurrences of ``apis[i]`` over all rows.
    """
    apis_values = [0] * len(apis)
    for row in asm_code:
        for i, api in enumerate(apis):
            # count() already yields 0 for an absent name, so a separate
            # membership test would only scan the row a second time.
            apis_values[i] += row.count(api)
    return apis_values


file_path = "samples/01IsoiSMh5gxyDYTl4CB.asm"
# file_path = "samples/01SuzwMJEIXsK7A8dQbl.asm"
with open(file_path) as f:
    byte_code = f.readlines()

defined_apis = io.read_all_lines('APIs.txt')
defined_apis = defined_apis[0].split(',')
# Parenthesised call form: prints the same on Python 2 and is valid Python 3.
print(asm_APIs(byte_code, defined_apis))
from multiprocessing import Pool
from csv import writer
from feature_extraction import *
from header_construction import *
from settings import *
from handle_io import io
import os, gzip, time, numpy, traceback

# Switch the working directory to DATASET_PATH; the relative directory name
# built in asm_extraction() is resolved against it.
path = DATASET_PATH
os.chdir(path)

# API names: the comma-separated fields of the first line read from APIS_PATH.
defined_apis = io.read_all_lines(APIS_PATH)
defined_apis = defined_apis[0].split(',')


def asm_extraction(dataset_type):
    """Open the per-feature CSV outputs for one dataset split.

    Lists ``<dataset_type>_gz/``, sorts the entries, keeps those ending in
    ``.asm.gz`` and opens six CSV files for writing under
    ``SAVED_PATH_CSV + dataset_type``.

    NOTE(review): the visible body stops after the files are opened; nothing
    is written, returned or closed here.  This looks like a cut-off fragment
    of a longer function -- confirm against the full file.

    Args:
        dataset_type: Name of the split; prefix of the input directory and
            of the output directory.
    """
    directory_name = dataset_type + '_gz/'
    files = os.listdir(directory_name)
    files = numpy.sort(files)
    # Despite the name, this holds the '.asm.gz' entries.
    byte_files = [i for i in files if i.endswith('.asm.gz')]
    #byte_csv = dataset_type + '_asm_csv.gz'
    #if not os.path.isdir(SAVED_PATH_CSV + dataset_type):
    #    os.makedirs(SAVED_PATH_CSV + dataset_type)
    # With the directory creation above disabled, the output directory has to
    # exist already for these open() calls to succeed.
    symbols_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_symbols.csv', 'w')
    meta_data_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_meta_data.csv', 'w')
    registers_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_registers.csv', 'w')
    opcodes_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_opcodes.csv', 'w')
    sections_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_sections.csv', 'w')
    data_define_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_data_define.csv', 'w')
def header_asm_apis():
    """Return the column names of the API features.

    The names are the comma-separated fields of the first line that
    ``io.read_all_lines`` returns for ``APIS_PATH``.
    """
    first_line = io.read_all_lines(APIS_PATH)[0]
    return first_line.split(',')
from multiprocessing import Pool from csv import writer from feature_extraction import * from header_construction import * from settings import * from handle_io import io import os, gzip, time, numpy, traceback path = DATASET_PATH os.chdir(path) defined_apis = io.read_all_lines(APIS_PATH) defined_apis = defined_apis[0].split(',') def asm_extraction(dataset_type): directory_name = dataset_type + '_gz/' files = os.listdir(directory_name) files = numpy.sort(files) byte_files = [i for i in files if i.endswith('.asm.gz')] #byte_csv = dataset_type + '_asm_csv.gz' #if not os.path.isdir(SAVED_PATH_CSV + dataset_type): # os.makedirs(SAVED_PATH_CSV + dataset_type) symbols_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_symbols.csv', 'w') meta_data_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_meta_data.csv', 'w') registers_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_registers.csv', 'w') opcodes_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_opcodes.csv', 'w') sections_csv = open(SAVED_PATH_CSV + dataset_type + '/asm_sections.csv',