def path_count(self, file_paths):
    """Return the number of object paths found in each PDF file.

    Every entry of ``file_paths`` is loaded with ``PdfGenome.load_genome``
    and its paths are enumerated with ``PdfGenome.get_object_paths``.
    The returned list holds one count per input file, in input order.
    """
    return [
        len(PdfGenome.get_object_paths(PdfGenome.load_genome(pdf_file)))
        for pdf_file in file_paths
    ]
def load_external_genome(self, file_paths):
    """Load external PDFs and collect every (document, path) pair.

    Each file in ``file_paths`` is parsed with ``PdfGenome.load_genome``
    and its object paths are listed with ``PdfGenome.get_object_paths``.

    Side effect: ``self.genome_desc`` is reset and then filled with one
    ``(file_path, number_of_paths)`` tuple per input file.

    Returns:
        A list of ``(pdf_obj, path)`` tuples, one per object path of
        every loaded file, in input order.
    """
    ext_pdf_paths = []  # element: (entry, path)
    self.genome_desc = []
    for file_path in file_paths:
        pdf_obj = PdfGenome.load_genome(file_path)
        paths = PdfGenome.get_object_paths(pdf_obj)
        ext_pdf_paths.extend((pdf_obj, path) for path in paths)
        # Describe the file by how many paths it contributed. The previous
        # code used len(path), i.e. the length of the last path visited,
        # and raised NameError when the first file had no paths at all.
        self.genome_desc.append((file_path, len(paths)))
    return ext_pdf_paths
def main(args):
    """Inject the exploit paths of a malicious PDF into a benign PDF.

    Reads from ``args``:
        mal: path of the malicious PDF; the file name up to the first
            '.' is used as the key into the exploit spec.
        ben: path of the benign PDF that receives the injected objects.
        exploit_spec: path of a pickle file mapping that key to a list
            of exploit paths.
        var_dir: directory the resulting variant is written to.

    The variant is saved as ``<var_dir>/<mal key>_<benign file name>``.
    """
    mal_sha1 = os.path.basename(args.mal).split('.')[0]

    # load malicious pdf file.
    mal_obj = PdfGenome.load_genome(args.mal, noxref=True)
    # load benign pdf file.
    ben_obj = PdfGenome.load_genome(args.ben, noxref=True)
    newpdf = deepcopy(ben_obj)

    # get exploit path from the malicious pdf file.
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only use exploit specs from a trusted source.
    with open(args.exploit_spec, 'rb') as spec_file:
        exploit_spec = pickle.load(spec_file)
    epaths = exploit_spec[mal_sha1]

    all_ben_paths = PdfGenome.get_object_paths(ben_obj, set())

    # inject each path from exploit paths
    for path in epaths:
        src_path = None
        # j is the number of trailing elements stripped from `path`.
        # Initialise it so that a path with fewer than two elements (for
        # which the loop below never runs) does not hit an unbound or
        # stale j; such a path is then inserted whole under /Root.
        j = 0
        # Find the longest proper prefix of `path` that already exists in
        # the benign file: that is where the new object gets attached.
        for j in range(1, len(path)):
            if path[:-j] in all_ben_paths:
                src_path = path[:-j]
                break
        if src_path is None:
            src_path = ['/Root']

        # The object to copy is the prefix one element longer than the
        # attachment point.
        if j > 1:
            tgt_path = path[:-j + 1]
        else:
            tgt_path = path
        PdfGenome.insert_under(newpdf, src_path, mal_obj, tgt_path)

    outname = '%s/%s_%s' % (args.var_dir, mal_sha1, os.path.basename(args.ben))
    PdfGenome.save_to_file(newpdf, outname)
def main(args):
    """Turn adversarial feature vectors back into PDF files.

    For models 8..14 of ``model_names`` and for every seed vector whose
    sha1 has a file in ``../data/500_seed_pdfs/``, the adversarial vector
    is compared with the seed vector via ``get_ins_del`` and the
    resulting insertions/deletions are applied to the seed PDF by
    ``generate_pdf``.

    ``args`` is accepted for interface compatibility and is not read.
    """
    global genome_dict
    global idx_to_path
    build_genome_dict()

    # load the npy file
    adv_samples = np.load('../data/un_adv_samples.npy')

    # load the seed feature vectors
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only use a trusted feature dictionary.
    with open('robustness_spec/seed_test_malicious/feat_dict_3416.pickle', 'rb') as feat_file:
        seed_dict = pickle.load(feat_file)
    seed_features = genfromtxt('robustness_spec/seed_test_malicious/seed_feature_3416.csv', delimiter=',')

    # Materialise the keys so they can be indexed on Python 3 as well.
    # NOTE(review): this assumes the key order of seed_dict matches the
    # row order of seed_features and adv_samples -- confirm.
    all_sha1 = list(seed_dict.keys())
    # A set gives O(1) membership tests in the loop below.
    sha1_500 = set(item.split('.')[0] for item in os.listdir('../data/500_seed_pdfs/'))
    v_i_to_sha1 = {}
    for i, sha1 in enumerate(all_sha1):
        if sha1 in sha1_500:
            v_i_to_sha1[i] = sha1

    # each of the 15 models
    # "baseline", "TA", "TB", "TC", "TD", "ATAB", "EAB", "ED", "RA", "RB", "RC", "RD", "RAB", "RABE", "mono"
    model_names = ['baseline', 'adv_a', 'adv_b', 'adv_c', 'adv_d', 'adv_ab',
                   'ensemble_ab', 'ensemble_d', 'robust_a', 'robust_b',
                   'robust_c', 'robust_d', 'robust_ab', 'robust_abe',
                   'robust_e']

    # Only the models at indices 8..14 are processed.
    for m_i in range(8, 15):
        # each of the 3416 evasive vectors against the model
        res = adv_samples[m_i]
        for v_i in range(3416):
            # skip seeds that have no PDF among the 500 seed files
            if v_i not in v_i_to_sha1:
                continue
            vector = res[v_i]
            seed_vec = seed_features[v_i]
            # all the insertion indices / all the deletion indices
            ins_indices, del_indices = get_ins_del(seed_vec, vector)

            # get the original PDF object, then mutate.
            sha1 = v_i_to_sha1[v_i]
            src_entry = PdfGenome.load_genome('../data/500_seed_pdfs/%s.pdf' % sha1, noxref=True)
            generate_pdf(src_entry, sha1, ins_indices, del_indices, model_names[m_i])
    return
def get_cf(file_name): """ Get conserved features for a given PDF file. """ # We evaluate each variant with n_test times. n_test = 5 seed_file_path = 'samples/seeds/' + file_name pdf_folder = 'samples/tmp_pdfs/' + file_name + '/' os.system('mkdir -p %s' % (pdf_folder)) seed_root = PdfGenome.load_genome(seed_file_path) root = deepcopy(seed_root) visited_paths = set() remaining_paths = list() remaining_paths = PdfGenome.get_object_paths(root, visited_paths) obj_paths = PdfGenome.get_object_paths(root, visited_paths) path_len = len(PdfGenome.get_object_paths(root, visited_paths)) print('Initial paths:', remaining_paths) print path_len # Auxilliary list with ASCII order aux = [] for i in range(0, path_len): aux.append(str(i)) aux.sort() # Sequentially delete structural paths i = 0 for j in range(0, path_len): root = deepcopy(seed_root) op_obj_path = remaining_paths.pop(0) PdfGenome.delete(root, op_obj_path) #print "####################################################" #print i, ".pdf: delete", op_obj_path #save_path = '/home/liangtong/Desktop/tmp_pdfs/%d.pdf' % (i) save_path = pdf_folder + str(i) + '.pdf' y = PdfWriter() y.write(save_path, root) i += 1 # Evaluate the maliciousness of the variants fpaths = list_file_paths(pdf_folder) n_mal = [0] * len(fpaths) for i in range(0, n_test): results = cuckoo(fpaths) for j in range(0, len(results)): if results[j] != '[]': n_mal[j] += 1 # If the PDF becomes benign after being deleted with a structural pth, # then this one should be one of its conserved features. paths = [] for i in range(0, len(n_mal)): if n_mal[i] == 0: print i path = get_path(obj_paths[int(aux[i])]) if path in feat_list: paths.append(get_feat_seq(path, feature_list)) paths = set(paths) paths = list(paths) paths.sort() print file_name, paths
def get_cr():
    """Debug driver: delete one object path from a malicious PDF.

    Loads a hard-coded benign PDF and a hard-coded malicious seed, prints
    the malicious file's object paths, deletes path number 19 from a copy
    of it, prints the paths of the result and writes that result to
    ``test.pdf`` in the hard-coded temporary folder.
    """
    n_test = 1

    # STEP 1. Load the external benign pdf file
    ext_file_name = 'ir01-108.pdf'
    ext_root = PdfGenome.load_genome('/home/liangtong/pdf_files/benign/' + ext_file_name)
    ext_obj = PdfGenome.get_object_paths(ext_root, set())

    # STEP 2. Load the malicious pdf file
    mal_file_name = '001d92fc29146e01e0ffa619e5dbf23067f1e814'
    mal_pdf_folder = '/home/liangtong/Desktop/tmp_pdfs/'
    mal_root = PdfGenome.load_genome('/home/liangtong/EvadeML-master/samples/seeds/' + mal_file_name)
    # Paths are enumerated on a copy, not on mal_root itself.
    mal_obj = PdfGenome.get_object_paths(deepcopy(mal_root), set())

    print('Paths in the malicious PDF')
    for idx, obj_path in enumerate(mal_obj):
        print('%s %s' % (idx, obj_path))

    # STEP 3. Prepare the synthetic PDF
    syn_root = deepcopy(mal_root)
    print('Target and source paths')
    victim_path = mal_obj[19]
    print(victim_path)
    PdfGenome.delete(syn_root, victim_path)

    syn_obj = PdfGenome.get_object_paths(syn_root, set())
    print('Paths in the synthetic file')
    for idx, obj_path in enumerate(syn_obj):
        print('%s %s' % (idx, obj_path))

    # STEP 4. Store the synthetic PDF
    writer = PdfWriter()
    writer.write(mal_pdf_folder + 'test.pdf', syn_root)
    # STEP 6.
Test malicious behaviors with sandbox ''' fpaths = list_file_paths(mal_pdf_folder) n_mal = [0]*len(fpaths) for i in range(0, n_test): results = cuckoo(fpaths) for j in range(0, len(results)): if results[j] != '[]': n_mal[j] += 1 ''' '''
def generate_pdf(src_entry, sha1, ins_indices, del_indices, model_name):
    """Apply feature insertions and deletions to a PDF and save the result.

    Args:
        src_entry: loaded PDF object to mutate; it is deep-copied first.
        sha1: name (without extension) used for the output file.
        ins_indices: keys looked up in the global ``genome_dict``; keys
            that are missing there are skipped.
        del_indices: keys looked up in the global ``idx_to_path`` to get
            the compact path to delete.
        model_name: sub-directory name under ``unrestricted/``.

    Returns:
        ``(newpdf, pdf_path)``: the mutated copy and the path it was
        written to, ``unrestricted/<model_name>/<sha1>.pdf``.
    """
    global genome_dict
    global idx_to_path
    # deep copy
    newpdf = deepcopy(src_entry)

    ### INSERTION
    for index in ins_indices:
        # find the newobj
        try:
            train_f, fullpaths = genome_dict[index]
        except KeyError:
            continue
        fname = '../data/traintest_all_500test/train_benign/%s' % train_f
        # Try parsing with noxref=True first; on a parse error retry with
        # noxref=False.
        try:
            tgt_entry = PdfGenome.load_genome(fname, noxref = True)
        except pdfrw.errors.PdfParseError:
            tgt_entry = PdfGenome.load_genome(fname, noxref = False)
        # do deterministic: always use the first full path, prefixed with
        # '/Root' and split into '/'-prefixed components.
        tgt_path = ['/'+item for item in ('/Root' + fullpaths[0]).split('/')[1:]]

        # Walk newpdf along tgt_path (excluding its last component) and
        # count in i how many leading components could be indexed.
        src_parent = newpdf
        i = 0
        for key in tgt_path[:-1]:
            try:
                src_parent = src_parent[key]
                i += 1
            except (KeyError, TypeError):
                break
        # last parent should work
        src_key = tgt_path[:i]
        if src_key != ['/Root']:
            tgt_key = tgt_path[:i]
            # NOTE(review): any failure of insert is silently ignored.
            try:
                PdfGenome.insert(newpdf, src_key, tgt_entry, tgt_key)
            except Exception:
                pass
        else:
            # Only '/Root' matched: take the path one component deeper
            # from the external file.
            tgt_key = tgt_path[:i+1]
            # do a insert_under
            PdfGenome.insert_under(newpdf, src_key, tgt_entry, tgt_key)

    ### DELETION
    # for each compact path, I need a set of original paths from the PDF.
    # then I need to delete all of them.
    # get the compact path to path mapping
    compact_to_full = defaultdict(list)
    # Paths are taken from the unmodified src_entry, not from newpdf.
    paths = PdfGenome.get_object_paths(src_entry)
    for ext_id in range(len(paths)):
        fullpath = paths[ext_id]
        # Join every component after the first, dropping int components.
        fullkey = ''.join([item for item in fullpath[1:] if type(item) != int])
        # IMPORTANT: make this path compact
        key = compact(fullkey[1:])
        compact_to_full[key].append(fullpath)
    # TODO: remove debug
    print compact_to_full

    for index in del_indices:
        compactpath = idx_to_path[index]
        for path in compact_to_full[compactpath]:
            # TODO: remove debug
            print 'delete:', path
            # delete the full path
            try:
                PdfGenome.delete(newpdf, path)
            except Exception:
                # the parent may already be deleted
                continue

    file_dir = 'unrestricted/%s' % model_name
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    pdf_path = '%s/%s.pdf' % (file_dir, sha1)
    PdfGenome.save_to_file(newpdf, pdf_path)
    return newpdf, pdf_path
# if classifier_name == 'pdfrate': # from lib.fitness import fitness_pdfrate as fitness_func # elif classifier_name == 'hidost': # from lib.fitness import fitness_hidost as fitness_func # elif classifier_name == "hidost_pdfrate": # from lib.fitness import fitness_hidost_pdfrate as fitness_func # elif classifier_name == "hidost_pdfrate_mean": # from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func # elif classifier_name == "hidost_pdfrate_sigmoid": # from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func gp_params = { 'pop_size': pop_size, 'max_gen': max_gen, 'mut_rate': mut_rate } ext_genome = PdfGenome.load_external_genome(ext_genome_folder) fitness_func = get_fitness_func(field, threshold) gp = GPPdf(job_dir=job_dir, seed_file_path=start_file_path, logger=logger, ext_genome=ext_genome, gp_params=gp_params, fitness_func=fitness_func, sandbox_func=get_fitness_func('gmu_bm', threshold=65), hc_step=hc_step) gp.run()
if classifier_name == 'pdfrate': from lib.fitness import fitness_pdfrate as fitness_func elif classifier_name == 'hidost': from lib.fitness import fitness_hidost as fitness_func elif classifier_name == "hidost_pdfrate": from lib.fitness import fitness_hidost_pdfrate as fitness_func elif classifier_name == "hidost_pdfrate_mean": from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func elif classifier_name == "hidost_pdfrate_sigmoid": from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func gp_params = {'pop_size': pop_size, 'max_gen': max_gen, \ 'mut_rate': mut_rate, 'xover_rate': xover_rate, \ 'fitness_threshold': stop_fitness} ext_genome = PdfGenome.load_external_genome(ext_genome_folder) try: gp = GPPdf( job_dir = job_dir, seed_sha1 = start_hash, seed_file_path = start_file_path, logger = logger, random_state_file_path = random_state_file_path, ext_genome = ext_genome, success_traces_path = success_traces_path, promising_traces_path = promising_traces_path, gp_params = gp_params, fitness_function = fitness_func, ) gp.run() except Exception, e:
from lib.fitness import fitness_hidost as fitness_func elif classifier_name == 'mlp': from lib.fitness import fitness_mlp as fitness_func elif classifier_name == 'robustmlp': from lib.fitness import fitness_robustmlp as fitness_func elif classifier_name == "hidost_pdfrate": from lib.fitness import fitness_hidost_pdfrate as fitness_func elif classifier_name == "hidost_pdfrate_mean": from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func elif classifier_name == "hidost_pdfrate_sigmoid": from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func gp_params = {'pop_size': pop_size, 'max_gen': max_gen, \ 'mut_rate': mut_rate, 'xover_rate': xover_rate, \ 'fitness_threshold': stop_fitness} ext_genome = PdfGenome.load_external_genome(ext_genome_folder, noxref=True) try: gp = GPPdf( job_dir=job_dir, seed_sha1=start_hash, seed_file_path=start_file_path, logger=logger, random_state_file_path=random_state_file_path, ext_genome=ext_genome, success_traces_path=success_traces_path, promising_traces_path=promising_traces_path, gp_params=gp_params, fitness_function=fitness_func, ) gp.run()