Пример #1
0
 def path_count(self, file_paths):
     """Return the number of object paths found in each given PDF file.

     Every entry of ``file_paths`` is loaded with ``PdfGenome.load_genome``
     and its paths are enumerated with ``PdfGenome.get_object_paths``.
     The returned list of counts follows the order of ``file_paths``.
     """
     return [
         len(PdfGenome.get_object_paths(PdfGenome.load_genome(fpath)))
         for fpath in file_paths
     ]
Пример #2
0
 def path_count(self, file_paths):
     """Count the object paths of each PDF listed in ``file_paths``.

     Returns a list with one integer per input file, in input order: the
     length of ``PdfGenome.get_object_paths`` applied to the loaded file.
     """
     # Lazily load one document at a time so loading and path enumeration
     # stay interleaved per file.
     genomes = (PdfGenome.load_genome(pdf_file) for pdf_file in file_paths)
     return [len(PdfGenome.get_object_paths(genome)) for genome in genomes]
Пример #3
0
 def load_external_genome(self, file_paths):
     """Load external PDFs and collect every (document, object path) pair.

     Side effect: ``self.genome_desc`` is reset and then filled with one
     ``(file_path, number_of_paths)`` tuple per input file, in input order.

     Returns a flat list of ``(pdf_obj, path)`` tuples covering all files,
     where ``pdf_obj`` is the object returned by ``PdfGenome.load_genome``
     and ``path`` is one entry of ``PdfGenome.get_object_paths(pdf_obj)``.
     """
     ext_pdf_paths = []  # element: (entry, path)
     self.genome_desc = []
     for file_path in file_paths:
         pdf_obj = PdfGenome.load_genome(file_path)
         paths = PdfGenome.get_object_paths(pdf_obj)
         ext_pdf_paths.extend((pdf_obj, path) for path in paths)
         # Describe the file by how many paths it contributed. The loop
         # variable of the inner iteration must not be used here: it holds
         # only the last path and is undefined (or stale from the previous
         # file) when a document has no paths at all.
         self.genome_desc.append((file_path, len(paths)))
     return ext_pdf_paths
Пример #4
0
 def load_external_genome(self, file_paths):
     """Load external PDFs and return all their (document, path) pairs.

     Side effect: resets ``self.genome_desc`` and appends one
     ``(file_path, path_count)`` tuple for each file in ``file_paths``.

     Returns a list of ``(pdf_obj, path)`` tuples, one per object path of
     every loaded file, grouped by file in input order.
     """
     ext_pdf_paths = []  # element: (entry, path)
     self.genome_desc = []
     for file_path in file_paths:
         pdf_obj = PdfGenome.load_genome(file_path)
         paths = PdfGenome.get_object_paths(pdf_obj)
         for path in paths:
             ext_pdf_paths.append((pdf_obj, path))
         # Use the number of paths in this file, not the length of the
         # last individual path; the latter is also unbound when the very
         # first file yields no paths.
         self.genome_desc.append((file_path, len(paths)))
     return ext_pdf_paths
Пример #5
0
def main(args):
    """Inject the exploit paths of a malicious PDF into a benign PDF.

    Reads the following attributes of ``args``:
      * ``mal``          -- path of the malicious PDF; the part of its base
                            name before the first '.' is used as its SHA-1 key.
      * ``ben``          -- path of the benign PDF that receives the objects.
      * ``exploit_spec`` -- path of a pickle mapping SHA-1 -> list of paths.
      * ``var_dir``      -- directory in which the variant is written.

    The variant is saved as ``<var_dir>/<mal_sha1>_<basename of ben>``.
    Returns None.
    """
    mal_sha1 = os.path.basename(args.mal).split('.')[0]
    # load malicious pdf file.
    mal_obj = PdfGenome.load_genome(args.mal, noxref=True)
    # load benign pdf file.
    ben_obj = PdfGenome.load_genome(args.ben, noxref=True)

    newpdf = deepcopy(ben_obj)
    # get exploit path from the malicious pdf file.
    # NOTE(security): pickle can execute arbitrary code while loading; the
    # exploit spec must come from a trusted source.
    with open(args.exploit_spec, 'rb') as spec_file:
        exploit_spec = pickle.load(spec_file)
    epaths = exploit_spec[mal_sha1]

    all_ben_paths = PdfGenome.get_object_paths(ben_obj, set())

    # inject each path from exploit paths
    for path in epaths:
        src_path = None
        # j is the number of trailing components stripped from the path.
        # It is pre-set so that a path with fewer than two components
        # (for which the search loop below never runs) neither raises a
        # NameError nor reuses the value left over from the previous path.
        j = 1
        # Find the longest proper prefix that already exists in the benign
        # file; that prefix is where the malicious object gets attached.
        for j in xrange(1, len(path)):
            prefix = path[:-j]
            if prefix in all_ben_paths:
                src_path = prefix
                break
        if src_path is None:
            src_path = ['/Root']
        # Copy the object one level below the matched prefix: strip one
        # component fewer than the prefix did.
        if j > 1:
            tgt_path = path[:-j + 1]
        else:
            tgt_path = path
        PdfGenome.insert_under(newpdf, src_path, mal_obj, tgt_path)

    outname = '%s/%s_%s' % (args.var_dir, mal_sha1, os.path.basename(args.ben))
    PdfGenome.save_to_file(newpdf, outname)
def main(args):
    """Regenerate PDF variants from precomputed adversarial feature vectors.

    For models 8..14 of ``model_names`` and for every seed vector whose SHA-1
    has a PDF in ``../data/500_seed_pdfs/``, the insert/delete difference
    between the seed feature vector and the adversarial vector is computed
    with ``get_ins_del`` and turned into a PDF by ``generate_pdf``.

    ``args`` is accepted for interface compatibility and is not read.
    All input locations are hard-coded relative paths. Returns None.
    """
    global genome_dict
    global idx_to_path
    build_genome_dict()

    # load the npy file
    adv_samples = np.load('../data/un_adv_samples.npy')

    # load the seed feature vectors
    # NOTE(security): pickle can execute arbitrary code while loading; this
    # file must come from a trusted source.
    with open('robustness_spec/seed_test_malicious/feat_dict_3416.pickle', 'rb') as seed_file:
        seed_dict = pickle.load(seed_file)
    seed_features = genfromtxt('robustness_spec/seed_test_malicious/seed_feature_3416.csv', delimiter=',')
    # load the seed entries together. deepcopy later
    # list(...) keeps the keys indexable regardless of what keys() returns.
    all_sha1 = list(seed_dict.keys())
    # A set gives constant-time membership tests in the loop below.
    sha1_500 = set(item.split('.')[0] for item in os.listdir('../data/500_seed_pdfs/'))
    # Map vector index -> SHA-1 for the seeds that have a PDF available.
    v_i_to_sha1 = {}
    for i, seed_sha1 in enumerate(all_sha1):
        if seed_sha1 in sha1_500:
            v_i_to_sha1[i] = seed_sha1

    # each of the 15 models
    # "baseline", "TA", "TB", "TC", "TD", "ATAB", "EAB", "ED", "RA", "RB", "RC", "RD", "RAB", "RABE", "mono"
    model_names = ['baseline', 'adv_a', 'adv_b', 'adv_c', 'adv_d', 'adv_ab', 'ensemble_ab', 'ensemble_d', 'robust_a', 'robust_b', 'robust_c', 'robust_d', 'robust_ab', 'robust_abe', 'robust_e']

    # Only the last seven models (indices 8..14) are processed;
    # use range(15) to cover all of them.
    for m_i in range(8, 15):
        # each of the 3416 evasive vectors against the model
        res = adv_samples[m_i]
        for v_i in range(3416):
            # Skip vectors whose seed PDF is not available (dict lookup,
            # no per-iteration key list).
            if v_i not in v_i_to_sha1:
                continue
            vector = res[v_i]
            seed_vec = seed_features[v_i]
            # Difference between seed and adversarial vector, expressed as
            # the feature indices to insert and the ones to delete.
            ins_indices, del_indices = get_ins_del(seed_vec, vector)
            # get the original PDF object, then mutate.
            sha1 = v_i_to_sha1[v_i]
            src_entry = PdfGenome.load_genome('../data/500_seed_pdfs/%s.pdf' % sha1, noxref = True)
            generate_pdf(src_entry, all_sha1[v_i], ins_indices, del_indices, model_names[m_i])

    return
def get_cf(file_name):
    """
	Get conserved features for a given PDF file.
	"""

    # We evaluate each variant with n_test times.
    n_test = 5
    seed_file_path = 'samples/seeds/' + file_name
    pdf_folder = 'samples/tmp_pdfs/' + file_name + '/'
    os.system('mkdir -p %s' % (pdf_folder))
    seed_root = PdfGenome.load_genome(seed_file_path)
    root = deepcopy(seed_root)
    visited_paths = set()
    remaining_paths = list()
    remaining_paths = PdfGenome.get_object_paths(root, visited_paths)
    obj_paths = PdfGenome.get_object_paths(root, visited_paths)
    path_len = len(PdfGenome.get_object_paths(root, visited_paths))
    print('Initial paths:', remaining_paths)
    print path_len

    # Auxilliary list with ASCII order
    aux = []
    for i in range(0, path_len):
        aux.append(str(i))
    aux.sort()

    # Sequentially delete structural paths
    i = 0
    for j in range(0, path_len):
        root = deepcopy(seed_root)
        op_obj_path = remaining_paths.pop(0)
        PdfGenome.delete(root, op_obj_path)
        #print "####################################################"
        #print i, ".pdf: delete", op_obj_path
        #save_path = '/home/liangtong/Desktop/tmp_pdfs/%d.pdf' % (i)

        save_path = pdf_folder + str(i) + '.pdf'
        y = PdfWriter()
        y.write(save_path, root)
        i += 1

    # Evaluate the maliciousness of the variants
    fpaths = list_file_paths(pdf_folder)
    n_mal = [0] * len(fpaths)
    for i in range(0, n_test):
        results = cuckoo(fpaths)
        for j in range(0, len(results)):
            if results[j] != '[]':
                n_mal[j] += 1

    # If the PDF becomes benign after being deleted with a structural pth,
    # then this one should be one of its conserved features.
    paths = []
    for i in range(0, len(n_mal)):
        if n_mal[i] == 0:
            print i
            path = get_path(obj_paths[int(aux[i])])
            if path in feat_list:
                paths.append(get_feat_seq(path, feature_list))

    paths = set(paths)
    paths = list(paths)
    paths.sort()
    print file_name, paths
Пример #8
0
def get_cr():
    """Build one synthetic PDF by deleting a single object path from a seed.

    Experiment script with hard-coded inputs: it loads a benign PDF and a
    malicious seed from fixed locations, prints the object paths of the
    malicious file, deletes the path at index 19 from a copy of it, prints
    the remaining paths and writes the result to
    '/home/liangtong/Desktop/tmp_pdfs/test.pdf'.

    Takes no arguments and returns None. The sandbox evaluation at the end
    is disabled (it is wrapped in a string literal).
    """
    # Only referenced by the disabled sandbox block at the bottom.
    n_test = 1
    # STEP 1. Load the external benign pdf file
    ext_file_name = 'ir01-108.pdf'
    ext_path = '/home/liangtong/pdf_files/benign/' + ext_file_name
    ext_root = PdfGenome.load_genome(ext_path)
    # Only referenced by commented-out code below (printing and swap).
    ext_obj = PdfGenome.get_object_paths(ext_root, set())

    # STEP 2. Load the malicious pdf file
    mal_file_name = '001d92fc29146e01e0ffa619e5dbf23067f1e814'
    #mal_file_name = '00aaa01030cb7254a0ba30e9e62516f8690b9e3b'
    #mal_file_name = 'kdd04.pdf'
    mal_path = '/home/liangtong/EvadeML-master/samples/seeds/' + mal_file_name
    #mal_path = '/home/liangtong/Desktop/cr-test/'+mal_file_name
    # Output folder; its creation is commented out below, so it is
    # presumably expected to exist already -- TODO confirm.
    mal_pdf_folder = '/home/liangtong/Desktop/tmp_pdfs/'
    mal_root = PdfGenome.load_genome(mal_path)

    # Enumerate paths on a copy rather than on mal_root itself.
    tmp_root = deepcopy(mal_root)

    mal_obj = PdfGenome.get_object_paths(tmp_root, set())
    n_mal_obj = len(mal_obj)
    #os.system('mkdir -p %s' % (mal_pdf_folder))
    print 'Paths in the malicious PDF'
    for i in range(0, n_mal_obj):
        print i, mal_obj[i]

    #print 'Paths in the benign PDF'
    #for i in range(0, len(ext_obj)):
    #	print i, ext_obj[i]

    # STEP 3. Prepare the synthetic PDF
    syn_root = deepcopy(mal_root)
    print 'Target and source paths'
    #print mal_obj[47]
    #print ext_obj[69]
    # Index 19 is hard-coded for the seed selected above; it raises
    # IndexError if that file has fewer than 20 object paths.
    print mal_obj[19]
    PdfGenome.delete(syn_root, mal_obj[19])
    #PdfGenome.swap(syn_root, mal_obj[47], ext_root, ext_obj[69])
    #PdfGenome.insert(syn)

    # Re-enumerate and print the paths that remain after the deletion.
    syn_obj = PdfGenome.get_object_paths(syn_root, set())
    n_syn_obj = len(syn_obj)
    print 'Paths in the synthetic file'
    for i in range(0, n_syn_obj):
        print i, syn_obj[i]

    #parent, key = PdfGenome.get_parent_key(mal_root, mal_obj[11])
    #print "The key: "
    #print key
    #print "The parent: "
    #print parent.keys()
    #print mal_root.keys()

    # STEP 4. Store the synthetic PDF
    save_path = mal_pdf_folder + 'test.pdf'
    y = PdfWriter()
    #y.write(save_path, syn_root)
    y.write(save_path, syn_root)

    # STEP 6. Test malicious behaviors with sandbox
    # (disabled: the code below is a bare string literal, not executed;
    # there is no STEP 5 in this function)
    '''
	fpaths = list_file_paths(mal_pdf_folder)
	n_mal = [0]*len(fpaths)
	for i in range(0, n_test):
		results = cuckoo(fpaths)
		for j in range(0, len(results)):
			if results[j] != '[]':
				n_mal[j] += 1
	'''
    '''
def generate_pdf(src_entry, sha1, ins_indices, del_indices, model_name):
    """Create a mutated copy of a PDF from feature insertions and deletions.

    Parameters:
        src_entry   -- loaded PDF object to start from; it is deep-copied
                       and not modified.
        sha1        -- identifier used as the output file name.
        ins_indices -- feature indices to insert; looked up in the global
                       ``genome_dict``, unknown indices are skipped.
        del_indices -- feature indices to delete; translated to compact
                       paths through the global ``idx_to_path``.
        model_name  -- name of the sub-directory of 'unrestricted/' that
                       receives the output.

    Returns the tuple ``(newpdf, pdf_path)``: the mutated PDF object and
    the path 'unrestricted/<model_name>/<sha1>.pdf' it was saved to.
    Debug information is printed to stdout.
    """
    global genome_dict
    global idx_to_path
    # deep copy
    newpdf = deepcopy(src_entry)

    ### INSERTION
    for index in ins_indices:
        # find the newobj
        # genome_dict[index] is unpacked as (training file name, list of
        # full paths); indices without an entry are skipped.
        try:
            train_f, fullpaths = genome_dict[index]
        except KeyError:
            continue
        fname = '../data/traintest_all_500test/train_benign/%s' % train_f
        # Try parsing with noxref=True first and fall back to
        # noxref=False when pdfrw reports a parse error.
        try:
            tgt_entry = PdfGenome.load_genome(fname, noxref = True)
        except pdfrw.errors.PdfParseError:
            tgt_entry = PdfGenome.load_genome(fname, noxref = False)

        # do deterministic
        # Always use the first full path. The string '/Root<fullpath>' is
        # split on '/' and every component gets its leading '/' back,
        # giving a list such as ['/Root', '/A', '/B'] (presumably
        # fullpaths[0] itself starts with '/' -- TODO confirm).
        tgt_path = ['/'+item for item in ('/Root' + fullpaths[0]).split('/')[1:]]
        #tgt_parent, tgt_key = PdfGenome.get_parent_key(tgt_entry, tgt_path)

        # find the longest prefix that exists in src_entry
        #parent = newpdf
        #for i in range(len(tgt_path)-1, 0, -1):
        #    key = tgt_path[:i]
        # Walk newpdf along tgt_path (excluding its last component) and
        # count in i how many lookups succeed before one raises.
        src_parent = newpdf
        i = 0
        for key in tgt_path[:-1]:
            try:
                src_parent = src_parent[key]
                i += 1
            except (KeyError, TypeError):
                #print tgt_path
                #print sha1
                #print index
                #print cur_iter
                #raise SystemExit
                break
        #key = tgt_path[i-1:i]
        # last parent should work
        src_key = tgt_path[:i]
        if src_key != ['/Root']:
            # The same prefix is used on both sides of the insert.
            tgt_key = tgt_path[:i]
            #print src_key
            #print tgt_key
            # Best effort: any failure of this insertion is ignored.
            try:
                PdfGenome.insert(newpdf, src_key, tgt_entry, tgt_key)
            except Exception:
                pass
        else:
            # Only '/Root' matched: take the object one level below the
            # root of the external file and attach it under '/Root'.
            tgt_key = tgt_path[:i+1]
            #print src_key
            #print tgt_key
            # do a insert_under
            PdfGenome.insert_under(newpdf, src_key, tgt_entry, tgt_key)

    ### DELETION
    # for each compact path, I need a set of original paths from the PDF. then I need to delete all of them.
    # get the compact path to path mapping
    compact_to_full = defaultdict(list)
    # Paths are enumerated on the unmodified src_entry, not on newpdf.
    paths = PdfGenome.get_object_paths(src_entry)
    for ext_id in range(len(paths)):
        fullpath = paths[ext_id]
        # Concatenate the components after the first one, leaving out
        # integer components.
        fullkey = ''.join([item for item in fullpath[1:] if type(item) != int])
        # IMPORTANT: make this path compact
        # The first character of the concatenated key is dropped before
        # compacting.
        key = compact(fullkey[1:])
        compact_to_full[key].append(fullpath)

    # TODO: remove debug
    print compact_to_full

    for index in del_indices:
        compactpath = idx_to_path[index]
        # compact_to_full is a defaultdict, so a compact path that does
        # not occur in this PDF yields an empty list here.
        for path in compact_to_full[compactpath]:
            # TODO: remove debug
            print 'delete:', path
            # delete the full path
            try:
                PdfGenome.delete(newpdf, path)
            except Exception:
                # the parent may already be deleted
                continue


    # Save as unrestricted/<model_name>/<sha1>.pdf, creating the
    # directory on first use.
    file_dir = 'unrestricted/%s' % model_name
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    pdf_path = '%s/%s.pdf' % (file_dir, sha1)
    PdfGenome.save_to_file(newpdf, pdf_path)
    return newpdf, pdf_path
Пример #10
0
    # if classifier_name == 'pdfrate':
    #     from lib.fitness import fitness_pdfrate as fitness_func
    # elif classifier_name == 'hidost':
    #     from lib.fitness import fitness_hidost as fitness_func
    # elif classifier_name == "hidost_pdfrate":
    #     from lib.fitness import fitness_hidost_pdfrate as fitness_func
    # elif classifier_name == "hidost_pdfrate_mean":
    #     from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func
    # elif classifier_name == "hidost_pdfrate_sigmoid":
    #     from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func

    gp_params = {
        'pop_size': pop_size,
        'max_gen': max_gen,
        'mut_rate': mut_rate
    }
    ext_genome = PdfGenome.load_external_genome(ext_genome_folder)

    fitness_func = get_fitness_func(field, threshold)

    gp = GPPdf(job_dir=job_dir,
               seed_file_path=start_file_path,
               logger=logger,
               ext_genome=ext_genome,
               gp_params=gp_params,
               fitness_func=fitness_func,
               sandbox_func=get_fitness_func('gmu_bm', threshold=65),
               hc_step=hc_step)
    gp.run()
Пример #11
0
    if classifier_name == 'pdfrate':
        from lib.fitness import fitness_pdfrate as fitness_func
    elif classifier_name == 'hidost':
        from lib.fitness import fitness_hidost as fitness_func
    elif classifier_name == "hidost_pdfrate":
        from lib.fitness import fitness_hidost_pdfrate as fitness_func
    elif classifier_name == "hidost_pdfrate_mean":
        from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func
    elif classifier_name == "hidost_pdfrate_sigmoid":
        from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func

    gp_params = {'pop_size': pop_size, 'max_gen': max_gen, \
             'mut_rate': mut_rate, 'xover_rate': xover_rate, \
             'fitness_threshold': stop_fitness}
    ext_genome = PdfGenome.load_external_genome(ext_genome_folder)

    try:
        gp = GPPdf( job_dir = job_dir,
                    seed_sha1 = start_hash,
                    seed_file_path = start_file_path,
                    logger = logger,
                    random_state_file_path = random_state_file_path,
                    ext_genome = ext_genome,
                    success_traces_path = success_traces_path,
                    promising_traces_path = promising_traces_path,
                    gp_params = gp_params,
                    fitness_function = fitness_func,
                    )
        gp.run()
    except Exception, e:
Пример #12
0
        from lib.fitness import fitness_hidost as fitness_func
    elif classifier_name == 'mlp':
        from lib.fitness import fitness_mlp as fitness_func
    elif classifier_name == 'robustmlp':
        from lib.fitness import fitness_robustmlp as fitness_func
    elif classifier_name == "hidost_pdfrate":
        from lib.fitness import fitness_hidost_pdfrate as fitness_func
    elif classifier_name == "hidost_pdfrate_mean":
        from lib.fitness import fitness_hidost_pdfrate_mean as fitness_func
    elif classifier_name == "hidost_pdfrate_sigmoid":
        from lib.fitness import fitness_hidost_pdfrate_sigmoid as fitness_func

    gp_params = {'pop_size': pop_size, 'max_gen': max_gen, \
             'mut_rate': mut_rate, 'xover_rate': xover_rate, \
             'fitness_threshold': stop_fitness}
    ext_genome = PdfGenome.load_external_genome(ext_genome_folder, noxref=True)

    try:
        gp = GPPdf(
            job_dir=job_dir,
            seed_sha1=start_hash,
            seed_file_path=start_file_path,
            logger=logger,
            random_state_file_path=random_state_file_path,
            ext_genome=ext_genome,
            success_traces_path=success_traces_path,
            promising_traces_path=promising_traces_path,
            gp_params=gp_params,
            fitness_function=fitness_func,
        )
        gp.run()