def main():
    """Print one sample (text, label) pair and the element count for every
    .pickle dataset file found in config['save_dir']."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')

    for fname in pickle_files:
        # fix: os.path.join works whether or not save_dir carries a trailing
        # slash; the original raw concatenation built broken paths without one
        path = os.path.join(config['save_dir'], fname)
        cont = pickle_lib.get_pickle_file_content(path)
        counter = 0
        for item in cont:
            # show only the first pair of each file as a sample
            if counter < 1:
                print(f"return type >{item[1]}< from file >{path}<")
                print(f'{item[0]}')
            counter += 1
        print(f'Counted >{counter}< text,label elements')
        print()
def main():
    """Dump the helper pickles (return-type dict, vocabulary list, max
    sequence length) stored in the tfrecord save directory, for manual
    inspection."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    # all three helper files live next to the tfrecords
    base_dir = config['tfrecord_save_dir']
    print(f"Using files in directory >{config['tfrecord_save_dir']}<")
    print()

    return_type_dict = pickle_lib.get_pickle_file_content(base_dir + 'return_type_dict.pickle')
    print(f'return_type_dict value >{return_type_dict}<')
    print()

    vocabulary_list = pickle_lib.get_pickle_file_content(base_dir + 'vocabulary_list.pickle')
    print(f'vocabulary_list >{vocabulary_list}<')
    print()
    print(f'vocabulary_list length >{len(vocabulary_list)}<')
    print()

    max_seq_length = pickle_lib.get_pickle_file_content(base_dir + 'max_seq_length.pickle')
    print(f'max_seq_length >{max_seq_length}<')
def main():
    """Translate every label in the balanced-dataset pickles to its integer
    id (taken from the return-type dict) and write the results to the
    tfrecord directory, one worker process per pickle file."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## load ret-type dict
    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])
    print(f"ret-type-dict >{ret_type_dict}<")

    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['balanced_dataset_dir'], '.pickle')

    ### transform dataset ret-types to ints
    print(f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<")

    pickle_files = [config['balanced_dataset_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(ret_type_dict), repeat(config))
    # fix: context manager guarantees worker cleanup even if starmap raises
    # (the original leaked the pool on error); the starmap result was unused
    with Pool(nr_of_cpus) as p:
        p.starmap(proc_build, star_list)

    print("Done. Run train_arg_one_model_lstm.py next")
def main():
    """Build the raw dataset from the *.tar.bz2 pickle archives in
    config['pickle_dir'], one worker process per archive."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    # fix: context manager releases the pool even if a worker raises
    # (the original leaked it); the starmap result was never used
    with Pool(nr_of_cpus) as p:
        p.starmap(proc_build, star_list)

    print(f'Run build_ret_type__vocab__seq_len.py next')
def main():
    """Build the raw dataset from the *.tar.bz2 pickle archives using
    joblib process parallelism (one job per archive)."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    ### get all pickle files
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build — joblib replaced the older multiprocessing.Pool version;
    ### the dead commented-out Pool code was removed
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    test = joblib.Parallel(n_jobs=-1, prefer="processes")(
        joblib.delayed(proc_build)(a, b, c, d) for a, b, c, d in star_list)

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Count how often each arg_three label occurs across the dataset
    pickles, ask the user for a minimum count, then build a balanced
    dataset with one worker thread per label that meets the minimum."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ret_type_dict = pickle_lib.get_pickle_file_content(config['return_type_dict_file'])

    ## get number of different return types
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')
    pickle_files_save_dir = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files_save_dir, repeat(ret_type_dict), repeat(config))
    # fix: context manager cleans up the pool even if a worker raises
    with Pool(nr_of_cpus) as p:
        all_ret_types = p.starmap(proc_count, star_list)

    ## merge the per-file count dicts into one counter per label
    ret_type_counter = dict.fromkeys(ret_type_dict, 0)
    for counts_dict in all_ret_types:
        for key, count in counts_dict.items():
            ret_type_counter[key] += count

    print(f"The counts of every arg_three :")
    for key in ret_type_counter:
        print(f"arg_three type >{key}< exists\t\t\t>{ret_type_counter[key]}< \ttimes")

    config['minimum_nr_of_return_types'] = input('Put in minimum nr of arg_three to build balanced dataset:')
    # fix: parse the threshold once instead of re-running int() in every loop
    min_count = int(config['minimum_nr_of_return_types'])

    ### keep only labels that occur at least min_count times
    ret_type_counter_filtered = {key: ret_type_counter[key]
                                 for key in ret_type_dict
                                 if ret_type_counter[key] >= min_count}
    print(f"The filtered counts (>={min_count}) of every type >{ret_type_counter_filtered}<")

    ### select min_count text,label pairs per surviving label, one thread each
    threads = []
    for key in ret_type_counter_filtered:
        print(f'build balanced with key >{key}<')
        t = Thread(target=proc_build_balanced,
                   args=(pickle_files_save_dir, key, min_count, config, ))
        t.start()
        threads.append(t)

    # fix: wait for the builder threads before announcing the next step —
    # the original printed it while the threads were still running
    for t in threads:
        t.join()

    print(f'Run build_balanced_ret_type__vocab__seq_len.py next')
def main():
    """Inspect every .pickle dataset file in save_dir: print one sample per
    file, count its (text, label) pairs, track the longest disassembly and
    collect the set of all labels seen."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    print(f"Using files in directory >{config['save_dir']}<")
    print()

    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')

    all_ret_types_list = set()
    counter = 0
    max_seq_len = 0
    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        counter = 0
        max_seq_len = 0
        for idx, item in enumerate(cont):
            all_ret_types_list.add(item[1])
            if idx == 0:
                # show the first pair of each file as a sample
                print(f"nr-of-arguments >{item[1]}< from file >{config['save_dir'] + file}<")
                print()
                print(f'text >{item[0]}<\nlabel >{item[1]}<')
            max_seq_len = max(max_seq_len, len(item[0]))
            counter = idx + 1
        print(f'Counted >{counter}< text,label elements')
        print(f'longest disassembly got >{max_seq_len}< words')
        print('----------------------------------------')
        print()

    print(f'all_ret_types_list >{all_ret_types_list}<')
def main():
    """Build the return-type dict, the vocabulary and the maximum sequence
    length from all dataset pickles in save_dir (in parallel) and save each
    to its configured pickle file."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['save_dir'], '.pickle')
    print(f'pickle-files we use to build >{pickle_files}<')
    print(f'Building return-type dict, vocabulary and max-squenece-length')

    pickle_files = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))
    # fix: context manager releases worker processes even if starmap raises
    with Pool(nr_of_cpus) as p:
        all_ret_types = p.starmap(proc_build, star_list)

    ## merge the per-file partial results (dead commented-out serial
    ## implementation removed)
    ret_set = set()
    vocab = set()
    seq_length = 0
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        seq_length = max(seq_length, seq_length1)

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")

    ## build ret-type-dict and save
    # fix: sort the set so label ids are deterministic across runs
    # (plain set iteration order is not), matching the balanced variant
    ret_type_dict = {elem: idx for idx, elem in enumerate(sorted(ret_set))}
    print(f"ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"argument one >{key}< label >{label}<")
    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run build_balanced_dataset.py next")
def main():
    """Build the return-type dict, the vocabulary and the maximum sequence
    length from the balanced-dataset pickles (in parallel) and save each to
    its configured pickle file."""
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['balanced_dataset_dir'], '.pickle')
    if len(pickle_files) == 0:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        exit()

    pickle_lib.print_X_pickle_filenames(pickle_files, 5)
    print(f'Building return-type dict, vocabulary and max-squenece-length')
    print()

    pickle_files = [config['balanced_dataset_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))
    # fix: context manager releases worker processes even if starmap raises
    with Pool(nr_of_cpus) as p:
        all_ret_types = p.starmap(proc_build, star_list)

    ## merge the per-file partial results
    ret_set = set()
    vocab = set()
    seq_length = 0
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        seq_length = max(seq_length, seq_length1)

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()

    ## build ret-type-dict and save; sorting keeps label ids deterministic
    ret_type_dict = {elem: idx for idx, elem in enumerate(sorted(ret_set))}
    print(f"ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"nr-of-args >{key}< label >{label}<")
    print()
    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
# NOTE(review): this chunk starts inside a function whose definition is not
# visible here — out_v/out_m are opened somewhere above; verify in full file.
out_v.close()
out_m.close()


def vectorize_text(text, label):
    # Adds a trailing axis so the TextVectorization layer sees a batch of
    # strings, then maps the text to token ids; the label passes through.
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label


#def main():
#global vectorize_layer
# Script-level setup: load vocabulary and max sequence length, then build
# the TextVectorization layer used by vectorize_text above.
AUTOTUNE = tf.data.experimental.AUTOTUNE
config = common_stuff_lib.parseArgs()
check_config(config)
nr_of_cpus = psutil.cpu_count(logical=True)
print(f'We got >{nr_of_cpus}< CPUs for threading')
print()
###load vocabulary list
vocabulary = pickle_lib.get_pickle_file_content(config['vocabulary_file'])
###load max-sequence-length
max_seq_length = pickle_lib.get_pickle_file_content(
    config['max_seq_length_file'])
print(f'len-vocab-from-file >{len(vocabulary)}<')
# NOTE(review): the statement below is truncated in this chunk — the
# remaining TextVectorization arguments continue beyond this view.
vectorize_layer = TextVectorization(standardize=None,
def main():
    """For every *.tar.bz2 pickle archive: untar it, install the matching
    Ubuntu source package, verify the sources match the binary via gdb,
    then look up the source code of the function recorded in the pickle."""
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ###loop through all pickle.tar.bz2 files
    for pickle_file in pickle_files:
        print(f"Untar pickle-file >{pickle_file}< to >{config['work_dir']}<")
        tarbz2_lib.untar_file_to_path(config['pickle_dir'] + pickle_file,
                                      config['work_dir'])

        ###install source-package of pickle-file-content
        pickle_file_name = os.path.basename(pickle_file)
        pickle_file_name = pickle_file_name.replace('.pickle.tar.bz2', '')
        gdb_lib.install_source_package(pickle_file_name, config)

        ###check with gdb (list cmd) if the sources are newer/older than binary
        ## warning: Source file is more recent than executable.
        dir_name = config['ubuntu_src_pkgs']
        print(f'Dir with src is:{dir_name}')
        res = check_if_src_match_binary(pickle_file_name, dir_name, config)
        if not res:
            ##src and binary dont match, unpack the second src in the dir
            unpack_second_src(pickle_file_name)
            # fix: the retry dropped the config argument the first call
            # passes — pass it here as well
            res = check_if_src_match_binary(pickle_file_name, dir_name, config)
            print(f'res of second src dir: {res}')
        else:
            print(f'src match binary')

        ###open the pickle
        print(f"Open untarred pickle file: >{config['work_dir'] + pickle_file}<")
        pickle_content = open_pickle(config['work_dir'] +
                                     pickle_file.replace('.tar.bz2', ''))

        fcn = ''
        fl = ''
        bina = ''
        gdb_func_sign = ''
        ### loop through the pickle-file; the last entry wins for
        ### fcn/fl/bina/gdb_func_sign (as in the original)
        for funcSign, gdb_ret_type, func_name, file_name, disas_att, \
                disas_intel, package_name, binary in pickle_content:
            print(f'funcSign: {funcSign}')
            print(f'func_name: {func_name}')
            print(f'file_name: {file_name}')
            print(f'package_name: {package_name}')
            print(f'binary: {binary}')
            fcn = func_name
            fl = file_name
            bina = binary
            gdb_func_sign = funcSign

        ### get source code of function
        pkg_name = pickle_file.replace('.pickle.tar.bz2', '')
        pkg_name = os.path.basename(pkg_name)
        print(f'pkg_name:{pkg_name}')
        pkg_src_name = config['ubuntu_src_pkgs']
        print(f'pkg_src_name:{pkg_src_name}')
        full_path = get_full_path(pkg_src_name, fl)
        print(f'full-path:{full_path}')

        len_full_path = len(full_path)
        nr_of_empty_src_code = 0
        ### ctags does not get return-type if its located lines above func_name
        ### gdb funcSign got it, we need to check if we need more lines than ctags tells us
        for f in full_path:
            src_code = get_source_code(f, fcn, gdb_func_sign)
            if src_code:
                print(f'src-code:{src_code}')
            else:
                print(f'no src-code found')
                nr_of_empty_src_code += 1

        print(f'nr_of_empty_src_code:{nr_of_empty_src_code} len_full_path:{len_full_path}')
        if len_full_path == nr_of_empty_src_code + 1:
            # exactly one candidate path yielded source code
            print('only found one source code, thats good')
        else:
            print('ERROR found more than one source code for a function')
            # NOTE(review): the original's trailing break most plausibly
            # aborts the archive loop on this error — confirm intent
            break