def main():
    """Driver: build datasets from pickled .tar.bz2 archives in parallel.

    Reads config from the command line, validates the configured directories,
    then fans one proc_build() task per archive out over all logical CPUs.
    """
    config = parseArgs()
    print(f'config >{config}<')

    # Fail fast if any configured directory is missing.
    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    # Collect all compressed pickle archives to process.
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    # Print a small sample of filenames for debugging.
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    # One worker task per archive; each task gets the same work/save dirs
    # and the full config via repeat().
    p = Pool(nr_of_cpus)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Build datasets from pickled .tar.bz2 archives using a process pool.

    Validates the config, stages input files, then runs proc_build() once
    per archive across all logical CPUs.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    copy_files_to_build_dataset(config)

    archives = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    # Show a handful of filenames so a misconfigured input dir is obvious.
    pickle_lib.print_X_pickle_filenames(archives, 5)

    # Prefix each archive with its directory, then pair it with the
    # constant per-task arguments.
    full_paths = [f'{config["pickle_dir"]}/{name}' for name in archives]
    task_args = zip(full_paths,
                    repeat(config['work_dir']),
                    repeat(config['save_dir']),
                    repeat(config))

    worker_pool = Pool(nr_of_cpus)
    all_ret_types = worker_pool.starmap(proc_build, task_args)
    worker_pool.close()
    worker_pool.join()

    print(f'Run build_ret_type__vocab__seq_len.py next')
def main():
    """Build datasets from pickled .tar.bz2 archives using joblib processes.

    Same pipeline as the multiprocessing.Pool variant, but dispatches
    proc_build() via joblib.Parallel with n_jobs=-1 (all cores).
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    # Collect all compressed pickle archives to process.
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    # Print a small sample of filenames for debugging.
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))

    # Dispatch one proc_build() call per archive across all cores.
    # (Results are not used by this driver; proc_build writes its output
    # to disk as a side effect.)
    joblib.Parallel(n_jobs=-1, prefer="processes")(
        joblib.delayed(proc_build)(a, b, c, d) for a, b, c, d in star_list)

    print("Done. Run build_ret_type__vocab__seq_len.py next")
def main():
    """Build the return-type dict, vocabulary and max-sequence-length files.

    Scans the balanced dataset pickles in parallel (one proc_build() task
    per file), merges the per-file results, then saves three artifacts:
    the return-type->label dict, the vocabulary list, and the maximum
    sequence length.
    """
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    # Nothing to do without input files; bail out early.
    if not pickle_files:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        exit()

    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print(f'Building return-type dict, vocabulary and max-squenece-length')
    print()

    # One proc_build() task per pickle file, each returning
    # (ret_types, vocab, max_seq_len) for that file.
    p = Pool(nr_of_cpus)
    pickle_files = [config['balanced_dataset_dir'] + "/" + f
                    for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## merge per-file results into global set/vocab/max-length
    ret_set = set()
    vocab = set()
    seq_length = 0
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        seq_length = max(seq_length, seq_length1)

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()

    # Sort so label assignment is deterministic across runs, then map
    # each return type to a consecutive integer label.
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    print(f"ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"nr-of-args >{key}< label >{label}<")
    print()

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
def main():
    """Build the return-type dict, vocabulary and max-sequence-length files.

    Parallel variant: one proc_build() task per balanced-dataset pickle,
    then merge the per-file (ret_types, vocab, max_seq_len) results and
    save the three output artifacts.
    """
    config = parseArgs()
    print(f'config >{config}<')
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got nr_of_cpus >{nr_of_cpus}<')

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print(f'Building return-type dict, vocabulary and max-squenece-length')

    # One proc_build() task per pickle file.
    p = Pool(nr_of_cpus)
    pickle_files = [
        config['balanced_dataset_dir'] + "/" + f for f in pickle_files
    ]
    star_list = zip(pickle_files, repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## merge per-file results into global set/vocab/max-length
    ret_set = set()
    vocab = set()
    seq_length = 0
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        seq_length = max(seq_length, seq_length1)

    print(
        f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<"
    )

    # Sort for deterministic labels, map each return type to a
    # consecutive integer label.
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    print(f"ret-type-dict :")
    for key, label in ret_type_dict.items():
        print(f"nr-of-args >{key}< label >{label}<")

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
def main():
    """End-to-end driver: build datasets, derive vocab/labels, emit tfrecords.

    1. Unpack + build each .tar.bz2 pickle archive in parallel.
    2. Scan the resulting pickles for return types, vocabulary and the
       maximum sequence length; save each artifact.
    3. Map return types to int labels and write per-file .tfrecord output,
       then split into train/val/test.
    """
    config = parseArgs()
    print(f'config >{config}<')

    # Fail fast if any configured directory is missing.
    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    # Collect all compressed pickle archives to process.
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    # Print a small sample of filenames for debugging.
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    # FIX: nr_of_cpus was used below without ever being defined in this
    # version (NameError at runtime); define it like the sibling drivers.
    nr_of_cpus = psutil.cpu_count(logical=True)

    # One proc_build() task per archive.
    p = Pool(nr_of_cpus)
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print(f'Building return-type dict, vocabulary and max-squenece-length')
    ret_set = set()
    vocab = set()
    seq_length = 0
    pickle_count = len(pickle_files)

    for counter, file in enumerate(pickle_files, start=1):
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        # Each item is (sequence_text, return_type) — presumably; verify
        # against proc_build's output format.
        for item in cont:
            ## build ret-type-dict
            ret_set.add(item[1])

            ##build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            vocab.update(item[0].split())

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )
    # FIX: sort the set before labeling so label assignment is
    # deterministic across runs (matches the sibling builder scripts).
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}
    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    for counter, file in enumerate(pickle_files, start=1):
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        # FIX: trans_ds was created once outside the loop, so every
        # tfrecord cumulatively contained all previous files' items;
        # reset it per file.
        trans_ds = list()
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))
        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")
def main():
    """Match each pickled binary-function dataset against its source package.

    For every .pickle.tar.bz2 archive: untar it, install the matching
    Ubuntu source package, verify the sources match the binary (retrying
    with the second unpacked source tree if not), then look up the source
    code of the function recorded in the pickle.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading\n')
    print()

    copy_files_to_build_dataset(config)

    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ###loop through all pickle.tar.bz2 files
    for pickle_file in pickle_files:
        print(f"Untar pickle-file >{pickle_file}< to >{config['work_dir']}<")
        tarbz2_lib.untar_file_to_path(config['pickle_dir'] + pickle_file,
                                      config['work_dir'])

        ###install source-package of pickle-file-content
        pickle_file_name = os.path.basename(pickle_file)
        pickle_file_name = pickle_file_name.replace('.pickle.tar.bz2', '')
        gdb_lib.install_source_package(pickle_file_name, config)

        ###check with gdb (list cmd) if the sources are newer/older than binary
        ## warning: Source file is more recent than executable.
        dir_name = config['ubuntu_src_pkgs']
        print(f'Dir with src is:{dir_name}')
        res = check_if_src_match_binary(pickle_file_name, dir_name, config)

        ##src and binary dont match, unpack the second src in the dir
        if not res:
            unpack_second_src(pickle_file_name)
            # FIX: the retry call was missing the config argument that the
            # first call site passes (TypeError when this path was hit).
            res = check_if_src_match_binary(pickle_file_name, dir_name, config)
            print(f'res of second src dir: {res}')
        else:
            print(f'src match binary')

        ###open the pickle
        print(
            f"Open untarred pickle file: >{config['work_dir'] + pickle_file}<")
        pickle_content = open_pickle(config['work_dir'] +
                                     pickle_file.replace('.tar.bz2', ''))

        fcn = ''
        fl = ''
        bina = ''
        gdb_func_sign = ''

        ### loop through the pickle-file and get source-code from function
        # NOTE(review): only the values from the LAST record survive the
        # loop; all earlier records are printed but discarded — confirm
        # this is intended.
        for funcSign, gdb_ret_type, func_name, file_name, disas_att, \
                disas_intel, package_name, binary in pickle_content:
            print(f'funcSign: {funcSign}')
            print(f'func_name: {func_name}')
            print(f'file_name: {file_name}')
            print(f'package_name: {package_name}')
            print(f'binary: {binary}')
            fcn = func_name
            fl = file_name
            bina = binary
            gdb_func_sign = funcSign

        ### get source code of function
        pkg_name = pickle_file.replace('.pickle.tar.bz2', '')
        pkg_name = os.path.basename(pkg_name)
        print(f'pkg_name:{pkg_name}')

        pkg_src_name = config['ubuntu_src_pkgs']
        print(f'pkg_src_name:{pkg_src_name}')

        full_path = get_full_path(pkg_src_name, fl)
        print(f'full-path:{full_path}')
        len_full_path = len(full_path)
        nr_of_empty_src_code = 0

        ### ctags does not get return-type if its located lines above func_name
        ### gdb funcSign got it, we need to check if we need more lines than ctags tells us
        for f in full_path:
            src_code = get_source_code(f, fcn, gdb_func_sign)
            if src_code:
                print(f'src-code:{src_code}')
            else:
                print(f'no src-code found')
                nr_of_empty_src_code += 1

        print(
            f'nr_of_empty_src_code:{nr_of_empty_src_code} len_full_path:{len_full_path}'
        )
        # Exactly one candidate path yielded source code when all but one
        # came back empty.
        if len_full_path == nr_of_empty_src_code + 1:
            print('only found one source code, thats good')
        else:
            print('ERROR found more than one source code for a function')
            break