def proc_build_balanced(pickle_files, key, minimum_ret_type_count, config):
    ### filter the usable (text, label) pairs for one return type and keep at
    ### most minimum_ret_type_count of them, so every class ends up the same size
    ret_type_count_watcher = 1  # counts how many pairs of this key-type we kept
    ret_type_0 = list()

    for file in pickle_files:
        cont = pickle_lib.get_pickle_file_content(file)
        for item in cont:
            ## only keep items whose ret-type matches the key we balance on
            if key == item[1]:
                if ret_type_count_watcher <= minimum_ret_type_count:
                    ret_type_0.append((item[0], item[1]))
                    ret_type_count_watcher += 1
                if ret_type_count_watcher > minimum_ret_type_count:
                    break
        if ret_type_count_watcher > minimum_ret_type_count:
            break

    ### save the balanced samples for this return type
    pickle_lib.save_to_pickle_file(
        ret_type_0,
        config['balanced_dataset_dir'] + key.replace(' ', '_') + '.pickle')
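## proc_build_balanced handles a single return type. A hypothetical driver
## sketch (presumably what build_balanced_dataset.py does, see below): fan the
## worker out over the filtered return types, one task per key.
## ret_type_counter_filtered and nr_of_cpus are assumed names, not taken from
## this section.
def build_balanced(pickle_files, ret_type_counter_filtered,
                   minimum_ret_type_count, config, nr_of_cpus):
    star_list = zip(repeat(pickle_files),
                    ret_type_counter_filtered,  # one worker call per ret-type
                    repeat(minimum_ret_type_count),
                    repeat(config))
    with Pool(nr_of_cpus) as p:
        p.starmap(proc_build_balanced, star_list)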
def main():
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return-type dict-file, max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files we use to build >{pickle_files}<')
    print(f'Building return-type dict, vocabulary and max-sequence-length')

    pickle_files = [config['save_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))

    p = Pool(nr_of_cpus)
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ret_set = set()
    vocab = set()
    seq_length = 0

    ## merge the per-file results from the workers
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        if seq_length1 > seq_length:
            seq_length = seq_length1

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")

    ## build ret-type-dict (ret-type -> int label) and save
    ret_type_dict = dict()
    counter = 0
    for elem in ret_set:
        ret_type_dict[elem] = counter
        counter += 1

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"argument one >{key}< label >{ret_type_dict[key]}<")

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")

    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run build_balanced_dataset.py next")
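## proc_build is called via starmap above but isn't defined in this section.
## A minimal sketch of what it must do, inferred from the merge loop in main()
## (each worker returns a (ret_set, vocab, seq_length) tuple) and from the
## per-file scan this script used to run sequentially; the real implementation
## may differ:
def proc_build(pickle_file, config):
    ret_set = set()
    vocab = set()
    seq_length = 0
    for text, ret_type in pickle_lib.get_pickle_file_content(pickle_file):
        ret_set.add(ret_type)           # collect distinct labels
        if len(text) > seq_length:      # track the longest sequence
            seq_length = len(text)
        for word in text.split():       # whitespace-tokenized vocabulary
            vocab.add(word)
    return ret_set, vocab, seq_length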
def main():
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()

    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return-type dict-file, max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    if len(pickle_files) == 0:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        exit()

    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    print(f'Building return-type dict, vocabulary and max-sequence-length')
    print()

    pickle_files = [config['balanced_dataset_dir'] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config))

    p = Pool(nr_of_cpus)
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ret_set = set()
    vocab = set()
    seq_length = 0

    ## merge the per-file results from the workers
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        if seq_length1 > seq_length:
            seq_length = seq_length1

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()

    ## build ret-type-dict and save; sort first so the labels are deterministic
    ret_type_dict = dict()
    counter = 0
    ret_set_list = sorted(ret_set)
    for elem in ret_set_list:
        ret_type_dict[elem] = counter
        counter += 1

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"nr-of-args >{key}< label >{ret_type_dict[key]}<")
    print()

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()

    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
import tarfile
import os
import sys
import pickle
from datetime import datetime
from multiprocessing import Pool
import getopt
from itertools import repeat

import psutil

sys.path.append('../../lib/')
import return_type_lib
import common_stuff_lib
import tarbz2_lib
import pickle_lib
import disassembly_lib

## one-off helper: manually overwrite the stored max-seq-length value
a = 100000
user_home_path = os.path.expanduser('~')
pickle_lib.save_to_pickle_file(
    a, user_home_path + "/test2/save_dir/tfrecord/max_seq_length.pickle")
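## optional sanity check: read the value back with the same library (used for
## exactly this elsewhere in these scripts) to confirm the write round-trips
saved = pickle_lib.get_pickle_file_content(
    user_home_path + "/test2/save_dir/tfrecord/max_seq_length.pickle")
assert saved == a, f'expected {a}, got {saved}'
print(f'max_seq_length round-trip ok: >{saved}<')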
def main():
    config = parseArgs()
    print(f'config >{config}<')

    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    nr_of_cpus = psutil.cpu_count(logical=True)  # was missing; Pool needs it

    ### get all input archives
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')

    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build
    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))

    p = Pool(nr_of_cpus)
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return-type dict-file, max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')
    print(f'Building return-type dict, vocabulary and max-sequence-length')

    ret_set = set()
    vocab = set()
    seq_length = 0
    counter = 1
    pickle_count = len(pickle_files)

    for file in pickle_files:
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            ## build ret-type-dict
            ret_set.add(item[1])

            ## build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            for word in item[0].split():
                vocab.add(word)

    print(f"Build return-type dict and save it to >{config['return_type_dict_file']}<")

    ## build ret-type-dict and save
    ret_type_dict = dict()
    counter = 0
    for elem in ret_set:
        ret_type_dict[elem] = counter
        counter += 1

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")

    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<")

    counter = 1
    for file in pickle_files:
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        trans_ds = list()  # reset per file, so each tfrecord holds only that file's items
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))
        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")
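## tfrecord_lib.save_caller_callee_to_tfrecord is not shown in this section.
## A plausible sketch, assuming standard tf.train.Example serialization of
## (text, int-label) pairs -- the actual library code may differ:
import tensorflow as tf

def save_caller_callee_to_tfrecord(dataset, out_path):
    with tf.io.TFRecordWriter(out_path) as writer:
        for text, label in dataset:
            example = tf.train.Example(features=tf.train.Features(feature={
                'text': tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[text.encode('utf-8')])),
                'label': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])),
            }))
            writer.write(example.SerializeToString())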