def proc_build(file, ret_type_dict, config):
    """Convert one pickled dataset file into a tfrecord with integer labels.

    Loads the pickle at ``file``, swaps the second element of every item for
    its id from ``ret_type_dict`` and passes the resulting list of
    ``(item[0], id)`` tuples to
    ``tfrecord_lib.save_caller_callee_to_tfrecord``. The output is written
    into ``config['tfrecord_save_dir']`` under the input's base name with
    ``.pickle`` replaced by ``.tfrecord``.

    Args:
        file: Path of the pickle file to transform.
        ret_type_dict: Mapping from the label stored in ``item[1]`` to its
            integer id.
        config: Settings dict; only ``'tfrecord_save_dir'`` is read here.

    Raises:
        KeyError: If an item's label is missing from ``ret_type_dict``
            (when it is a plain dict).
    """
    cont = pickle_lib.get_pickle_file_content(file)

    # A fresh list per call, so every tfrecord holds only its own file's items.
    trans_ds = [(item[0], ret_type_dict[item[1]]) for item in cont]

    tfrecord_name = os.path.basename(file).replace('.pickle', '.tfrecord')

    # os.path.join rather than string concatenation: the result is the same
    # when the configured directory ends with a separator, and still a valid
    # path inside that directory when it does not.
    tfrecord_lib.save_caller_callee_to_tfrecord(
        trans_ds, os.path.join(config['tfrecord_save_dir'], tfrecord_name))
def main():
    """Build the label dict, vocabulary and max length, then write tfrecords.

    Steps, in order:

    1. Parse the configuration and check that the configured directories
       exist.
    2. Read every ``.pickle`` file in ``config['save_dir']`` and collect the
       set of labels (``item[1]``), the set of whitespace-separated words of
       ``item[0]`` and the largest ``len(item[0])``.
    3. Save the label -> int dict, the vocabulary list and the maximum
       length to the files named in the config.
    4. Run ``proc_build`` for every pickle file in a process pool to write
       one tfrecord per pickle into ``config['tfrecord_save_dir']``.
    5. Split the tfrecords into train/val/test.
    """
    config = parseArgs()

    print(f'config >{config}<')

    for dir_key in ('pickle_dir', 'work_dir', 'save_dir',
                    'tfrecord_save_dir'):
        check_if_dir_exists(config[dir_key])

    # No worker pool is started at this point: proc_build needs the finished
    # return-type dict and takes exactly (file, ret_type_dict, config), so it
    # can only be dispatched after the statistics pass below.

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print('Building return-type dict, vocabulary and max-squenece-length')
    ret_set = set()
    vocab = set()
    seq_length = 0
    pickle_count = len(pickle_files)

    for counter, file in enumerate(pickle_files, start=1):
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')

        cont = pickle_lib.get_pickle_file_content(
            os.path.join(config['save_dir'], file))
        for item in cont:
            ## build ret-type-dict
            ret_set.add(item[1])

            ## build max-seq-length
            # NOTE(review): this is len(item[0]) (characters for a string),
            # while the vocabulary below is word based -- confirm that the
            # consumer of max_seq_length_file expects this unit.
            seq_length = max(seq_length, len(item[0]))

            ## build vocabulary
            vocab.update(item[0].split())

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )
    # Ids follow the set's iteration order, which is arbitrary; they are only
    # meaningful together with the dict file saved here.
    ret_type_dict = {elem: idx for idx, elem in enumerate(ret_set)}
    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    pickle_paths = [
        os.path.join(config['save_dir'], name) for name in pickle_files
    ]
    # One proc_build call per file: each call builds its own item list, so a
    # tfrecord never picks up items from previously processed files.
    pool = Pool(nr_of_cpus)
    try:
        pool.starmap(
            proc_build,
            zip(pickle_paths, repeat(ret_type_dict), repeat(config)))
    finally:
        # Always release the worker processes, even if a task raised.
        pool.close()
        pool.join()

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")