def proc_build_balanced(pickle_files, key, minimum_ret_type_count, config):
    """Collect at most `minimum_ret_type_count` (text, label) pairs whose
    label equals `key` from the given pickle files and save them as one
    balanced-dataset pickle.

    Args:
        pickle_files: iterable of pickle file paths; each file is expected
            to unpickle to an iterable of (text, label) items.
        key: the return-type label to collect.
        minimum_ret_type_count: cap on how many pairs to collect.
        config: dict providing 'balanced_dataset_dir'.
    """
    matched = list()
    for file in pickle_files:
        for item in pickle_lib.get_pickle_file_content(file):
            ## keep only items carrying the requested label, up to the cap
            if item[1] == key and len(matched) < minimum_ret_type_count:
                matched.append((item[0], item[1]))
                if len(matched) >= minimum_ret_type_count:
                    break
        ## stop reading further files once the cap is reached
        if len(matched) >= minimum_ret_type_count:
            break

    ## save them
    # NOTE(review): other code in this file joins dir + "/" + name; this
    # concatenation relies on 'balanced_dataset_dir' ending with a
    # separator -- confirm against the config.
    pickle_lib.save_to_pickle_file(
        matched,
        config['balanced_dataset_dir'] + key.replace(' ', '_') + '.pickle')
def main():
    """Build the return-type dict, vocabulary and max-sequence-length files
    from the pickles in config['save_dir'], fanning the per-file work out to
    one worker per logical CPU and merging the partial results.
    """
    config = common_stuff_lib.parseArgs()
    check_config(config)

    nr_of_cpus = psutil.cpu_count(logical=True)
    print(f'We got >{nr_of_cpus}< CPUs for threading')
    print()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files we use to build >{pickle_files}<')

    print(f'Building return-type dict, vocabulary and max-squenece-length')

    p = Pool(nr_of_cpus)

    pickle_files = [config['save_dir'] + "/" + f for f in pickle_files]

    star_list = zip(pickle_files, repeat(config))

    ## each worker returns (ret_types, vocab, max_seq_length) for one file
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ret_set = set()
    vocab = set()
    seq_length = 0

    ## merge the per-file partial results
    for ret_set1, vocab1, seq_length1 in all_ret_types:
        ret_set.update(ret_set1)
        vocab.update(vocab1)
        seq_length = max(seq_length, seq_length1)

    print(
        f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save; sort first so the label assigned to each
    ## type is deterministic across runs (set iteration order is not), and
    ## consistent with the balanced-dataset variant of this script
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"argument one >{key}<  label >{ret_type_dict[key]}<")

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run build_balanced_dataset.py next")
def main():
    """Merge the per-file (return-types, vocabulary, max-seq-length) results
    computed from the balanced-dataset pickles and persist the combined
    artifacts: the return-type dict, the vocabulary list and the maximum
    sequence length.
    """
    config = common_stuff_lib.parseArgs()
    print(f'config >{config}<')
    print()
    check_config(config)

    cpu_count = psutil.cpu_count(logical=True)
    print(f'We got >{cpu_count}< CPUs for threading')
    print()

    ## gather the balanced-dataset pickles this run will consume
    filenames = common_stuff_lib.get_all_filenames_of_type(
        config['balanced_dataset_dir'], '.pickle')

    if not filenames:
        print(f"There are no files in >{config['balanced_dataset_dir']}<")
        exit()

    pickle_lib.print_X_pickle_filenames(filenames, 5)

    print(f'Building return-type dict, vocabulary and max-squenece-length')
    print()

    worker_pool = Pool(cpu_count)

    full_paths = [config['balanced_dataset_dir'] + "/" + name
                  for name in filenames]

    ## one (ret_types, vocab, max_seq_length) triple per input file
    per_file_results = worker_pool.starmap(proc_build,
                                           zip(full_paths, repeat(config)))
    worker_pool.close()
    worker_pool.join()

    ## fold the per-file partial results into combined accumulators
    ret_set = set()
    vocab = set()
    seq_length = 0
    for partial_ret_set, partial_vocab, partial_seq_len in per_file_results:
        ret_set.update(partial_ret_set)
        vocab.update(partial_vocab)
        seq_length = max(seq_length, partial_seq_len)

    print(f"Build return-type dict from set and save it to >{config['return_type_dict_file']}<")
    print()
    ## map each return type to an int label; sorted order keeps the mapping
    ## deterministic across runs
    ret_type_dict = {ret_type: label
                     for label, ret_type in enumerate(sorted(ret_set))}

    print(f"ret-type-dict :")
    for key in ret_type_dict:
        print(f"nr-of-args >{key}<  label >{ret_type_dict[key]}<")
    print()

    pickle_lib.save_to_pickle_file(ret_type_dict, config['return_type_dict_file'])

    print(f"Saving vocabulary to >{config['vocabulary_file']}<")
    print()
    ## the vocabulary is persisted as a plain list
    pickle_lib.save_to_pickle_file(list(vocab), config['vocabulary_file'])

    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    print()
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    print("Done. Run transform_ret_type_to_int.py next")
Example #4
0
import tarfile
import os
import sys
import pickle
#import tensorflow as tf
from datetime import datetime
from multiprocessing import Pool
import getopt
from itertools import repeat
import psutil

sys.path.append('../../lib/')
import return_type_lib
import common_stuff_lib
import tarbz2_lib
import pickle_lib
import disassembly_lib
#import tfrecord_lib

## one-off patch: overwrite the stored max-sequence-length with a fixed value
a = 100000

# resolve the current user's home directory portably
user_home_path = os.path.expanduser('~')
pickle_lib.save_to_pickle_file(
    a, f"{user_home_path}/test2/save_dir/tfrecord/max_seq_length.pickle")
def main():
    """End-to-end build: unpack the .tar.bz2 pickles in parallel, derive the
    return-type dict / vocabulary / max-sequence-length, transform labels to
    ints, write per-file tfrecords and split them into train/val/test.
    """
    config = parseArgs()

    print(f'config >{config}<')

    check_if_dir_exists(config['pickle_dir'])
    check_if_dir_exists(config['work_dir'])
    check_if_dir_exists(config['save_dir'])
    check_if_dir_exists(config['tfrecord_save_dir'])

    ### get all pickle files
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['pickle_dir'], '.tar.bz2')
    ### print 5 files, check and debug
    pickle_lib.print_X_pickle_filenames(pickle_files, 5)

    ### build
    # fix: nr_of_cpus was used here without ever being defined in this
    # function (NameError at runtime); compute it the same way the sibling
    # drivers in this file do
    nr_of_cpus = psutil.cpu_count(logical=True)
    p = Pool(nr_of_cpus)

    pickle_files = [config["pickle_dir"] + "/" + f for f in pickle_files]
    star_list = zip(pickle_files, repeat(config['work_dir']),
                    repeat(config['save_dir']), repeat(config))
    all_ret_types = p.starmap(proc_build, star_list)
    p.close()
    p.join()

    ## build return type dict-file and max-seq-length-file and vocabulary
    pickle_files = common_stuff_lib.get_all_filenames_of_type(
        config['save_dir'], '.pickle')
    print(f'pickle-files >{pickle_files}<')

    print(f'Building return-type dict, vocabulary and max-squenece-length')
    ret_set = set()
    vocab = set()
    seq_length = 0
    counter = 1
    pickle_count = len(pickle_files)

    for file in pickle_files:
        print(f'File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        # NOTE(review): this concatenation relies on 'save_dir' ending with
        # a separator -- confirm against the config
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            ## build ret-type-dict
            ret_set.add(item[1])

            ## build max-seq-length
            if len(item[0]) > seq_length:
                seq_length = len(item[0])

            ## build vocabulary
            for word in item[0].split():
                vocab.add(word)

    print(
        f"Build return-type dict and save it to >{config['return_type_dict_file']}<"
    )
    ## build ret-type-dict and save; sorted order keeps the label assignment
    ## deterministic across runs (set iteration order is not)
    ret_type_dict = {elem: label for label, elem in enumerate(sorted(ret_set))}

    pickle_lib.save_to_pickle_file(ret_type_dict,
                                   config['return_type_dict_file'])

    print(f"Build vocabulary and save it to >{config['vocabulary_file']}<")
    ## build vocabulary list from set and save
    vocab_list = list(vocab)
    pickle_lib.save_to_pickle_file(vocab_list, config['vocabulary_file'])

    ## save max-seq-length
    print(f"Saving max-sequence-length to >{config['max_seq_length_file']}<")
    pickle_lib.save_to_pickle_file(seq_length, config['max_seq_length_file'])

    ### transform dataset ret-types to ints
    print(
        f"Transform return-type to int and save to >{config['tfrecord_save_dir']}<"
    )
    counter = 1
    for file in pickle_files:
        print(f'Transform File >{file}< >{counter}/{pickle_count}<', end='\r')
        counter += 1
        # fix: reset the list for every file -- previously it accumulated
        # across iterations, so each tfrecord also contained all earlier
        # files' items
        trans_ds = list()
        cont = pickle_lib.get_pickle_file_content(config['save_dir'] + file)
        for item in cont:
            trans_ds.append((item[0], ret_type_dict[item[1]]))

        # NOTE(review): the tfrecord_lib import is commented out at the top
        # of the file -- this raises NameError unless it is re-enabled
        tfrecord_lib.save_caller_callee_to_tfrecord(
            trans_ds,
            config['tfrecord_save_dir'] + file.replace('.pickle', '.tfrecord'))

    print("Splitting dataset to train,val,test")
    tfrecord_lib.split_to_train_val_test(config['tfrecord_save_dir'])

    print("Done. Run build_caller_callee_model.py now")